From 19caf4e4f0eae82ff5f36e5bc99463b6677467e6 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Mon, 28 Jan 2008 15:48:51 +0900
Subject: Clone vf module.

---
 src/mesa/pipe/draw/draw_vf.c         | 374 +++++++++++++
 src/mesa/pipe/draw/draw_vf.h         | 249 +++++++++
 src/mesa/pipe/draw/draw_vf_generic.c | 983 +++++++++++++++++++++++++++++++++++
 src/mesa/pipe/draw/draw_vf_sse.c     | 664 +++++++++++++++++++++++
 src/mesa/sources                     |   3 +
 5 files changed, 2273 insertions(+)
 create mode 100644 src/mesa/pipe/draw/draw_vf.c
 create mode 100644 src/mesa/pipe/draw/draw_vf.h
 create mode 100644 src/mesa/pipe/draw/draw_vf_generic.c
 create mode 100644 src/mesa/pipe/draw/draw_vf_sse.c

(limited to 'src')

diff --git a/src/mesa/pipe/draw/draw_vf.c b/src/mesa/pipe/draw/draw_vf.c
new file mode 100644
index 0000000000..f758460b5f
--- /dev/null
+++ b/src/mesa/pipe/draw/draw_vf.c
@@ -0,0 +1,374 @@
+/*
+ * Copyright 2003 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Keith Whitwell <keithw@tungstengraphics.com>
+ */
+
+#include "glheader.h"
+#include "context.h"
+#include "colormac.h"
+
+#include "pipe/p_compiler.h"
+
+#include "draw_vf.h"
+
+#define DBG 0
+
+
+/* Return TRUE when vf's current attribute layout is the one 'fp' was registered for. */
+static boolean match_fastpath( struct draw_vertex_fetch *vf,
+				 const struct draw_vf_fastpath *fp)
+{
+   const boolean check_strides = fp->match_strides;
+   unsigned i;
+
+   /* Whole-vertex properties first, then one pass over the attributes. */
+   if (vf->attr_count != fp->attr_count)
+      return FALSE;
+   if (check_strides && vf->vertex_stride != fp->vertex_stride)
+      return FALSE;
+
+   for (i = 0; i < vf->attr_count; i++) {
+      const struct draw_vf_attr *cur = &vf->attr[i];
+      const struct draw_vf_attr_type *want = &fp->attr[i];
+
+      if (cur->format != want->format ||
+          cur->inputsize != want->size ||
+          cur->vertoffset != want->offset)
+         return FALSE;
+      if (check_strides && cur->inputstride != want->stride)
+         return FALSE;   /* strides only count when registered with match_strides */
+   }
+   return TRUE;
+}
+/* Walk vf's fastpath list; on the first layout match install its emit function and return TRUE. */
+static boolean search_fastpath_emit( struct draw_vertex_fetch *vf )
+{
+   struct draw_vf_fastpath *cand;
+
+   for (cand = vf->fastpath; cand; cand = cand->next) {
+      if (!match_fastpath(vf, cand))
+         continue;
+
+      vf->emit = cand->func;
+      return TRUE;
+   }
+   return FALSE;   /* nothing matched; vf->emit is left untouched */
+}
+/* Record vf's current layout and vf->emit as a fastpath at the head of vf->fastpath. */
+void draw_vf_register_fastpath( struct draw_vertex_fetch *vf,
+			     boolean match_strides )
+{
+   struct draw_vf_fastpath *fastpath = CALLOC_STRUCT(draw_vf_fastpath);
+   unsigned i;
+   if (!fastpath) return;   /* out of memory: skip caching rather than write through NULL */
+   fastpath->vertex_stride = vf->vertex_stride;
+   fastpath->attr_count = vf->attr_count;
+   fastpath->match_strides = match_strides;
+   fastpath->func = vf->emit;
+   fastpath->attr = (struct draw_vf_attr_type *)
+      _mesa_malloc(vf->attr_count * sizeof(fastpath->attr[0]));
+   if (!fastpath->attr && vf->attr_count) { FREE(fastpath); return; }   /* same for the table */
+   for (i = 0; i < vf->attr_count; i++) {
+      fastpath->attr[i].format = vf->attr[i].format;
+      fastpath->attr[i].stride = vf->attr[i].inputstride;
+      fastpath->attr[i].size = vf->attr[i].inputsize;
+      fastpath->attr[i].offset = vf->attr[i].vertoffset;
+   }
+   /* Push on the list head, so the newest registration is the first one searched. */
+   fastpath->next = vf->fastpath;
+   vf->fastpath = fastpath;
+}
+
+
+
+
+/***********************************************************************
+ * Select vf->emit (fastpath, codegen, hardwired or generic) and run it:
+ */
+static void choose_emit_func( struct draw_vertex_fetch *vf, 
+			      unsigned count, 
+			      uint8_t *dest)
+{
+   vf->emit = NULL;
+   
+   /* Does this match an existing (hardwired, codegen or known-bad)
+    * fastpath?
+    */
+   if (search_fastpath_emit(vf)) {
+      /* Use this result.  If it is null, then it is already known
+       * that the current state will fail for codegen and there is no
+       * point trying again.
+       */
+   }
+   else if (vf->codegen_emit) {
+      vf->codegen_emit( vf );
+   }
+   /* Still no function (no match, or a matched entry whose func is NULL): try the hardwired set. */
+   if (!vf->emit) {
+      draw_vf_generate_hardwired_emit(vf);
+   }
+
+   /* Otherwise use the generic version:
+    */
+   if (!vf->emit)
+      vf->emit = draw_vf_generic_emit;
+   /* This function was itself reached through vf->emit, so do the requested emit now. */
+   vf->emit( vf, count, dest );
+}
+
+
+
+
+
+/***********************************************************************
+ * Public entrypoints, mostly dispatch to the above:
+ */
+
+
+/* Lay out 'nr' map entries as vf's vertex format and return the vertex stride in bytes. */
+unsigned draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf, 
+				 const struct draw_vf_attr_map *map,
+				 unsigned nr, 
+				 unsigned vertex_stride )
+{
+   unsigned offset = 0;   /* running byte offset inside the output vertex */
+   unsigned i, j;         /* i walks map[], j counts the attributes actually stored */
+
+   assert(nr < DRAW_VF_ATTRIB_MAX);
+   /* Drop the previous attrib -> attr[] mapping before rebuilding it. */
+   memset(vf->lookup, 0, sizeof(vf->lookup));
+
+   for (j = 0, i = 0; i < nr; i++) {
+      const unsigned format = map[i].format;
+      if (format == EMIT_PAD) {
+	 if (DBG)
+	    _mesa_printf("%d: pad %d, offset %d\n", i,  
+			 map[i].offset, offset);  
+	 /* Padding stores no attribute; it only advances the offset. */
+	 offset += map[i].offset;
+
+      }
+      else {
+	 assert(vf->lookup[map[i].attrib] == 0);   /* each attrib may appear only once */
+	 vf->lookup[map[i].attrib] = &vf->attr[j];
+
+	 vf->attr[j].attrib = map[i].attrib;
+	 vf->attr[j].format = format;
+	 vf->attr[j].insert = draw_vf_format_info[format].insert;
+	 vf->attr[j].extract = draw_vf_format_info[format].extract;
+	 vf->attr[j].vertattrsize = draw_vf_format_info[format].attrsize;
+	 vf->attr[j].vertoffset = offset;
+	 
+	 if (DBG)
+	    _mesa_printf("%d: %s, offset %d\n", i,  
+			 draw_vf_format_info[format].name,
+			 vf->attr[j].vertoffset);   
+
+	 offset += draw_vf_format_info[format].attrsize;
+	 j++;
+      }
+   }
+
+   vf->attr_count = j;
+   vf->vertex_stride = vertex_stride ? vertex_stride : offset;   /* 0 selects the packed size */
+   vf->emit = choose_emit_func;   /* layout changed: re-select the emit function on next use */
+
+   assert(vf->vertex_stride >= offset);
+   return vf->vertex_stride;
+}
+
+
+/* Load vf->vp from a matrix: vp[0..3] = scale (w scale 1), vp[4..7] = translate (w offset 0). */
+void draw_vf_set_vp_matrix( struct draw_vertex_fetch *vf,
+		       const float *viewport )
+{
+   float *scale = &vf->vp[0];
+   float *translate = &vf->vp[4];
+
+   assert(vf->allow_viewport_emits);
+
+   scale[0] = viewport[MAT_SX];
+   scale[1] = viewport[MAT_SY];
+   scale[2] = viewport[MAT_SZ];
+   scale[3] = 1.0;
+   translate[0] = viewport[MAT_TX];
+   translate[1] = viewport[MAT_TY];
+   translate[2] = viewport[MAT_TZ];
+   translate[3] = 0.0;
+}
+/* Load vf->vp from separate vectors: vp[0..3] = scale[0..3], vp[4..7] = translate[0..3]. */
+void draw_vf_set_vp_scale_translate( struct draw_vertex_fetch *vf,
+				const float *scale,
+				const float *translate )
+{
+   unsigned i;
+
+   assert(vf->allow_viewport_emits);
+
+   /* Scale half first ... */
+   for (i = 0; i < 4; i++)
+      vf->vp[i] = scale[i];
+
+   /* ... then the translate half, in the same order as explicit stores would run. */
+   for (i = 0; i < 4; i++)
+      vf->vp[4 + i] = translate[i];
+}
+
+
+/* Bind the source array for every attribute, advanced to vertex 'start'; a
+ * stride or size change resets vf->emit so the emit function is re-chosen. */
+void draw_vf_set_sources( struct draw_vertex_fetch *vf,
+		     GLvector4f * const sources[],
+		     unsigned start )
+{
+   struct draw_vf_attr *a = vf->attr;
+   unsigned j;
+   
+   for (j = 0; j < vf->attr_count; j++) {
+      const GLvector4f *vptr = sources[a[j].attrib];
+      /* The selected emit function may depend on stride/size, so a change invalidates it. */
+      if ((a[j].inputstride != vptr->stride) ||
+	  (a[j].inputsize != vptr->size))
+	 vf->emit = choose_emit_func;
+      /* NOTE(review): insert[] is indexed by size - 1, which assumes vptr->size >= 1 - confirm. */
+      a[j].inputstride = vptr->stride;
+      a[j].inputsize = vptr->size;
+      a[j].do_insert = a[j].insert[vptr->size - 1]; 
+      a[j].inputptr = ((uint8_t *)vptr->data) + start * vptr->stride;
+   }
+}
+
+
+
+/* Write 'count' vertices to 'dest' using whichever emit function vf currently holds. */
+void draw_vf_emit_vertices( struct draw_vertex_fetch *vf,
+		       unsigned count,
+		       void *dest )
+{
+   uint8_t *out = (uint8_t *) dest;
+   vf->emit( vf, count, out );
+}
+
+
+/* Read attribute 'attr' back out of an emitted vertex as four floats.
+ * The attribute's extract function has to reverse whatever viewport
+ * transformation, swizzling or other conversion was applied on emit.
+ *
+ * Per the original author this is mainly needed for on-the-fly vertex
+ * translation to swrast format.  Falls back to 'dflt' for absent attribs.
+ */
+void draw_vf_get_attr( struct draw_vertex_fetch *vf,
+		  const void *vertex,
+		  GLenum attr, 
+		  const float *dflt,
+		  float *dest )
+{
+   const struct draw_vf_attr *cur = vf->attr;
+   const struct draw_vf_attr *const end = cur + vf->attr_count;
+
+   for ( ; cur != end; cur++) {
+      if (cur->attrib != attr)
+         continue;
+
+      cur->extract( cur, dest, (uint8_t *)vertex + cur->vertoffset );
+      return;
+   }
+
+   /* Not part of this vertex layout: hand back the caller's default.
+    */
+   _mesa_memcpy( dest, dflt, 4*sizeof(float));
+}
+
+
+
+/* Allocate a vertex-fetch object with default constants; returns NULL if the allocation fails. */
+struct draw_vertex_fetch *draw_vf_create( boolean allow_viewport_emits )
+{
+   struct draw_vertex_fetch *vf = CALLOC_STRUCT(draw_vertex_fetch);
+   unsigned i;
+   if (!vf)
+      return NULL;   /* the original went on to write through the NULL pointer */
+   for (i = 0; i < DRAW_VF_ATTRIB_MAX; i++)
+      vf->attr[i].vf = vf;   /* back-pointer read by the insert/extract helpers (a->vf->vp) */
+   vf->allow_viewport_emits = allow_viewport_emits;
+
+   switch(CHAN_TYPE) {   /* full-scale value of the build's channel type, as a float */
+   case GL_UNSIGNED_BYTE:
+      vf->chan_scale[0] = 255.0;
+      vf->chan_scale[1] = 255.0;
+      vf->chan_scale[2] = 255.0;
+      vf->chan_scale[3] = 255.0;
+      break;
+   case GL_UNSIGNED_SHORT:
+      vf->chan_scale[0] = 65535.0;
+      vf->chan_scale[1] = 65535.0;
+      vf->chan_scale[2] = 65535.0;
+      vf->chan_scale[3] = 65535.0;
+      break;
+   default:
+      vf->chan_scale[0] = 1.0;
+      vf->chan_scale[1] = 1.0;
+      vf->chan_scale[2] = 1.0;
+      vf->chan_scale[3] = 1.0;
+      break;
+   }
+   /* Constant vector (0, 0, 0, 1). */
+   vf->identity[0] = 0.0;
+   vf->identity[1] = 0.0;
+   vf->identity[2] = 0.0;
+   vf->identity[3] = 1.0;
+
+   vf->codegen_emit = NULL;
+   /* With USE_SSE_ASM, codegen is on unless the MESA_NO_CODEGEN environment variable is set. */
+#ifdef USE_SSE_ASM
+   if (!_mesa_getenv("MESA_NO_CODEGEN"))
+      vf->codegen_emit = draw_vf_generate_sse_emit;
+#endif
+
+   return vf;
+}
+
+/* Free vf together with every registered fastpath (attribute table, emit function, list node). */
+void draw_vf_destroy( struct draw_vertex_fetch *vf )
+{
+   struct draw_vf_fastpath *fp, *tmp;
+
+   for (fp = vf->fastpath ; fp ; fp = tmp) {
+      tmp = fp->next;
+      FREE(fp->attr);
+
+      /* KW: At the moment, fp->func is constrained to be allocated by
+       * _mesa_exec_alloc(), as the hardwired fastpaths in the clone of
+       * t_vertex_generic.c (draw_vf_generic.c) are handled specially.  It
+       * would be nice to unify them, but this probably won't change until
+       * this module gets another overhaul.
+       */
+      _mesa_exec_free((void *) fp->func);
+      FREE(fp);
+   }
+   
+   vf->fastpath = NULL;
+   FREE(vf);
+}
diff --git a/src/mesa/pipe/draw/draw_vf.h b/src/mesa/pipe/draw/draw_vf.h
new file mode 100644
index 0000000000..279570aad5
--- /dev/null
+++ b/src/mesa/pipe/draw/draw_vf.h
@@ -0,0 +1,249 @@
+/*
+ * Copyright 2003 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Keith Whitwell <keithw@tungstengraphics.com>
+ */
+
+#ifndef DRAW_VF_H
+#define DRAW_VF_H
+
+
+#include "pipe/p_compiler.h"
+#include "math/m_vector.h"
+
+/* Vertex attribute ids; they index vf->lookup[] and the array passed to draw_vf_set_sources(). */
+enum {
+   DRAW_VF_ATTRIB_POS = 0,
+   DRAW_VF_ATTRIB_WEIGHT = 1,
+   DRAW_VF_ATTRIB_NORMAL = 2,
+   DRAW_VF_ATTRIB_COLOR0 = 3,
+   DRAW_VF_ATTRIB_COLOR1 = 4,
+   DRAW_VF_ATTRIB_FOG = 5,
+   DRAW_VF_ATTRIB_COLOR_INDEX = 6,
+   DRAW_VF_ATTRIB_EDGEFLAG = 7,
+   DRAW_VF_ATTRIB_TEX0 = 8,
+   DRAW_VF_ATTRIB_TEX1 = 9,
+   DRAW_VF_ATTRIB_TEX2 = 10,
+   DRAW_VF_ATTRIB_TEX3 = 11,
+   DRAW_VF_ATTRIB_TEX4 = 12,
+   DRAW_VF_ATTRIB_TEX5 = 13,
+   DRAW_VF_ATTRIB_TEX6 = 14,
+   DRAW_VF_ATTRIB_TEX7 = 15,
+   DRAW_VF_ATTRIB_VAR0 = 16,
+   DRAW_VF_ATTRIB_VAR1 = 17,
+   DRAW_VF_ATTRIB_VAR2 = 18,
+   DRAW_VF_ATTRIB_VAR3 = 19,
+   DRAW_VF_ATTRIB_VAR4 = 20,
+   DRAW_VF_ATTRIB_VAR5 = 21,
+   DRAW_VF_ATTRIB_VAR6 = 22,
+   DRAW_VF_ATTRIB_VAR7 = 23,
+   DRAW_VF_ATTRIB_POINTSIZE = 24,
+   DRAW_VF_ATTRIB_BFC0 = 25,
+   DRAW_VF_ATTRIB_BFC1 = 26,
+   DRAW_VF_ATTRIB_CLIP_POS = 27,
+   DRAW_VF_ATTRIB_VERTEX_HEADER = 28,
+   DRAW_VF_ATTRIB_MAX = 29
+};
+/* Output encodings for one attribute; the value indexes draw_vf_format_info[]. */
+enum draw_vf_attr_format {
+   EMIT_1F,
+   EMIT_2F,
+   EMIT_3F,
+   EMIT_4F,
+   EMIT_2F_VIEWPORT,		/**< do viewport transform and emit */
+   EMIT_3F_VIEWPORT,		/**< do viewport transform and emit */
+   EMIT_4F_VIEWPORT,		/**< do viewport transform and emit */
+   EMIT_3F_XYW,			/**< for projective texture */
+   EMIT_1UB_1F,			/**< for fog coordinate */
+   EMIT_3UB_3F_RGB,		/**< for specular color */
+   EMIT_3UB_3F_BGR,		/**< for specular color */
+   EMIT_4UB_4F_RGBA,		/**< for color */
+   EMIT_4UB_4F_BGRA,		/**< for color */
+   EMIT_4UB_4F_ARGB,		/**< for color */
+   EMIT_4UB_4F_ABGR,		/**< for color */
+   EMIT_4CHAN_4F_RGBA,		/**< for swrast color */
+   EMIT_PAD,			/**< leave a hole of 'offset' bytes */
+   EMIT_MAX
+};
+/* One entry of the layout description given to draw_vf_set_vertex_attributes(). */
+struct draw_vf_attr_map {
+   unsigned attrib;                   /**< attribute id (DRAW_VF_ATTRIB_*); not used for EMIT_PAD */
+   enum draw_vf_attr_format format;   /**< output encoding, or EMIT_PAD */
+   unsigned offset;                   /**< EMIT_PAD only: number of bytes to skip */
+};
+
+struct draw_vertex_fetch;
+
+/* Set the viewport scale/translate from the MAT_SX..MAT_TZ elements of a matrix. */
+void 
+draw_vf_set_vp_matrix( struct draw_vertex_fetch *vf,
+                       const float *viewport );
+/* Set the viewport transform from separate 4-float scale and translate vectors. */
+void 
+draw_vf_set_vp_scale_translate( struct draw_vertex_fetch *vf,
+				const float *scale,
+				const float *translate );
+/* Define the output vertex layout from 'nr' map entries; returns the vertex stride. */
+unsigned 
+draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
+                               const struct draw_vf_attr_map *map,
+                               unsigned nr, 
+                               unsigned vertex_stride );
+/* Bind the source array for each attribute, starting at vertex 'start'. */
+void 
+draw_vf_set_sources( struct draw_vertex_fetch *vf,
+		     GLvector4f * const attrib[],
+		     unsigned start ); 
+/* Emit 'count' vertices to 'dest' using the current layout and sources. */
+void 
+draw_vf_emit_vertices( struct draw_vertex_fetch *vf,
+		       unsigned count,
+		       void *dest );
+/* Read one attribute back from an emitted vertex; copies 'dflt' if it is absent. */
+void 
+draw_vf_get_attr( struct draw_vertex_fetch *vf,
+		  const void *vertex,
+		  GLenum attr, 
+		  const float *dflt,
+		  float *dest );
+/* Create a vertex-fetch object. */
+struct draw_vertex_fetch *
+draw_vf_create( boolean allow_viewport_emits );
+/* Destroy a vertex-fetch object together with its registered fastpaths. */
+void 
+draw_vf_destroy( struct draw_vertex_fetch *vf );
+
+
+
+/***********************************************************************
+ * Internal functions and structs:
+ */
+
+struct draw_vf_attr;
+/* Convert the stored attribute at 'v' back to floats in 'out'. */
+typedef void (*draw_vf_extract_func)( const struct draw_vf_attr *a, 
+				      float *out, 
+				      const uint8_t *v );
+/* Convert the floats at 'in' to the attribute's stored form at 'v'. */
+typedef void (*draw_vf_insert_func)( const struct draw_vf_attr *a, 
+				     uint8_t *v, 
+				     const float *in );
+/* Write 'count' vertices of vf's current layout to 'dest'. */
+typedef void (*draw_vf_emit_func)( struct draw_vertex_fetch *vf,
+      				   unsigned count, 
+      				   uint8_t *dest );
+
+
+
+/**
+ * Describes how to convert/move a vertex attribute from a vertex
+ * array to a vertex structure.
+ */
+struct draw_vf_attr
+{
+   struct draw_vertex_fetch *vf;   /**< owning object, set by draw_vf_create() */
+
+   unsigned format;          /**< enum draw_vf_attr_format value from the attr map */
+   unsigned inputsize;       /**< 'size' of the bound source vector */
+   unsigned inputstride;     /**< 'stride' of the bound source vector */
+   unsigned vertoffset;      /**< position of the attrib in the vertex struct */
+
+   unsigned attrib;          /**< which vertex attrib (0=position, etc) */
+   unsigned vertattrsize;    /**< size of the attribute in bytes */
+
+   uint8_t *inputptr;        /**< source data, already advanced to the start vertex */
+   const draw_vf_insert_func *insert;   /**< the format's insert[] table */
+   draw_vf_insert_func do_insert;       /**< insert[inputsize - 1], chosen in draw_vf_set_sources() */
+   draw_vf_extract_func extract;        /**< the format's extract function */
+};
+/* State of one vertex-fetch object: the output layout, bound sources and emit function. */
+struct draw_vertex_fetch
+{
+   struct draw_vf_attr attr[DRAW_VF_ATTRIB_MAX];   /* first attr_count entries are in use */
+   unsigned attr_count;
+   unsigned vertex_stride;                         /* vertex size/stride in bytes */
+
+   struct draw_vf_attr *lookup[DRAW_VF_ATTRIB_MAX];   /* attrib id -> entry in attr[], NULL if absent */
+   
+   draw_vf_emit_func emit;   /* current emit function */
+
+   /* Parameters and constants for codegen:
+    */
+   boolean allow_viewport_emits;
+   float vp[8];		/* [0..3] viewport scale, [4..7] viewport translate */
+   float chan_scale[4];
+   float identity[4];	/* (0, 0, 0, 1) */
+
+   struct draw_vf_fastpath *fastpath;   /* head of the registered-fastpath list */
+   
+   void (*codegen_emit)( struct draw_vertex_fetch *vf );   /* optional code generator, may be NULL */
+};
+
+/* Per-attribute key of a fastpath, copied from struct draw_vf_attr at registration time. */
+struct draw_vf_attr_type {
+   unsigned format;   /* from draw_vf_attr.format */
+   unsigned size;     /* from draw_vf_attr.inputsize */
+   unsigned stride;   /* from draw_vf_attr.inputstride; compared only with match_strides */
+   unsigned offset;   /* from draw_vf_attr.vertoffset */
+};
+/* A remembered (vertex layout -> emit function) pairing, kept in a singly linked list. */
+struct draw_vf_fastpath {
+   unsigned vertex_stride;   /* compared only when match_strides is set */
+   unsigned attr_count;      /* number of entries in attr[] */
+   boolean match_strides;    /* also require vertex and source strides to match */
+
+   struct draw_vf_attr_type *attr;   /* heap array of attr_count entries */
+
+   draw_vf_emit_func func;           /* emit function to install on a match */
+   struct draw_vf_fastpath *next;
+};
+
+/* Remember vf's current layout together with vf->emit as a fastpath. */
+void 
+draw_vf_register_fastpath( struct draw_vertex_fetch *vtx,
+                           boolean match_strides );
+/* Emit function used as the fallback when nothing else was selected. */
+void 
+draw_vf_generic_emit( struct draw_vertex_fetch *vf,
+                      unsigned count,
+                      uint8_t *v );
+/* Try to select a hardwired emit function; the caller checks vf->emit afterwards. */
+void 
+draw_vf_generate_hardwired_emit( struct draw_vertex_fetch *vf );
+/* Code generator installed as vf->codegen_emit when USE_SSE_ASM is defined. */
+void 
+draw_vf_generate_sse_emit( struct draw_vertex_fetch *vf );
+
+/* Conversion functions and stored size for one enum draw_vf_attr_format value. */
+struct draw_vf_format_info {
+   const char *name;                 /**< printable format name (used in debug output) */
+   draw_vf_extract_func extract;     /**< stored form -> floats */
+   draw_vf_insert_func insert[4];    /**< floats -> stored form, indexed by source size - 1 */
+   const unsigned attrsize;          /**< bytes the attribute occupies in the vertex */
+};
+/* Indexed by format.  'extract' and 'insert' are declared 'extern' so includers do not each define the table. */
+extern const struct draw_vf_format_info draw_vf_format_info[EMIT_MAX];
+
+
+#endif
diff --git a/src/mesa/pipe/draw/draw_vf_generic.c b/src/mesa/pipe/draw/draw_vf_generic.c
new file mode 100644
index 0000000000..19e6c587e5
--- /dev/null
+++ b/src/mesa/pipe/draw/draw_vf_generic.c
@@ -0,0 +1,983 @@
+
+/*
+ * Copyright 2003 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Keith Whitwell <keithw@tungstengraphics.com>
+ */
+
+#include "glheader.h"
+#include "context.h"
+#include "colormac.h"
+#include "simple_list.h"
+
+#include "pipe/p_compiler.h"
+
+#include "draw_vf.h"
+
+
+/*
+ * insert_<N>f_viewport_<M>(): take M NDC components from 'in', apply the
+ * NDC->viewport mapping (a->vf->vp) and store N floats at 'v'.
+ */
+
+static INLINE void insert_4f_viewport_4( const struct draw_vf_attr *a, uint8_t *v,
+					 const float *in )
+{
+   float *out = (float *)v;
+   const float *scale = a->vf->vp;
+   const float *trans = a->vf->vp + 4;
+   
+   out[0] = scale[0] * in[0] + trans[0];
+   out[1] = scale[1] * in[1] + trans[1];
+   out[2] = scale[2] * in[2] + trans[2];
+   out[3] = in[3];
+}
+
+static INLINE void insert_4f_viewport_3( const struct draw_vf_attr *a, uint8_t *v,
+					 const float *in )
+{
+   float *out = (float *)v;
+   const float *scale = a->vf->vp;
+   const float *trans = a->vf->vp + 4;
+   
+   out[0] = scale[0] * in[0] + trans[0];
+   out[1] = scale[1] * in[1] + trans[1];
+   out[2] = scale[2] * in[2] + trans[2];
+   out[3] = 1;
+}
+
+static INLINE void insert_4f_viewport_2( const struct draw_vf_attr *a, uint8_t *v,
+					 const float *in )
+{
+   float *out = (float *)v;
+   const float *scale = a->vf->vp;
+   const float *trans = a->vf->vp + 4;
+   
+   out[0] = scale[0] * in[0] + trans[0];
+   out[1] = scale[1] * in[1] + trans[1];
+   out[2] =                    trans[2];
+   out[3] = 1;
+}
+
+static INLINE void insert_4f_viewport_1( const struct draw_vf_attr *a, uint8_t *v,
+					 const float *in )
+{
+   float *out = (float *)v;
+   const float *scale = a->vf->vp;
+   const float *trans = a->vf->vp + 4;
+   
+   out[0] = scale[0] * in[0] + trans[0];
+   out[1] =                    trans[1];
+   out[2] =                    trans[2];
+   out[3] = 1;
+}
+
+static INLINE void insert_3f_viewport_3( const struct draw_vf_attr *a, uint8_t *v,
+					 const float *in )
+{
+   float *out = (float *)v;
+   const float *scale = a->vf->vp;
+   const float *trans = a->vf->vp + 4;
+   
+   out[0] = scale[0] * in[0] + trans[0];
+   out[1] = scale[1] * in[1] + trans[1];
+   out[2] = scale[2] * in[2] + trans[2];
+}
+/* Store a 3-float viewport-mapped position built from a 2-component source. */
+static INLINE void insert_3f_viewport_2( const struct draw_vf_attr *a, uint8_t *v,
+					 const float *in )
+{
+   float *out = (float *)v;
+   const float *scale = a->vf->vp;
+   const float *trans = a->vf->vp + 4;
+   /* in[2] is not a valid component here: z is 0 before mapping, as in insert_4f_viewport_2. */
+   out[0] = scale[0] * in[0] + trans[0];
+   out[1] = scale[1] * in[1] + trans[1];
+   out[2] =                    trans[2];
+}
+
+static INLINE void insert_3f_viewport_1( const struct draw_vf_attr *a, uint8_t *v,
+					 const float *in )
+{
+   float *out = (float *)v;
+   const float *scale = a->vf->vp;
+   const float *trans = a->vf->vp + 4;
+   
+   out[0] = scale[0] * in[0] + trans[0];
+   out[1] =                    trans[1];
+   out[2] =                    trans[2];
+}
+
+static INLINE void insert_2f_viewport_2( const struct draw_vf_attr *a, uint8_t *v,
+					 const float *in )
+{
+   float *out = (float *)v;
+   const float *scale = a->vf->vp;
+   const float *trans = a->vf->vp + 4;
+   
+   out[0] = scale[0] * in[0] + trans[0];
+   out[1] = scale[1] * in[1] + trans[1];
+}
+
+static INLINE void insert_2f_viewport_1( const struct draw_vf_attr *a, uint8_t *v,
+					 const float *in )
+{
+   float *out = (float *)v;
+   const float *scale = a->vf->vp;
+   const float *trans = a->vf->vp + 4;
+   
+   out[0] = scale[0] * in[0] + trans[0];
+   out[1] = trans[1];
+}
+
+
+/*
+ * insert_<N>f_<M>(): plain copies of M components with no viewport mapping; the rest come from (0, 0, 0, 1).
+ */
+
+static INLINE void insert_4f_4( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = in[1];
+   out[2] = in[2];
+   out[3] = in[3];
+}
+
+static INLINE void insert_4f_3( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = in[1];
+   out[2] = in[2];
+   out[3] = 1;
+}
+
+static INLINE void insert_4f_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = in[1];
+   out[2] = 0;
+   out[3] = 1;
+}
+
+static INLINE void insert_4f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = 0;
+   out[2] = 0;
+   out[3] = 1;
+}
+
+static INLINE void insert_3f_xyw_4( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = in[1];
+   out[2] = in[3];
+}
+
+static INLINE void insert_3f_xyw_err( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a; (void) v; (void) in;
+   _mesa_exit(1);
+}
+
+static INLINE void insert_3f_3( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = in[1];
+   out[2] = in[2];
+}
+
+static INLINE void insert_3f_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = in[1];
+   out[2] = 0;
+}
+
+static INLINE void insert_3f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = 0;
+   out[2] = 0;
+}
+
+
+static INLINE void insert_2f_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = in[1];
+}
+
+static INLINE void insert_2f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = 0;
+}
+
+static INLINE void insert_1f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+
+   out[0] = in[0];
+}
+
+static INLINE void insert_null( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a; (void) v; (void) in;
+}
+
+static INLINE void insert_4chan_4f_rgba_4( const struct draw_vf_attr *a, uint8_t *v, 
+					   const float *in )
+{
+   GLchan *c = (GLchan *)v;
+   (void) a;
+   UNCLAMPED_FLOAT_TO_CHAN(c[0], in[0]); 
+   UNCLAMPED_FLOAT_TO_CHAN(c[1], in[1]); 
+   UNCLAMPED_FLOAT_TO_CHAN(c[2], in[2]); 
+   UNCLAMPED_FLOAT_TO_CHAN(c[3], in[3]);
+}
+
+static INLINE void insert_4chan_4f_rgba_3( const struct draw_vf_attr *a, uint8_t *v, 
+					   const float *in )
+{
+   GLchan *c = (GLchan *)v;
+   (void) a;
+   UNCLAMPED_FLOAT_TO_CHAN(c[0], in[0]); 
+   UNCLAMPED_FLOAT_TO_CHAN(c[1], in[1]); 
+   UNCLAMPED_FLOAT_TO_CHAN(c[2], in[2]); 
+   c[3] = CHAN_MAX;
+}
+
+static INLINE void insert_4chan_4f_rgba_2( const struct draw_vf_attr *a, uint8_t *v, 
+					   const float *in )
+{
+   GLchan *c = (GLchan *)v;
+   (void) a;
+   UNCLAMPED_FLOAT_TO_CHAN(c[0], in[0]); 
+   UNCLAMPED_FLOAT_TO_CHAN(c[1], in[1]); 
+   c[2] = 0;
+   c[3] = CHAN_MAX;
+}
+
+static INLINE void insert_4chan_4f_rgba_1( const struct draw_vf_attr *a, uint8_t *v, 
+					   const float *in )
+{
+   GLchan *c = (GLchan *)v;
+   (void) a;
+   UNCLAMPED_FLOAT_TO_CHAN(c[0], in[0]); 
+   c[1] = 0;
+   c[2] = 0;
+   c[3] = CHAN_MAX;
+}
+
+static INLINE void insert_4ub_4f_rgba_4( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[2]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[3]);
+}
+
+static INLINE void insert_4ub_4f_rgba_3( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[2]);
+   v[3] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_rgba_2( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   v[2] = 0;
+   v[3] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_rgba_1( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   v[1] = 0;
+   v[2] = 0;
+   v[3] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_bgra_4( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[2]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[3]);
+}
+
+static INLINE void insert_4ub_4f_bgra_3( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[2]);
+   v[3] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_bgra_2( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   v[0] = 0;
+   v[3] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_bgra_1( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   v[1] = 0;
+   v[0] = 0;
+   v[3] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_argb_4( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[2]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[3]);
+}
+
+static INLINE void insert_4ub_4f_argb_3( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[2]);
+   v[0] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_argb_2( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   v[3] = 0x00;
+   v[0] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_argb_1( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
+   v[2] = 0x00;
+   v[3] = 0x00;
+   v[0] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_abgr_4( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[2]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[3]);
+}
+
+static INLINE void insert_4ub_4f_abgr_3( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[2]);
+   v[0] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_abgr_2( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   v[1] = 0x00;
+   v[0] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_abgr_1( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
+   v[2] = 0x00;
+   v[1] = 0x00;
+   v[0] = 0xff;
+}
+
+static INLINE void insert_3ub_3f_rgb_3( const struct draw_vf_attr *a, uint8_t *v, 
+					const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[2]);
+}
+
+static INLINE void insert_3ub_3f_rgb_2( const struct draw_vf_attr *a, uint8_t *v, 
+					const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   v[2] = 0;
+}
+
+static INLINE void insert_3ub_3f_rgb_1( const struct draw_vf_attr *a, uint8_t *v, 
+					const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   v[1] = 0;
+   v[2] = 0;
+}
+
+static INLINE void insert_3ub_3f_bgr_3( const struct draw_vf_attr *a, uint8_t *v, 
+					const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[2]);
+}
+
+/**
+ * Two-component source for the 3ub "bgr" layout: in[0] -> v[2],
+ * in[1] -> v[1]; v[0] is zeroed.
+ */
+static INLINE void insert_3ub_3f_bgr_2( const struct draw_vf_attr *a, uint8_t *v,
+                                        const float *in )
+{
+   unsigned i;
+   (void) a;
+
+   for (i = 0; i < 2; i++) {
+      UNCLAMPED_FLOAT_TO_UBYTE(v[2 - i], in[i]);
+   }
+
+   v[0] = 0;
+}
+
+/**
+ * One-component source for the 3ub "bgr" layout: in[0] is converted
+ * into v[2]; v[1] and v[0] are zeroed.
+ */
+static INLINE void insert_3ub_3f_bgr_1( const struct draw_vf_attr *a, uint8_t *v,
+                                        const float *in )
+{
+   (void) a;
+
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+
+   /* No source data for the remaining two bytes. */
+   v[0] = 0;
+   v[1] = 0;
+}
+
+
+/**
+ * Convert a single float (in[0]) to a single byte (v[0]) with
+ * UNCLAMPED_FLOAT_TO_UBYTE.  The attribute descriptor is unused.
+ */
+static INLINE void insert_1ub_1f_1( const struct draw_vf_attr *a, uint8_t *v,
+                                    const float *in )
+{
+   (void) a;
+
+   UNCLAMPED_FLOAT_TO_UBYTE(*v, *in);
+}
+
+
+/***********************************************************************
+ * Functions to perform the reverse operations to the above, for
+ * swrast translation and clip-interpolation.
+ * 
+ * Currently always extracts a full 4 floats.
+ */
+
+/**
+ * Read four floats from the vertex and undo the viewport transform
+ * on the first three.
+ *
+ * a->vf->vp[0..2] are used as scale factors and a->vf->vp[4..6] as
+ * translations: out[i] = (in[i] - trans[i]) / scale[i].  The fourth
+ * component is copied through unchanged.
+ *
+ * Although included for completeness, the position coordinate is
+ * usually handled differently during clipping.
+ */
+static void extract_4f_viewport( const struct draw_vf_attr *a, float *out,
+                                 const uint8_t *v )
+{
+   const float *win = (const float *)v;
+   const float *vp = a->vf->vp;
+   unsigned i;
+
+   for (i = 0; i < 3; i++) {
+      out[i] = (win[i] - vp[4 + i]) / vp[i];
+   }
+
+   out[3] = win[3];
+}
+
+/**
+ * Read three floats from the vertex and undo the viewport transform:
+ * out[i] = (in[i] - vp[4 + i]) / vp[i] for i = 0..2, where vp is
+ * a->vf->vp.  out[3] is set to 1.
+ */
+static void extract_3f_viewport( const struct draw_vf_attr *a, float *out,
+                                 const uint8_t *v )
+{
+   const float *win = (const float *)v;
+   const float *vp = a->vf->vp;
+   unsigned i;
+
+   for (i = 0; i < 3; i++) {
+      out[i] = (win[i] - vp[4 + i]) / vp[i];
+   }
+
+   out[3] = 1;
+}
+
+
+/**
+ * Read two floats from the vertex and undo the viewport transform:
+ * out[i] = (in[i] - vp[4 + i]) / vp[i] for i = 0..1, where vp is
+ * a->vf->vp.  out[2] is set to 0 and out[3] to 1.
+ */
+static void extract_2f_viewport( const struct draw_vf_attr *a, float *out,
+                                 const uint8_t *v )
+{
+   const float *win = (const float *)v;
+   const float *vp = a->vf->vp;
+   unsigned i;
+
+   for (i = 0; i < 2; i++) {
+      out[i] = (win[i] - vp[4 + i]) / vp[i];
+   }
+
+   out[2] = 0;
+   out[3] = 1;
+}
+
+
+/**
+ * Copy four floats straight out of the vertex into out[0..3].
+ * The attribute descriptor is unused.
+ */
+static void extract_4f( const struct draw_vf_attr *a, float *out, const uint8_t *v  )
+{
+   const float *src = (const float *)v;
+   unsigned i;
+   (void) a;
+
+   for (i = 0; i < 4; i++) {
+      out[i] = src[i];
+   }
+}
+
+/**
+ * Expand a three-float "xyw" attribute to four floats: the first two
+ * stored floats go to out[0] and out[1], out[2] is set to 0, and the
+ * third stored float goes to out[3].
+ */
+static void extract_3f_xyw( const struct draw_vf_attr *a, float *out, const uint8_t *v )
+{
+   const float *src = (const float *)v;
+   (void) a;
+
+   out[0] = src[0];
+   out[1] = src[1];
+   out[2] = 0;        /* no z is stored in this format */
+   out[3] = src[2];   /* third stored float is w */
+}
+
+
+/**
+ * Copy three floats out of the vertex into out[0..2] and set
+ * out[3] to 1.
+ */
+static void extract_3f( const struct draw_vf_attr *a, float *out, const uint8_t *v )
+{
+   const float *src = (const float *)v;
+   unsigned i;
+   (void) a;
+
+   for (i = 0; i < 3; i++) {
+      out[i] = src[i];
+   }
+
+   out[3] = 1;
+}
+
+
+/**
+ * Copy two floats out of the vertex into out[0..1]; out[2] is set
+ * to 0 and out[3] to 1.
+ */
+static void extract_2f( const struct draw_vf_attr *a, float *out, const uint8_t *v )
+{
+   const float *src = (const float *)v;
+   unsigned i;
+   (void) a;
+
+   for (i = 0; i < 2; i++) {
+      out[i] = src[i];
+   }
+
+   out[2] = 0;
+   out[3] = 1;
+}
+
+/**
+ * Copy one float out of the vertex into out[0]; the remaining
+ * outputs are filled with 0, 0, 1.
+ */
+static void extract_1f( const struct draw_vf_attr *a, float *out, const uint8_t *v )
+{
+   const float *src = (const float *)v;
+   (void) a;
+
+   out[0] = *src;
+
+   /* Defaults for the components that are not stored. */
+   out[1] = 0;
+   out[2] = 0;
+   out[3] = 1;
+}
+
+/**
+ * Convert four GLchan values stored in the vertex to floats with
+ * CHAN_TO_FLOAT, keeping their order (c[i] -> out[i]).
+ */
+static void extract_4chan_4f_rgba( const struct draw_vf_attr *a, float *out,
+                                   const uint8_t *v )
+{
+   const GLchan *chan = (const GLchan *)v;
+   unsigned i;
+   (void) a;
+
+   for (i = 0; i < 4; i++) {
+      out[i] = CHAN_TO_FLOAT(chan[i]);
+   }
+}
+
+/**
+ * Convert four bytes to four floats with UBYTE_TO_FLOAT, keeping
+ * their order (v[i] -> out[i]).
+ */
+static void extract_4ub_4f_rgba( const struct draw_vf_attr *a, float *out,
+                                 const uint8_t *v )
+{
+   unsigned i;
+   (void) a;
+
+   for (i = 0; i < 4; i++) {
+      out[i] = UBYTE_TO_FLOAT(v[i]);
+   }
+}
+
+/**
+ * Convert four bytes to four floats with UBYTE_TO_FLOAT, swapping
+ * the first and third components: v[0] -> out[2], v[1] -> out[1],
+ * v[2] -> out[0], v[3] -> out[3].
+ */
+static void extract_4ub_4f_bgra( const struct draw_vf_attr *a, float *out,
+                                 const uint8_t *v )
+{
+   unsigned i;
+   (void) a;
+
+   /* First three bytes are stored in reverse order. */
+   for (i = 0; i < 3; i++) {
+      out[2 - i] = UBYTE_TO_FLOAT(v[i]);
+   }
+
+   out[3] = UBYTE_TO_FLOAT(v[3]);
+}
+
+/**
+ * Convert four bytes to four floats with UBYTE_TO_FLOAT, rotating
+ * the first byte to the end: v[0] -> out[3], v[1] -> out[0],
+ * v[2] -> out[1], v[3] -> out[2].
+ */
+static void extract_4ub_4f_argb( const struct draw_vf_attr *a, float *out,
+                                 const uint8_t *v )
+{
+   unsigned i;
+   (void) a;
+
+   out[3] = UBYTE_TO_FLOAT(v[0]);
+
+   /* Bytes 1..3 map to outputs 0..2. */
+   for (i = 0; i < 3; i++) {
+      out[i] = UBYTE_TO_FLOAT(v[i + 1]);
+   }
+}
+
+/**
+ * Convert four bytes to four floats with UBYTE_TO_FLOAT, reversing
+ * their order: v[i] -> out[3 - i].
+ */
+static void extract_4ub_4f_abgr( const struct draw_vf_attr *a, float *out,
+                                 const uint8_t *v )
+{
+   unsigned i;
+   (void) a;
+
+   for (i = 0; i < 4; i++) {
+      out[3 - i] = UBYTE_TO_FLOAT(v[i]);
+   }
+}
+
+/**
+ * Convert three bytes to floats with UBYTE_TO_FLOAT, keeping their
+ * order (v[i] -> out[i]); out[3] is set to 1.
+ */
+static void extract_3ub_3f_rgb( const struct draw_vf_attr *a, float *out,
+                                const uint8_t *v )
+{
+   unsigned i;
+   (void) a;
+
+   for (i = 0; i < 3; i++) {
+      out[i] = UBYTE_TO_FLOAT(v[i]);
+   }
+
+   out[3] = 1;
+}
+
+/**
+ * Convert three bytes to floats with UBYTE_TO_FLOAT, reversing their
+ * order (v[i] -> out[2 - i]); out[3] is set to 1.
+ */
+static void extract_3ub_3f_bgr( const struct draw_vf_attr *a, float *out,
+                                const uint8_t *v )
+{
+   unsigned i;
+   (void) a;
+
+   for (i = 0; i < 3; i++) {
+      out[2 - i] = UBYTE_TO_FLOAT(v[i]);
+   }
+
+   out[3] = 1;
+}
+
+/**
+ * Convert a single byte to a float with UBYTE_TO_FLOAT into out[0];
+ * the remaining outputs are filled with 0, 0, 1.
+ */
+static void extract_1ub_1f( const struct draw_vf_attr *a, float *out, const uint8_t *v )
+{
+   (void) a;
+
+   out[0] = UBYTE_TO_FLOAT(*v);
+
+   /* Defaults for the components that are not stored. */
+   out[1] = 0;
+   out[2] = 0;
+   out[3] = 1;
+}
+
+
+/**
+ * Per-format description table, one entry per vertex emit format.
+ *
+ * Each initializer is positional and supplies, in order:
+ *   - a short name string for the format,
+ *   - the extract function that reads the stored attribute back out
+ *     as four floats,
+ *   - four insert functions (the _1/_2/_3/_4 suffixes name how many
+ *     source components each one reads; presumably the array is
+ *     indexed by source size - 1 -- confirm against the code that
+ *     selects do_insert),
+ *   - the size in bytes the attribute occupies in the output vertex.
+ *
+ * NOTE(review): the array is sized EMIT_MAX, so the entry order is
+ * presumably required to match the EMIT_* enum declared in
+ * draw_vf.h -- confirm before reordering or inserting entries.
+ */
+const struct draw_vf_format_info draw_vf_format_info[EMIT_MAX] = 
+{
+   { "1f",
+     extract_1f,
+     { insert_1f_1, insert_1f_1, insert_1f_1, insert_1f_1 },
+     sizeof(float) },
+
+   { "2f",
+     extract_2f,
+     { insert_2f_1, insert_2f_2, insert_2f_2, insert_2f_2 },
+     2 * sizeof(float) },
+
+   { "3f",
+     extract_3f,
+     { insert_3f_1, insert_3f_2, insert_3f_3, insert_3f_3 },
+     3 * sizeof(float) },
+
+   { "4f",
+     extract_4f,
+     { insert_4f_1, insert_4f_2, insert_4f_3, insert_4f_4 },
+     4 * sizeof(float) },
+
+   { "2f_viewport",
+     extract_2f_viewport,
+     { insert_2f_viewport_1, insert_2f_viewport_2, insert_2f_viewport_2,
+       insert_2f_viewport_2 },
+     2 * sizeof(float) },
+
+   { "3f_viewport",
+     extract_3f_viewport,
+     { insert_3f_viewport_1, insert_3f_viewport_2, insert_3f_viewport_3,
+       insert_3f_viewport_3 },
+     3 * sizeof(float) },
+
+   { "4f_viewport",
+     extract_4f_viewport,
+     { insert_4f_viewport_1, insert_4f_viewport_2, insert_4f_viewport_3,
+       insert_4f_viewport_4 }, 
+     4 * sizeof(float) },
+
+   /* Only a four-component source has a real insert function here;
+    * the other three slots point at the _err variant.
+    */
+   { "3f_xyw",
+     extract_3f_xyw,
+     { insert_3f_xyw_err, insert_3f_xyw_err, insert_3f_xyw_err, 
+       insert_3f_xyw_4 },
+     3 * sizeof(float) },
+
+   { "1ub_1f",
+     extract_1ub_1f,
+     { insert_1ub_1f_1, insert_1ub_1f_1, insert_1ub_1f_1, insert_1ub_1f_1 },
+     sizeof(uint8_t) },
+
+   { "3ub_3f_rgb",
+     extract_3ub_3f_rgb,
+     { insert_3ub_3f_rgb_1, insert_3ub_3f_rgb_2, insert_3ub_3f_rgb_3,
+       insert_3ub_3f_rgb_3 },
+     3 * sizeof(uint8_t) },
+
+   { "3ub_3f_bgr",
+     extract_3ub_3f_bgr,
+     { insert_3ub_3f_bgr_1, insert_3ub_3f_bgr_2, insert_3ub_3f_bgr_3,
+       insert_3ub_3f_bgr_3 },
+     3 * sizeof(uint8_t) },
+
+   { "4ub_4f_rgba",
+     extract_4ub_4f_rgba,
+     { insert_4ub_4f_rgba_1, insert_4ub_4f_rgba_2, insert_4ub_4f_rgba_3, 
+       insert_4ub_4f_rgba_4 },
+     4 * sizeof(uint8_t) },
+
+   { "4ub_4f_bgra",
+     extract_4ub_4f_bgra,
+     { insert_4ub_4f_bgra_1, insert_4ub_4f_bgra_2, insert_4ub_4f_bgra_3,
+       insert_4ub_4f_bgra_4 },
+     4 * sizeof(uint8_t) },
+
+   { "4ub_4f_argb",
+     extract_4ub_4f_argb,
+     { insert_4ub_4f_argb_1, insert_4ub_4f_argb_2, insert_4ub_4f_argb_3,
+       insert_4ub_4f_argb_4 },
+     4 * sizeof(uint8_t) },
+
+   { "4ub_4f_abgr",
+     extract_4ub_4f_abgr,
+     { insert_4ub_4f_abgr_1, insert_4ub_4f_abgr_2, insert_4ub_4f_abgr_3,
+       insert_4ub_4f_abgr_4 },
+     4 * sizeof(uint8_t) },
+
+   { "4chan_4f_rgba",
+     extract_4chan_4f_rgba,
+     { insert_4chan_4f_rgba_1, insert_4chan_4f_rgba_2, insert_4chan_4f_rgba_3,
+       insert_4chan_4f_rgba_4 },
+     4 * sizeof(GLchan) },
+
+   /* Padding has no data to move: no extract/insert functions and a
+    * size of zero.
+    */
+   { "pad",
+     NULL,
+     { NULL, NULL, NULL, NULL },
+     0 }
+
+};
+
+
+
+    
+/***********************************************************************
+ * Hardwired fastpaths for emitting whole vertices or groups of
+ * vertices
+ */
+/* EMIT5 defines a static emit function called NAME that handles up
+ * to five attributes with the insert functions named at expansion
+ * time (F0..F4) instead of calling through a[j].do_insert.
+ *
+ * NR is the number of attributes actually used; since it is a
+ * literal at each expansion, the "if (NR > k)" tests are constant
+ * expressions.
+ *
+ * For each of the 'count' vertices, attribute k is inserted at
+ * v + a[k].vertoffset from a[k].inputptr, after which a[k].inputptr
+ * is advanced by a[k].inputstride.  v advances by vf->vertex_stride
+ * per vertex.  Note that the attribute input pointers stored in vf
+ * are modified as a side effect.
+ */
+#define EMIT5(NR, F0, F1, F2, F3, F4, NAME)				\
+static void NAME( struct draw_vertex_fetch *vf,				\
+		  unsigned count,						\
+		  uint8_t *v )						\
+{									\
+   struct draw_vf_attr *a = vf->attr;				\
+   unsigned i;								\
+									\
+   for (i = 0 ; i < count ; i++, v += vf->vertex_stride) {		\
+      if (NR > 0) {							\
+	 F0( &a[0], v + a[0].vertoffset, (float *)a[0].inputptr );	\
+	 a[0].inputptr += a[0].inputstride;				\
+      }									\
+      									\
+      if (NR > 1) {							\
+	 F1( &a[1], v + a[1].vertoffset, (float *)a[1].inputptr );	\
+	 a[1].inputptr += a[1].inputstride;				\
+      }									\
+      									\
+      if (NR > 2) {							\
+	 F2( &a[2], v + a[2].vertoffset, (float *)a[2].inputptr );	\
+	 a[2].inputptr += a[2].inputstride;				\
+      }									\
+      									\
+      if (NR > 3) {							\
+	 F3( &a[3], v + a[3].vertoffset, (float *)a[3].inputptr );	\
+	 a[3].inputptr += a[3].inputstride;				\
+      }									\
+									\
+      if (NR > 4) {							\
+	 F4( &a[4], v + a[4].vertoffset, (float *)a[4].inputptr );	\
+	 a[4].inputptr += a[4].inputstride;				\
+      }									\
+   }									\
+}
+
+   
+/* Convenience wrappers around EMIT5 for two, three and four
+ * attributes.  The unused slots are filled with insert_null; they sit
+ * behind "if (NR > k)" tests that are false for these NR values.
+ */
+#define EMIT2(F0, F1, NAME) EMIT5(2, F0, F1, insert_null, \
+				  insert_null, insert_null, NAME)
+
+#define EMIT3(F0, F1, F2, NAME) EMIT5(3, F0, F1, F2, insert_null, \
+				      insert_null, NAME)
+   
+#define EMIT4(F0, F1, F2, F3, NAME) EMIT5(4, F0, F1, F2, F3, \
+				          insert_null, NAME)
+   
+
+/* Instantiate the hardwired emit functions.  These are exactly the
+ * combinations that draw_vf_generate_hardwired_emit() tests for.
+ */
+
+/* Two attributes: position + colour. */
+EMIT2(insert_3f_viewport_3, insert_4ub_4f_rgba_4, emit_viewport3_rgba4)
+EMIT2(insert_3f_viewport_3, insert_4ub_4f_bgra_4, emit_viewport3_bgra4)
+EMIT2(insert_3f_3, insert_4ub_4f_rgba_4, emit_xyz3_rgba4)
+
+/* Three attributes: position + colour + one 2-float attribute. */
+EMIT3(insert_4f_viewport_4, insert_4ub_4f_rgba_4, insert_2f_2, emit_viewport4_rgba4_st2)
+EMIT3(insert_4f_viewport_4, insert_4ub_4f_bgra_4, insert_2f_2,  emit_viewport4_bgra4_st2)
+EMIT3(insert_4f_4, insert_4ub_4f_rgba_4, insert_2f_2, emit_xyzw4_rgba4_st2)
+
+/* Four attributes: position + colour + two 2-float attributes. */
+EMIT4(insert_4f_viewport_4, insert_4ub_4f_rgba_4, insert_2f_2, insert_2f_2, emit_viewport4_rgba4_st2_st2)
+EMIT4(insert_4f_viewport_4, insert_4ub_4f_bgra_4, insert_2f_2, insert_2f_2,  emit_viewport4_bgra4_st2_st2)
+EMIT4(insert_4f_4, insert_4ub_4f_rgba_4, insert_2f_2, insert_2f_2, emit_xyzw4_rgba4_st2_st2)
+
+
+/* Use the codegen paths to select one of a number of hardwired
+ * fastpaths.
+ */
+/**
+ * Pick a hardwired emit function for the current attribute layout,
+ * if one exists.
+ *
+ * The choice is made by comparing each attribute's do_insert pointer
+ * against the insert functions the hardwired emitters were built
+ * from.  vf->emit is always written: it receives the matching
+ * function, or NULL when no fastpath fits (including any attribute
+ * count other than 2, 3 or 4).
+ */
+void draw_vf_generate_hardwired_emit( struct draw_vertex_fetch *vf )
+{
+   const struct draw_vf_attr *attr = vf->attr;
+   const unsigned nr = vf->attr_count;
+   draw_vf_emit_func func = NULL;
+
+   if (nr == 2) {
+      /* position + colour */
+      if (attr[0].do_insert == insert_3f_viewport_3) {
+         if (attr[1].do_insert == insert_4ub_4f_bgra_4)
+            func = emit_viewport3_bgra4;
+         else if (attr[1].do_insert == insert_4ub_4f_rgba_4)
+            func = emit_viewport3_rgba4;
+      }
+      else if (attr[0].do_insert == insert_3f_3 &&
+               attr[1].do_insert == insert_4ub_4f_rgba_4) {
+         func = emit_xyz3_rgba4;
+      }
+   }
+   else if (nr == 3 || nr == 4) {
+      /* position + colour + one or two 2-float attributes; the two
+       * cases differ only in the extra attribute and in which
+       * emitter is selected.
+       */
+      const int dual = (nr == 4);
+
+      if (attr[2].do_insert == insert_2f_2 &&
+          (!dual || attr[3].do_insert == insert_2f_2)) {
+         const int viewport = (attr[0].do_insert == insert_4f_viewport_4);
+
+         if (attr[1].do_insert == insert_4ub_4f_rgba_4) {
+            if (viewport)
+               func = dual ? emit_viewport4_rgba4_st2_st2
+                           : emit_viewport4_rgba4_st2;
+            else if (attr[0].do_insert == insert_4f_4)
+               func = dual ? emit_xyzw4_rgba4_st2_st2
+                           : emit_xyzw4_rgba4_st2;
+         }
+         else if (attr[1].do_insert == insert_4ub_4f_bgra_4 && viewport) {
+            func = dual ? emit_viewport4_bgra4_st2_st2
+                        : emit_viewport4_bgra4_st2;
+         }
+      }
+   }
+
+   vf->emit = func;
+}
+
+/***********************************************************************
+ * Generic (non-codegen) functions for whole vertices or groups of
+ * vertices
+ */
+
+/**
+ * Emit 'count' vertices by calling each attribute's do_insert
+ * function in turn.
+ *
+ * \param vf     vertex fetch state; every attribute's inputptr is
+ *               advanced by its inputstride once per vertex, so the
+ *               state is modified
+ * \param count  number of vertices to emit
+ * \param v      destination of the first vertex; successive vertices
+ *               are vf->vertex_stride bytes apart
+ */
+void draw_vf_generic_emit( struct draw_vertex_fetch *vf,
+                           unsigned count,
+                           uint8_t *v )
+{
+   struct draw_vf_attr *const first = vf->attr;
+   struct draw_vf_attr *const end = first + vf->attr_count;
+   const unsigned stride = vf->vertex_stride;
+   struct draw_vf_attr *attr;
+
+   while (count--) {
+      for (attr = first; attr != end; attr++) {
+         /* Take the current source pointer, then step the attribute
+          * on to the next vertex before calling the insert function.
+          */
+         float *in = (float *)attr->inputptr;
+         attr->inputptr += attr->inputstride;
+         attr->do_insert( attr, v + attr->vertoffset, in );
+      }
+      v += stride;
+   }
+}
+
+
diff --git a/src/mesa/pipe/draw/draw_vf_sse.c b/src/mesa/pipe/draw/draw_vf_sse.c
new file mode 100644
index 0000000000..2cf3a45ff9
--- /dev/null
+++ b/src/mesa/pipe/draw/draw_vf_sse.c
@@ -0,0 +1,664 @@
+/*
+ * Copyright 2003 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Keith Whitwell <keithw@tungstengraphics.com>
+ */
+
+#include "glheader.h"
+#include "colormac.h"
+#include "simple_list.h"
+#include "enums.h"
+
+#include "pipe/p_compiler.h"
+
+#include "draw_vf.h"
+
+#if defined(USE_SSE_ASM)
+
+#include "x86/rtasm/x86sse.h"
+#include "x86/common_x86_asm.h"
+
+
+/* Component indices, used as the arguments to SHUF() when building
+ * shufps selectors in this file.
+ */
+#define X    0
+#define Y    1
+#define Z    2
+#define W    3
+
+
+/**
+ * State carried around while generating one SSE emit function.
+ */
+struct x86_program {
+   /* The x86 function being assembled. */
+   struct x86_function func;
+
+   /* Vertex fetch state the code is being specialised for. */
+   struct draw_vertex_fetch *vf;
+
+   /* When set, emit_load3f_3() may use a full 16-byte load and
+    * over-read one dword of the source.
+    */
+   boolean inputs_safe;
+
+   /* When set, emit_store3f() may use a full 16-byte store and write
+    * one extra dword to the destination.
+    */
+   boolean outputs_safe;
+
+   /* Selects the SSE2 float->ubyte pack sequence in
+    * emit_pack_store_4ub(); otherwise the MMX sequence is used.
+    */
+   boolean have_sse2;
+   
+   /* XMM register loaded from vf->identity in build_vertex_emit(). */
+   struct x86_reg identity;
+
+   /* XMM register loaded from vf->chan_scale in build_vertex_emit(). */
+   struct x86_reg chan0;
+};
+
+
+/**
+ * Return the register that holds the identity vector for this
+ * program (p->identity).
+ */
+static struct x86_reg get_identity( struct x86_program *p )
+{
+   const struct x86_reg identity = p->identity;
+
+   return identity;
+}
+
+/**
+ * Load four floats from a four-float source: a single movups from
+ * the source operand into dest.
+ */
+static void emit_load4f_4( struct x86_program *p,
+                           struct x86_reg dest,
+                           struct x86_reg arg0 )
+{
+   struct x86_function *func = &p->func;
+
+   sse_movups(func, dest, arg0);
+}
+
+/**
+ * Load four floats from a three-float source, taking the fourth
+ * component from the identity register.
+ *
+ * No instruction touches memory beyond the third source float: the
+ * third float is fetched with movss and the first two with movlps.
+ */
+static void emit_load4f_3( struct x86_program *p,
+                           struct x86_reg dest,
+                           struct x86_reg arg0 )
+{
+   struct x86_function *func = &p->func;
+
+   /* Have to jump through some hoops.  Contents of dest after each
+    * of the four instructions below (original author's annotation):
+    *
+    * c 0 0 0
+    * c 0 0 1
+    * 0 0 c 1
+    * a b c 1
+    */
+   sse_movss(func, dest, x86_make_disp(arg0, 8));
+   sse_shufps(func, dest, get_identity(p), SHUF(X,Y,Z,W) );
+   sse_shufps(func, dest, dest, SHUF(Y,Z,X,W) );
+   sse_movlps(func, dest, arg0);
+}
+
+/**
+ * Load four floats from a two-float source: dest is first filled
+ * from the identity register, then its low two floats are replaced
+ * from the source with movlps.
+ */
+static void emit_load4f_2( struct x86_program *p,
+                           struct x86_reg dest,
+                           struct x86_reg arg0 )
+{
+   struct x86_function *func = &p->func;
+
+   /* Start from identity ... */
+   sse_movups(func, dest, get_identity(p));
+
+   /* ... then pull in the low two words from the source. */
+   sse_movlps(func, dest, arg0);
+}
+
+/**
+ * Load four floats from a one-float source: movss brings in the low
+ * float and a shufps against the identity register supplies the
+ * rest.
+ */
+static void emit_load4f_1( struct x86_program *p,
+                           struct x86_reg dest,
+                           struct x86_reg arg0 )
+{
+   struct x86_function *func = &p->func;
+
+   /* Low word from the source ... */
+   sse_movss(func, dest, arg0);
+
+   /* ... then swizzle in identity. */
+   sse_shufps(func, dest, get_identity(p), SHUF(X,Y,Z,W) );
+}
+
+
+
+/**
+ * Load three floats from a three-float source.  The contents of the
+ * fourth lane of dest are not meaningful to the caller.
+ *
+ * With p->inputs_safe set a single movups is used; it over-reads the
+ * source by one dword, which is a potential SEGV if the input is a
+ * vertex array.  Otherwise the load is split so that only the three
+ * source floats are read.
+ */
+static void emit_load3f_3( struct x86_program *p,
+                           struct x86_reg dest,
+                           struct x86_reg arg0 )
+{
+   struct x86_function *func = &p->func;
+
+   if (p->inputs_safe) {
+      sse_movups(func, dest, arg0);
+      return;
+   }
+
+   /* Contents of dest after each instruction (original author's
+    * annotation):
+    *
+    * c 0 0 0
+    * c c c c
+    * a b c c
+    */
+   sse_movss(func, dest, x86_make_disp(arg0, 8));
+   sse_shufps(func, dest, dest, SHUF(X,X,X,X));
+   sse_movlps(func, dest, arg0);
+}
+
+/**
+ * Load three floats from a two-float source.  Implemented with the
+ * four-float variant, emit_load4f_2(), which fills the components
+ * the source lacks from the identity register.
+ */
+static void emit_load3f_2( struct x86_program *p,
+                           struct x86_reg dest,
+                           struct x86_reg arg0 )
+{
+   /* Same instruction sequence as the 4f case. */
+   emit_load4f_2(p, dest, arg0);
+}
+
+/**
+ * Load three floats from a one-float source.  Implemented with the
+ * four-float variant, emit_load4f_1().
+ */
+static void emit_load3f_1( struct x86_program *p,
+                           struct x86_reg dest,
+                           struct x86_reg arg0 )
+{
+   /* Same instruction sequence as the 4f case. */
+   emit_load4f_1(p, dest, arg0);
+}
+
+/**
+ * Load two floats from a two-float source with a single movlps.
+ * Nothing is done here to initialise the other half of dest.
+ */
+static void emit_load2f_2( struct x86_program *p,
+                           struct x86_reg dest,
+                           struct x86_reg arg0 )
+{
+   struct x86_function *func = &p->func;
+
+   sse_movlps(func, dest, arg0);
+}
+
+/**
+ * Load two floats from a one-float source.  Implemented with the
+ * four-float variant, emit_load4f_1().
+ */
+static void emit_load2f_1( struct x86_program *p,
+                           struct x86_reg dest,
+                           struct x86_reg arg0 )
+{
+   /* Same instruction sequence as the 4f case. */
+   emit_load4f_1(p, dest, arg0);
+}
+
+/**
+ * Load a single float from the source with movss.
+ */
+static void emit_load1f_1( struct x86_program *p,
+                           struct x86_reg dest,
+                           struct x86_reg arg0 )
+{
+   struct x86_function *func = &p->func;
+
+   sse_movss(func, dest, arg0);
+}
+
+/* Load-emitter dispatch table, used by emit_load() as
+ * load[sz-1][src_sz-1]: the first index is the number of components
+ * wanted minus one, the second the number of components the source
+ * provides minus one.  Where the source has more components than are
+ * wanted, the entry for an exactly-sized source is reused.
+ */
+static void (*load[4][4])( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 ) = {
+   { emit_load1f_1, 
+     emit_load1f_1, 
+     emit_load1f_1, 
+     emit_load1f_1 },
+
+   { emit_load2f_1, 
+     emit_load2f_2, 
+     emit_load2f_2, 
+     emit_load2f_2 },
+
+   { emit_load3f_1, 
+     emit_load3f_2, 
+     emit_load3f_3, 
+     emit_load3f_3 },
+
+   { emit_load4f_1, 
+     emit_load4f_2, 
+     emit_load4f_3, 
+     emit_load4f_4 } 
+};
+
+/**
+ * Emit code that loads 'sz' components into dest from a source that
+ * provides 'src_sz' components.
+ *
+ * Both sizes index the 4x4 load[] table after subtracting one and
+ * are not range-checked here, so callers must pass values in 1..4.
+ */
+static void emit_load( struct x86_program *p,
+                       struct x86_reg dest,
+                       unsigned sz,
+                       struct x86_reg src,
+                       unsigned src_sz)
+{
+   void (*loader)( struct x86_program *,
+                   struct x86_reg,
+                   struct x86_reg ) = load[sz - 1][src_sz - 1];
+
+   loader(p, dest, src);
+}
+
+/**
+ * Store all four floats of arg0 to dest with a single movups.
+ */
+static void emit_store4f( struct x86_program *p,
+                          struct x86_reg dest,
+                          struct x86_reg arg0 )
+{
+   struct x86_function *func = &p->func;
+
+   sse_movups(func, dest, arg0);
+}
+
+/**
+ * Store three floats of arg0 to dest.
+ *
+ * With p->outputs_safe set, a full 16-byte movups is used, so one
+ * extra dword is written after the three floats.  Otherwise two
+ * floats are stored, the register is shuffled and the third float is
+ * stored separately -- which overwrites the contents of arg0.
+ */
+static void emit_store3f( struct x86_program *p,
+                          struct x86_reg dest,
+                          struct x86_reg arg0 )
+{
+   struct x86_function *func = &p->func;
+
+   if (!p->outputs_safe) {
+      /* Emit two, shuffle, emit one. */
+      sse_movlps(func, dest, arg0);
+      sse_shufps(func, arg0, arg0, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
+      sse_movss(func, x86_make_disp(dest,8), arg0);
+      return;
+   }
+
+   /* Emit the extra dword anyway.  This may hurt writecombining,
+    * may cause other problems.
+    */
+   sse_movups(func, dest, arg0);
+}
+
+/**
+ * Store the low two floats of arg0 to dest with movlps.
+ */
+static void emit_store2f( struct x86_program *p,
+                          struct x86_reg dest,
+                          struct x86_reg arg0 )
+{
+   struct x86_function *func = &p->func;
+
+   sse_movlps(func, dest, arg0);
+}
+
+/**
+ * Store the low float of arg0 to dest with movss.
+ */
+static void emit_store1f( struct x86_program *p,
+                          struct x86_reg dest,
+                          struct x86_reg arg0 )
+{
+   struct x86_function *func = &p->func;
+
+   sse_movss(func, dest, arg0);
+}
+
+
+/* Store-emitter dispatch table, used by emit_store() as store[sz-1]
+ * where sz is the number of floats to write (1..4).
+ */
+static void (*store[4])( struct x86_program *p, 
+			 struct x86_reg dest,
+			 struct x86_reg arg0 ) = 
+{
+   emit_store1f, 
+   emit_store2f, 
+   emit_store3f, 
+   emit_store4f 
+};
+
+/**
+ * Emit code that stores 'sz' floats from the register 'temp' to
+ * dest.
+ *
+ * sz indexes the store[] table after subtracting one and is not
+ * range-checked here, so callers must pass a value in 1..4.  For
+ * sz == 3 the contents of temp may be destroyed (see emit_store3f).
+ */
+static void emit_store( struct x86_program *p,
+                        struct x86_reg dest,
+                        unsigned sz,
+                        struct x86_reg temp )
+{
+   void (*storer)( struct x86_program *,
+                   struct x86_reg,
+                   struct x86_reg ) = store[sz - 1];
+
+   storer(p, dest, temp);
+}
+
+/**
+ * Emit code that converts the four floats in 'temp' to four unsigned
+ * bytes and writes them as one dword to dest.
+ *
+ * temp is first multiplied by p->chan0 (the original comment calls
+ * this "scale by 255.0"; the register is loaded from vf->chan_scale
+ * by build_vertex_emit).  The conversion then uses SSE2 integer
+ * instructions when p->have_sse2 is set, and MMX registers 0 and 1
+ * otherwise.  temp is clobbered in both cases.
+ */
+static void emit_pack_store_4ub( struct x86_program *p,
+                                 struct x86_reg dest,
+                                 struct x86_reg temp )
+{
+   struct x86_function *func = &p->func;
+
+   /* Scale by 255.0
+    */
+   sse_mulps(func, temp, p->chan0);
+
+   if (!p->have_sse2) {
+      /* Convert two floats at a time through the MMX registers. */
+      struct x86_reg lo = x86_make_reg(file_MMX, 0);
+      struct x86_reg hi = x86_make_reg(file_MMX, 1);
+
+      sse_cvtps2pi(func, lo, temp);
+      sse_movhlps(func, temp, temp);
+      sse_cvtps2pi(func, hi, temp);
+      mmx_packssdw(func, lo, hi);
+      mmx_packuswb(func, lo, lo);
+      mmx_movd(func, dest, lo);
+      return;
+   }
+
+   /* SSE2: convert and pack entirely within the XMM register. */
+   sse2_cvtps2dq(func, temp, temp);
+   sse2_packssdw(func, temp, temp);
+   sse2_packuswb(func, temp, temp);
+   sse_movss(func, dest, temp);
+}
+
+/**
+ * Byte distance from address 'a' to address 'b' (that is, b - a),
+ * converted to int.  Used to turn the address of a struct member
+ * into a displacement from the struct's base.
+ */
+static int get_offset( const void *a, const void *b )
+{
+   const char *from = (const char *)a;
+   const char *to = (const char *)b;
+
+   return to - from;
+}
+
+/**
+ * Emit code that loads the attribute's current input pointer
+ * (a->inputptr) into srcREG.
+ *
+ * The pointer is addressed relative to vfREG, using the offset of
+ * a->inputptr from the start of p->vf, so vfREG must hold the vf
+ * pointer when the generated code runs.
+ *
+ * Not much happens here.  Eventually use this function to try and
+ * avoid saving/reloading the source pointers each vertex (if some of
+ * them can fit in registers).
+ */
+static void get_src_ptr( struct x86_program *p,
+                         struct x86_reg srcREG,
+                         struct x86_reg vfREG,
+                         struct draw_vf_attr *a )
+{
+   const int disp = get_offset(p->vf, &a->inputptr);
+   struct x86_reg slot = x86_make_disp(vfREG, disp);
+
+   /* Load current a[j].inputptr
+    */
+   x86_mov(&p->func, srcREG, slot);
+}
+
+/**
+ * Emit code that advances the attribute's input pointer to the next
+ * vertex and writes it back to a->inputptr.
+ *
+ * srcREG must still hold the pointer loaded by get_src_ptr().
+ * Nothing is emitted for an attribute whose inputstride is zero.
+ * The stride is baked into the generated code as a constant.
+ */
+static void update_src_ptr( struct x86_program *p,
+                            struct x86_reg srcREG,
+                            struct x86_reg vfREG,
+                            struct draw_vf_attr *a )
+{
+   struct x86_reg slot;
+
+   if (!a->inputstride)
+      return;
+
+   slot = x86_make_disp(vfREG, get_offset(p->vf, &a->inputptr));
+
+   /* add a[j].inputstride (hardcoded value - could just as easily
+    * pull the stride value from memory each time).
+    */
+   x86_lea(&p->func, srcREG, x86_make_disp(srcREG, a->inputstride));
+
+   /* save new value of a[j].inputptr
+    */
+   x86_mov(&p->func, slot, srcREG);
+}
+
+
+/**
+ * Generate x86/SSE code for an emit function specialised for the
+ * attribute layout currently held in p->vf.
+ *
+ * Lots of hardcoding.  Register use in the generated code:
+ *
+ *   EAX  -- pointer to current output vertex
+ *   ECX  -- source pointer of the attribute being processed
+ *   EBP  -- remaining vertex count
+ *   ESI  -- the vf pointer (function argument 1)
+ *   XMM0 -- scratch
+ *   XMM1, XMM2 -- loaded from &vf->vp[0] and &vf->vp[4] when
+ *                 vf->allow_viewport_emits is set
+ *   p->chan0, p->identity -- loaded from vf->chan_scale and
+ *                 vf->identity
+ *
+ * \return TRUE after setting vf->emit to the generated function, or
+ *         FALSE if some attribute cannot be handled.  On failure
+ *         vf->emit is left untouched and p->func holds a partial
+ *         function which the caller (draw_vf_generate_sse_emit)
+ *         releases.
+ */
+static boolean build_vertex_emit( struct x86_program *p )
+{
+   struct draw_vertex_fetch *vf = p->vf;
+   unsigned j = 0;
+
+   struct x86_reg vertexEAX = x86_make_reg(file_REG32, reg_AX);
+   struct x86_reg srcECX = x86_make_reg(file_REG32, reg_CX);
+   struct x86_reg countEBP = x86_make_reg(file_REG32, reg_BP);
+   struct x86_reg vfESI = x86_make_reg(file_REG32, reg_SI);
+   struct x86_reg temp = x86_make_reg(file_XMM, 0);
+   struct x86_reg vp0 = x86_make_reg(file_XMM, 1);
+   struct x86_reg vp1 = x86_make_reg(file_XMM, 2);
+   uint8_t *fixup, *label;
+
+   /* Push a few regs?
+    */
+   x86_push(&p->func, countEBP);
+   x86_push(&p->func, vfESI);
+
+
+   /* Get vertex count, compare to zero; a zero count jumps straight
+    * to the register pops at the end.
+    */
+   x86_xor(&p->func, srcECX, srcECX);
+   x86_mov(&p->func, countEBP, x86_fn_arg(&p->func, 2));
+   x86_cmp(&p->func, countEBP, srcECX);
+   fixup = x86_jcc_forward(&p->func, cc_E);
+
+   /* Initialize destination register.
+    */
+   x86_mov(&p->func, vertexEAX, x86_fn_arg(&p->func, 3));
+
+   /* Move argument 1 (vf) into a reg:
+    */
+   x86_mov(&p->func, vfESI, x86_fn_arg(&p->func, 1));
+
+
+   /* Possibly load vp0, vp1 for viewport calcs:
+    */
+   if (vf->allow_viewport_emits) {
+      sse_movups(&p->func, vp0, x86_make_disp(vfESI, get_offset(vf, &vf->vp[0])));
+      sse_movups(&p->func, vp1, x86_make_disp(vfESI, get_offset(vf, &vf->vp[4])));
+   }
+
+   /* always load, needed or not:
+    */
+   sse_movups(&p->func, p->chan0, x86_make_disp(vfESI, get_offset(vf, &vf->chan_scale[0])));
+   sse_movups(&p->func, p->identity, x86_make_disp(vfESI, get_offset(vf, &vf->identity[0])));
+
+   /* Note address for loop jump */
+   label = x86_get_label(&p->func);
+
+   /* Emit code for each of the attributes.  Currently routes
+    * everything through SSE registers, even when it might be more
+    * efficient to stick with regular old x86.  No optimization or
+    * other tricks - enough new ground to cover here just getting
+    * things working.
+    */
+   while (j < vf->attr_count) {
+      struct draw_vf_attr *a = &vf->attr[j];
+      struct x86_reg dest = x86_make_disp(vertexEAX, a->vertoffset);
+
+      /* Now, load an XMM reg from src, perhaps transform, then save.
+       * Could be shortcircuited in specific cases:
+       */
+      switch (a->format) {
+      case EMIT_1F:
+         get_src_ptr(p, srcECX, vfESI, a);
+         emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
+         emit_store(p, dest, 1, temp);
+         update_src_ptr(p, srcECX, vfESI, a);
+         break;
+      case EMIT_2F:
+         get_src_ptr(p, srcECX, vfESI, a);
+         emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
+         emit_store(p, dest, 2, temp);
+         update_src_ptr(p, srcECX, vfESI, a);
+         break;
+      case EMIT_3F:
+         /* Potentially the worst case - hardcode 2+1 copying.  The
+          * "if (0)" branch is the straightforward 3-float load/store,
+          * kept for reference but compiled out.
+          */
+         if (0) {
+            get_src_ptr(p, srcECX, vfESI, a);
+            emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
+            emit_store(p, dest, 3, temp);
+            update_src_ptr(p, srcECX, vfESI, a);
+         }
+         else {
+            get_src_ptr(p, srcECX, vfESI, a);
+            emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
+            emit_store(p, dest, 2, temp);
+            if (a->inputsize > 2) {
+               emit_load(p, temp, 1, x86_make_disp(srcECX, 8), 1);
+               emit_store(p, x86_make_disp(dest,8), 1, temp);
+            }
+            else {
+               /* No third source float: store the low float of the
+                * identity register instead.
+                */
+               sse_movss(&p->func, x86_make_disp(dest,8), get_identity(p));
+            }
+            update_src_ptr(p, srcECX, vfESI, a);
+         }
+         break;
+      case EMIT_4F:
+         get_src_ptr(p, srcECX, vfESI, a);
+         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+         emit_store(p, dest, 4, temp);
+         update_src_ptr(p, srcECX, vfESI, a);
+         break;
+      case EMIT_2F_VIEWPORT:
+         get_src_ptr(p, srcECX, vfESI, a);
+         emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
+         sse_mulps(&p->func, temp, vp0);
+         sse_addps(&p->func, temp, vp1);
+         emit_store(p, dest, 2, temp);
+         update_src_ptr(p, srcECX, vfESI, a);
+         break;
+      case EMIT_3F_VIEWPORT:
+         get_src_ptr(p, srcECX, vfESI, a);
+         emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
+         sse_mulps(&p->func, temp, vp0);
+         sse_addps(&p->func, temp, vp1);
+         emit_store(p, dest, 3, temp);
+         update_src_ptr(p, srcECX, vfESI, a);
+         break;
+      case EMIT_4F_VIEWPORT:
+         get_src_ptr(p, srcECX, vfESI, a);
+         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+         sse_mulps(&p->func, temp, vp0);
+         sse_addps(&p->func, temp, vp1);
+         emit_store(p, dest, 4, temp);
+         update_src_ptr(p, srcECX, vfESI, a);
+         break;
+      case EMIT_3F_XYW:
+         get_src_ptr(p, srcECX, vfESI, a);
+         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+         sse_shufps(&p->func, temp, temp, SHUF(X,Y,W,Z));
+         emit_store(p, dest, 3, temp);
+         update_src_ptr(p, srcECX, vfESI, a);
+         break;
+
+      case EMIT_1UB_1F:
+         /* Test for PAD3 + 1UB: the byte is written with a 4-byte
+          * store starting 3 bytes before it, so the previous
+          * attribute must end at least 3 bytes earlier.  The test is
+          * written as an addition so that it cannot wrap around when
+          * a->vertoffset is smaller than 3 and the offsets are
+          * unsigned.
+          */
+         if (j > 0 &&
+             a[-1].vertoffset + a[-1].vertattrsize + 3 <= a->vertoffset)
+         {
+            get_src_ptr(p, srcECX, vfESI, a);
+            emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
+            sse_shufps(&p->func, temp, temp, SHUF(X,X,X,X));
+            emit_pack_store_4ub(p, x86_make_disp(dest, -3), temp); /* overkill! */
+            update_src_ptr(p, srcECX, vfESI, a);
+         }
+         else {
+            /* a[-1] only exists when this is not the first attribute;
+             * reading it for j == 0 would index before vf->attr[0].
+             */
+            if (j > 0)
+               _mesa_printf("Can't emit 1ub %x %x %d\n", a->vertoffset, a[-1].vertoffset, a[-1].vertattrsize );
+            else
+               _mesa_printf("Can't emit 1ub %x\n", a->vertoffset);
+            return FALSE;
+         }
+         break;
+      case EMIT_3UB_3F_RGB:
+      case EMIT_3UB_3F_BGR:
+         /* NOTE: this case always ends in "return FALSE" below, so
+          * 3UB formats currently make code generation fail no matter
+          * which branch was taken; the code emitted here is discarded
+          * together with the rest of the function.
+          */
+
+         /* Test for 3UB + PAD1:
+          */
+         if (j == vf->attr_count - 1 ||
+             a[1].vertoffset >= a->vertoffset + 4) {
+            get_src_ptr(p, srcECX, vfESI, a);
+            emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
+            if (a->format == EMIT_3UB_3F_BGR)
+               sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
+            emit_pack_store_4ub(p, dest, temp);
+            update_src_ptr(p, srcECX, vfESI, a);
+         }
+         /* Test for 3UB + 1UB:
+          */
+         else if (j < vf->attr_count - 1 &&
+                  a[1].format == EMIT_1UB_1F &&
+                  a[1].vertoffset == a->vertoffset + 3) {
+            get_src_ptr(p, srcECX, vfESI, a);
+            emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
+            update_src_ptr(p, srcECX, vfESI, a);
+
+            /* Make room for incoming value:
+             */
+            sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
+
+            get_src_ptr(p, srcECX, vfESI, &a[1]);
+            emit_load(p, temp, 1, x86_deref(srcECX), a[1].inputsize);
+            update_src_ptr(p, srcECX, vfESI, &a[1]);
+
+            /* Rearrange and possibly do BGR conversion:
+             */
+            if (a->format == EMIT_3UB_3F_BGR)
+               sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
+            else
+               sse_shufps(&p->func, temp, temp, SHUF(Y,Z,W,X));
+
+            emit_pack_store_4ub(p, dest, temp);
+            j++;		/* NOTE: two attrs consumed */
+         }
+         else {
+            _mesa_printf("Can't emit 3ub\n");
+         }
+         return FALSE;	/* add this later */
+         break;
+
+      case EMIT_4UB_4F_RGBA:
+         get_src_ptr(p, srcECX, vfESI, a);
+         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+         emit_pack_store_4ub(p, dest, temp);
+         update_src_ptr(p, srcECX, vfESI, a);
+         break;
+      case EMIT_4UB_4F_BGRA:
+         get_src_ptr(p, srcECX, vfESI, a);
+         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+         sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
+         emit_pack_store_4ub(p, dest, temp);
+         update_src_ptr(p, srcECX, vfESI, a);
+         break;
+      case EMIT_4UB_4F_ARGB:
+         get_src_ptr(p, srcECX, vfESI, a);
+         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+         sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
+         emit_pack_store_4ub(p, dest, temp);
+         update_src_ptr(p, srcECX, vfESI, a);
+         break;
+      case EMIT_4UB_4F_ABGR:
+         get_src_ptr(p, srcECX, vfESI, a);
+         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+         sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
+         emit_pack_store_4ub(p, dest, temp);
+         update_src_ptr(p, srcECX, vfESI, a);
+         break;
+      case EMIT_4CHAN_4F_RGBA:
+         /* The stored representation depends on the build-time
+          * channel type.
+          */
+         switch (CHAN_TYPE) {
+         case GL_UNSIGNED_BYTE:
+            get_src_ptr(p, srcECX, vfESI, a);
+            emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+            emit_pack_store_4ub(p, dest, temp);
+            update_src_ptr(p, srcECX, vfESI, a);
+            break;
+         case GL_FLOAT:
+            get_src_ptr(p, srcECX, vfESI, a);
+            emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+            emit_store(p, dest, 4, temp);
+            update_src_ptr(p, srcECX, vfESI, a);
+            break;
+         case GL_UNSIGNED_SHORT:
+         default:
+            _mesa_printf("unknown CHAN_TYPE %s\n", _mesa_lookup_enum_by_nr(CHAN_TYPE));
+            return FALSE;
+         }
+         break;
+      default:
+         _mesa_printf("unknown a[%d].format %d\n", j, a->format);
+         return FALSE;	/* catch any new opcodes */
+      }
+
+      /* Increment j by at least 1 - may have been incremented above also:
+       */
+      j++;
+   }
+
+   /* Next vertex:
+    */
+   x86_lea(&p->func, vertexEAX, x86_make_disp(vertexEAX, vf->vertex_stride));
+
+   /* decr count, loop if not zero
+    */
+   x86_dec(&p->func, countEBP);
+   x86_test(&p->func, countEBP, countEBP);
+   x86_jcc(&p->func, cc_NZ, label);
+
+   /* Exit mmx state?
+    */
+   if (p->func.need_emms)
+      mmx_emms(&p->func);
+
+   /* Land forward jump here:
+    */
+   x86_fixup_fwd_jump(&p->func, fixup);
+
+   /* Pop regs and return
+    */
+   x86_pop(&p->func, x86_get_base_reg(vfESI));
+   x86_pop(&p->func, countEBP);
+   x86_ret(&p->func);
+
+   vf->emit = (draw_vf_emit_func)x86_get_func(&p->func);
+   return TRUE;
+}
+
+
+
+/**
+ * Try to build an SSE emit function for the current state of vf.
+ *
+ * Without SSE support (cpu_has_xmm false) vf->codegen_emit is set to
+ * NULL and nothing else happens.  Otherwise the outcome of code
+ * generation -- success or failure -- is recorded through
+ * draw_vf_register_fastpath(), and on failure the partially built
+ * function is released.
+ */
+void draw_vf_generate_sse_emit( struct draw_vertex_fetch *vf )
+{
+   struct x86_program p;
+   boolean built;
+
+   if (!cpu_has_xmm) {
+      vf->codegen_emit = NULL;
+      return;
+   }
+
+   _mesa_memset(&p, 0, sizeof(p));
+
+   p.vf = vf;
+   p.inputs_safe = 0;		/* for now */
+   p.outputs_safe = 1;		/* for now */
+   p.have_sse2 = cpu_has_xmm2;
+   p.identity = x86_make_reg(file_XMM, 6);
+   p.chan0 = x86_make_reg(file_XMM, 7);
+
+   x86_init_func(&p.func);
+
+   built = build_vertex_emit(&p);
+
+   /* A failure is registered too, so that we don't keep trying to
+    * codegen an impossible state.
+    */
+   draw_vf_register_fastpath( vf, built );
+
+   if (!built)
+      x86_release_func(&p.func);
+}
+
+#else
+
+/**
+ * Dummy version for when USE_SSE_ASM is not defined: no code is
+ * generated and vf is left untouched.
+ */
+void draw_vf_generate_sse_emit( struct draw_vertex_fetch *vf )
+{
+   (void) vf;
+}
+
+#endif
diff --git a/src/mesa/sources b/src/mesa/sources
index 97ef7e1936..e31d8cc466 100644
--- a/src/mesa/sources
+++ b/src/mesa/sources
@@ -175,6 +175,9 @@ DRAW_SOURCES = \
 	pipe/draw/draw_vertex_fetch.c \
 	pipe/draw/draw_vertex_shader.c \
 	pipe/draw/draw_vertex_shader_llvm.c \
+	pipe/draw/draw_vf.c \
+	pipe/draw/draw_vf_generic.c \
+	pipe/draw/draw_vf_sse.c \
 	pipe/draw/draw_wide_prims.c
 
 TGSIEXEC_SOURCES = \
-- 
cgit v1.2.3


From 09059259bed779360158664625e41a67f7496a74 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Mon, 28 Jan 2008 18:46:21 +0900
Subject: First stab at hooking draw_vbuf & vf.

Emit disabled for now. Tested with softpipe. Only one vertex at a time for now (slow).
---
 src/mesa/pipe/draw/draw_vbuf.c       | 183 ++++++++++++++++++++++++++++++++++-
 src/mesa/pipe/draw/draw_vf.c         |  18 +++-
 src/mesa/pipe/draw/draw_vf.h         |  46 +++++----
 src/mesa/pipe/draw/draw_vf_generic.c |   2 +-
 src/mesa/pipe/draw/draw_vf_sse.c     |  38 ++++----
 5 files changed, 241 insertions(+), 46 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/draw/draw_vbuf.c b/src/mesa/pipe/draw/draw_vbuf.c
index 1e260c6156..a3d0b5bca3 100644
--- a/src/mesa/pipe/draw/draw_vbuf.c
+++ b/src/mesa/pipe/draw/draw_vbuf.c
@@ -35,12 +35,15 @@
 
 
 #include <assert.h>
+#include <stddef.h>
 
-#include "pipe/draw/draw_vbuf.h"
-#include "pipe/draw/draw_private.h"
-#include "pipe/draw/draw_vertex.h"
 #include "pipe/p_util.h"
 
+#include "draw_vbuf.h"
+#include "draw_private.h"
+#include "draw_vertex.h"
+#include "draw_vf.h"
+
 
 /**
  * Vertex buffer emit stage.
@@ -55,6 +58,8 @@ struct vbuf_stage {
    /** Vertex size in bytes */
    unsigned vertex_size;
 
+   struct draw_vertex_fetch *vf;
+   
    /* FIXME: we have no guarantee that 'unsigned' is 32bit */
 
    /** Vertices in hardware format */
@@ -121,6 +126,7 @@ static INLINE void
 emit_vertex( struct vbuf_stage *vbuf,
              struct vertex_header *vertex )
 {
+#if 0
    const struct vertex_info *vinfo = vbuf->vinfo;
 
    uint i;
@@ -151,9 +157,11 @@ emit_vertex( struct vbuf_stage *vbuf,
       case EMIT_ALL:
          /* just copy the whole vertex as-is to the vbuf */
          assert(i == 0);
+         assert(j == 0);
          memcpy(vbuf->vertex_ptr, vertex, vinfo->size * 4);
          vbuf->vertex_ptr += vinfo->size;
-         return;
+         count += vinfo->size;
+         break;
       case EMIT_1F:
          *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
          count++;
@@ -192,6 +200,156 @@ emit_vertex( struct vbuf_stage *vbuf,
       }
    }
    assert(count == vinfo->size);
+#else
+   if(vertex->vertex_id != UNDEFINED_VERTEX_ID) {
+      if(vertex->vertex_id < vbuf->nr_vertices)
+	 return;
+      else
+	 fprintf(stderr, "Bad vertex id 0x%04x (>= 0x%04x)\n", 
+	         vertex->vertex_id, vbuf->nr_vertices);
+      return;
+   }
+      
+   vertex->vertex_id = vbuf->nr_vertices++;
+
+   draw_vf_set_data(vbuf->vf, vertex->data);
+   draw_vf_emit_vertices(vbuf->vf, 1, vbuf->vertex_ptr);
+
+   vbuf->vertex_ptr += vbuf->vertex_size/4;
+#endif
+}
+
+
+/**
+ * Translate vbuf->vinfo into a draw_vf attribute map and hand it, together
+ * with vbuf->vertex_size, to draw_vf_set_vertex_attributes().
+ *
+ * 'count' is the number of dwords described so far (checked against
+ * vinfo->size at the end); 'nr_attrs' is the number of attrs[] slots used.
+ */
+static void
+vbuf_set_vf_attributes(struct vbuf_stage *vbuf )
+{
+   const struct vertex_info *vinfo = vbuf->vinfo;
+   struct draw_vf_attr_map attrs[PIPE_MAX_SHADER_INPUTS];
+   uint i;
+   uint count = 0;  /* for debug/sanity */
+   unsigned nr_attrs = 0;
+
+   for (i = 0; i < vinfo->num_attribs; i++) {
+      uint j = vinfo->src_index[i];
+      switch (vinfo->emit[i]) {
+      case EMIT_OMIT:
+         /* no-op */
+         break;
+      case EMIT_ALL: {
+         /* just copy the whole vertex as-is to the vbuf */
+         unsigned k, s = vinfo->size;
+         assert(i == 0);
+         assert(j == 0);
+         /* copy the vertex header */
+         /* XXX: we actually don't copy the header, just pad it */
+         attrs[nr_attrs].attrib = 0;
+         attrs[nr_attrs].format = DRAW_EMIT_PAD;
+         attrs[nr_attrs].offset = offsetof(struct vertex_header, data);
+         s -= offsetof(struct vertex_header, data)/4;
+         count += offsetof(struct vertex_header, data)/4;
+         nr_attrs++;
+         /* one more slot is needed per (partial) group of four floats */
+         assert(nr_attrs + (s + 3)/4 <= PIPE_MAX_SHADER_INPUTS);
+         /* copy the vertex data */
+         for(k = 0; k < (s & ~0x3); k += 4) {
+            attrs[nr_attrs].attrib = k/4;
+            attrs[nr_attrs].format = DRAW_EMIT_4F;
+            attrs[nr_attrs].offset = 0;
+            nr_attrs++;
+            count += 4;
+         }
+         /* tail */
+         /* XXX: actually, this shouldn't be needed */
+         if (s & 0x3) {
+            /* only touch the next slot when there really is a tail */
+            attrs[nr_attrs].attrib = k/4;
+            attrs[nr_attrs].offset = 0;
+         }
+         switch(s & 0x3) {
+         case 0:
+            break;
+         case 1:
+            attrs[nr_attrs].format = DRAW_EMIT_1F;
+            nr_attrs++;
+            count += 1;
+            break;
+         case 2:
+            attrs[nr_attrs].format = DRAW_EMIT_2F;
+            nr_attrs++;
+            count += 2;
+            break;
+         case 3:
+            attrs[nr_attrs].format = DRAW_EMIT_3F;
+            nr_attrs++;
+            count += 3;
+            break;
+         }
+         break;
+      }
+      case EMIT_1F:
+         attrs[nr_attrs].attrib = j;
+         attrs[nr_attrs].format = DRAW_EMIT_1F;
+         attrs[nr_attrs].offset = 0;
+         nr_attrs++;
+         count++;
+         break;
+      case EMIT_1F_PSIZE:
+         /* FIXME: point size is not emitted yet.  If the assert is compiled
+          * out, reserve the dword so later attributes keep their offsets. */
+         assert(0);
+         attrs[nr_attrs].attrib = j;
+         attrs[nr_attrs].format = DRAW_EMIT_PAD;
+         attrs[nr_attrs].offset = sizeof(float);
+         nr_attrs++;
+         count++;
+         break;
+      case EMIT_2F:
+         attrs[nr_attrs].attrib = j;
+         attrs[nr_attrs].format = DRAW_EMIT_2F;
+         attrs[nr_attrs].offset = 0;
+         nr_attrs++;
+         count += 2;
+         break;
+      case EMIT_3F:
+         attrs[nr_attrs].attrib = j;
+         attrs[nr_attrs].format = DRAW_EMIT_3F;
+         attrs[nr_attrs].offset = 0;
+         nr_attrs++;
+         count += 3;
+         break;
+      case EMIT_4F:
+         attrs[nr_attrs].attrib = j;
+         attrs[nr_attrs].format = DRAW_EMIT_4F;
+         attrs[nr_attrs].offset = 0;
+         nr_attrs++;
+         count += 4;
+         break;
+      case EMIT_4UB:
+         attrs[nr_attrs].attrib = j;
+         attrs[nr_attrs].format = DRAW_EMIT_4UB_4F_BGRA;
+         attrs[nr_attrs].offset = 0;
+         nr_attrs++;
+         count += 1;
+         break;
+      default:
+         assert(0);
+      }
+   }
+
+   assert(nr_attrs <= PIPE_MAX_SHADER_INPUTS);
+   assert(count == vinfo->size);
+
+   draw_vf_set_vertex_attributes(vbuf->vf,
+                                 attrs,
+                                 nr_attrs,
+                                 vbuf->vertex_size);
 }
 
 
@@ -269,6 +427,7 @@ vbuf_set_prim( struct vbuf_stage *vbuf, uint newprim )
 
    vbuf->vinfo = vinfo;
    vbuf->vertex_size = vertex_size;
+   vbuf_set_vf_attributes(vbuf);
    
    if (!vbuf->vertices)
       vbuf_alloc_vertices(vbuf);
@@ -423,7 +582,12 @@ static void vbuf_destroy( struct draw_stage *stage )
 {
    struct vbuf_stage *vbuf = vbuf_stage( stage );
 
-   align_free( vbuf->indices );
+   if(vbuf->indices)
+      align_free( vbuf->indices );
+   
+   if(vbuf->vf)
+      draw_vf_destroy( vbuf->vf );
+
    FREE( stage );
 }
 
@@ -436,6 +600,9 @@ struct draw_stage *draw_vbuf_stage( struct draw_context *draw,
 {
    struct vbuf_stage *vbuf = CALLOC_STRUCT(vbuf_stage);
 
+   if(!vbuf)
+      return NULL;
+   
    vbuf->stage.draw = draw;
    vbuf->stage.point = vbuf_first_point;
    vbuf->stage.line = vbuf_first_line;
@@ -450,11 +617,17 @@ struct draw_stage *draw_vbuf_stage( struct draw_context *draw,
    vbuf->max_indices = render->max_indices;
    vbuf->indices = (ushort *)
       align_malloc( vbuf->max_indices * sizeof(vbuf->indices[0]), 16 );
+   if(!vbuf->indices)
+      vbuf_destroy(&vbuf->stage);
    
    vbuf->vertices = NULL;
    vbuf->vertex_ptr = vbuf->vertices;
 
    vbuf->prim = ~0;
    
+   vbuf->vf = draw_vf_create(FALSE);
+   if(!vbuf->vf)
+      vbuf_destroy(&vbuf->stage);
+   
    return &vbuf->stage;
 }
diff --git a/src/mesa/pipe/draw/draw_vf.c b/src/mesa/pipe/draw/draw_vf.c
index f758460b5f..675974c6bc 100644
--- a/src/mesa/pipe/draw/draw_vf.c
+++ b/src/mesa/pipe/draw/draw_vf.c
@@ -162,7 +162,7 @@ unsigned draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
 
    for (j = 0, i = 0; i < nr; i++) {
       const unsigned format = map[i].format;
-      if (format == EMIT_PAD) {
+      if (format == DRAW_EMIT_PAD) {
 	 if (DBG)
 	    _mesa_printf("%d: pad %d, offset %d\n", i,  
 			 map[i].offset, offset);  
@@ -261,6 +261,22 @@ void draw_vf_set_sources( struct draw_vertex_fetch *vf,
 }
 
 
+/* Point every attribute at its float[4] slot in 'data' (a single vertex):
+ */
+void draw_vf_set_data( struct draw_vertex_fetch *vf,
+                       float data[][4])
+{
+   struct draw_vf_attr *attr = vf->attr;
+   struct draw_vf_attr *const end = attr + vf->attr_count;
+
+   for (; attr != end; attr++) {
+      attr->inputptr = (uint8_t *)data[attr->attrib];
+      attr->inputsize = 4;
+      attr->inputstride = 0; /* XXX: one-vertex-max ATM */
+      attr->do_insert = attr->insert[3];   /* insert func for 4 inputs */
+   }
+}
+
 
 /* Emit count VB vertices to dest.  
  */
diff --git a/src/mesa/pipe/draw/draw_vf.h b/src/mesa/pipe/draw/draw_vf.h
index 279570aad5..7619c0ee27 100644
--- a/src/mesa/pipe/draw/draw_vf.h
+++ b/src/mesa/pipe/draw/draw_vf.h
@@ -29,9 +29,11 @@
 #define DRAW_VF_H
 
 
-#include "pipe/p_compiler.h"
 #include "math/m_vector.h"
 
+#include "pipe/p_compiler.h"
+#include "draw_vertex.h"
+
 
 enum {
    DRAW_VF_ATTRIB_POS = 0,
@@ -67,24 +69,24 @@ enum {
 };
 
 enum draw_vf_attr_format {
-   EMIT_1F,
-   EMIT_2F,
-   EMIT_3F,
-   EMIT_4F,
-   EMIT_2F_VIEWPORT,		/**< do viewport transform and emit */
-   EMIT_3F_VIEWPORT,		/**< do viewport transform and emit */
-   EMIT_4F_VIEWPORT,		/**< do viewport transform and emit */
-   EMIT_3F_XYW,			/**< for projective texture */
-   EMIT_1UB_1F,			/**< for fog coordinate */
-   EMIT_3UB_3F_RGB,		/**< for specular color */
-   EMIT_3UB_3F_BGR,		/**< for specular color */
-   EMIT_4UB_4F_RGBA,		/**< for color */
-   EMIT_4UB_4F_BGRA,		/**< for color */
-   EMIT_4UB_4F_ARGB,		/**< for color */
-   EMIT_4UB_4F_ABGR,		/**< for color */
-   EMIT_4CHAN_4F_RGBA,		/**< for swrast color */
-   EMIT_PAD,			/**< leave a hole of 'offset' bytes */
-   EMIT_MAX
+   DRAW_EMIT_1F,
+   DRAW_EMIT_2F,
+   DRAW_EMIT_3F,
+   DRAW_EMIT_4F,
+   DRAW_EMIT_2F_VIEWPORT,		/**< do viewport transform and emit */
+   DRAW_EMIT_3F_VIEWPORT,		/**< do viewport transform and emit */
+   DRAW_EMIT_4F_VIEWPORT,		/**< do viewport transform and emit */
+   DRAW_EMIT_3F_XYW,			/**< for projective texture */
+   DRAW_EMIT_1UB_1F,			/**< for fog coordinate */
+   DRAW_EMIT_3UB_3F_RGB,		/**< for specular color */
+   DRAW_EMIT_3UB_3F_BGR,		/**< for specular color */
+   DRAW_EMIT_4UB_4F_RGBA,		/**< for color */
+   DRAW_EMIT_4UB_4F_BGRA,		/**< for color */
+   DRAW_EMIT_4UB_4F_ARGB,		/**< for color */
+   DRAW_EMIT_4UB_4F_ABGR,		/**< for color */
+   DRAW_EMIT_4CHAN_4F_RGBA,		/**< for swrast color */
+   DRAW_EMIT_PAD,			/**< leave a hole of 'offset' bytes */
+   DRAW_EMIT_MAX
 };
 
 struct draw_vf_attr_map {
@@ -116,6 +118,10 @@ draw_vf_set_sources( struct draw_vertex_fetch *vf,
 		     GLvector4f * const attrib[],
 		     unsigned start ); 
 
+void 
+draw_vf_set_data( struct draw_vertex_fetch *vf,
+                  float data[][4]);
+
 void 
 draw_vf_emit_vertices( struct draw_vertex_fetch *vf,
 		       unsigned count,
@@ -243,7 +249,7 @@ struct draw_vf_format_info {
    const unsigned attrsize;
 };
 
-const struct draw_vf_format_info draw_vf_format_info[EMIT_MAX];
+extern const struct draw_vf_format_info draw_vf_format_info[DRAW_EMIT_MAX];  /* defined in draw_vf_generic.c */
 
 
 #endif
diff --git a/src/mesa/pipe/draw/draw_vf_generic.c b/src/mesa/pipe/draw/draw_vf_generic.c
index 19e6c587e5..42effc0c65 100644
--- a/src/mesa/pipe/draw/draw_vf_generic.c
+++ b/src/mesa/pipe/draw/draw_vf_generic.c
@@ -735,7 +735,7 @@ static void extract_1ub_1f( const struct draw_vf_attr *a, float *out, const uint
 }
 
 
-const struct draw_vf_format_info draw_vf_format_info[EMIT_MAX] = 
+const struct draw_vf_format_info draw_vf_format_info[DRAW_EMIT_MAX] = 
 {
    { "1f",
      extract_1f,
diff --git a/src/mesa/pipe/draw/draw_vf_sse.c b/src/mesa/pipe/draw/draw_vf_sse.c
index 2cf3a45ff9..a7019a47e6 100644
--- a/src/mesa/pipe/draw/draw_vf_sse.c
+++ b/src/mesa/pipe/draw/draw_vf_sse.c
@@ -398,19 +398,19 @@ static boolean build_vertex_emit( struct x86_program *p )
        * Could be shortcircuited in specific cases:
        */
       switch (a->format) {
-      case EMIT_1F:
+      case DRAW_EMIT_1F:
 	 get_src_ptr(p, srcECX, vfESI, a);
 	 emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
 	 emit_store(p, dest, 1, temp);
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
-      case EMIT_2F:
+      case DRAW_EMIT_2F:
 	 get_src_ptr(p, srcECX, vfESI, a);
 	 emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
 	 emit_store(p, dest, 2, temp);
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
-      case EMIT_3F:
+      case DRAW_EMIT_3F:
 	 /* Potentially the worst case - hardcode 2+1 copying:
 	  */
 	 if (0) {
@@ -433,13 +433,13 @@ static boolean build_vertex_emit( struct x86_program *p )
 	    update_src_ptr(p, srcECX, vfESI, a);
 	 }
 	 break;
-      case EMIT_4F:
+      case DRAW_EMIT_4F:
 	 get_src_ptr(p, srcECX, vfESI, a);
 	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
 	 emit_store(p, dest, 4, temp);
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
-      case EMIT_2F_VIEWPORT: 
+      case DRAW_EMIT_2F_VIEWPORT: 
 	 get_src_ptr(p, srcECX, vfESI, a);
 	 emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
 	 sse_mulps(&p->func, temp, vp0);
@@ -447,7 +447,7 @@ static boolean build_vertex_emit( struct x86_program *p )
 	 emit_store(p, dest, 2, temp);
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
-      case EMIT_3F_VIEWPORT: 
+      case DRAW_EMIT_3F_VIEWPORT: 
 	 get_src_ptr(p, srcECX, vfESI, a);
 	 emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
 	 sse_mulps(&p->func, temp, vp0);
@@ -455,7 +455,7 @@ static boolean build_vertex_emit( struct x86_program *p )
 	 emit_store(p, dest, 3, temp);
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
-      case EMIT_4F_VIEWPORT: 
+      case DRAW_EMIT_4F_VIEWPORT: 
 	 get_src_ptr(p, srcECX, vfESI, a);
 	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
 	 sse_mulps(&p->func, temp, vp0);
@@ -463,7 +463,7 @@ static boolean build_vertex_emit( struct x86_program *p )
 	 emit_store(p, dest, 4, temp);
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
-      case EMIT_3F_XYW:
+      case DRAW_EMIT_3F_XYW:
 	 get_src_ptr(p, srcECX, vfESI, a);
 	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
 	 sse_shufps(&p->func, temp, temp, SHUF(X,Y,W,Z));
@@ -471,7 +471,7 @@ static boolean build_vertex_emit( struct x86_program *p )
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
 
-      case EMIT_1UB_1F:	 
+      case DRAW_EMIT_1UB_1F:	 
 	 /* Test for PAD3 + 1UB:
 	  */
 	 if (j > 0 &&
@@ -488,15 +488,15 @@ static boolean build_vertex_emit( struct x86_program *p )
 	    return FALSE;
 	 }
 	 break;
-      case EMIT_3UB_3F_RGB:
-      case EMIT_3UB_3F_BGR:
+      case DRAW_EMIT_3UB_3F_RGB:
+      case DRAW_EMIT_3UB_3F_BGR:
 	 /* Test for 3UB + PAD1:
 	  */
 	 if (j == vf->attr_count - 1 ||
 	     a[1].vertoffset >= a->vertoffset + 4) {
 	    get_src_ptr(p, srcECX, vfESI, a);
 	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
-	    if (a->format == EMIT_3UB_3F_BGR)
+	    if (a->format == DRAW_EMIT_3UB_3F_BGR)
 	       sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
 	    emit_pack_store_4ub(p, dest, temp);
 	    update_src_ptr(p, srcECX, vfESI, a);
@@ -504,7 +504,7 @@ static boolean build_vertex_emit( struct x86_program *p )
 	 /* Test for 3UB + 1UB:
 	  */
 	 else if (j < vf->attr_count - 1 &&
-		  a[1].format == EMIT_1UB_1F &&
+		  a[1].format == DRAW_EMIT_1UB_1F &&
 		  a[1].vertoffset == a->vertoffset + 3) {
 	    get_src_ptr(p, srcECX, vfESI, a);
 	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
@@ -520,7 +520,7 @@ static boolean build_vertex_emit( struct x86_program *p )
 
 	    /* Rearrange and possibly do BGR conversion:
 	     */
-	    if (a->format == EMIT_3UB_3F_BGR)
+	    if (a->format == DRAW_EMIT_3UB_3F_BGR)
 	       sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
 	    else
 	       sse_shufps(&p->func, temp, temp, SHUF(Y,Z,W,X));
@@ -534,34 +534,34 @@ static boolean build_vertex_emit( struct x86_program *p )
 	 return FALSE;	/* add this later */
 	 break;
 
-      case EMIT_4UB_4F_RGBA:
+      case DRAW_EMIT_4UB_4F_RGBA:
 	 get_src_ptr(p, srcECX, vfESI, a);
 	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
 	 emit_pack_store_4ub(p, dest, temp);
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
-      case EMIT_4UB_4F_BGRA:
+      case DRAW_EMIT_4UB_4F_BGRA:
 	 get_src_ptr(p, srcECX, vfESI, a);
 	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
 	 sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
 	 emit_pack_store_4ub(p, dest, temp);
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
-      case EMIT_4UB_4F_ARGB:
+      case DRAW_EMIT_4UB_4F_ARGB:
 	 get_src_ptr(p, srcECX, vfESI, a);
 	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
 	 sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
 	 emit_pack_store_4ub(p, dest, temp);
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
-      case EMIT_4UB_4F_ABGR:
+      case DRAW_EMIT_4UB_4F_ABGR:
 	 get_src_ptr(p, srcECX, vfESI, a);
 	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
 	 sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
 	 emit_pack_store_4ub(p, dest, temp);
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
-      case EMIT_4CHAN_4F_RGBA:
+      case DRAW_EMIT_4CHAN_4F_RGBA:
 	 switch (CHAN_TYPE) {
 	 case GL_UNSIGNED_BYTE:
 	    get_src_ptr(p, srcECX, vfESI, a);
-- 
cgit v1.2.3


From 5abc8d9e23b1b8cde9c4183b73bfced3d4f01c87 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Mon, 28 Jan 2008 10:14:13 +0000
Subject: gallium: remove dead code from draw_vf*

---
 src/mesa/pipe/draw/Makefile          |   2 +
 src/mesa/pipe/draw/draw_vbuf.c       |   2 +-
 src/mesa/pipe/draw/draw_vf.c         |  90 +-------
 src/mesa/pipe/draw/draw_vf.h         |  18 +-
 src/mesa/pipe/draw/draw_vf_generic.c | 420 +----------------------------------
 src/mesa/pipe/draw/draw_vf_sse.c     |  51 -----
 6 files changed, 9 insertions(+), 574 deletions(-)
 create mode 100644 src/mesa/pipe/draw/Makefile

(limited to 'src')

diff --git a/src/mesa/pipe/draw/Makefile b/src/mesa/pipe/draw/Makefile
new file mode 100644
index 0000000000..451911a354
--- /dev/null
+++ b/src/mesa/pipe/draw/Makefile
@@ -0,0 +1,2 @@
+default:
+	cd .. ; make
diff --git a/src/mesa/pipe/draw/draw_vbuf.c b/src/mesa/pipe/draw/draw_vbuf.c
index a3d0b5bca3..8ca225c65a 100644
--- a/src/mesa/pipe/draw/draw_vbuf.c
+++ b/src/mesa/pipe/draw/draw_vbuf.c
@@ -625,7 +625,7 @@ struct draw_stage *draw_vbuf_stage( struct draw_context *draw,
 
    vbuf->prim = ~0;
    
-   vbuf->vf = draw_vf_create(FALSE);
+   vbuf->vf = draw_vf_create();
    if(!vbuf->vf)
       vbuf_destroy(&vbuf->stage);
    
diff --git a/src/mesa/pipe/draw/draw_vf.c b/src/mesa/pipe/draw/draw_vf.c
index 675974c6bc..deedfc7bc7 100644
--- a/src/mesa/pipe/draw/draw_vf.c
+++ b/src/mesa/pipe/draw/draw_vf.c
@@ -177,7 +177,6 @@ unsigned draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
 	 vf->attr[j].attrib = map[i].attrib;
 	 vf->attr[j].format = format;
 	 vf->attr[j].insert = draw_vf_format_info[format].insert;
-	 vf->attr[j].extract = draw_vf_format_info[format].extract;
 	 vf->attr[j].vertattrsize = draw_vf_format_info[format].attrsize;
 	 vf->attr[j].vertoffset = offset;
 	 
@@ -201,41 +200,6 @@ unsigned draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
 
 
-void draw_vf_set_vp_matrix( struct draw_vertex_fetch *vf,
-		       const float *viewport )
-{
-   assert(vf->allow_viewport_emits);
-
-   /* scale */
-   vf->vp[0] = viewport[MAT_SX];
-   vf->vp[1] = viewport[MAT_SY];
-   vf->vp[2] = viewport[MAT_SZ];
-   vf->vp[3] = 1.0;
-
-   /* translate */
-   vf->vp[4] = viewport[MAT_TX];
-   vf->vp[5] = viewport[MAT_TY];
-   vf->vp[6] = viewport[MAT_TZ];
-   vf->vp[7] = 0.0;
-}
-
-void draw_vf_set_vp_scale_translate( struct draw_vertex_fetch *vf,
-				const float *scale,
-				const float *translate )
-{
-   assert(vf->allow_viewport_emits);
-
-   vf->vp[0] = scale[0];
-   vf->vp[1] = scale[1];
-   vf->vp[2] = scale[2];
-   vf->vp[3] = scale[3];
-
-   vf->vp[4] = translate[0];
-   vf->vp[5] = translate[1];
-   vf->vp[6] = translate[2];
-   vf->vp[7] = translate[3];
-}
-
 
 /* Set attribute pointers, adjusted for start position:
  */
@@ -288,39 +252,10 @@ void draw_vf_emit_vertices( struct draw_vertex_fetch *vf,
 }
 
 
-/* Extract a named attribute from a hardware vertex.  Will have to
- * reverse any viewport transformation, swizzling or other conversions
- * which may have been applied.
- *
- * This is mainly required for on-the-fly vertex translations to
- * swrast format.
- */
-void draw_vf_get_attr( struct draw_vertex_fetch *vf,
-		  const void *vertex,
-		  GLenum attr, 
-		  const float *dflt,
-		  float *dest )
-{
-   const struct draw_vf_attr *a = vf->attr;
-   const unsigned attr_count = vf->attr_count;
-   unsigned j;
-
-   for (j = 0; j < attr_count; j++) {
-      if (a[j].attrib == attr) {
-	 a[j].extract( &a[j], dest, (uint8_t *)vertex + a[j].vertoffset );
-	 return;
-      }
-   }
-
-   /* Else return the value from ctx->Current.
-    */
-   _mesa_memcpy( dest, dflt, 4*sizeof(float));
-}
-
 
 
-struct draw_vertex_fetch *draw_vf_create( boolean allow_viewport_emits )
+struct draw_vertex_fetch *draw_vf_create( void )
 {
    struct draw_vertex_fetch *vf = CALLOC_STRUCT(draw_vertex_fetch);
    unsigned i;
@@ -328,29 +263,6 @@ struct draw_vertex_fetch *draw_vf_create( boolean allow_viewport_emits )
    for (i = 0; i < DRAW_VF_ATTRIB_MAX; i++)
       vf->attr[i].vf = vf;
 
-   vf->allow_viewport_emits = allow_viewport_emits;
-
-   switch(CHAN_TYPE) {
-   case GL_UNSIGNED_BYTE:
-      vf->chan_scale[0] = 255.0;
-      vf->chan_scale[1] = 255.0;
-      vf->chan_scale[2] = 255.0;
-      vf->chan_scale[3] = 255.0;
-      break;
-   case GL_UNSIGNED_SHORT:
-      vf->chan_scale[0] = 65535.0;
-      vf->chan_scale[1] = 65535.0;
-      vf->chan_scale[2] = 65535.0;
-      vf->chan_scale[3] = 65535.0;
-      break;
-   default:
-      vf->chan_scale[0] = 1.0;
-      vf->chan_scale[1] = 1.0;
-      vf->chan_scale[2] = 1.0;
-      vf->chan_scale[3] = 1.0;
-      break;
-   }
-
    vf->identity[0] = 0.0;
    vf->identity[1] = 0.0;
    vf->identity[2] = 0.0;
diff --git a/src/mesa/pipe/draw/draw_vf.h b/src/mesa/pipe/draw/draw_vf.h
index 7619c0ee27..c6a8fe0d53 100644
--- a/src/mesa/pipe/draw/draw_vf.h
+++ b/src/mesa/pipe/draw/draw_vf.h
@@ -73,9 +73,6 @@ enum draw_vf_attr_format {
    DRAW_EMIT_2F,
    DRAW_EMIT_3F,
    DRAW_EMIT_4F,
-   DRAW_EMIT_2F_VIEWPORT,		/**< do viewport transform and emit */
-   DRAW_EMIT_3F_VIEWPORT,		/**< do viewport transform and emit */
-   DRAW_EMIT_4F_VIEWPORT,		/**< do viewport transform and emit */
    DRAW_EMIT_3F_XYW,			/**< for projective texture */
    DRAW_EMIT_1UB_1F,			/**< for fog coordinate */
    DRAW_EMIT_3UB_3F_RGB,		/**< for specular color */
@@ -84,7 +81,6 @@ enum draw_vf_attr_format {
    DRAW_EMIT_4UB_4F_BGRA,		/**< for color */
    DRAW_EMIT_4UB_4F_ARGB,		/**< for color */
    DRAW_EMIT_4UB_4F_ABGR,		/**< for color */
-   DRAW_EMIT_4CHAN_4F_RGBA,		/**< for swrast color */
    DRAW_EMIT_PAD,			/**< leave a hole of 'offset' bytes */
    DRAW_EMIT_MAX
 };
@@ -98,14 +94,6 @@ struct draw_vf_attr_map {
 struct draw_vertex_fetch;
 
 
-void 
-draw_vf_set_vp_matrix( struct draw_vertex_fetch *vf,
-                       const float *viewport );
-
-void 
-draw_vf_set_vp_scale_translate( struct draw_vertex_fetch *vf,
-				const float *scale,
-				const float *translate );
 
 unsigned 
 draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
@@ -135,7 +123,7 @@ draw_vf_get_attr( struct draw_vertex_fetch *vf,
 		  float *dest );
 
 struct draw_vertex_fetch *
-draw_vf_create( boolean allow_viewport_emits );
+draw_vf_create( void );
 
 void 
 draw_vf_destroy( struct draw_vertex_fetch *vf );
@@ -196,9 +184,6 @@ struct draw_vertex_fetch
 
    /* Parameters and constants for codegen:
     */
-   boolean allow_viewport_emits;
-   float vp[8];		
-   float chan_scale[4];
    float identity[4];
 
    struct draw_vf_fastpath *fastpath;
@@ -244,7 +229,6 @@ draw_vf_generate_sse_emit( struct draw_vertex_fetch *vf );
 
 struct draw_vf_format_info {
    const char *name;
-   draw_vf_extract_func extract;
    draw_vf_insert_func insert[4];
    const unsigned attrsize;
 };
diff --git a/src/mesa/pipe/draw/draw_vf_generic.c b/src/mesa/pipe/draw/draw_vf_generic.c
index 42effc0c65..343428d26c 100644
--- a/src/mesa/pipe/draw/draw_vf_generic.c
+++ b/src/mesa/pipe/draw/draw_vf_generic.c
@@ -36,125 +36,6 @@
 #include "draw_vf.h"
 
 
-/*
- * These functions take the NDC coordinates pointed to by 'in', apply the
- * NDC->Viewport mapping and store the results at 'v'.
- */
-
-static INLINE void insert_4f_viewport_4( const struct draw_vf_attr *a, uint8_t *v,
-					 const float *in )
-{
-   float *out = (float *)v;
-   const float *scale = a->vf->vp;
-   const float *trans = a->vf->vp + 4;
-   
-   out[0] = scale[0] * in[0] + trans[0];
-   out[1] = scale[1] * in[1] + trans[1];
-   out[2] = scale[2] * in[2] + trans[2];
-   out[3] = in[3];
-}
-
-static INLINE void insert_4f_viewport_3( const struct draw_vf_attr *a, uint8_t *v,
-					 const float *in )
-{
-   float *out = (float *)v;
-   const float *scale = a->vf->vp;
-   const float *trans = a->vf->vp + 4;
-   
-   out[0] = scale[0] * in[0] + trans[0];
-   out[1] = scale[1] * in[1] + trans[1];
-   out[2] = scale[2] * in[2] + trans[2];
-   out[3] = 1;
-}
-
-static INLINE void insert_4f_viewport_2( const struct draw_vf_attr *a, uint8_t *v,
-					 const float *in )
-{
-   float *out = (float *)v;
-   const float *scale = a->vf->vp;
-   const float *trans = a->vf->vp + 4;
-   
-   out[0] = scale[0] * in[0] + trans[0];
-   out[1] = scale[1] * in[1] + trans[1];
-   out[2] =                    trans[2];
-   out[3] = 1;
-}
-
-static INLINE void insert_4f_viewport_1( const struct draw_vf_attr *a, uint8_t *v,
-					 const float *in )
-{
-   float *out = (float *)v;
-   const float *scale = a->vf->vp;
-   const float *trans = a->vf->vp + 4;
-   
-   out[0] = scale[0] * in[0] + trans[0];
-   out[1] =                    trans[1];
-   out[2] =                    trans[2];
-   out[3] = 1;
-}
-
-static INLINE void insert_3f_viewport_3( const struct draw_vf_attr *a, uint8_t *v,
-					 const float *in )
-{
-   float *out = (float *)v;
-   const float *scale = a->vf->vp;
-   const float *trans = a->vf->vp + 4;
-   
-   out[0] = scale[0] * in[0] + trans[0];
-   out[1] = scale[1] * in[1] + trans[1];
-   out[2] = scale[2] * in[2] + trans[2];
-}
-
-static INLINE void insert_3f_viewport_2( const struct draw_vf_attr *a, uint8_t *v,
-					 const float *in )
-{
-   float *out = (float *)v;
-   const float *scale = a->vf->vp;
-   const float *trans = a->vf->vp + 4;
-   
-   out[0] = scale[0] * in[0] + trans[0];
-   out[1] = scale[1] * in[1] + trans[1];
-   out[2] = scale[2] * in[2] + trans[2];
-}
-
-static INLINE void insert_3f_viewport_1( const struct draw_vf_attr *a, uint8_t *v,
-					 const float *in )
-{
-   float *out = (float *)v;
-   const float *scale = a->vf->vp;
-   const float *trans = a->vf->vp + 4;
-   
-   out[0] = scale[0] * in[0] + trans[0];
-   out[1] =                    trans[1];
-   out[2] =                    trans[2];
-}
-
-static INLINE void insert_2f_viewport_2( const struct draw_vf_attr *a, uint8_t *v,
-					 const float *in )
-{
-   float *out = (float *)v;
-   const float *scale = a->vf->vp;
-   const float *trans = a->vf->vp + 4;
-   
-   out[0] = scale[0] * in[0] + trans[0];
-   out[1] = scale[1] * in[1] + trans[1];
-}
-
-static INLINE void insert_2f_viewport_1( const struct draw_vf_attr *a, uint8_t *v,
-					 const float *in )
-{
-   float *out = (float *)v;
-   const float *scale = a->vf->vp;
-   const float *trans = a->vf->vp + 4;
-   
-   out[0] = scale[0] * in[0] + trans[0];
-   out[1] = trans[1];
-}
-
-
-/*
- * These functions do the same as above, except for the viewport mapping.
- */
 
 static INLINE void insert_4f_4( const struct draw_vf_attr *a, uint8_t *v, const float *in )
 {
@@ -278,50 +159,6 @@ static INLINE void insert_null( const struct draw_vf_attr *a, uint8_t *v, const
    (void) a; (void) v; (void) in;
 }
 
-static INLINE void insert_4chan_4f_rgba_4( const struct draw_vf_attr *a, uint8_t *v, 
-					   const float *in )
-{
-   GLchan *c = (GLchan *)v;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_CHAN(c[0], in[0]); 
-   UNCLAMPED_FLOAT_TO_CHAN(c[1], in[1]); 
-   UNCLAMPED_FLOAT_TO_CHAN(c[2], in[2]); 
-   UNCLAMPED_FLOAT_TO_CHAN(c[3], in[3]);
-}
-
-static INLINE void insert_4chan_4f_rgba_3( const struct draw_vf_attr *a, uint8_t *v, 
-					   const float *in )
-{
-   GLchan *c = (GLchan *)v;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_CHAN(c[0], in[0]); 
-   UNCLAMPED_FLOAT_TO_CHAN(c[1], in[1]); 
-   UNCLAMPED_FLOAT_TO_CHAN(c[2], in[2]); 
-   c[3] = CHAN_MAX;
-}
-
-static INLINE void insert_4chan_4f_rgba_2( const struct draw_vf_attr *a, uint8_t *v, 
-					   const float *in )
-{
-   GLchan *c = (GLchan *)v;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_CHAN(c[0], in[0]); 
-   UNCLAMPED_FLOAT_TO_CHAN(c[1], in[1]); 
-   c[2] = 0;
-   c[3] = CHAN_MAX;
-}
-
-static INLINE void insert_4chan_4f_rgba_1( const struct draw_vf_attr *a, uint8_t *v, 
-					   const float *in )
-{
-   GLchan *c = (GLchan *)v;
-   (void) a;
-   UNCLAMPED_FLOAT_TO_CHAN(c[0], in[0]); 
-   c[1] = 0;
-   c[2] = 0;
-   c[3] = CHAN_MAX;
-}
-
 static INLINE void insert_4ub_4f_rgba_4( const struct draw_vf_attr *a, uint8_t *v, 
 					 const float *in )
 {
@@ -545,291 +382,64 @@ static INLINE void insert_1ub_1f_1( const struct draw_vf_attr *a, uint8_t *v,
 }
 
 
-/***********************************************************************
- * Functions to perform the reverse operations to the above, for
- * swrast translation and clip-interpolation.
- * 
- * Currently always extracts a full 4 floats.
- */
-
-static void extract_4f_viewport( const struct draw_vf_attr *a, float *out, 
-				 const uint8_t *v )
-{
-   const float *in = (const float *)v;
-   const float *scale = a->vf->vp;
-   const float *trans = a->vf->vp + 4;
-   
-   /* Although included for completeness, the position coordinate is
-    * usually handled differently during clipping.
-    */
-   out[0] = (in[0] - trans[0]) / scale[0];
-   out[1] = (in[1] - trans[1]) / scale[1];
-   out[2] = (in[2] - trans[2]) / scale[2];
-   out[3] = in[3];
-}
-
-static void extract_3f_viewport( const struct draw_vf_attr *a, float *out, 
-				 const uint8_t *v )
-{
-   const float *in = (const float *)v;
-   const float *scale = a->vf->vp;
-   const float *trans = a->vf->vp + 4;
-   
-   out[0] = (in[0] - trans[0]) / scale[0];
-   out[1] = (in[1] - trans[1]) / scale[1];
-   out[2] = (in[2] - trans[2]) / scale[2];
-   out[3] = 1;
-}
-
-
-static void extract_2f_viewport( const struct draw_vf_attr *a, float *out, 
-				 const uint8_t *v )
-{
-   const float *in = (const float *)v;
-   const float *scale = a->vf->vp;
-   const float *trans = a->vf->vp + 4;
-   
-   out[0] = (in[0] - trans[0]) / scale[0];
-   out[1] = (in[1] - trans[1]) / scale[1];
-   out[2] = 0;
-   out[3] = 1;
-}
-
-
-static void extract_4f( const struct draw_vf_attr *a, float *out, const uint8_t *v  )
-{
-   const float *in = (const float *)v;
-   (void) a;
-   
-   out[0] = in[0];
-   out[1] = in[1];
-   out[2] = in[2];
-   out[3] = in[3];
-}
-
-static void extract_3f_xyw( const struct draw_vf_attr *a, float *out, const uint8_t *v )
-{
-   const float *in = (const float *)v;
-   (void) a;
-   
-   out[0] = in[0];
-   out[1] = in[1];
-   out[2] = 0;
-   out[3] = in[2];
-}
-
-
-static void extract_3f( const struct draw_vf_attr *a, float *out, const uint8_t *v )
-{
-   const float *in = (const float *)v;
-   (void) a;
-   
-   out[0] = in[0];
-   out[1] = in[1];
-   out[2] = in[2];
-   out[3] = 1;
-}
-
-
-static void extract_2f( const struct draw_vf_attr *a, float *out, const uint8_t *v )
-{
-   const float *in = (const float *)v;
-   (void) a;
-   
-   out[0] = in[0];
-   out[1] = in[1];
-   out[2] = 0;
-   out[3] = 1;
-}
-
-static void extract_1f( const struct draw_vf_attr *a, float *out, const uint8_t *v )
-{
-   const float *in = (const float *)v;
-   (void) a;
-   
-   out[0] = in[0];
-   out[1] = 0;
-   out[2] = 0;
-   out[3] = 1;
-}
-
-static void extract_4chan_4f_rgba( const struct draw_vf_attr *a, float *out, 
-				   const uint8_t *v )
-{
-   GLchan *c = (GLchan *)v;
-   (void) a;
-
-   out[0] = CHAN_TO_FLOAT(c[0]);
-   out[1] = CHAN_TO_FLOAT(c[1]);
-   out[2] = CHAN_TO_FLOAT(c[2]);
-   out[3] = CHAN_TO_FLOAT(c[3]);
-}
-
-static void extract_4ub_4f_rgba( const struct draw_vf_attr *a, float *out, 
-				 const uint8_t *v )
-{
-   (void) a;
-   out[0] = UBYTE_TO_FLOAT(v[0]);
-   out[1] = UBYTE_TO_FLOAT(v[1]);
-   out[2] = UBYTE_TO_FLOAT(v[2]);
-   out[3] = UBYTE_TO_FLOAT(v[3]);
-}
-
-static void extract_4ub_4f_bgra( const struct draw_vf_attr *a, float *out, 
-				 const uint8_t *v )
-{
-   (void) a;
-   out[2] = UBYTE_TO_FLOAT(v[0]);
-   out[1] = UBYTE_TO_FLOAT(v[1]);
-   out[0] = UBYTE_TO_FLOAT(v[2]);
-   out[3] = UBYTE_TO_FLOAT(v[3]);
-}
-
-static void extract_4ub_4f_argb( const struct draw_vf_attr *a, float *out, 
-				 const uint8_t *v )
-{
-   (void) a;
-   out[3] = UBYTE_TO_FLOAT(v[0]);
-   out[0] = UBYTE_TO_FLOAT(v[1]);
-   out[1] = UBYTE_TO_FLOAT(v[2]);
-   out[2] = UBYTE_TO_FLOAT(v[3]);
-}
-
-static void extract_4ub_4f_abgr( const struct draw_vf_attr *a, float *out, 
-				 const uint8_t *v )
-{
-   (void) a;
-   out[3] = UBYTE_TO_FLOAT(v[0]);
-   out[2] = UBYTE_TO_FLOAT(v[1]);
-   out[1] = UBYTE_TO_FLOAT(v[2]);
-   out[0] = UBYTE_TO_FLOAT(v[3]);
-}
-
-static void extract_3ub_3f_rgb( const struct draw_vf_attr *a, float *out, 
-				const uint8_t *v )
-{
-   (void) a;
-   out[0] = UBYTE_TO_FLOAT(v[0]);
-   out[1] = UBYTE_TO_FLOAT(v[1]);
-   out[2] = UBYTE_TO_FLOAT(v[2]);
-   out[3] = 1;
-}
-
-static void extract_3ub_3f_bgr( const struct draw_vf_attr *a, float *out, 
-				const uint8_t *v )
-{
-   (void) a;
-   out[2] = UBYTE_TO_FLOAT(v[0]);
-   out[1] = UBYTE_TO_FLOAT(v[1]);
-   out[0] = UBYTE_TO_FLOAT(v[2]);
-   out[3] = 1;
-}
-
-static void extract_1ub_1f( const struct draw_vf_attr *a, float *out, const uint8_t *v )
-{
-   (void) a;
-   out[0] = UBYTE_TO_FLOAT(v[0]);
-   out[1] = 0;
-   out[2] = 0;
-   out[3] = 1;
-}
-
-
 const struct draw_vf_format_info draw_vf_format_info[DRAW_EMIT_MAX] = 
 {
    { "1f",
-     extract_1f,
      { insert_1f_1, insert_1f_1, insert_1f_1, insert_1f_1 },
      sizeof(float) },
 
    { "2f",
-     extract_2f,
      { insert_2f_1, insert_2f_2, insert_2f_2, insert_2f_2 },
      2 * sizeof(float) },
 
    { "3f",
-     extract_3f,
      { insert_3f_1, insert_3f_2, insert_3f_3, insert_3f_3 },
      3 * sizeof(float) },
 
    { "4f",
-     extract_4f,
      { insert_4f_1, insert_4f_2, insert_4f_3, insert_4f_4 },
      4 * sizeof(float) },
 
-   { "2f_viewport",
-     extract_2f_viewport,
-     { insert_2f_viewport_1, insert_2f_viewport_2, insert_2f_viewport_2,
-       insert_2f_viewport_2 },
-     2 * sizeof(float) },
-
-   { "3f_viewport",
-     extract_3f_viewport,
-     { insert_3f_viewport_1, insert_3f_viewport_2, insert_3f_viewport_3,
-       insert_3f_viewport_3 },
-     3 * sizeof(float) },
-
-   { "4f_viewport",
-     extract_4f_viewport,
-     { insert_4f_viewport_1, insert_4f_viewport_2, insert_4f_viewport_3,
-       insert_4f_viewport_4 }, 
-     4 * sizeof(float) },
-
    { "3f_xyw",
-     extract_3f_xyw,
      { insert_3f_xyw_err, insert_3f_xyw_err, insert_3f_xyw_err, 
        insert_3f_xyw_4 },
      3 * sizeof(float) },
 
    { "1ub_1f",
-     extract_1ub_1f,
      { insert_1ub_1f_1, insert_1ub_1f_1, insert_1ub_1f_1, insert_1ub_1f_1 },
      sizeof(uint8_t) },
 
    { "3ub_3f_rgb",
-     extract_3ub_3f_rgb,
      { insert_3ub_3f_rgb_1, insert_3ub_3f_rgb_2, insert_3ub_3f_rgb_3,
        insert_3ub_3f_rgb_3 },
      3 * sizeof(uint8_t) },
 
    { "3ub_3f_bgr",
-     extract_3ub_3f_bgr,
      { insert_3ub_3f_bgr_1, insert_3ub_3f_bgr_2, insert_3ub_3f_bgr_3,
        insert_3ub_3f_bgr_3 },
      3 * sizeof(uint8_t) },
 
    { "4ub_4f_rgba",
-     extract_4ub_4f_rgba,
      { insert_4ub_4f_rgba_1, insert_4ub_4f_rgba_2, insert_4ub_4f_rgba_3, 
        insert_4ub_4f_rgba_4 },
      4 * sizeof(uint8_t) },
 
    { "4ub_4f_bgra",
-     extract_4ub_4f_bgra,
      { insert_4ub_4f_bgra_1, insert_4ub_4f_bgra_2, insert_4ub_4f_bgra_3,
        insert_4ub_4f_bgra_4 },
      4 * sizeof(uint8_t) },
 
    { "4ub_4f_argb",
-     extract_4ub_4f_argb,
      { insert_4ub_4f_argb_1, insert_4ub_4f_argb_2, insert_4ub_4f_argb_3,
        insert_4ub_4f_argb_4 },
      4 * sizeof(uint8_t) },
 
    { "4ub_4f_abgr",
-     extract_4ub_4f_abgr,
      { insert_4ub_4f_abgr_1, insert_4ub_4f_abgr_2, insert_4ub_4f_abgr_3,
        insert_4ub_4f_abgr_4 },
      4 * sizeof(uint8_t) },
 
-   { "4chan_4f_rgba",
-     extract_4chan_4f_rgba,
-     { insert_4chan_4f_rgba_1, insert_4chan_4f_rgba_2, insert_4chan_4f_rgba_3,
-       insert_4chan_4f_rgba_4 },
-     4 * sizeof(GLchan) },
-
    { "pad",
-     NULL,
      { NULL, NULL, NULL, NULL },
      0 }
 
@@ -889,16 +499,10 @@ static void NAME( struct draw_vertex_fetch *vf,				\
 				          insert_null, NAME)
    
 
-EMIT2(insert_3f_viewport_3, insert_4ub_4f_rgba_4, emit_viewport3_rgba4)
-EMIT2(insert_3f_viewport_3, insert_4ub_4f_bgra_4, emit_viewport3_bgra4)
 EMIT2(insert_3f_3, insert_4ub_4f_rgba_4, emit_xyz3_rgba4)
 
-EMIT3(insert_4f_viewport_4, insert_4ub_4f_rgba_4, insert_2f_2, emit_viewport4_rgba4_st2)
-EMIT3(insert_4f_viewport_4, insert_4ub_4f_bgra_4, insert_2f_2,  emit_viewport4_bgra4_st2)
 EMIT3(insert_4f_4, insert_4ub_4f_rgba_4, insert_2f_2, emit_xyzw4_rgba4_st2)
 
-EMIT4(insert_4f_viewport_4, insert_4ub_4f_rgba_4, insert_2f_2, insert_2f_2, emit_viewport4_rgba4_st2_st2)
-EMIT4(insert_4f_viewport_4, insert_4ub_4f_bgra_4, insert_2f_2, insert_2f_2,  emit_viewport4_bgra4_st2_st2)
 EMIT4(insert_4f_4, insert_4ub_4f_rgba_4, insert_2f_2, insert_2f_2, emit_xyzw4_rgba4_st2_st2)
 
 
@@ -914,42 +518,26 @@ void draw_vf_generate_hardwired_emit( struct draw_vertex_fetch *vf )
     */
    switch (vf->attr_count) {
    case 2:
-      if (vf->attr[0].do_insert == insert_3f_viewport_3) {
-	 if (vf->attr[1].do_insert == insert_4ub_4f_bgra_4) 
-	    func = emit_viewport3_bgra4;
-	 else if (vf->attr[1].do_insert == insert_4ub_4f_rgba_4) 
-	    func = emit_viewport3_rgba4;
-      }
-      else if (vf->attr[0].do_insert == insert_3f_3 &&
-	       vf->attr[1].do_insert == insert_4ub_4f_rgba_4) {
+      if (vf->attr[0].do_insert == insert_3f_3 &&
+	  vf->attr[1].do_insert == insert_4ub_4f_rgba_4) {
  	 func = emit_xyz3_rgba4; 
       }
       break;
    case 3:
       if (vf->attr[2].do_insert == insert_2f_2) {
 	 if (vf->attr[1].do_insert == insert_4ub_4f_rgba_4) {
-	    if (vf->attr[0].do_insert == insert_4f_viewport_4)
-	       func = emit_viewport4_rgba4_st2;
-	    else if (vf->attr[0].do_insert == insert_4f_4) 
+	    if (vf->attr[0].do_insert == insert_4f_4) 
 	       func = emit_xyzw4_rgba4_st2;
 	 }
-	 else if (vf->attr[1].do_insert == insert_4ub_4f_bgra_4 &&
-		  vf->attr[0].do_insert == insert_4f_viewport_4)
-	    func = emit_viewport4_bgra4_st2;
       }
       break;
    case 4:
       if (vf->attr[2].do_insert == insert_2f_2 &&
 	  vf->attr[3].do_insert == insert_2f_2) {
 	 if (vf->attr[1].do_insert == insert_4ub_4f_rgba_4) {
-	    if (vf->attr[0].do_insert == insert_4f_viewport_4)
-	       func = emit_viewport4_rgba4_st2_st2;
-	    else if (vf->attr[0].do_insert == insert_4f_4) 
+	    if (vf->attr[0].do_insert == insert_4f_4) 
 	       func = emit_xyzw4_rgba4_st2_st2;
 	 }
-	 else if (vf->attr[1].do_insert == insert_4ub_4f_bgra_4 &&
-		  vf->attr[0].do_insert == insert_4f_viewport_4)
-	    func = emit_viewport4_bgra4_st2_st2;
       }
       break;
    }
diff --git a/src/mesa/pipe/draw/draw_vf_sse.c b/src/mesa/pipe/draw/draw_vf_sse.c
index a7019a47e6..b238b542e7 100644
--- a/src/mesa/pipe/draw/draw_vf_sse.c
+++ b/src/mesa/pipe/draw/draw_vf_sse.c
@@ -369,13 +369,6 @@ static boolean build_vertex_emit( struct x86_program *p )
    x86_mov(&p->func, vfESI, x86_fn_arg(&p->func, 1));
 
    
-   /* Possibly load vp0, vp1 for viewport calcs:
-    */
-   if (vf->allow_viewport_emits) {
-      sse_movups(&p->func, vp0, x86_make_disp(vfESI, get_offset(vf, &vf->vp[0])));
-      sse_movups(&p->func, vp1, x86_make_disp(vfESI, get_offset(vf, &vf->vp[4])));
-   }
-
    /* always load, needed or not:
     */
    sse_movups(&p->func, p->chan0, x86_make_disp(vfESI, get_offset(vf, &vf->chan_scale[0])));
@@ -439,30 +432,6 @@ static boolean build_vertex_emit( struct x86_program *p )
 	 emit_store(p, dest, 4, temp);
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
-      case DRAW_EMIT_2F_VIEWPORT: 
-	 get_src_ptr(p, srcECX, vfESI, a);
-	 emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
-	 sse_mulps(&p->func, temp, vp0);
-	 sse_addps(&p->func, temp, vp1);
-	 emit_store(p, dest, 2, temp);
-	 update_src_ptr(p, srcECX, vfESI, a);
-	 break;
-      case DRAW_EMIT_3F_VIEWPORT: 
-	 get_src_ptr(p, srcECX, vfESI, a);
-	 emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
-	 sse_mulps(&p->func, temp, vp0);
-	 sse_addps(&p->func, temp, vp1);
-	 emit_store(p, dest, 3, temp);
-	 update_src_ptr(p, srcECX, vfESI, a);
-	 break;
-      case DRAW_EMIT_4F_VIEWPORT: 
-	 get_src_ptr(p, srcECX, vfESI, a);
-	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
-	 sse_mulps(&p->func, temp, vp0);
-	 sse_addps(&p->func, temp, vp1);
-	 emit_store(p, dest, 4, temp);
-	 update_src_ptr(p, srcECX, vfESI, a);
-	 break;
       case DRAW_EMIT_3F_XYW:
 	 get_src_ptr(p, srcECX, vfESI, a);
 	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
@@ -561,26 +530,6 @@ static boolean build_vertex_emit( struct x86_program *p )
 	 emit_pack_store_4ub(p, dest, temp);
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
-      case DRAW_EMIT_4CHAN_4F_RGBA:
-	 switch (CHAN_TYPE) {
-	 case GL_UNSIGNED_BYTE:
-	    get_src_ptr(p, srcECX, vfESI, a);
-	    emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
-	    emit_pack_store_4ub(p, dest, temp);
-	    update_src_ptr(p, srcECX, vfESI, a);
-	    break;
-	 case GL_FLOAT:
-	    get_src_ptr(p, srcECX, vfESI, a);
-	    emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
-	    emit_store(p, dest, 4, temp);
-	    update_src_ptr(p, srcECX, vfESI, a);
-	    break;
-	 case GL_UNSIGNED_SHORT:
-	 default:
-	    _mesa_printf("unknown CHAN_TYPE %s\n", _mesa_lookup_enum_by_nr(CHAN_TYPE));
-	    return FALSE;
-	 }
-	 break;
       default:
 	 _mesa_printf("unknown a[%d].format %d\n", j, a->format);
 	 return FALSE;	/* catch any new opcodes */
-- 
cgit v1.2.3


From 4141ebdf59cddbb412b388c7f38f50e5e80c49d2 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Mon, 28 Jan 2008 10:46:52 +0000
Subject: gallium: fill in missing formats for vertex_fetch

---
 src/mesa/pipe/draw/draw_vertex_fetch.c | 220 ++++++++++++++++++++++++++++++---
 1 file changed, 203 insertions(+), 17 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/draw/draw_vertex_fetch.c b/src/mesa/pipe/draw/draw_vertex_fetch.c
index fb64723a19..0789dc8e8c 100644
--- a/src/mesa/pipe/draw/draw_vertex_fetch.c
+++ b/src/mesa/pipe/draw/draw_vertex_fetch.c
@@ -62,50 +62,236 @@ fetch_##NAME(const void *ptr, float *attrib)		\
    }							\
 }
 
+#define CVT_64_FLOAT   ((double *) ptr)[i]
 #define CVT_32_FLOAT   ((float *) ptr)[i]
+
+#define CVT_8_USCALED  (float) ((unsigned char *) ptr)[i]
+#define CVT_16_USCALED (float) ((unsigned short *) ptr)[i]
+#define CVT_32_USCALED (float) ((unsigned int *) ptr)[i]
+
+#define CVT_8_SSCALED  (float) ((signed char *) ptr)[i]  /* plain char may be unsigned */
+#define CVT_16_SSCALED (float) ((short *) ptr)[i]
 #define CVT_32_SSCALED (float) ((int *) ptr)[i]
+
 #define CVT_8_UNORM    (float) ((unsigned char *) ptr)[i] / 255.0f
+#define CVT_16_UNORM   (float) ((unsigned short *) ptr)[i] / 65535.0f
+#define CVT_32_UNORM   (float) ((unsigned int *) ptr)[i] / 4294967295.0f
+
+#define CVT_8_SNORM    (float) ((signed char *) ptr)[i] / 127.0f  /* plain char may be unsigned */
+#define CVT_16_SNORM   (float) ((short *) ptr)[i] / 32767.0f
+#define CVT_32_SNORM   (float) ((int *) ptr)[i] / 2147483647.0f
+
+FETCH_ATTRIB( R64G64B64A64_FLOAT,   4, CVT_64_FLOAT )
+FETCH_ATTRIB( R64G64B64_FLOAT,      3, CVT_64_FLOAT )
+FETCH_ATTRIB( R64G64_FLOAT,         2, CVT_64_FLOAT )
+FETCH_ATTRIB( R64_FLOAT,            1, CVT_64_FLOAT )
 
 FETCH_ATTRIB( R32G32B32A32_FLOAT,   4, CVT_32_FLOAT )
 FETCH_ATTRIB( R32G32B32_FLOAT,      3, CVT_32_FLOAT )
 FETCH_ATTRIB( R32G32_FLOAT,         2, CVT_32_FLOAT )
 FETCH_ATTRIB( R32_FLOAT,            1, CVT_32_FLOAT )
+
+FETCH_ATTRIB( R32G32B32A32_USCALED, 4, CVT_32_USCALED )
+FETCH_ATTRIB( R32G32B32_USCALED,    3, CVT_32_USCALED )
+FETCH_ATTRIB( R32G32_USCALED,       2, CVT_32_USCALED )
+FETCH_ATTRIB( R32_USCALED,          1, CVT_32_USCALED )
+
 FETCH_ATTRIB( R32G32B32A32_SSCALED, 4, CVT_32_SSCALED )
 FETCH_ATTRIB( R32G32B32_SSCALED,    3, CVT_32_SSCALED )
 FETCH_ATTRIB( R32G32_SSCALED,       2, CVT_32_SSCALED )
 FETCH_ATTRIB( R32_SSCALED,          1, CVT_32_SSCALED )
+
+FETCH_ATTRIB( R32G32B32A32_UNORM, 4, CVT_32_UNORM )
+FETCH_ATTRIB( R32G32B32_UNORM,    3, CVT_32_UNORM )
+FETCH_ATTRIB( R32G32_UNORM,       2, CVT_32_UNORM )
+FETCH_ATTRIB( R32_UNORM,          1, CVT_32_UNORM )
+
+FETCH_ATTRIB( R32G32B32A32_SNORM, 4, CVT_32_SNORM )
+FETCH_ATTRIB( R32G32B32_SNORM,    3, CVT_32_SNORM )
+FETCH_ATTRIB( R32G32_SNORM,       2, CVT_32_SNORM )
+FETCH_ATTRIB( R32_SNORM,          1, CVT_32_SNORM )
+
+FETCH_ATTRIB( R16G16B16A16_USCALED, 4, CVT_16_USCALED )
+FETCH_ATTRIB( R16G16B16_USCALED,    3, CVT_16_USCALED )
+FETCH_ATTRIB( R16G16_USCALED,       2, CVT_16_USCALED )
+FETCH_ATTRIB( R16_USCALED,          1, CVT_16_USCALED )
+
+FETCH_ATTRIB( R16G16B16A16_SSCALED, 4, CVT_16_SSCALED )
+FETCH_ATTRIB( R16G16B16_SSCALED,    3, CVT_16_SSCALED )
+FETCH_ATTRIB( R16G16_SSCALED,       2, CVT_16_SSCALED )
+FETCH_ATTRIB( R16_SSCALED,          1, CVT_16_SSCALED )
+
+FETCH_ATTRIB( R16G16B16A16_UNORM, 4, CVT_16_UNORM )
+FETCH_ATTRIB( R16G16B16_UNORM,    3, CVT_16_UNORM )
+FETCH_ATTRIB( R16G16_UNORM,       2, CVT_16_UNORM )
+FETCH_ATTRIB( R16_UNORM,          1, CVT_16_UNORM )
+
+FETCH_ATTRIB( R16G16B16A16_SNORM, 4, CVT_16_SNORM )
+FETCH_ATTRIB( R16G16B16_SNORM,    3, CVT_16_SNORM )
+FETCH_ATTRIB( R16G16_SNORM,       2, CVT_16_SNORM )
+FETCH_ATTRIB( R16_SNORM,          1, CVT_16_SNORM )
+
+FETCH_ATTRIB( R8G8B8A8_USCALED,   4, CVT_8_USCALED )
+FETCH_ATTRIB( R8G8B8_USCALED,     3, CVT_8_USCALED )
+FETCH_ATTRIB( R8G8_USCALED,       2, CVT_8_USCALED )
+FETCH_ATTRIB( R8_USCALED,         1, CVT_8_USCALED )
+
+FETCH_ATTRIB( R8G8B8A8_SSCALED,  4, CVT_8_SSCALED )
+FETCH_ATTRIB( R8G8B8_SSCALED,    3, CVT_8_SSCALED )
+FETCH_ATTRIB( R8G8_SSCALED,      2, CVT_8_SSCALED )
+FETCH_ATTRIB( R8_SSCALED,        1, CVT_8_SSCALED )
+
+FETCH_ATTRIB( R8G8B8A8_UNORM,  4, CVT_8_UNORM )
+FETCH_ATTRIB( R8G8B8_UNORM,    3, CVT_8_UNORM )
+FETCH_ATTRIB( R8G8_UNORM,      2, CVT_8_UNORM )
+FETCH_ATTRIB( R8_UNORM,        1, CVT_8_UNORM )
+
+FETCH_ATTRIB( R8G8B8A8_SNORM,  4, CVT_8_SNORM )
+FETCH_ATTRIB( R8G8B8_SNORM,    3, CVT_8_SNORM )
+FETCH_ATTRIB( R8G8_SNORM,      2, CVT_8_SNORM )
+FETCH_ATTRIB( R8_SNORM,        1, CVT_8_SNORM )
+
 FETCH_ATTRIB( A8R8G8B8_UNORM,       4, CVT_8_UNORM )
-FETCH_ATTRIB( R8G8B8A8_UNORM,       4, CVT_8_UNORM )
+//FETCH_ATTRIB( R8G8B8A8_UNORM,       4, CVT_8_UNORM )
 
 
 static fetch_func get_fetch_func( enum pipe_format format )
 {
    switch (format) {
-   case PIPE_FORMAT_R32G32B32A32_FLOAT:
-      return fetch_R32G32B32A32_FLOAT;
-   case PIPE_FORMAT_R32G32B32_FLOAT:
-      return fetch_R32G32B32_FLOAT;
-   case PIPE_FORMAT_R32G32_FLOAT:
-      return fetch_R32G32_FLOAT;
+   case PIPE_FORMAT_R64_FLOAT:
+      return fetch_R64_FLOAT;
+   case PIPE_FORMAT_R64G64_FLOAT:
+      return fetch_R64G64_FLOAT;
+   case PIPE_FORMAT_R64G64B64_FLOAT:
+      return fetch_R64G64B64_FLOAT;
+   case PIPE_FORMAT_R64G64B64A64_FLOAT:
+      return fetch_R64G64B64A64_FLOAT;
+
    case PIPE_FORMAT_R32_FLOAT:
       return fetch_R32_FLOAT;
-   case PIPE_FORMAT_R32G32B32A32_SSCALED:
-      return fetch_R32G32B32A32_SSCALED;
-   case PIPE_FORMAT_R32G32B32_SSCALED:
-      return fetch_R32G32B32_SSCALED;
-   case PIPE_FORMAT_R32G32_SSCALED:
-      return fetch_R32G32_SSCALED;
+   case PIPE_FORMAT_R32G32_FLOAT:
+      return fetch_R32G32_FLOAT;
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+      return fetch_R32G32B32_FLOAT;
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+      return fetch_R32G32B32A32_FLOAT;
+
+   case PIPE_FORMAT_R32_UNORM:
+      return fetch_R32_UNORM;
+   case PIPE_FORMAT_R32G32_UNORM:
+      return fetch_R32G32_UNORM;
+   case PIPE_FORMAT_R32G32B32_UNORM:
+      return fetch_R32G32B32_UNORM;
+   case PIPE_FORMAT_R32G32B32A32_UNORM:
+      return fetch_R32G32B32A32_UNORM;
+
+   case PIPE_FORMAT_R32_USCALED:
+      return fetch_R32_USCALED;
+   case PIPE_FORMAT_R32G32_USCALED:
+      return fetch_R32G32_USCALED;
+   case PIPE_FORMAT_R32G32B32_USCALED:
+      return fetch_R32G32B32_USCALED;
+   case PIPE_FORMAT_R32G32B32A32_USCALED:
+      return fetch_R32G32B32A32_USCALED;
+
+   case PIPE_FORMAT_R32_SNORM:
+      return fetch_R32_SNORM;
+   case PIPE_FORMAT_R32G32_SNORM:
+      return fetch_R32G32_SNORM;
+   case PIPE_FORMAT_R32G32B32_SNORM:
+      return fetch_R32G32B32_SNORM;
+   case PIPE_FORMAT_R32G32B32A32_SNORM:
+      return fetch_R32G32B32A32_SNORM;
+
    case PIPE_FORMAT_R32_SSCALED:
       return fetch_R32_SSCALED;
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
-      return fetch_A8R8G8B8_UNORM;
+   case PIPE_FORMAT_R32G32_SSCALED:
+      return fetch_R32G32_SSCALED;
+   case PIPE_FORMAT_R32G32B32_SSCALED:
+      return fetch_R32G32B32_SSCALED;
+   case PIPE_FORMAT_R32G32B32A32_SSCALED:
+      return fetch_R32G32B32A32_SSCALED;
+
+   case PIPE_FORMAT_R16_UNORM:
+      return fetch_R16_UNORM;
+   case PIPE_FORMAT_R16G16_UNORM:
+      return fetch_R16G16_UNORM;
+   case PIPE_FORMAT_R16G16B16_UNORM:
+      return fetch_R16G16B16_UNORM;
+   case PIPE_FORMAT_R16G16B16A16_UNORM:
+      return fetch_R16G16B16A16_UNORM;
+
+   case PIPE_FORMAT_R16_USCALED:
+      return fetch_R16_USCALED;
+   case PIPE_FORMAT_R16G16_USCALED:
+      return fetch_R16G16_USCALED;
+   case PIPE_FORMAT_R16G16B16_USCALED:
+      return fetch_R16G16B16_USCALED;
+   case PIPE_FORMAT_R16G16B16A16_USCALED:
+      return fetch_R16G16B16A16_USCALED;
+
+   case PIPE_FORMAT_R16_SNORM:
+      return fetch_R16_SNORM;
+   case PIPE_FORMAT_R16G16_SNORM:
+      return fetch_R16G16_SNORM;
+   case PIPE_FORMAT_R16G16B16_SNORM:
+      return fetch_R16G16B16_SNORM;
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+      return fetch_R16G16B16A16_SNORM;
+
+   case PIPE_FORMAT_R16_SSCALED:
+      return fetch_R16_SSCALED;
+   case PIPE_FORMAT_R16G16_SSCALED:
+      return fetch_R16G16_SSCALED;
+   case PIPE_FORMAT_R16G16B16_SSCALED:
+      return fetch_R16G16B16_SSCALED;
+   case PIPE_FORMAT_R16G16B16A16_SSCALED:
+      return fetch_R16G16B16A16_SSCALED;
+
+   case PIPE_FORMAT_R8_UNORM:
+      return fetch_R8_UNORM;
+   case PIPE_FORMAT_R8G8_UNORM:
+      return fetch_R8G8_UNORM;
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      return fetch_R8G8B8_UNORM;
    case PIPE_FORMAT_R8G8B8A8_UNORM:
       return fetch_R8G8B8A8_UNORM;
+
+   case PIPE_FORMAT_R8_USCALED:
+      return fetch_R8_USCALED;
+   case PIPE_FORMAT_R8G8_USCALED:
+      return fetch_R8G8_USCALED;
+   case PIPE_FORMAT_R8G8B8_USCALED:
+      return fetch_R8G8B8_USCALED;
+   case PIPE_FORMAT_R8G8B8A8_USCALED:
+      return fetch_R8G8B8A8_USCALED;
+
+   case PIPE_FORMAT_R8_SNORM:
+      return fetch_R8_SNORM;
+   case PIPE_FORMAT_R8G8_SNORM:
+      return fetch_R8G8_SNORM;
+   case PIPE_FORMAT_R8G8B8_SNORM:
+      return fetch_R8G8B8_SNORM;
+   case PIPE_FORMAT_R8G8B8A8_SNORM:
+      return fetch_R8G8B8A8_SNORM;
+
+   case PIPE_FORMAT_R8_SSCALED:
+      return fetch_R8_SSCALED;
+   case PIPE_FORMAT_R8G8_SSCALED:
+      return fetch_R8G8_SSCALED;
+   case PIPE_FORMAT_R8G8B8_SSCALED:
+      return fetch_R8G8B8_SSCALED;
+   case PIPE_FORMAT_R8G8B8A8_SSCALED:
+      return fetch_R8G8B8A8_SSCALED;
+
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      return fetch_A8R8G8B8_UNORM;
+
    case 0:
-      return NULL;
+      return NULL;		/* not sure why this is needed */
+
    default:
-      /* Lots of missing cases! */
       assert(0);
       return NULL;
    }
-- 
cgit v1.2.3


From a46181044fd5573895180ee5f1a016c4c1e4a653 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Mon, 28 Jan 2008 10:48:22 +0000
Subject: gallium: explictly cast double to float in vertex fetch

---
 src/mesa/pipe/draw/draw_vertex_fetch.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/mesa/pipe/draw/draw_vertex_fetch.c b/src/mesa/pipe/draw/draw_vertex_fetch.c
index 0789dc8e8c..af3983b7f0 100644
--- a/src/mesa/pipe/draw/draw_vertex_fetch.c
+++ b/src/mesa/pipe/draw/draw_vertex_fetch.c
@@ -62,7 +62,7 @@ fetch_##NAME(const void *ptr, float *attrib)		\
    }							\
 }
 
-#define CVT_64_FLOAT   ((double *) ptr)[i]
+#define CVT_64_FLOAT   (float) ((double *) ptr)[i]
 #define CVT_32_FLOAT   ((float *) ptr)[i]
 
 #define CVT_8_USCALED  (float) ((unsigned char *) ptr)[i]
-- 
cgit v1.2.3


From 99f297651198c3424aab202595064d6f5596b2fc Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Mon, 28 Jan 2008 11:12:11 +0000
Subject: gallium: only call vertex/prim queue flush when there is something to
 flush

---
 src/mesa/pipe/draw/draw_prim.c          |  9 +++++----
 src/mesa/pipe/draw/draw_vertex_fetch.c  | 10 ++++++++++
 src/mesa/pipe/draw/draw_vertex_shader.c |  2 ++
 3 files changed, 17 insertions(+), 4 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/draw/draw_prim.c b/src/mesa/pipe/draw/draw_prim.c
index 243381aec0..2a612a1673 100644
--- a/src/mesa/pipe/draw/draw_prim.c
+++ b/src/mesa/pipe/draw/draw_prim.c
@@ -63,8 +63,7 @@ static void draw_prim_queue_flush( struct draw_context *draw )
       fprintf(stdout,"Flushing with %d prims, %d verts\n",
              draw->pq.queue_nr, draw->vs.queue_nr);
 
-   if (draw->pq.queue_nr == 0)
-      return;
+   assert (draw->pq.queue_nr != 0);
 
    /* NOTE: we cannot save draw->pipeline->first in a local var because
     * draw->pipeline->first is often changed by the first call to tri(),
@@ -109,10 +108,12 @@ void draw_do_flush( struct draw_context *draw, unsigned flags )
 
 
    if (flags >= DRAW_FLUSH_SHADER_QUEUE) {
-      draw_vertex_shader_queue_flush(draw);
+      if (draw->vs.queue_nr)
+	 draw_vertex_shader_queue_flush(draw);
 
       if (flags >= DRAW_FLUSH_PRIM_QUEUE) {
-         draw_prim_queue_flush(draw);
+	 if (draw->pq.queue_nr)
+	    draw_prim_queue_flush(draw);
 
 	 if (flags >= DRAW_FLUSH_VERTEX_CACHE) {
             draw_vertex_cache_invalidate(draw);
diff --git a/src/mesa/pipe/draw/draw_vertex_fetch.c b/src/mesa/pipe/draw/draw_vertex_fetch.c
index af3983b7f0..143acdd3b4 100644
--- a/src/mesa/pipe/draw/draw_vertex_fetch.c
+++ b/src/mesa/pipe/draw/draw_vertex_fetch.c
@@ -158,6 +158,14 @@ FETCH_ATTRIB( A8R8G8B8_UNORM,       4, CVT_8_UNORM )
 
 static fetch_func get_fetch_func( enum pipe_format format )
 {
+#if 0
+   {
+      char tmp[80];
+      pf_sprint_name(tmp, format);
+      _mesa_printf("%s: %s\n", __FUNCTION__, tmp);
+   }
+#endif
+
    switch (format) {
    case PIPE_FORMAT_R64_FLOAT:
       return fetch_R64_FLOAT;
@@ -317,6 +325,8 @@ void draw_update_vertex_fetch( struct draw_context *draw )
 {
    unsigned nr_attrs, i;
 
+//   _mesa_printf("%s\n", __FUNCTION__);
+   
    /* this may happend during context init */
    if (!draw->vertex_shader)
       return;
diff --git a/src/mesa/pipe/draw/draw_vertex_shader.c b/src/mesa/pipe/draw/draw_vertex_shader.c
index 3041974b9a..289c35c7ae 100644
--- a/src/mesa/pipe/draw/draw_vertex_shader.c
+++ b/src/mesa/pipe/draw/draw_vertex_shader.c
@@ -201,6 +201,8 @@ draw_vertex_shader_queue_flush(struct draw_context *draw)
 {
    unsigned i, j;
 
+   assert(draw->vs.queue_nr != 0);
+
    /* XXX: do this on statechange: 
     */
    draw_update_vertex_fetch( draw );
-- 
cgit v1.2.3


From 88858e046888d0bcb763537adc74a64e564678df Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Mon, 28 Jan 2008 12:40:29 +0000
Subject: gallium: add a couple of hardwired vertex fetch functions

---
 src/mesa/pipe/draw/draw_private.h       |   9 +-
 src/mesa/pipe/draw/draw_vertex_fetch.c  | 150 +++++++++++++++++++++++++++-----
 src/mesa/pipe/draw/draw_vertex_shader.c |  12 ++-
 3 files changed, 141 insertions(+), 30 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/draw/draw_private.h b/src/mesa/pipe/draw/draw_private.h
index 1e59f5bd8d..21de400676 100644
--- a/src/mesa/pipe/draw/draw_private.h
+++ b/src/mesa/pipe/draw/draw_private.h
@@ -141,6 +141,10 @@ struct draw_vertex_shader {
 /* Internal function for vertex fetch.
  */
 typedef void (*fetch_func)(const void *ptr, float *attrib);
+typedef void (*full_fetch_func)( struct draw_context *draw,
+				 struct tgsi_exec_machine *machine,
+				 const unsigned *elts,
+				 unsigned count );
 
 
@@ -210,6 +214,7 @@ struct draw_context
       unsigned pitch[PIPE_ATTRIB_MAX];
       fetch_func fetch[PIPE_ATTRIB_MAX];
       unsigned nr_attrs;
+      full_fetch_func fetch_func;
    } vertex_fetch;
 
    /* Post-tnl vertex cache:
@@ -287,10 +292,6 @@ extern void draw_vertex_shader_queue_flush_llvm( struct draw_context *draw );
 struct tgsi_exec_machine;
 
 extern void draw_update_vertex_fetch( struct draw_context *draw );
-extern void draw_vertex_fetch( struct draw_context *draw,
-			       struct tgsi_exec_machine *machine,
-			       const unsigned *elts,
-			       unsigned count );
 
 
 #define DRAW_FLUSH_SHADER_QUEUE              0x1 /* sized not to overflow, never raised */
diff --git a/src/mesa/pipe/draw/draw_vertex_fetch.c b/src/mesa/pipe/draw/draw_vertex_fetch.c
index 143acdd3b4..afdf1971d2 100644
--- a/src/mesa/pipe/draw/draw_vertex_fetch.c
+++ b/src/mesa/pipe/draw/draw_vertex_fetch.c
@@ -320,42 +320,101 @@ transpose_4x4( float *out, const float *in )
 }
 
 
-			       
-void draw_update_vertex_fetch( struct draw_context *draw )
+
+static void fetch_xyz_rgb( struct draw_context *draw,
+			   struct tgsi_exec_machine *machine,
+			   const unsigned *elts,
+			   unsigned count )
 {
-   unsigned nr_attrs, i;
+   assert(count <= 4);
 
 //   _mesa_printf("%s\n", __FUNCTION__);
-   
-   /* this may happend during context init */
-   if (!draw->vertex_shader)
-      return;
 
-   nr_attrs = draw->vertex_shader->state->num_inputs;
+   /* loop over vertex attributes (vertex shader inputs)
+    */
 
-   for (i = 0; i < nr_attrs; i++) {
-      unsigned buf = draw->vertex_element[i].vertex_buffer_index;
-      enum pipe_format format  = draw->vertex_element[i].src_format;
+   const unsigned *pitch   = draw->vertex_fetch.pitch;
+   const ubyte **src       = draw->vertex_fetch.src_ptr;
+   int i;
+
+   for (i = 0; i < 4; i++) {
+      {
+	 const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
+	 float *out = &machine->Inputs[0].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
+
+      {
+	 const float *in = (const float *)(src[1] + elts[i] * pitch[1]);
+	 float *out = &machine->Inputs[1].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
+   }
+}
 
-      draw->vertex_fetch.src_ptr[i] = (const ubyte *) draw->user.vbuffer[buf] + 
-						       draw->vertex_buffer[buf].buffer_offset + 
-						       draw->vertex_element[i].src_offset;
 
-      draw->vertex_fetch.pitch[i] = draw->vertex_buffer[buf].pitch;
-      draw->vertex_fetch.fetch[i] = get_fetch_func( format );
-   }
 
-   draw->vertex_fetch.nr_attrs = nr_attrs;
+
+static void fetch_xyz_rgb_st( struct draw_context *draw,
+			      struct tgsi_exec_machine *machine,
+			      const unsigned *elts,
+			      unsigned count )
+{
+   assert(count <= 4);
+
+   /* Hardwired fetch: xyz -> Inputs[0], rgb -> Inputs[1], st -> Inputs[2].
+    * Reads elts[0..3] whatever 'count' is; w is set to 1.0, st's z to 0.0. */
+
+   const unsigned *pitch   = draw->vertex_fetch.pitch;
+   const ubyte **src       = draw->vertex_fetch.src_ptr;
+   int i;
+
+   for (i = 0; i < 4; i++) {
+      {
+	 const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
+	 float *out = &machine->Inputs[0].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
+
+      {
+	 const float *in = (const float *)(src[1] + elts[i] * pitch[1]);
+	 float *out = &machine->Inputs[1].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
+
+      {
+	 const float *in = (const float *)(src[2] + elts[i] * pitch[2]);
+	 float *out = &machine->Inputs[2].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = 0.0f;
+ 	 out[12] = 1.0f;
+      }
+   }
 }
 
 
+
+
 /**
  * Fetch vertex attributes for 'count' vertices.
  */
-void draw_vertex_fetch( struct draw_context *draw,
-			struct tgsi_exec_machine *machine,
-			const unsigned *elts,
-			unsigned count )
+static void generic_vertex_fetch( struct draw_context *draw,
+				  struct tgsi_exec_machine *machine,
+				  const unsigned *elts,
+				  unsigned count )
 {
    unsigned nr_attrs = draw->vertex_fetch.nr_attrs;
    unsigned attr;
@@ -402,3 +461,50 @@ void draw_vertex_fetch( struct draw_context *draw,
    }
 }
 
+
+			       
+/* Recompute per-attribute fetch state (source pointer, pitch, fetch
+ * function) and pick a specialised whole-vertex fetch_func if one matches.
+ */
+void draw_update_vertex_fetch( struct draw_context *draw )
+{
+   unsigned nr_attrs, i;
+
+   /* this may happen during context init */
+   if (!draw->vertex_shader)
+      return;
+
+   nr_attrs = draw->vertex_shader->state->num_inputs;
+
+   for (i = 0; i < nr_attrs; i++) {
+      unsigned buf = draw->vertex_element[i].vertex_buffer_index;
+      enum pipe_format format = draw->vertex_element[i].src_format;
+
+      draw->vertex_fetch.src_ptr[i] = (const ubyte *) draw->user.vbuffer[buf] + 
+						       draw->vertex_buffer[buf].buffer_offset + 
+						       draw->vertex_element[i].src_offset;
+
+      draw->vertex_fetch.pitch[i] = draw->vertex_buffer[buf].pitch;
+      draw->vertex_fetch.fetch[i] = get_fetch_func( format );
+   }
+
+   draw->vertex_fetch.nr_attrs = nr_attrs;
+
+   draw->vertex_fetch.fetch_func = generic_vertex_fetch;
+
+   switch (nr_attrs) {
+   case 2:
+      if (draw->vertex_element[0].src_format == PIPE_FORMAT_R32G32B32_FLOAT &&
+	  draw->vertex_element[1].src_format == PIPE_FORMAT_R32G32B32_FLOAT)
+	 draw->vertex_fetch.fetch_func = fetch_xyz_rgb;
+      break;
+   case 3:
+      if (draw->vertex_element[0].src_format == PIPE_FORMAT_R32G32B32_FLOAT &&
+	  draw->vertex_element[1].src_format == PIPE_FORMAT_R32G32B32_FLOAT &&
+	  draw->vertex_element[2].src_format == PIPE_FORMAT_R32G32_FLOAT)
+	 draw->vertex_fetch.fetch_func = fetch_xyz_rgb_st;
+      break;
+   default:
+      break;
+   }
+}
diff --git a/src/mesa/pipe/draw/draw_vertex_shader.c b/src/mesa/pipe/draw/draw_vertex_shader.c
index 289c35c7ae..0806e23d6c 100644
--- a/src/mesa/pipe/draw/draw_vertex_shader.c
+++ b/src/mesa/pipe/draw/draw_vertex_shader.c
@@ -110,7 +110,7 @@ run_vertex_program(struct draw_context *draw,
    machine->Inputs = ALIGN16_ASSIGN(inputs);
    machine->Outputs = ALIGN16_ASSIGN(outputs);
 
-   draw_vertex_fetch( draw, machine, elts, count );
+   draw->vertex_fetch.fetch_func( draw, machine, elts, count );
 
    /* run shader */
 #if defined(__i386__) || defined(__386__)
@@ -219,14 +219,18 @@ draw_vertex_shader_queue_flush(struct draw_context *draw)
    for (i = 0; i < draw->vs.queue_nr; i += 4) {
       struct vertex_header *dests[4];
       unsigned elts[4];
-      int n;
+      int n = MIN2(4, draw->vs.queue_nr - i);
 
-      for (j = 0; j < 4; j++) {
+      for (j = 0; j < n; j++) {
          elts[j] = draw->vs.queue[i + j].elt;
          dests[j] = draw->vs.queue[i + j].dest;
       }
 
-      n = MIN2(4, draw->vs.queue_nr - i);
+      for ( ; j < 4; j++) {
+	 elts[j] = elts[0];
+	 dests[j] = dests[0];
+      }
+
       assert(n > 0);
       assert(n <= 4);
 
-- 
cgit v1.2.3


From 82d9063708539d53c7670b2ab732bed24230b94d Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Mon, 28 Jan 2008 15:50:02 +0000
Subject: gallium: handle flatshading explicitly in clipper stage

We can do a better job in the clip stage than just relying on the
brute-force approach of copying colors to all incoming vertices applied
in the flatshade stage.

At very least, it is only necessary to do this in the clipper when a
primitive is actually being clipped.
---
 src/mesa/pipe/draw/draw_clip.c     | 136 +++++++++++++++++++++++++------------
 src/mesa/pipe/draw/draw_validate.c |  10 +--
 2 files changed, 99 insertions(+), 47 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/draw/draw_clip.c b/src/mesa/pipe/draw/draw_clip.c
index 2d410e3244..da20028904 100644
--- a/src/mesa/pipe/draw/draw_clip.c
+++ b/src/mesa/pipe/draw/draw_clip.c
@@ -33,6 +33,8 @@
 
 
 #include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+
 #include "draw_context.h"
 #include "draw_private.h"
 
@@ -54,6 +56,12 @@
 struct clipper {
    struct draw_stage stage;      /**< base class */
 
+   /* Basically duplicate some of the flatshading logic here:
+    */
+   boolean flat;             /**< copy of rasterizer->flatshade, set in clip_init_state */
+   uint num_color_attribs;   /**< number of valid entries in color_attribs[] */
+   uint color_attribs[4];  /* VS output slots with COLOR/BCOLOR semantics: front/back primary/secondary colors */
+
    float (*plane)[4];
 };
 
@@ -82,6 +90,17 @@ static void interp_attr( float *fdst,
    fdst[3] = LINTERP( t, fout[3], fin[3] );
 }
 
+static void copy_colors( struct draw_stage *stage,
+			 struct vertex_header *dst,
+			 const struct vertex_header *src )
+{
+   const struct clipper *clip = clipper_stage(stage);
+   const uint *attr = clip->color_attribs;
+   const uint *end = attr + clip->num_color_attribs;
+   for ( ; attr != end; attr++) {   /* each recorded color attribute slot */
+      COPY_4FV(dst->data[*attr], src->data[*attr]);
+   }
+}
 
 
@@ -134,27 +153,11 @@ static void interp( const struct clipper *clip,
    }
 }
 
-#if 0   
-static INLINE void do_tri( struct draw_stage *next,
-			   struct prim_header *header )
-{
-   unsigned i;
-   for (i = 0; i < 3; i++) {
-      float *ndc = header->v[i]->data[0];
-      _mesa_printf("ndc %f %f %f\n", ndc[0], ndc[1], ndc[2]);
-      assert(ndc[0] >= -1 && ndc[0] <= 641);
-      assert(ndc[1] >= 30 && ndc[1] <= 481);
-   }
-   _mesa_printf("\n");
-   next->tri(next, header);
-}
-#endif
-
 
 static void emit_poly( struct draw_stage *stage,
 		       struct vertex_header **inlist,
 		       unsigned n,
-                       const struct prim_header *origPrim)
+		       const struct prim_header *origPrim)
 {
    struct prim_header header;
    unsigned i;
@@ -163,16 +166,16 @@ static void emit_poly( struct draw_stage *stage,
    header.det = origPrim->det;
 
    for (i = 2; i < n; i++) {
-      header.v[0] = inlist[0];
-      header.v[1] = inlist[i-1];
-      header.v[2] = inlist[i];
+      header.v[0] = inlist[i-1];
+      header.v[1] = inlist[i];
+      header.v[2] = inlist[0];	/* keep in v[2] for flatshading */
 	
       {
-	 unsigned tmp0 = header.v[0]->edgeflag;
+	 unsigned tmp1 = header.v[1]->edgeflag;
 	 unsigned tmp2 = header.v[2]->edgeflag;
 
-	 if (i != 2)   header.v[0]->edgeflag = 0;
-	 if (i != n-1) header.v[2]->edgeflag = 0;
+	 if (i != n-1) header.v[1]->edgeflag = 0;
+	 if (i != 2)   header.v[2]->edgeflag = 0;
 
          header.edgeflags = ((header.v[0]->edgeflag << 0) | 
                              (header.v[1]->edgeflag << 1) | 
@@ -180,27 +183,13 @@ static void emit_poly( struct draw_stage *stage,
 
 	 stage->next->tri( stage->next, &header );
 
-	 header.v[0]->edgeflag = tmp0;
+	 header.v[1]->edgeflag = tmp1;
 	 header.v[2]->edgeflag = tmp2;
       }
    }
 }
 
 
-#if 0
-static void emit_poly( struct draw_stage *stage )
-{
-   unsigned i;
-
-   for (i = 2; i < n; i++) {
-      header->v[0] = inlist[0];
-      header->v[1] = inlist[i-1];
-      header->v[2] = inlist[i];
-	 
-      stage->next->tri( stage->next, header );
-   }
-}
-#endif
 
 
 /* Clip a triangle against the viewport and user clip planes.
@@ -281,6 +270,18 @@ do_clip_tri( struct draw_stage *stage,
       }
    }
 
+   /* If flat-shading, copy color to new provoking vertex.
+    */
+   if (clipper->flat && inlist[0] != header->v[2]) {
+      if (1) {
+	 inlist[0] = dup_vert(stage, inlist[0], tmpnr++);
+      }
+
+      copy_colors(stage, inlist[0], header->v[2]);
+   }
+
+
+
    /* Emit the polygon as triangles to the setup stage:
     */
    if (n >= 3)
@@ -328,6 +329,10 @@ do_clip_line( struct draw_stage *stage,
 
    if (v0->clipmask) {
       interp( clipper, stage->tmp[0], t0, v0, v1 );
+
+      if (clipper->flat)
+	 copy_colors(stage, stage->tmp[0], v0);
+
       newprim.v[0] = stage->tmp[0];
    }
    else {
@@ -393,8 +398,55 @@ clip_tri( struct draw_stage *stage,
    }
 }
 
-static void clip_flush( struct draw_stage *stage, unsigned flags )
+/* Update state (flatshade flag, color slots capped at color_attribs[] size).
+ * Could be delayed until the first primitive that really requires clipping.
+ */
+static void 
+clip_init_state( struct draw_stage *stage )
+{
+   struct clipper *clipper = clipper_stage( stage );
+   const uint max_attribs =
+      sizeof(clipper->color_attribs) / sizeof(clipper->color_attribs[0]);
+
+   clipper->flat = stage->draw->rasterizer->flatshade;
+   clipper->num_color_attribs = 0;
+   if (clipper->flat) {
+      const struct pipe_shader_state *vs = stage->draw->vertex_shader->state;
+      uint i;
+      for (i = 0; i < vs->num_outputs; i++) {
+	 if (clipper->num_color_attribs < max_attribs &&
+	     (vs->output_semantic_name[i] == TGSI_SEMANTIC_COLOR ||
+	      vs->output_semantic_name[i] == TGSI_SEMANTIC_BCOLOR)) {
+	    clipper->color_attribs[clipper->num_color_attribs++] = i;
+	 }
+      }
+   }
+   stage->tri = clip_tri;
+   stage->line = clip_line;
+}
+
+
+
+static void clip_first_tri( struct draw_stage *stage,
+			    struct prim_header *header )
+{
+   clip_init_state( stage );   /* refreshes clipper state and sets stage->tri = clip_tri */
+   stage->tri( stage, header );   /* re-dispatch through the handler just installed */
+}
+
+static void clip_first_line( struct draw_stage *stage,
+			     struct prim_header *header )
+{
+   clip_init_state( stage );   /* refreshes clipper state and sets stage->line = clip_line */
+   stage->line( stage, header );   /* re-dispatch through the handler just installed */
+}
+
+
+static void clip_flush( struct draw_stage *stage, 
+			     unsigned flags )
 {
+   stage->tri = clip_first_tri;   /* re-arm: the next tri/line re-runs clip_init_state */
+   stage->line = clip_first_line;
    stage->next->flush( stage->next, flags );
 }
 
@@ -420,12 +472,12 @@ struct draw_stage *draw_clip_stage( struct draw_context *draw )
 {
    struct clipper *clipper = CALLOC_STRUCT(clipper);
 
-   draw_alloc_tmps( &clipper->stage, MAX_CLIPPED_VERTICES );
+   draw_alloc_tmps( &clipper->stage, MAX_CLIPPED_VERTICES+1 );
 
    clipper->stage.draw = draw;
    clipper->stage.point = clip_point;
-   clipper->stage.line = clip_line;
-   clipper->stage.tri = clip_tri;
+   clipper->stage.line = clip_first_line;
+   clipper->stage.tri = clip_first_tri;
    clipper->stage.flush = clip_flush;
    clipper->stage.reset_stipple_counter = clip_reset_stipple_counter;
    clipper->stage.destroy = clip_destroy;
diff --git a/src/mesa/pipe/draw/draw_validate.c b/src/mesa/pipe/draw/draw_validate.c
index 86d5a5f814..4375ebabbc 100644
--- a/src/mesa/pipe/draw/draw_validate.c
+++ b/src/mesa/pipe/draw/draw_validate.c
@@ -78,6 +78,11 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage )
       precalc_flat = 1;		/* only needed for triangles really */
       need_det = 1;
    }
+
+   if (draw->rasterizer->flatshade && precalc_flat) {
+      draw->pipeline.flatshade->next = next;
+      next = draw->pipeline.flatshade;
+   }
 	 
    if (draw->rasterizer->offset_cw ||
        draw->rasterizer->offset_ccw) {
@@ -110,13 +115,8 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage )
    {
       draw->pipeline.clip->next = next;
       next = draw->pipeline.clip;
-      precalc_flat = 1;		/* XXX: FIX ME! Only needed for clipped prims */
    }
 
-   if (draw->rasterizer->flatshade && precalc_flat) {
-      draw->pipeline.flatshade->next = next;
-      next = draw->pipeline.flatshade;
-   }
    
    draw->pipeline.first = next;
    return next;
-- 
cgit v1.2.3


From 16ed55c6412d2bdc5bff78e99114490223fb4afe Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 09:23:29 -0700
Subject: gallium: check if surface has defined status in
 check_clear_depth_with_quad()

This was part of Keith's patch from Friday.
---
 src/mesa/state_tracker/st_cb_clear.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/mesa/state_tracker/st_cb_clear.c b/src/mesa/state_tracker/st_cb_clear.c
index 758d4a4086..0cd469c156 100644
--- a/src/mesa/state_tracker/st_cb_clear.c
+++ b/src/mesa/state_tracker/st_cb_clear.c
@@ -408,7 +408,9 @@ check_clear_depth_with_quad(GLcontext *ctx, struct gl_renderbuffer *rb)
    const struct st_renderbuffer *strb = st_renderbuffer(rb);
    const GLboolean isDS = is_depth_stencil_format(strb->surface->format);
    return  ctx->Scissor.Enabled
-      || (isDS && ctx->DrawBuffer->Visual.stencilBits > 0);
+      || (isDS && 
+	  strb->surface->status == PIPE_SURFACE_STATUS_DEFINED &&
+	  ctx->DrawBuffer->Visual.stencilBits > 0);
 }
 
 
-- 
cgit v1.2.3


From 2194675196260c0a5d44242d698b85c86f84074b Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Sun, 27 Jan 2008 12:01:47 -0700
Subject: Cell: generalize the batch buffer code for vertex buffers...

---
 src/mesa/pipe/cell/common.h           |  8 ++--
 src/mesa/pipe/cell/ppu/cell_batch.c   | 84 ++++++++++++++++++++---------------
 src/mesa/pipe/cell/ppu/cell_batch.h   |  3 ++
 src/mesa/pipe/cell/ppu/cell_context.c |  5 ++-
 src/mesa/pipe/cell/ppu/cell_context.h | 10 +++--
 src/mesa/pipe/cell/ppu/cell_spu.c     |  4 +-
 src/mesa/pipe/cell/spu/spu_main.c     | 22 ++++-----
 7 files changed, 79 insertions(+), 57 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index 0b63ed39be..ce9c381907 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -81,8 +81,8 @@
 #define CELL_CMD_STATE_VERTEX_INFO   13
 
 
-#define CELL_NUM_BATCH_BUFFERS 3
-#define CELL_BATCH_BUFFER_SIZE 1024  /**< 16KB would be the max */
+#define CELL_NUM_BUFFERS 4
+#define CELL_BUFFER_SIZE (4*1024)  /**< 16KB would be the max */
 
 #define CELL_BUFFER_STATUS_FREE 10
 #define CELL_BUFFER_STATUS_USED 20
@@ -147,7 +147,9 @@ struct cell_init_info
    unsigned id;
    unsigned num_spus;
    struct cell_command *cmd;
-   ubyte *batch_buffers[CELL_NUM_BATCH_BUFFERS];
+
+   /** Buffers for command batches, vertex/index data */
+   ubyte *buffers[CELL_NUM_BUFFERS];
    uint *buffer_status;  /**< points at cell_context->buffer_status */
 } ALIGN16_ATTRIB;
 
diff --git a/src/mesa/pipe/cell/ppu/cell_batch.c b/src/mesa/pipe/cell/ppu/cell_batch.c
index c894ef8608..178caa74e1 100644
--- a/src/mesa/pipe/cell/ppu/cell_batch.c
+++ b/src/mesa/pipe/cell/ppu/cell_batch.c
@@ -31,12 +31,46 @@
 #include "cell_spu.h"
 
 
+
+uint
+cell_get_empty_buffer(struct cell_context *cell)
+{
+   uint buf = 0;
+
+   /* Find a buffer that's marked as free by all SPUs */
+   while (1) {
+      uint spu, num_free = 0;   /* num_free: SPUs seen so far with 'buf' marked FREE */
+      /* NOTE: the only exit from this loop is the return below, so it spins until a buffer is free */
+      for (spu = 0; spu < cell->num_spus; spu++) {
+         if (cell->buffer_status[spu][buf][0] == CELL_BUFFER_STATUS_FREE) {
+            num_free++;
+
+            if (num_free == cell->num_spus) {
+               /* found a free buffer, now mark status as used */
+               for (spu = 0; spu < cell->num_spus; spu++) {
+                  cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_USED;
+               }
+               return buf;
+            }
+         }
+         else {
+            break;   /* this SPU has not marked 'buf' FREE; give up on it */
+         }
+      }
+
+      /* try next buf */
+      buf = (buf + 1) % CELL_NUM_BUFFERS;
+   }
+}
+
+
+
 void
 cell_batch_flush(struct cell_context *cell)
 {
    static boolean flushing = FALSE;
    uint batch = cell->cur_batch;
-   const uint size = cell->batch_buffer_size[batch];
+   const uint size = cell->buffer_size[batch];
    uint spu, cmd_word;
 
    assert(!flushing);
@@ -46,7 +80,7 @@ cell_batch_flush(struct cell_context *cell)
 
    flushing = TRUE;
 
-   assert(batch < CELL_NUM_BATCH_BUFFERS);
+   assert(batch < CELL_NUM_BUFFERS);
 
    /*
    printf("cell_batch_dispatch: buf %u at %p, size %u\n",
@@ -68,28 +102,9 @@ cell_batch_flush(struct cell_context *cell)
     * array indicating that the PPU can re-use the buffer.
     */
 
+   batch = cell_get_empty_buffer(cell);
 
-   /* Find a buffer that's marked as free by all SPUs */
-   while (1) {
-      uint num_free = 0;
-
-      batch = (batch + 1) % CELL_NUM_BATCH_BUFFERS;
-
-      for (spu = 0; spu < cell->num_spus; spu++) {
-         if (cell->buffer_status[spu][batch][0] == CELL_BUFFER_STATUS_FREE)
-            num_free++;
-      }
-
-      if (num_free == cell->num_spus) {
-         /* found a free buffer, now mark status as used */
-         for (spu = 0; spu < cell->num_spus; spu++) {
-            cell->buffer_status[spu][batch][0] = CELL_BUFFER_STATUS_USED;
-         }
-         break;
-      }
-   }
-
-   cell->batch_buffer_size[batch] = 0;  /* empty */
+   cell->buffer_size[batch] = 0;  /* empty */
    cell->cur_batch = batch;
 
    flushing = FALSE;
@@ -99,8 +114,7 @@ cell_batch_flush(struct cell_context *cell)
 uint
 cell_batch_free_space(const struct cell_context *cell)
 {
-   uint free = CELL_BATCH_BUFFER_SIZE
-      - cell->batch_buffer_size[cell->cur_batch];
+   uint free = CELL_BUFFER_SIZE - cell->buffer_size[cell->cur_batch];
    return free;
 }
 
@@ -117,18 +131,18 @@ cell_batch_append(struct cell_context *cell, const void *cmd, uint length)
    assert(length % 4 == 0);
    assert(cell->cur_batch >= 0);
 
-   size = cell->batch_buffer_size[cell->cur_batch];
+   size = cell->buffer_size[cell->cur_batch];
 
-   if (size + length > CELL_BATCH_BUFFER_SIZE) {
+   if (size + length > CELL_BUFFER_SIZE) {
       cell_batch_flush(cell);
       size = 0;
    }
 
-   assert(size + length <= CELL_BATCH_BUFFER_SIZE);
+   assert(size + length <= CELL_BUFFER_SIZE);
 
-   memcpy(cell->batch_buffer[cell->cur_batch] + size, cmd, length);
+   memcpy(cell->buffer[cell->cur_batch] + size, cmd, length);
 
-   cell->batch_buffer_size[cell->cur_batch] = size + length;
+   cell->buffer_size[cell->cur_batch] = size + length;
 }
 
 
@@ -142,18 +156,18 @@ cell_batch_alloc(struct cell_context *cell, uint bytes)
 
    assert(cell->cur_batch >= 0);
 
-   size = cell->batch_buffer_size[cell->cur_batch];
+   size = cell->buffer_size[cell->cur_batch];
 
-   if (size + bytes > CELL_BATCH_BUFFER_SIZE) {
+   if (size + bytes > CELL_BUFFER_SIZE) {
       cell_batch_flush(cell);
       size = 0;
    }
 
-   assert(size + bytes <= CELL_BATCH_BUFFER_SIZE);
+   assert(size + bytes <= CELL_BUFFER_SIZE);
 
-   pos = (void *) (cell->batch_buffer[cell->cur_batch] + size);
+   pos = (void *) (cell->buffer[cell->cur_batch] + size);
 
-   cell->batch_buffer_size[cell->cur_batch] = size + bytes;
+   cell->buffer_size[cell->cur_batch] = size + bytes;
 
    return pos;
 }
diff --git a/src/mesa/pipe/cell/ppu/cell_batch.h b/src/mesa/pipe/cell/ppu/cell_batch.h
index c4ba7feb3d..b4c96f465a 100644
--- a/src/mesa/pipe/cell/ppu/cell_batch.h
+++ b/src/mesa/pipe/cell/ppu/cell_batch.h
@@ -35,6 +35,9 @@
 struct cell_context;
 
 
+extern uint
+cell_get_empty_buffer(struct cell_context *cell);
+
 extern void
 cell_batch_flush(struct cell_context *cell);
 
diff --git a/src/mesa/pipe/cell/ppu/cell_context.c b/src/mesa/pipe/cell/ppu/cell_context.c
index 8cb0c48f40..e8020a49bc 100644
--- a/src/mesa/pipe/cell/ppu/cell_context.c
+++ b/src/mesa/pipe/cell/ppu/cell_context.c
@@ -254,8 +254,9 @@ cell_create_context(struct pipe_winsys *winsys, struct cell_winsys *cws)
 
    cell_start_spus(cell);
 
-   for (buf = 0; buf < CELL_NUM_BATCH_BUFFERS; buf++) {
-      cell->batch_buffer_size[buf] = 0;
+   /* init command, vertex/index buffer info */
+   for (buf = 0; buf < CELL_NUM_BUFFERS; buf++) {
+      cell->buffer_size[buf] = 0;
 
       /* init batch buffer status values,
        * mark 0th buffer as used, rest as free.
diff --git a/src/mesa/pipe/cell/ppu/cell_context.h b/src/mesa/pipe/cell/ppu/cell_context.h
index 3bd88bfd5b..de65fb5e9a 100644
--- a/src/mesa/pipe/cell/ppu/cell_context.h
+++ b/src/mesa/pipe/cell/ppu/cell_context.h
@@ -102,12 +102,14 @@ struct cell_context
 
    uint num_spus;
 
-   uint batch_buffer_size[CELL_NUM_BATCH_BUFFERS];
-   ubyte batch_buffer[CELL_NUM_BATCH_BUFFERS][CELL_BATCH_BUFFER_SIZE] ALIGN16_ATTRIB;
-   int cur_batch;  /**< which batch buffer is being filled */
+   /** Buffers for command batches, vertex/index data */
+   uint buffer_size[CELL_NUM_BUFFERS];
+   ubyte buffer[CELL_NUM_BUFFERS][CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
+
+   int cur_batch;  /**< which buffer is being filled w/ commands */
 
    /** [4] to ensure 16-byte alignment for each status word */
-   uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BATCH_BUFFERS][4] ALIGN16_ATTRIB;
+   uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BUFFERS][4] ALIGN16_ATTRIB;
 
 };
 
diff --git a/src/mesa/pipe/cell/ppu/cell_spu.c b/src/mesa/pipe/cell/ppu/cell_spu.c
index 4627bc8d1f..7c83a47e57 100644
--- a/src/mesa/pipe/cell/ppu/cell_spu.c
+++ b/src/mesa/pipe/cell/ppu/cell_spu.c
@@ -111,8 +111,8 @@ cell_start_spus(struct cell_context *cell)
       cell_global.inits[i].id = i;
       cell_global.inits[i].num_spus = cell->num_spus;
       cell_global.inits[i].cmd = &cell_global.command[i];
-      for (j = 0; j < CELL_NUM_BATCH_BUFFERS; j++) {
-         cell_global.inits[i].batch_buffers[j] = cell->batch_buffer[j];
+      for (j = 0; j < CELL_NUM_BUFFERS; j++) {
+         cell_global.inits[i].buffers[j] = cell->buffer[j];
       }
       cell_global.inits[i].buffer_status = &cell->buffer_status[0][0][0];
 
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 0c83900a18..2097683b82 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -473,22 +473,22 @@ cmd_finish(void)
 
 
 /**
- * Tell the PPU that this SPU has finished copying a batch buffer to
+ * Tell the PPU that this SPU has finished copying a buffer to
  * local store and that it may be reused by the PPU.
  * This is done by writting a 16-byte batch-buffer-status block back into
- * main memory (in cell_contex->buffer_status[]).
+ * main memory (in cell_context->buffer_status[]).
  */
 static void
-release_batch_buffer(uint buffer)
+release_buffer(uint buffer)
 {
    /* Evidently, using less than a 16-byte status doesn't work reliably */
    static const uint status[4] ALIGN16_ATTRIB
       = {CELL_BUFFER_STATUS_FREE, 0, 0, 0};
 
-   const uint index = 4 * (spu.init.id * CELL_NUM_BATCH_BUFFERS + buffer);
+   const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer);
    uint *dst = spu.init.buffer_status + index;
 
-   ASSERT(buffer < CELL_NUM_BATCH_BUFFERS);
+   ASSERT(buffer < CELL_NUM_BUFFERS);
 
    /*
    printf("SPU %u: Set batch status buf=%u, index %u, at %p to FREE\n",
@@ -513,24 +513,24 @@ cmd_batch(uint opcode)
 {
    const uint buf = (opcode >> 8) & 0xff;
    uint size = (opcode >> 16);
-   uint buffer[CELL_BATCH_BUFFER_SIZE / 4] ALIGN16_ATTRIB;
+   uint buffer[CELL_BUFFER_SIZE / 4] ALIGN16_ATTRIB;
    const uint usize = size / sizeof(uint);
    uint pos;
 
    if (Debug)
       printf("SPU %u: BATCH buffer %u, len %u, from %p\n",
-             spu.init.id, buf, size, spu.init.batch_buffers[buf]);
+             spu.init.id, buf, size, spu.init.buffers[buf]);
 
    ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH);
 
-   ASSERT_ALIGN16(spu.init.batch_buffers[buf]);
+   ASSERT_ALIGN16(spu.init.buffers[buf]);
 
    size = ROUNDUP16(size);
 
-   ASSERT_ALIGN16(spu.init.batch_buffers[buf]);
+   ASSERT_ALIGN16(spu.init.buffers[buf]);
 
    mfc_get(buffer,  /* dest */
-           (unsigned int) spu.init.batch_buffers[buf],  /* src */
+           (unsigned int) spu.init.buffers[buf],  /* src */
            size,
            TAG_BATCH_BUFFER,
            0, /* tid */
@@ -538,7 +538,7 @@ cmd_batch(uint opcode)
    wait_on_mask(1 << TAG_BATCH_BUFFER);
 
    /* Tell PPU we're done copying the buffer to local store */
-   release_batch_buffer(buf);
+   release_buffer(buf);
 
    for (pos = 0; pos < usize; /* no incr */) {
       switch (buffer[pos]) {
-- 
cgit v1.2.3


From aaea9a121bc739db87e539214c23f76d4cd5bf49 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 09:57:13 -0700
Subject: Cell: additional debug code, misc clean-up

---
 src/mesa/pipe/cell/ppu/cell_batch.c | 52 +++++++++++++++++++++++++++++--------
 src/mesa/pipe/cell/ppu/cell_batch.h |  2 +-
 2 files changed, 42 insertions(+), 12 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/ppu/cell_batch.c b/src/mesa/pipe/cell/ppu/cell_batch.c
index 178caa74e1..2d032fc902 100644
--- a/src/mesa/pipe/cell/ppu/cell_batch.c
+++ b/src/mesa/pipe/cell/ppu/cell_batch.c
@@ -35,7 +35,7 @@
 uint
 cell_get_empty_buffer(struct cell_context *cell)
 {
-   uint buf = 0;
+   uint buf = 0, tries = 0;
 
    /* Find a buffer that's marked as free by all SPUs */
    while (1) {
@@ -50,6 +50,9 @@ cell_get_empty_buffer(struct cell_context *cell)
                for (spu = 0; spu < cell->num_spus; spu++) {
                   cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_USED;
                }
+               /*
+               printf("PPU: ALLOC BUFFER %u\n", buf);
+               */
                return buf;
             }
          }
@@ -60,11 +63,17 @@ cell_get_empty_buffer(struct cell_context *cell)
 
       /* try next buf */
       buf = (buf + 1) % CELL_NUM_BUFFERS;
+
+      tries++;
+      if (tries == 100) {
+         /*
+         printf("PPU WAITING for buffer...\n");
+         */
+      }
    }
 }
 
 
-
 void
 cell_batch_flush(struct cell_context *cell)
 {
@@ -120,29 +129,39 @@ cell_batch_free_space(const struct cell_context *cell)
 
 
 /**
- * \param cmd  command to append
- * \param length  command size in bytes
+ * Append 'bytes' bytes of 'data' to current batch (flushed first if it can't fit).
  */
 void
-cell_batch_append(struct cell_context *cell, const void *cmd, uint length)
+cell_batch_append(struct cell_context *cell, const void *data, uint bytes)
 {
    uint size;
 
-   assert(length % 4 == 0);
-   assert(cell->cur_batch >= 0);
+   ASSERT(bytes % 4 == 0);
+   ASSERT(bytes <= CELL_BUFFER_SIZE);
+   ASSERT(cell->cur_batch >= 0);
+
+#ifdef ASSERT
+   {
+      uint spu;
+      for (spu = 0; spu < cell->num_spus; spu++) {
+         ASSERT(cell->buffer_status[spu][cell->cur_batch][0]
+                 == CELL_BUFFER_STATUS_USED);
+      }
+   }
+#endif
 
    size = cell->buffer_size[cell->cur_batch];
 
-   if (size + length > CELL_BUFFER_SIZE) {
+   if (size + bytes > CELL_BUFFER_SIZE) {
       cell_batch_flush(cell);
       size = 0;
    }
 
-   assert(size + length <= CELL_BUFFER_SIZE);
+   assert(size + bytes <= CELL_BUFFER_SIZE);
 
-   memcpy(cell->buffer[cell->cur_batch] + size, cmd, length);
+   memcpy(cell->buffer[cell->cur_batch] + size, data, bytes);
 
-   cell->buffer_size[cell->cur_batch] = size + length;
+   cell->buffer_size[cell->cur_batch] = size + bytes;
 }
 
 
@@ -153,9 +172,20 @@ cell_batch_alloc(struct cell_context *cell, uint bytes)
    uint size;
 
    ASSERT(bytes % 4 == 0);
+   ASSERT(bytes <= CELL_BUFFER_SIZE);
 
    assert(cell->cur_batch >= 0);
 
+#ifdef ASSERT
+   {
+      uint spu;
+      for (spu = 0; spu < cell->num_spus; spu++) {
+         ASSERT(cell->buffer_status[spu][cell->cur_batch][0]
+                 == CELL_BUFFER_STATUS_USED);
+      }
+   }
+#endif
+
    size = cell->buffer_size[cell->cur_batch];
 
    if (size + bytes > CELL_BUFFER_SIZE) {
diff --git a/src/mesa/pipe/cell/ppu/cell_batch.h b/src/mesa/pipe/cell/ppu/cell_batch.h
index b4c96f465a..f4f37314a4 100644
--- a/src/mesa/pipe/cell/ppu/cell_batch.h
+++ b/src/mesa/pipe/cell/ppu/cell_batch.h
@@ -45,7 +45,7 @@ extern uint
 cell_batch_free_space(const struct cell_context *cell);
 
 extern void
-cell_batch_append(struct cell_context *cell, const void *cmd, uint length);
+cell_batch_append(struct cell_context *cell, const void *data, uint bytes);
 
 extern void *
 cell_batch_alloc(struct cell_context *cell, uint bytes);
-- 
cgit v1.2.3


From 200dcb4760960f0d9c74a7053de63337e93dd85b Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 09:57:51 -0700
Subject: Cell: If flushing for swapbuffers, wait for frame completion

---
 src/mesa/pipe/cell/ppu/cell_flush.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/ppu/cell_flush.c b/src/mesa/pipe/cell/ppu/cell_flush.c
index b98bb566b1..cf4e676645 100644
--- a/src/mesa/pipe/cell/ppu/cell_flush.c
+++ b/src/mesa/pipe/cell/ppu/cell_flush.c
@@ -39,6 +39,9 @@ cell_flush(struct pipe_context *pipe, unsigned flags)
 {
    struct cell_context *cell = cell_context(pipe);
 
+   if (flags & PIPE_FLUSH_SWAPBUFFERS)
+      flags |= PIPE_FLUSH_WAIT;
+
    draw_flush( cell->draw );
    cell_flush_int(pipe, flags);
 }
-- 
cgit v1.2.3


From 7024019d4e6e2a1618e910a127bea8c3b7661a54 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 10:00:27 -0700
Subject: Cell: checkpoint commit: always inline prim indexes into batch buffer

Also, explicit release-vertex-buffer command.
Lots of debug/stale code still in place...
---
 src/mesa/pipe/cell/common.h        |  12 ++++
 src/mesa/pipe/cell/ppu/cell_vbuf.c | 113 ++++++++++++++++++++++++++-----------
 src/mesa/pipe/cell/spu/spu_main.c  | 110 +++++++++++++++++++++++++-----------
 src/mesa/pipe/cell/spu/spu_main.h  |   2 +
 4 files changed, 171 insertions(+), 66 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index ce9c381907..31637ed1cc 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -75,6 +75,7 @@
 #define CELL_CMD_FINISH               3
 #define CELL_CMD_RENDER               4
 #define CELL_CMD_BATCH                5
+#define CELL_CMD_RELEASE_VERTS        6
 #define CELL_CMD_STATE_FRAMEBUFFER   10
 #define CELL_CMD_STATE_DEPTH_STENCIL 11
 #define CELL_CMD_STATE_SAMPLER       12
@@ -124,7 +125,11 @@ struct cell_command_render
    uint vertex_size;  /**< bytes per vertex */
    uint dummy;        /* XXX this dummy field works around a compiler bug */
    uint num_indexes;
+#if 0
    const void *vertex_data;
+#else
+   uint vertex_buf;  /**< which cell->buffer[] contains the vertex data */
+#endif
    const ushort *index_data;
    float xmin, ymin, xmax, ymax;
    boolean inline_indexes;
@@ -132,6 +137,13 @@ struct cell_command_render
 } ALIGN16_ATTRIB;
 
 
+struct cell_command_release_verts
+{
+   int opcode;         /**< CELL_CMD_RELEASE_VERTS */
+   uint vertex_buf;    /**< index into cell->buffer[] of the vertex buffer, in [0, CELL_NUM_BUFFERS-1] */
+};
+
+
 /** XXX unions don't seem to work */
 struct cell_command
 {
diff --git a/src/mesa/pipe/cell/ppu/cell_vbuf.c b/src/mesa/pipe/cell/ppu/cell_vbuf.c
index ee572b3a51..6e12e16fe0 100644
--- a/src/mesa/pipe/cell/ppu/cell_vbuf.c
+++ b/src/mesa/pipe/cell/ppu/cell_vbuf.c
@@ -40,8 +40,8 @@
 
 
 /** Allow prim indexes, verts to be inlined after RENDER command */
-#define ALLOW_INLINE_INDEXES 1
-#define ALLOW_INLINE_VERTS 1
+#define ALLOW_INLINE_INDEXES 01
+#define ALLOW_INLINE_VERTS 0
 
 
 /**
@@ -55,6 +55,9 @@ struct cell_vbuf_render
    uint prim;
    uint vertex_size;
    void *vertex_buffer;
+#if 1
+   uint vertex_buf;
+#endif
 };
 
 
@@ -81,13 +84,52 @@ cell_vbuf_allocate_vertices(struct vbuf_render *vbr,
 {
    struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
    /*printf("Alloc verts %u * %u\n", vertex_size, nr_vertices);*/
+#if 0
    assert(!cvbr->vertex_buffer);
    cvbr->vertex_buffer = align_malloc(vertex_size * nr_vertices, 16);
+#else
+   assert(cvbr->vertex_buf == ~0);
+   cvbr->vertex_buf = cell_get_empty_buffer(cvbr->cell);
+   cvbr->vertex_buffer = cvbr->cell->buffer[cvbr->vertex_buf];
+   printf("%s vertex_buf = %u\n", __FUNCTION__, cvbr->vertex_buf);
+#endif
    cvbr->vertex_size = vertex_size;
    return cvbr->vertex_buffer;
 }
 
 
+static void
+cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices, 
+                           unsigned vertex_size, unsigned vertices_used)
+{
+   struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
+   struct cell_context *cell = cvbr->cell;
+
+   /* Queue a RELEASE_VERTS command for the current vertex buffer, then flush. */
+#if 0
+   align_free(vertices);
+#else
+   printf("%s vertex_buf = %u  count = %u\n",
+          __FUNCTION__, cvbr->vertex_buf, vertices_used);
+
+   {
+      struct cell_command_release_verts *release
+         = (struct cell_command_release_verts *)
+         cell_batch_alloc(cell, sizeof(struct cell_command_release_verts));
+      release->opcode = CELL_CMD_RELEASE_VERTS;
+      release->vertex_buf = cvbr->vertex_buf;
+   }
+
+   cvbr->vertex_buf = ~0;   /* ~0 = no buffer held; cell_vbuf_allocate_vertices asserts on this */
+   cell_flush_int(&cell->pipe, 0x0);/*NEW*/
+#endif
+
+   assert(vertices == cvbr->vertex_buffer);
+   cvbr->vertex_buffer = NULL;
+}
+
+
+
 static void
 cell_vbuf_set_primitive(struct vbuf_render *vbr, unsigned prim)
 {
@@ -124,7 +166,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
       printf("%u %u %u, ", indices[i+0], indices[i+1], indices[i+2]);
    }
    printf("\n");
-#elif 0
+#elif 01
    printf("cell_vbuf_draw() nr_indices = %u nr_verts = %u  indexes = [%u %u %u ...]\n",
           nr_indices, nr_vertices,
           indices[0], indices[1], indices[2]);
@@ -157,28 +199,26 @@ cell_vbuf_draw(struct vbuf_render *vbr,
       const uint index_bytes = ROUNDUP4(nr_indices * 2);
       const uint vertex_bytes = nr_vertices * 4 * cell->vertex_info.size;
 
+      const uint batch_size = sizeof(struct cell_command_render)
+         + index_bytes;
+
       struct cell_command_render *render
          = (struct cell_command_render *)
-         cell_batch_alloc(cell, sizeof(*render));
+         cell_batch_alloc(cell, batch_size);
+
       render->opcode = CELL_CMD_RENDER;
       render->prim_type = cvbr->prim;
 
       render->num_indexes = nr_indices;
-      if (ALLOW_INLINE_INDEXES &&
-          index_bytes <= cell_batch_free_space(cell)) {
-         /* indices inlined, right after render cmd */
-         void *dst = cell_batch_alloc(cell, index_bytes);
-         memcpy(dst, indices, nr_indices * 2);
-         render->inline_indexes = TRUE;
-         render->index_data = NULL;
-      }
-      else {
-         /* indices in separate buffer */
-         render->inline_indexes = FALSE;
-         render->index_data = indices;
-         ASSERT_ALIGN16(render->index_data);
-      }
 
+      /* append indices after render command */
+      memcpy(render + 1, indices, nr_indices * 2);
+      render->inline_indexes = TRUE;
+      render->index_data = NULL;
+
+      /* if there's room, append vertices after the indices, else leave
+       * vertices in the original/separate buffer.
+       */
       render->vertex_size = 4 * cell->vertex_info.size;
       render->num_verts = nr_vertices;
       if (ALLOW_INLINE_VERTS &&
@@ -188,12 +228,21 @@ cell_vbuf_draw(struct vbuf_render *vbr,
          void *dst = cell_batch_alloc(cell, vertex_bytes);
          memcpy(dst, vertices, vertex_bytes);
          render->inline_verts = TRUE;
+#if 0
          render->vertex_data = NULL;
+#else
+         render->vertex_buf = ~0;
+#endif
       }
       else {
          render->inline_verts = FALSE;
+#if 0
          render->vertex_data = vertices;
          ASSERT_ALIGN16(render->vertex_data);
+#else
+         ASSERT(cvbr->vertex_buf >= 0);
+         render->vertex_buf = cvbr->vertex_buf;
+#endif
       }
 
 
@@ -203,27 +252,13 @@ cell_vbuf_draw(struct vbuf_render *vbr,
       render->ymax = ymax;
    }
 
-#if 01
+#if 0
    /* XXX this is temporary */
    cell_flush_int(&cell->pipe, PIPE_FLUSH_WAIT);
 #endif
 }
 
 
-static void
-cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices, 
-                           unsigned vertex_size, unsigned vertices_used)
-{
-   struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
-
-   /*printf("Free verts %u * %u\n", vertex_size, vertices_used);*/
-   align_free(vertices);
-
-   assert(vertices == cvbr->vertex_buffer);
-   cvbr->vertex_buffer = NULL;
-}
-
-
 static void
 cell_vbuf_destroy(struct vbuf_render *vbr)
 {
@@ -244,8 +279,17 @@ cell_init_vbuf(struct cell_context *cell)
 
    cell->vbuf_render = CALLOC_STRUCT(cell_vbuf_render);
 
+#if 0
    cell->vbuf_render->base.max_indices = CELL_MAX_VBUF_INDEXES;
    cell->vbuf_render->base.max_vertex_buffer_bytes = CELL_MAX_VBUF_SIZE;
+#else
+   cell->vbuf_render->base.max_indices
+      = (CELL_BUFFER_SIZE
+         - sizeof(struct cell_command_render)
+         - sizeof(struct cell_command_release_verts))
+      / sizeof(ushort);
+   cell->vbuf_render->base.max_vertex_buffer_bytes = CELL_BUFFER_SIZE;
+#endif
 
    cell->vbuf_render->base.get_vertex_info = cell_vbuf_get_vertex_info;
    cell->vbuf_render->base.allocate_vertices = cell_vbuf_allocate_vertices;
@@ -255,6 +299,9 @@ cell_init_vbuf(struct cell_context *cell)
    cell->vbuf_render->base.destroy = cell_vbuf_destroy;
 
    cell->vbuf_render->cell = cell;
+#if 1
+   cell->vbuf_render->vertex_buf = ~0;
+#endif
 
    cell->vbuf = draw_vbuf_stage(cell->draw, &cell->vbuf_render->base);
 }
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 2097683b82..eb979718f8 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -69,6 +69,32 @@ wait_on_mask_all(unsigned tagMask)
 }
 
 
+/**
+ * Tell the PPU that this SPU has finished copying a buffer to
+ * local store and that it may be reused by the PPU.
+ * This is done by writing a 16-byte batch-buffer-status block back into
+ * main memory (in cell_context->buffer_status[]).
+ */
+static void
+release_buffer(uint buffer)
+{
+   /* Evidently, using less than a 16-byte status doesn't work reliably */
+   static const uint status[4] ALIGN16_ATTRIB
+      = {CELL_BUFFER_STATUS_FREE, 0, 0, 0};
+
+   const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer);
+   uint *dst = spu.init.buffer_status + index;
+
+   ASSERT(buffer < CELL_NUM_BUFFERS);
+
+   mfc_put((void *) &status,    /* src in local memory */
+           (unsigned int) dst,  /* dst in main memory */
+           sizeof(status),      /* size */
+           TAG_MISC,            /* tag is unimportant */
+           0, /* tid */
+           0  /* rid */);
+}
+
 
 /**
  * For tiles whose status is TILE_STATUS_CLEAR, write solid-filled
@@ -237,13 +263,18 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       printf("       bound: %g, %g .. %g, %g\n",
              render->xmin, render->ymin, render->xmax, render->ymax);
       */
+      /*
       printf("SPU %u: indices at %p  vertices at %p\n",
              spu.init.id,
              render->index_data, render->vertex_data);
+      */
    }
 
    ASSERT(sizeof(*render) % 4 == 0);
+#if 0
    ASSERT_ALIGN16(render->vertex_data);
+#else
+#endif
    ASSERT_ALIGN16(render->index_data);
 
 
@@ -251,10 +282,18 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
     ** Get vertex, index buffers if not inlined
     **/
    if (!render->inline_verts) {
+      void *src;
       ASSERT(total_vertex_bytes % 16 == 0);
 
+#if 0
+      src = render->vertex_data;
+#else
+      spu.cur_vertex_buf = render->vertex_buf;
+      src = spu.init.buffers[render->vertex_buf];
+#endif
+
       mfc_get(vertex_data,  /* dest */
-              (unsigned int) render->vertex_data,  /* src */
+              (unsigned int) src,
               total_vertex_bytes,  /* size */
               TAG_VERTEX_BUFFER,
               0, /* tid */
@@ -298,6 +337,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
          /* vertices are after indexes, if inlined */
          vertices = (const ubyte *) (render + 1) + *pos_incr * 4;
          *pos_incr = *pos_incr + total_vertex_bytes / 4;
+         spu.cur_vertex_buf = ~0;
       }
    }
 
@@ -310,6 +350,12 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       mask |= (1 << TAG_INDEX_BUFFER);
    wait_on_mask_all(mask);
 
+#if 0
+   if (!render->inline_verts) {
+      printf("SPU %u: release vbuf %u\n", spu.init.id, render->vertex_buf);
+      release_buffer(render->vertex_buf);
+   }
+#endif
 
    /**
     ** find tiles which intersect the prim bounding box
@@ -359,6 +405,14 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       for (j = 0; j < render->num_indexes; j += 3) {
          const float *v0, *v1, *v2;
 
+         if (indexes[j] == 0xffff) {
+            printf("index[%u] = 0xffff\n", j);
+         }
+
+         ASSERT(indexes[j] != 0xffff);
+         ASSERT(indexes[j+1] != 0xffff);
+         ASSERT(indexes[j+2] != 0xffff);
+
          v0 = (const float *) (vertices + indexes[j+0] * vertex_size);
          v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
          v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
@@ -391,6 +445,17 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
 }
 
 
+static void
+cmd_release_verts(const struct cell_command_release_verts *release)
+{
+   if (Debug)
+      printf("SPU %u: RELEASE VERTS %u\n",
+             spu.init.id, spu.cur_vertex_buf);
+   ASSERT(spu.cur_vertex_buf == release->vertex_buf);
+   release_buffer(release->vertex_buf);
+}
+
+
 static void
 cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 {
@@ -472,38 +537,6 @@ cmd_finish(void)
 }
 
 
-/**
- * Tell the PPU that this SPU has finished copying a buffer to
- * local store and that it may be reused by the PPU.
- * This is done by writting a 16-byte batch-buffer-status block back into
- * main memory (in cell_context->buffer_status[]).
- */
-static void
-release_buffer(uint buffer)
-{
-   /* Evidently, using less than a 16-byte status doesn't work reliably */
-   static const uint status[4] ALIGN16_ATTRIB
-      = {CELL_BUFFER_STATUS_FREE, 0, 0, 0};
-
-   const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer);
-   uint *dst = spu.init.buffer_status + index;
-
-   ASSERT(buffer < CELL_NUM_BUFFERS);
-
-   /*
-   printf("SPU %u: Set batch status buf=%u, index %u, at %p to FREE\n",
-          spu.init.id, buffer, index, dst);
-   */
-
-   mfc_put((void *) &status,    /* src in local memory */
-           (unsigned int) dst,  /* dst in main memory */
-           sizeof(status),      /* size */
-           TAG_MISC,            /* tag is unimportant */
-           0, /* tid */
-           0  /* rid */);
-}
-
-
 /**
  * Execute a batch of commands
  * The opcode param encodes the location of the buffer and its size.
@@ -538,6 +571,8 @@ cmd_batch(uint opcode)
    wait_on_mask(1 << TAG_BATCH_BUFFER);
 
    /* Tell PPU we're done copying the buffer to local store */
+   if (Debug)
+      printf("SPU %u: release batch buf %u\n", spu.init.id, buf);
    release_buffer(buf);
 
    for (pos = 0; pos < usize; /* no incr */) {
@@ -567,6 +602,15 @@ cmd_batch(uint opcode)
             pos += sizeof(*render) / 4 + pos_incr;
          }
          break;
+      case CELL_CMD_RELEASE_VERTS:
+         {
+            struct cell_command_release_verts *release
+               = (struct cell_command_release_verts *) &buffer[pos];
+            cmd_release_verts(release);
+            ASSERT(sizeof(*release) == 8);
+            pos += sizeof(*release) / 4;
+         }
+         break;
       case CELL_CMD_FINISH:
          cmd_finish();
          pos += 1;
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index 5bc5d9fa99..68c7263b7f 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -65,6 +65,8 @@ struct spu_global
 
    /* XXX more state to come */
 
+   uint cur_vertex_buf;
+
 } ALIGN16_ATTRIB;
 
 
-- 
cgit v1.2.3


From 5b5ec94663d566b4840975c4ef4740abb138bb12 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 10:41:27 -0700
Subject: Cell: clean-up of render path

Finally removed a number of unneeded flush commands.  Vertex buffers are
allocated from the general buffer pool, freed by SPUs when done.
Still an occasional failed assertion (invalid batch buffer command)...
---
 src/mesa/pipe/cell/common.h        |  12 +---
 src/mesa/pipe/cell/ppu/cell_vbuf.c |  60 ++++++--------------
 src/mesa/pipe/cell/spu/spu_main.c  | 112 +++++++------------------------------
 src/mesa/pipe/cell/spu/spu_main.h  |   2 -
 4 files changed, 38 insertions(+), 148 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index 31637ed1cc..d6e1dd4f7d 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -68,7 +68,7 @@
  * The low byte of a mailbox word contains the command opcode.
  * Remaining higher bytes are command specific.
  */
-#define CELL_CMD_OPCODE_MASK 0xf
+#define CELL_CMD_OPCODE_MASK 0xff
 
 #define CELL_CMD_EXIT                 1
 #define CELL_CMD_CLEAR_SURFACE        2
@@ -113,10 +113,6 @@ struct cell_command_clear_surface
 } ALIGN16_ATTRIB;
 
 
-#define CELL_MAX_VBUF_SIZE    (16 * 1024)
-#define CELL_MAX_VBUF_INDEXES 1024
-
-
 struct cell_command_render
 {
    uint opcode;       /**< CELL_CMD_RENDER */
@@ -125,14 +121,8 @@ struct cell_command_render
    uint vertex_size;  /**< bytes per vertex */
    uint dummy;        /* XXX this dummy field works around a compiler bug */
    uint num_indexes;
-#if 0
-   const void *vertex_data;
-#else
    uint vertex_buf;  /**< which cell->buffer[] contains the vertex data */
-#endif
-   const ushort *index_data;
    float xmin, ymin, xmax, ymax;
-   boolean inline_indexes;
    boolean inline_verts;
 } ALIGN16_ATTRIB;
 
diff --git a/src/mesa/pipe/cell/ppu/cell_vbuf.c b/src/mesa/pipe/cell/ppu/cell_vbuf.c
index 6e12e16fe0..b2a25d767b 100644
--- a/src/mesa/pipe/cell/ppu/cell_vbuf.c
+++ b/src/mesa/pipe/cell/ppu/cell_vbuf.c
@@ -39,9 +39,8 @@
 #include "pipe/draw/draw_vbuf.h"
 
 
-/** Allow prim indexes, verts to be inlined after RENDER command */
-#define ALLOW_INLINE_INDEXES 01
-#define ALLOW_INLINE_VERTS 0
+/** Allow vertex data to be inlined after RENDER command */
+#define ALLOW_INLINE_VERTS 1
 
 
 /**
@@ -52,12 +51,10 @@ struct cell_vbuf_render
 {
    struct vbuf_render base;
    struct cell_context *cell;
-   uint prim;
-   uint vertex_size;
-   void *vertex_buffer;
-#if 1
-   uint vertex_buf;
-#endif
+   uint prim;            /**< PIPE_PRIM_x */
+   uint vertex_size;     /**< in bytes */
+   void *vertex_buffer;  /**< just for debug, really */
+   uint vertex_buf;      /**< in [0, CELL_NUM_BUFFERS-1] */
 };
 
 
@@ -84,15 +81,10 @@ cell_vbuf_allocate_vertices(struct vbuf_render *vbr,
 {
    struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
    /*printf("Alloc verts %u * %u\n", vertex_size, nr_vertices);*/
-#if 0
-   assert(!cvbr->vertex_buffer);
-   cvbr->vertex_buffer = align_malloc(vertex_size * nr_vertices, 16);
-#else
+
    assert(cvbr->vertex_buf == ~0);
    cvbr->vertex_buf = cell_get_empty_buffer(cvbr->cell);
    cvbr->vertex_buffer = cvbr->cell->buffer[cvbr->vertex_buf];
-   printf("%s vertex_buf = %u\n", __FUNCTION__, cvbr->vertex_buf);
-#endif
    cvbr->vertex_size = vertex_size;
    return cvbr->vertex_buffer;
 }
@@ -105,14 +97,13 @@ cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices,
    struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
    struct cell_context *cell = cvbr->cell;
 
-   /*printf("Free verts %u * %u\n", vertex_size, vertices_used);*/
-#if 0
-   align_free(vertices);
-#else
+   /*
    printf("%s vertex_buf = %u  count = %u\n",
           __FUNCTION__, cvbr->vertex_buf, vertices_used);
+   */
 
-   {
+   /* Tell SPUs they can release the vert buf */
+   if (cvbr->vertex_buf != ~0U) {
       struct cell_command_release_verts *release
          = (struct cell_command_release_verts *)
          cell_batch_alloc(cell, sizeof(struct cell_command_release_verts));
@@ -121,8 +112,7 @@ cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices,
    }
 
    cvbr->vertex_buf = ~0;
-   cell_flush_int(&cell->pipe, 0x0);/*NEW*/
-#endif
+   cell_flush_int(&cell->pipe, 0x0);
 
    assert(vertices == cvbr->vertex_buffer);
    cvbr->vertex_buffer = NULL;
@@ -166,7 +156,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
       printf("%u %u %u, ", indices[i+0], indices[i+1], indices[i+2]);
    }
    printf("\n");
-#elif 01
+#elif 0
    printf("cell_vbuf_draw() nr_indices = %u nr_verts = %u  indexes = [%u %u %u ...]\n",
           nr_indices, nr_vertices,
           indices[0], indices[1], indices[2]);
@@ -213,8 +203,6 @@ cell_vbuf_draw(struct vbuf_render *vbr,
 
       /* append indices after render command */
       memcpy(render + 1, indices, nr_indices * 2);
-      render->inline_indexes = TRUE;
-      render->index_data = NULL;
 
       /* if there's room, append vertices after the indices, else leave
        * vertices in the original/separate buffer.
@@ -222,30 +210,20 @@ cell_vbuf_draw(struct vbuf_render *vbr,
       render->vertex_size = 4 * cell->vertex_info.size;
       render->num_verts = nr_vertices;
       if (ALLOW_INLINE_VERTS &&
-         render->inline_indexes &&
           vertex_bytes <= cell_batch_free_space(cell)) {
          /* vertex data inlined, after indices */
          void *dst = cell_batch_alloc(cell, vertex_bytes);
          memcpy(dst, vertices, vertex_bytes);
          render->inline_verts = TRUE;
-#if 0
-         render->vertex_data = NULL;
-#else
          render->vertex_buf = ~0;
-#endif
       }
       else {
+         /* vertex data in separate buffer */
          render->inline_verts = FALSE;
-#if 0
-         render->vertex_data = vertices;
-         ASSERT_ALIGN16(render->vertex_data);
-#else
          ASSERT(cvbr->vertex_buf >= 0);
          render->vertex_buf = cvbr->vertex_buf;
-#endif
       }
 
-
       render->xmin = xmin;
       render->ymin = ymin;
       render->xmax = xmax;
@@ -253,7 +231,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
    }
 
 #if 0
-   /* XXX this is temporary */
+   /* helpful for debug */
    cell_flush_int(&cell->pipe, PIPE_FLUSH_WAIT);
 #endif
 }
@@ -279,17 +257,15 @@ cell_init_vbuf(struct cell_context *cell)
 
    cell->vbuf_render = CALLOC_STRUCT(cell_vbuf_render);
 
-#if 0
-   cell->vbuf_render->base.max_indices = CELL_MAX_VBUF_INDEXES;
-   cell->vbuf_render->base.max_vertex_buffer_bytes = CELL_MAX_VBUF_SIZE;
-#else
+   /* The max number of indexes is what can fit into a batch buffer,
+    * minus the render and release-verts commands.
+    */
    cell->vbuf_render->base.max_indices
       = (CELL_BUFFER_SIZE
          - sizeof(struct cell_command_render)
          - sizeof(struct cell_command_release_verts))
       / sizeof(ushort);
    cell->vbuf_render->base.max_vertex_buffer_bytes = CELL_BUFFER_SIZE;
-#endif
 
    cell->vbuf_render->base.get_vertex_info = cell_vbuf_get_vertex_info;
    cell->vbuf_render->base.allocate_vertices = cell_vbuf_allocate_vertices;
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index eb979718f8..5b50ec6953 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -239,59 +239,45 @@ static void
 cmd_render(const struct cell_command_render *render, uint *pos_incr)
 {
    /* we'll DMA into these buffers */
-   ubyte vertex_data[CELL_MAX_VBUF_SIZE] ALIGN16_ATTRIB;
-   ushort index_data[CELL_MAX_VBUF_INDEXES] ALIGN16_ATTRIB;
+   ubyte vertex_data[CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
    const uint vertex_size = render->vertex_size; /* in bytes */
    const uint total_vertex_bytes = render->num_verts * vertex_size;
    const ubyte *vertices;
    const ushort *indexes;
-   uint mask;
    uint i, j;
 
 
    if (Debug) {
       printf("SPU %u: RENDER prim %u, num_vert=%u  num_ind=%u  "
-             "inline_vert=%u  inline_ind=%u\n",
+             "inline_vert=%u\n",
              spu.init.id,
              render->prim_type,
              render->num_verts,
              render->num_indexes,
-             render->inline_verts,
-             render->inline_indexes);
+             render->inline_verts);
 
       /*
       printf("       bound: %g, %g .. %g, %g\n",
              render->xmin, render->ymin, render->xmax, render->ymax);
       */
-      /*
-      printf("SPU %u: indices at %p  vertices at %p\n",
-             spu.init.id,
-             render->index_data, render->vertex_data);
-      */
    }
 
    ASSERT(sizeof(*render) % 4 == 0);
-#if 0
-   ASSERT_ALIGN16(render->vertex_data);
-#else
-#endif
-   ASSERT_ALIGN16(render->index_data);
+   ASSERT(total_vertex_bytes % 16 == 0);
 
+   /* indexes are right after the render command in the batch buffer */
+   indexes = (const ushort *) (render + 1);
+   *pos_incr = (render->num_indexes * 2 + 3) / 4;
 
-   /**
-    ** Get vertex, index buffers if not inlined
-    **/
-   if (!render->inline_verts) {
-      void *src;
-      ASSERT(total_vertex_bytes % 16 == 0);
-
-#if 0
-      src = render->vertex_data;
-#else
-      spu.cur_vertex_buf = render->vertex_buf;
-      src = spu.init.buffers[render->vertex_buf];
-#endif
 
+   if (render->inline_verts) {
+      /* Vertices are right after indexes in batch buffer */
+      vertices = (const ubyte *) (render + 1) + *pos_incr * 4;
+      *pos_incr = *pos_incr + total_vertex_bytes / 4;
+   }
+   else {
+      /* Begin DMA fetch of vertex buffer */
+      void *src = spu.init.buffers[render->vertex_buf];
       mfc_get(vertex_data,  /* dest */
               (unsigned int) src,
               total_vertex_bytes,  /* size */
@@ -300,63 +286,11 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
               0  /* rid */);
 
       vertices = vertex_data;
-   }
-
-   if (!render->inline_indexes) {
-      uint total_index_bytes;
-
-      *pos_incr = 0;
-
-      total_index_bytes = render->num_indexes * sizeof(ushort);
-      if (total_index_bytes < 16)
-         total_index_bytes = 16;
-      else
-         total_index_bytes = ROUNDUP16(total_index_bytes);
 
-      indexes = index_data;
-
-      /* get index data from main memory */
-      mfc_get(index_data,  /* dest */
-              (unsigned int) render->index_data,  /* src */
-              total_index_bytes,
-              TAG_INDEX_BUFFER,
-              0, /* tid */
-              0  /* rid */);
-   }
-
-
-   /**
-    ** Get pointers to inlined indexes, verts, if present
-    **/
-   if (render->inline_indexes) {
-      /* indexes are right after the render command in the batch buffer */
-      indexes = (ushort *) (render + 1);
-      *pos_incr = (render->num_indexes * 2 + 3) / 4;
-
-      if (render->inline_verts) {
-         /* vertices are after indexes, if inlined */
-         vertices = (const ubyte *) (render + 1) + *pos_incr * 4;
-         *pos_incr = *pos_incr + total_vertex_bytes / 4;
-         spu.cur_vertex_buf = ~0;
-      }
+      wait_on_mask(1 << TAG_VERTEX_BUFFER);
    }
 
 
-   /* wait for vertex and/or index buffers if not inlined */
-   mask = 0x0;
-   if (!render->inline_verts)
-      mask |= (1 << TAG_VERTEX_BUFFER);
-   if (!render->inline_indexes)
-      mask |= (1 << TAG_INDEX_BUFFER);
-   wait_on_mask_all(mask);
-
-#if 0
-   if (!render->inline_verts) {
-      printf("SPU %u: release vbuf %u\n", spu.init.id, render->vertex_buf);
-      release_buffer(render->vertex_buf);
-   }
-#endif
-
    /**
     ** find tiles which intersect the prim bounding box
     **/
@@ -372,7 +306,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
 #endif
 
    /* make sure any pending clears have completed */
-   wait_on_mask(1 << TAG_SURFACE_CLEAR);
+   wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */
 
 
    /**
@@ -405,14 +339,6 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       for (j = 0; j < render->num_indexes; j += 3) {
          const float *v0, *v1, *v2;
 
-         if (indexes[j] == 0xffff) {
-            printf("index[%u] = 0xffff\n", j);
-         }
-
-         ASSERT(indexes[j] != 0xffff);
-         ASSERT(indexes[j+1] != 0xffff);
-         ASSERT(indexes[j+2] != 0xffff);
-
          v0 = (const float *) (vertices + indexes[j+0] * vertex_size);
          v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
          v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
@@ -450,8 +376,8 @@ cmd_release_verts(const struct cell_command_release_verts *release)
 {
    if (Debug)
       printf("SPU %u: RELEASE VERTS %u\n",
-             spu.init.id, spu.cur_vertex_buf);
-   ASSERT(spu.cur_vertex_buf == release->vertex_buf);
+             spu.init.id, release->vertex_buf);
+   ASSERT(release->vertex_buf != ~0U);
    release_buffer(release->vertex_buf);
 }
 
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index 68c7263b7f..5bc5d9fa99 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -65,8 +65,6 @@ struct spu_global
 
    /* XXX more state to come */
 
-   uint cur_vertex_buf;
-
 } ALIGN16_ATTRIB;
 
 
-- 
cgit v1.2.3


From 3f8a8eada693c9501b3e52d47986e46028c172b0 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 10:41:51 -0700
Subject: Cell: remove unneeded flush(), dead code

---
 src/mesa/pipe/cell/ppu/cell_clear.c | 30 +-----------------------------
 1 file changed, 1 insertion(+), 29 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/ppu/cell_clear.c b/src/mesa/pipe/cell/ppu/cell_clear.c
index e01640b994..e61bfd9b0f 100644
--- a/src/mesa/pipe/cell/ppu/cell_clear.c
+++ b/src/mesa/pipe/cell/ppu/cell_clear.c
@@ -48,7 +48,6 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
                    unsigned clearValue)
 {
    struct cell_context *cell = cell_context(pipe);
-   /*uint i;*/
    uint surfIndex;
 
    if (!cell->cbuf_map[0])
@@ -61,29 +60,7 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
       surfIndex = 0;
    }
 
-#if 0
-   for (i = 0; i < cell->num_spus; i++) {
-#if 1
-      uint clr = clearValue;
-      if (surfIndex == 0) {
-         /* XXX debug: clear color varied per-SPU to visualize tiles */
-         if ((clr & 0xff) == 0)
-            clr |= 64 + i * 8;
-         if ((clr & 0xff00) == 0)
-            clr |= (64 + i * 8) << 8;
-         if ((clr & 0xff0000) == 0)
-            clr |= (64 + i * 8) << 16;
-         if ((clr & 0xff000000) == 0)
-            clr |= (64 + i * 8) << 24;
-      }
-      cell_global.command[i].clear.value = clr;
-#else
-      cell_global.command[i].clear.value = clearValue;
-#endif
-      cell_global.command[i].clear.surface = surfIndex;
-      send_mbox_message(cell_global.spe_contexts[i], CELL_CMD_CLEAR_SURFACE);
-   }
-#else
+
    {
       struct cell_command_clear_surface *clr
          = (struct cell_command_clear_surface *)
@@ -92,9 +69,4 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
       clr->surface = surfIndex;
       clr->value = clearValue;
    }
-#endif
-
-   /* XXX temporary */
-   cell_flush(&cell->pipe, 0x0);
-
 }
-- 
cgit v1.2.3


From a8590e097e965c01ede7df47ff752b0e7adabace Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 11:20:47 -0700
Subject: Cell: make sure state commands aren't split across batches

---
 src/mesa/pipe/cell/ppu/cell_state_emit.c | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/ppu/cell_state_emit.c b/src/mesa/pipe/cell/ppu/cell_state_emit.c
index dbca900c35..6776ec88c7 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_emit.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_emit.c
@@ -33,6 +33,17 @@
 
 
+static void
+emit_state_cmd(struct cell_context *cell, uint cmd,
+               const void *state, uint state_size)
+{
+   uint *dst = (uint *) cell_batch_alloc(cell, sizeof(uint) + state_size);
+   *dst = cmd;
+   memcpy(dst + 1, state, state_size);
+}
+
+
+
 void
 cell_emit_state(struct cell_context *cell)
 {
@@ -51,22 +62,18 @@ cell_emit_state(struct cell_context *cell)
    }
 
    if (cell->dirty & CELL_NEW_DEPTH_STENCIL) {
-      uint cmd = CELL_CMD_STATE_DEPTH_STENCIL;
-      cell_batch_append(cell, &cmd, 4);
-      cell_batch_append(cell, cell->depth_stencil,
-                        sizeof(struct pipe_depth_stencil_alpha_state));
+      emit_state_cmd(cell, CELL_CMD_STATE_DEPTH_STENCIL,
+                     cell->depth_stencil,
+                     sizeof(struct pipe_depth_stencil_alpha_state));
    }
 
    if (cell->dirty & CELL_NEW_SAMPLER) {
-      uint cmd = CELL_CMD_STATE_SAMPLER;
-      cell_batch_append(cell, &cmd, 4);
-      cell_batch_append(cell, cell->sampler[0],
-                        sizeof(struct pipe_sampler_state));
+      emit_state_cmd(cell, CELL_CMD_STATE_SAMPLER,
+                     cell->sampler[0], sizeof(struct pipe_sampler_state));
    }
 
    if (cell->dirty & CELL_NEW_VERTEX_INFO) {
-      uint cmd = CELL_CMD_STATE_VERTEX_INFO;
-      cell_batch_append(cell, &cmd, 4);
-      cell_batch_append(cell, &cell->vertex_info, sizeof(struct vertex_info));
+      emit_state_cmd(cell, CELL_CMD_STATE_VERTEX_INFO,
+                     &cell->vertex_info, sizeof(struct vertex_info));
    }
 }
-- 
cgit v1.2.3


From 3d1b0f4c57edaf5707e4952617dcd6c57dfbdc65 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 11:21:25 -0700
Subject: Cell: additional assertions

---
 src/mesa/pipe/cell/spu/spu_main.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 5b50ec6953..62f6a357ba 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -441,9 +441,12 @@ cmd_state_sampler(const struct pipe_sampler_state *state)
 static void
 cmd_state_vertex_info(const struct vertex_info *vinfo)
 {
-   if (Debug)
+   if (Debug) {
       printf("SPU %u: VERTEX_INFO num_attribs=%u\n", spu.init.id,
              vinfo->num_attribs);
+   }
+   ASSERT(vinfo->num_attribs >= 1);
+   ASSERT(vinfo->num_attribs <= 8);
    memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo));
 }
 
-- 
cgit v1.2.3


From c50ba44095ceef6395727769663ed46c63a1a514 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Mon, 28 Jan 2008 20:57:16 +0000
Subject: gallium: fix typos in hardwired fetch path

---
 src/mesa/pipe/draw/draw_vertex_fetch.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/draw/draw_vertex_fetch.c b/src/mesa/pipe/draw/draw_vertex_fetch.c
index afdf1971d2..89e4c256a7 100644
--- a/src/mesa/pipe/draw/draw_vertex_fetch.c
+++ b/src/mesa/pipe/draw/draw_vertex_fetch.c
@@ -396,7 +396,7 @@ static void fetch_xyz_rgb_st( struct draw_context *draw,
 
       {
 	 const float *in = (const float *)(src[2] + elts[i] * pitch[2]);
-	 float *out = &machine->Inputs[1].xyzw[0].f[i];
+	 float *out = &machine->Inputs[2].xyzw[0].f[i];
 	 out[0] = in[0];
 	 out[4] = in[1];
 	 out[8] = 0.0f;
@@ -500,7 +500,7 @@ void draw_update_vertex_fetch( struct draw_context *draw )
    case 3:
       if (draw->vertex_element[0].src_format == PIPE_FORMAT_R32G32B32_FLOAT &&
 	  draw->vertex_element[1].src_format == PIPE_FORMAT_R32G32B32_FLOAT &&
-	  draw->vertex_element[1].src_format == PIPE_FORMAT_R32G32_FLOAT)
+	  draw->vertex_element[2].src_format == PIPE_FORMAT_R32G32_FLOAT)
 	 draw->vertex_fetch.fetch_func = fetch_xyz_rgb_st;
       break;
    default:
-- 
cgit v1.2.3


From 1e2d6b1b82aaa8bc57535e56c5e6eac9387e22e6 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Mon, 28 Jan 2008 20:57:58 +0000
Subject: gallium: remove dead vars, code

---
 src/mesa/pipe/draw/draw_vf_sse.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/draw/draw_vf_sse.c b/src/mesa/pipe/draw/draw_vf_sse.c
index b238b542e7..066d6c0b7b 100644
--- a/src/mesa/pipe/draw/draw_vf_sse.c
+++ b/src/mesa/pipe/draw/draw_vf_sse.c
@@ -343,8 +343,6 @@ static boolean build_vertex_emit( struct x86_program *p )
    struct x86_reg countEBP = x86_make_reg(file_REG32, reg_BP);
    struct x86_reg vfESI = x86_make_reg(file_REG32, reg_SI);
    struct x86_reg temp = x86_make_reg(file_XMM, 0);
-   struct x86_reg vp0 = x86_make_reg(file_XMM, 1);
-   struct x86_reg vp1 = x86_make_reg(file_XMM, 2);
    uint8_t *fixup, *label;
 
    /* Push a few regs?
@@ -371,7 +369,6 @@ static boolean build_vertex_emit( struct x86_program *p )
    
    /* always load, needed or not:
     */
-   sse_movups(&p->func, p->chan0, x86_make_disp(vfESI, get_offset(vf, &vf->chan_scale[0])));
    sse_movups(&p->func, p->identity, x86_make_disp(vfESI, get_offset(vf, &vf->identity[0])));
 
    /* Note address for loop jump */
-- 
cgit v1.2.3


From f3d0882c0218612a91a6feac91d23b34f6447d8e Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Tue, 29 Jan 2008 09:34:09 +0900
Subject: gallium: Remove direct dependencies to mesa internals.

_mesa_exec_free is still being called. More invasive refactoring is necessary to clean it out.
---
 src/mesa/pipe/draw/draw_vf.c         | 20 ++++++++------
 src/mesa/pipe/draw/draw_vf.h         | 52 +++++-------------------------------
 src/mesa/pipe/draw/draw_vf_generic.c |  9 ++++---
 src/mesa/pipe/draw/draw_vf_sse.c     | 14 +++++-----
 4 files changed, 31 insertions(+), 64 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/draw/draw_vf.c b/src/mesa/pipe/draw/draw_vf.c
index deedfc7bc7..d36f6293b1 100644
--- a/src/mesa/pipe/draw/draw_vf.c
+++ b/src/mesa/pipe/draw/draw_vf.c
@@ -25,17 +25,20 @@
  *    Keith Whitwell <keithw@tungstengraphics.com>
  */
 
-#include "glheader.h"
-#include "context.h"
-#include "colormac.h"
 
 #include "pipe/p_compiler.h"
+#include "pipe/p_util.h"
 
 #include "draw_vf.h"
 
+
 #define DBG 0
 
 
+/* TODO: remove this */
+extern void 
+_mesa_exec_free( void *addr );
+
 
 static boolean match_fastpath( struct draw_vertex_fetch *vf,
 				 const struct draw_vf_fastpath *fp)
@@ -88,7 +91,7 @@ void draw_vf_register_fastpath( struct draw_vertex_fetch *vf,
    fastpath->match_strides = match_strides;
    fastpath->func = vf->emit;
    fastpath->attr = (struct draw_vf_attr_type *)
-      _mesa_malloc(vf->attr_count * sizeof(fastpath->attr[0]));
+      MALLOC(vf->attr_count * sizeof(fastpath->attr[0]));
 
    for (i = 0; i < vf->attr_count; i++) {
       fastpath->attr[i].format = vf->attr[i].format;
@@ -156,7 +159,7 @@ unsigned draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
    unsigned offset = 0;
    unsigned i, j;
 
-   assert(nr < DRAW_VF_ATTRIB_MAX);
+   assert(nr < PIPE_ATTRIB_MAX);
 
    memset(vf->lookup, 0, sizeof(vf->lookup));
 
@@ -200,7 +203,7 @@ unsigned draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
 
 
-
+#if 0
 /* Set attribute pointers, adjusted for start position:
  */
 void draw_vf_set_sources( struct draw_vertex_fetch *vf,
@@ -223,6 +226,7 @@ void draw_vf_set_sources( struct draw_vertex_fetch *vf,
       a[j].inputptr = ((uint8_t *)vptr->data) + start * vptr->stride;
    }
 }
+#endif
 
 
 /* Set attribute pointers, adjusted for start position:
@@ -260,7 +264,7 @@ struct draw_vertex_fetch *draw_vf_create( void )
    struct draw_vertex_fetch *vf = CALLOC_STRUCT(draw_vertex_fetch);
    unsigned i;
 
-   for (i = 0; i < DRAW_VF_ATTRIB_MAX; i++)
+   for (i = 0; i < PIPE_ATTRIB_MAX; i++)
       vf->attr[i].vf = vf;
 
    vf->identity[0] = 0.0;
@@ -271,7 +275,7 @@ struct draw_vertex_fetch *draw_vf_create( void )
    vf->codegen_emit = NULL;
 
 #ifdef USE_SSE_ASM
-   if (!_mesa_getenv("MESA_NO_CODEGEN"))
+   if (!GETENV("MESA_NO_CODEGEN"))
       vf->codegen_emit = draw_vf_generate_sse_emit;
 #endif
 
diff --git a/src/mesa/pipe/draw/draw_vf.h b/src/mesa/pipe/draw/draw_vf.h
index c6a8fe0d53..7d90f35b0f 100644
--- a/src/mesa/pipe/draw/draw_vf.h
+++ b/src/mesa/pipe/draw/draw_vf.h
@@ -29,44 +29,11 @@
 #define DRAW_VF_H
 
 
-#include "math/m_vector.h"
-
 #include "pipe/p_compiler.h"
-#include "draw_vertex.h"
+#include "pipe/p_state.h"
 
+#include "draw_vertex.h"
 
-enum {
-   DRAW_VF_ATTRIB_POS = 0,
-   DRAW_VF_ATTRIB_WEIGHT = 1,
-   DRAW_VF_ATTRIB_NORMAL = 2,
-   DRAW_VF_ATTRIB_COLOR0 = 3,
-   DRAW_VF_ATTRIB_COLOR1 = 4,
-   DRAW_VF_ATTRIB_FOG = 5,
-   DRAW_VF_ATTRIB_COLOR_INDEX = 6,
-   DRAW_VF_ATTRIB_EDGEFLAG = 7,
-   DRAW_VF_ATTRIB_TEX0 = 8,
-   DRAW_VF_ATTRIB_TEX1 = 9,
-   DRAW_VF_ATTRIB_TEX2 = 10,
-   DRAW_VF_ATTRIB_TEX3 = 11,
-   DRAW_VF_ATTRIB_TEX4 = 12,
-   DRAW_VF_ATTRIB_TEX5 = 13,
-   DRAW_VF_ATTRIB_TEX6 = 14,
-   DRAW_VF_ATTRIB_TEX7 = 15,
-   DRAW_VF_ATTRIB_VAR0 = 16,
-   DRAW_VF_ATTRIB_VAR1 = 17,
-   DRAW_VF_ATTRIB_VAR2 = 18,
-   DRAW_VF_ATTRIB_VAR3 = 19,
-   DRAW_VF_ATTRIB_VAR4 = 20,
-   DRAW_VF_ATTRIB_VAR5 = 21,
-   DRAW_VF_ATTRIB_VAR6 = 22,
-   DRAW_VF_ATTRIB_VAR7 = 23,
-   DRAW_VF_ATTRIB_POINTSIZE = 24,
-   DRAW_VF_ATTRIB_BFC0 = 25,
-   DRAW_VF_ATTRIB_BFC1 = 26,
-   DRAW_VF_ATTRIB_CLIP_POS = 27,
-   DRAW_VF_ATTRIB_VERTEX_HEADER = 28,
-   DRAW_VF_ATTRIB_MAX = 29
-};
 
 enum draw_vf_attr_format {
    DRAW_EMIT_1F,
@@ -101,10 +68,12 @@ draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
                                unsigned nr, 
                                unsigned vertex_stride );
 
+#if 0
 void 
 draw_vf_set_sources( struct draw_vertex_fetch *vf,
 		     GLvector4f * const attrib[],
-		     unsigned start ); 
+		     unsigned start );
+#endif
 
 void 
 draw_vf_set_data( struct draw_vertex_fetch *vf,
@@ -115,13 +84,6 @@ draw_vf_emit_vertices( struct draw_vertex_fetch *vf,
 		       unsigned count,
 		       void *dest );
 
-void 
-draw_vf_get_attr( struct draw_vertex_fetch *vf,
-		  const void *vertex,
-		  GLenum attr, 
-		  const float *dflt,
-		  float *dest );
-
 struct draw_vertex_fetch *
 draw_vf_create( void );
 
@@ -174,11 +136,11 @@ struct draw_vf_attr
 
 struct draw_vertex_fetch
 {
-   struct draw_vf_attr attr[DRAW_VF_ATTRIB_MAX];
+   struct draw_vf_attr attr[PIPE_ATTRIB_MAX];
    unsigned attr_count;
    unsigned vertex_stride;
 
-   struct draw_vf_attr *lookup[DRAW_VF_ATTRIB_MAX];
+   struct draw_vf_attr *lookup[PIPE_ATTRIB_MAX];
    
    draw_vf_emit_func emit;
 
diff --git a/src/mesa/pipe/draw/draw_vf_generic.c b/src/mesa/pipe/draw/draw_vf_generic.c
index 343428d26c..a16eb456b7 100644
--- a/src/mesa/pipe/draw/draw_vf_generic.c
+++ b/src/mesa/pipe/draw/draw_vf_generic.c
@@ -26,12 +26,13 @@
  *    Keith Whitwell <keithw@tungstengraphics.com>
  */
 
-#include "glheader.h"
-#include "context.h"
-#include "colormac.h"
+
+#include <assert.h>
+
 #include "simple_list.h"
 
 #include "pipe/p_compiler.h"
+#include "pipe/p_util.h"
 
 #include "draw_vf.h"
 
@@ -94,7 +95,7 @@ static INLINE void insert_3f_xyw_4( const struct draw_vf_attr *a, uint8_t *v, co
 static INLINE void insert_3f_xyw_err( const struct draw_vf_attr *a, uint8_t *v, const float *in )
 {
    (void) a; (void) v; (void) in;
-   _mesa_exit(1);
+   assert(0);
 }
 
 static INLINE void insert_3f_3( const struct draw_vf_attr *a, uint8_t *v, const float *in )
diff --git a/src/mesa/pipe/draw/draw_vf_sse.c b/src/mesa/pipe/draw/draw_vf_sse.c
index 066d6c0b7b..4036ded1d8 100644
--- a/src/mesa/pipe/draw/draw_vf_sse.c
+++ b/src/mesa/pipe/draw/draw_vf_sse.c
@@ -25,15 +25,14 @@
  *    Keith Whitwell <keithw@tungstengraphics.com>
  */
 
-#include "glheader.h"
-#include "colormac.h"
+
 #include "simple_list.h"
-#include "enums.h"
 
 #include "pipe/p_compiler.h"
 
 #include "draw_vf.h"
 
+
 #if defined(USE_SSE_ASM)
 
 #include "x86/rtasm/x86sse.h"
@@ -450,7 +449,8 @@ static boolean build_vertex_emit( struct x86_program *p )
 	    update_src_ptr(p, srcECX, vfESI, a);
 	 }
 	 else {
-	    _mesa_printf("Can't emit 1ub %x %x %d\n", a->vertoffset, a[-1].vertoffset, a[-1].vertattrsize );
+	    fprintf(stderr, "Can't emit 1ub %x %x %d\n", 
+	            a->vertoffset, a[-1].vertoffset, a[-1].vertattrsize );
 	    return FALSE;
 	 }
 	 break;
@@ -495,7 +495,7 @@ static boolean build_vertex_emit( struct x86_program *p )
 	    j++;		/* NOTE: two attrs consumed */
 	 }
 	 else {
-	    _mesa_printf("Can't emit 3ub\n");
+	    fprintf(stderr, "Can't emit 3ub\n");
 	 }
 	 return FALSE;	/* add this later */
 	 break;
@@ -528,7 +528,7 @@ static boolean build_vertex_emit( struct x86_program *p )
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
       default:
-	 _mesa_printf("unknown a[%d].format %d\n", j, a->format);
+	 fprintf(stderr, "unknown a[%d].format %d\n", j, a->format);
 	 return FALSE;	/* catch any new opcodes */
       }
       
@@ -577,7 +577,7 @@ void draw_vf_generate_sse_emit( struct draw_vertex_fetch *vf )
       return;
    }
 
-   _mesa_memset(&p, 0, sizeof(p));
+   memset(&p, 0, sizeof(p));
 
    p.vf = vf;
    p.inputs_safe = 0;		/* for now */
-- 
cgit v1.2.3


From 93d727eea75812ecb21706804033a33d2e761eb8 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Tue, 29 Jan 2008 09:41:21 +0900
Subject: gallium: Use CALLOC for pb_buffer to ensure that all fields of
 pipe_buffer are initialized.

---
 src/mesa/pipe/pipebuffer/pb_buffer_malloc.c | 3 +--
 src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c     | 2 ++
 src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c   | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/pipebuffer/pb_buffer_malloc.c b/src/mesa/pipe/pipebuffer/pb_buffer_malloc.c
index fc83a00f36..2151f1d691 100644
--- a/src/mesa/pipe/pipebuffer/pb_buffer_malloc.c
+++ b/src/mesa/pipe/pipebuffer/pb_buffer_malloc.c
@@ -107,10 +107,9 @@ pb_malloc_buffer_create(size_t size,
 {
    struct malloc_buffer *buf;
    
-   /* TODO: accept an alignment parameter */
    /* TODO: do a single allocation */
    
-   buf = (struct malloc_buffer *)MALLOC(sizeof(struct malloc_buffer));
+   buf = CALLOC_STRUCT(malloc_buffer);
    if(!buf)
       return NULL;
    
diff --git a/src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c b/src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c
index 2694f57bca..a2657dac59 100644
--- a/src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c
+++ b/src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c
@@ -399,6 +399,8 @@ mm_buffer_destroy(struct pb_buffer *buf)
    struct mm_buffer *mm_buf = mm_buffer(buf);
    struct mm_pb_manager *mm = mm_buf->mgr;
    
+   assert(buf->base.refcount == 0);
+   
    _glthread_LOCK_MUTEX(mm->mutex);
    mmFreeMem(mm_buf->block);
    FREE(buf);
diff --git a/src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c b/src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c
index 7c29954112..f80c7e34c0 100644
--- a/src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c
+++ b/src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c
@@ -258,7 +258,7 @@ pool_bufmgr_create(struct pb_manager *provider,
    if(!pool->map)
       goto failure;
 
-   pool->bufs = (struct pool_buffer *) MALLOC(numBufs * sizeof(*pool->bufs));
+   pool->bufs = (struct pool_buffer *)CALLOC(numBufs, sizeof(*pool->bufs));
    if (!pool->bufs)
       goto failure;
 
-- 
cgit v1.2.3


From 1cc0b0dda7eaf6bdf891d6915b36e7b2ff41133c Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Tue, 29 Jan 2008 09:42:03 +0900
Subject: gallium: Use GALLIUM_ prefix for env vars.

---
 src/mesa/pipe/draw/draw_vf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/mesa/pipe/draw/draw_vf.c b/src/mesa/pipe/draw/draw_vf.c
index d36f6293b1..4fc2312ad1 100644
--- a/src/mesa/pipe/draw/draw_vf.c
+++ b/src/mesa/pipe/draw/draw_vf.c
@@ -275,7 +275,7 @@ struct draw_vertex_fetch *draw_vf_create( void )
    vf->codegen_emit = NULL;
 
 #ifdef USE_SSE_ASM
-   if (!GETENV("MESA_NO_CODEGEN"))
+   if (!GETENV("GALLIUM_NO_CODEGEN"))
       vf->codegen_emit = draw_vf_generate_sse_emit;
 #endif
 
-- 
cgit v1.2.3


From 043fc00a60377f8cd1878e0d0e5157dfb4567289 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 12:46:05 -0700
Subject: Cell: re-enable bounding boxes

The geometry bounding box is used to restrict rasterization to just those
tiles that are relevant.
Note another dummy field had to be added to the cell_command_render struct.
Apparently, every 4th word in a struct is susceptible to corruption in some
circumstances.  Might be a compiler bug.
---
 src/mesa/pipe/cell/common.h        |  2 +-
 src/mesa/pipe/cell/ppu/cell_vbuf.c |  4 ++++
 src/mesa/pipe/cell/spu/spu_main.c  | 30 +++++++++++++++++++-----------
 3 files changed, 24 insertions(+), 12 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index d6e1dd4f7d..5e32b209e6 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -122,7 +122,7 @@ struct cell_command_render
    uint dummy;        /* XXX this dummy field works around a compiler bug */
    uint num_indexes;
    uint vertex_buf;  /**< which cell->buffer[] contains the vertex data */
-   float xmin, ymin, xmax, ymax;
+   float xmin, dummy2, ymin, xmax, ymax;  /* XXX another dummy field */
    boolean inline_verts;
 } ALIGN16_ATTRIB;
 
diff --git a/src/mesa/pipe/cell/ppu/cell_vbuf.c b/src/mesa/pipe/cell/ppu/cell_vbuf.c
index b2a25d767b..9f737287ad 100644
--- a/src/mesa/pipe/cell/ppu/cell_vbuf.c
+++ b/src/mesa/pipe/cell/ppu/cell_vbuf.c
@@ -180,6 +180,10 @@ cell_vbuf_draw(struct vbuf_render *vbr,
       if (v[1] > ymax)
          ymax = v[1];
    }
+#if 0
+   printf("PPU Bounds %g, %g .. %g, %g\n", xmin, ymin, xmax, ymax);
+   fflush(stdout);
+#endif
 
    if (cvbr->prim != PIPE_PRIM_TRIANGLES)
       return; /* only render tris for now */
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 62f6a357ba..c2b05ed5a2 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -200,7 +200,7 @@ tile_bounding_box(const struct cell_command_render *render,
                   uint *txmin, uint *tymin,
                   uint *box_num_tiles, uint *box_width_tiles)
 {
-#if 1
+#if 0
    /* Debug: full-window bounding box */
    uint txmax = spu.fb.width_tiles - 1;
    uint tymax = spu.fb.height_tiles - 1;
@@ -223,13 +223,24 @@ tile_bounding_box(const struct cell_command_render *render,
    *box_num_tiles = *box_width_tiles * box_height_tiles;
 #endif
 #if 0
-   printf("Render bounds: %g, %g  ...  %g, %g\n",
+   printf("SPU %u: bounds: %g, %g  ...  %g, %g\n", spu.init.id,
           render->xmin, render->ymin, render->xmax, render->ymax);
-   printf("Render tiles:  %u, %u .. %u, %u\n", *txmin, *tymin, txmax, tymax);
+   printf("SPU %u: tiles:  %u, %u .. %u, %u\n",
+           spu.init.id, *txmin, *tymin, txmax, tymax);
+   ASSERT(render->xmin <= render->xmax);
+   ASSERT(render->ymin <= render->ymax);
 #endif
 }
 
 
+/** Check if the tile at (tx,ty) belongs to this SPU */
+static INLINE boolean
+my_tile(uint tx, uint ty)
+{
+   return (spu.fb.width_tiles * ty + tx) % spu.init.num_spus == spu.init.id;
+}
+
+
 /**
  * Render primitives
  * \param pos_incr  returns value indicating how may words to skip after
@@ -295,15 +306,9 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
     ** find tiles which intersect the prim bounding box
     **/
    uint txmin, tymin, box_width_tiles, box_num_tiles;
-#if 0
    tile_bounding_box(render, &txmin, &tymin,
                      &box_num_tiles, &box_width_tiles);
-#else
-   txmin = 0;
-   tymin = 0;
-   box_num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
-   box_width_tiles = spu.fb.width_tiles;
-#endif
+
 
    /* make sure any pending clears have completed */
    wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */
@@ -312,13 +317,16 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
    /**
     ** loop over tiles, rendering tris
     **/
-   for (i = spu.init.id; i < box_num_tiles; i += spu.init.num_spus) {
+   for (i = 0; i < box_num_tiles; i++) {
       const uint tx = txmin + i % box_width_tiles;
       const uint ty = tymin + i / box_width_tiles;
 
       ASSERT(tx < spu.fb.width_tiles);
       ASSERT(ty < spu.fb.height_tiles);
 
+      if (!my_tile(tx, ty))
+         continue;
+
       /* Start fetching color/z tiles.  We'll wait for completion when
        * we need read/write to them later in triangle rasterization.
        */
-- 
cgit v1.2.3


From 41899c70a72cd6206acec6c4c41953fea17d4ecf Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 13:02:11 -0700
Subject: Cell: emit state in cell_clear_surface() if dirty.

Without this a program that does nothing but glClear() doesn't work.  We need
the framebuffer state.
---
 src/mesa/pipe/cell/ppu/cell_clear.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/ppu/cell_clear.c b/src/mesa/pipe/cell/ppu/cell_clear.c
index e61bfd9b0f..07b908eec5 100644
--- a/src/mesa/pipe/cell/ppu/cell_clear.c
+++ b/src/mesa/pipe/cell/ppu/cell_clear.c
@@ -50,6 +50,10 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
    struct cell_context *cell = cell_context(pipe);
    uint surfIndex;
 
+   if (cell->dirty)
+      cell_update_derived(cell);
+
+
    if (!cell->cbuf_map[0])
       cell->cbuf_map[0] = pipe_surface_map(ps);
 
-- 
cgit v1.2.3


From c2372cc7481bf3985a6a3126952ab9d5dab4bf77 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 17:22:12 -0700
Subject: Cell: initial texture cache/sampling code

---
 src/mesa/pipe/cell/spu/spu_texture.c | 139 +++++++++++++++++++++++++++++++++++
 src/mesa/pipe/cell/spu/spu_texture.h |  43 +++++++++++
 2 files changed, 182 insertions(+)
 create mode 100644 src/mesa/pipe/cell/spu/spu_texture.c
 create mode 100644 src/mesa/pipe/cell/spu/spu_texture.h

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
new file mode 100644
index 0000000000..6d566a5006
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -0,0 +1,139 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "pipe/p_compiler.h"
+#include "spu_main.h"
+#include "spu_texture.h"
+#include "spu_tile.h"
+
+
+/**
+ * Number of texture tiles to cache.
+ * Note that this will probably be the largest consumer of SPU local store/
+ * memory for this driver!
+ */
+#define CACHE_SIZE 16
+
+static tile_t tex_tiles[CACHE_SIZE]  ALIGN16_ATTRIB;
+
+static int tex_tile_x[CACHE_SIZE], tex_tile_y[CACHE_SIZE];
+
+
+
+/**
+ * Mark all tex cache entries as invalid.
+ */
+void
+invalidate_tex_cache(void)
+{
+   /* XXX memset? */
+   uint i;
+   for (i = 0; i < CACHE_SIZE; i++)
+      tex_tile_x[i] = tex_tile_y[i] = -1;
+}
+
+
+/**
+ * Return the cache pos/index which corresponds to texel (i,j)
+ */
+static INLINE uint
+cache_pos(uint i, uint j)
+{
+   uint tx = i / TILE_SIZE;
+   uint ty = j / TILE_SIZE;
+   uint pos = (tx + ty * 4) % CACHE_SIZE;
+   return pos;
+}
+
+
+/**
+ * Make sure the tile for texel (i,j) is present, return its position/index
+ * in the cache.
+ */
+static uint
+get_tex_tile(uint i, uint j)
+{
+   const int tx = i / TILE_SIZE;
+   const int ty = j / TILE_SIZE;
+   const uint pos = cache_pos(i, j);
+
+   if (tex_tile_x[pos] != tx || tex_tile_y[pos] != ty) {
+      /* texture cache miss, fetch tile from main memory */
+      const uint tiles_per_row = spu.texture.width / TILE_SIZE;
+      const uint bytes_per_tile = sizeof(tile_t);
+      const void *src = (const ubyte *) spu.texture.start
+         + (ty * tiles_per_row + tx) * bytes_per_tile;
+
+      printf("SPU %u: tex cache miss at %d, %d  pos=%u  old=%d,%d\n",
+             spu.init.id, tx, ty, pos,
+             tex_tile_x[pos], tex_tile_y[pos]);
+#if 0
+      printf("SPU %u: get tex tile from %p to %p\n",
+             spu.init.id, src, tex_tiles[pos].t32);
+#endif
+
+      ASSERT_ALIGN16(tex_tiles[pos].t32);
+      ASSERT_ALIGN16(src);
+
+      mfc_get(tex_tiles[pos].t32,  /* dest */
+              (unsigned int) src,
+              bytes_per_tile,      /* size */
+              TAG_TEXTURE_TILE,
+              0, /* tid */
+              0  /* rid */);
+
+      wait_on_mask(1 << TAG_TEXTURE_TILE);
+
+      tex_tile_x[pos] = tx;
+      tex_tile_y[pos] = ty;
+   }
+   else {
+#if 0
+      printf("SPU %u: tex cache HIT at %d, %d\n",
+             spu.init.id, tx, ty);
+#endif
+   }
+
+   return pos;
+}
+
+
+/**
+ * Return the 32-bit texel at texcoord[0], texcoord[1] in the current texture
+ * (spu.texture), fetched via get_tex_tile().  XXX extremely primitive for now.
+ */
+uint
+sample_texture(const float *texcoord)
+{
+   /* wrap/repeat; NOTE(review): negative coords reach the (uint) cast as-is */
+   uint i = (uint) (texcoord[0] * spu.texture.width) % spu.texture.width;
+   uint j = (uint) (texcoord[1] * spu.texture.height) % spu.texture.height;
+   uint pos = get_tex_tile(i, j);
+   uint texel = tex_tiles[pos].t32[j % TILE_SIZE][i % TILE_SIZE];
+   return texel;
+}
diff --git a/src/mesa/pipe/cell/spu/spu_texture.h b/src/mesa/pipe/cell/spu/spu_texture.h
new file mode 100644
index 0000000000..b75b7ac44f
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_texture.h
@@ -0,0 +1,43 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SPU_TEXTURE_H
+#define SPU_TEXTURE_H
+
+
+#include "pipe/p_compiler.h"
+
+
+extern void
+invalidate_tex_cache(void);
+
+
+extern uint
+sample_texture(const float *texcoord);
+
+
+#endif /* SPU_TEXTURE_H */
-- 
cgit v1.2.3


From 425f270fcbfdbfce98adaf9da4b8eb7360f34447 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 17:23:44 -0700
Subject: Cell: basic texture mapping

Texture images are tiled in PPU code.  SPUs use a texture cache for getting
texels from textures.
This is very rough code, but demos/texcyl.c works.
---
 src/mesa/pipe/cell/common.h                 | 10 +++-
 src/mesa/pipe/cell/ppu/cell_context.h       |  5 +-
 src/mesa/pipe/cell/ppu/cell_state_emit.c    | 12 +++-
 src/mesa/pipe/cell/ppu/cell_state_sampler.c | 10 +++-
 src/mesa/pipe/cell/ppu/cell_texture.c       | 87 +++++++++++++++++++++++++++++
 src/mesa/pipe/cell/ppu/cell_texture.h       |  6 ++
 src/mesa/pipe/cell/spu/Makefile             |  1 +
 src/mesa/pipe/cell/spu/spu_main.c           | 17 ++++++
 src/mesa/pipe/cell/spu/spu_main.h           |  3 +
 src/mesa/pipe/cell/spu/spu_tri.c            | 60 ++++++++++++--------
 10 files changed, 183 insertions(+), 28 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index 5e32b209e6..f0d48ff403 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -79,7 +79,8 @@
 #define CELL_CMD_STATE_FRAMEBUFFER   10
 #define CELL_CMD_STATE_DEPTH_STENCIL 11
 #define CELL_CMD_STATE_SAMPLER       12
-#define CELL_CMD_STATE_VERTEX_INFO   13
+#define CELL_CMD_STATE_TEXTURE       13
+#define CELL_CMD_STATE_VERTEX_INFO   14
 
 
 #define CELL_NUM_BUFFERS 4
@@ -134,6 +135,13 @@ struct cell_command_release_verts
 };
 
 
+struct cell_command_texture
+{
+   void *start;         /**< Address in main memory */
+   uint width, height;
+};
+
+
 /** XXX unions don't seem to work */
 struct cell_command
 {
diff --git a/src/mesa/pipe/cell/ppu/cell_context.h b/src/mesa/pipe/cell/ppu/cell_context.h
index de65fb5e9a..7d234f3e45 100644
--- a/src/mesa/pipe/cell/ppu/cell_context.h
+++ b/src/mesa/pipe/cell/ppu/cell_context.h
@@ -76,7 +76,7 @@ struct cell_context
    struct pipe_framebuffer_state framebuffer;
    struct pipe_poly_stipple poly_stipple;
    struct pipe_scissor_state scissor;
-   struct pipe_texture *texture[PIPE_MAX_SAMPLERS];
+   struct cell_texture *texture[PIPE_MAX_SAMPLERS];
    struct pipe_viewport_state viewport;
    struct pipe_vertex_buffer vertex_buffer[PIPE_ATTRIB_MAX];
    struct pipe_vertex_element vertex_element[PIPE_ATTRIB_MAX];
@@ -84,6 +84,9 @@ struct cell_context
    ubyte *cbuf_map[PIPE_MAX_COLOR_BUFS];
    ubyte *zsbuf_map;
 
+   struct pipe_surface *tex_surf;
+   uint *tex_map;
+
    uint dirty;
 
    /** The primitive drawing context */
diff --git a/src/mesa/pipe/cell/ppu/cell_state_emit.c b/src/mesa/pipe/cell/ppu/cell_state_emit.c
index 6776ec88c7..391ff454ac 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_emit.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_emit.c
@@ -30,7 +30,7 @@
 #include "cell_state.h"
 #include "cell_state_emit.h"
 #include "cell_batch.h"
-
+#include "cell_texture.h"
 
 
 static void
@@ -72,6 +72,16 @@ cell_emit_state(struct cell_context *cell)
                      cell->sampler[0], sizeof(struct pipe_sampler_state));
    }
 
+   if (cell->dirty & CELL_NEW_TEXTURE) {
+      struct cell_command_texture texture;
+      texture.start = cell->texture[0]->tiled_data;
+      texture.width = cell->texture[0]->base.width[0];
+      texture.height = cell->texture[0]->base.height[0];
+
+      emit_state_cmd(cell, CELL_CMD_STATE_TEXTURE,
+                     &texture, sizeof(struct cell_command_texture));
+   }
+
    if (cell->dirty & CELL_NEW_VERTEX_INFO) {
       emit_state_cmd(cell, CELL_CMD_STATE_VERTEX_INFO,
                      &cell->vertex_info, sizeof(struct vertex_info));
diff --git a/src/mesa/pipe/cell/ppu/cell_state_sampler.c b/src/mesa/pipe/cell/ppu/cell_state_sampler.c
index ae1eeb4620..317f7603bb 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_sampler.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_sampler.c
@@ -30,12 +30,10 @@
  */
 
 #include "pipe/p_util.h"
+#include "pipe/draw/draw_context.h"
 #include "cell_context.h"
 #include "cell_state.h"
-#if 0
 #include "cell_texture.h"
-#include "cell_tile_cache.h"
-#endif
 
 
 void *
@@ -53,6 +51,8 @@ cell_bind_sampler_state(struct pipe_context *pipe,
 {
    struct cell_context *cell = cell_context(pipe);
 
+   draw_flush(cell->draw);
+
    assert(unit < PIPE_MAX_SAMPLERS);
    cell->sampler[unit] = (struct pipe_sampler_state *)sampler;
 
@@ -76,7 +76,11 @@ cell_set_sampler_texture(struct pipe_context *pipe,
 {
    struct cell_context *cell = cell_context(pipe);
 
+   draw_flush(cell->draw);
+
    cell->texture[sampler] = texture;
 
+   cell_update_texture_mapping(cell);
+
    cell->dirty |= CELL_NEW_TEXTURE;
 }
diff --git a/src/mesa/pipe/cell/ppu/cell_texture.c b/src/mesa/pipe/cell/ppu/cell_texture.c
index 0a8190d983..acbe4c79f0 100644
--- a/src/mesa/pipe/cell/ppu/cell_texture.c
+++ b/src/mesa/pipe/cell/ppu/cell_texture.c
@@ -163,3 +163,90 @@ cell_get_tex_surface(struct pipe_context *pipe,
    }
    return ps;
 }
+
+
+
+static void
+tile_copy_data(uint w, uint h, uint tile_size, uint *dst, const uint *src)
+{
+   const uint tile_size2 = tile_size * tile_size;
+   const uint h_t = h / tile_size, w_t = w / tile_size;
+   /* src is a linear w x h image of 32-bit texels; dst receives whole tiles. */
+   uint it, jt;  /* tile row / tile column counters */
+   uint i, j;    /* row / column counters within a tile */
+   /* Tiles are written row-major; texels inside each tile are row-major too. */
+   for (it = 0; it < h_t; it++) {
+      for (jt = 0; jt < w_t; jt++) {
+         /* fill in the tile at tile row it, tile column jt */
+         uint *tdst = dst + (it * w_t + jt) * tile_size2;
+         for (i = 0; i < tile_size; i++) {
+            for (j = 0; j < tile_size; j++) {
+               const uint srci = it * tile_size + i;
+               const uint srcj = jt * tile_size + j;
+               *tdst++ = src[srci * w + srcj];  /* row stride is w, not h */
+            }
+         }
+      }
+   }
+}
+
+
+
+/**
+ * Convert linear texture image data to tiled format for SPU usage.
+ */
+static void
+cell_tile_texture(struct cell_context *cell,
+                  struct cell_texture *texture)
+{
+   uint face = 0, level = 0, zslice = 0;
+   struct pipe_surface *surf;
+   const uint w = texture->base.width[0], h = texture->base.height[0];
+   const uint *src;
+
+   /* temporary restrictions: */
+   assert(w >= TILE_SIZE);
+   assert(h >= TILE_SIZE);
+   assert(w % TILE_SIZE == 0);
+   assert(h % TILE_SIZE == 0);
+
+   surf = cell_get_tex_surface(&cell->pipe, &texture->base, face, level, zslice);
+   ASSERT(surf);
+
+   src = (const uint *) pipe_surface_map(surf);
+
+   if (texture->tiled_data) {
+      align_free(texture->tiled_data);
+   }
+   texture->tiled_data = align_malloc(w * h * 4, 16);
+
+   tile_copy_data(w, h, TILE_SIZE, texture->tiled_data, src);
+
+   pipe_surface_unmap(surf);
+
+   pipe_surface_reference(&surf, NULL);
+}
+
+
+
+void
+cell_update_texture_mapping(struct cell_context *cell)
+{
+   uint face = 0, level = 0, zslice = 0;
+
+   cell_tile_texture(cell, cell->texture[0]);
+#if 0
+   if (cell->tex_surf && cell->tex_map) {
+      pipe_surface_unmap(cell->tex_surf);
+      cell->tex_map = NULL;
+   }
+
+   /* XXX free old surface */
+
+   cell->tex_surf = cell_get_tex_surface(&cell->pipe,
+                                         &cell->texture[0]->base,
+                                         face, level, zslice);
+
+   cell->tex_map = pipe_surface_map(cell->tex_surf);
+#endif
+}
diff --git a/src/mesa/pipe/cell/ppu/cell_texture.h b/src/mesa/pipe/cell/ppu/cell_texture.h
index ef5808c086..bd434c8776 100644
--- a/src/mesa/pipe/cell/ppu/cell_texture.h
+++ b/src/mesa/pipe/cell/ppu/cell_texture.h
@@ -46,6 +46,8 @@ struct cell_texture
     */
    struct pipe_buffer *buffer;
    unsigned long buffer_size;
+
+   void *tiled_data;  /* XXX this may be temporary */ /*ALIGN16*/
 };
 
 
@@ -70,4 +72,8 @@ cell_get_tex_surface(struct pipe_context *pipe,
                      unsigned face, unsigned level, unsigned zslice);
 
 
+extern void
+cell_update_texture_mapping(struct cell_context *cell);
+
+
 #endif /* CELL_TEXTURE */
diff --git a/src/mesa/pipe/cell/spu/Makefile b/src/mesa/pipe/cell/spu/Makefile
index 417ae1b072..011fdcefe3 100644
--- a/src/mesa/pipe/cell/spu/Makefile
+++ b/src/mesa/pipe/cell/spu/Makefile
@@ -17,6 +17,7 @@ PROG_SPU_EMBED_O = $(PROG)_spu-embed.o
 
 SOURCES = \
 	spu_main.c \
+	spu_texture.c \
 	spu_tile.c \
 	spu_tri.c
 
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index c2b05ed5a2..5a5b17dd89 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -34,6 +34,7 @@
 #include <spu_mfcio.h>
 
 #include "spu_main.h"
+#include "spu_texture.h"
 #include "spu_tri.h"
 #include "spu_tile.h"
 #include "pipe/cell/common.h"
@@ -446,6 +447,17 @@ cmd_state_sampler(const struct pipe_sampler_state *state)
 }
 
 
+static void
+cmd_state_texture(const struct cell_command_texture *texture)
+{
+   if (Debug)
+      printf("SPU %u: TEXTURE at %p  size %u x %u\n",
+             spu.init.id, texture->start, texture->width, texture->height);
+
+   memcpy(&spu.texture, texture, sizeof(*texture));
+}
+
+
 static void
 cmd_state_vertex_info(const struct vertex_info *vinfo)
 {
@@ -561,6 +573,10 @@ cmd_batch(uint opcode)
          cmd_state_sampler((struct pipe_sampler_state *) &buffer[pos+1]);
          pos += (1 + sizeof(struct pipe_sampler_state) / 4);
          break;
+      case CELL_CMD_STATE_TEXTURE:
+         cmd_state_texture((struct cell_command_texture *) &buffer[pos+1]);
+         pos += (1 + sizeof(struct cell_command_texture) / 4);
+         break;
       case CELL_CMD_STATE_VERTEX_INFO:
          cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]);
          pos += (1 + sizeof(struct vertex_info) / 4);
@@ -656,6 +672,7 @@ one_time_init(void)
 {
    memset(tile_status, TILE_STATUS_DEFINED, sizeof(tile_status));
    memset(tile_status_z, TILE_STATUS_DEFINED, sizeof(tile_status_z));
+   invalidate_tex_cache();
 }
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index 5bc5d9fa99..480c54ebd0 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -60,6 +60,7 @@ struct spu_global
    struct pipe_depth_stencil_alpha_state depth_stencil;
    struct pipe_blend_state blend;
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
+   struct cell_command_texture texture;
 
    struct vertex_info vertex_info;
 
@@ -84,6 +85,8 @@ extern struct spu_global spu;
 #define TAG_INDEX_BUFFER      16
 #define TAG_BATCH_BUFFER      17
 #define TAG_MISC              18
+#define TAG_TEXTURE_TILE      19
+
 
 
 extern void
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 3d0d106c10..aad28f1036 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -33,6 +33,7 @@
 #include "pipe/p_format.h"
 #include "pipe/p_util.h"
 #include "spu_main.h"
+#include "spu_texture.h"
 #include "spu_tile.h"
 #include "spu_tri.h"
 
@@ -362,9 +363,24 @@ emit_quad( struct setup_stage *setup, int x, int y, unsigned mask )
    /* Cell: "write" quad fragments to the tile by setting prim color */
    const int ix = x - setup->cliprect_minx;
    const int iy = y - setup->cliprect_miny;
-   float colors[4][4];
-
-   eval_coeff(setup, 1, (float) x, (float) y, colors);
+   uint colors[4];  /* indexed by QUAD_x */
+
+   if (spu.texture.start) {
+      float texcoords[4][4];
+      uint i;
+      eval_coeff(setup, 2, (float) x, (float) y, texcoords);
+      for (i = 0; i < 4; i++) {
+         colors[i] = sample_texture(texcoords[i]);
+      }
+   }
+   else {
+      float fcolors[4][4];
+      eval_coeff(setup, 1, (float) x, (float) y, fcolors);
+      colors[QUAD_TOP_LEFT] = pack_color(fcolors[QUAD_TOP_LEFT]);
+      colors[QUAD_TOP_RIGHT] = pack_color(fcolors[QUAD_TOP_RIGHT]);
+      colors[QUAD_BOTTOM_LEFT] = pack_color(fcolors[QUAD_BOTTOM_LEFT]);
+      colors[QUAD_BOTTOM_RIGHT] = pack_color(fcolors[QUAD_BOTTOM_RIGHT]);
+   }
 
    if (spu.depth_stencil.depth.enabled) {
       mask &= do_depth_test(setup, x, y, mask);
@@ -382,13 +398,13 @@ emit_quad( struct setup_stage *setup, int x, int y, unsigned mask )
       tile_status[setup->ty][setup->tx] = TILE_STATUS_DIRTY;
 
       if (mask & MASK_TOP_LEFT)
-         ctile.t32[iy][ix] = pack_color(colors[QUAD_TOP_LEFT]);
+         ctile.t32[iy][ix] = colors[QUAD_TOP_LEFT];
       if (mask & MASK_TOP_RIGHT)
-         ctile.t32[iy][ix+1] = pack_color(colors[QUAD_TOP_RIGHT]);
+         ctile.t32[iy][ix+1] = colors[QUAD_TOP_RIGHT];
       if (mask & MASK_BOTTOM_LEFT)
-         ctile.t32[iy+1][ix] = pack_color(colors[QUAD_BOTTOM_LEFT]);
+         ctile.t32[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
       if (mask & MASK_BOTTOM_RIGHT)
-         ctile.t32[iy+1][ix+1] = pack_color(colors[QUAD_BOTTOM_RIGHT]);
+         ctile.t32[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
    }
 #endif
 }
@@ -606,7 +622,6 @@ static boolean setup_sort_vertices( struct setup_stage *setup,
 }
 
 
-#if 0
 /**
  * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
  * The value value comes from vertex->data[slot][i].
@@ -614,21 +629,20 @@ static boolean setup_sort_vertices( struct setup_stage *setup,
  * \param slot  which attribute slot 
  * \param i  which component of the slot (0..3)
  */
-static void const_coeff( struct setup_stage *setup,
-			 unsigned slot,
-			 unsigned i )
+static void const_coeff(struct setup_stage *setup, uint slot)
 {
-   assert(slot < PIPE_MAX_SHADER_INPUTS);
-   assert(i <= 3);
+   uint i;
+   ASSERT(slot < PIPE_MAX_SHADER_INPUTS);
 
-   setup->coef[slot].dadx[i] = 0;
-   setup->coef[slot].dady[i] = 0;
+   for (i = 0; i < 4; i++) {
+      setup->coef[slot].dadx[i] = 0;
+      setup->coef[slot].dady[i] = 0;
 
-   /* need provoking vertex info!
-    */
-   setup->coef[slot].a0[i] = setup->vprovoke->data[slot][i];
+      /* need provoking vertex info!
+       */
+      setup->coef[slot].a0[i] = setup->vprovoke->data[slot][i];
+   }
 }
-#endif
 
 
 /**
@@ -735,15 +749,17 @@ static void setup_tri_coefficients( struct setup_stage *setup )
       case INTERP_NONE:
          break;
       case INTERP_POS:
-         tri_linear_coeff(setup, i, 2, 3);  /* slot 0, z */
+         tri_linear_coeff(setup, i, 2, 3);
          /* XXX interp W if PERSPECTIVE... */
          break;
       case INTERP_CONSTANT:
-         /* fall-through */
+         const_coeff(setup, i);
+         break;
       case INTERP_LINEAR:
-         tri_linear_coeff(setup, i, 0, 4);  /* slot 1, color */
+         tri_linear_coeff(setup, i, 0, 4);
          break;
       case INTERP_PERSPECTIVE:
+         tri_linear_coeff(setup, i, 0, 4); /* XXX temporary */
          break;
       default:
          ASSERT(0);
-- 
cgit v1.2.3


From 25105276b38451439516928d188e07f2eb3e250e Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 17:32:23 -0700
Subject: Cell: minor optimization for flat shading

---
 src/mesa/pipe/cell/spu/spu_tri.c | 37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index aad28f1036..19a231d9c4 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -200,16 +200,35 @@ static INLINE void
 eval_coeff( struct setup_stage *setup, uint slot,
             float x, float y, float result[4][4])
 {
-   uint i;
-   const float *dadx = setup->coef[slot].dadx;
-   const float *dady = setup->coef[slot].dady;
+   switch (spu.vertex_info.interp_mode[slot]) {
+   case INTERP_CONSTANT:
+      {
+         uint i;
+         for (i = 0; i < 4; i++) {
+            result[QUAD_TOP_LEFT][i] =
+            result[QUAD_TOP_RIGHT][i] =
+            result[QUAD_BOTTOM_LEFT][i] =
+            result[QUAD_BOTTOM_RIGHT][i] = setup->coef[slot].a0[i];
+         }
+      }
+      break;
 
-   /* loop over XYZW comps */
-   for (i = 0; i < 4; i++) {
-      result[QUAD_TOP_LEFT][i] = setup->coef[slot].a0[i] + x * dadx[i] + y * dady[i];
-      result[QUAD_TOP_RIGHT][i] = result[0][i] + dadx[i];
-      result[QUAD_BOTTOM_LEFT][i] = result[0][i] + dady[i];
-      result[QUAD_BOTTOM_RIGHT][i] = result[0][i] + dadx[i] + dady[i];
+   case INTERP_LINEAR:
+      /* fall-through, for now */
+   default:
+      {
+         uint i;
+         const float *dadx = setup->coef[slot].dadx;
+         const float *dady = setup->coef[slot].dady;
+
+         /* loop over XYZW comps */
+         for (i = 0; i < 4; i++) {
+            result[QUAD_TOP_LEFT][i] = setup->coef[slot].a0[i] + x * dadx[i] + y * dady[i];
+            result[QUAD_TOP_RIGHT][i] = result[0][i] + dadx[i];
+            result[QUAD_BOTTOM_LEFT][i] = result[0][i] + dady[i];
+            result[QUAD_BOTTOM_RIGHT][i] = result[0][i] + dadx[i] + dady[i];
+         }
+      }
    }
 }
 
-- 
cgit v1.2.3


From e2406b47883d74933e74507af65695c8c7d7861a Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 18:03:45 -0700
Subject: Cell: compute min index referenced in draw command, use it to reduce
 size of vertex data payload

---
 src/mesa/pipe/cell/common.h        |  2 ++
 src/mesa/pipe/cell/ppu/cell_vbuf.c | 13 +++++++++++--
 src/mesa/pipe/cell/spu/spu_main.c  | 20 ++++++++++++++++----
 3 files changed, 29 insertions(+), 6 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index f0d48ff403..90aa46a534 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -124,6 +124,8 @@ struct cell_command_render
    uint num_indexes;
    uint vertex_buf;  /**< which cell->buffer[] contains the vertex data */
    float xmin, dummy2, ymin, xmax, ymax;  /* XXX another dummy field */
+   uint dummy3;
+   uint min_index;
    boolean inline_verts;
 } ALIGN16_ATTRIB;
 
diff --git a/src/mesa/pipe/cell/ppu/cell_vbuf.c b/src/mesa/pipe/cell/ppu/cell_vbuf.c
index 9f737287ad..e63b34cf52 100644
--- a/src/mesa/pipe/cell/ppu/cell_vbuf.c
+++ b/src/mesa/pipe/cell/ppu/cell_vbuf.c
@@ -138,16 +138,23 @@ cell_vbuf_draw(struct vbuf_render *vbr,
    struct cell_context *cell = cvbr->cell;
    float xmin, ymin, xmax, ymax;
    uint i;
-   uint nr_vertices = 0;
+   uint nr_vertices = 0, min_index = ~0;
    const void *vertices = cvbr->vertex_buffer;
    const uint vertex_size = cvbr->vertex_size;
 
    for (i = 0; i < nr_indices; i++) {
       if (indices[i] > nr_vertices)
          nr_vertices = indices[i];
+      if (indices[i] < min_index)
+         min_index = indices[i];
    }
    nr_vertices++;
 
+#if 0
+   /*if (min_index > 0)*/
+      printf("%s min_index = %u\n", __FUNCTION__, min_index);
+#endif
+
 #if 0
    printf("cell_vbuf_draw() nr_indices = %u nr_verts = %u\n",
           nr_indices, nr_vertices);
@@ -169,7 +176,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
    /* compute x/y bounding box */
    xmin = ymin = 1e50;
    xmax = ymax = -1e50;
-   for (i = 0; i < nr_vertices; i++) {
+   for (i = min_index; i < nr_vertices; i++) {
       const float *v = (float *) ((ubyte *) vertices + i * vertex_size);
       if (v[0] < xmin)
          xmin = v[0];
@@ -204,6 +211,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
       render->prim_type = cvbr->prim;
 
       render->num_indexes = nr_indices;
+      render->min_index = min_index;
 
       /* append indices after render command */
       memcpy(render + 1, indices, nr_indices * 2);
@@ -214,6 +222,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
       render->vertex_size = 4 * cell->vertex_info.size;
       render->num_verts = nr_vertices;
       if (ALLOW_INLINE_VERTS &&
+          min_index == 0 &&
           vertex_bytes <= cell_batch_free_space(cell)) {
          /* vertex data inlined, after indices */
          void *dst = cell_batch_alloc(cell, vertex_bytes);
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 5a5b17dd89..3c9efb4741 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -253,7 +253,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
    /* we'll DMA into these buffers */
    ubyte vertex_data[CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
    const uint vertex_size = render->vertex_size; /* in bytes */
-   const uint total_vertex_bytes = render->num_verts * vertex_size;
+   /*const*/ uint total_vertex_bytes = render->num_verts * vertex_size;
    const ubyte *vertices;
    const ushort *indexes;
    uint i, j;
@@ -289,9 +289,21 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
    }
    else {
       /* Begin DMA fetch of vertex buffer */
-      void *src = spu.init.buffers[render->vertex_buf];
-      mfc_get(vertex_data,  /* dest */
-              (unsigned int) src,
+      ubyte *src = spu.init.buffers[render->vertex_buf];
+      ubyte *dest = vertex_data;
+
+      /* skip vertex data we won't use */
+#if 01
+      src += render->min_index * vertex_size;
+      dest += render->min_index * vertex_size;
+      total_vertex_bytes -= render->min_index * vertex_size;
+#endif
+      ASSERT(total_vertex_bytes % 16 == 0);
+      ASSERT_ALIGN16(dest);
+      ASSERT_ALIGN16(src);
+
+      mfc_get(dest,   /* in vertex_data[] array */
+              (unsigned int) src,  /* src in main memory */
               total_vertex_bytes,  /* size */
               TAG_VERTEX_BUFFER,
               0, /* tid */
-- 
cgit v1.2.3


From 4bede9219be1f93844c5897216c6674b46a23a88 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 18:09:16 -0700
Subject: Cell: add a few null texture tests

---
 src/mesa/pipe/cell/ppu/cell_state_emit.c | 13 ++++++++++---
 src/mesa/pipe/cell/ppu/cell_texture.c    |  3 ++-
 2 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/ppu/cell_state_emit.c b/src/mesa/pipe/cell/ppu/cell_state_emit.c
index 391ff454ac..702184416b 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_emit.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_emit.c
@@ -74,9 +74,16 @@ cell_emit_state(struct cell_context *cell)
 
    if (cell->dirty & CELL_NEW_TEXTURE) {
       struct cell_command_texture texture;
-      texture.start = cell->texture[0]->tiled_data;
-      texture.width = cell->texture[0]->base.width[0];
-      texture.height = cell->texture[0]->base.height[0];
+      if (cell->texture[0]) {
+         texture.start = cell->texture[0]->tiled_data;
+         texture.width = cell->texture[0]->base.width[0];
+         texture.height = cell->texture[0]->base.height[0];
+      }
+      else {
+         texture.start = NULL;
+         texture.width = 0;
+         texture.height = 0;
+      }
 
       emit_state_cmd(cell, CELL_CMD_STATE_TEXTURE,
                      &texture, sizeof(struct cell_command_texture));
diff --git a/src/mesa/pipe/cell/ppu/cell_texture.c b/src/mesa/pipe/cell/ppu/cell_texture.c
index acbe4c79f0..2cf6022939 100644
--- a/src/mesa/pipe/cell/ppu/cell_texture.c
+++ b/src/mesa/pipe/cell/ppu/cell_texture.c
@@ -234,7 +234,8 @@ cell_update_texture_mapping(struct cell_context *cell)
 {
    uint face = 0, level = 0, zslice = 0;
 
-   cell_tile_texture(cell, cell->texture[0]);
+   if (cell->texture[0])
+      cell_tile_texture(cell, cell->texture[0]);
 #if 0
    if (cell->tex_surf && cell->tex_map) {
       pipe_surface_unmap(cell->tex_surf);
-- 
cgit v1.2.3


From 64935c875128d2d1254b6b39ced72b9848d477fe Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 28 Jan 2008 18:17:30 -0700
Subject: Cell: move cmd_render() into new spu_render.c file

---
 src/mesa/pipe/cell/spu/Makefile     |   1 +
 src/mesa/pipe/cell/spu/spu_main.c   | 206 +------------------------------
 src/mesa/pipe/cell/spu/spu_main.h   |   1 +
 src/mesa/pipe/cell/spu/spu_render.c | 240 ++++++++++++++++++++++++++++++++++++
 src/mesa/pipe/cell/spu/spu_render.h |  38 ++++++
 5 files changed, 283 insertions(+), 203 deletions(-)
 create mode 100644 src/mesa/pipe/cell/spu/spu_render.c
 create mode 100644 src/mesa/pipe/cell/spu/spu_render.h

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/Makefile b/src/mesa/pipe/cell/spu/Makefile
index 011fdcefe3..d5b30e1f27 100644
--- a/src/mesa/pipe/cell/spu/Makefile
+++ b/src/mesa/pipe/cell/spu/Makefile
@@ -17,6 +17,7 @@ PROG_SPU_EMBED_O = $(PROG)_spu-embed.o
 
 SOURCES = \
 	spu_main.c \
+	spu_render.c \
 	spu_texture.c \
 	spu_tile.c \
 	spu_tri.c
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 3c9efb4741..6e02f2c964 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -34,8 +34,8 @@
 #include <spu_mfcio.h>
 
 #include "spu_main.h"
+#include "spu_render.h"
 #include "spu_texture.h"
-#include "spu_tri.h"
 #include "spu_tile.h"
 #include "pipe/cell/common.h"
 #include "pipe/p_defines.h"
@@ -47,7 +47,7 @@ helpful headers:
 /opt/ibm/cell-sdk/prototype/sysroot/usr/include/libmisc.h
 */
 
-static boolean Debug = FALSE;
+boolean Debug = FALSE;
 
 struct spu_global spu;
 
@@ -61,7 +61,7 @@ wait_on_mask(unsigned tagMask)
 }
 
 
-static void
+static INLINE void
 wait_on_mask_all(unsigned tagMask)
 {
    mfc_write_tag_mask( tagMask );
@@ -192,206 +192,6 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear)
 }
 
 
-/**
- * Given a rendering command's bounding box (in pixels) compute the
- * location of the corresponding screen tile bounding box.
- */
-static INLINE void
-tile_bounding_box(const struct cell_command_render *render,
-                  uint *txmin, uint *tymin,
-                  uint *box_num_tiles, uint *box_width_tiles)
-{
-#if 0
-   /* Debug: full-window bounding box */
-   uint txmax = spu.fb.width_tiles - 1;
-   uint tymax = spu.fb.height_tiles - 1;
-   *txmin = 0;
-   *tymin = 0;
-   *box_num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
-   *box_width_tiles = spu.fb.width_tiles;
-   (void) render;
-   (void) txmax;
-   (void) tymax;
-#else
-   uint txmax, tymax, box_height_tiles;
-
-   *txmin = (uint) render->xmin / TILE_SIZE;
-   *tymin = (uint) render->ymin / TILE_SIZE;
-   txmax = (uint) render->xmax / TILE_SIZE;
-   tymax = (uint) render->ymax / TILE_SIZE;
-   *box_width_tiles = txmax - *txmin + 1;
-   box_height_tiles = tymax - *tymin + 1;
-   *box_num_tiles = *box_width_tiles * box_height_tiles;
-#endif
-#if 0
-   printf("SPU %u: bounds: %g, %g  ...  %g, %g\n", spu.init.id,
-          render->xmin, render->ymin, render->xmax, render->ymax);
-   printf("SPU %u: tiles:  %u, %u .. %u, %u\n",
-           spu.init.id, *txmin, *tymin, txmax, tymax);
-   ASSERT(render->xmin <= render->xmax);
-   ASSERT(render->ymin <= render->ymax);
-#endif
-}
-
-
-/** Check if the tile at (tx,ty) belongs to this SPU */
-static INLINE boolean
-my_tile(uint tx, uint ty)
-{
-   return (spu.fb.width_tiles * ty + tx) % spu.init.num_spus == spu.init.id;
-}
-
-
-/**
- * Render primitives
- * \param pos_incr  returns value indicating how may words to skip after
- *                  this command in the batch buffer
- */
-static void
-cmd_render(const struct cell_command_render *render, uint *pos_incr)
-{
-   /* we'll DMA into these buffers */
-   ubyte vertex_data[CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
-   const uint vertex_size = render->vertex_size; /* in bytes */
-   /*const*/ uint total_vertex_bytes = render->num_verts * vertex_size;
-   const ubyte *vertices;
-   const ushort *indexes;
-   uint i, j;
-
-
-   if (Debug) {
-      printf("SPU %u: RENDER prim %u, num_vert=%u  num_ind=%u  "
-             "inline_vert=%u\n",
-             spu.init.id,
-             render->prim_type,
-             render->num_verts,
-             render->num_indexes,
-             render->inline_verts);
-
-      /*
-      printf("       bound: %g, %g .. %g, %g\n",
-             render->xmin, render->ymin, render->xmax, render->ymax);
-      */
-   }
-
-   ASSERT(sizeof(*render) % 4 == 0);
-   ASSERT(total_vertex_bytes % 16 == 0);
-
-   /* indexes are right after the render command in the batch buffer */
-   indexes = (const ushort *) (render + 1);
-   *pos_incr = (render->num_indexes * 2 + 3) / 4;
-
-
-   if (render->inline_verts) {
-      /* Vertices are right after indexes in batch buffer */
-      vertices = (const ubyte *) (render + 1) + *pos_incr * 4;
-      *pos_incr = *pos_incr + total_vertex_bytes / 4;
-   }
-   else {
-      /* Begin DMA fetch of vertex buffer */
-      ubyte *src = spu.init.buffers[render->vertex_buf];
-      ubyte *dest = vertex_data;
-
-      /* skip vertex data we won't use */
-#if 01
-      src += render->min_index * vertex_size;
-      dest += render->min_index * vertex_size;
-      total_vertex_bytes -= render->min_index * vertex_size;
-#endif
-      ASSERT(total_vertex_bytes % 16 == 0);
-      ASSERT_ALIGN16(dest);
-      ASSERT_ALIGN16(src);
-
-      mfc_get(dest,   /* in vertex_data[] array */
-              (unsigned int) src,  /* src in main memory */
-              total_vertex_bytes,  /* size */
-              TAG_VERTEX_BUFFER,
-              0, /* tid */
-              0  /* rid */);
-
-      vertices = vertex_data;
-
-      wait_on_mask(1 << TAG_VERTEX_BUFFER);
-   }
-
-
-   /**
-    ** find tiles which intersect the prim bounding box
-    **/
-   uint txmin, tymin, box_width_tiles, box_num_tiles;
-   tile_bounding_box(render, &txmin, &tymin,
-                     &box_num_tiles, &box_width_tiles);
-
-
-   /* make sure any pending clears have completed */
-   wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */
-
-
-   /**
-    ** loop over tiles, rendering tris
-    **/
-   for (i = 0; i < box_num_tiles; i++) {
-      const uint tx = txmin + i % box_width_tiles;
-      const uint ty = tymin + i / box_width_tiles;
-
-      ASSERT(tx < spu.fb.width_tiles);
-      ASSERT(ty < spu.fb.height_tiles);
-
-      if (!my_tile(tx, ty))
-         continue;
-
-      /* Start fetching color/z tiles.  We'll wait for completion when
-       * we need read/write to them later in triangle rasterization.
-       */
-      if (spu.depth_stencil.depth.enabled) {
-         if (tile_status_z[ty][tx] != TILE_STATUS_CLEAR) {
-            get_tile(tx, ty, &ztile, TAG_READ_TILE_Z, 1);
-         }
-      }
-
-      if (tile_status[ty][tx] != TILE_STATUS_CLEAR) {
-         get_tile(tx, ty, &ctile, TAG_READ_TILE_COLOR, 0);
-      }
-
-      ASSERT(render->prim_type == PIPE_PRIM_TRIANGLES);
-      ASSERT(render->num_indexes % 3 == 0);
-
-      /* loop over tris */
-      for (j = 0; j < render->num_indexes; j += 3) {
-         const float *v0, *v1, *v2;
-
-         v0 = (const float *) (vertices + indexes[j+0] * vertex_size);
-         v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
-         v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
-
-         tri_draw(v0, v1, v2, tx, ty);
-      }
-
-      /* write color/z tiles back to main framebuffer, if dirtied */
-      if (tile_status[ty][tx] == TILE_STATUS_DIRTY) {
-         put_tile(tx, ty, &ctile, TAG_WRITE_TILE_COLOR, 0);
-         tile_status[ty][tx] = TILE_STATUS_DEFINED;
-      }
-      if (spu.depth_stencil.depth.enabled) {
-         if (tile_status_z[ty][tx] == TILE_STATUS_DIRTY) {
-            put_tile(tx, ty, &ztile, TAG_WRITE_TILE_Z, 1);
-            tile_status_z[ty][tx] = TILE_STATUS_DEFINED;
-         }
-      }
-
-      /* XXX move these... */
-      wait_on_mask(1 << TAG_WRITE_TILE_COLOR);
-      if (spu.depth_stencil.depth.enabled) {
-         wait_on_mask(1 << TAG_WRITE_TILE_Z);
-      }
-   }
-
-   if (Debug)
-      printf("SPU %u: RENDER done\n",
-             spu.init.id);
-}
-
-
 static void
 cmd_release_verts(const struct cell_command_release_verts *release)
 {
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index 480c54ebd0..009e046ba5 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -70,6 +70,7 @@ struct spu_global
 
 
 extern struct spu_global spu;
+extern boolean Debug;
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_render.c b/src/mesa/pipe/cell/spu/spu_render.c
new file mode 100644
index 0000000000..21a286a23d
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_render.c
@@ -0,0 +1,240 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include <stdio.h>
+#include <libmisc.h>
+#include <spu_mfcio.h>
+
+#include "spu_main.h"
+#include "spu_render.h"
+#include "spu_tri.h"
+#include "spu_tile.h"
+#include "pipe/cell/common.h"
+
+
+
+/**
+ * Given a rendering command's bounding box (in pixels) compute the
+ * location of the corresponding screen tile bounding box.
+ */
+static INLINE void
+tile_bounding_box(const struct cell_command_render *render,
+                  uint *txmin, uint *tymin,
+                  uint *box_num_tiles, uint *box_width_tiles)
+{
+#if 0
+   /* Debug: full-window bounding box */
+   uint txmax = spu.fb.width_tiles - 1;
+   uint tymax = spu.fb.height_tiles - 1;
+   *txmin = 0;
+   *tymin = 0;
+   *box_num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
+   *box_width_tiles = spu.fb.width_tiles;
+   (void) render;
+   (void) txmax;
+   (void) tymax;
+#else
+   uint txmax, tymax, box_height_tiles;
+
+   *txmin = (uint) render->xmin / TILE_SIZE;
+   *tymin = (uint) render->ymin / TILE_SIZE;
+   txmax = (uint) render->xmax / TILE_SIZE;
+   tymax = (uint) render->ymax / TILE_SIZE;
+   *box_width_tiles = txmax - *txmin + 1;
+   box_height_tiles = tymax - *tymin + 1;
+   *box_num_tiles = *box_width_tiles * box_height_tiles;
+#endif
+#if 0
+   printf("SPU %u: bounds: %g, %g  ...  %g, %g\n", spu.init.id,
+          render->xmin, render->ymin, render->xmax, render->ymax);
+   printf("SPU %u: tiles:  %u, %u .. %u, %u\n",
+           spu.init.id, *txmin, *tymin, txmax, tymax);
+   ASSERT(render->xmin <= render->xmax);
+   ASSERT(render->ymin <= render->ymax);
+#endif
+}
+
+
+/** Check if the tile at (tx,ty) belongs to this SPU */
+static INLINE boolean
+my_tile(uint tx, uint ty)
+{
+   return (spu.fb.width_tiles * ty + tx) % spu.init.num_spus == spu.init.id;
+}
+
+
+/**
+ * Render primitives
+ * \param pos_incr  returns value indicating how many words to skip after
+ *                  this command in the batch buffer
+ */
+void
+cmd_render(const struct cell_command_render *render, uint *pos_incr)
+{
+   /* we'll DMA into these buffers */
+   ubyte vertex_data[CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
+   const uint vertex_size = render->vertex_size; /* in bytes */
+   /*const*/ uint total_vertex_bytes = render->num_verts * vertex_size;
+   const ubyte *vertices;
+   const ushort *indexes;
+   uint i, j;
+
+
+   if (Debug) {
+      printf("SPU %u: RENDER prim %u, num_vert=%u  num_ind=%u  "
+             "inline_vert=%u\n",
+             spu.init.id,
+             render->prim_type,
+             render->num_verts,
+             render->num_indexes,
+             render->inline_verts);
+
+      /*
+      printf("       bound: %g, %g .. %g, %g\n",
+             render->xmin, render->ymin, render->xmax, render->ymax);
+      */
+   }
+
+   ASSERT(sizeof(*render) % 4 == 0);
+   ASSERT(total_vertex_bytes % 16 == 0);
+
+   /* indexes are right after the render command in the batch buffer */
+   indexes = (const ushort *) (render + 1);
+   *pos_incr = (render->num_indexes * 2 + 3) / 4;
+
+
+   if (render->inline_verts) {
+      /* Vertices are right after indexes in batch buffer */
+      vertices = (const ubyte *) (render + 1) + *pos_incr * 4;
+      *pos_incr = *pos_incr + total_vertex_bytes / 4;
+   }
+   else {
+      /* Begin DMA fetch of vertex buffer */
+      ubyte *src = spu.init.buffers[render->vertex_buf];
+      ubyte *dest = vertex_data;
+
+      /* skip vertex data we won't use */
+#if 01
+      src += render->min_index * vertex_size;
+      dest += render->min_index * vertex_size;
+      total_vertex_bytes -= render->min_index * vertex_size;
+#endif
+      ASSERT(total_vertex_bytes % 16 == 0);
+      ASSERT_ALIGN16(dest);
+      ASSERT_ALIGN16(src);
+
+      mfc_get(dest,   /* in vertex_data[] array */
+              (unsigned int) src,  /* src in main memory */
+              total_vertex_bytes,  /* size */
+              TAG_VERTEX_BUFFER,
+              0, /* tid */
+              0  /* rid */);
+
+      vertices = vertex_data;
+
+      wait_on_mask(1 << TAG_VERTEX_BUFFER);
+   }
+
+
+   /**
+    ** find tiles which intersect the prim bounding box
+    **/
+   uint txmin, tymin, box_width_tiles, box_num_tiles;
+   tile_bounding_box(render, &txmin, &tymin,
+                     &box_num_tiles, &box_width_tiles);
+
+
+   /* make sure any pending clears have completed */
+   wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */
+
+
+   /**
+    ** loop over tiles, rendering tris
+    **/
+   for (i = 0; i < box_num_tiles; i++) {
+      const uint tx = txmin + i % box_width_tiles;
+      const uint ty = tymin + i / box_width_tiles;
+
+      ASSERT(tx < spu.fb.width_tiles);
+      ASSERT(ty < spu.fb.height_tiles);
+
+      if (!my_tile(tx, ty))
+         continue;
+
+      /* Start fetching color/z tiles.  We'll wait for completion when
+       * we need to read/write them later in triangle rasterization.
+       */
+      if (spu.depth_stencil.depth.enabled) {
+         if (tile_status_z[ty][tx] != TILE_STATUS_CLEAR) {
+            get_tile(tx, ty, &ztile, TAG_READ_TILE_Z, 1);
+         }
+      }
+
+      if (tile_status[ty][tx] != TILE_STATUS_CLEAR) {
+         get_tile(tx, ty, &ctile, TAG_READ_TILE_COLOR, 0);
+      }
+
+      ASSERT(render->prim_type == PIPE_PRIM_TRIANGLES);
+      ASSERT(render->num_indexes % 3 == 0);
+
+      /* loop over tris */
+      for (j = 0; j < render->num_indexes; j += 3) {
+         const float *v0, *v1, *v2;
+
+         v0 = (const float *) (vertices + indexes[j+0] * vertex_size);
+         v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
+         v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
+
+         tri_draw(v0, v1, v2, tx, ty);
+      }
+
+      /* write color/z tiles back to main framebuffer, if dirtied */
+      if (tile_status[ty][tx] == TILE_STATUS_DIRTY) {
+         put_tile(tx, ty, &ctile, TAG_WRITE_TILE_COLOR, 0);
+         tile_status[ty][tx] = TILE_STATUS_DEFINED;
+      }
+      if (spu.depth_stencil.depth.enabled) {
+         if (tile_status_z[ty][tx] == TILE_STATUS_DIRTY) {
+            put_tile(tx, ty, &ztile, TAG_WRITE_TILE_Z, 1);
+            tile_status_z[ty][tx] = TILE_STATUS_DEFINED;
+         }
+      }
+
+      /* XXX move these... */
+      wait_on_mask(1 << TAG_WRITE_TILE_COLOR);
+      if (spu.depth_stencil.depth.enabled) {
+         wait_on_mask(1 << TAG_WRITE_TILE_Z);
+      }
+   }
+
+   if (Debug)
+      printf("SPU %u: RENDER done\n",
+             spu.init.id);
+}
+
+
diff --git a/src/mesa/pipe/cell/spu/spu_render.h b/src/mesa/pipe/cell/spu/spu_render.h
new file mode 100644
index 0000000000..fbcdc5ec31
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_render.h
@@ -0,0 +1,38 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef SPU_RENDER_H
+#define SPU_RENDER_H
+
+#include "pipe/cell/common.h"
+
+extern void
+cmd_render(const struct cell_command_render *render, uint *pos_incr);
+
+#endif /* SPU_RENDER_H */
+
-- 
cgit v1.2.3


From 3d3f7cf06e4ac25aeb604703cb8113db9fd2f8eb Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Tue, 29 Jan 2008 11:21:29 +0900
Subject: gallium: Add extern keyword to global.

---
 src/mesa/pipe/draw/draw_vf.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/mesa/pipe/draw/draw_vf.h b/src/mesa/pipe/draw/draw_vf.h
index 7d90f35b0f..c0fa063c52 100644
--- a/src/mesa/pipe/draw/draw_vf.h
+++ b/src/mesa/pipe/draw/draw_vf.h
@@ -195,7 +195,8 @@ struct draw_vf_format_info {
    const unsigned attrsize;
 };
 
-const struct draw_vf_format_info draw_vf_format_info[DRAW_EMIT_MAX];
+extern const struct draw_vf_format_info 
+draw_vf_format_info[DRAW_EMIT_MAX];
 
 
 #endif
-- 
cgit v1.2.3


From 6f75de06ff2ea899b43b94236dbfbfaee12ba88c Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Tue, 29 Jan 2008 18:01:17 +0900
Subject: gallium: Allow draw_vf usage to be controlled at runtime.

---
 src/mesa/pipe/draw/draw_vbuf.c | 149 ++++++++++++++++++++---------------------
 1 file changed, 71 insertions(+), 78 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/draw/draw_vbuf.c b/src/mesa/pipe/draw/draw_vbuf.c
index 8ca225c65a..2309ed9f12 100644
--- a/src/mesa/pipe/draw/draw_vbuf.c
+++ b/src/mesa/pipe/draw/draw_vbuf.c
@@ -127,15 +127,9 @@ emit_vertex( struct vbuf_stage *vbuf,
              struct vertex_header *vertex )
 {
 #if 0
-   const struct vertex_info *vinfo = vbuf->vinfo;
-
-   uint i;
-   uint count = 0;  /* for debug/sanity */
-   
-   assert(vinfo == vbuf->render->get_vertex_info(vbuf->render));
-
-//   fprintf(stderr, "emit vertex %d to %p\n", 
-//           vbuf->nr_vertices, vbuf->vertex_ptr);
+   fprintf(stderr, "emit vertex %d to %p\n", 
+           vbuf->nr_vertices, vbuf->vertex_ptr);
+#endif
 
    if(vertex->vertex_id != UNDEFINED_VERTEX_ID) {
       if(vertex->vertex_id < vbuf->nr_vertices)
@@ -148,75 +142,72 @@ emit_vertex( struct vbuf_stage *vbuf,
       
    vertex->vertex_id = vbuf->nr_vertices++;
 
-   for (i = 0; i < vinfo->num_attribs; i++) {
-      uint j = vinfo->src_index[i];
-      switch (vinfo->emit[i]) {
-      case EMIT_OMIT:
-         /* no-op */
-         break;
-      case EMIT_ALL:
-         /* just copy the whole vertex as-is to the vbuf */
-         assert(i == 0);
-         assert(j == 0);
-         memcpy(vbuf->vertex_ptr, vertex, vinfo->size * 4);
-         vbuf->vertex_ptr += vinfo->size;
-         count += vinfo->size;
-         break;
-      case EMIT_1F:
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
-         count++;
-         break;
-      case EMIT_1F_PSIZE:
-         *vbuf->vertex_ptr++ = fui(vbuf->stage.draw->rasterizer->point_size);
-         count++;
-         break;
-      case EMIT_2F:
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
-         count += 2;
-         break;
-      case EMIT_3F:
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][2]);
-         count += 3;
-         break;
-      case EMIT_4F:
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][2]);
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][3]);
-         count += 4;
-         break;
-      case EMIT_4UB:
-	 *vbuf->vertex_ptr++ = pack_ub4(float_to_ubyte( vertex->data[j][2] ),
-                                        float_to_ubyte( vertex->data[j][1] ),
-                                        float_to_ubyte( vertex->data[j][0] ),
-                                        float_to_ubyte( vertex->data[j][3] ));
-         count += 1;
-         break;
-      default:
-         assert(0);
+   if(!vbuf->vf) {
+      const struct vertex_info *vinfo = vbuf->vinfo;
+      uint i;
+      uint count = 0;  /* for debug/sanity */
+      
+      assert(vinfo == vbuf->render->get_vertex_info(vbuf->render));
+
+      for (i = 0; i < vinfo->num_attribs; i++) {
+         uint j = vinfo->src_index[i];
+         switch (vinfo->emit[i]) {
+         case EMIT_OMIT:
+            /* no-op */
+            break;
+         case EMIT_ALL:
+            /* just copy the whole vertex as-is to the vbuf */
+            assert(i == 0);
+            assert(j == 0);
+            memcpy(vbuf->vertex_ptr, vertex, vinfo->size * 4);
+            vbuf->vertex_ptr += vinfo->size;
+            count += vinfo->size;
+            break;
+         case EMIT_1F:
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
+            count++;
+            break;
+         case EMIT_1F_PSIZE:
+            *vbuf->vertex_ptr++ = fui(vbuf->stage.draw->rasterizer->point_size);
+            count++;
+            break;
+         case EMIT_2F:
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
+            count += 2;
+            break;
+         case EMIT_3F:
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][2]);
+            count += 3;
+            break;
+         case EMIT_4F:
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][2]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][3]);
+            count += 4;
+            break;
+         case EMIT_4UB:
+   	 *vbuf->vertex_ptr++ = pack_ub4(float_to_ubyte( vertex->data[j][2] ),
+                                           float_to_ubyte( vertex->data[j][1] ),
+                                           float_to_ubyte( vertex->data[j][0] ),
+                                           float_to_ubyte( vertex->data[j][3] ));
+            count += 1;
+            break;
+         default:
+            assert(0);
+         }
       }
+      assert(count == vinfo->size);
    }
-   assert(count == vinfo->size);
-#else
-   if(vertex->vertex_id != UNDEFINED_VERTEX_ID) {
-      if(vertex->vertex_id < vbuf->nr_vertices)
-	 return;
-      else
-	 fprintf(stderr, "Bad vertex id 0x%04x (>= 0x%04x)\n", 
-	         vertex->vertex_id, vbuf->nr_vertices);
-      return;
+   else {
+      draw_vf_set_data(vbuf->vf, vertex->data);
+      draw_vf_emit_vertices(vbuf->vf, 1, vbuf->vertex_ptr);
+   
+      vbuf->vertex_ptr += vbuf->vertex_size/4;
    }
-      
-   vertex->vertex_id = vbuf->nr_vertices++;
-
-   draw_vf_set_data(vbuf->vf, vertex->data);
-   draw_vf_emit_vertices(vbuf->vf, 1, vbuf->vertex_ptr);
-
-   vbuf->vertex_ptr += vbuf->vertex_size/4;
-#endif
 }
 
 
@@ -229,6 +220,9 @@ vbuf_set_vf_attributes(struct vbuf_stage *vbuf )
    uint count = 0;  /* for debug/sanity */
    unsigned nr_attrs = 0;
    
+   if(!vbuf->vf)
+      return;
+   
 //   fprintf(stderr, "emit vertex %d to %p\n", 
 //           vbuf->nr_vertices, vbuf->vertex_ptr);
 
@@ -625,9 +619,8 @@ struct draw_stage *draw_vbuf_stage( struct draw_context *draw,
 
    vbuf->prim = ~0;
    
-   vbuf->vf = draw_vf_create();
-   if(!vbuf->vf)
-      vbuf_destroy(&vbuf->stage);
+   if(!GETENV("GALLIUM_NOVF"))
+      vbuf->vf = draw_vf_create();
    
    return &vbuf->stage;
 }
-- 
cgit v1.2.3


From 5022344c656c0e004222a0a77c98838e8ae0a1ac Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Tue, 29 Jan 2008 20:46:48 +0900
Subject: gallium: Emit constants.

---
 src/mesa/pipe/draw/draw_vf.c         | 10 +++++++--
 src/mesa/pipe/draw/draw_vf.h         | 22 ++++++++++++++++++-
 src/mesa/pipe/draw/draw_vf_generic.c | 42 +++++++++++++++++++++++++-----------
 src/mesa/pipe/draw/draw_vf_sse.c     |  4 ++++
 4 files changed, 62 insertions(+), 16 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/draw/draw_vf.c b/src/mesa/pipe/draw/draw_vf.c
index 4fc2312ad1..958d31933b 100644
--- a/src/mesa/pipe/draw/draw_vf.c
+++ b/src/mesa/pipe/draw/draw_vf.c
@@ -182,6 +182,9 @@ unsigned draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
 	 vf->attr[j].insert = draw_vf_format_info[format].insert;
 	 vf->attr[j].vertattrsize = draw_vf_format_info[format].attrsize;
 	 vf->attr[j].vertoffset = offset;
+	 vf->attr[j].isconst = draw_vf_format_info[format].isconst;
+	 if(vf->attr[j].isconst)
+	    memcpy(vf->attr[j].data, &map[i].data, vf->attr[j].vertattrsize);
 	 
 	 if (DBG)
 	    _mesa_printf("%d: %s, offset %d\n", i,  
@@ -240,8 +243,11 @@ void draw_vf_set_data( struct draw_vertex_fetch *vf,
    for (j = 0; j < vf->attr_count; j++) {
       a[j].inputstride = 0; /* XXX: one-vertex-max ATM */ 
       a[j].inputsize = 4;
-      a[j].do_insert = a[j].insert[4 - 1]; 
-      a[j].inputptr = (uint8_t *)&data[a[j].attrib][0];
+      a[j].do_insert = a[j].insert[4 - 1];
+      if(a[j].isconst)
+	 a[j].inputptr = a[j].data;
+      else
+	 a[j].inputptr = (uint8_t *)&data[a[j].attrib][0];
    }
 }
 
diff --git a/src/mesa/pipe/draw/draw_vf.h b/src/mesa/pipe/draw/draw_vf.h
index c0fa063c52..911ea07bdf 100644
--- a/src/mesa/pipe/draw/draw_vf.h
+++ b/src/mesa/pipe/draw/draw_vf.h
@@ -48,14 +48,30 @@ enum draw_vf_attr_format {
    DRAW_EMIT_4UB_4F_BGRA,		/**< for color */
    DRAW_EMIT_4UB_4F_ARGB,		/**< for color */
    DRAW_EMIT_4UB_4F_ABGR,		/**< for color */
+   DRAW_EMIT_1F_CONST,
+   DRAW_EMIT_2F_CONST,
+   DRAW_EMIT_3F_CONST,
+   DRAW_EMIT_4F_CONST,
    DRAW_EMIT_PAD,			/**< leave a hole of 'offset' bytes */
    DRAW_EMIT_MAX
 };
 
-struct draw_vf_attr_map {
+struct draw_vf_attr_map 
+{
+   /** Input attribute number */
    unsigned attrib;
+   
    enum draw_vf_attr_format format;
+   
    unsigned offset;
+   
+   /** 
+    * Constant data for DRAW_EMIT_*_CONST 
+    */
+   union {
+      uint8_t ub[4];
+      float f[4];
+   } data;
 };
 
 struct draw_vertex_fetch;
@@ -124,6 +140,9 @@ struct draw_vf_attr
    unsigned inputsize;
    unsigned inputstride;
    unsigned vertoffset;      /**< position of the attrib in the vertex struct */
+   
+   boolean isconst;              /**< read from const data below */
+   uint8_t data[16];
 
    unsigned attrib;          /**< which vertex attrib (0=position, etc) */
    unsigned vertattrsize;    /**< size of the attribute in bytes */
@@ -193,6 +212,7 @@ struct draw_vf_format_info {
    const char *name;
    draw_vf_insert_func insert[4];
    const unsigned attrsize;
+   const boolean isconst;
 };
 
 extern const struct draw_vf_format_info 
diff --git a/src/mesa/pipe/draw/draw_vf_generic.c b/src/mesa/pipe/draw/draw_vf_generic.c
index a16eb456b7..0caa798396 100644
--- a/src/mesa/pipe/draw/draw_vf_generic.c
+++ b/src/mesa/pipe/draw/draw_vf_generic.c
@@ -387,62 +387,78 @@ const struct draw_vf_format_info draw_vf_format_info[DRAW_EMIT_MAX] =
 {
    { "1f",
      { insert_1f_1, insert_1f_1, insert_1f_1, insert_1f_1 },
-     sizeof(float) },
+     sizeof(float), FALSE },
 
    { "2f",
      { insert_2f_1, insert_2f_2, insert_2f_2, insert_2f_2 },
-     2 * sizeof(float) },
+     2 * sizeof(float), FALSE },
 
    { "3f",
      { insert_3f_1, insert_3f_2, insert_3f_3, insert_3f_3 },
-     3 * sizeof(float) },
+     3 * sizeof(float), FALSE },
 
    { "4f",
      { insert_4f_1, insert_4f_2, insert_4f_3, insert_4f_4 },
-     4 * sizeof(float) },
+     4 * sizeof(float), FALSE },
 
    { "3f_xyw",
      { insert_3f_xyw_err, insert_3f_xyw_err, insert_3f_xyw_err, 
        insert_3f_xyw_4 },
-     3 * sizeof(float) },
+     3 * sizeof(float), FALSE },
 
    { "1ub_1f",
      { insert_1ub_1f_1, insert_1ub_1f_1, insert_1ub_1f_1, insert_1ub_1f_1 },
-     sizeof(uint8_t) },
+     sizeof(uint8_t), FALSE },
 
    { "3ub_3f_rgb",
      { insert_3ub_3f_rgb_1, insert_3ub_3f_rgb_2, insert_3ub_3f_rgb_3,
        insert_3ub_3f_rgb_3 },
-     3 * sizeof(uint8_t) },
+     3 * sizeof(uint8_t), FALSE },
 
    { "3ub_3f_bgr",
      { insert_3ub_3f_bgr_1, insert_3ub_3f_bgr_2, insert_3ub_3f_bgr_3,
        insert_3ub_3f_bgr_3 },
-     3 * sizeof(uint8_t) },
+     3 * sizeof(uint8_t), FALSE },
 
    { "4ub_4f_rgba",
      { insert_4ub_4f_rgba_1, insert_4ub_4f_rgba_2, insert_4ub_4f_rgba_3, 
        insert_4ub_4f_rgba_4 },
-     4 * sizeof(uint8_t) },
+     4 * sizeof(uint8_t), FALSE },
 
    { "4ub_4f_bgra",
      { insert_4ub_4f_bgra_1, insert_4ub_4f_bgra_2, insert_4ub_4f_bgra_3,
        insert_4ub_4f_bgra_4 },
-     4 * sizeof(uint8_t) },
+     4 * sizeof(uint8_t), FALSE },
 
    { "4ub_4f_argb",
      { insert_4ub_4f_argb_1, insert_4ub_4f_argb_2, insert_4ub_4f_argb_3,
        insert_4ub_4f_argb_4 },
-     4 * sizeof(uint8_t) },
+     4 * sizeof(uint8_t), FALSE },
 
    { "4ub_4f_abgr",
      { insert_4ub_4f_abgr_1, insert_4ub_4f_abgr_2, insert_4ub_4f_abgr_3,
        insert_4ub_4f_abgr_4 },
-     4 * sizeof(uint8_t) },
+     4 * sizeof(uint8_t), FALSE },
+
+   { "1f_const",
+     { insert_1f_1, insert_1f_1, insert_1f_1, insert_1f_1 },
+     sizeof(float), TRUE },
+   
+   { "2f_const",
+     { insert_2f_1, insert_2f_2, insert_2f_2, insert_2f_2 },
+     2 * sizeof(float), TRUE },
+   
+   { "3f_const",
+     { insert_3f_1, insert_3f_2, insert_3f_3, insert_3f_3 },
+     3 * sizeof(float), TRUE },
+   
+   { "4f_const",
+     { insert_4f_1, insert_4f_2, insert_4f_3, insert_4f_4 },
+     4 * sizeof(float), TRUE },
 
    { "pad",
      { NULL, NULL, NULL, NULL },
-     0 }
+     0, FALSE },
 
 };
 
diff --git a/src/mesa/pipe/draw/draw_vf_sse.c b/src/mesa/pipe/draw/draw_vf_sse.c
index 4036ded1d8..1389e6cfb9 100644
--- a/src/mesa/pipe/draw/draw_vf_sse.c
+++ b/src/mesa/pipe/draw/draw_vf_sse.c
@@ -388,18 +388,21 @@ static boolean build_vertex_emit( struct x86_program *p )
        */
       switch (a->format) {
       case DRAW_EMIT_1F:
+      case DRAW_EMIT_1F_CONST:
 	 get_src_ptr(p, srcECX, vfESI, a);
 	 emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
 	 emit_store(p, dest, 1, temp);
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
       case DRAW_EMIT_2F:
+      case DRAW_EMIT_2F_CONST:
 	 get_src_ptr(p, srcECX, vfESI, a);
 	 emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
 	 emit_store(p, dest, 2, temp);
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
       case DRAW_EMIT_3F:
+      case DRAW_EMIT_3F_CONST:
 	 /* Potentially the worst case - hardcode 2+1 copying:
 	  */
 	 if (0) {
@@ -423,6 +426,7 @@ static boolean build_vertex_emit( struct x86_program *p )
 	 }
 	 break;
       case DRAW_EMIT_4F:
+      case DRAW_EMIT_4F_CONST:
 	 get_src_ptr(p, srcECX, vfESI, a);
 	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
 	 emit_store(p, dest, 4, temp);
-- 
cgit v1.2.3


From f94425b316b57ad19ce067a449b20ebee50064f9 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Tue, 29 Jan 2008 20:47:30 +0900
Subject: gallium: Emit point size as a constant.

---
 src/mesa/pipe/draw/draw_vbuf.c | 85 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 81 insertions(+), 4 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/draw/draw_vbuf.c b/src/mesa/pipe/draw/draw_vbuf.c
index 2309ed9f12..92a8b9fbcf 100644
--- a/src/mesa/pipe/draw/draw_vbuf.c
+++ b/src/mesa/pipe/draw/draw_vbuf.c
@@ -115,6 +115,70 @@ check_space( struct vbuf_stage *vbuf, unsigned nr )
 }
 
 
+#if 0
+/** Debug helper: print one emitted hardware vertex to stderr, one line per
+ * attribute, decoding the bytes at 'data' according to vinfo's emit list. */
+static INLINE void
+dump_emitted_vertex(const struct vertex_info *vinfo, const uint8_t *data)
+{
+   unsigned i, j, k;
+
+   for (i = 0; i < vinfo->num_attribs; i++) {
+      j = vinfo->src_index[i];
+      switch (vinfo->emit[i]) {
+      case EMIT_OMIT:
+         fprintf(stderr, "EMIT_OMIT:");
+         break;
+      case EMIT_ALL:
+         /* whole vertex is dumped as vinfo->size*4 raw bytes, in hex */
+         assert(i == 0 && j == 0);
+         fprintf(stderr, "EMIT_ALL:\t");
+         for(k = 0; k < vinfo->size*4; ++k)
+            fprintf(stderr, "%02x ", *data++);
+         break;
+      case EMIT_1F:
+         fprintf(stderr, "EMIT_1F:\t");
+         fprintf(stderr, "%f ", *(const float *)data); data += sizeof(float);
+         break;
+      case EMIT_1F_PSIZE:
+         fprintf(stderr, "EMIT_1F_PSIZE:\t");
+         fprintf(stderr, "%f ", *(const float *)data); data += sizeof(float);
+         break;
+      case EMIT_2F:
+         fprintf(stderr, "EMIT_2F:\t");
+         fprintf(stderr, "%f ", *(const float *)data); data += sizeof(float);
+         fprintf(stderr, "%f ", *(const float *)data); data += sizeof(float);
+         break;
+      case EMIT_3F:   /* exactly three floats, as emit_vertex() writes them */
+         fprintf(stderr, "EMIT_3F:\t");
+         fprintf(stderr, "%f ", *(const float *)data); data += sizeof(float);
+         fprintf(stderr, "%f ", *(const float *)data); data += sizeof(float);
+         fprintf(stderr, "%f ", *(const float *)data); data += sizeof(float);
+         break;
+      case EMIT_4F:
+         fprintf(stderr, "EMIT_4F:\t");
+         fprintf(stderr, "%f ", *(const float *)data); data += sizeof(float);
+         fprintf(stderr, "%f ", *(const float *)data); data += sizeof(float);
+         fprintf(stderr, "%f ", *(const float *)data); data += sizeof(float);
+         fprintf(stderr, "%f ", *(const float *)data); data += sizeof(float);
+         break;
+      case EMIT_4UB:
+         fprintf(stderr, "EMIT_4UB:\t");
+         fprintf(stderr, "%u ", *data++);
+         fprintf(stderr, "%u ", *data++);
+         fprintf(stderr, "%u ", *data++);
+         fprintf(stderr, "%u ", *data++);
+         break;
+      default:
+         assert(0);
+      }
+      fprintf(stderr, "\n");
+   }
+   fprintf(stderr, "\n");
+}
+#endif
+
+
 /**
  * Extract the needed fields from post-transformed vertex and emit
  * a hardware(driver) vertex.
@@ -190,7 +254,7 @@ emit_vertex( struct vbuf_stage *vbuf,
             count += 4;
             break;
          case EMIT_4UB:
-   	 *vbuf->vertex_ptr++ = pack_ub4(float_to_ubyte( vertex->data[j][2] ),
+            *vbuf->vertex_ptr++ = pack_ub4(float_to_ubyte( vertex->data[j][2] ),
                                            float_to_ubyte( vertex->data[j][1] ),
                                            float_to_ubyte( vertex->data[j][0] ),
                                            float_to_ubyte( vertex->data[j][3] ));
@@ -201,6 +265,20 @@ emit_vertex( struct vbuf_stage *vbuf,
          }
       }
       assert(count == vinfo->size);
+#if 0
+      {
+	 static float data[256]; 
+	 draw_vf_set_data(vbuf->vf, vertex->data);
+	 draw_vf_emit_vertices(vbuf->vf, 1, data);
+	 if(memcmp((uint8_t *)vbuf->vertex_ptr - vbuf->vertex_size, data, vbuf->vertex_size)) {
+            fprintf(stderr, "With VF:\n");
+            dump_emitted_vertex(vbuf->vinfo, (uint8_t *)data);
+	    fprintf(stderr, "Without VF:\n");
+	    dump_emitted_vertex(vbuf->vinfo, (uint8_t *)vbuf->vertex_ptr - vbuf->vertex_size);
+	    assert(0);
+	 }
+      }
+#endif
    }
    else {
       draw_vf_set_data(vbuf->vf, vertex->data);
@@ -297,11 +375,10 @@ vbuf_set_vf_attributes(struct vbuf_stage *vbuf )
          count++;
          break;
       case EMIT_1F_PSIZE:
-	 /* FIXME */
-	 assert(0);
 	 attrs[nr_attrs].attrib = j;
-	 attrs[nr_attrs].format = DRAW_EMIT_PAD;
+	 attrs[nr_attrs].format = DRAW_EMIT_1F_CONST;
 	 attrs[nr_attrs].offset = 0;
+	 attrs[nr_attrs].data.f[0] = vbuf->stage.draw->rasterizer->point_size;
 	 nr_attrs++;
          count++;
          break;
-- 
cgit v1.2.3


From c185c55aec8c6d0e47a2d7b84acf7d063acfce61 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Tue, 29 Jan 2008 12:37:07 +0000
Subject: gallium: don't rely on assert(0) for error handling - may be disabled

---
 src/mesa/state_tracker/st_draw.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'src')

diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index 8ef50ee768..c9b8e78485 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -298,6 +298,7 @@ st_draw_vbo(GLcontext *ctx,
          break;
       default:
          assert(0);
+	 return;
       }
 
       /* get/create the index buffer object */
@@ -570,6 +571,7 @@ st_feedback_draw_vbo(GLcontext *ctx,
          break;
       default:
          assert(0);
+	 return;
       }
 
       map = pipe->winsys->buffer_map(pipe->winsys,
-- 
cgit v1.2.3


From c81848210e744650724a63fbf5a5795fa4e019c5 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Tue, 29 Jan 2008 12:37:47 +0000
Subject: gallium: streamline various unfilled & stippled paths

---
 src/mesa/pipe/draw/draw_prim.c | 158 +++++++++++++++++++++++++++++------------
 1 file changed, 113 insertions(+), 45 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/draw/draw_prim.c b/src/mesa/pipe/draw/draw_prim.c
index 2a612a1673..41b3fddcc1 100644
--- a/src/mesa/pipe/draw/draw_prim.c
+++ b/src/mesa/pipe/draw/draw_prim.c
@@ -69,28 +69,46 @@ static void draw_prim_queue_flush( struct draw_context *draw )
     * draw->pipeline->first is often changed by the first call to tri(),
     * line(), etc.
     */
-   switch (draw->reduced_prim) {
-   case RP_TRI:
-      for (i = 0; i < draw->pq.queue_nr; i++) {
-	 if (draw->pq.queue[i].reset_line_stipple)
-	    draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
-
-	 draw->pipeline.first->tri( draw->pipeline.first, &draw->pq.queue[i] );
+   if (draw->rasterizer->line_stipple_enable) {
+      switch (draw->reduced_prim) {
+      case RP_TRI:
+	 for (i = 0; i < draw->pq.queue_nr; i++) {
+	    if (draw->pq.queue[i].reset_line_stipple)
+	       draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
+	    
+	    draw->pipeline.first->tri( draw->pipeline.first, &draw->pq.queue[i] );
+	 }
+	 break;
+      case RP_LINE:
+	 for (i = 0; i < draw->pq.queue_nr; i++) {
+	    if (draw->pq.queue[i].reset_line_stipple)
+	       draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
+	    
+	    draw->pipeline.first->line( draw->pipeline.first, &draw->pq.queue[i] );
+	 }
+	 break;
+      case RP_POINT:
+	 draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
+	 for (i = 0; i < draw->pq.queue_nr; i++)
+	    draw->pipeline.first->point( draw->pipeline.first, &draw->pq.queue[i] );
+	 break;
       }
-      break;
-   case RP_LINE:
-      for (i = 0; i < draw->pq.queue_nr; i++) {
-	 if (draw->pq.queue[i].reset_line_stipple)
-	    draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
-
-	 draw->pipeline.first->line( draw->pipeline.first, &draw->pq.queue[i] );
+   }
+   else {
+      switch (draw->reduced_prim) {
+      case RP_TRI:
+	 for (i = 0; i < draw->pq.queue_nr; i++) 
+	    draw->pipeline.first->tri( draw->pipeline.first, &draw->pq.queue[i] );
+	 break;
+      case RP_LINE:
+	 for (i = 0; i < draw->pq.queue_nr; i++) 
+	    draw->pipeline.first->line( draw->pipeline.first, &draw->pq.queue[i] );
+	 break;
+      case RP_POINT:
+	 for (i = 0; i < draw->pq.queue_nr; i++)
+	    draw->pipeline.first->point( draw->pipeline.first, &draw->pq.queue[i] );
+	 break;
       }
-      break;
-   case RP_POINT:
-      draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
-      for (i = 0; i < draw->pq.queue_nr; i++)
-	 draw->pipeline.first->point( draw->pipeline.first, &draw->pq.queue[i] );
-      break;
    }
 
    draw->pq.queue_nr = 0;   
@@ -231,7 +249,7 @@ static void do_ef_triangle( struct draw_context *draw,
 }
 
 
-static void do_quad( struct draw_context *draw,
+static void do_ef_quad( struct draw_context *draw,
 		     unsigned v0,
 		     unsigned v1,
 		     unsigned v2,
@@ -243,6 +261,16 @@ static void do_quad( struct draw_context *draw,
    do_ef_triangle( draw, 0, omitEdge3, v1, v2, v3 );
 }
 
+static void do_quad( struct draw_context *draw,   /* quad as two do_triangle() calls; no edge-flag args */
+		     unsigned v0,
+		     unsigned v1,
+		     unsigned v2,
+		     unsigned v3 )
+{
+   do_triangle( draw, v0, v1, v3 );   /* the two triangles share the v1-v3 diagonal */
+   do_triangle( draw, v1, v2, v3 );
+}
+
 
 /**
  * Main entrypoint to draw some number of points/lines/triangles
@@ -252,6 +280,8 @@ draw_prim( struct draw_context *draw,
 	   unsigned prim, unsigned start, unsigned count )
 {
    unsigned i;
+   boolean unfilled = (draw->rasterizer->fill_cw != PIPE_POLYGON_MODE_FILL ||
+		       draw->rasterizer->fill_ccw != PIPE_POLYGON_MODE_FILL);
 
 //   _mesa_printf("%s (%d) %d/%d\n", __FUNCTION__, draw->prim, start, count );
 
@@ -289,24 +319,32 @@ draw_prim( struct draw_context *draw,
       break;
 
    case PIPE_PRIM_LINE_STRIP:
-      if (count >= 2) {
-	 for (i = 1; i < count; i++) {
-	    do_line( draw,
-		     i == 1,
-		     start + i - 1,
-		     start + i );
-	 }
+      for (i = 1; i < count; i++) {
+	 do_line( draw,
+		  i == 1,
+		  start + i - 1,
+		  start + i );
       }
       break;
 
    case PIPE_PRIM_TRIANGLES:
-      for (i = 0; i+2 < count; i += 3) {
-	 do_ef_triangle( draw,
-			 1, 
-			 ~0,
+      if (unfilled) {
+	 for (i = 0; i+2 < count; i += 3) {
+	    do_ef_triangle( draw,
+			    1, 
+			    ~0,
+			    start + i + 0,
+			    start + i + 1,
+			    start + i + 2 );
+	 }
+      } 
+      else {
+	 for (i = 0; i+2 < count; i += 3) {
+	    do_triangle( draw,
 			 start + i + 0,
 			 start + i + 1,
 			 start + i + 2 );
+	 }
       }
       break;
 
@@ -340,27 +378,49 @@ draw_prim( struct draw_context *draw,
 
 
    case PIPE_PRIM_QUADS:
-      for (i = 0; i+3 < count; i += 4) {
-	 do_quad( draw,
-		  start + i + 0,
-		  start + i + 1,
-		  start + i + 2,
-		  start + i + 3);
+      if (unfilled) {
+	 for (i = 0; i+3 < count; i += 4) {
+	    do_ef_quad( draw,
+			start + i + 0,
+			start + i + 1,
+			start + i + 2,
+			start + i + 3);
+	 }
+      }
+      else {
+	 for (i = 0; i+3 < count; i += 4) {
+	    do_quad( draw,
+		     start + i + 0,
+		     start + i + 1,
+		     start + i + 2,
+		     start + i + 3);
+	 }
       }
       break;
 
    case PIPE_PRIM_QUAD_STRIP:
-      for (i = 0; i+3 < count; i += 2) {
-	 do_quad( draw,
-		  start + i + 2,
-		  start + i + 0,
-		  start + i + 1,
-		  start + i + 3);
+      if (unfilled) {
+	 for (i = 0; i+3 < count; i += 2) {
+	    do_ef_quad( draw,
+			start + i + 2,
+			start + i + 0,
+			start + i + 1,
+			start + i + 3);
+	 }
+      }
+      else {
+	 for (i = 0; i+3 < count; i += 2) {
+	    do_quad( draw,
+		     start + i + 2,
+		     start + i + 0,
+		     start + i + 1,
+		     start + i + 3);
+	 }
       }
       break;
 
    case PIPE_PRIM_POLYGON:
-      if (count >= 3) {
+      if (unfilled) {
 	 unsigned ef_mask = (1<<2) | (1<<0);
 
 	 for (i = 0; i+2 < count; i++) {
@@ -378,6 +438,14 @@ draw_prim( struct draw_context *draw,
 	    ef_mask &= ~(1<<2);
 	 }
       }
+      else {
+	 for (i = 0; i+2 < count; i++) {
+	    do_triangle( draw,
+			 start + i + 1,
+			 start + i + 2,
+			 start + 0);
+	 }
+      }
       break;
 
    default:
-- 
cgit v1.2.3


From 1aaed3856878a39beb5aab0402d2adb8b277e812 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Tue, 29 Jan 2008 15:17:56 +0000
Subject: gallium: weaken assert slightly

---
 src/mesa/pipe/draw/draw_vf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/mesa/pipe/draw/draw_vf.c b/src/mesa/pipe/draw/draw_vf.c
index 958d31933b..06b84b93cc 100644
--- a/src/mesa/pipe/draw/draw_vf.c
+++ b/src/mesa/pipe/draw/draw_vf.c
@@ -174,7 +174,7 @@ unsigned draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
 
       }
       else {
-	 assert(vf->lookup[map[i].attrib] == 0);
+	 assert(vf->lookup[map[i].attrib] == 0 || format == DRAW_EMIT_1F_CONST);
 	 vf->lookup[map[i].attrib] = &vf->attr[j];
 
 	 vf->attr[j].attrib = map[i].attrib;
-- 
cgit v1.2.3


From 7f2713a29ff46a608de0feac2f56f034dbc738cb Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 29 Jan 2008 11:22:57 -0700
Subject: Cell: use _pack_rgba8() from pack_rgba8.h to do float[4]->uint color
 conversion

texcyl.c is twice as fast now in non-texture mode
---
 src/mesa/pipe/cell/spu/spu_tri.c | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 19a231d9c4..7c6a54134f 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -29,6 +29,8 @@
  * Triangle rendering within a tile.
  */
 
+#include <pack_rgba8.h>
+
 #include "pipe/p_compiler.h"
 #include "pipe/p_format.h"
 #include "pipe/p_util.h"
@@ -38,7 +40,6 @@
 #include "spu_tri.h"
 
 
-
 /**
  * Simplified types taken from other parts of Gallium
  */
@@ -252,19 +253,11 @@ eval_z( struct setup_stage *setup,
 static INLINE uint
 pack_color(const float color[4])
 {
-   uint r = (uint) (color[0] * 255.0);
-   uint g = (uint) (color[1] * 255.0);
-   uint b = (uint) (color[2] * 255.0);
-   uint a = (uint) (color[3] * 255.0);
-   r = MIN2(r, 255);
-   g = MIN2(g, 255);
-   b = MIN2(b, 255);
-   a = MIN2(a, 255);
    switch (spu.fb.color_format) {
    case PIPE_FORMAT_A8R8G8B8_UNORM:
-      return (a << 24) | (r << 16) | (g << 8) | b;
+      return _pack_rgba8(color[3], color[0], color[1], color[2]);
    case PIPE_FORMAT_B8G8R8A8_UNORM:
-      return (b << 24) | (g << 16) | (r << 8) | a;
+      return _pack_rgba8(color[2], color[1], color[0], color[3]);
    default:
       ASSERT(0);
       return 0;
-- 
cgit v1.2.3


From 17ef840af40c9228ee0f4f7453bc00e318d9e6c4 Mon Sep 17 00:00:00 2001
From: Michal Krol <michal@tungstengraphics.com>
Date: Tue, 29 Jan 2008 16:41:10 +0100
Subject: gallium: Fix build on WinXP.

---
 src/mesa/pipe/draw/draw_clip.c              |  2 +-
 src/mesa/pipe/draw/draw_vertex_fetch.c      | 16 ++++++++--------
 src/mesa/pipe/draw/draw_vertex_shader.c     |  4 ++--
 src/mesa/pipe/draw/draw_vf.c                | 10 ++++++----
 src/mesa/pipe/draw/draw_vf_generic.c        |  2 --
 src/mesa/pipe/pipebuffer/pb_buffer_fenced.c |  4 ++--
 6 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/draw/draw_clip.c b/src/mesa/pipe/draw/draw_clip.c
index da20028904..61130c5600 100644
--- a/src/mesa/pipe/draw/draw_clip.c
+++ b/src/mesa/pipe/draw/draw_clip.c
@@ -406,7 +406,7 @@ clip_init_state( struct draw_stage *stage )
 {
    struct clipper *clipper = clipper_stage( stage );
 
-   clipper->flat = stage->draw->rasterizer->flatshade;
+   clipper->flat = stage->draw->rasterizer->flatshade ? TRUE : FALSE;
 
    if (clipper->flat) {
       const struct pipe_shader_state *vs = stage->draw->vertex_shader->state;
diff --git a/src/mesa/pipe/draw/draw_vertex_fetch.c b/src/mesa/pipe/draw/draw_vertex_fetch.c
index 89e4c256a7..b23f487e74 100644
--- a/src/mesa/pipe/draw/draw_vertex_fetch.c
+++ b/src/mesa/pipe/draw/draw_vertex_fetch.c
@@ -326,6 +326,10 @@ static void fetch_xyz_rgb( struct draw_context *draw,
 			   const unsigned *elts,
 			   unsigned count )
 {
+   const unsigned *pitch   = draw->vertex_fetch.pitch;
+   const ubyte **src       = draw->vertex_fetch.src_ptr;
+   int i;
+
    assert(count <= 4);
 
 //   _mesa_printf("%s\n", __FUNCTION__);
@@ -333,10 +337,6 @@ static void fetch_xyz_rgb( struct draw_context *draw,
    /* loop over vertex attributes (vertex shader inputs)
     */
 
-   const unsigned *pitch   = draw->vertex_fetch.pitch;
-   const ubyte **src       = draw->vertex_fetch.src_ptr;
-   int i;
-
    for (i = 0; i < 4; i++) {
       {
 	 const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
@@ -366,15 +366,15 @@ static void fetch_xyz_rgb_st( struct draw_context *draw,
 			      const unsigned *elts,
 			      unsigned count )
 {
+   const unsigned *pitch   = draw->vertex_fetch.pitch;
+   const ubyte **src       = draw->vertex_fetch.src_ptr;
+   int i;
+
    assert(count <= 4);
 
    /* loop over vertex attributes (vertex shader inputs)
     */
 
-   const unsigned *pitch   = draw->vertex_fetch.pitch;
-   const ubyte **src       = draw->vertex_fetch.src_ptr;
-   int i;
-
    for (i = 0; i < 4; i++) {
       {
 	 const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
diff --git a/src/mesa/pipe/draw/draw_vertex_shader.c b/src/mesa/pipe/draw/draw_vertex_shader.c
index 0806e23d6c..b851da845f 100644
--- a/src/mesa/pipe/draw/draw_vertex_shader.c
+++ b/src/mesa/pipe/draw/draw_vertex_shader.c
@@ -199,7 +199,7 @@ run_vertex_program(struct draw_context *draw,
 void
 draw_vertex_shader_queue_flush(struct draw_context *draw)
 {
-   unsigned i, j;
+   unsigned i;
 
    assert(draw->vs.queue_nr != 0);
 
@@ -219,7 +219,7 @@ draw_vertex_shader_queue_flush(struct draw_context *draw)
    for (i = 0; i < draw->vs.queue_nr; i += 4) {
       struct vertex_header *dests[4];
       unsigned elts[4];
-      int n = MIN2(4, draw->vs.queue_nr - i);
+      int j, n = MIN2(4, draw->vs.queue_nr - i);
 
       for (j = 0; j < n; j++) {
          elts[j] = draw->vs.queue[i + j].elt;
diff --git a/src/mesa/pipe/draw/draw_vf.c b/src/mesa/pipe/draw/draw_vf.c
index 06b84b93cc..0debea1f12 100644
--- a/src/mesa/pipe/draw/draw_vf.c
+++ b/src/mesa/pipe/draw/draw_vf.c
@@ -32,7 +32,7 @@
 #include "draw_vf.h"
 
 
-#define DBG 0
+#define DRAW_VF_DBG 0
 
 
 /* TODO: remove this */
@@ -166,9 +166,10 @@ unsigned draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
    for (j = 0, i = 0; i < nr; i++) {
       const unsigned format = map[i].format;
       if (format == DRAW_EMIT_PAD) {
-	 if (DBG)
+#if (DRAW_VF_DBG)
 	    _mesa_printf("%d: pad %d, offset %d\n", i,  
 			 map[i].offset, offset);  
+#endif
 
 	 offset += map[i].offset;
 
@@ -186,10 +187,11 @@ unsigned draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
 	 if(vf->attr[j].isconst)
 	    memcpy(vf->attr[j].data, &map[i].data, vf->attr[j].vertattrsize);
 	 
-	 if (DBG)
+#if (DRAW_VF_DBG)
 	    _mesa_printf("%d: %s, offset %d\n", i,  
 			 draw_vf_format_info[format].name,
 			 vf->attr[j].vertoffset);   
+#endif
 
 	 offset += draw_vf_format_info[format].attrsize;
 	 j++;
@@ -303,7 +305,7 @@ void draw_vf_destroy( struct draw_vertex_fetch *vf )
        * to unify them, but this probably won't change until this
        * module gets another overhaul.
        */
-      _mesa_exec_free((void *) fp->func);
+      //_mesa_exec_free((void *) fp->func);
       FREE(fp);
    }
    
diff --git a/src/mesa/pipe/draw/draw_vf_generic.c b/src/mesa/pipe/draw/draw_vf_generic.c
index 0caa798396..7f5f56ef9c 100644
--- a/src/mesa/pipe/draw/draw_vf_generic.c
+++ b/src/mesa/pipe/draw/draw_vf_generic.c
@@ -29,8 +29,6 @@
 
 #include <assert.h>
 
-#include "simple_list.h"
-
 #include "pipe/p_compiler.h"
 #include "pipe/p_util.h"
 
diff --git a/src/mesa/pipe/pipebuffer/pb_buffer_fenced.c b/src/mesa/pipe/pipebuffer/pb_buffer_fenced.c
index 349647fe6e..4cf4222db9 100644
--- a/src/mesa/pipe/pipebuffer/pb_buffer_fenced.c
+++ b/src/mesa/pipe/pipebuffer/pb_buffer_fenced.c
@@ -145,7 +145,7 @@ _fenced_buffer_list_check_free(struct fenced_buffer_list *fenced_list,
       /* Do the delayed destroy:
        */
       pb_reference(&fenced_buf->buffer, NULL);
-      free(fenced_buf);
+      FREE(fenced_buf);
    }
 }
 
@@ -162,7 +162,7 @@ fenced_buffer_destroy(struct pb_buffer *buf)
    }
    else {
       pb_reference(&fenced_buf->buffer, NULL);
-      free(fenced_buf);
+      FREE(fenced_buf);
    }
    
    if ((fenced_list->numDelayed % fenced_list->checkDelayed) == 0)
-- 
cgit v1.2.3


From ed0327980a73947cab0ae619cdcfa7455259bff2 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Wed, 30 Jan 2008 15:24:56 +0900
Subject: gallium: Remove draw_vertex_fetch::lookup.

It is not being used, and would be dangerous to use given the possibility of constants.
---
 src/mesa/pipe/draw/draw_vf.c | 5 -----
 src/mesa/pipe/draw/draw_vf.h | 2 --
 2 files changed, 7 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/draw/draw_vf.c b/src/mesa/pipe/draw/draw_vf.c
index 0debea1f12..64d9ed02a9 100644
--- a/src/mesa/pipe/draw/draw_vf.c
+++ b/src/mesa/pipe/draw/draw_vf.c
@@ -161,8 +161,6 @@ unsigned draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
 
    assert(nr < PIPE_ATTRIB_MAX);
 
-   memset(vf->lookup, 0, sizeof(vf->lookup));
-
    for (j = 0, i = 0; i < nr; i++) {
       const unsigned format = map[i].format;
       if (format == DRAW_EMIT_PAD) {
@@ -175,9 +173,6 @@ unsigned draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
 
       }
       else {
-	 assert(vf->lookup[map[i].attrib] == 0 || format == DRAW_EMIT_1F_CONST);
-	 vf->lookup[map[i].attrib] = &vf->attr[j];
-
 	 vf->attr[j].attrib = map[i].attrib;
 	 vf->attr[j].format = format;
 	 vf->attr[j].insert = draw_vf_format_info[format].insert;
diff --git a/src/mesa/pipe/draw/draw_vf.h b/src/mesa/pipe/draw/draw_vf.h
index 911ea07bdf..09cf4d3a6a 100644
--- a/src/mesa/pipe/draw/draw_vf.h
+++ b/src/mesa/pipe/draw/draw_vf.h
@@ -159,8 +159,6 @@ struct draw_vertex_fetch
    unsigned attr_count;
    unsigned vertex_stride;
 
-   struct draw_vf_attr *lookup[PIPE_ATTRIB_MAX];
-   
    draw_vf_emit_func emit;
 
    /* Parameters and constants for codegen:
-- 
cgit v1.2.3


From eb0e0d38eae02df17e2c11503dc047718c1244ad Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Wed, 30 Jan 2008 16:46:41 +0900
Subject: gallium: Teach draw_vf about draw vertices.

This reduces the emit overhead, which is significant since we're
emitting one vertex at a time.
---
 src/mesa/pipe/draw/draw_vbuf.c | 147 ++---------------------------------
 src/mesa/pipe/draw/draw_vf.c   | 171 ++++++++++++++++++++++++++++++++++-------
 src/mesa/pipe/draw/draw_vf.h   |  17 ++--
 3 files changed, 161 insertions(+), 174 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/draw/draw_vbuf.c b/src/mesa/pipe/draw/draw_vbuf.c
index 92a8b9fbcf..ac03001d8f 100644
--- a/src/mesa/pipe/draw/draw_vbuf.c
+++ b/src/mesa/pipe/draw/draw_vbuf.c
@@ -35,7 +35,6 @@
 
 
 #include <assert.h>
-#include <stddef.h>
 
 #include "pipe/p_util.h"
 
@@ -268,8 +267,7 @@ emit_vertex( struct vbuf_stage *vbuf,
 #if 0
       {
 	 static float data[256]; 
-	 draw_vf_set_data(vbuf->vf, vertex->data);
-	 draw_vf_emit_vertices(vbuf->vf, 1, data);
+	 draw_vf_emit_vertex(vbuf->vf, vertex, data);
 	 if(memcmp((uint8_t *)vbuf->vertex_ptr - vbuf->vertex_size, data, vbuf->vertex_size)) {
             fprintf(stderr, "With VF:\n");
             dump_emitted_vertex(vbuf->vinfo, (uint8_t *)data);
@@ -281,149 +279,13 @@ emit_vertex( struct vbuf_stage *vbuf,
 #endif
    }
    else {
-      draw_vf_set_data(vbuf->vf, vertex->data);
-      draw_vf_emit_vertices(vbuf->vf, 1, vbuf->vertex_ptr);
+      draw_vf_emit_vertex(vbuf->vf, vertex, vbuf->vertex_ptr);
    
       vbuf->vertex_ptr += vbuf->vertex_size/4;
    }
 }
 
 
-static void
-vbuf_set_vf_attributes(struct vbuf_stage *vbuf ) 
-{
-   const struct vertex_info *vinfo = vbuf->vinfo;
-   struct draw_vf_attr_map attrs[PIPE_MAX_SHADER_INPUTS];
-   uint i;
-   uint count = 0;  /* for debug/sanity */
-   unsigned nr_attrs = 0;
-   
-   if(!vbuf->vf)
-      return;
-   
-//   fprintf(stderr, "emit vertex %d to %p\n", 
-//           vbuf->nr_vertices, vbuf->vertex_ptr);
-
-#if 0
-   if(vertex->vertex_id != UNDEFINED_VERTEX_ID) {
-      if(vertex->vertex_id < vbuf->nr_vertices)
-	 return;
-      else
-	 fprintf(stderr, "Bad vertex id 0x%04x (>= 0x%04x)\n", 
-	         vertex->vertex_id, vbuf->nr_vertices);
-      return;
-   }
-#endif
-   
-   for (i = 0; i < vinfo->num_attribs; i++) {
-      uint j = vinfo->src_index[i];
-      switch (vinfo->emit[i]) {
-      case EMIT_OMIT:
-         /* no-op */
-         break;
-      case EMIT_ALL: {
-         /* just copy the whole vertex as-is to the vbuf */
-	 unsigned k, s = vinfo->size;
-         assert(i == 0);
-         assert(j == 0);
-         /* copy the vertex header */
-         /* XXX: we actually don't copy the header, just pad it */
-	 attrs[nr_attrs].attrib = 0;
-	 attrs[nr_attrs].format = DRAW_EMIT_PAD;
-	 attrs[nr_attrs].offset = offsetof(struct vertex_header, data);
-	 s -= offsetof(struct vertex_header, data)/4;
-         count += offsetof(struct vertex_header, data)/4;
-	 nr_attrs++;
-	 /* copy the vertex data */
-         for(k = 0; k < (s & ~0x3); k += 4) {
-      	    attrs[nr_attrs].attrib = k/4;
-      	    attrs[nr_attrs].format = DRAW_EMIT_4F;
-      	    attrs[nr_attrs].offset = 0;
-      	    nr_attrs++;
-            count += 4;
-         }
-         /* tail */
-         /* XXX: actually, this shouldn't be needed */
- 	 attrs[nr_attrs].attrib = k/4;
-  	 attrs[nr_attrs].offset = 0;
-         switch(s & 0x3) {
-         case 0:
-            break;
-         case 1:
-      	    attrs[nr_attrs].format = DRAW_EMIT_1F;
-      	    nr_attrs++;
-            count += 1;
-            break;
-         case 2:
-      	    attrs[nr_attrs].format = DRAW_EMIT_2F;
-      	    nr_attrs++;
-            count += 2;
-            break;
-         case 3:
-      	    attrs[nr_attrs].format = DRAW_EMIT_3F;
-      	    nr_attrs++;
-            count += 3;
-            break;
-         }
-         break;
-      }
-      case EMIT_1F:
-	 attrs[nr_attrs].attrib = j;
-	 attrs[nr_attrs].format = DRAW_EMIT_1F;
-	 attrs[nr_attrs].offset = 0;
-	 nr_attrs++;
-         count++;
-         break;
-      case EMIT_1F_PSIZE:
-	 attrs[nr_attrs].attrib = j;
-	 attrs[nr_attrs].format = DRAW_EMIT_1F_CONST;
-	 attrs[nr_attrs].offset = 0;
-	 attrs[nr_attrs].data.f[0] = vbuf->stage.draw->rasterizer->point_size;
-	 nr_attrs++;
-         count++;
-         break;
-      case EMIT_2F:
-	 attrs[nr_attrs].attrib = j;
-	 attrs[nr_attrs].format = DRAW_EMIT_2F;
-	 attrs[nr_attrs].offset = 0;
-	 nr_attrs++;
-         count += 2;
-         break;
-      case EMIT_3F:
-	 attrs[nr_attrs].attrib = j;
-	 attrs[nr_attrs].format = DRAW_EMIT_3F;
-	 attrs[nr_attrs].offset = 0;
-	 nr_attrs++;
-         count += 3;
-         break;
-      case EMIT_4F:
-	 attrs[nr_attrs].attrib = j;
-	 attrs[nr_attrs].format = DRAW_EMIT_4F;
-	 attrs[nr_attrs].offset = 0;
-	 nr_attrs++;
-         count += 4;
-         break;
-      case EMIT_4UB:
-	 attrs[nr_attrs].attrib = j;
-	 attrs[nr_attrs].format = DRAW_EMIT_4UB_4F_BGRA;
-	 attrs[nr_attrs].offset = 0;
-	 nr_attrs++;
-         count += 1;
-         break;
-      default:
-         assert(0);
-      }
-   }
-   
-   assert(count == vinfo->size);  
-   
-   draw_vf_set_vertex_attributes(vbuf->vf, 
-                                 attrs, 
-                                 nr_attrs, 
-                                 vbuf->vertex_size);
-}
-
-
 static void 
 vbuf_tri( struct draw_stage *stage,
           struct prim_header *prim )
@@ -498,7 +360,10 @@ vbuf_set_prim( struct vbuf_stage *vbuf, uint newprim )
 
    vbuf->vinfo = vinfo;
    vbuf->vertex_size = vertex_size;
-   vbuf_set_vf_attributes(vbuf);
+   if(vbuf->vf)
+      draw_vf_set_vertex_info(vbuf->vf, 
+                              vbuf->vinfo,
+                              vbuf->stage.draw->rasterizer->point_size);
    
    if (!vbuf->vertices)
       vbuf_alloc_vertices(vbuf);
diff --git a/src/mesa/pipe/draw/draw_vf.c b/src/mesa/pipe/draw/draw_vf.c
index 64d9ed02a9..0da8e59ad6 100644
--- a/src/mesa/pipe/draw/draw_vf.c
+++ b/src/mesa/pipe/draw/draw_vf.c
@@ -26,6 +26,8 @@
  */
 
 
+#include <stddef.h>
+
 #include "pipe/p_compiler.h"
 #include "pipe/p_util.h"
 
@@ -151,10 +153,11 @@ static void choose_emit_func( struct draw_vertex_fetch *vf,
 
 
-unsigned draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf, 
-				 const struct draw_vf_attr_map *map,
-				 unsigned nr, 
-				 unsigned vertex_stride )
+static unsigned 
+draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf, 
+                               const struct draw_vf_attr_map *map,
+                               unsigned nr, 
+                               unsigned vertex_stride )
 {
    unsigned offset = 0;
    unsigned i, j;
@@ -202,6 +205,133 @@ unsigned draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
 }
 
 
+void draw_vf_set_vertex_info( struct draw_vertex_fetch *vf, 
+                              const struct vertex_info *vinfo,
+                              float point_size )
+{
+   unsigned i, j, k;
+   struct draw_vf_attr *a = vf->attr;
+   struct draw_vf_attr_map attrs[PIPE_MAX_SHADER_INPUTS];
+   unsigned count = 0;  /* for debug/sanity */
+   unsigned nr_attrs = 0;
+   
+   for (i = 0; i < vinfo->num_attribs; i++) {
+      j = vinfo->src_index[i];
+      switch (vinfo->emit[i]) {
+      case EMIT_OMIT:
+         /* no-op */
+         break;
+      case EMIT_ALL: {
+         /* just copy the whole vertex as-is to the vbuf */
+	 unsigned s = vinfo->size;
+         assert(i == 0);
+         assert(j == 0);
+         /* copy the vertex header */
+         /* XXX: we actually don't copy the header, just pad it */
+	 attrs[nr_attrs].attrib = 0;
+	 attrs[nr_attrs].format = DRAW_EMIT_PAD;
+	 attrs[nr_attrs].offset = offsetof(struct vertex_header, data);
+	 s -= offsetof(struct vertex_header, data)/4;
+         count += offsetof(struct vertex_header, data)/4;
+	 nr_attrs++;
+	 /* copy the vertex data */
+         for(k = 0; k < (s & ~0x3); k += 4) {
+      	    attrs[nr_attrs].attrib = k/4;
+      	    attrs[nr_attrs].format = DRAW_EMIT_4F;
+      	    attrs[nr_attrs].offset = 0;
+      	    nr_attrs++;
+            count += 4;
+         }
+         /* tail */
+         /* XXX: actually, this shouldn't be needed */
+ 	 attrs[nr_attrs].attrib = k/4;
+  	 attrs[nr_attrs].offset = 0;
+         switch(s & 0x3) {
+         case 0:
+            break;
+         case 1:
+      	    attrs[nr_attrs].format = DRAW_EMIT_1F;
+      	    nr_attrs++;
+            count += 1;
+            break;
+         case 2:
+      	    attrs[nr_attrs].format = DRAW_EMIT_2F;
+      	    nr_attrs++;
+            count += 2;
+            break;
+         case 3:
+      	    attrs[nr_attrs].format = DRAW_EMIT_3F;
+      	    nr_attrs++;
+            count += 3;
+            break;
+         }
+         break;
+      }
+      case EMIT_1F:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_1F;
+	 attrs[nr_attrs].offset = 0;
+	 nr_attrs++;
+         count++;
+         break;
+      case EMIT_1F_PSIZE:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_1F_CONST;
+	 attrs[nr_attrs].offset = 0;
+	 attrs[nr_attrs].data.f[0] = point_size;
+	 nr_attrs++;
+         count++;
+         break;
+      case EMIT_2F:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_2F;
+	 attrs[nr_attrs].offset = 0;
+	 nr_attrs++;
+         count += 2;
+         break;
+      case EMIT_3F:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_3F;
+	 attrs[nr_attrs].offset = 0;
+	 nr_attrs++;
+         count += 3;
+         break;
+      case EMIT_4F:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_4F;
+	 attrs[nr_attrs].offset = 0;
+	 nr_attrs++;
+         count += 4;
+         break;
+      case EMIT_4UB:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_4UB_4F_BGRA;
+	 attrs[nr_attrs].offset = 0;
+	 nr_attrs++;
+         count += 1;
+         break;
+      default:
+         assert(0);
+      }
+   }
+   
+   assert(count == vinfo->size);  
+   
+   draw_vf_set_vertex_attributes(vf, 
+                                 attrs, 
+                                 nr_attrs, 
+                                 vinfo->size * sizeof(float) );
+
+   for (j = 0; j < vf->attr_count; j++) {
+      a[j].inputsize = 4;
+      a[j].do_insert = a[j].insert[4 - 1];
+      if(a[j].isconst) {
+	 a[j].inputptr = a[j].data;
+	 a[j].inputstride = 0;
+      }
+   }
+}
+
 
 #if 0
 /* Set attribute pointers, adjusted for start position:
@@ -229,38 +359,27 @@ void draw_vf_set_sources( struct draw_vertex_fetch *vf,
 #endif
 
 
-/* Set attribute pointers, adjusted for start position:
+/**
+ * Emit a vertex to dest.  
  */
-void draw_vf_set_data( struct draw_vertex_fetch *vf,
-                       float data[][4])
+void draw_vf_emit_vertex( struct draw_vertex_fetch *vf,
+                          struct vertex_header *vertex,
+                          void *dest )
 {
    struct draw_vf_attr *a = vf->attr;
    unsigned j;
    
    for (j = 0; j < vf->attr_count; j++) {
-      a[j].inputstride = 0; /* XXX: one-vertex-max ATM */ 
-      a[j].inputsize = 4;
-      a[j].do_insert = a[j].insert[4 - 1];
-      if(a[j].isconst)
-	 a[j].inputptr = a[j].data;
-      else
-	 a[j].inputptr = (uint8_t *)&data[a[j].attrib][0];
+      if(!a[j].isconst) {
+	 a[j].inputptr = (uint8_t *)&vertex->data[a[j].attrib][0];
+	 a[j].inputstride = 0; /* XXX: one-vertex-max ATM */
+      }
    }
+   
+   vf->emit( vf, 1, (uint8_t*) dest );
 }
 
 
-/* Emit count VB vertices to dest.  
- */
-void draw_vf_emit_vertices( struct draw_vertex_fetch *vf,
-		       unsigned count,
-		       void *dest )
-{
-   vf->emit( vf, count, (uint8_t*) dest );	
-}
-
-
-
-
 
 struct draw_vertex_fetch *draw_vf_create( void )
 {
diff --git a/src/mesa/pipe/draw/draw_vf.h b/src/mesa/pipe/draw/draw_vf.h
index 09cf4d3a6a..e694b98675 100644
--- a/src/mesa/pipe/draw/draw_vf.h
+++ b/src/mesa/pipe/draw/draw_vf.h
@@ -33,6 +33,7 @@
 #include "pipe/p_state.h"
 
 #include "draw_vertex.h"
+#include "draw_private.h" // for vertex_header
 
 
 enum draw_vf_attr_format {
@@ -78,11 +79,17 @@ struct draw_vertex_fetch;
 
 
+#if 0
 unsigned 
 draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
                                const struct draw_vf_attr_map *map,
                                unsigned nr, 
                                unsigned vertex_stride );
+#endif
+
+void draw_vf_set_vertex_info( struct draw_vertex_fetch *vf, 
+                              const struct vertex_info *vinfo,
+                              float point_size );
 
 #if 0
 void 
@@ -92,13 +99,9 @@ draw_vf_set_sources( struct draw_vertex_fetch *vf,
 #endif
 
 void 
-draw_vf_set_data( struct draw_vertex_fetch *vf,
-                  float data[][4]);
-
-void 
-draw_vf_emit_vertices( struct draw_vertex_fetch *vf,
-		       unsigned count,
-		       void *dest );
+draw_vf_emit_vertex( struct draw_vertex_fetch *vf,
+                     struct vertex_header *vertex,
+                     void *dest );
 
 struct draw_vertex_fetch *
 draw_vf_create( void );
-- 
cgit v1.2.3


From da6eac242d9b79ad77389b6ab579804bc0261005 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 30 Jan 2008 11:49:26 -0700
Subject: Cell: move CELL_MAX_SPUS

---
 src/mesa/pipe/cell/common.h           | 2 ++
 src/mesa/pipe/cell/ppu/cell_context.h | 3 ---
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index 90aa46a534..d5e86863d4 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -61,6 +61,8 @@
 #define ROUNDUP16(k)  (((k) + 0xf) & ~0xf)
 
 
+#define CELL_MAX_SPUS 6
+
 #define TILE_SIZE 32
 
 
diff --git a/src/mesa/pipe/cell/ppu/cell_context.h b/src/mesa/pipe/cell/ppu/cell_context.h
index 7d234f3e45..65b89518ad 100644
--- a/src/mesa/pipe/cell/ppu/cell_context.h
+++ b/src/mesa/pipe/cell/ppu/cell_context.h
@@ -38,9 +38,6 @@
 #include "pipe/cell/common.h"
 
 
-#define CELL_MAX_SPUS 6
-
-
 struct cell_vbuf_render;
 
 struct cell_vertex_shader_state
-- 
cgit v1.2.3


From 41bdf4cf4c924e4c04c62dc144584cf7ead3cf44 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 30 Jan 2008 11:49:51 -0700
Subject: Cell: make wait_on_mask() static/inlined

---
 src/mesa/pipe/cell/spu/spu_main.c | 19 -------------------
 src/mesa/pipe/cell/spu/spu_main.h | 23 +++++++++++++++++++++--
 2 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 6e02f2c964..6886f283be 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -31,7 +31,6 @@
 
 #include <stdio.h>
 #include <libmisc.h>
-#include <spu_mfcio.h>
 
 #include "spu_main.h"
 #include "spu_render.h"
@@ -52,24 +51,6 @@ boolean Debug = FALSE;
 struct spu_global spu;
 
 
-void
-wait_on_mask(unsigned tagMask)
-{
-   mfc_write_tag_mask( tagMask );
-   /* wait for completion of _any_ DMAs specified by tagMask */
-   mfc_read_tag_status_any();
-}
-
-
-static INLINE void
-wait_on_mask_all(unsigned tagMask)
-{
-   mfc_write_tag_mask( tagMask );
-   /* wait for completion of _any_ DMAs specified by tagMask */
-   mfc_read_tag_status_all();
-}
-
-
 /**
  * Tell the PPU that this SPU has finished copying a buffer to
  * local store and that it may be reused by the PPU.
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index 009e046ba5..8908bf8bc0 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -29,6 +29,8 @@
 #define SPU_MAIN_H
 
 
+#include <spu_mfcio.h>
+
 #include "pipe/cell/common.h"
 #include "pipe/draw/draw_vertex.h"
 #include "pipe/p_state.h"
@@ -90,8 +92,25 @@ extern boolean Debug;
 
 
-extern void
-wait_on_mask(unsigned tag);
+static INLINE void
+wait_on_mask(unsigned tagMask)
+{
+   mfc_write_tag_mask( tagMask );
+   /* wait for completion of _any_ DMAs specified by tagMask */
+   mfc_read_tag_status_any();
+}
+
+
+static INLINE void
+wait_on_mask_all(unsigned tagMask)
+{
+   mfc_write_tag_mask( tagMask );
+   /* wait for completion of _all_ DMAs specified by tagMask */
+   mfc_read_tag_status_all();
+}
+
+
+
 
 
 static INLINE void
-- 
cgit v1.2.3


From 0d3f60ec64965a07ef26b551436f0d768154e4d3 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 30 Jan 2008 11:56:14 -0700
Subject: Cell: check tile status before wait_on_mask()

---
 src/mesa/pipe/cell/spu/spu_tri.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 7c6a54134f..01a47a4851 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -278,7 +278,7 @@ do_depth_test(struct setup_stage *setup, int x, int y, unsigned mask)
       /* now, _really_ clear the tile */
       clear_z_tile(&ztile);
    }
-   else {
+   else if (tile_status_z[setup->ty][setup->tx] != TILE_STATUS_DIRTY) {
       /* make sure we've got the tile from main mem */
       wait_on_mask(1 << TAG_READ_TILE_Z);
    }
@@ -403,7 +403,7 @@ emit_quad( struct setup_stage *setup, int x, int y, unsigned mask )
          /* now, _really_ clear the tile */
          clear_c_tile(&ctile);
       }
-      else {
+      else if (tile_status[setup->ty][setup->tx] != TILE_STATUS_DIRTY) {
          /* make sure we've got the tile from main mem */
          wait_on_mask(1 << TAG_READ_TILE_COLOR);
       }
-- 
cgit v1.2.3


From dcf41a0eed71a67060b4efa9ab4befc86eafc177 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 30 Jan 2008 11:56:41 -0700
Subject: Cell: minor code refactoring, movement

---
 src/mesa/pipe/cell/spu/spu_render.c | 85 ++++++++++++++++++++++++-------------
 1 file changed, 55 insertions(+), 30 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_render.c b/src/mesa/pipe/cell/spu/spu_render.c
index 21a286a23d..f506095116 100644
--- a/src/mesa/pipe/cell/spu/spu_render.c
+++ b/src/mesa/pipe/cell/spu/spu_render.c
@@ -88,6 +88,55 @@ my_tile(uint tx, uint ty)
 }
 
 
+/**
+ * Start fetching non-clear color/Z tiles from main memory
+ */
+static INLINE void
+get_cz_tiles(uint tx, uint ty)
+{
+   if (spu.depth_stencil.depth.enabled) {
+      if (tile_status_z[ty][tx] != TILE_STATUS_CLEAR) {
+         get_tile(tx, ty, &ztile, TAG_READ_TILE_Z, 1);
+      }
+   }
+
+   if (tile_status[ty][tx] != TILE_STATUS_CLEAR) {
+      get_tile(tx, ty, &ctile, TAG_READ_TILE_COLOR, 0);
+   }
+}
+
+
+/**
+ * Start putting dirty color/Z tiles back to main memory
+ */
+static INLINE void
+put_cz_tiles(uint tx, uint ty)
+{
+   if (tile_status_z[ty][tx] == TILE_STATUS_DIRTY) {
+      put_tile(tx, ty, &ztile, TAG_WRITE_TILE_Z, 1);
+      tile_status_z[ty][tx] = TILE_STATUS_DEFINED;
+   }
+
+   if (tile_status[ty][tx] == TILE_STATUS_DIRTY) {
+      put_tile(tx, ty, &ctile, TAG_WRITE_TILE_COLOR, 0);
+      tile_status[ty][tx] = TILE_STATUS_DEFINED;
+   }
+}
+
+
+/**
+ * Wait for 'put' of color/z tiles to complete.
+ */
+static INLINE void
+wait_put_cz_tiles(void)
+{
+   wait_on_mask(1 << TAG_WRITE_TILE_COLOR);
+   if (spu.depth_stencil.depth.enabled) {
+      wait_on_mask(1 << TAG_WRITE_TILE_Z);
+   }
+}
+
+
 /**
  * Render primitives
  * \param pos_incr  returns value indicating how may words to skip after
@@ -122,6 +171,9 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
 
    ASSERT(sizeof(*render) % 4 == 0);
    ASSERT(total_vertex_bytes % 16 == 0);
+   ASSERT(render->prim_type == PIPE_PRIM_TRIANGLES);
+   ASSERT(render->num_indexes % 3 == 0);
+
 
    /* indexes are right after the render command in the batch buffer */
    indexes = (const ushort *) (render + 1);
@@ -186,21 +238,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       if (!my_tile(tx, ty))
          continue;
 
-      /* Start fetching color/z tiles.  We'll wait for completion when
-       * we need read/write to them later in triangle rasterization.
-       */
-      if (spu.depth_stencil.depth.enabled) {
-         if (tile_status_z[ty][tx] != TILE_STATUS_CLEAR) {
-            get_tile(tx, ty, &ztile, TAG_READ_TILE_Z, 1);
-         }
-      }
-
-      if (tile_status[ty][tx] != TILE_STATUS_CLEAR) {
-         get_tile(tx, ty, &ctile, TAG_READ_TILE_COLOR, 0);
-      }
-
-      ASSERT(render->prim_type == PIPE_PRIM_TRIANGLES);
-      ASSERT(render->num_indexes % 3 == 0);
+      get_cz_tiles(tx, ty);
 
       /* loop over tris */
       for (j = 0; j < render->num_indexes; j += 3) {
@@ -214,22 +252,9 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       }
 
       /* write color/z tiles back to main framebuffer, if dirtied */
-      if (tile_status[ty][tx] == TILE_STATUS_DIRTY) {
-         put_tile(tx, ty, &ctile, TAG_WRITE_TILE_COLOR, 0);
-         tile_status[ty][tx] = TILE_STATUS_DEFINED;
-      }
-      if (spu.depth_stencil.depth.enabled) {
-         if (tile_status_z[ty][tx] == TILE_STATUS_DIRTY) {
-            put_tile(tx, ty, &ztile, TAG_WRITE_TILE_Z, 1);
-            tile_status_z[ty][tx] = TILE_STATUS_DEFINED;
-         }
-      }
+      put_cz_tiles(tx, ty);
 
-      /* XXX move these... */
-      wait_on_mask(1 << TAG_WRITE_TILE_COLOR);
-      if (spu.depth_stencil.depth.enabled) {
-         wait_on_mask(1 << TAG_WRITE_TILE_Z);
-      }
+      wait_put_cz_tiles(); /* XXX seems unnecessary... */
    }
 
    if (Debug)
-- 
cgit v1.2.3


From 022bf6dfa1ef1c18f0439553e39e473b678848e2 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 30 Jan 2008 12:08:23 -0700
Subject: Cell: make 'setup' a regular var instead of passing around a pointer
 everywhere

We'll never have more than one of these objects.
Avoiding pointer deref improves performance a bit.
---
 src/mesa/pipe/cell/spu/spu_tri.c | 419 +++++++++++++++++++--------------------
 1 file changed, 209 insertions(+), 210 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 01a47a4851..5bb2cb12e3 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -135,6 +135,12 @@ struct setup_stage {
 };
 
 
+
+static struct setup_stage setup;
+
+
+
+
 #if 0
 /**
  * Basically a cast wrapper.
@@ -147,33 +153,33 @@ static INLINE struct setup_stage *setup_stage( struct draw_stage *stage )
 
 #if 0
 /**
- * Clip setup->quad against the scissor/surface bounds.
+ * Clip setup.quad against the scissor/surface bounds.
  */
 static INLINE void
 quad_clip(struct setup_stage *setup)
 {
-   const struct pipe_scissor_state *cliprect = &setup->softpipe->cliprect;
+   const struct pipe_scissor_state *cliprect = &setup.softpipe->cliprect;
    const int minx = (int) cliprect->minx;
    const int maxx = (int) cliprect->maxx;
    const int miny = (int) cliprect->miny;
    const int maxy = (int) cliprect->maxy;
 
-   if (setup->quad.x0 >= maxx ||
-       setup->quad.y0 >= maxy ||
-       setup->quad.x0 + 1 < minx ||
-       setup->quad.y0 + 1 < miny) {
+   if (setup.quad.x0 >= maxx ||
+       setup.quad.y0 >= maxy ||
+       setup.quad.x0 + 1 < minx ||
+       setup.quad.y0 + 1 < miny) {
       /* totally clipped */
-      setup->quad.mask = 0x0;
+      setup.quad.mask = 0x0;
       return;
    }
-   if (setup->quad.x0 < minx)
-      setup->quad.mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
-   if (setup->quad.y0 < miny)
-      setup->quad.mask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
-   if (setup->quad.x0 == maxx - 1)
-      setup->quad.mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
-   if (setup->quad.y0 == maxy - 1)
-      setup->quad.mask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
+   if (setup.quad.x0 < minx)
+      setup.quad.mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
+   if (setup.quad.y0 < miny)
+      setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
+   if (setup.quad.x0 == maxx - 1)
+      setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
+   if (setup.quad.y0 == maxy - 1)
+      setup.quad.mask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
 }
 #endif
 
@@ -185,9 +191,9 @@ static INLINE void
 clip_emit_quad(struct setup_stage *setup)
 {
    quad_clip(setup);
-   if (setup->quad.mask) {
-      struct softpipe_context *sp = setup->softpipe;
-      sp->quad.first->run(sp->quad.first, &setup->quad);
+   if (setup.quad.mask) {
+      struct softpipe_context *sp = setup.softpipe;
+      sp->quad.first->run(sp->quad.first, &setup.quad);
    }
 }
 #endif
@@ -198,8 +204,7 @@ clip_emit_quad(struct setup_stage *setup)
  * Eg: four colors will be compute.
  */
 static INLINE void
-eval_coeff( struct setup_stage *setup, uint slot,
-            float x, float y, float result[4][4])
+eval_coeff(uint slot, float x, float y, float result[4][4])
 {
    switch (spu.vertex_info.interp_mode[slot]) {
    case INTERP_CONSTANT:
@@ -209,7 +214,7 @@ eval_coeff( struct setup_stage *setup, uint slot,
             result[QUAD_TOP_LEFT][i] =
             result[QUAD_TOP_RIGHT][i] =
             result[QUAD_BOTTOM_LEFT][i] =
-            result[QUAD_BOTTOM_RIGHT][i] = setup->coef[slot].a0[i];
+            result[QUAD_BOTTOM_RIGHT][i] = setup.coef[slot].a0[i];
          }
       }
       break;
@@ -219,12 +224,12 @@ eval_coeff( struct setup_stage *setup, uint slot,
    default:
       {
          uint i;
-         const float *dadx = setup->coef[slot].dadx;
-         const float *dady = setup->coef[slot].dady;
+         const float *dadx = setup.coef[slot].dadx;
+         const float *dady = setup.coef[slot].dady;
 
          /* loop over XYZW comps */
          for (i = 0; i < 4; i++) {
-            result[QUAD_TOP_LEFT][i] = setup->coef[slot].a0[i] + x * dadx[i] + y * dady[i];
+            result[QUAD_TOP_LEFT][i] = setup.coef[slot].a0[i] + x * dadx[i] + y * dady[i];
             result[QUAD_TOP_RIGHT][i] = result[0][i] + dadx[i];
             result[QUAD_BOTTOM_LEFT][i] = result[0][i] + dady[i];
             result[QUAD_BOTTOM_RIGHT][i] = result[0][i] + dadx[i] + dady[i];
@@ -235,15 +240,14 @@ eval_coeff( struct setup_stage *setup, uint slot,
 
 
 static INLINE void
-eval_z( struct setup_stage *setup,
-        float x, float y, float result[4])
+eval_z(float x, float y, float result[4])
 {
    const uint slot = 0;
    const uint i = 2;
-   const float *dadx = setup->coef[slot].dadx;
-   const float *dady = setup->coef[slot].dady;
+   const float *dadx = setup.coef[slot].dadx;
+   const float *dady = setup.coef[slot].dady;
 
-   result[QUAD_TOP_LEFT] = setup->coef[slot].a0[i] + x * dadx[i] + y * dady[i];
+   result[QUAD_TOP_LEFT] = setup.coef[slot].a0[i] + x * dadx[i] + y * dady[i];
    result[QUAD_TOP_RIGHT] = result[0] + dadx[i];
    result[QUAD_BOTTOM_LEFT] = result[0] + dady[i];
    result[QUAD_BOTTOM_RIGHT] = result[0] + dadx[i] + dady[i];
@@ -266,23 +270,23 @@ pack_color(const float color[4])
 
 
 static uint
-do_depth_test(struct setup_stage *setup, int x, int y, unsigned mask)
+do_depth_test(int x, int y, unsigned mask)
 {
-   int ix = x - setup->cliprect_minx;
-   int iy = y - setup->cliprect_miny;
+   int ix = x - setup.cliprect_minx;
+   int iy = y - setup.cliprect_miny;
    float zvals[4];
 
-   eval_z(setup, (float) x, (float) y, zvals);
+   eval_z((float) x, (float) y, zvals);
 
-   if (tile_status_z[setup->ty][setup->tx] == TILE_STATUS_CLEAR) {
+   if (tile_status_z[setup.ty][setup.tx] == TILE_STATUS_CLEAR) {
       /* now, _really_ clear the tile */
       clear_z_tile(&ztile);
    }
-   else if (tile_status_z[setup->ty][setup->tx] != TILE_STATUS_DIRTY) {
+   else if (tile_status_z[setup.ty][setup.tx] != TILE_STATUS_DIRTY) {
       /* make sure we've got the tile from main mem */
       wait_on_mask(1 << TAG_READ_TILE_Z);
    }
-   tile_status_z[setup->ty][setup->tx] = TILE_STATUS_DIRTY;
+   tile_status_z[setup.ty][setup.tx] = TILE_STATUS_DIRTY;
 
 
    if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) {
@@ -363,31 +367,31 @@ do_depth_test(struct setup_stage *setup, int x, int y, unsigned mask)
  * Emit a quad (pass to next stage).  No clipping is done.
  */
 static INLINE void
-emit_quad( struct setup_stage *setup, int x, int y, unsigned mask )
+emit_quad( int x, int y, unsigned mask )
 {
 #if 0
-   struct softpipe_context *sp = setup->softpipe;
-   setup->quad.x0 = x;
-   setup->quad.y0 = y;
-   setup->quad.mask = mask;
-   sp->quad.first->run(sp->quad.first, &setup->quad);
+   struct softpipe_context *sp = setup.softpipe;
+   setup.quad.x0 = x;
+   setup.quad.y0 = y;
+   setup.quad.mask = mask;
+   sp->quad.first->run(sp->quad.first, &setup.quad);
 #else
    /* Cell: "write" quad fragments to the tile by setting prim color */
-   const int ix = x - setup->cliprect_minx;
-   const int iy = y - setup->cliprect_miny;
+   const int ix = x - setup.cliprect_minx;
+   const int iy = y - setup.cliprect_miny;
    uint colors[4];  /* indexed by QUAD_x */
 
    if (spu.texture.start) {
       float texcoords[4][4];
       uint i;
-      eval_coeff(setup, 2, (float) x, (float) y, texcoords);
+      eval_coeff(2, (float) x, (float) y, texcoords);
       for (i = 0; i < 4; i++) {
          colors[i] = sample_texture(texcoords[i]);
       }
    }
    else {
       float fcolors[4][4];
-      eval_coeff(setup, 1, (float) x, (float) y, fcolors);
+      eval_coeff(1, (float) x, (float) y, fcolors);
       colors[QUAD_TOP_LEFT] = pack_color(fcolors[QUAD_TOP_LEFT]);
       colors[QUAD_TOP_RIGHT] = pack_color(fcolors[QUAD_TOP_RIGHT]);
       colors[QUAD_BOTTOM_LEFT] = pack_color(fcolors[QUAD_BOTTOM_LEFT]);
@@ -395,19 +399,19 @@ emit_quad( struct setup_stage *setup, int x, int y, unsigned mask )
    }
 
    if (spu.depth_stencil.depth.enabled) {
-      mask &= do_depth_test(setup, x, y, mask);
+      mask &= do_depth_test(x, y, mask);
    }
 
    if (mask) {
-      if (tile_status[setup->ty][setup->tx] == TILE_STATUS_CLEAR) {
+      if (tile_status[setup.ty][setup.tx] == TILE_STATUS_CLEAR) {
          /* now, _really_ clear the tile */
          clear_c_tile(&ctile);
       }
-      else if (tile_status[setup->ty][setup->tx] != TILE_STATUS_DIRTY) {
+      else if (tile_status[setup.ty][setup.tx] != TILE_STATUS_DIRTY) {
          /* make sure we've got the tile from main mem */
          wait_on_mask(1 << TAG_READ_TILE_COLOR);
       }
-      tile_status[setup->ty][setup->tx] = TILE_STATUS_DIRTY;
+      tile_status[setup.ty][setup.tx] = TILE_STATUS_DIRTY;
 
       if (mask & MASK_TOP_LEFT)
          ctile.t32[iy][ix] = colors[QUAD_TOP_LEFT];
@@ -439,20 +443,20 @@ static INLINE int block( int x )
  * this is pretty nasty...  may need to rework flush_spans again to
  * fix it, if possible.
  */
-static unsigned calculate_mask( struct setup_stage *setup, int x )
+static unsigned calculate_mask( int x )
 {
    unsigned mask = 0x0;
 
-   if (x >= setup->span.left[0] && x < setup->span.right[0]) 
+   if (x >= setup.span.left[0] && x < setup.span.right[0]) 
       mask |= MASK_TOP_LEFT;
 
-   if (x >= setup->span.left[1] && x < setup->span.right[1]) 
+   if (x >= setup.span.left[1] && x < setup.span.right[1]) 
       mask |= MASK_BOTTOM_LEFT;
       
-   if (x+1 >= setup->span.left[0] && x+1 < setup->span.right[0]) 
+   if (x+1 >= setup.span.left[0] && x+1 < setup.span.right[0]) 
       mask |= MASK_TOP_RIGHT;
 
-   if (x+1 >= setup->span.left[1] && x+1 < setup->span.right[1]) 
+   if (x+1 >= setup.span.left[1] && x+1 < setup.span.right[1]) 
       mask |= MASK_BOTTOM_RIGHT;
 
    return mask;
@@ -462,28 +466,28 @@ static unsigned calculate_mask( struct setup_stage *setup, int x )
 /**
  * Render a horizontal span of quads
  */
-static void flush_spans( struct setup_stage *setup )
+static void flush_spans( void )
 {
    int minleft, maxright;
    int x;
 
-   switch (setup->span.y_flags) {
+   switch (setup.span.y_flags) {
    case 0x3:
       /* both odd and even lines written (both quad rows) */
-      minleft = MIN2(setup->span.left[0], setup->span.left[1]);
-      maxright = MAX2(setup->span.right[0], setup->span.right[1]);
+      minleft = MIN2(setup.span.left[0], setup.span.left[1]);
+      maxright = MAX2(setup.span.right[0], setup.span.right[1]);
       break;
 
    case 0x1:
       /* only even line written (quad top row) */
-      minleft = setup->span.left[0];
-      maxright = setup->span.right[0];
+      minleft = setup.span.left[0];
+      maxright = setup.span.right[0];
       break;
 
    case 0x2:
       /* only odd line written (quad bottom row) */
-      minleft = setup->span.left[1];
-      maxright = setup->span.right[1];
+      minleft = setup.span.left[1];
+      maxright = setup.span.right[1];
       break;
 
    default:
@@ -494,31 +498,29 @@ static void flush_spans( struct setup_stage *setup )
     * calculate_mask() could be simplified a bit...
     */
    for (x = block(minleft); x <= block(maxright); x += 2) {
-      emit_quad( setup, x, setup->span.y, 
-                 calculate_mask( setup, x ) );
+      emit_quad( x, setup.span.y, 
+                 calculate_mask( x ) );
    }
 
-   setup->span.y = 0;
-   setup->span.y_flags = 0;
-   setup->span.right[0] = 0;
-   setup->span.right[1] = 0;
+   setup.span.y = 0;
+   setup.span.y_flags = 0;
+   setup.span.right[0] = 0;
+   setup.span.right[1] = 0;
 }
 
 #if DEBUG_VERTS
-static void print_vertex(const struct setup_stage *setup,
-                         const struct vertex_header *v)
+static void print_vertex(const struct vertex_header *v)
 {
    int i;
    fprintf(stderr, "Vertex: (%p)\n", v);
-   for (i = 0; i < setup->quad.nr_attrs; i++) {
+   for (i = 0; i < setup.quad.nr_attrs; i++) {
       fprintf(stderr, "  %d: %f %f %f %f\n",  i, 
               v->data[i][0], v->data[i][1], v->data[i][2], v->data[i][3]);
    }
 }
 #endif
 
-static boolean setup_sort_vertices( struct setup_stage *setup,
-				      const struct prim_header *prim )
+static boolean setup_sort_vertices(const struct prim_header *prim )
 {
    const struct vertex_header *v0 = prim->v[0];
    const struct vertex_header *v1 = prim->v[1];
@@ -526,12 +528,12 @@ static boolean setup_sort_vertices( struct setup_stage *setup,
 
 #if DEBUG_VERTS
    fprintf(stderr, "Triangle:\n");
-   print_vertex(setup, v0);
-   print_vertex(setup, v1);
-   print_vertex(setup, v2);
+   print_vertex(v0);
+   print_vertex(v1);
+   print_vertex(v2);
 #endif
 
-   setup->vprovoke = v2;
+   setup.vprovoke = v2;
 
    /* determine bottom to top order of vertices */
    {
@@ -541,65 +543,65 @@ static boolean setup_sort_vertices( struct setup_stage *setup,
       if (y0 <= y1) {
 	 if (y1 <= y2) {
 	    /* y0<=y1<=y2 */
-	    setup->vmin = v0;   
-	    setup->vmid = v1;   
-	    setup->vmax = v2;
+	    setup.vmin = v0;   
+	    setup.vmid = v1;   
+	    setup.vmax = v2;
 	 }
 	 else if (y2 <= y0) {
 	    /* y2<=y0<=y1 */
-	    setup->vmin = v2;   
-	    setup->vmid = v0;   
-	    setup->vmax = v1;   
+	    setup.vmin = v2;   
+	    setup.vmid = v0;   
+	    setup.vmax = v1;   
 	 }
 	 else {
 	    /* y0<=y2<=y1 */
-	    setup->vmin = v0;   
-	    setup->vmid = v2;   
-	    setup->vmax = v1;  
+	    setup.vmin = v0;   
+	    setup.vmid = v2;   
+	    setup.vmax = v1;  
 	 }
       }
       else {
 	 if (y0 <= y2) {
 	    /* y1<=y0<=y2 */
-	    setup->vmin = v1;   
-	    setup->vmid = v0;   
-	    setup->vmax = v2;  
+	    setup.vmin = v1;   
+	    setup.vmid = v0;   
+	    setup.vmax = v2;  
 	 }
 	 else if (y2 <= y1) {
 	    /* y2<=y1<=y0 */
-	    setup->vmin = v2;   
-	    setup->vmid = v1;   
-	    setup->vmax = v0;  
+	    setup.vmin = v2;   
+	    setup.vmid = v1;   
+	    setup.vmax = v0;  
 	 }
 	 else {
 	    /* y1<=y2<=y0 */
-	    setup->vmin = v1;   
-	    setup->vmid = v2;   
-	    setup->vmax = v0;
+	    setup.vmin = v1;   
+	    setup.vmid = v2;   
+	    setup.vmax = v0;
 	 }
       }
    }
 
    /* Check if triangle is completely outside the tile bounds */
-   if (setup->vmin->data[0][1] > setup->cliprect_maxy)
+   if (setup.vmin->data[0][1] > setup.cliprect_maxy)
       return FALSE;
-   if (setup->vmax->data[0][1] < setup->cliprect_miny)
+   if (setup.vmax->data[0][1] < setup.cliprect_miny)
       return FALSE;
-   if (setup->vmin->data[0][0] < setup->cliprect_minx &&
-       setup->vmid->data[0][0] < setup->cliprect_minx &&
-       setup->vmax->data[0][0] < setup->cliprect_minx)
+   if (setup.vmin->data[0][0] < setup.cliprect_minx &&
+       setup.vmid->data[0][0] < setup.cliprect_minx &&
+       setup.vmax->data[0][0] < setup.cliprect_minx)
       return FALSE;
-   if (setup->vmin->data[0][0] > setup->cliprect_maxx &&
-       setup->vmid->data[0][0] > setup->cliprect_maxx &&
-       setup->vmax->data[0][0] > setup->cliprect_maxx)
+   if (setup.vmin->data[0][0] > setup.cliprect_maxx &&
+       setup.vmid->data[0][0] > setup.cliprect_maxx &&
+       setup.vmax->data[0][0] > setup.cliprect_maxx)
       return FALSE;
 
-   setup->ebot.dx = setup->vmid->data[0][0] - setup->vmin->data[0][0];
-   setup->ebot.dy = setup->vmid->data[0][1] - setup->vmin->data[0][1];
-   setup->emaj.dx = setup->vmax->data[0][0] - setup->vmin->data[0][0];
-   setup->emaj.dy = setup->vmax->data[0][1] - setup->vmin->data[0][1];
-   setup->etop.dx = setup->vmax->data[0][0] - setup->vmid->data[0][0];
-   setup->etop.dy = setup->vmax->data[0][1] - setup->vmid->data[0][1];
+   setup.ebot.dx = setup.vmid->data[0][0] - setup.vmin->data[0][0];
+   setup.ebot.dy = setup.vmid->data[0][1] - setup.vmin->data[0][1];
+   setup.emaj.dx = setup.vmax->data[0][0] - setup.vmin->data[0][0];
+   setup.emaj.dy = setup.vmax->data[0][1] - setup.vmin->data[0][1];
+   setup.etop.dx = setup.vmax->data[0][0] - setup.vmid->data[0][0];
+   setup.etop.dy = setup.vmax->data[0][1] - setup.vmid->data[0][1];
 
    /*
     * Compute triangle's area.  Use 1/area to compute partial
@@ -612,13 +614,13 @@ static boolean setup_sort_vertices( struct setup_stage *setup,
     * use the prim->det value because its sign is correct.
     */
    {
-      const float area = (setup->emaj.dx * setup->ebot.dy - 
-			    setup->ebot.dx * setup->emaj.dy);
+      const float area = (setup.emaj.dx * setup.ebot.dy - 
+			    setup.ebot.dx * setup.emaj.dy);
 
-      setup->oneoverarea = 1.0f / area;
+      setup.oneoverarea = 1.0f / area;
       /*
       _mesa_printf("%s one-over-area %f  area %f  det %f\n",
-                   __FUNCTION__, setup->oneoverarea, area, prim->det );
+                   __FUNCTION__, setup.oneoverarea, area, prim->det );
       */
    }
 
@@ -627,7 +629,7 @@ static boolean setup_sort_vertices( struct setup_stage *setup,
     *  - the GLSL gl_FrontFacing fragment attribute (bool)
     *  - two-sided stencil test
     */
-   setup->quad.facing = (prim->det > 0.0) ^ (setup->softpipe->rasterizer->front_winding == PIPE_WINDING_CW);
+   setup.quad.facing = (prim->det > 0.0) ^ (setup.softpipe->rasterizer->front_winding == PIPE_WINDING_CW);
 #endif
 
    return TRUE;
@@ -637,22 +639,22 @@ static boolean setup_sort_vertices( struct setup_stage *setup,
 /**
  * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
  * The value value comes from vertex->data[slot][i].
- * The result will be put into setup->coef[slot].a0[i].
+ * The result will be put into setup.coef[slot].a0[i].
  * \param slot  which attribute slot 
  * \param i  which component of the slot (0..3)
  */
-static void const_coeff(struct setup_stage *setup, uint slot)
+static void const_coeff(uint slot)
 {
    uint i;
    ASSERT(slot < PIPE_MAX_SHADER_INPUTS);
 
    for (i = 0; i < 4; i++) {
-      setup->coef[slot].dadx[i] = 0;
-      setup->coef[slot].dady[i] = 0;
+      setup.coef[slot].dadx[i] = 0;
+      setup.coef[slot].dady[i] = 0;
 
       /* need provoking vertex info!
        */
-      setup->coef[slot].a0[i] = setup->vprovoke->data[slot][i];
+      setup.coef[slot].a0[i] = setup.vprovoke->data[slot][i];
    }
 }
 
@@ -661,20 +663,19 @@ static void const_coeff(struct setup_stage *setup, uint slot)
  * Compute a0, dadx and dady for a linearly interpolated coefficient,
  * for a triangle.
  */
-static void tri_linear_coeff( struct setup_stage *setup,
-                              uint slot, uint firstComp, uint lastComp )
+static void tri_linear_coeff( uint slot, uint firstComp, uint lastComp )
 {
    uint i;
    for (i = firstComp; i < lastComp; i++) {
-      float botda = setup->vmid->data[slot][i] - setup->vmin->data[slot][i];
-      float majda = setup->vmax->data[slot][i] - setup->vmin->data[slot][i];
-      float a = setup->ebot.dy * majda - botda * setup->emaj.dy;
-      float b = setup->emaj.dx * botda - majda * setup->ebot.dx;
+      float botda = setup.vmid->data[slot][i] - setup.vmin->data[slot][i];
+      float majda = setup.vmax->data[slot][i] - setup.vmin->data[slot][i];
+      float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
+      float b = setup.emaj.dx * botda - majda * setup.ebot.dx;
    
       ASSERT(slot < PIPE_MAX_SHADER_INPUTS);
 
-      setup->coef[slot].dadx[i] = a * setup->oneoverarea;
-      setup->coef[slot].dady[i] = b * setup->oneoverarea;
+      setup.coef[slot].dadx[i] = a * setup.oneoverarea;
+      setup.coef[slot].dady[i] = b * setup.oneoverarea;
 
       /* calculate a0 as the value which would be sampled for the
        * fragment at (0,0), taking into account that we want to sample at
@@ -688,17 +689,17 @@ static void tri_linear_coeff( struct setup_stage *setup,
        * to define a0 as the sample at a pixel center somewhere near vmin
        * instead - i'll switch to this later.
        */
-      setup->coef[slot].a0[i] = (setup->vmin->data[slot][i] - 
-                                 (setup->coef[slot].dadx[i] * (setup->vmin->data[0][0] - 0.5f) + 
-                                  setup->coef[slot].dady[i] * (setup->vmin->data[0][1] - 0.5f)));
+      setup.coef[slot].a0[i] = (setup.vmin->data[slot][i] - 
+                                 (setup.coef[slot].dadx[i] * (setup.vmin->data[0][0] - 0.5f) + 
+                                  setup.coef[slot].dady[i] * (setup.vmin->data[0][1] - 0.5f)));
    }
 
    /*
    _mesa_printf("attr[%d].%c: %f dx:%f dy:%f\n",
 		slot, "xyzw"[i], 
-		setup->coef[slot].a0[i],
-		setup->coef[slot].dadx[i],
-		setup->coef[slot].dady[i]);
+		setup.coef[slot].a0[i],
+		setup.coef[slot].dadx[i],
+		setup.coef[slot].dady[i]);
    */
 }
 
@@ -712,46 +713,45 @@ static void tri_linear_coeff( struct setup_stage *setup,
  * Later, when we compute the value at a particular fragment position we'll
  * divide the interpolated value by the interpolated W at that fragment.
  */
-static void tri_persp_coeff( struct setup_stage *setup,
-                             unsigned slot,
+static void tri_persp_coeff( unsigned slot,
                              unsigned i )
 {
    /* premultiply by 1/w:
     */
-   float mina = setup->vmin->data[slot][i] * setup->vmin->data[0][3];
-   float mida = setup->vmid->data[slot][i] * setup->vmid->data[0][3];
-   float maxa = setup->vmax->data[slot][i] * setup->vmax->data[0][3];
+   float mina = setup.vmin->data[slot][i] * setup.vmin->data[0][3];
+   float mida = setup.vmid->data[slot][i] * setup.vmid->data[0][3];
+   float maxa = setup.vmax->data[slot][i] * setup.vmax->data[0][3];
 
    float botda = mida - mina;
    float majda = maxa - mina;
-   float a = setup->ebot.dy * majda - botda * setup->emaj.dy;
-   float b = setup->emaj.dx * botda - majda * setup->ebot.dx;
+   float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
+   float b = setup.emaj.dx * botda - majda * setup.ebot.dx;
       
    /*
    printf("tri persp %d,%d: %f %f %f\n", slot, i,
-          setup->vmin->data[slot][i],
-          setup->vmid->data[slot][i],
-          setup->vmax->data[slot][i]
+          setup.vmin->data[slot][i],
+          setup.vmid->data[slot][i],
+          setup.vmax->data[slot][i]
           );
    */
 
    assert(slot < PIPE_MAX_SHADER_INPUTS);
    assert(i <= 3);
 
-   setup->coef[slot].dadx[i] = a * setup->oneoverarea;
-   setup->coef[slot].dady[i] = b * setup->oneoverarea;
-   setup->coef[slot].a0[i] = (mina - 
-			    (setup->coef[slot].dadx[i] * (setup->vmin->data[0][0] - 0.5f) + 
-			     setup->coef[slot].dady[i] * (setup->vmin->data[0][1] - 0.5f)));
+   setup.coef[slot].dadx[i] = a * setup.oneoverarea;
+   setup.coef[slot].dady[i] = b * setup.oneoverarea;
+   setup.coef[slot].a0[i] = (mina - 
+			    (setup.coef[slot].dadx[i] * (setup.vmin->data[0][0] - 0.5f) + 
+			     setup.coef[slot].dady[i] * (setup.vmin->data[0][1] - 0.5f)));
 }
 #endif
 
 
 /**
- * Compute the setup->coef[] array dadx, dady, a0 values.
- * Must be called after setup->vmin,vmid,vmax,vprovoke are initialized.
+ * Compute the setup.coef[] array dadx, dady, a0 values.
+ * Must be called after setup.vmin,vmid,vmax,vprovoke are initialized.
  */
-static void setup_tri_coefficients( struct setup_stage *setup )
+static void setup_tri_coefficients(void)
 {
 #if 1
    uint i;
@@ -761,17 +761,17 @@ static void setup_tri_coefficients( struct setup_stage *setup )
       case INTERP_NONE:
          break;
       case INTERP_POS:
-         tri_linear_coeff(setup, i, 2, 3);
+         tri_linear_coeff(i, 2, 3);
          /* XXX interp W if PERSPECTIVE... */
          break;
       case INTERP_CONSTANT:
-         const_coeff(setup, i);
+         const_coeff(i);
          break;
       case INTERP_LINEAR:
-         tri_linear_coeff(setup, i, 0, 4);
+         tri_linear_coeff(i, 0, 4);
          break;
       case INTERP_PERSPECTIVE:
-         tri_linear_coeff(setup, i, 0, 4); /* XXX temporary */
+         tri_linear_coeff(i, 0, 4); /* XXX temporary */
          break;
       default:
          ASSERT(0);
@@ -781,35 +781,35 @@ static void setup_tri_coefficients( struct setup_stage *setup )
    ASSERT(spu.vertex_info.interp_mode[0] == INTERP_POS);
    ASSERT(spu.vertex_info.interp_mode[1] == INTERP_LINEAR ||
           spu.vertex_info.interp_mode[1] == INTERP_CONSTANT);
-   tri_linear_coeff(setup, 0, 2, 3);  /* slot 0, z */
-   tri_linear_coeff(setup, 1, 0, 4);  /* slot 1, color */
+   tri_linear_coeff(0, 2, 3);  /* slot 0, z */
+   tri_linear_coeff(1, 0, 4);  /* slot 1, color */
 #endif
 }
 
 
-static void setup_tri_edges( struct setup_stage *setup )
+static void setup_tri_edges(void)
 {
-   float vmin_x = setup->vmin->data[0][0] + 0.5f;
-   float vmid_x = setup->vmid->data[0][0] + 0.5f;
-
-   float vmin_y = setup->vmin->data[0][1] - 0.5f;
-   float vmid_y = setup->vmid->data[0][1] - 0.5f;
-   float vmax_y = setup->vmax->data[0][1] - 0.5f;
-
-   setup->emaj.sy = CEILF(vmin_y);
-   setup->emaj.lines = (int) CEILF(vmax_y - setup->emaj.sy);
-   setup->emaj.dxdy = setup->emaj.dx / setup->emaj.dy;
-   setup->emaj.sx = vmin_x + (setup->emaj.sy - vmin_y) * setup->emaj.dxdy;
-
-   setup->etop.sy = CEILF(vmid_y);
-   setup->etop.lines = (int) CEILF(vmax_y - setup->etop.sy);
-   setup->etop.dxdy = setup->etop.dx / setup->etop.dy;
-   setup->etop.sx = vmid_x + (setup->etop.sy - vmid_y) * setup->etop.dxdy;
-
-   setup->ebot.sy = CEILF(vmin_y);
-   setup->ebot.lines = (int) CEILF(vmid_y - setup->ebot.sy);
-   setup->ebot.dxdy = setup->ebot.dx / setup->ebot.dy;
-   setup->ebot.sx = vmin_x + (setup->ebot.sy - vmin_y) * setup->ebot.dxdy;
+   float vmin_x = setup.vmin->data[0][0] + 0.5f;
+   float vmid_x = setup.vmid->data[0][0] + 0.5f;
+
+   float vmin_y = setup.vmin->data[0][1] - 0.5f;
+   float vmid_y = setup.vmid->data[0][1] - 0.5f;
+   float vmax_y = setup.vmax->data[0][1] - 0.5f;
+
+   setup.emaj.sy = CEILF(vmin_y);
+   setup.emaj.lines = (int) CEILF(vmax_y - setup.emaj.sy);
+   setup.emaj.dxdy = setup.emaj.dx / setup.emaj.dy;
+   setup.emaj.sx = vmin_x + (setup.emaj.sy - vmin_y) * setup.emaj.dxdy;
+
+   setup.etop.sy = CEILF(vmid_y);
+   setup.etop.lines = (int) CEILF(vmax_y - setup.etop.sy);
+   setup.etop.dxdy = setup.etop.dx / setup.etop.dy;
+   setup.etop.sx = vmid_x + (setup.etop.sy - vmid_y) * setup.etop.dxdy;
+
+   setup.ebot.sy = CEILF(vmin_y);
+   setup.ebot.lines = (int) CEILF(vmid_y - setup.ebot.sy);
+   setup.ebot.dxdy = setup.ebot.dx / setup.ebot.dy;
+   setup.ebot.sx = vmin_x + (setup.ebot.sy - vmin_y) * setup.ebot.dxdy;
 }
 
 
@@ -817,15 +817,14 @@ static void setup_tri_edges( struct setup_stage *setup )
  * Render the upper or lower half of a triangle.
  * Scissoring/cliprect is applied here too.
  */
-static void subtriangle( struct setup_stage *setup,
-			 struct edge *eleft,
+static void subtriangle( struct edge *eleft,
 			 struct edge *eright,
 			 unsigned lines )
 {
-   const int minx = setup->cliprect_minx;
-   const int maxx = setup->cliprect_maxx;
-   const int miny = setup->cliprect_miny;
-   const int maxy = setup->cliprect_maxy;
+   const int minx = setup.cliprect_minx;
+   const int maxx = setup.cliprect_maxx;
+   const int miny = setup.cliprect_miny;
+   const int maxy = setup.cliprect_maxy;
    int y, start_y, finish_y;
    int sy = (int)eleft->sy;
 
@@ -867,14 +866,14 @@ static void subtriangle( struct setup_stage *setup,
 
       if (left < right) {
          int _y = sy + y;
-         if (block(_y) != setup->span.y) {
-            flush_spans(setup);
-            setup->span.y = block(_y);
+         if (block(_y) != setup.span.y) {
+            flush_spans();
+            setup.span.y = block(_y);
          }
 
-         setup->span.left[_y&1] = left;
-         setup->span.right[_y&1] = right;
-         setup->span.y_flags |= 1<<(_y&1);
+         setup.span.left[_y&1] = left;
+         setup.span.right[_y&1] = right;
+         setup.span.y_flags |= 1<<(_y&1);
       }
    }
 
@@ -892,41 +891,41 @@ static void subtriangle( struct setup_stage *setup,
  * Do setup for triangle rasterization, then render the triangle.
  */
 static void
-setup_tri(struct setup_stage *setup, struct prim_header *prim)
+setup_tri(struct prim_header *prim)
 {
-   if (!setup_sort_vertices( setup, prim )) {
+   if (!setup_sort_vertices( prim )) {
       return; /* totally clipped */
    }
 
-   setup_tri_coefficients( setup );
-   setup_tri_edges( setup );
+   setup_tri_coefficients();
+   setup_tri_edges();
 
 #if 0
-   setup->quad.prim = PRIM_TRI;
+   setup.quad.prim = PRIM_TRI;
 #endif
 
-   setup->span.y = 0;
-   setup->span.y_flags = 0;
-   setup->span.right[0] = 0;
-   setup->span.right[1] = 0;
-   /*   setup->span.z_mode = tri_z_mode( setup->ctx ); */
+   setup.span.y = 0;
+   setup.span.y_flags = 0;
+   setup.span.right[0] = 0;
+   setup.span.right[1] = 0;
+   /*   setup.span.z_mode = tri_z_mode( setup.ctx ); */
 
    /*   init_constant_attribs( setup ); */
       
-   if (setup->oneoverarea < 0.0) {
+   if (setup.oneoverarea < 0.0) {
       /* emaj on left:
        */
-      subtriangle( setup, &setup->emaj, &setup->ebot, setup->ebot.lines );
-      subtriangle( setup, &setup->emaj, &setup->etop, setup->etop.lines );
+      subtriangle( &setup.emaj, &setup.ebot, setup.ebot.lines );
+      subtriangle( &setup.emaj, &setup.etop, setup.etop.lines );
    }
    else {
       /* emaj on right:
        */
-      subtriangle( setup, &setup->ebot, &setup->emaj, setup->ebot.lines );
-      subtriangle( setup, &setup->etop, &setup->emaj, setup->etop.lines );
+      subtriangle( &setup.ebot, &setup.emaj, setup.ebot.lines );
+      subtriangle( &setup.etop, &setup.emaj, setup.etop.lines );
    }
 
-   flush_spans( setup );
+   flush_spans();
 }
 
 
@@ -939,7 +938,7 @@ void
 tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
 {
    struct prim_header tri;
-   struct setup_stage setup;
+   /*struct setup_stage setup;*/
 
    tri.v[0] = (struct vertex_header *) v0;
    tri.v[1] = (struct vertex_header *) v1;
@@ -954,5 +953,5 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
    setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
    setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
 
-   setup_tri(&setup, &tri);
+   setup_tri(&tri);
 }
-- 
cgit v1.2.3


From 7b149449df3a7de62f79eb96d5b722cc9d3b5912 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 30 Jan 2008 12:13:04 -0700
Subject: Cell: fold setup_tri() into tri_draw()

---
 src/mesa/pipe/cell/spu/spu_tri.c | 62 ++++++++++++----------------------------
 1 file changed, 19 insertions(+), 43 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 5bb2cb12e3..1c615a6e6a 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -47,9 +47,6 @@ struct vertex_header {
    float data[0][4];
 };
 
-struct prim_header {
-   struct vertex_header *v[3];
-};
 
 
 /* XXX fix this */
@@ -520,11 +517,10 @@ static void print_vertex(const struct vertex_header *v)
 }
 #endif
 
-static boolean setup_sort_vertices(const struct prim_header *prim )
+static boolean setup_sort_vertices(const struct vertex_header *v0,
+                                   const struct vertex_header *v1,
+                                   const struct vertex_header *v2)
 {
-   const struct vertex_header *v0 = prim->v[0];
-   const struct vertex_header *v1 = prim->v[1];
-   const struct vertex_header *v2 = prim->v[2];
 
 #if DEBUG_VERTS
    fprintf(stderr, "Triangle:\n");
@@ -888,22 +884,30 @@ static void subtriangle( struct edge *eleft,
 
 
 /**
- * Do setup for triangle rasterization, then render the triangle.
+ * Draw triangle into tile at (tx, ty) (tile coords)
+ * The tile data should have already been fetched.
  */
-static void
-setup_tri(struct prim_header *prim)
+void
+tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
 {
-   if (!setup_sort_vertices( prim )) {
+   setup.tx = tx;
+   setup.ty = ty;
+
+   /* set clipping bounds to tile bounds */
+   setup.cliprect_minx = tx * TILE_SIZE;
+   setup.cliprect_miny = ty * TILE_SIZE;
+   setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
+   setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
+
+   if (!setup_sort_vertices((struct vertex_header *) v0,
+                            (struct vertex_header *) v1,
+                            (struct vertex_header *) v2)) {
       return; /* totally clipped */
    }
 
    setup_tri_coefficients();
    setup_tri_edges();
 
-#if 0
-   setup.quad.prim = PRIM_TRI;
-#endif
-
    setup.span.y = 0;
    setup.span.y_flags = 0;
    setup.span.right[0] = 0;
@@ -927,31 +931,3 @@ setup_tri(struct prim_header *prim)
 
    flush_spans();
 }
-
-
-
-/**
- * Draw triangle into tile at (tx, ty) (tile coords)
- * The tile data should have already been fetched.
- */
-void
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
-{
-   struct prim_header tri;
-   /*struct setup_stage setup;*/
-
-   tri.v[0] = (struct vertex_header *) v0;
-   tri.v[1] = (struct vertex_header *) v1;
-   tri.v[2] = (struct vertex_header *) v2;
-
-   setup.tx = tx;
-   setup.ty = ty;
-
-   /* set clipping bounds to tile bounds */
-   setup.cliprect_minx = tx * TILE_SIZE;
-   setup.cliprect_miny = ty * TILE_SIZE;
-   setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
-   setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
-
-   setup_tri(&tri);
-}
-- 
cgit v1.2.3


From 24f0e54c1b9ff43dcb75758c8e0faba355c0617c Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 30 Jan 2008 15:26:51 -0700
Subject: Cell: start to SIMD-ize triangle attribute interpolation

Using the spu_add(), etc. intrinsics.
About a 15% speed-up with some tests.
---
 src/mesa/pipe/cell/spu/spu_main.h    |   7 ++
 src/mesa/pipe/cell/spu/spu_texture.c |   6 +-
 src/mesa/pipe/cell/spu/spu_texture.h |   2 +-
 src/mesa/pipe/cell/spu/spu_tri.c     | 126 +++++++++++++++++++----------------
 4 files changed, 79 insertions(+), 62 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index 8908bf8bc0..73f9ed29d6 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -36,6 +36,13 @@
 #include "pipe/p_state.h"
 
 
+typedef union
+{
+   vector float v;
+   float f[4];
+} float4;
+
+
 struct spu_framebuffer {
    void *color_start;              /**< addr of color surface in main memory */
    void *depth_start;              /**< addr of depth surface in main memory */
diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
index 6d566a5006..7a1ca097c0 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.c
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -128,11 +128,11 @@ get_tex_tile(uint i, uint j)
  * XXX this is extremely primitive for now.
  */
 uint
-sample_texture(const float *texcoord)
+sample_texture(float4 texcoord)
 {
    /* wrap/repeat */
-   uint i = (uint) (texcoord[0] * spu.texture.width) % spu.texture.width;
-   uint j = (uint) (texcoord[1] * spu.texture.height) % spu.texture.height;
+   uint i = (uint) (texcoord.f[0] * spu.texture.width) % spu.texture.width;
+   uint j = (uint) (texcoord.f[1] * spu.texture.height) % spu.texture.height;
    uint pos = get_tex_tile(i, j);
    uint texel = tex_tiles[pos].t32[j % TILE_SIZE][i % TILE_SIZE];
    return texel;
diff --git a/src/mesa/pipe/cell/spu/spu_texture.h b/src/mesa/pipe/cell/spu/spu_texture.h
index b75b7ac44f..938a42b549 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.h
+++ b/src/mesa/pipe/cell/spu/spu_texture.h
@@ -37,7 +37,7 @@ invalidate_tex_cache(void);
 
 
 extern uint
-sample_texture(const float *texcoord);
+sample_texture(float4 texcoord);
 
 
 #endif /* SPU_TEXTURE_H */
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 1c615a6e6a..4fc6d90895 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -81,9 +81,9 @@ struct edge {
 
 struct interp_coef
 {
-   float a0[4];
-   float dadx[4];
-   float dady[4];
+   float4 a0;
+   float4 dadx;
+   float4 dady;
 };
 
 
@@ -201,36 +201,31 @@ clip_emit_quad(struct setup_stage *setup)
  * Eg: four colors will be compute.
  */
 static INLINE void
-eval_coeff(uint slot, float x, float y, float result[4][4])
+eval_coeff(uint slot, float x, float y, float4 result[4])
 {
    switch (spu.vertex_info.interp_mode[slot]) {
    case INTERP_CONSTANT:
-      {
-         uint i;
-         for (i = 0; i < 4; i++) {
-            result[QUAD_TOP_LEFT][i] =
-            result[QUAD_TOP_RIGHT][i] =
-            result[QUAD_BOTTOM_LEFT][i] =
-            result[QUAD_BOTTOM_RIGHT][i] = setup.coef[slot].a0[i];
-         }
-      }
+      result[QUAD_TOP_LEFT] =
+      result[QUAD_TOP_RIGHT] =
+      result[QUAD_BOTTOM_LEFT] =
+      result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0;
       break;
 
    case INTERP_LINEAR:
       /* fall-through, for now */
    default:
       {
-         uint i;
-         const float *dadx = setup.coef[slot].dadx;
-         const float *dady = setup.coef[slot].dady;
-
-         /* loop over XYZW comps */
-         for (i = 0; i < 4; i++) {
-            result[QUAD_TOP_LEFT][i] = setup.coef[slot].a0[i] + x * dadx[i] + y * dady[i];
-            result[QUAD_TOP_RIGHT][i] = result[0][i] + dadx[i];
-            result[QUAD_BOTTOM_LEFT][i] = result[0][i] + dady[i];
-            result[QUAD_BOTTOM_RIGHT][i] = result[0][i] + dadx[i] + dady[i];
-         }
+         register vector float dadx = setup.coef[slot].dadx.v;
+         register vector float dady = setup.coef[slot].dady.v;
+         register vector float topLeft
+            = spu_add(setup.coef[slot].a0.v,
+                      spu_add(spu_mul(spu_splats(x), dadx),
+                              spu_mul(spu_splats(y), dady)));
+
+         result[QUAD_TOP_LEFT].v = topLeft;
+         result[QUAD_TOP_RIGHT].v = spu_add(topLeft, dadx);
+         result[QUAD_BOTTOM_LEFT].v = spu_add(topLeft, dady);
+         result[QUAD_BOTTOM_RIGHT].v = spu_add(spu_add(topLeft, dadx), dady);
       }
    }
 }
@@ -240,28 +235,46 @@ static INLINE void
 eval_z(float x, float y, float result[4])
 {
    const uint slot = 0;
-   const uint i = 2;
-   const float *dadx = setup.coef[slot].dadx;
-   const float *dady = setup.coef[slot].dady;
-
-   result[QUAD_TOP_LEFT] = setup.coef[slot].a0[i] + x * dadx[i] + y * dady[i];
-   result[QUAD_TOP_RIGHT] = result[0] + dadx[i];
-   result[QUAD_BOTTOM_LEFT] = result[0] + dady[i];
-   result[QUAD_BOTTOM_RIGHT] = result[0] + dadx[i] + dady[i];
+   const float dzdx = setup.coef[slot].dadx.f[2];
+   const float dzdy = setup.coef[slot].dady.f[2];
+   const float topLeft = setup.coef[slot].a0.f[2] + x * dzdx + y * dzdy;
+#if 1
+   result[QUAD_TOP_LEFT] = topLeft;
+   result[QUAD_TOP_RIGHT] = topLeft + dzdx;
+   result[QUAD_BOTTOM_LEFT] = topLeft + dzdy;
+   result[QUAD_BOTTOM_RIGHT] = topLeft + dzdx + dzdy;
+#else
+   /* XXX vectorize */
+   const vector float topLeftv = spu_splats(topLeft);
+   const vector float derivs
+      = (vector float) { 0.0, dzdx, dzdy, dzdx + dzdy };
+   vector float *res = (vector float *) result;
+   *res = spu_add(topLeftv, derivs);
+#endif
 }
 
 
-static INLINE uint
-pack_color(const float color[4])
+static INLINE void
+pack_colors(uint uicolors[4], const float4 fcolors[4])
 {
+   /* Pack four float colors (one per quad pixel) per spu.fb.color_format.
+    * XXX grab the code for _pack_rgba8() and use shuffle for the swizzle.
+    */
    switch (spu.fb.color_format) {
    case PIPE_FORMAT_A8R8G8B8_UNORM:
-      return _pack_rgba8(color[3], color[0], color[1], color[2]);
+      uicolors[0] = _pack_rgba8(fcolors[0].f[3], fcolors[0].f[0], fcolors[0].f[1], fcolors[0].f[2]);
+      uicolors[1] = _pack_rgba8(fcolors[1].f[3], fcolors[1].f[0], fcolors[1].f[1], fcolors[1].f[2]);
+      uicolors[2] = _pack_rgba8(fcolors[2].f[3], fcolors[2].f[0], fcolors[2].f[1], fcolors[2].f[2]);
+      uicolors[3] = _pack_rgba8(fcolors[3].f[3], fcolors[3].f[0], fcolors[3].f[1], fcolors[3].f[2]);
+      break;
    case PIPE_FORMAT_B8G8R8A8_UNORM:
-      return _pack_rgba8(color[2], color[1], color[0], color[3]);
+      uicolors[0] = _pack_rgba8(fcolors[0].f[2], fcolors[0].f[1], fcolors[0].f[0], fcolors[0].f[3]);
+      uicolors[1] = _pack_rgba8(fcolors[1].f[2], fcolors[1].f[1], fcolors[1].f[0], fcolors[1].f[3]);
+      uicolors[2] = _pack_rgba8(fcolors[2].f[2], fcolors[2].f[1], fcolors[2].f[0], fcolors[2].f[3]);
+      uicolors[3] = _pack_rgba8(fcolors[3].f[2], fcolors[3].f[1], fcolors[3].f[0], fcolors[3].f[3]);
+      break;
    default:
       ASSERT(0);
-      return 0;
    }
 }
 
@@ -379,7 +392,7 @@ emit_quad( int x, int y, unsigned mask )
    uint colors[4];  /* indexed by QUAD_x */
 
    if (spu.texture.start) {
-      float texcoords[4][4];
+      float4 texcoords[4];
       uint i;
       eval_coeff(2, (float) x, (float) y, texcoords);
       for (i = 0; i < 4; i++) {
@@ -387,12 +400,9 @@ emit_quad( int x, int y, unsigned mask )
       }
    }
    else {
-      float fcolors[4][4];
+      float4 fcolors[4];
       eval_coeff(1, (float) x, (float) y, fcolors);
-      colors[QUAD_TOP_LEFT] = pack_color(fcolors[QUAD_TOP_LEFT]);
-      colors[QUAD_TOP_RIGHT] = pack_color(fcolors[QUAD_TOP_RIGHT]);
-      colors[QUAD_BOTTOM_LEFT] = pack_color(fcolors[QUAD_BOTTOM_LEFT]);
-      colors[QUAD_BOTTOM_RIGHT] = pack_color(fcolors[QUAD_BOTTOM_RIGHT]);
+      pack_colors(colors, fcolors);
    }
 
    if (spu.depth_stencil.depth.enabled) {
@@ -645,12 +655,12 @@ static void const_coeff(uint slot)
    ASSERT(slot < PIPE_MAX_SHADER_INPUTS);
 
    for (i = 0; i < 4; i++) {
-      setup.coef[slot].dadx[i] = 0;
-      setup.coef[slot].dady[i] = 0;
+      setup.coef[slot].dadx.f[i] = 0;
+      setup.coef[slot].dady.f[i] = 0;
 
       /* need provoking vertex info!
        */
-      setup.coef[slot].a0[i] = setup.vprovoke->data[slot][i];
+      setup.coef[slot].a0.f[i] = setup.vprovoke->data[slot][i];
    }
 }
 
@@ -670,8 +680,8 @@ static void tri_linear_coeff( uint slot, uint firstComp, uint lastComp )
    
       ASSERT(slot < PIPE_MAX_SHADER_INPUTS);
 
-      setup.coef[slot].dadx[i] = a * setup.oneoverarea;
-      setup.coef[slot].dady[i] = b * setup.oneoverarea;
+      setup.coef[slot].dadx.f[i] = a * setup.oneoverarea;
+      setup.coef[slot].dady.f[i] = b * setup.oneoverarea;
 
       /* calculate a0 as the value which would be sampled for the
        * fragment at (0,0), taking into account that we want to sample at
@@ -685,17 +695,17 @@ static void tri_linear_coeff( uint slot, uint firstComp, uint lastComp )
        * to define a0 as the sample at a pixel center somewhere near vmin
        * instead - i'll switch to this later.
        */
-      setup.coef[slot].a0[i] = (setup.vmin->data[slot][i] - 
-                                 (setup.coef[slot].dadx[i] * (setup.vmin->data[0][0] - 0.5f) + 
-                                  setup.coef[slot].dady[i] * (setup.vmin->data[0][1] - 0.5f)));
+      setup.coef[slot].a0.f[i] = (setup.vmin->data[slot][i] - 
+                                 (setup.coef[slot].dadx.f[i] * (setup.vmin->data[0][0] - 0.5f) + 
+                                  setup.coef[slot].dady.f[i] * (setup.vmin->data[0][1] - 0.5f)));
    }
 
    /*
    _mesa_printf("attr[%d].%c: %f dx:%f dy:%f\n",
 		slot, "xyzw"[i], 
 		setup.coef[slot].a0[i],
-		setup.coef[slot].dadx[i],
-		setup.coef[slot].dady[i]);
+		setup.coef[slot].dadx.f[i],
+		setup.coef[slot].dady.f[i]);
    */
 }
 
@@ -734,11 +744,11 @@ static void tri_persp_coeff( unsigned slot,
    assert(slot < PIPE_MAX_SHADER_INPUTS);
    assert(i <= 3);
 
-   setup.coef[slot].dadx[i] = a * setup.oneoverarea;
-   setup.coef[slot].dady[i] = b * setup.oneoverarea;
-   setup.coef[slot].a0[i] = (mina - 
-			    (setup.coef[slot].dadx[i] * (setup.vmin->data[0][0] - 0.5f) + 
-			     setup.coef[slot].dady[i] * (setup.vmin->data[0][1] - 0.5f)));
+   setup.coef[slot].dadx.f[i] = a * setup.oneoverarea;
+   setup.coef[slot].dady.f[i] = b * setup.oneoverarea;
+   setup.coef[slot].a0.f[i] = (mina - 
+			    (setup.coef[slot].dadx.f[i] * (setup.vmin->data[0][0] - 0.5f) + 
+			     setup.coef[slot].dady.f[i] * (setup.vmin->data[0][1] - 0.5f)));
 }
 #endif
 
-- 
cgit v1.2.3


From 8fb73a59939ac9ec1e41abf89a4a8c8dde09b8df Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 30 Jan 2008 20:40:26 -0700
Subject: Cell: prototype SIMD code for z testing

---
 src/mesa/pipe/cell/spu/spu_tile.h |  10 +++
 src/mesa/pipe/cell/spu/spu_tri.c  | 147 +++++++++++++++++++++++++++++---------
 2 files changed, 123 insertions(+), 34 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_tile.h b/src/mesa/pipe/cell/spu/spu_tile.h
index f83dc009c2..18d1b3c117 100644
--- a/src/mesa/pipe/cell/spu/spu_tile.h
+++ b/src/mesa/pipe/cell/spu/spu_tile.h
@@ -42,6 +42,7 @@
 typedef union {
    ushort t16[TILE_SIZE][TILE_SIZE];
    uint   t32[TILE_SIZE][TILE_SIZE];
+   float4 f4[TILE_SIZE/2][TILE_SIZE/2];
 } tile_t;
 
 
@@ -83,9 +84,18 @@ clear_z_tile(tile_t *ztile)
                TILE_SIZE * TILE_SIZE);
    }
    else {
+      ASSERT(spu.fb.depth_format == PIPE_FORMAT_Z32_UNORM);
+#if SIMD_Z
+      union fi z;
+      z.f = 1.0;
+      memset32((uint*) ztile->t32,
+               z.i,/*spu.fb.depth_clear_value,*/
+               TILE_SIZE * TILE_SIZE);
+#else
       memset32((uint*) ztile->t32,
                spu.fb.depth_clear_value,
                TILE_SIZE * TILE_SIZE);
+#endif
    }
 }
 
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 4fc6d90895..e436e153ec 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -40,6 +40,19 @@
 #include "spu_tri.h"
 
 
+/*
+ * If SIMD_Z=1 the Z buffer is floating point and we use vector instructions
+ * to do Z testing/updating.
+ */
+#define SIMD_Z 0
+
+#if SIMD_Z
+typedef vector unsigned int mask_t;
+#else
+typedef uint mask_t;
+#endif
+
+
 /**
  * Simplified types taken from other parts of Gallium
  */
@@ -231,26 +244,16 @@ eval_coeff(uint slot, float x, float y, float4 result[4])
 }
 
 
-static INLINE void
-eval_z(float x, float y, float result[4])
+static INLINE vector float
+eval_z(float x, float y)
 {
    const uint slot = 0;
    const float dzdx = setup.coef[slot].dadx.f[2];
    const float dzdy = setup.coef[slot].dady.f[2];
    const float topLeft = setup.coef[slot].a0.f[2] + x * dzdx + y * dzdy;
-#if 1
-   result[QUAD_TOP_LEFT] = topLeft;
-   result[QUAD_TOP_RIGHT] = topLeft + dzdx;
-   result[QUAD_BOTTOM_LEFT] = topLeft + dzdy;
-   result[QUAD_BOTTOM_RIGHT] = topLeft + dzdx + dzdy;
-#else
-   /* XXX vectorize */
    const vector float topLeftv = spu_splats(topLeft);
-   const vector float derivs
-      = (vector float) { 0.0, dzdx, dzdy, dzdx + dzdy };
-   vector float *res = (vector float *) result;
-   *res = spu_add(topLeftv, derivs);
-#endif
+   const vector float derivs = (vector float) { 0.0, dzdx, dzdy, dzdx + dzdy };
+   return spu_add(topLeftv, derivs);
 }
 
 
@@ -279,14 +282,22 @@ pack_colors(uint uicolors[4], const float4 fcolors[4])
 }
 
 
-static uint
-do_depth_test(int x, int y, unsigned mask)
+
+static unsigned int
+do_depth_test(int x, int y, unsigned int mask)
 {
+   static const float4 zscale16
+      = {.f={65535.0, 65535.0, 65535.0, 65535.0}};
+   static const float4 zscale32
+      = {.f={(float)0xffffffff,
+             (float)0xffffffff,
+             (float)0xffffffff,
+             (float)0xffffffff}};
    int ix = x - setup.cliprect_minx;
    int iy = y - setup.cliprect_miny;
-   float zvals[4];
+   float4 zvals;
 
-   eval_z((float) x, (float) y, zvals);
+   zvals.v = eval_z((float) x, (float) y);
 
    if (tile_status_z[setup.ty][setup.tx] == TILE_STATUS_CLEAR) {
       /* now, _really_ clear the tile */
@@ -300,9 +311,9 @@ do_depth_test(int x, int y, unsigned mask)
 
 
    if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) {
-      const float zscale = 65535.0;
+      zvals.v = spu_mul(zvals.v, zscale16.v);
       if (mask & MASK_TOP_LEFT) {
-         uint z = (uint) (zvals[0] * zscale);
+         uint z = (uint) zvals.f[0];
          if (z < ztile.t16[iy][ix])
             ztile.t16[iy][ix] = z;
          else
@@ -310,7 +321,7 @@ do_depth_test(int x, int y, unsigned mask)
       }
 
       if (mask & MASK_TOP_RIGHT) {
-         uint z = (uint) (zvals[1] * zscale);
+         uint z = (uint) zvals.f[1];
          if (z < ztile.t16[iy][ix+1])
             ztile.t16[iy][ix+1] = z;
          else
@@ -318,7 +329,7 @@ do_depth_test(int x, int y, unsigned mask)
       }
 
       if (mask & MASK_BOTTOM_LEFT) {
-         uint z = (uint) (zvals[2] * zscale);
+         uint z = (uint) zvals.f[2];
          if (z < ztile.t16[iy+1][ix])
             ztile.t16[iy+1][ix] = z;
          else
@@ -326,7 +337,7 @@ do_depth_test(int x, int y, unsigned mask)
       }
 
       if (mask & MASK_BOTTOM_RIGHT) {
-         uint z = (uint) (zvals[3] * zscale);
+         uint z = (uint) zvals.f[3];
          if (z < ztile.t16[iy+1][ix+1])
             ztile.t16[iy+1][ix+1] = z;
          else
@@ -334,10 +345,10 @@ do_depth_test(int x, int y, unsigned mask)
       }
    }
    else {
-      const float zscale = (float) 0xffffffff;
+      zvals.v = spu_mul(zvals.v, zscale32.v);
       ASSERT(spu.fb.depth_format == PIPE_FORMAT_Z32_UNORM);
       if (mask & MASK_TOP_LEFT) {
-         uint z = (uint) (zvals[0] * zscale);
+         uint z = (uint) zvals.f[0];
          if (z < ztile.t32[iy][ix])
             ztile.t32[iy][ix] = z;
          else
@@ -345,7 +356,7 @@ do_depth_test(int x, int y, unsigned mask)
       }
 
       if (mask & MASK_TOP_RIGHT) {
-         uint z = (uint) (zvals[1] * zscale);
+         uint z = (uint) zvals.f[1];
          if (z < ztile.t32[iy][ix+1])
             ztile.t32[iy][ix+1] = z;
          else
@@ -353,7 +364,7 @@ do_depth_test(int x, int y, unsigned mask)
       }
 
       if (mask & MASK_BOTTOM_LEFT) {
-         uint z = (uint) (zvals[2] * zscale);
+         uint z = (uint) zvals.f[2];
          if (z < ztile.t32[iy+1][ix])
             ztile.t32[iy+1][ix] = z;
          else
@@ -361,7 +372,7 @@ do_depth_test(int x, int y, unsigned mask)
       }
 
       if (mask & MASK_BOTTOM_RIGHT) {
-         uint z = (uint) (zvals[3] * zscale);
+         uint z = (uint) zvals.f[3];
          if (z < ztile.t32[iy+1][ix+1])
             ztile.t32[iy+1][ix+1] = z;
          else
@@ -373,11 +384,45 @@ do_depth_test(int x, int y, unsigned mask)
 }
 
 
+
+
+static vector unsigned int
+do_depth_test_simd(int x, int y, vector unsigned int quadmask)
+{
+   int ix = (x - setup.cliprect_minx) / 2;
+   int iy = (y - setup.cliprect_miny) / 2;
+   float4 zvals;
+
+   vector unsigned int zmask;
+
+   zvals.v = eval_z((float) x, (float) y);
+
+   if (tile_status_z[setup.ty][setup.tx] == TILE_STATUS_CLEAR) {
+      /* now, _really_ clear the tile */
+      clear_z_tile(&ztile);
+   }
+   else if (tile_status_z[setup.ty][setup.tx] != TILE_STATUS_DIRTY) {
+      /* make sure we've got the tile from main mem */
+      wait_on_mask(1 << TAG_READ_TILE_Z);
+   }
+   tile_status_z[setup.ty][setup.tx] = TILE_STATUS_DIRTY;
+
+   /* XXX fetch Z value sooner to hide latency here */
+   zmask = spu_cmpgt(ztile.f4[ix][iy].v, zvals.v);
+   zmask = spu_and(zmask, quadmask);
+
+   ztile.f4[ix][iy].v = spu_sel(ztile.f4[ix][iy].v, zvals.v, zmask);
+   //ztile.f4[ix][iy].v = spu_sel(zvals.v, ztile.f4[ix][iy].v, mask4);
+
+   return zmask;
+}
+
+
 /**
  * Emit a quad (pass to next stage).  No clipping is done.
  */
 static INLINE void
-emit_quad( int x, int y, unsigned mask )
+emit_quad( int x, int y, mask_t mask )
 {
 #if 0
    struct softpipe_context *sp = setup.softpipe;
@@ -406,10 +451,17 @@ emit_quad( int x, int y, unsigned mask )
    }
 
    if (spu.depth_stencil.depth.enabled) {
-      mask &= do_depth_test(x, y, mask);
+#if SIMD_Z
+      mask = do_depth_test_simd(x, y, mask);
+#else
+      mask = do_depth_test(x, y, mask);
+#endif
    }
 
-   if (mask) {
+#if !SIMD_Z
+   if (mask)
+#endif
+   {
       if (tile_status[setup.ty][setup.tx] == TILE_STATUS_CLEAR) {
          /* now, _really_ clear the tile */
          clear_c_tile(&ctile);
@@ -420,6 +472,21 @@ emit_quad( int x, int y, unsigned mask )
       }
       tile_status[setup.ty][setup.tx] = TILE_STATUS_DIRTY;
 
+#if SIMD_Z
+      if (spu_extract(mask, 0))
+         ctile.t32[iy][ix] = colors[QUAD_TOP_LEFT];
+      if (spu_extract(mask, 1))
+         ctile.t32[iy][ix+1] = colors[QUAD_TOP_RIGHT];
+      if (spu_extract(mask, 2))
+         ctile.t32[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
+      if (spu_extract(mask, 3))
+         ctile.t32[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
+#elif 0
+      /* SIMD_Z with swizzled color buffer (someday) */
+      vector float icolors = *((vector float *) &colors);
+      ctile.f4[iy/2][ix/2].v = spu_sel(ctile.f4[iy/2][ix/2].v, icolors, mask);
+
+#else
       if (mask & MASK_TOP_LEFT)
          ctile.t32[iy][ix] = colors[QUAD_TOP_LEFT];
       if (mask & MASK_TOP_RIGHT)
@@ -428,7 +495,9 @@ emit_quad( int x, int y, unsigned mask )
          ctile.t32[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
       if (mask & MASK_BOTTOM_RIGHT)
          ctile.t32[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
+#endif
    }
+
 #endif
 }
 
@@ -450,8 +519,18 @@ static INLINE int block( int x )
  * this is pretty nasty...  may need to rework flush_spans again to
  * fix it, if possible.
  */
-static unsigned calculate_mask( int x )
+static mask_t calculate_mask( int x )
 {
+#if SIMD_Z
+   uint m0, m1, m2, m3;
+
+   m0 = (x >= setup.span.left[0] && x < setup.span.right[0]) * ~0;
+   m1 = (x+1 >= setup.span.left[0] && x+1 < setup.span.right[0]) * ~0;
+   m2 = (x >= setup.span.left[1] && x < setup.span.right[1]) * ~0;
+   m3 = (x+1 >= setup.span.left[1] && x+1 < setup.span.right[1]) * ~0;
+
+   return (vector unsigned int) {m0, m1, m2, m3};
+#else
    unsigned mask = 0x0;
 
    if (x >= setup.span.left[0] && x < setup.span.right[0]) 
@@ -467,6 +546,7 @@ static unsigned calculate_mask( int x )
       mask |= MASK_BOTTOM_RIGHT;
 
    return mask;
+#endif
 }
 
 
@@ -505,8 +585,7 @@ static void flush_spans( void )
     * calculate_mask() could be simplified a bit...
     */
    for (x = block(minleft); x <= block(maxright); x += 2) {
-      emit_quad( x, setup.span.y, 
-                 calculate_mask( x ) );
+      emit_quad( x, setup.span.y, calculate_mask( x ) );
    }
 
    setup.span.y = 0;
-- 
cgit v1.2.3


From 524bba17a75cee597f588da9c19f25d758aa237b Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Tue, 29 Jan 2008 10:37:18 -0800
Subject: Initial pass at vertex shader on SPU using TGSI VM

All of the code is wired in on the SPU side, but it is not called from
the PPU yet.  Instruction / declaration fetch still needs to be
implemented in spu_exec.c.
---
 src/mesa/pipe/cell/common.h                |   38 +
 src/mesa/pipe/cell/spu/Makefile            |    6 +-
 src/mesa/pipe/cell/spu/spu_exec.c          | 2355 ++++++++++++++++++++++++++++
 src/mesa/pipe/cell/spu/spu_exec.h          |  171 ++
 src/mesa/pipe/cell/spu/spu_main.c          |   28 +
 src/mesa/pipe/cell/spu/spu_util.c          |  165 ++
 src/mesa/pipe/cell/spu/spu_vertex_fetch.c  |  493 ++++++
 src/mesa/pipe/cell/spu/spu_vertex_shader.c |  224 +++
 src/mesa/pipe/cell/spu/spu_vertex_shader.h |   61 +
 9 files changed, 3540 insertions(+), 1 deletion(-)
 create mode 100644 src/mesa/pipe/cell/spu/spu_exec.c
 create mode 100644 src/mesa/pipe/cell/spu/spu_exec.h
 create mode 100644 src/mesa/pipe/cell/spu/spu_util.c
 create mode 100644 src/mesa/pipe/cell/spu/spu_vertex_fetch.c
 create mode 100644 src/mesa/pipe/cell/spu/spu_vertex_shader.c
 create mode 100644 src/mesa/pipe/cell/spu/spu_vertex_shader.h

(limited to 'src')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index d5e86863d4..80a1425ec7 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -83,6 +83,9 @@
 #define CELL_CMD_STATE_SAMPLER       12
 #define CELL_CMD_STATE_TEXTURE       13
 #define CELL_CMD_STATE_VERTEX_INFO   14
+#define CELL_CMD_STATE_VIEWPORT      15
+#define CELL_CMD_STATE_VS_ARRAY_INFO 16
+#define CELL_CMD_VS_EXECUTE          17
 
 
 #define CELL_NUM_BUFFERS 4
@@ -116,6 +119,41 @@ struct cell_command_clear_surface
 } ALIGN16_ATTRIB;
 
 
+/**
+ * Array info used by the vertex shader's vertex puller.
+ */
+struct cell_array_info
+{
+    void *base;               /**< Base address of the 0th element. */
+    uint attr;                /**< Attribute that this state is for. */
+    uint pitch;               /**< Byte pitch from one entry to the next. */
+    enum pipe_format format;  /**< Pipe format of each entry. */
+} ALIGN16_ATTRIB;
+
+
+struct cell_shader_info
+{
+   unsigned processor;
+   unsigned num_outputs;
+
+   void *declarations;
+   unsigned num_declarations;
+   void *instructions;
+   unsigned num_instructions;
+   void *uniforms;
+} ALIGN16_ATTRIB;
+
+
+struct cell_command_vs
+{
+   struct cell_shader_info   shader;
+   void *elts;
+   unsigned num_elts;
+   unsigned bytes_per_elt;
+   void *vOut;
+} ALIGN16_ATTRIB;
+
+
 struct cell_command_render
 {
    uint opcode;       /**< CELL_CMD_RENDER */
diff --git a/src/mesa/pipe/cell/spu/Makefile b/src/mesa/pipe/cell/spu/Makefile
index d5b30e1f27..2d031bfbc6 100644
--- a/src/mesa/pipe/cell/spu/Makefile
+++ b/src/mesa/pipe/cell/spu/Makefile
@@ -20,7 +20,11 @@ SOURCES = \
 	spu_render.c \
 	spu_texture.c \
 	spu_tile.c \
-	spu_tri.c
+	spu_tri.c \
+	spu_exec.c \
+	spu_util.c \
+	spu_vertex_fetch.c \
+	spu_vertex_shader.c
 
 SPU_OBJECTS = $(SOURCES:.c=.o) \
 
diff --git a/src/mesa/pipe/cell/spu/spu_exec.c b/src/mesa/pipe/cell/spu/spu_exec.c
new file mode 100644
index 0000000000..6888e97caf
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_exec.c
@@ -0,0 +1,2355 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * TGSI interpreter/executor.
+ *
+ * Flow control information:
+ *
+ * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
+ * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
+ * care since a condition may be true for some quad components but false
+ * for other components.
+ *
+ * We basically execute all statements (even if they're in the part of
+ * an IF/ELSE clause that's "not taken") and use a special mask to
+ * control writing to destination registers.  This is the ExecMask.
+ * See store_dest().
+ *
+ * The ExecMask is computed from four other masks (CondMask, LoopMask,
+ * ContMask and FuncMask) which are controlled by the flow control
+ * instructions (e.g. IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
+ *
+ *
+ * Authors:
+ *   Michal Krol
+ *   Brian Paul
+ */
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "pipe/tgsi/util/tgsi_parse.h"
+#include "pipe/tgsi/util/tgsi_util.h"
+#include "spu_exec.h"
+
+#define TILE_TOP_LEFT     0
+#define TILE_TOP_RIGHT    1
+#define TILE_BOTTOM_LEFT  2
+#define TILE_BOTTOM_RIGHT 3
+
+/*
+ * Shorthand locations of various utility registers (_I = Index, _C = Channel)
+ */
+#define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
+#define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
+#define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
+#define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
+#define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
+#define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
+#define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
+#define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C
+#define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
+#define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
+#define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
+#define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
+#define TEMP_128_I         TGSI_EXEC_TEMP_128_I
+#define TEMP_128_C         TGSI_EXEC_TEMP_128_C
+#define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
+#define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C
+#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
+#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
+#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
+#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
+#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
+#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
+#define TEMP_R0            TGSI_EXEC_TEMP_R0
+
+#define FOR_EACH_CHANNEL(CHAN)\
+   for (CHAN = 0; CHAN < 4; CHAN++)
+
+#define IS_CHANNEL_ENABLED(INST, CHAN)\
+   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
+
+#define IS_CHANNEL_ENABLED2(INST, CHAN)\
+   ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN)))
+
+#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
+   FOR_EACH_CHANNEL( CHAN )\
+      if (IS_CHANNEL_ENABLED( INST, CHAN ))
+
+#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
+   FOR_EACH_CHANNEL( CHAN )\
+      if (IS_CHANNEL_ENABLED2( INST, CHAN ))
+
+
+/** ExecMask is the AND of CondMask, LoopMask, ContMask and FuncMask */
+#define UPDATE_EXEC_MASK(MACH) \
+      MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->FuncMask
+
+
+#define CHAN_X  0
+#define CHAN_Y  1
+#define CHAN_Z  2
+#define CHAN_W  3
+
+
+
+/**
+ * Initialize machine state by expanding tokens to full instructions,
+ * allocating temporary storage, setting up constants, etc.
+ * After this, we can call spu_exec_machine_run() many times.
+ */
+void
+spu_exec_machine_init(struct spu_exec_machine *mach,
+                      uint numSamplers,
+                      struct spu_sampler *samplers,
+                      unsigned processor)
+{
+   uint i;
+
+   mach->Samplers = samplers;
+   mach->Processor = processor;
+   mach->Addrs = &mach->Temps[TGSI_EXEC_NUM_TEMPS];
+
+   /* Setup constants. */
+   for( i = 0; i < 4; i++ ) {
+      mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
+      mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
+      mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
+      mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
+      mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
+      mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
+      mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
+      mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
+   }
+}
+
+
+static void
+micro_abs(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+   dst->f[0] = (float) fabs( (double) src->f[0] );
+   dst->f[1] = (float) fabs( (double) src->f[1] );
+   dst->f[2] = (float) fabs( (double) src->f[2] );
+   dst->f[3] = (float) fabs( (double) src->f[3] );
+}
+
+static void
+micro_add(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->f[0] = src0->f[0] + src1->f[0];
+   dst->f[1] = src0->f[1] + src1->f[1];
+   dst->f[2] = src0->f[2] + src1->f[2];
+   dst->f[3] = src0->f[3] + src1->f[3];
+}
+
+static void
+micro_iadd(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->i[0] = src0->i[0] + src1->i[0];
+   dst->i[1] = src0->i[1] + src1->i[1];
+   dst->i[2] = src0->i[2] + src1->i[2];
+   dst->i[3] = src0->i[3] + src1->i[3];
+}
+
+static void
+micro_and(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->u[0] = src0->u[0] & src1->u[0];
+   dst->u[1] = src0->u[1] & src1->u[1];
+   dst->u[2] = src0->u[2] & src1->u[2];
+   dst->u[3] = src0->u[3] & src1->u[3];
+}
+
+static void
+micro_ceil(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+#if 0
+   dst->f[0] = (float) ceil( (double) src->f[0] );
+   dst->f[1] = (float) ceil( (double) src->f[1] );
+   dst->f[2] = (float) ceil( (double) src->f[2] );
+   dst->f[3] = (float) ceil( (double) src->f[3] );
+#endif
+}
+
+static void
+micro_cos(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+#if 0
+   dst->f[0] = (float) cos( (double) src->f[0] );
+   dst->f[1] = (float) cos( (double) src->f[1] );
+   dst->f[2] = (float) cos( (double) src->f[2] );
+   dst->f[3] = (float) cos( (double) src->f[3] );
+#endif
+}
+
+static void
+micro_ddx(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+   dst->f[0] =
+   dst->f[1] =
+   dst->f[2] =
+   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
+}
+
+static void
+micro_ddy(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+   dst->f[0] =
+   dst->f[1] =
+   dst->f[2] =
+   dst->f[3] = src->f[TILE_TOP_LEFT] - src->f[TILE_BOTTOM_LEFT];
+}
+
+static void
+micro_div(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->f[0] = src0->f[0] / src1->f[0];
+   dst->f[1] = src0->f[1] / src1->f[1];
+   dst->f[2] = src0->f[2] / src1->f[2];
+   dst->f[3] = src0->f[3] / src1->f[3];
+}
+
+static void
+micro_udiv(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   dst->u[0] = src0->u[0] / src1->u[0];
+   dst->u[1] = src0->u[1] / src1->u[1];
+   dst->u[2] = src0->u[2] / src1->u[2];
+   dst->u[3] = src0->u[3] / src1->u[3];
+}
+
+static void
+micro_eq(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1,
+   const union spu_exec_channel *src2,
+   const union spu_exec_channel *src3 )
+{
+   dst->f[0] = src0->f[0] == src1->f[0] ? src2->f[0] : src3->f[0];
+   dst->f[1] = src0->f[1] == src1->f[1] ? src2->f[1] : src3->f[1];
+   dst->f[2] = src0->f[2] == src1->f[2] ? src2->f[2] : src3->f[2];
+   dst->f[3] = src0->f[3] == src1->f[3] ? src2->f[3] : src3->f[3];
+}
+
+static void
+micro_ieq(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1,
+   const union spu_exec_channel *src2,
+   const union spu_exec_channel *src3 )
+{
+   dst->i[0] = src0->i[0] == src1->i[0] ? src2->i[0] : src3->i[0];
+   dst->i[1] = src0->i[1] == src1->i[1] ? src2->i[1] : src3->i[1];
+   dst->i[2] = src0->i[2] == src1->i[2] ? src2->i[2] : src3->i[2];
+   dst->i[3] = src0->i[3] == src1->i[3] ? src2->i[3] : src3->i[3];
+}
+
+static void
+micro_exp2(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src)
+{
+#if 0
+   dst->f[0] = (float) pow( 2.0, (double) src->f[0] );
+   dst->f[1] = (float) pow( 2.0, (double) src->f[1] );
+   dst->f[2] = (float) pow( 2.0, (double) src->f[2] );
+   dst->f[3] = (float) pow( 2.0, (double) src->f[3] );
+#endif
+}
+
+static void
+micro_f2it(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+   dst->i[0] = (int) src->f[0];
+   dst->i[1] = (int) src->f[1];
+   dst->i[2] = (int) src->f[2];
+   dst->i[3] = (int) src->f[3];
+}
+
+static void
+micro_f2ut(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+   dst->u[0] = (uint) src->f[0];
+   dst->u[1] = (uint) src->f[1];
+   dst->u[2] = (uint) src->f[2];
+   dst->u[3] = (uint) src->f[3];
+}
+
+static void
+micro_flr(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+#if 0
+   dst->f[0] = (float) floor( (double) src->f[0] );
+   dst->f[1] = (float) floor( (double) src->f[1] );
+   dst->f[2] = (float) floor( (double) src->f[2] );
+   dst->f[3] = (float) floor( (double) src->f[3] );
+#endif
+}
+
+static void
+micro_frc(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+#if 0
+   dst->f[0] = src->f[0] - (float) floor( (double) src->f[0] );
+   dst->f[1] = src->f[1] - (float) floor( (double) src->f[1] );
+   dst->f[2] = src->f[2] - (float) floor( (double) src->f[2] );
+   dst->f[3] = src->f[3] - (float) floor( (double) src->f[3] );
+#endif
+}
+
+static void
+micro_ge(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1,
+   const union spu_exec_channel *src2,
+   const union spu_exec_channel *src3 )
+{
+   dst->f[0] = src0->f[0] >= src1->f[0] ? src2->f[0] : src3->f[0];
+   dst->f[1] = src0->f[1] >= src1->f[1] ? src2->f[1] : src3->f[1];
+   dst->f[2] = src0->f[2] >= src1->f[2] ? src2->f[2] : src3->f[2];
+   dst->f[3] = src0->f[3] >= src1->f[3] ? src2->f[3] : src3->f[3];
+}
+
+static void
+micro_i2f(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+   dst->f[0] = (float) src->i[0];
+   dst->f[1] = (float) src->i[1];
+   dst->f[2] = (float) src->i[2];
+   dst->f[3] = (float) src->i[3];
+}
+
+/**
+ * Intended to store the base-2 logarithm of src in dst for each of the four
+ * float lanes (natural log scaled by 1.442695, i.e. 1/ln(2)).
+ *
+ * NOTE: the whole body is compiled out with "#if 0", so as written this
+ * function does nothing and leaves dst untouched.
+ */
+static void
+micro_lg2(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+#if 0
+   dst->f[0] = (float) log( (double) src->f[0] ) * 1.442695f;
+   dst->f[1] = (float) log( (double) src->f[1] ) * 1.442695f;
+   dst->f[2] = (float) log( (double) src->f[2] ) * 1.442695f;
+   dst->f[3] = (float) log( (double) src->f[3] ) * 1.442695f;
+#endif
+}
+
+/**
+ * Per-lane float select: where src0 < src1 the lane is taken from src2,
+ * otherwise from src3.
+ */
+static void
+micro_lt( union spu_exec_channel *dst,
+          const union spu_exec_channel *src0,
+          const union spu_exec_channel *src1,
+          const union spu_exec_channel *src2,
+          const union spu_exec_channel *src3 )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      if (src0->f[lane] < src1->f[lane])
+         dst->f[lane] = src2->f[lane];
+      else
+         dst->f[lane] = src3->f[lane];
+   }
+}
+
+/**
+ * Per-lane signed-integer select: where src0 < src1 the lane is taken from
+ * src2, otherwise from src3.
+ */
+static void
+micro_ilt( union spu_exec_channel *dst,
+           const union spu_exec_channel *src0,
+           const union spu_exec_channel *src1,
+           const union spu_exec_channel *src2,
+           const union spu_exec_channel *src3 )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      if (src0->i[lane] < src1->i[lane])
+         dst->i[lane] = src2->i[lane];
+      else
+         dst->i[lane] = src3->i[lane];
+   }
+}
+
+/**
+ * Per-lane unsigned-integer select: where src0 < src1 the lane is taken
+ * from src2, otherwise from src3.
+ */
+static void
+micro_ult( union spu_exec_channel *dst,
+           const union spu_exec_channel *src0,
+           const union spu_exec_channel *src1,
+           const union spu_exec_channel *src2,
+           const union spu_exec_channel *src3 )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      if (src0->u[lane] < src1->u[lane])
+         dst->u[lane] = src2->u[lane];
+      else
+         dst->u[lane] = src3->u[lane];
+   }
+}
+
+/**
+ * Per-lane float maximum.  src0 is chosen only when it compares strictly
+ * greater than src1; in every other case the lane comes from src1.
+ */
+static void
+micro_max( union spu_exec_channel *dst,
+           const union spu_exec_channel *src0,
+           const union spu_exec_channel *src1 )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      if (src0->f[lane] > src1->f[lane])
+         dst->f[lane] = src0->f[lane];
+      else
+         dst->f[lane] = src1->f[lane];
+   }
+}
+
+/**
+ * Per-lane signed-integer maximum of src0 and src1.
+ */
+static void
+micro_imax( union spu_exec_channel *dst,
+            const union spu_exec_channel *src0,
+            const union spu_exec_channel *src1 )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      if (src0->i[lane] > src1->i[lane])
+         dst->i[lane] = src0->i[lane];
+      else
+         dst->i[lane] = src1->i[lane];
+   }
+}
+
+/**
+ * Per-lane unsigned-integer maximum of src0 and src1.
+ */
+static void
+micro_umax( union spu_exec_channel *dst,
+            const union spu_exec_channel *src0,
+            const union spu_exec_channel *src1 )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      if (src0->u[lane] > src1->u[lane])
+         dst->u[lane] = src0->u[lane];
+      else
+         dst->u[lane] = src1->u[lane];
+   }
+}
+
+/**
+ * Per-lane float minimum.  src0 is chosen only when it compares strictly
+ * less than src1; in every other case the lane comes from src1.
+ */
+static void
+micro_min( union spu_exec_channel *dst,
+           const union spu_exec_channel *src0,
+           const union spu_exec_channel *src1 )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      if (src0->f[lane] < src1->f[lane])
+         dst->f[lane] = src0->f[lane];
+      else
+         dst->f[lane] = src1->f[lane];
+   }
+}
+
+/**
+ * Per-lane signed-integer minimum of src0 and src1.
+ */
+static void
+micro_imin( union spu_exec_channel *dst,
+            const union spu_exec_channel *src0,
+            const union spu_exec_channel *src1 )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      if (src0->i[lane] < src1->i[lane])
+         dst->i[lane] = src0->i[lane];
+      else
+         dst->i[lane] = src1->i[lane];
+   }
+}
+
+/**
+ * Per-lane unsigned-integer minimum of src0 and src1.
+ */
+static void
+micro_umin( union spu_exec_channel *dst,
+            const union spu_exec_channel *src0,
+            const union spu_exec_channel *src1 )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      if (src0->u[lane] < src1->u[lane])
+         dst->u[lane] = src0->u[lane];
+      else
+         dst->u[lane] = src1->u[lane];
+   }
+}
+
+/**
+ * Per-lane unsigned remainder: dst = src0 % src1.
+ *
+ * Evaluating '%' with a zero divisor is undefined behaviour in C (and traps
+ * on some hosts), so a lane whose divisor is zero yields all ones instead.
+ */
+static void
+micro_umod(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      const uint divisor = src1->u[lane];
+
+      dst->u[lane] = divisor ? src0->u[lane] % divisor : ~0u;
+   }
+}
+
+/**
+ * Per-lane float product: dst = src0 * src1.
+ */
+static void
+micro_mul( union spu_exec_channel *dst,
+           const union spu_exec_channel *src0,
+           const union spu_exec_channel *src1 )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->f[lane] = src0->f[lane] * src1->f[lane];
+   }
+}
+
+/**
+ * Per-lane signed-integer product: dst = src0 * src1.
+ */
+static void
+micro_imul( union spu_exec_channel *dst,
+            const union spu_exec_channel *src0,
+            const union spu_exec_channel *src1 )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->i[lane] = src0->i[lane] * src1->i[lane];
+   }
+}
+
+/**
+ * Per-lane signed-integer multiply with two destinations.
+ *
+ * dst1 receives the ordinary int product of src0 and src1; dst0 is always
+ * cleared to zero.
+ *
+ * NOTE(review): the name suggests a 64-bit result whose upper half would
+ * belong in dst0, but no upper half is computed here -- confirm intent.
+ */
+static void
+micro_imul64( union spu_exec_channel *dst0,
+              union spu_exec_channel *dst1,
+              const union spu_exec_channel *src0,
+              const union spu_exec_channel *src1 )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst1->i[lane] = src0->i[lane] * src1->i[lane];
+   }
+
+   for (lane = 0; lane < 4; lane++) {
+      dst0->i[lane] = 0;
+   }
+}
+
+/**
+ * Per-lane unsigned-integer multiply with two destinations.
+ *
+ * dst1 receives the ordinary unsigned product of src0 and src1; dst0 is
+ * always cleared to zero.
+ *
+ * NOTE(review): the name suggests a 64-bit result whose upper half would
+ * belong in dst0, but no upper half is computed here -- confirm intent.
+ */
+static void
+micro_umul64( union spu_exec_channel *dst0,
+              union spu_exec_channel *dst1,
+              const union spu_exec_channel *src0,
+              const union spu_exec_channel *src1 )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst1->u[lane] = src0->u[lane] * src1->u[lane];
+   }
+
+   for (lane = 0; lane < 4; lane++) {
+      dst0->u[lane] = 0;
+   }
+}
+
+/**
+ * Per-lane conditional move: a lane is copied from src1 when the matching
+ * lane of src0 is non-zero (tested as an unsigned integer), else from src2.
+ */
+static void
+micro_movc( union spu_exec_channel *dst,
+            const union spu_exec_channel *src0,
+            const union spu_exec_channel *src1,
+            const union spu_exec_channel *src2 )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      if (src0->u[lane])
+         dst->u[lane] = src1->u[lane];
+      else
+         dst->u[lane] = src2->u[lane];
+   }
+}
+
+/**
+ * Per-lane float negation: dst = -src.
+ */
+static void
+micro_neg( union spu_exec_channel *dst,
+           const union spu_exec_channel *src )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->f[lane] = -src->f[lane];
+   }
+}
+
+/**
+ * Per-lane signed-integer negation: dst = -src.
+ */
+static void
+micro_ineg( union spu_exec_channel *dst,
+            const union spu_exec_channel *src )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->i[lane] = -src->i[lane];
+   }
+}
+
+/**
+ * Per-lane bitwise complement of the unsigned integer lanes: dst = ~src.
+ */
+static void
+micro_not( union spu_exec_channel *dst,
+           const union spu_exec_channel *src )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->u[lane] = ~src->u[lane];
+   }
+}
+
+/**
+ * Per-lane bitwise OR of the unsigned integer lanes: dst = src0 | src1.
+ */
+static void
+micro_or( union spu_exec_channel *dst,
+          const union spu_exec_channel *src0,
+          const union spu_exec_channel *src1 )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->u[lane] = src0->u[lane] | src1->u[lane];
+   }
+}
+
+/**
+ * Intended to store pow(src0, src1) in dst for each of the four float
+ * lanes.
+ *
+ * NOTE: the whole body is compiled out with "#if 0", so as written this
+ * function does nothing and leaves dst untouched.
+ */
+static void
+micro_pow(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+#if 0
+   dst->f[0] = (float) pow( (double) src0->f[0], (double) src1->f[0] );
+   dst->f[1] = (float) pow( (double) src0->f[1], (double) src1->f[1] );
+   dst->f[2] = (float) pow( (double) src0->f[2], (double) src1->f[2] );
+   dst->f[3] = (float) pow( (double) src0->f[3], (double) src1->f[3] );
+#endif
+}
+
+/**
+ * Intended to round each float lane of src to an integral value, computed
+ * as floor(src + 0.5), and store it in dst.
+ *
+ * NOTE: the whole body is compiled out with "#if 0", so as written this
+ * function does nothing and leaves dst untouched.
+ */
+static void
+micro_rnd(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+#if 0
+   dst->f[0] = (float) floor( (double) (src->f[0] + 0.5f) );
+   dst->f[1] = (float) floor( (double) (src->f[1] + 0.5f) );
+   dst->f[2] = (float) floor( (double) (src->f[2] + 0.5f) );
+   dst->f[3] = (float) floor( (double) (src->f[3] + 0.5f) );
+#endif
+}
+
+/**
+ * Per-lane left shift: dst = src0 << src1.
+ *
+ * The shift is done on the unsigned view of the lane and the count is
+ * reduced to its low five bits.  Left-shifting a negative signed value, or
+ * shifting by a count outside 0..31, is undefined behaviour in C; for
+ * in-range counts the resulting bit pattern is the same as before.
+ * (Assumes 32-bit lanes.)
+ */
+static void
+micro_shl(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      const uint count = src1->u[lane] & 0x1f;
+
+      dst->u[lane] = src0->u[lane] << count;
+   }
+}
+
+/**
+ * Per-lane signed right shift: dst = src0 >> src1.
+ *
+ * The count is reduced to its low five bits because shifting by a negative
+ * count or by 32 or more is undefined behaviour in C; in-range counts
+ * behave exactly as before.  (Assumes 32-bit lanes.)
+ */
+static void
+micro_ishr(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      const uint count = src1->u[lane] & 0x1f;
+
+      dst->i[lane] = src0->i[lane] >> count;
+   }
+}
+
+/**
+ * Per-lane truncation toward zero, done by casting each float lane to int
+ * and back to float.
+ */
+static void
+micro_trunc( union spu_exec_channel *dst,
+             const union spu_exec_channel *src0 )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->f[lane] = (float) (int) src0->f[lane];
+   }
+}
+
+/**
+ * Per-lane unsigned (logical) right shift: dst = src0 >> src1.
+ *
+ * The count is reduced to its low five bits because shifting by 32 or more
+ * is undefined behaviour in C; in-range counts behave exactly as before.
+ * (Assumes 32-bit lanes.)
+ */
+static void
+micro_ushr(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src0,
+   const union spu_exec_channel *src1 )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      const uint count = src1->u[lane] & 0x1f;
+
+      dst->u[lane] = src0->u[lane] >> count;
+   }
+}
+
+/**
+ * Intended to store sin(src) in dst for each of the four float lanes.
+ *
+ * NOTE: the whole body is compiled out with "#if 0", so as written this
+ * function does nothing and leaves dst untouched.
+ */
+static void
+micro_sin(
+   union spu_exec_channel *dst,
+   const union spu_exec_channel *src )
+{
+#if 0
+   dst->f[0] = (float) sin( (double) src->f[0] );
+   dst->f[1] = (float) sin( (double) src->f[1] );
+   dst->f[2] = (float) sin( (double) src->f[2] );
+   dst->f[3] = (float) sin( (double) src->f[3] );
+#endif
+}
+
+/**
+ * Intended to store sqrt(src) in dst for each of the four float lanes.
+ *
+ * NOTE: the whole body is compiled out with "#if 0", so as written this
+ * function does nothing and leaves dst untouched.
+ */
+static void
+micro_sqrt( union spu_exec_channel *dst,
+            const union spu_exec_channel *src )
+{
+#if 0
+   dst->f[0] = (float) sqrt( (double) src->f[0] );
+   dst->f[1] = (float) sqrt( (double) src->f[1] );
+   dst->f[2] = (float) sqrt( (double) src->f[2] );
+   dst->f[3] = (float) sqrt( (double) src->f[3] );
+#endif
+}
+
+/**
+ * Per-lane float difference: dst = src0 - src1.
+ */
+static void
+micro_sub( union spu_exec_channel *dst,
+           const union spu_exec_channel *src0,
+           const union spu_exec_channel *src1 )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->f[lane] = src0->f[lane] - src1->f[lane];
+   }
+}
+
+/**
+ * Convert every unsigned integer lane of src to a float lane of dst.
+ */
+static void
+micro_u2f( union spu_exec_channel *dst,
+           const union spu_exec_channel *src )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->f[lane] = (float) src->u[lane];
+   }
+}
+
+/**
+ * Per-lane bitwise XOR of the unsigned integer lanes: dst = src0 ^ src1.
+ */
+static void
+micro_xor( union spu_exec_channel *dst,
+           const union spu_exec_channel *src0,
+           const union spu_exec_channel *src1 )
+{
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->u[lane] = src0->u[lane] ^ src1->u[lane];
+   }
+}
+
+/**
+ * Read one swizzled component of a register into chan, one value per lane.
+ *
+ * \param mach     machine whose register files are read
+ * \param file     TGSI_FILE_x register file to read from
+ * \param swizzle  TGSI_EXTSWIZZLE_x selector; X/Y/Z/W pick a component of
+ *                 the register, ZERO and ONE copy the machine's constant
+ *                 0 / 1 temporaries and ignore file and index
+ * \param index    per-lane register index within the file
+ * \param chan     receives the four fetched values
+ *
+ * An unknown file or swizzle only trips an assert; chan is then left
+ * unmodified.
+ */
+static void
+fetch_src_file_channel(
+   const struct spu_exec_machine *mach,
+   const uint file,
+   const uint swizzle,
+   const union spu_exec_channel *index,
+   union spu_exec_channel *chan )
+{
+   uint lane;
+
+   /* The two constant selectors do not depend on the register file. */
+   if (swizzle == TGSI_EXTSWIZZLE_ZERO) {
+      *chan = mach->Temps[TEMP_0_I].xyzw[TEMP_0_C];
+      return;
+   }
+   if (swizzle == TGSI_EXTSWIZZLE_ONE) {
+      *chan = mach->Temps[TEMP_1_I].xyzw[TEMP_1_C];
+      return;
+   }
+
+   if (swizzle != TGSI_EXTSWIZZLE_X &&
+       swizzle != TGSI_EXTSWIZZLE_Y &&
+       swizzle != TGSI_EXTSWIZZLE_Z &&
+       swizzle != TGSI_EXTSWIZZLE_W) {
+      assert( 0 );
+      return;
+   }
+
+   switch (file) {
+   case TGSI_FILE_CONSTANT:
+      for (lane = 0; lane < 4; lane++) {
+         chan->f[lane] = mach->Consts[index->i[lane]][swizzle];
+      }
+      break;
+
+   case TGSI_FILE_INPUT:
+      for (lane = 0; lane < 4; lane++) {
+         chan->u[lane] = mach->Inputs[index->i[lane]].xyzw[swizzle].u[lane];
+      }
+      break;
+
+   case TGSI_FILE_TEMPORARY:
+      for (lane = 0; lane < 4; lane++) {
+         chan->u[lane] = mach->Temps[index->i[lane]].xyzw[swizzle].u[lane];
+      }
+      break;
+
+   case TGSI_FILE_IMMEDIATE:
+      /* Check all four indices before any lane is written. */
+      for (lane = 0; lane < 4; lane++) {
+         assert( index->i[lane] < (int) mach->ImmLimit );
+      }
+      for (lane = 0; lane < 4; lane++) {
+         chan->f[lane] = mach->Imms[index->i[lane]][swizzle];
+      }
+      break;
+
+   case TGSI_FILE_ADDRESS:
+      for (lane = 0; lane < 4; lane++) {
+         chan->u[lane] = mach->Addrs[index->i[lane]].xyzw[swizzle].u[lane];
+      }
+      break;
+
+   case TGSI_FILE_OUTPUT:
+      /* vertex/fragment output vars can be read too */
+      for (lane = 0; lane < 4; lane++) {
+         chan->u[lane] = mach->Outputs[index->i[lane]].xyzw[swizzle].u[lane];
+      }
+      break;
+
+   default:
+      assert( 0 );
+   }
+}
+
+/**
+ * Fetch one channel of a source operand, applying all operand modifiers.
+ *
+ * The register index starts from reg->SrcRegister.Index and is adjusted,
+ * per lane, by the optional indirect register and the optional second
+ * dimension (itself optionally indirect).  The selected component is then
+ * read through fetch_src_file_channel(), the sign mode (abs / -abs / negate
+ * / keep) is applied, and finally the optional complement (1 - x).
+ *
+ * \param mach        machine state to read from
+ * \param chan        receives the four lane values
+ * \param reg         full source register description
+ * \param chan_index  destination-relative channel being fetched
+ */
+static void
+fetch_source(
+   const struct spu_exec_machine *mach,
+   union spu_exec_channel *chan,
+   const struct tgsi_full_src_register *reg,
+   const uint chan_index )
+{
+   union spu_exec_channel index;
+   uint swizzle;
+   uint lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      index.i[lane] = reg->SrcRegister.Index;
+   }
+
+   if (reg->SrcRegister.Indirect) {
+      union spu_exec_channel ind_reg;
+      union spu_exec_channel ind_value;
+
+      for (lane = 0; lane < 4; lane++) {
+         ind_reg.i[lane] = reg->SrcRegisterInd.Index;
+      }
+
+      swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterInd,
+                                                    CHAN_X );
+      fetch_src_file_channel( mach, reg->SrcRegisterInd.File, swizzle,
+                              &ind_reg, &ind_value );
+
+      for (lane = 0; lane < 4; lane++) {
+         index.i[lane] += ind_value.i[lane];
+      }
+   }
+
+   if (reg->SrcRegister.Dimension) {
+      /* Scale factor applied to the index before the second-dimension
+       * index is added; an unexpected file only asserts and leaves the
+       * index unscaled. */
+      int stride = 1;
+
+      switch (reg->SrcRegister.File) {
+      case TGSI_FILE_INPUT:
+         stride = 17;
+         break;
+      case TGSI_FILE_CONSTANT:
+         stride = 4096;
+         break;
+      default:
+         assert( 0 );
+      }
+
+      for (lane = 0; lane < 4; lane++) {
+         index.i[lane] *= stride;
+         index.i[lane] += reg->SrcRegisterDim.Index;
+      }
+
+      if (reg->SrcRegisterDim.Indirect) {
+         union spu_exec_channel ind_reg;
+         union spu_exec_channel ind_value;
+
+         for (lane = 0; lane < 4; lane++) {
+            ind_reg.i[lane] = reg->SrcRegisterDimInd.Index;
+         }
+
+         swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterDimInd,
+                                                       CHAN_X );
+         fetch_src_file_channel( mach, reg->SrcRegisterDimInd.File, swizzle,
+                                 &ind_reg, &ind_value );
+
+         for (lane = 0; lane < 4; lane++) {
+            index.i[lane] += ind_value.i[lane];
+         }
+      }
+   }
+
+   swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
+   fetch_src_file_channel( mach, reg->SrcRegister.File, swizzle,
+                           &index, chan );
+
+   switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
+   case TGSI_UTIL_SIGN_CLEAR:
+      micro_abs( chan, chan );
+      break;
+
+   case TGSI_UTIL_SIGN_SET:
+      micro_abs( chan, chan );
+      micro_neg( chan, chan );
+      break;
+
+   case TGSI_UTIL_SIGN_TOGGLE:
+      micro_neg( chan, chan );
+      break;
+
+   case TGSI_UTIL_SIGN_KEEP:
+      break;
+   }
+
+   if (reg->SrcRegisterExtMod.Complement) {
+      micro_sub( chan, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], chan );
+   }
+}
+
+/**
+ * Write one channel of an instruction result to its destination register.
+ *
+ * Only lanes whose bit is set in mach->ExecMask are written, for every
+ * supported saturate mode.  With TGSI_SAT_ZERO_ONE the value is first
+ * clamped between the machine's constant 0 and 1 temporaries (via
+ * micro_max()/micro_min()); the clamp is done in a local so that disabled
+ * lanes of the destination are never touched.
+ *
+ * \param mach        machine state; owns the destination register files
+ * \param chan        the four lane values to store
+ * \param reg         destination register description
+ * \param inst        instruction being executed (supplies the saturate mode)
+ * \param chan_index  which component (x/y/z/w) of the destination to write
+ *
+ * TGSI_FILE_NULL destinations are ignored.  Unsupported files or saturate
+ * modes trip an assert and store nothing.
+ */
+static void
+store_dest(
+   struct spu_exec_machine *mach,
+   const union spu_exec_channel *chan,
+   const struct tgsi_full_dst_register *reg,
+   const struct tgsi_full_instruction *inst,
+   uint chan_index )
+{
+   union spu_exec_channel *dst;
+   union spu_exec_channel clamped;
+   const union spu_exec_channel *value = chan;
+   uint lane;
+
+   switch( reg->DstRegister.File ) {
+   case TGSI_FILE_NULL:
+      return;
+
+   case TGSI_FILE_OUTPUT:
+      dst = &mach->Outputs[mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
+                           + reg->DstRegister.Index].xyzw[chan_index];
+      break;
+
+   case TGSI_FILE_TEMPORARY:
+      dst = &mach->Temps[reg->DstRegister.Index].xyzw[chan_index];
+      break;
+
+   case TGSI_FILE_ADDRESS:
+      dst = &mach->Addrs[reg->DstRegister.Index].xyzw[chan_index];
+      break;
+
+   default:
+      assert( 0 );
+      return;
+   }
+
+   switch (inst->Instruction.Saturate)
+   {
+   case TGSI_SAT_NONE:
+      break;
+
+   case TGSI_SAT_ZERO_ONE:
+      /* Clamp into a scratch channel; the masked copy below then decides
+       * which lanes actually reach the destination. */
+      micro_max(&clamped, chan, &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
+      micro_min(&clamped, &clamped, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
+      value = &clamped;
+      break;
+
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      assert( 0 );
+      return;
+
+   default:
+      assert( 0 );
+      return;
+   }
+
+   /* Bit-exact copy (through the integer view) of the enabled lanes only. */
+   for (lane = 0; lane < 4; lane++) {
+      if (mach->ExecMask & (1 << lane))
+         dst->i[lane] = value->i[lane];
+   }
+}
+
+/*
+ * Shorthands for fetch_source() and store_dest().  Both expand to code that
+ * refers to variables named "mach" and "inst", so they can only be used
+ * where those two names are in scope.
+ */
+#define FETCH(VAL,INDEX,CHAN)\
+    fetch_source (mach, VAL, &inst->FullSrcRegisters[INDEX], CHAN)
+
+#define STORE(VAL,INDEX,CHAN)\
+    store_dest (mach, VAL, &inst->FullDstRegisters[INDEX], inst, CHAN )
+
+
+/**
+ * Execute the ARB-style KIL instruction, which is predicated by source
+ * register 0.
+ *
+ * A pixel is marked as killed when any fetched component of that register
+ * is negative for that pixel.  The resulting bits (bit 0 = pixel 0, bit 1 =
+ * pixel 1, ...) are OR-ed into the machine's kill-mask temporary.
+ */
+static void
+exec_kilp(struct spu_exec_machine *mach,
+          const struct tgsi_full_instruction *inst)
+{
+   union spu_exec_channel value;
+   uint kilmask = 0;
+   uint chan_index;
+   /* One bit per extended swizzle that needs no (further) testing.  The
+    * constant swizzles start out set: 0.0 and 1.0 are never below zero. */
+   uint tested = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
+
+   for (chan_index = 0; chan_index < 4; chan_index++) {
+      /* Which source component does this channel really read? */
+      const uint swizzle = tgsi_util_get_full_src_register_extswizzle (
+                              &inst->FullSrcRegisters[0],
+                              chan_index);
+      uint pixel;
+
+      /* Skip components that were already examined. */
+      if (tested & (1 << swizzle)) {
+         continue;
+      }
+      tested |= 1 << swizzle;
+
+      FETCH(&value, 0, chan_index);
+      for (pixel = 0; pixel < 4; pixel++) {
+         if (value.f[pixel] < 0.0f) {
+            kilmask |= 1 << pixel;
+         }
+      }
+   }
+
+   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
+}
+
+
+/**
+ * Fetch a texel for each of the four lanes using STR texture coordinates.
+ *
+ * \param sampler  sampler whose get_samples() callback does the lookup
+ * \param s        S coordinates
+ * \param t        T coordinates, or NULL when the texture target has none
+ * \param p        P coordinates, or NULL when the texture target has none
+ * \param lodbias  LOD bias shared by all four lanes
+ * \param r,g,b,a  receive the sampled colour, one channel each
+ *
+ * The outputs are only written after get_samples() has returned, so they
+ * may alias the coordinate channels.
+ */
+static void
+fetch_texel( struct spu_sampler *sampler,
+             const union spu_exec_channel *s,
+             const union spu_exec_channel *t,
+             const union spu_exec_channel *p,
+             float lodbias,  /* XXX should be float[4] */
+             union spu_exec_channel *r,
+             union spu_exec_channel *g,
+             union spu_exec_channel *b,
+             union spu_exec_channel *a )
+{
+   uint j;
+   float rgba[NUM_CHANNELS][QUAD_SIZE];
+
+   /* The 1D path passes NULL for t and p; forward NULL explicitly instead
+    * of forming a member access through a null pointer. */
+   sampler->get_samples(sampler,
+                        s->f,
+                        t ? t->f : NULL,
+                        p ? p->f : NULL,
+                        lodbias, rgba);
+
+   for (j = 0; j < 4; j++) {
+      r->f[j] = rgba[0][j];
+      g->f[j] = rgba[1][j];
+      b->f[j] = rgba[2][j];
+      a->f[j] = rgba[3][j];
+   }
+}
+
+
+/**
+ * Execute a texture sampling instruction.
+ *
+ * Source register 0 supplies the texture coordinates and source register 1
+ * the sampler unit.  When the coordinate register's ExtDivide is W the
+ * coordinates are divided by W first (projective lookup).  When biasLod is
+ * set, lane 0 of the W component is used as the LOD bias for all lanes.
+ * The sampled RGBA value is stored to every enabled destination channel.
+ *
+ * An unsupported texture target trips an assert and stores nothing.
+ */
+static void
+exec_tex(struct spu_exec_machine *mach,
+         const struct tgsi_full_instruction *inst,
+         boolean biasLod)
+{
+   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
+   union spu_exec_channel r[8];
+   uint chan_index;
+   float lodBias = 0.0f;
+
+   switch (inst->InstructionExtTexture.Texture) {
+   case TGSI_TEXTURE_1D:
+
+      FETCH(&r[0], 0, CHAN_X);
+
+      switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
+      case TGSI_EXTSWIZZLE_W:
+         FETCH(&r[1], 0, CHAN_W);
+         micro_div( &r[0], &r[0], &r[1] );
+         break;
+
+      case TGSI_EXTSWIZZLE_ONE:
+         break;
+
+      default:
+         assert (0);
+      }
+
+      if (biasLod) {
+         /* Read the bias from the register that was just fetched. */
+         FETCH(&r[1], 0, CHAN_W);
+         lodBias = r[1].f[0];
+      }
+
+      fetch_texel(&mach->Samplers[unit],
+                  &r[0], NULL, NULL, lodBias,  /* S, T, P, BIAS */
+                  &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
+      break;
+
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
+   case TGSI_TEXTURE_3D:
+   case TGSI_TEXTURE_CUBE:
+
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 0, CHAN_Y);
+      FETCH(&r[2], 0, CHAN_Z);
+
+      switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
+      case TGSI_EXTSWIZZLE_W:
+         FETCH(&r[3], 0, CHAN_W);
+         micro_div( &r[0], &r[0], &r[3] );
+         micro_div( &r[1], &r[1], &r[3] );
+         micro_div( &r[2], &r[2], &r[3] );
+         break;
+
+      case TGSI_EXTSWIZZLE_ONE:
+         break;
+
+      default:
+         assert (0);
+      }
+
+      if (biasLod) {
+         FETCH(&r[3], 0, CHAN_W);
+         lodBias = r[3].f[0];
+      }
+
+      fetch_texel(&mach->Samplers[unit],
+                  &r[0], &r[1], &r[2], lodBias,  /* inputs */
+                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
+      break;
+
+   default:
+      /* Nothing was sampled, so there is nothing valid to store. */
+      assert (0);
+      return;
+   }
+
+   FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+      STORE( &r[chan_index], 0, chan_index );
+   }
+}
+
+
+
+/**
+ * Flat "interpolation": every lane of input attribute `attrib`, component
+ * `chan`, receives the attribute's a0 coefficient unchanged.
+ */
+static void
+constant_interpolation( struct spu_exec_machine *mach,
+                        unsigned attrib,
+                        unsigned chan )
+{
+   const float value = mach->InterpCoefs[attrib].a0[chan];
+   unsigned lane;
+
+   for (lane = 0; lane < QUAD_SIZE; lane++) {
+      mach->Inputs[attrib].xyzw[chan].f[lane] = value;
+   }
+}
+
+/**
+ * Linear interpolation of input attribute `attrib`, component `chan`.
+ *
+ * The plane equation a0 + dadx*x + dady*y is evaluated at the position held
+ * in lane 0 of QuadPos; the remaining three lanes are obtained by adding
+ * dadx, dady, and both, respectively.
+ */
+static void
+linear_interpolation( struct spu_exec_machine *mach,
+                      unsigned attrib,
+                      unsigned chan )
+{
+   float *out = mach->Inputs[attrib].xyzw[chan].f;
+   const float x = mach->QuadPos.xyzw[0].f[0];
+   const float y = mach->QuadPos.xyzw[1].f[0];
+   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
+   const float dady = mach->InterpCoefs[attrib].dady[chan];
+   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
+
+   out[0] = a0;
+   out[1] = a0 + dadx;
+   out[2] = a0 + dady;
+   out[3] = a0 + dadx + dady;
+}
+
+/**
+ * Perspective-corrected interpolation of input attribute `attrib`,
+ * component `chan`.
+ *
+ * Same plane-equation evaluation as the linear case, but each lane's result
+ * is divided by that lane's value of QuadPos component 3 (W).
+ */
+static void
+perspective_interpolation( struct spu_exec_machine *mach,
+                           unsigned attrib,
+                           unsigned chan )
+{
+   float *out = mach->Inputs[attrib].xyzw[chan].f;
+   const float *w = mach->QuadPos.xyzw[3].f;
+   const float x = mach->QuadPos.xyzw[0].f[0];
+   const float y = mach->QuadPos.xyzw[1].f[0];
+   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
+   const float dady = mach->InterpCoefs[attrib].dady[chan];
+   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
+
+   /* divide by W here */
+   out[0] = a0 / w[0];
+   out[1] = (a0 + dadx) / w[1];
+   out[2] = (a0 + dady) / w[2];
+   out[3] = (a0 + dadx + dady) / w[3];
+}
+
+
+/**
+ * Common signature of the attribute interpolators: fill the four lanes of
+ * input attribute `attrib`, component `chan`, in `mach`.
+ */
+typedef void (*interpolation_func)( struct spu_exec_machine *mach,
+                                    unsigned attrib,
+                                    unsigned chan );
+
+/**
+ * Process one declaration token.
+ *
+ * Only fragment-processor input declarations have an effect: for every
+ * attribute in the declared range, each component selected by the usage
+ * mask is filled in by the interpolator matching the declared
+ * interpolation mode.  All other declarations are ignored.
+ *
+ * An unknown interpolation mode trips an assert and is otherwise skipped,
+ * so an uninitialised function pointer is never called.
+ */
+static void
+exec_declaration(struct spu_exec_machine *mach,
+                 const struct tgsi_full_declaration *decl)
+{
+   interpolation_func interp;
+   unsigned first, last, mask;
+   unsigned i, j;
+
+   if( mach->Processor != TGSI_PROCESSOR_FRAGMENT ) {
+      return;
+   }
+   if( decl->Declaration.File != TGSI_FILE_INPUT ) {
+      return;
+   }
+
+   assert( decl->Declaration.Declare == TGSI_DECLARE_RANGE );
+
+   first = decl->u.DeclarationRange.First;
+   last = decl->u.DeclarationRange.Last;
+   mask = decl->Declaration.UsageMask;
+
+   switch( decl->Interpolation.Interpolate ) {
+   case TGSI_INTERPOLATE_CONSTANT:
+      interp = constant_interpolation;
+      break;
+
+   case TGSI_INTERPOLATE_LINEAR:
+      interp = linear_interpolation;
+      break;
+
+   case TGSI_INTERPOLATE_PERSPECTIVE:
+      interp = perspective_interpolation;
+      break;
+
+   default:
+      assert( 0 );
+      return;
+   }
+
+   if( mask == TGSI_WRITEMASK_XYZW ) {
+      /* All components enabled: walk attribute by attribute. */
+      for( i = first; i <= last; i++ ) {
+         for( j = 0; j < NUM_CHANNELS; j++ ) {
+            interp( mach, i, j );
+         }
+      }
+   }
+   else {
+      /* Partial mask: test each component once, then walk the range. */
+      for( j = 0; j < NUM_CHANNELS; j++ ) {
+         if( mask & (1 << j) ) {
+            for( i = first; i <= last; i++ ) {
+               interp( mach, i, j );
+            }
+         }
+      }
+   }
+}
+
+static void
+exec_instruction(
+   struct spu_exec_machine *mach,
+   const struct tgsi_full_instruction *inst,
+   int *pc )
+{
+   uint chan_index;
+   union spu_exec_channel r[8];
+
+   (*pc)++;
+
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_ARL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 FETCH( &r[0], 0, chan_index );
+	 micro_f2it( &r[0], &r[0] );
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MOV:
+   /* TGSI_OPCODE_SWZ */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LIT:
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+	 FETCH( &r[0], 0, CHAN_X );
+	 if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+	    micro_max( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+	    STORE( &r[0], 0, CHAN_Y );
+	 }
+
+	 if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+	    FETCH( &r[1], 0, CHAN_Y );
+	    micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+
+	    FETCH( &r[2], 0, CHAN_W );
+	    micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
+	    micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
+	    micro_pow( &r[1], &r[1], &r[2] );
+	    micro_lt( &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+	    STORE( &r[0], 0, CHAN_Z );
+	 }
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_RCP:
+   /* TGSI_OPCODE_RECIP */
+      FETCH( &r[0], 0, CHAN_X );
+      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_RSQ:
+   /* TGSI_OPCODE_RECIPSQRT */
+      FETCH( &r[0], 0, CHAN_X );
+      micro_sqrt( &r[0], &r[0] );
+      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_EXP:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_LOG:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_MUL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index )
+      {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         micro_mul( &r[0], &r[0], &r[1] );
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_ADD:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_add( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DP3:
+   /* TGSI_OPCODE_DOT3 */
+      FETCH( &r[0], 0, CHAN_X );
+      FETCH( &r[1], 1, CHAN_X );
+      micro_mul( &r[0], &r[0], &r[1] );
+
+      FETCH( &r[1], 0, CHAN_Y );
+      FETCH( &r[2], 1, CHAN_Y );
+      micro_mul( &r[1], &r[1], &r[2] );
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FETCH( &r[1], 0, CHAN_Z );
+      FETCH( &r[2], 1, CHAN_Z );
+      micro_mul( &r[1], &r[1], &r[2] );
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+    case TGSI_OPCODE_DP4:
+    /* TGSI_OPCODE_DOT4 */
+       FETCH(&r[0], 0, CHAN_X);
+       FETCH(&r[1], 1, CHAN_X);
+
+       micro_mul( &r[0], &r[0], &r[1] );
+
+       FETCH(&r[1], 0, CHAN_Y);
+       FETCH(&r[2], 1, CHAN_Y);
+
+       micro_mul( &r[1], &r[1], &r[2] );
+       micro_add( &r[0], &r[0], &r[1] );
+
+       FETCH(&r[1], 0, CHAN_Z);
+       FETCH(&r[2], 1, CHAN_Z);
+
+       micro_mul( &r[1], &r[1], &r[2] );
+       micro_add( &r[0], &r[0], &r[1] );
+
+       FETCH(&r[1], 0, CHAN_W);
+       FETCH(&r[2], 1, CHAN_W);
+
+       micro_mul( &r[1], &r[1], &r[2] );
+       micro_add( &r[0], &r[0], &r[1] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DST:
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+	 FETCH( &r[0], 0, CHAN_Y );
+	 FETCH( &r[1], 1, CHAN_Y);
+	 micro_mul( &r[0], &r[0], &r[1] );
+	 STORE( &r[0], 0, CHAN_Y );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+	 FETCH( &r[0], 0, CHAN_Z );
+	 STORE( &r[0], 0, CHAN_Z );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+	 FETCH( &r[0], 1, CHAN_W );
+	 STORE( &r[0], 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_MIN:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         /* XXX use micro_min()?? */
+         micro_lt( &r[0], &r[0], &r[1], &r[0], &r[1] );
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_MAX:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         /* XXX use micro_max()?? */
+         micro_lt( &r[0], &r[0], &r[1], &r[1], &r[0] );
+
+         STORE(&r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SLT:
+   /* TGSI_OPCODE_SETLT */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SGE:
+   /* TGSI_OPCODE_SETGE */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_ge( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MAD:
+   /* TGSI_OPCODE_MADD */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_mul( &r[0], &r[0], &r[1] );
+         FETCH( &r[1], 2, chan_index );
+         micro_add( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SUB:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         micro_sub( &r[0], &r[0], &r[1] );
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_LERP:
+   /* TGSI_OPCODE_LRP */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+         FETCH(&r[2], 2, chan_index);
+
+         micro_sub( &r[1], &r[1], &r[2] );
+         micro_mul( &r[0], &r[0], &r[1] );
+         micro_add( &r[0], &r[0], &r[2] );
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_CND:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_CND0:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_DOT2ADD:
+      /* TGSI_OPCODE_DP2A */
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_INDEX:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_NEGATE:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_FRAC:
+   /* TGSI_OPCODE_FRC */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_frc( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CLAMP:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_FLOOR:
+   /* TGSI_OPCODE_FLR */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_flr( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_ROUND:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_rnd( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_EXPBASE2:
+    /* TGSI_OPCODE_EX2 */
+      FETCH(&r[0], 0, CHAN_X);
+
+      micro_pow( &r[0], &mach->Temps[TEMP_2_I].xyzw[TEMP_2_C], &r[0] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LOGBASE2:
+   /* TGSI_OPCODE_LG2 */
+      FETCH( &r[0], 0, CHAN_X );
+      micro_lg2( &r[0], &r[0] );
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_POWER:
+      /* TGSI_OPCODE_POW */
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 1, CHAN_X);
+
+      micro_pow( &r[0], &r[0], &r[1] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CROSSPRODUCT:
+      /* TGSI_OPCODE_XPD */
+      FETCH(&r[0], 0, CHAN_Y);
+      FETCH(&r[1], 1, CHAN_Z);
+
+      micro_mul( &r[2], &r[0], &r[1] );
+
+      FETCH(&r[3], 0, CHAN_Z);
+      FETCH(&r[4], 1, CHAN_Y);
+
+      micro_mul( &r[5], &r[3], &r[4] );
+      micro_sub( &r[2], &r[2], &r[5] );
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+         STORE( &r[2], 0, CHAN_X );
+      }
+
+      FETCH(&r[2], 1, CHAN_X);
+
+      micro_mul( &r[3], &r[3], &r[2] );
+
+      FETCH(&r[5], 0, CHAN_X);
+
+      micro_mul( &r[1], &r[1], &r[5] );
+      micro_sub( &r[3], &r[3], &r[1] );
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+         STORE( &r[3], 0, CHAN_Y );
+      }
+
+      micro_mul( &r[5], &r[5], &r[4] );
+      micro_mul( &r[0], &r[0], &r[2] );
+      micro_sub( &r[5], &r[5], &r[0] );
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+         STORE( &r[5], 0, CHAN_Z );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
+      }
+      break;
+
+    case TGSI_OPCODE_MULTIPLYMATRIX:
+       assert (0);
+       break;
+
+    case TGSI_OPCODE_ABS:
+       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+          FETCH(&r[0], 0, chan_index);
+
+          micro_abs( &r[0], &r[0] );
+
+          STORE(&r[0], 0, chan_index);
+       }
+       break;
+
+   case TGSI_OPCODE_RCC:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_DPH:
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 1, CHAN_X);
+
+      micro_mul( &r[0], &r[0], &r[1] );
+
+      FETCH(&r[1], 0, CHAN_Y);
+      FETCH(&r[2], 1, CHAN_Y);
+
+      micro_mul( &r[1], &r[1], &r[2] );
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FETCH(&r[1], 0, CHAN_Z);
+      FETCH(&r[2], 1, CHAN_Z);
+
+      micro_mul( &r[1], &r[1], &r[2] );
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FETCH(&r[1], 1, CHAN_W);
+
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_COS:
+      FETCH(&r[0], 0, CHAN_X);
+
+      micro_cos( &r[0], &r[0] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DDX:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_ddx( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DDY:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_ddy( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_KILP:
+      exec_kilp (mach, inst);
+      break;
+
+   case TGSI_OPCODE_KIL:
+      /* for enabled ExecMask bits, set the killed bit */
+      mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= mach->ExecMask;
+      break;
+
+   case TGSI_OPCODE_PK2H:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_PK2US:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_PK4B:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_PK4UB:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_RFL:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_SEQ:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_eq( &r[0], &r[0], &r[1],
+                   &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C],
+                   &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SFL:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_SGT:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SIN:
+      FETCH( &r[0], 0, CHAN_X );
+      micro_sin( &r[0], &r[0] );
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SLE:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_ge( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SNE:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_eq( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_STR:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_TEX:
+      /* simple texture lookup */
+      /* src[0] = texcoord */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, FALSE);
+      break;
+
+   case TGSI_OPCODE_TXB:
+      /* Texture lookup with lod bias */
+      /* src[0] = texcoord (src[0].w = load bias) */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, TRUE);
+      break;
+
+   case TGSI_OPCODE_TXD:
+      /* Texture lookup with explict partial derivatives */
+      /* src[0] = texcoord */
+      /* src[1] = d[strq]/dx */
+      /* src[2] = d[strq]/dy */
+      /* src[3] = sampler unit */
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_TXL:
+      /* Texture lookup with explit LOD */
+      /* src[0] = texcoord (src[0].w = load bias) */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, TRUE);
+      break;
+
+   case TGSI_OPCODE_UP2H:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_UP2US:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_UP4B:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_UP4UB:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_X2D:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_ARA:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_ARR:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_BRA:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_CAL:
+      /* skip the call if no execution channels are enabled */
+      if (mach->ExecMask) {
+         /* do the call */
+
+         /* push the Cond, Loop, Cont stacks */
+         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
+         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
+         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
+         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
+
+         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
+         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
+
+         /* note that PC was already incremented above */
+         mach->CallStack[mach->CallStackTop++] = *pc;
+         *pc = inst->InstructionExtLabel.Label;
+      }
+      break;
+
+   case TGSI_OPCODE_RET:
+      mach->FuncMask &= ~mach->ExecMask;
+      UPDATE_EXEC_MASK(mach);
+
+      if (mach->ExecMask == 0x0) {
+         /* really return now (otherwise, keep executing */
+
+         if (mach->CallStackTop == 0) {
+            /* returning from main() */
+            *pc = -1;
+            return;
+         }
+         *pc = mach->CallStack[--mach->CallStackTop];
+
+         /* pop the Cond, Loop, Cont stacks */
+         assert(mach->CondStackTop > 0);
+         mach->CondMask = mach->CondStack[--mach->CondStackTop];
+         assert(mach->LoopStackTop > 0);
+         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
+         assert(mach->ContStackTop > 0);
+         mach->ContMask = mach->ContStack[--mach->ContStackTop];
+         assert(mach->FuncStackTop > 0);
+         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
+
+         UPDATE_EXEC_MASK(mach);
+      }
+      break;
+
+   case TGSI_OPCODE_SSG:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_CMP:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+         FETCH(&r[2], 2, chan_index);
+
+         micro_lt( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2] );
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_SCS:
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
+         FETCH( &r[0], 0, CHAN_X );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
+         micro_cos( &r[1], &r[0] );
+         STORE( &r[1], 0, CHAN_X );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
+         micro_sin( &r[1], &r[0] );
+         STORE( &r[1], 0, CHAN_Y );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+         STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
+         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_NRM:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_DIV:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_DP2:
+      FETCH( &r[0], 0, CHAN_X );
+      FETCH( &r[1], 1, CHAN_X );
+      micro_mul( &r[0], &r[0], &r[1] );
+
+      FETCH( &r[1], 0, CHAN_Y );
+      FETCH( &r[2], 1, CHAN_Y );
+      micro_mul( &r[1], &r[1], &r[2] );
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_IF:
+      /* push CondMask */
+      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
+      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
+      FETCH( &r[0], 0, CHAN_X );
+      /* update CondMask */
+      if( ! r[0].u[0] ) {
+         mach->CondMask &= ~0x1;
+      }
+      if( ! r[0].u[1] ) {
+         mach->CondMask &= ~0x2;
+      }
+      if( ! r[0].u[2] ) {
+         mach->CondMask &= ~0x4;
+      }
+      if( ! r[0].u[3] ) {
+         mach->CondMask &= ~0x8;
+      }
+      UPDATE_EXEC_MASK(mach);
+      /* Todo: If CondMask==0, jump to ELSE */
+      break;
+
+   case TGSI_OPCODE_ELSE:
+      /* invert CondMask wrt previous mask */
+      {
+         uint prevMask;
+         assert(mach->CondStackTop > 0);
+         prevMask = mach->CondStack[mach->CondStackTop - 1];
+         mach->CondMask = ~mach->CondMask & prevMask;
+         UPDATE_EXEC_MASK(mach);
+         /* Todo: If CondMask==0, jump to ENDIF */
+      }
+      break;
+
+   case TGSI_OPCODE_ENDIF:
+      /* pop CondMask */
+      assert(mach->CondStackTop > 0);
+      mach->CondMask = mach->CondStack[--mach->CondStackTop];
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_END:
+      /* halt execution */
+      *pc = -1;
+      break;
+
+   case TGSI_OPCODE_REP:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_ENDREP:
+       assert (0);
+       break;
+
+   case TGSI_OPCODE_PUSHA:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_POPA:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_CEIL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_ceil( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_I2F:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_i2f( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_NOT:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_not( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_TRUNC:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_trunc( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SHL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_shl( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SHR:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_ishr( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_AND:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_and( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_OR:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_or( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MOD:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_XOR:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_xor( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SAD:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_TXF:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_TXQ:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_EMIT:
+      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16;
+      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
+      break;
+
+   case TGSI_OPCODE_ENDPRIM:
+      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++;
+      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0;
+      break;
+
+   case TGSI_OPCODE_LOOP:
+      /* fall-through (for now) */
+   case TGSI_OPCODE_BGNLOOP2:
+      /* push LoopMask and ContMasks */
+      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
+      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
+      break;
+
+   case TGSI_OPCODE_ENDLOOP:
+      /* fall-through (for now at least) */
+   case TGSI_OPCODE_ENDLOOP2:
+      /* Restore ContMask, but don't pop */
+      assert(mach->ContStackTop > 0);
+      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
+      if (mach->LoopMask) {
+         /* repeat loop: jump to instruction just past BGNLOOP */
+         *pc = inst->InstructionExtLabel.Label + 1;
+      }
+      else {
+         /* exit loop: pop LoopMask */
+         assert(mach->LoopStackTop > 0);
+         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
+         /* pop ContMask */
+         assert(mach->ContStackTop > 0);
+         mach->ContMask = mach->ContStack[--mach->ContStackTop];
+      }
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_BRK:
+      /* turn off loop channels for each enabled exec channel */
+      mach->LoopMask &= ~mach->ExecMask;
+      /* Todo: if mach->LoopMask == 0, jump to end of loop */
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_CONT:
+      /* turn off cont channels for each enabled exec channel */
+      mach->ContMask &= ~mach->ExecMask;
+      /* Todo: if mach->LoopMask == 0, jump to end of loop */
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_BGNSUB:
+      /* no-op */
+      break;
+
+   case TGSI_OPCODE_ENDSUB:
+      /* no-op */
+      break;
+
+   case TGSI_OPCODE_NOISE1:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_NOISE2:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_NOISE3:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_NOISE4:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_NOP:
+      break;
+
+   default:
+      assert( 0 );
+   }
+}
+
+
+/**
+ * Run TGSI interpreter: reset the per-run state, execute every declaration,
+ * then execute instructions until the program counter becomes -1.
+ * \return bitmask of "alive" quad components
+ */
+uint
+spu_exec_machine_run( struct spu_exec_machine *mach )
+{
+   uint i;
+   int pc;
+
+   /* start with all four mask bits set in every execution mask */
+   mach->ExecMask = mach->FuncMask = mach->ContMask = 0xf;
+   mach->LoopMask = mach->CondMask = 0xf;
+
+   mach->CondStackTop = 0; /* temporarily subvert this assertion */
+   assert(mach->CondStackTop == 0);
+   assert(mach->LoopStackTop == 0);
+   assert(mach->ContStackTop == 0);
+   assert(mach->CallStackTop == 0);
+
+   /* geometry programs also restart their primitive bookkeeping */
+   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
+      mach->Primitives[0] = 0;
+      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
+   }
+
+   /* clear the output and kill-mask internal temporaries */
+   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
+   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
+
+   /* execute declarations (interpolants) */
+   for (i = 0; i < mach->NumDeclarations; i++) {
+      exec_declaration( mach, &mach->Declarations[i] );
+   }
+
+   /* execute instructions; exec_instruction() advances pc, -1 ends the run */
+   for (pc = 0; pc != -1; ) {
+      assert(pc < mach->NumInstructions);
+      exec_instruction( mach, &mach->Instructions[pc], &pc );
+   }
+
+#if 0
+   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
+   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
+      /*
+       * Scale back depth component.
+       */
+      for (i = 0; i < 4; i++)
+         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
+   }
+#endif
+
+   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
+}
+
+
diff --git a/src/mesa/pipe/cell/spu/spu_exec.h b/src/mesa/pipe/cell/spu/spu_exec.h
new file mode 100644
index 0000000000..89e422ba48
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_exec.h
@@ -0,0 +1,171 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#if !defined SPU_EXEC_H
+#define SPU_EXEC_H
+
+#include "pipe/p_compiler.h"
+#include "pipe/tgsi/exec/tgsi_exec.h"
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+/**
+  * Registers may be treated as float, signed int or unsigned int.
+  */
+union spu_exec_channel
+{
+   float    f[QUAD_SIZE];   /**< float view, QUAD_SIZE values per channel */
+   int      i[QUAD_SIZE];   /**< signed integer view of the same storage */
+   unsigned u[QUAD_SIZE];   /**< unsigned integer view of the same storage */
+};
+
+/**
+  * A vector[RGBA] of channels[4 pixels]
+  */
+struct spu_exec_vector
+{
+   union spu_exec_channel xyzw[NUM_CHANNELS];   /**< one spu_exec_channel per vector component */
+};
+
+/**
+ * For fragment programs, information for computing fragment input
+ * values from plane equation of the triangle/line.
+ */
+struct spu_interp_coef
+{
+   float a0[NUM_CHANNELS];	/* in an xyzw layout */
+   float dadx[NUM_CHANNELS];	/* NOTE(review): presumably the per-channel d/dx term of the plane equation - confirm */
+   float dady[NUM_CHANNELS];	/* NOTE(review): presumably the per-channel d/dy term of the plane equation - confirm */
+};
+
+
+struct softpipe_tile_cache;  /**< Opaque to TGSI */
+
+/**
+ * Information for sampling textures, which must be implemented
+ * by code outside the TGSI executor.
+ */
+struct spu_sampler
+{
+   const struct pipe_sampler_state *state;   /**< sampler state; not modified through this pointer */
+   struct pipe_texture *texture;   /**< NOTE(review): presumably the texture this sampler reads - confirm */
+   /** Get samples for four fragments in a quad */
+   void (*get_samples)(struct spu_sampler *sampler,
+                       const float s[QUAD_SIZE],
+                       const float t[QUAD_SIZE],
+                       const float p[QUAD_SIZE],
+                       float lodbias,
+                       float rgba[NUM_CHANNELS][QUAD_SIZE]);
+   void *pipe; /*XXX temporary*/
+   struct softpipe_tile_cache *cache;   /**< softpipe_tile_cache is only forward-declared in this header */
+};
+
+
+/**
+ * Run-time virtual machine state for executing TGSI shader.
+ */
+struct spu_exec_machine
+{
+   /*
+    * 32 program temporaries
+    * 4  internal temporaries
+    * 1  address
+    */
+   struct spu_exec_vector       Temps[TGSI_EXEC_NUM_TEMPS 
+				      + TGSI_EXEC_NUM_ADDRS + 1]
+       ALIGN16_ATTRIB;
+
+   struct spu_exec_vector       *Addrs;
+
+   struct spu_sampler           *Samplers;
+
+   float                         Imms[TGSI_EXEC_NUM_IMMEDIATES][4];
+   unsigned                      ImmLimit;
+   float                         (*Consts)[4];
+   struct spu_exec_vector       *Inputs;
+   struct spu_exec_vector       *Outputs;
+   unsigned                      Processor;   /**< compared against TGSI_PROCESSOR_* by the interpreter */
+
+   /* GEOMETRY processor only. */
+   unsigned                      *Primitives;   /**< EMIT increments the current primitive's slot; ENDPRIM zeroes the next slot */
+
+   /* FRAGMENT processor only. */
+   const struct spu_interp_coef *InterpCoefs;
+   struct spu_exec_vector       QuadPos;
+
+   /* Conditional execution masks */
+   uint CondMask;  /**< For IF/ELSE/ENDIF */
+   uint LoopMask;  /**< For BGNLOOP/ENDLOOP */
+   uint ContMask;  /**< For loop CONT statements */
+   uint FuncMask;  /**< For function calls */
+   uint ExecMask;  /**< Recomputed by UPDATE_EXEC_MASK(); NOTE(review): that macro also runs after ContMask/FuncMask change, so this is likely more than CondMask & LoopMask - confirm */
+
+   /** Condition mask stack (for nested conditionals) */
+   uint CondStack[TGSI_EXEC_MAX_COND_NESTING];
+   int CondStackTop;   /**< number of entries currently pushed on CondStack */
+
+   /** Loop mask stack (for nested loops) */
+   uint LoopStack[TGSI_EXEC_MAX_LOOP_NESTING];
+   int LoopStackTop;   /**< number of entries currently pushed on LoopStack */
+
+   /** Loop continue mask stack (see comments in tgsi_exec.c) */
+   uint ContStack[TGSI_EXEC_MAX_LOOP_NESTING];
+   int ContStackTop;   /**< number of entries currently pushed on ContStack */
+
+   /** Function execution mask stack (for executing subroutine code) */
+   uint FuncStack[TGSI_EXEC_MAX_CALL_NESTING];
+   int FuncStackTop;   /**< number of entries currently pushed on FuncStack */
+
+   /** Function call stack for saving/restoring the program counter */
+   uint CallStack[TGSI_EXEC_MAX_CALL_NESTING];
+   int CallStackTop;   /**< number of saved program counters (pushed by CAL, popped by RET) */
+
+   struct tgsi_full_instruction *Instructions;   /**< program executed by spu_exec_machine_run(), indexed by program counter */
+   uint NumInstructions;
+
+   struct tgsi_full_declaration *Declarations;   /**< executed at the start of spu_exec_machine_run(), before any instruction */
+   uint NumDeclarations;
+};
+
+/** Initialise \p mach. NOTE(review): definition not visible here - presumably records the samplers and processor type; confirm. */
+extern void
+spu_exec_machine_init(struct spu_exec_machine *mach,
+                      uint numSamplers,
+                      struct spu_sampler *samplers,
+                      unsigned processor);
+/** Run the TGSI interpreter on \p mach; \return bitmask of "alive" quad components. */
+extern uint
+spu_exec_machine_run( struct spu_exec_machine *mach );
+
+
+#if defined __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* SPU_EXEC_H */
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 6886f283be..9daa3ec735 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -36,6 +36,7 @@
 #include "spu_render.h"
 #include "spu_texture.h"
 #include "spu_tile.h"
+#include "spu_vertex_shader.h"
 #include "pipe/cell/common.h"
 #include "pipe/p_defines.h"
 
@@ -50,6 +51,7 @@ boolean Debug = FALSE;
 
 struct spu_global spu;
 
+struct spu_vs_context draw;   /**< vertex shader context: filled in by the viewport / VS array-info commands and passed to spu_execute_vertex_shader() */
 
 /**
  * Tell the PPU that this SPU has finished copying a buffer to
@@ -264,6 +266,18 @@ cmd_state_vertex_info(const struct vertex_info *vinfo)
 }
 
 
+/** Record one vertex array's base, pitch and format in draw.vertex_fetch and mark it dirty. */
+static void
+cmd_state_vs_array_info(const struct cell_array_info *vs_info)
+{
+   const unsigned slot = vs_info->attr;
+   ASSERT(slot < PIPE_ATTRIB_MAX);
+   draw.vertex_fetch.dirty = 1;
+   draw.vertex_fetch.format[slot] = vs_info->format;
+   draw.vertex_fetch.pitch[slot] = vs_info->pitch;
+   draw.vertex_fetch.src_ptr[slot] = vs_info->base;
+}
+
 
 static void
 cmd_finish(void)
@@ -374,6 +388,20 @@ cmd_batch(uint opcode)
          cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]);
          pos += (1 + sizeof(struct vertex_info) / 4);
          break;
+      case CELL_CMD_STATE_VIEWPORT:
+         (void) memcpy(& draw.viewport, &buffer[pos+1],
+                       sizeof(struct pipe_viewport_state));
+         pos += (1 + sizeof(struct pipe_viewport_state) / 4);
+         break;
+      case CELL_CMD_STATE_VS_ARRAY_INFO:
+         cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
+         pos += (1 + sizeof(struct cell_array_info) / 4);
+         break;
+      case CELL_CMD_VS_EXECUTE:
+         spu_execute_vertex_shader(&draw,
+                                   (struct cell_command_vs *) &buffer[pos+1]);
+         pos += (1 + sizeof(struct cell_command_vs) / 4);
+         break;
       default:
          printf("SPU %u: bad opcode: 0x%x\n", spu.init.id, buffer[pos]);
          ASSERT(0);
diff --git a/src/mesa/pipe/cell/spu/spu_util.c b/src/mesa/pipe/cell/spu/spu_util.c
new file mode 100644
index 0000000000..ac373240c1
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_util.c
@@ -0,0 +1,165 @@
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "pipe/tgsi/util/tgsi_parse.h"
+//#include "tgsi_build.h"
+#include "pipe/tgsi/util/tgsi_util.h"
+
+/** Simple-swizzle field of \p reg for \p component (0=X, 1=Y, 2=Z, 3=W).
+ *  Any other component reaches assert(0); 0 is returned if that falls through. */
+unsigned
+tgsi_util_get_src_register_swizzle(
+   const struct tgsi_src_register *reg,
+   unsigned component )
+{
+   if( component == 0 ) {
+      return reg->SwizzleX;
+   } else if( component == 1 ) {
+      return reg->SwizzleY;
+   } else if( component == 2 ) {
+      return reg->SwizzleZ;
+   } else if( component == 3 ) {
+      return reg->SwizzleW;
+   }
+   assert( 0 );
+   return 0;
+}
+
+/** Extended-swizzle field of \p reg for \p component (0=X, 1=Y, 2=Z, 3=W).
+ *  Any other component reaches assert(0); 0 is returned if that falls through. */
+unsigned
+tgsi_util_get_src_register_extswizzle(
+   const struct tgsi_src_register_ext_swz *reg,
+   unsigned component )
+{
+   if( component == 0 ) {
+      return reg->ExtSwizzleX;
+   } else if( component == 1 ) {
+      return reg->ExtSwizzleY;
+   } else if( component == 2 ) {
+      return reg->ExtSwizzleZ;
+   } else if( component == 3 ) {
+      return reg->ExtSwizzleW;
+   }
+   assert( 0 );
+   return 0;
+}
+
+/**
+ * Resolve the final swizzle for channel \p component of a full source register
+ * by composing its extended swizzle with its simple swizzle.
+ * \return a TGSI_SWIZZLE_X..W selector, or the extended-swizzle constant unchanged
+ */
+unsigned
+tgsi_util_get_full_src_register_extswizzle(
+   const struct tgsi_full_src_register  *reg,
+   unsigned component )
+{
+   unsigned swizzle;
+
+   /*
+    * First, look up the extended swizzle for the given channel. This gives
+    * us either a channel index into the simple swizzle or a constant 1 or 0.
+    */
+   swizzle = tgsi_util_get_src_register_extswizzle( &reg->SrcRegisterExtSwz, component );
+
+   assert (TGSI_SWIZZLE_X == TGSI_EXTSWIZZLE_X);
+   assert (TGSI_SWIZZLE_Y == TGSI_EXTSWIZZLE_Y);
+   assert (TGSI_SWIZZLE_Z == TGSI_EXTSWIZZLE_Z);
+   assert (TGSI_SWIZZLE_W == TGSI_EXTSWIZZLE_W);
+   assert (TGSI_EXTSWIZZLE_ZERO > TGSI_SWIZZLE_W);
+   assert (TGSI_EXTSWIZZLE_ONE > TGSI_SWIZZLE_W);
+
+   /*
+    * Second, feed that channel index (not the original component) through the
+    * simple swizzle. Constants are left intact; the simple swizzle ignores them.
+    */
+   if( swizzle <= TGSI_SWIZZLE_W ) {
+      swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegister, swizzle );
+   }
+   return swizzle;
+}
+
+/** Per-component negate field of \p reg for \p component (0=X, 1=Y, 2=Z, 3=W).
+ *  Any other component reaches assert(0); 0 is returned if that falls through. */
+unsigned
+tgsi_util_get_src_register_extnegate(
+   const  struct tgsi_src_register_ext_swz *reg,
+   unsigned component )
+{
+   if( component == 0 ) {
+      return reg->NegateX;
+   } else if( component == 1 ) {
+      return reg->NegateY;
+   } else if( component == 2 ) {
+      return reg->NegateZ;
+   } else if( component == 3 ) {
+      return reg->NegateW;
+   }
+   assert( 0 );
+   return 0;
+}
+
+/**
+ * Store \p negate in the per-component negate field of \p reg selected
+ * by \p component (0=X, 1=Y, 2=Z, 3=W).  Any other component reaches
+ * assert(0) and leaves \p reg untouched.
+ */
+void
+tgsi_util_set_src_register_extnegate(
+   struct tgsi_src_register_ext_swz *reg,
+   unsigned negate,
+   unsigned component )
+{
+   if( component == 0 ) {
+      reg->NegateX = negate;
+   } else if( component == 1 ) {
+      reg->NegateY = negate;
+   } else if( component == 2 ) {
+      reg->NegateZ = negate;
+   } else if( component == 3 ) {
+      reg->NegateW = negate;
+   } else {
+      assert( 0 );
+   }
+}
+
+/**
+ * Classify how the sign of source operand \p reg is modified for channel
+ * \p component.
+ *
+ * If the Absolute modifier is set, only the modifier's Negate bit is
+ * considered: TGSI_UTIL_SIGN_SET when negated, TGSI_UTIL_SIGN_CLEAR when not.
+ * Otherwise the register negate, the per-component extended-swizzle negate
+ * and the modifier negate are accumulated: an odd number of them gives
+ * TGSI_UTIL_SIGN_TOGGLE, an even number TGSI_UTIL_SIGN_KEEP.
+ *
+ * \return one of the TGSI_UTIL_SIGN_* values described above
+ */
+unsigned
+tgsi_util_get_full_src_register_sign_mode(
+   const struct  tgsi_full_src_register *reg,
+   unsigned component )
+{
+   unsigned flips;
+
+   if( reg->SrcRegisterExtMod.Absolute ) {
+      /* Consider only the post-abs negation. */
+      return reg->SrcRegisterExtMod.Negate
+         ? TGSI_UTIL_SIGN_SET
+         : TGSI_UTIL_SIGN_CLEAR;
+   }
+
+   /* Accumulate the three negations; each one that is set flips the sign. */
+   flips = reg->SrcRegister.Negate != 0;
+   if( tgsi_util_get_src_register_extnegate( &reg->SrcRegisterExtSwz, component ) ) {
+      flips ^= 1;
+   }
+   if( reg->SrcRegisterExtMod.Negate ) {
+      flips ^= 1;
+   }
+
+   if( flips ) {
+      return TGSI_UTIL_SIGN_TOGGLE;
+   }
+   return TGSI_UTIL_SIGN_KEEP;
+}
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
new file mode 100644
index 0000000000..b8f8c52eed
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -0,0 +1,493 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "pipe/p_util.h"
+#include "pipe/p_state.h"
+#include "pipe/p_shader_tokens.h"
+#include "spu_exec.h"
+#include "spu_vertex_shader.h"
+
+
+#define DRAW_DBG 0
+
+
+/**
+ * Define fetch_<NAME>(ptr, attrib): read SZ components from ptr using
+ * the CVT expression (which refers to 'ptr' and the loop index 'i') and
+ * store them as floats in attrib[0..SZ-1]; the remaining components up
+ * to four are filled from the defaults (0, 0, 0, 1).  This is probably
+ * needed/duplicated elsewhere, eg format conversion, texture sampling.
+ */
+#define FETCH_ATTRIB( NAME, SZ, CVT )			\
+static void						\
+fetch_##NAME(const void *ptr, float *attrib)		\
+{							\
+   static const float defaults[4] = { 0,0,0,1 };	\
+   int i;						\
+							\
+   for (i = 0; i < SZ; i++) {				\
+      attrib[i] = CVT;					\
+   }							\
+							\
+   for (; i < 4; i++) {					\
+      attrib[i] = defaults[i];				\
+   }							\
+}
+
+#define CVT_64_FLOAT   ((float) ((const double *) ptr)[i])
+#define CVT_32_FLOAT   (((const float *) ptr)[i])
+/* Integer sources converted to float without rescaling. */
+#define CVT_8_USCALED  ((float) ((const unsigned char *) ptr)[i])
+#define CVT_16_USCALED ((float) ((const unsigned short *) ptr)[i])
+#define CVT_32_USCALED ((float) ((const unsigned int *) ptr)[i])
+/* Signed 8-bit data is read as 'signed char': plain char may be unsigned. */
+#define CVT_8_SSCALED  ((float) ((const signed char *) ptr)[i])
+#define CVT_16_SSCALED ((float) ((const short *) ptr)[i])
+#define CVT_32_SSCALED ((float) ((const int *) ptr)[i])
+/* Unsigned sources divided by the largest value of their type. */
+#define CVT_8_UNORM    ((float) ((const unsigned char *) ptr)[i] / 255.0f)
+#define CVT_16_UNORM   ((float) ((const unsigned short *) ptr)[i] / 65535.0f)
+#define CVT_32_UNORM   ((float) ((const unsigned int *) ptr)[i] / 4294967295.0f)
+/* Signed sources divided by the largest positive value of their type. */
+#define CVT_8_SNORM    ((float) ((const signed char *) ptr)[i] / 127.0f)
+#define CVT_16_SNORM   ((float) ((const short *) ptr)[i] / 32767.0f)
+#define CVT_32_SNORM   ((float) ((const int *) ptr)[i] / 2147483647.0f)
+/* 64-bit float sources */
+FETCH_ATTRIB( R64G64B64A64_FLOAT,   4, CVT_64_FLOAT )
+FETCH_ATTRIB( R64G64B64_FLOAT,      3, CVT_64_FLOAT )
+FETCH_ATTRIB( R64G64_FLOAT,         2, CVT_64_FLOAT )
+FETCH_ATTRIB( R64_FLOAT,            1, CVT_64_FLOAT )
+/* 32-bit float sources */
+FETCH_ATTRIB( R32G32B32A32_FLOAT,   4, CVT_32_FLOAT )
+FETCH_ATTRIB( R32G32B32_FLOAT,      3, CVT_32_FLOAT )
+FETCH_ATTRIB( R32G32_FLOAT,         2, CVT_32_FLOAT )
+FETCH_ATTRIB( R32_FLOAT,            1, CVT_32_FLOAT )
+/* 32-bit unsigned integers, converted without normalization */
+FETCH_ATTRIB( R32G32B32A32_USCALED, 4, CVT_32_USCALED )
+FETCH_ATTRIB( R32G32B32_USCALED,    3, CVT_32_USCALED )
+FETCH_ATTRIB( R32G32_USCALED,       2, CVT_32_USCALED )
+FETCH_ATTRIB( R32_USCALED,          1, CVT_32_USCALED )
+/* 32-bit signed integers, converted without normalization */
+FETCH_ATTRIB( R32G32B32A32_SSCALED, 4, CVT_32_SSCALED )
+FETCH_ATTRIB( R32G32B32_SSCALED,    3, CVT_32_SSCALED )
+FETCH_ATTRIB( R32G32_SSCALED,       2, CVT_32_SSCALED )
+FETCH_ATTRIB( R32_SSCALED,          1, CVT_32_SSCALED )
+/* 32-bit unsigned integers, normalized */
+FETCH_ATTRIB( R32G32B32A32_UNORM, 4, CVT_32_UNORM )
+FETCH_ATTRIB( R32G32B32_UNORM,    3, CVT_32_UNORM )
+FETCH_ATTRIB( R32G32_UNORM,       2, CVT_32_UNORM )
+FETCH_ATTRIB( R32_UNORM,          1, CVT_32_UNORM )
+/* 32-bit signed integers, normalized */
+FETCH_ATTRIB( R32G32B32A32_SNORM, 4, CVT_32_SNORM )
+FETCH_ATTRIB( R32G32B32_SNORM,    3, CVT_32_SNORM )
+FETCH_ATTRIB( R32G32_SNORM,       2, CVT_32_SNORM )
+FETCH_ATTRIB( R32_SNORM,          1, CVT_32_SNORM )
+/* 16-bit unsigned integers, converted without normalization */
+FETCH_ATTRIB( R16G16B16A16_USCALED, 4, CVT_16_USCALED )
+FETCH_ATTRIB( R16G16B16_USCALED,    3, CVT_16_USCALED )
+FETCH_ATTRIB( R16G16_USCALED,       2, CVT_16_USCALED )
+FETCH_ATTRIB( R16_USCALED,          1, CVT_16_USCALED )
+/* 16-bit signed integers, converted without normalization */
+FETCH_ATTRIB( R16G16B16A16_SSCALED, 4, CVT_16_SSCALED )
+FETCH_ATTRIB( R16G16B16_SSCALED,    3, CVT_16_SSCALED )
+FETCH_ATTRIB( R16G16_SSCALED,       2, CVT_16_SSCALED )
+FETCH_ATTRIB( R16_SSCALED,          1, CVT_16_SSCALED )
+/* 16-bit unsigned integers, normalized */
+FETCH_ATTRIB( R16G16B16A16_UNORM, 4, CVT_16_UNORM )
+FETCH_ATTRIB( R16G16B16_UNORM,    3, CVT_16_UNORM )
+FETCH_ATTRIB( R16G16_UNORM,       2, CVT_16_UNORM )
+FETCH_ATTRIB( R16_UNORM,          1, CVT_16_UNORM )
+/* 16-bit signed integers, normalized */
+FETCH_ATTRIB( R16G16B16A16_SNORM, 4, CVT_16_SNORM )
+FETCH_ATTRIB( R16G16B16_SNORM,    3, CVT_16_SNORM )
+FETCH_ATTRIB( R16G16_SNORM,       2, CVT_16_SNORM )
+FETCH_ATTRIB( R16_SNORM,          1, CVT_16_SNORM )
+/* 8-bit unsigned integers, converted without normalization */
+FETCH_ATTRIB( R8G8B8A8_USCALED,   4, CVT_8_USCALED )
+FETCH_ATTRIB( R8G8B8_USCALED,     3, CVT_8_USCALED )
+FETCH_ATTRIB( R8G8_USCALED,       2, CVT_8_USCALED )
+FETCH_ATTRIB( R8_USCALED,         1, CVT_8_USCALED )
+/* 8-bit signed integers, converted without normalization */
+FETCH_ATTRIB( R8G8B8A8_SSCALED,  4, CVT_8_SSCALED )
+FETCH_ATTRIB( R8G8B8_SSCALED,    3, CVT_8_SSCALED )
+FETCH_ATTRIB( R8G8_SSCALED,      2, CVT_8_SSCALED )
+FETCH_ATTRIB( R8_SSCALED,        1, CVT_8_SSCALED )
+/* 8-bit unsigned integers, normalized */
+FETCH_ATTRIB( R8G8B8A8_UNORM,  4, CVT_8_UNORM )
+FETCH_ATTRIB( R8G8B8_UNORM,    3, CVT_8_UNORM )
+FETCH_ATTRIB( R8G8_UNORM,      2, CVT_8_UNORM )
+FETCH_ATTRIB( R8_UNORM,        1, CVT_8_UNORM )
+/* 8-bit signed integers, normalized */
+FETCH_ATTRIB( R8G8B8A8_SNORM,  4, CVT_8_SNORM )
+FETCH_ATTRIB( R8G8B8_SNORM,    3, CVT_8_SNORM )
+FETCH_ATTRIB( R8G8_SNORM,      2, CVT_8_SNORM )
+FETCH_ATTRIB( R8_SNORM,        1, CVT_8_SNORM )
+/* Four normalized bytes stored in memory order, without reordering */
+FETCH_ATTRIB( A8R8G8B8_UNORM,       4, CVT_8_UNORM )
+/* R8G8B8A8_UNORM is already instantiated above. */
+
+
+
+static spu_fetch_func get_fetch_func( enum pipe_format format )
+{
+#if 0
+   {
+      char tmp[80];
+      pf_sprint_name(tmp, format);
+      _mesa_printf("%s: %s\n", __FUNCTION__, tmp);
+   }
+#endif
+   /* Map a vertex format to its fetch_* routine; NULL if there is none. */
+   switch (format) {
+   case PIPE_FORMAT_R64_FLOAT:
+      return fetch_R64_FLOAT;
+   case PIPE_FORMAT_R64G64_FLOAT:
+      return fetch_R64G64_FLOAT;
+   case PIPE_FORMAT_R64G64B64_FLOAT:
+      return fetch_R64G64B64_FLOAT;
+   case PIPE_FORMAT_R64G64B64A64_FLOAT:
+      return fetch_R64G64B64A64_FLOAT;
+
+   case PIPE_FORMAT_R32_FLOAT:
+      return fetch_R32_FLOAT;
+   case PIPE_FORMAT_R32G32_FLOAT:
+      return fetch_R32G32_FLOAT;
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+      return fetch_R32G32B32_FLOAT;
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+      return fetch_R32G32B32A32_FLOAT;
+
+   case PIPE_FORMAT_R32_UNORM:
+      return fetch_R32_UNORM;
+   case PIPE_FORMAT_R32G32_UNORM:
+      return fetch_R32G32_UNORM;
+   case PIPE_FORMAT_R32G32B32_UNORM:
+      return fetch_R32G32B32_UNORM;
+   case PIPE_FORMAT_R32G32B32A32_UNORM:
+      return fetch_R32G32B32A32_UNORM;
+
+   case PIPE_FORMAT_R32_USCALED:
+      return fetch_R32_USCALED;
+   case PIPE_FORMAT_R32G32_USCALED:
+      return fetch_R32G32_USCALED;
+   case PIPE_FORMAT_R32G32B32_USCALED:
+      return fetch_R32G32B32_USCALED;
+   case PIPE_FORMAT_R32G32B32A32_USCALED:
+      return fetch_R32G32B32A32_USCALED;
+
+   case PIPE_FORMAT_R32_SNORM:
+      return fetch_R32_SNORM;
+   case PIPE_FORMAT_R32G32_SNORM:
+      return fetch_R32G32_SNORM;
+   case PIPE_FORMAT_R32G32B32_SNORM:
+      return fetch_R32G32B32_SNORM;
+   case PIPE_FORMAT_R32G32B32A32_SNORM:
+      return fetch_R32G32B32A32_SNORM;
+
+   case PIPE_FORMAT_R32_SSCALED:
+      return fetch_R32_SSCALED;
+   case PIPE_FORMAT_R32G32_SSCALED:
+      return fetch_R32G32_SSCALED;
+   case PIPE_FORMAT_R32G32B32_SSCALED:
+      return fetch_R32G32B32_SSCALED;
+   case PIPE_FORMAT_R32G32B32A32_SSCALED:
+      return fetch_R32G32B32A32_SSCALED;
+
+   case PIPE_FORMAT_R16_UNORM:
+      return fetch_R16_UNORM;
+   case PIPE_FORMAT_R16G16_UNORM:
+      return fetch_R16G16_UNORM;
+   case PIPE_FORMAT_R16G16B16_UNORM:
+      return fetch_R16G16B16_UNORM;
+   case PIPE_FORMAT_R16G16B16A16_UNORM:
+      return fetch_R16G16B16A16_UNORM;
+
+   case PIPE_FORMAT_R16_USCALED:
+      return fetch_R16_USCALED;
+   case PIPE_FORMAT_R16G16_USCALED:
+      return fetch_R16G16_USCALED;
+   case PIPE_FORMAT_R16G16B16_USCALED:
+      return fetch_R16G16B16_USCALED;
+   case PIPE_FORMAT_R16G16B16A16_USCALED:
+      return fetch_R16G16B16A16_USCALED;
+
+   case PIPE_FORMAT_R16_SNORM:
+      return fetch_R16_SNORM;
+   case PIPE_FORMAT_R16G16_SNORM:
+      return fetch_R16G16_SNORM;
+   case PIPE_FORMAT_R16G16B16_SNORM:
+      return fetch_R16G16B16_SNORM;
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+      return fetch_R16G16B16A16_SNORM;
+
+   case PIPE_FORMAT_R16_SSCALED:
+      return fetch_R16_SSCALED;
+   case PIPE_FORMAT_R16G16_SSCALED:
+      return fetch_R16G16_SSCALED;
+   case PIPE_FORMAT_R16G16B16_SSCALED:
+      return fetch_R16G16B16_SSCALED;
+   case PIPE_FORMAT_R16G16B16A16_SSCALED:
+      return fetch_R16G16B16A16_SSCALED;
+
+   case PIPE_FORMAT_R8_UNORM:
+      return fetch_R8_UNORM;
+   case PIPE_FORMAT_R8G8_UNORM:
+      return fetch_R8G8_UNORM;
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      return fetch_R8G8B8_UNORM;
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      return fetch_R8G8B8A8_UNORM;
+
+   case PIPE_FORMAT_R8_USCALED:
+      return fetch_R8_USCALED;
+   case PIPE_FORMAT_R8G8_USCALED:
+      return fetch_R8G8_USCALED;
+   case PIPE_FORMAT_R8G8B8_USCALED:
+      return fetch_R8G8B8_USCALED;
+   case PIPE_FORMAT_R8G8B8A8_USCALED:
+      return fetch_R8G8B8A8_USCALED;
+
+   case PIPE_FORMAT_R8_SNORM:
+      return fetch_R8_SNORM;
+   case PIPE_FORMAT_R8G8_SNORM:
+      return fetch_R8G8_SNORM;
+   case PIPE_FORMAT_R8G8B8_SNORM:
+      return fetch_R8G8B8_SNORM;
+   case PIPE_FORMAT_R8G8B8A8_SNORM:
+      return fetch_R8G8B8A8_SNORM;
+
+   case PIPE_FORMAT_R8_SSCALED:
+      return fetch_R8_SSCALED;
+   case PIPE_FORMAT_R8G8_SSCALED:
+      return fetch_R8G8_SSCALED;
+   case PIPE_FORMAT_R8G8B8_SSCALED:
+      return fetch_R8G8B8_SSCALED;
+   case PIPE_FORMAT_R8G8B8A8_SSCALED:
+      return fetch_R8G8B8A8_SSCALED;
+
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      return fetch_A8R8G8B8_UNORM;
+
+   case 0:
+      return NULL;		/* format value 0 is accepted without asserting; presumably "no format" - TODO confirm */
+
+   default:
+      assert(0);   /* any other format has no fetch routine here */
+      return NULL;
+   }
+}
+
+
+/* Transpose a row-major 4x4 float matrix: out[r*4 + c] = in[c*4 + r]. */
+static void
+transpose_4x4( float *out, const float *in )
+{
+   unsigned row, col;
+
+   /* Plain scalar loops, writing out[0] .. out[15] in order. */
+   for (row = 0; row < 4; row++) {
+      for (col = 0; col < 4; col++)
+         out[row * 4 + col] = in[col * 4 + row];
+   }
+}
+
+
+
+/**
+ * Fast path for two R32G32B32_FLOAT attributes.
+ *
+ * x/y/z of attribute N for vertex i go to lane i of Inputs[N], w is
+ * set to 1.0.  Only the first 'count' entries of elts are meaningful,
+ * so only those are read; lanes of missing vertices are zeroed, as
+ * generic_vertex_fetch() does.
+ */
+static void fetch_xyz_rgb( struct spu_vs_context *draw,
+			   struct spu_exec_machine *machine,
+			   const unsigned *elts,
+			   unsigned count )
+{
+   const unsigned *pitch = draw->vertex_fetch.pitch;
+   const ubyte **src = draw->vertex_fetch.src_ptr;
+   unsigned attr, i;
+
+   assert(count <= 4);
+
+   for (attr = 0; attr < 2; attr++) {
+      float *out = &machine->Inputs[attr].xyzw[0].f[0];
+
+      for (i = 0; i < count; i++) {
+         const float *in = (const float *)(src[attr] + elts[i] * pitch[attr]);
+         out[0 + i] = in[0];
+         out[4 + i] = in[1];
+         out[8 + i] = in[2];
+         out[12 + i] = 1.0f;
+      }
+
+      /* Zero the lanes of vertices that are not part of this batch. */
+      for (; i < 4; i++) {
+         out[0 + i] = out[4 + i] = out[8 + i] = out[12 + i] = 0.0f;
+      }
+   }
+}
+
+
+
+
+/**
+ * Fast path for three attributes: two R32G32B32_FLOAT followed by one
+ * R32G32_FLOAT.
+ *
+ * Attribute N of vertex i goes to lane i of Inputs[N].  The third,
+ * two-component attribute is expanded to (s, t, 0, 1) and stored in
+ * Inputs[2]; storing it in Inputs[1] would overwrite the second
+ * attribute fetched just before it.
+ *
+ * Only the first 'count' entries of elts are meaningful, so only
+ * those are read; lanes of missing vertices are zeroed, as
+ * generic_vertex_fetch() does.  The first two attributes supply
+ * x, y and z, and every attribute gets w = 1.0.
+ */
+static void fetch_xyz_rgb_st( struct spu_vs_context *draw,
+			      struct spu_exec_machine *machine,
+			      const unsigned *elts,
+			      unsigned count )
+{
+   const unsigned *pitch = draw->vertex_fetch.pitch;
+   const ubyte **src = draw->vertex_fetch.src_ptr;
+   unsigned attr, i;
+
+   assert(count <= 4);
+
+   for (attr = 0; attr < 3; attr++) {
+      float *out = &machine->Inputs[attr].xyzw[0].f[0];
+
+      for (i = 0; i < count; i++) {
+         const float *in = (const float *)(src[attr] + elts[i] * pitch[attr]);
+         out[0 + i] = in[0];
+         out[4 + i] = in[1];
+         /* attribute 2 has only two components: z defaults to 0 */
+         out[8 + i] = (attr < 2) ? in[2] : 0.0f;
+         out[12 + i] = 1.0f;
+      }
+
+      /* Zero the lanes of vertices that are not part of this batch. */
+      for (; i < 4; i++) {
+         out[0 + i] = out[4 + i] = out[8 + i] = out[12 + i] = 0.0f;
+      }
+   }
+}
+
+
+
+
+/**
+ * Fetch vertex attributes for 'count' vertices.
+ */
+static void generic_vertex_fetch( struct spu_vs_context *draw,
+				  struct spu_exec_machine *machine,
+				  const unsigned *elts,
+				  unsigned count )
+{
+   unsigned nr_attrs = draw->vertex_fetch.nr_attrs;
+   unsigned attr;
+
+   assert(count <= 4);
+
+//   _mesa_printf("%s %d\n", __FUNCTION__, count);
+
+   /* loop over vertex attributes (vertex shader inputs)
+    */
+   for (attr = 0; attr < nr_attrs; attr++) {
+
+      const unsigned pitch   = draw->vertex_fetch.pitch[attr];   /* byte step per vertex index */
+      const ubyte *src = draw->vertex_fetch.src_ptr[attr];
+      const spu_fetch_func fetch = draw->vertex_fetch.fetch[attr];
+      unsigned i;
+      float p[4][4];   /* p[vertex][component], one row per fetch() call */
+      /* NOTE(review): fetch is NULL when get_fetch_func() found no routine
+       * for the format, and it is called below without a check. */
+      /* Fetch this attribute as float[4] for each of the 'count' vertices.
+       *
+       * Could fetch directly into AOS format, but this is meant to be
+       * a prototype for an sse implementation, which would have
+       * difficulties doing that.
+       */
+      for (i = 0; i < count; i++) 
+	 fetch( src + elts[i] * pitch, p[i] );
+
+      /* Be nice and zero out any missing vertices: 
+       */
+      for (/* empty */; i < 4; i++) 
+	 p[i][0] = p[i][1] = p[i][2] = p[i][3] = 0;
+      
+      /* Transpose/swizzle into sse-friendly format.  Currently
+       * assuming that all vertex shader inputs are float[4], but this
+       * isn't true -- if the vertex shader only wants tex0.xy, we
+       * could optimize for that.
+       *
+       * To do so fully without codegen would probably require an
+       * excessive number of fetch functions, but we could at least
+       * minimize the transpose step:
+       */
+      transpose_4x4( (float *)&machine->Inputs[attr].xyzw[0].f[0], (float *)p );
+   }
+}
+
+
+void spu_update_vertex_fetch( struct spu_vs_context *draw )
+{
+   unsigned i;
+
+   /* Look up a conversion routine for the format of every attribute. */
+   for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) {
+      draw->vertex_fetch.fetch[i] =
+          get_fetch_func(draw->vertex_fetch.format[i]);
+   }
+
+   draw->vertex_fetch.fetch_func = generic_vertex_fetch;
+
+   switch (draw->vertex_fetch.nr_attrs) {
+   case 2:
+      if (draw->vertex_fetch.format[0] == PIPE_FORMAT_R32G32B32_FLOAT &&
+          draw->vertex_fetch.format[1] == PIPE_FORMAT_R32G32B32_FLOAT)
+          draw->vertex_fetch.fetch_func = fetch_xyz_rgb;   /* two float3 attributes */
+      break;   /* any other pair keeps the generic path */
+   case 3:
+      if (draw->vertex_fetch.format[0] == PIPE_FORMAT_R32G32B32_FLOAT &&
+          draw->vertex_fetch.format[1] == PIPE_FORMAT_R32G32B32_FLOAT &&
+          draw->vertex_fetch.format[2] == PIPE_FORMAT_R32G32_FLOAT)
+          draw->vertex_fetch.fetch_func = fetch_xyz_rgb_st;   /* float3, float3, float2 */
+      break;
+   default:
+      break;
+   }
+}
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.c b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
new file mode 100644
index 0000000000..e694ff729f
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
@@ -0,0 +1,224 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  *   Brian Paul
+  *   Ian Romanick <idr@us.ibm.com>
+  */
+
+#include "pipe/p_util.h"
+#include "pipe/p_state.h"
+#include "pipe/p_shader_tokens.h"
+#include "spu_vertex_shader.h"
+#include "spu_exec.h"
+#include "pipe/draw/draw_private.h"
+#include "pipe/draw/draw_context.h"
+#include "pipe/cell/common.h"
+
+#define DBG_VS 0
+
+
+static INLINE unsigned
+compute_clipmask(const float *clip, /*const*/ float plane[][4], unsigned nr)
+{
+   const float x = clip[0], y = clip[1], z = clip[2], w = clip[3];
+   unsigned bits = 0;
+   unsigned p;
+
+   /* The six fixed planes come first, one mask bit each. */
+   if (-x + w < 0) bits |= CLIP_RIGHT_BIT;
+   if ( x + w < 0) bits |= CLIP_LEFT_BIT;
+   if (-y + w < 0) bits |= CLIP_TOP_BIT;
+   if ( y + w < 0) bits |= CLIP_BOTTOM_BIT;
+   if (-z + w < 0) bits |= CLIP_FAR_BIT;
+   if ( z + w < 0) bits |= CLIP_NEAR_BIT;
+
+   /* Planes 6 .. nr-1 are tested with a dot product and set bit p. */
+   p = 6;
+   while (p < nr) {
+      if (dot4(clip, plane[p]) < 0)
+         bits |= (1<<p);
+      p++;
+   }
+   return bits;
+}
+
+
+/**
+ * Transform vertices with the current vertex program/shader
+ * Up to four vertices can be shaded at a time.
+ * \param draw  vertex shader context (viewport, clip planes, machine)
+ * \param elts  indexes of four input vertices
+ * \param count  number of vertices to shade [1..4]
+ * \param vOut  array of pointers to four output vertices
+ */
+static void
+run_vertex_program(struct spu_vs_context *draw,
+                   unsigned elts[4], unsigned count,
+                   struct vertex_header *vOut[])
+{
+   struct spu_exec_machine *machine = &draw->machine;
+   unsigned int j;
+
+   ALIGN16_DECL(struct spu_exec_vector, inputs, PIPE_ATTRIB_MAX);
+   ALIGN16_DECL(struct spu_exec_vector, outputs, PIPE_ATTRIB_MAX);
+   const float *scale = draw->viewport.scale;
+   const float *trans = draw->viewport.translate;
+
+   assert(count <= 4);
+
+   /* The constants pointer is checked with ASSERT_ALIGN16 before use. */
+   ASSERT_ALIGN16(draw->constants);
+   machine->Consts = (float (*)[4]) draw->constants;
+
+   machine->Inputs = ALIGN16_ASSIGN(inputs);
+   machine->Outputs = ALIGN16_ASSIGN(outputs);
+
+   spu_vertex_fetch( draw, machine, elts, count );
+
+   /* run shader */
+   spu_exec_machine_run( machine );
+
+
+   /* store machine results */
+   for (j = 0; j < count; j++) {
+      unsigned slot;
+      float x, y, z, w;
+
+      /* Handle attr[0] (position) specially:
+       *
+       * XXX: Computing the clipmask should be done in the vertex
+       * program as a set of DP4 instructions appended to the
+       * user-provided code.
+       */
+      x = vOut[j]->clip[0] = machine->Outputs[0].xyzw[0].f[j];
+      y = vOut[j]->clip[1] = machine->Outputs[0].xyzw[1].f[j];
+      z = vOut[j]->clip[2] = machine->Outputs[0].xyzw[2].f[j];
+      w = vOut[j]->clip[3] = machine->Outputs[0].xyzw[3].f[j];
+
+      vOut[j]->clipmask = compute_clipmask(vOut[j]->clip, draw->plane,
+					   draw->nr_planes);
+      vOut[j]->edgeflag = 1;
+
+      /* divide by w */
+      w = 1.0f / w;
+      x *= w;
+      y *= w;
+      z *= w;
+
+      /* Viewport mapping */
+      vOut[j]->data[0][0] = x * scale[0] + trans[0];
+      vOut[j]->data[0][1] = y * scale[1] + trans[1];
+      vOut[j]->data[0][2] = z * scale[2] + trans[2];
+      vOut[j]->data[0][3] = w;
+
+#if DBG_VS
+      printf("output[%d]win: %f %f %f %f\n", j,
+             vOut[j]->data[0][0],
+             vOut[j]->data[0][1],
+             vOut[j]->data[0][2],
+             vOut[j]->data[0][3]);
+#endif
+      /* Remaining attributes are packed into sequential post-transform
+       * vertex attrib slots.
+       */
+      for (slot = 1; slot < draw->num_vs_outputs; slot++) {
+         vOut[j]->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
+         vOut[j]->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
+         vOut[j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
+         vOut[j]->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
+#if DBG_VS
+         printf("output[%d][%d]: %f %f %f %f\n", j, slot,
+                vOut[j]->data[slot][0],
+                vOut[j]->data[slot][1],
+                vOut[j]->data[slot][2],
+                vOut[j]->data[slot][3]);
+#endif
+      }
+   } /* loop over vertices */
+}
+
+
+/* Bind uniforms, clip planes and output count.  'planes' may be NULL
+ * (the caller here passes NULL, -1): then no user planes are active. */
+static void
+spu_bind_vertex_shader(struct spu_vs_context *draw,
+		       void *uniforms,
+		       void *planes,
+		       unsigned nr_planes,
+		       unsigned num_outputs)
+{
+   if (!planes || nr_planes > sizeof(draw->plane) / sizeof(draw->plane[0]))
+      nr_planes = 6;   /* compute_clipmask() dots planes 6..nr_planes-1 only */
+   else
+      (void) memcpy(draw->plane, planes, sizeof(draw->plane[0]) * nr_planes);
+   draw->constants = (float (*)[4]) uniforms;
+   draw->nr_planes = nr_planes;
+   draw->num_vs_outputs = num_outputs;
+   /* specify the shader to interpret/execute */
+   spu_exec_machine_init(&draw->machine, PIPE_MAX_SAMPLERS,
+			 NULL /*samplers*/, PIPE_SHADER_VERTEX);
+}
+
+
+/* Shade the vertices indexed by vs->elts in batches of up to four. */
+void
+spu_execute_vertex_shader(struct spu_vs_context *draw,
+			  const struct cell_command_vs *vs)
+{
+   unsigned i, j;
+
+   draw->machine.Instructions = (struct tgsi_full_instruction *)
+       vs->shader.instructions;
+   draw->machine.NumInstructions = vs->shader.num_instructions;
+   draw->machine.Declarations = (struct tgsi_full_declaration *)
+       vs->shader.declarations;
+   draw->machine.NumDeclarations = vs->shader.num_declarations;
+
+   /* No user clip planes: pass a zero count; -1 as an unsigned count
+    * would size an enormous copy from the NULL plane pointer. */
+   spu_bind_vertex_shader(draw, vs->shader.uniforms, NULL, 0,
+			  vs->shader.num_outputs);
+   for (i = 0; i < vs->num_elts; i += 4) {
+      const unsigned batch_size = MIN2(vs->num_elts - i, 4);
+      unsigned elts[4] = { 0, 0, 0, 0 };   /* unused slots stay defined */
+
+      for (j = 0; j < batch_size; j++) {
+	 switch (vs->bytes_per_elt) {
+	 case 1: elts[j] = ((unsigned char *) vs->elts)[i + j]; break;
+	 case 2: elts[j] = ((unsigned short *)vs->elts)[i + j]; break;
+	 case 4: elts[j] = ((unsigned int *)  vs->elts)[i + j]; break;
+	 default: assert(0); break;   /* unsupported index size */
+	 }
+      }
+      /* NOTE(review): the same vOut is passed for every batch -- confirm. */
+      run_vertex_program(draw, elts, batch_size,
+			 (struct vertex_header (*)[]) vs->vOut);
+   }
+}
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.h b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
new file mode 100644
index 0000000000..c52f38fd02
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
@@ -0,0 +1,61 @@
+#ifndef SPU_VERTEX_SHADER_H
+#define SPU_VERTEX_SHADER_H
+
+#include "pipe/p_format.h"
+#include "spu_exec.h"
+
+struct spu_vs_context;
+
+typedef void (*spu_fetch_func)(const void *ptr, float *attrib);
+typedef void (*spu_full_fetch_func)( struct spu_vs_context *draw,
+				     struct spu_exec_machine *machine,
+				     const unsigned *elts,
+				     unsigned count );
+
+struct spu_vs_context {
+   struct pipe_viewport_state viewport;   /* scale/translate applied after the divide by w */
+
+   struct {
+      const ubyte *src_ptr[PIPE_ATTRIB_MAX];   /* base address of each attribute */
+      unsigned pitch[PIPE_ATTRIB_MAX];   /* byte step per vertex index */
+      enum pipe_format format[PIPE_ATTRIB_MAX];   /* selects the entry of fetch[] */
+      unsigned nr_attrs;   /* number of attributes fetched per vertex */
+      boolean dirty;   /* when set, spu_vertex_fetch() calls spu_update_vertex_fetch() */
+
+      spu_fetch_func fetch[PIPE_ATTRIB_MAX];   /* per-attribute conversion routines */
+      spu_full_fetch_func fetch_func;   /* fetches all attributes of one batch */
+   } vertex_fetch;
+   
+   /* Clip derived state:
+    */
+   float plane[12][4];   /* rows 6 and up are dotted with the clip position */
+   unsigned nr_planes;   /* end of the plane loop, which starts at row 6 */
+
+   struct spu_exec_machine machine;
+   const float (*constants)[4];   /* installed as machine->Consts for each run */
+
+   unsigned num_vs_outputs;   /* output slots copied to each shaded vertex */
+};
+
+extern void spu_update_vertex_fetch(struct spu_vs_context *draw);
+
+/* Rebuild the fetch function pointers if marked dirty, then fetch. */
+static INLINE void spu_vertex_fetch(struct spu_vs_context *draw,
+				    struct spu_exec_machine *machine,
+				    const unsigned *elts, unsigned count)
+{
+   if (draw->vertex_fetch.dirty) {
+      spu_update_vertex_fetch(draw);
+      draw->vertex_fetch.dirty = 0;
+   }
+
+   draw->vertex_fetch.fetch_func(draw, machine, elts, count);
+}
+
+struct cell_command_vs;
+
+extern void
+spu_execute_vertex_shader(struct spu_vs_context *draw,
+			  const struct cell_command_vs *vs);
+
+#endif /* SPU_VERTEX_SHADER_H */
-- 
cgit v1.2.3


From 33cac4824195337d9cf3dfda3fc1147c429ae43c Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Tue, 29 Jan 2008 10:56:53 -0800
Subject: Initial pass at instruction / declaration fetch

---
 src/mesa/pipe/cell/spu/spu_exec.c | 22 ++++++++++++++++++++++
 src/mesa/pipe/cell/spu/spu_main.h |  1 +
 2 files changed, 23 insertions(+)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_exec.c b/src/mesa/pipe/cell/spu/spu_exec.c
index 6888e97caf..f43278198e 100644
--- a/src/mesa/pipe/cell/spu/spu_exec.c
+++ b/src/mesa/pipe/cell/spu/spu_exec.c
@@ -50,6 +50,9 @@
  *   Brian Paul
  */
 
+#include <libmisc.h>
+#include <spu_mfcio.h>
+
 #include "pipe/p_compiler.h"
 #include "pipe/p_state.h"
 #include "pipe/p_util.h"
@@ -57,6 +60,7 @@
 #include "pipe/tgsi/util/tgsi_parse.h"
 #include "pipe/tgsi/util/tgsi_util.h"
 #include "spu_exec.h"
+#include "spu_main.h"
 
 #define TILE_TOP_LEFT     0
 #define TILE_TOP_RIGHT    1
@@ -2329,12 +2333,30 @@ spu_exec_machine_run( struct spu_exec_machine *mach )
 
    /* execute declarations (interpolants) */
    for (i = 0; i < mach->NumDeclarations; i++) {
+      uint8_t buffer[sizeof(struct tgsi_full_declaration) + 32] ALIGN16_ATTRIB;
+      struct tgsi_full_declaration decl;
+      unsigned long decl_addr = (unsigned long) (mach->Declarations+i);
+      unsigned size = ((sizeof(decl) + (decl_addr & 0x0f) + 0x0f) & ~0x0f);
+
+      mfc_get(buffer, decl_addr & ~0x0f, size, TAG_INSTRUCTION_FETCH, 0, 0);
+      wait_on_mask(1 << TAG_INSTRUCTION_FETCH);
+
+      memcpy(& decl, buffer + (decl_addr & 0x0f), sizeof(decl));
       exec_declaration( mach, mach->Declarations+i );
    }
 
    /* execute instructions, until pc is set to -1 */
    while (pc != -1) {
+      uint8_t buffer[sizeof(struct tgsi_full_instruction) + 32] ALIGN16_ATTRIB;
+      struct tgsi_full_instruction inst;
+      unsigned long inst_addr = (unsigned long) (mach->Instructions + pc);
+      unsigned size = ((sizeof(inst) + (inst_addr & 0x0f) + 0x0f) & ~0x0f);
+
       assert(pc < mach->NumInstructions);
+      mfc_get(buffer, inst_addr & ~0x0f, size, TAG_INSTRUCTION_FETCH, 0, 0);
+      wait_on_mask(1 << TAG_INSTRUCTION_FETCH);
+
+      memcpy(& inst, buffer + (inst_addr & 0x0f), sizeof(inst));
       exec_instruction( mach, mach->Instructions + pc, &pc );
    }
 
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index 73f9ed29d6..8be5268f52 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -96,6 +96,7 @@ extern boolean Debug;
 #define TAG_BATCH_BUFFER      17
 #define TAG_MISC              18
 #define TAG_TEXTURE_TILE      19
+#define TAG_INSTRUCTION_FETCH 20
 
 
-- 
cgit v1.2.3


From 13eec106881b846538bef13d694c9d2d9cf1ae6b Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Tue, 29 Jan 2008 11:28:06 -0800
Subject: Implement vertex fetch / vertex shader output write-back

---
 src/mesa/pipe/cell/spu/spu_vertex_fetch.c  | 32 +++++++++++----
 src/mesa/pipe/cell/spu/spu_vertex_shader.c | 62 +++++++++++++++---------------
 src/mesa/pipe/draw/draw_context.c          |  5 ++-
 3 files changed, 58 insertions(+), 41 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
index b8f8c52eed..0192227d57 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -30,11 +30,13 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
+#include <spu_mfcio.h>
 #include "pipe/p_util.h"
 #include "pipe/p_state.h"
 #include "pipe/p_shader_tokens.h"
 #include "spu_exec.h"
 #include "spu_vertex_shader.h"
+#include "spu_main.h"
 
 
 #define DRAW_DBG 0
@@ -412,16 +414,18 @@ static void fetch_xyz_rgb_st( struct spu_vs_context *draw,
 /**
  * Fetch vertex attributes for 'count' vertices.
  */
-static void generic_vertex_fetch( struct spu_vs_context *draw,
-				  struct spu_exec_machine *machine,
-				  const unsigned *elts,
-				  unsigned count )
+static void generic_vertex_fetch(struct spu_vs_context *draw,
+                                 struct spu_exec_machine *machine,
+                                 const unsigned *elts,
+                                 unsigned count)
 {
    unsigned nr_attrs = draw->vertex_fetch.nr_attrs;
    unsigned attr;
 
    assert(count <= 4);
 
+   wait_on_mask(1 << TAG_VERTEX_BUFFER);
+
 //   _mesa_printf("%s %d\n", __FUNCTION__, count);
 
    /* loop over vertex attributes (vertex shader inputs)
@@ -441,13 +445,23 @@ static void generic_vertex_fetch( struct spu_vs_context *draw,
        * a prototype for an sse implementation, which would have
        * difficulties doing that.
        */
-      for (i = 0; i < count; i++) 
-	 fetch( src + elts[i] * pitch, p[i] );
+      for (i = 0; i < count; i++) {
+         uint8_t buffer[32 + (sizeof(float) * 4)] ALIGN16_ATTRIB;
+         const unsigned long addr = src + elts[i] * pitch;
+         const unsigned size = (sizeof(float) * 4) + (addr & 0x0f);
+
+         mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0);
+         wait_on_mask(1 << TAG_VERTEX_BUFFER);
+
+         memcpy(& buffer, buffer + (addr & 0x0f), sizeof(float) * 4);
+
+         fetch(buffer, p[i]);
+      }
 
       /* Be nice and zero out any missing vertices: 
        */
       for (/* empty */; i < 4; i++) 
-	 p[i][0] = p[i][1] = p[i][2] = p[i][3] = 0;
+          p[i][0] = p[i][1] = p[i][2] = p[i][3] = 0;
       
       /* Transpose/swizzle into sse-friendly format.  Currently
        * assuming that all vertex shader inputs are float[4], but this
@@ -475,6 +489,9 @@ void spu_update_vertex_fetch( struct spu_vs_context *draw )
 
    draw->vertex_fetch.fetch_func = generic_vertex_fetch;
 
+   /* Disable the fast path because they don't use mfc_get yet.
+    */
+#if 0
    switch (draw->vertex_fetch.nr_attrs) {
    case 2:
       if (draw->vertex_fetch.format[0] == PIPE_FORMAT_R32G32B32_FLOAT &&
@@ -490,4 +507,5 @@ void spu_update_vertex_fetch( struct spu_vs_context *draw )
    default:
       break;
    }
+#endif
 }
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.c b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
index e694ff729f..595f54b0eb 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_shader.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
@@ -32,6 +32,8 @@
   *   Ian Romanick <idr@us.ibm.com>
   */
 
+#include <spu_mfcio.h>
+
 #include "pipe/p_util.h"
 #include "pipe/p_state.h"
 #include "pipe/p_shader_tokens.h"
@@ -40,9 +42,7 @@
 #include "pipe/draw/draw_private.h"
 #include "pipe/draw/draw_context.h"
 #include "pipe/cell/common.h"
-
-#define DBG_VS 0
-
+#include "spu_main.h"
 
 static INLINE unsigned
 compute_clipmask(const float *clip, /*const*/ float plane[][4], unsigned nr)
@@ -110,6 +110,12 @@ run_vertex_program(struct spu_vs_context *draw,
    for (j = 0; j < count; j++) {
       unsigned slot;
       float x, y, z, w;
+      unsigned char buffer[sizeof(struct vertex_header)
+			   + MAX_VERTEX_SIZE] ALIGN16_ATTRIB;
+      struct vertex_header *const tmpOut =
+	  (struct vertex_header *) buffer;
+      const unsigned vert_size = sizeof(struct vertex_header)
+	  + (sizeof(float) * 4 * draw->num_vs_outputs);
 
       /* Handle attr[0] (position) specially:
        *
@@ -117,14 +123,14 @@ run_vertex_program(struct spu_vs_context *draw,
        * program as a set of DP4 instructions appended to the
        * user-provided code.
        */
-      x = vOut[j]->clip[0] = machine->Outputs[0].xyzw[0].f[j];
-      y = vOut[j]->clip[1] = machine->Outputs[0].xyzw[1].f[j];
-      z = vOut[j]->clip[2] = machine->Outputs[0].xyzw[2].f[j];
-      w = vOut[j]->clip[3] = machine->Outputs[0].xyzw[3].f[j];
+      x = tmpOut->clip[0] = machine->Outputs[0].xyzw[0].f[j];
+      y = tmpOut->clip[1] = machine->Outputs[0].xyzw[1].f[j];
+      z = tmpOut->clip[2] = machine->Outputs[0].xyzw[2].f[j];
+      w = tmpOut->clip[3] = machine->Outputs[0].xyzw[3].f[j];
 
-      vOut[j]->clipmask = compute_clipmask(vOut[j]->clip, draw->plane,
+      tmpOut->clipmask = compute_clipmask(tmpOut->clip, draw->plane,
 					   draw->nr_planes);
-      vOut[j]->edgeflag = 1;
+      tmpOut->edgeflag = 1;
 
       /* divide by w */
       w = 1.0f / w;
@@ -133,35 +139,27 @@ run_vertex_program(struct spu_vs_context *draw,
       z *= w;
 
       /* Viewport mapping */
-      vOut[j]->data[0][0] = x * scale[0] + trans[0];
-      vOut[j]->data[0][1] = y * scale[1] + trans[1];
-      vOut[j]->data[0][2] = z * scale[2] + trans[2];
-      vOut[j]->data[0][3] = w;
-
-#if DBG_VS
-      printf("output[%d]win: %f %f %f %f\n", j,
-             vOut[j]->data[0][0],
-             vOut[j]->data[0][1],
-             vOut[j]->data[0][2],
-             vOut[j]->data[0][3]);
-#endif
+      tmpOut->data[0][0] = x * scale[0] + trans[0];
+      tmpOut->data[0][1] = y * scale[1] + trans[1];
+      tmpOut->data[0][2] = z * scale[2] + trans[2];
+      tmpOut->data[0][3] = w;
+
       /* Remaining attributes are packed into sequential post-transform
        * vertex attrib slots.
        */
       for (slot = 1; slot < draw->num_vs_outputs; slot++) {
-         vOut[j]->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
-         vOut[j]->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
-         vOut[j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
-         vOut[j]->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
-#if DBG_VS
-         printf("output[%d][%d]: %f %f %f %f\n", j, slot,
-                vOut[j]->data[slot][0],
-                vOut[j]->data[slot][1],
-                vOut[j]->data[slot][2],
-                vOut[j]->data[slot][3]);
-#endif
+         tmpOut->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
+         tmpOut->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
+         tmpOut->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
+         tmpOut->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
       }
+
+      wait_on_mask(1 << TAG_VERTEX_BUFFER);
+      mfc_put(tmpOut, vOut[j], vert_size, TAG_VERTEX_BUFFER, 0, 0);
+
    } /* loop over vertices */
+
+   wait_on_mask(1 << TAG_VERTEX_BUFFER);
 }
 
 
diff --git a/src/mesa/pipe/draw/draw_context.c b/src/mesa/pipe/draw/draw_context.c
index e8ca1f035b..711bcd02f6 100644
--- a/src/mesa/pipe/draw/draw_context.c
+++ b/src/mesa/pipe/draw/draw_context.c
@@ -71,10 +71,11 @@ struct draw_context *draw_create( void )
     */
    {
       uint i;
-      char *tmp = (char*) MALLOC( Elements(draw->vcache.vertex) * MAX_VERTEX_SIZE );
+      const unsigned size = (MAX_VERTEX_SIZE + 0x0f) & ~0x0f;
+      char *tmp = align_malloc(Elements(draw->vcache.vertex) * size, 16);
 
       for (i = 0; i < Elements(draw->vcache.vertex); i++)
-	 draw->vcache.vertex[i] = (struct vertex_header *)(tmp + i * MAX_VERTEX_SIZE);
+	 draw->vcache.vertex[i] = (struct vertex_header *)(tmp + i * size);
    }
 
    draw->convert_wide_points = TRUE;
-- 
cgit v1.2.3


From 3d13605ee5fc92a1e3d82f1dbbcb8342066d8af0 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Tue, 29 Jan 2008 11:43:04 -0800
Subject: Provide mechanism to hook in custom vertex shader cache flush
 function

---
 src/mesa/pipe/draw/draw_context.c | 2 ++
 src/mesa/pipe/draw/draw_prim.c    | 2 +-
 src/mesa/pipe/draw/draw_private.h | 5 +++++
 3 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/mesa/pipe/draw/draw_context.c b/src/mesa/pipe/draw/draw_context.c
index 711bcd02f6..87f4969983 100644
--- a/src/mesa/pipe/draw/draw_context.c
+++ b/src/mesa/pipe/draw/draw_context.c
@@ -78,6 +78,8 @@ struct draw_context *draw_create( void )
 	 draw->vcache.vertex[i] = (struct vertex_header *)(tmp + i * size);
    }
 
+   draw->shader_queue_flush = draw_vertex_shader_queue_flush;
+
    draw->convert_wide_points = TRUE;
    draw->convert_wide_lines = TRUE;
 
diff --git a/src/mesa/pipe/draw/draw_prim.c b/src/mesa/pipe/draw/draw_prim.c
index 41b3fddcc1..58400213d7 100644
--- a/src/mesa/pipe/draw/draw_prim.c
+++ b/src/mesa/pipe/draw/draw_prim.c
@@ -127,7 +127,7 @@ void draw_do_flush( struct draw_context *draw, unsigned flags )
 
    if (flags >= DRAW_FLUSH_SHADER_QUEUE) {
       if (draw->vs.queue_nr)
-	 draw_vertex_shader_queue_flush(draw);
+         (*draw->shader_queue_flush)(draw);
 
       if (flags >= DRAW_FLUSH_PRIM_QUEUE) {
 	 if (draw->pq.queue_nr)
diff --git a/src/mesa/pipe/draw/draw_private.h b/src/mesa/pipe/draw/draw_private.h
index 21de400676..fea6d94ed8 100644
--- a/src/mesa/pipe/draw/draw_private.h
+++ b/src/mesa/pipe/draw/draw_private.h
@@ -240,6 +240,11 @@ struct draw_context
       unsigned queue_nr;
    } vs;
 
+   /**
+    * Run the vertex shader on all vertices in the vertex queue.
+    */
+   void (*shader_queue_flush)(struct draw_context *draw);
+
    /* Prim pipeline queue:
     */
    struct {
-- 
cgit v1.2.3


From fcf944177325cdf8bf6e4f1b70296c19476e2375 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 10:43:23 -0800
Subject: Pass ptr to local memory copy instead of main memory to
 exec_instruction

This was essentially a cut-and-paste bug when the instruction fetcher
was added.  Also, the test for TGSI_PROCESSOR_FRAGMENT was moved
outside the loop for exec_declaration.
---
 src/mesa/pipe/cell/spu/spu_exec.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_exec.c b/src/mesa/pipe/cell/spu/spu_exec.c
index f43278198e..b3db6716d5 100644
--- a/src/mesa/pipe/cell/spu/spu_exec.c
+++ b/src/mesa/pipe/cell/spu/spu_exec.c
@@ -2332,17 +2332,19 @@ spu_exec_machine_run( struct spu_exec_machine *mach )
 
 
    /* execute declarations (interpolants) */
-   for (i = 0; i < mach->NumDeclarations; i++) {
-      uint8_t buffer[sizeof(struct tgsi_full_declaration) + 32] ALIGN16_ATTRIB;
-      struct tgsi_full_declaration decl;
-      unsigned long decl_addr = (unsigned long) (mach->Declarations+i);
-      unsigned size = ((sizeof(decl) + (decl_addr & 0x0f) + 0x0f) & ~0x0f);
+   if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
+      for (i = 0; i < mach->NumDeclarations; i++) {
+	 uint8_t buffer[sizeof(struct tgsi_full_declaration) + 32] ALIGN16_ATTRIB;
+	 struct tgsi_full_declaration decl;
+	 unsigned long decl_addr = (unsigned long) (mach->Declarations+i);
+	 unsigned size = ((sizeof(decl) + (decl_addr & 0x0f) + 0x0f) & ~0x0f);
 
-      mfc_get(buffer, decl_addr & ~0x0f, size, TAG_INSTRUCTION_FETCH, 0, 0);
-      wait_on_mask(1 << TAG_INSTRUCTION_FETCH);
+	 mfc_get(buffer, decl_addr & ~0x0f, size, TAG_INSTRUCTION_FETCH, 0, 0);
+	 wait_on_mask(1 << TAG_INSTRUCTION_FETCH);
 
-      memcpy(& decl, buffer + (decl_addr & 0x0f), sizeof(decl));
-      exec_declaration( mach, mach->Declarations+i );
+	 memcpy(& decl, buffer + (decl_addr & 0x0f), sizeof(decl));
+	 exec_declaration( mach, decl );
+      }
    }
 
    /* execute instructions, until pc is set to -1 */
@@ -2357,7 +2359,7 @@ spu_exec_machine_run( struct spu_exec_machine *mach )
       wait_on_mask(1 << TAG_INSTRUCTION_FETCH);
 
       memcpy(& inst, buffer + (inst_addr & 0x0f), sizeof(inst));
-      exec_instruction( mach, mach->Instructions + pc, &pc );
+      exec_instruction( mach, & inst, &pc );
    }
 
 #if 0
-- 
cgit v1.2.3


From a0a707342a353024271f09cd52bd955d8df310a8 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 10:46:55 -0800
Subject: Missing ampersand in previous commit.  Oops.

---
 src/mesa/pipe/cell/spu/spu_exec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_exec.c b/src/mesa/pipe/cell/spu/spu_exec.c
index b3db6716d5..85b5815cad 100644
--- a/src/mesa/pipe/cell/spu/spu_exec.c
+++ b/src/mesa/pipe/cell/spu/spu_exec.c
@@ -2343,7 +2343,7 @@ spu_exec_machine_run( struct spu_exec_machine *mach )
 	 wait_on_mask(1 << TAG_INSTRUCTION_FETCH);
 
 	 memcpy(& decl, buffer + (decl_addr & 0x0f), sizeof(decl));
-	 exec_declaration( mach, decl );
+	 exec_declaration( mach, &decl );
       }
    }
 
-- 
cgit v1.2.3


From 708d699e0cebb2dfbca7b6639ee5b177dc8c4c61 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 12:59:09 -0800
Subject: Fetch uniforms from main memory.

---
 src/mesa/pipe/cell/spu/spu_exec.c | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_exec.c b/src/mesa/pipe/cell/spu/spu_exec.c
index 85b5815cad..78f7d0962f 100644
--- a/src/mesa/pipe/cell/spu/spu_exec.c
+++ b/src/mesa/pipe/cell/spu/spu_exec.c
@@ -791,12 +791,23 @@ fetch_src_file_channel(
    case TGSI_EXTSWIZZLE_Z:
    case TGSI_EXTSWIZZLE_W:
       switch( file ) {
-      case TGSI_FILE_CONSTANT:
-         chan->f[0] = mach->Consts[index->i[0]][swizzle];
-         chan->f[1] = mach->Consts[index->i[1]][swizzle];
-         chan->f[2] = mach->Consts[index->i[2]][swizzle];
-         chan->f[3] = mach->Consts[index->i[3]][swizzle];
+      case TGSI_FILE_CONSTANT: {
+         unsigned char buffer[32] ALIGN16_ATTRIB;
+         unsigned i;
+
+         for (i = 0; i < 4; i++) {
+            const float *ptr = mach->Consts[index->i[i]];
+            const uint64_t addr = (uint64_t)(uintptr_t) ptr;
+            const unsigned size = ((addr & 0x0f) == 0) ? 16 : 32;
+
+            mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0);
+            wait_on_mask(1 << TAG_VERTEX_BUFFER);
+
+            (void) memcpy(& chan->f[i], &buffer[(addr & 0x0f) 
+                + (sizeof(float) * swizzle)], sizeof(float));
+         }
          break;
+      }
 
       case TGSI_FILE_INPUT:
          chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0];
-- 
cgit v1.2.3


From 7b27d9fd660c122fb2ec50007129d67e78814587 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 17:26:22 -0800
Subject: Fix size calculation in attribute fetch.

---
 src/mesa/pipe/cell/spu/spu_vertex_fetch.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
index 0192227d57..1e846868e3 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -446,14 +446,14 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
        * difficulties doing that.
        */
       for (i = 0; i < count; i++) {
-         uint8_t buffer[32 + (sizeof(float) * 4)] ALIGN16_ATTRIB;
-         const unsigned long addr = src + elts[i] * pitch;
-         const unsigned size = (sizeof(float) * 4) + (addr & 0x0f);
+         uint8_t buffer[32] ALIGN16_ATTRIB;
+         const unsigned long addr = src + (elts[i] * pitch);
+         const unsigned size = ((addr & 0x0f) == 0) ? 16 : 32;
 
          mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0);
          wait_on_mask(1 << TAG_VERTEX_BUFFER);
 
-         memcpy(& buffer, buffer + (addr & 0x0f), sizeof(float) * 4);
+         memmove(& buffer, buffer + (addr & 0x0f), 16);
 
          fetch(buffer, p[i]);
       }
-- 
cgit v1.2.3


From 334986114665df650649634b63184be6f1b9cd9b Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 17:28:48 -0800
Subject: Implement micro_pow and micro_sqrt

Unimplemented micro ops get assertions for now.
---
 src/mesa/pipe/cell/spu/spu_exec.c | 43 ++++++++++++++++++++++++++++-----------
 1 file changed, 31 insertions(+), 12 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_exec.c b/src/mesa/pipe/cell/spu/spu_exec.c
index 78f7d0962f..168bada3bb 100644
--- a/src/mesa/pipe/cell/spu/spu_exec.c
+++ b/src/mesa/pipe/cell/spu/spu_exec.c
@@ -52,6 +52,8 @@
 
 #include <libmisc.h>
 #include <spu_mfcio.h>
+#include <simdmath/sqrtf4.h>
+#include <simdmath/powf4.h>
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_state.h"
@@ -207,6 +209,7 @@ micro_ceil(
    union spu_exec_channel *dst,
    const union spu_exec_channel *src )
 {
+   ASSERT(0);
 #if 0
    dst->f[0] = (float) ceil( (double) src->f[0] );
    dst->f[1] = (float) ceil( (double) src->f[1] );
@@ -220,6 +223,7 @@ micro_cos(
    union spu_exec_channel *dst,
    const union spu_exec_channel *src )
 {
+   ASSERT(0);
 #if 0
    dst->f[0] = (float) cos( (double) src->f[0] );
    dst->f[1] = (float) cos( (double) src->f[1] );
@@ -307,6 +311,7 @@ micro_exp2(
    union spu_exec_channel *dst,
    const union spu_exec_channel *src)
 {
+   ASSERT(0);
 #if 0
    dst->f[0] = (float) pow( 2.0, (double) src->f[0] );
    dst->f[1] = (float) pow( 2.0, (double) src->f[1] );
@@ -342,6 +347,7 @@ micro_flr(
    union spu_exec_channel *dst,
    const union spu_exec_channel *src )
 {
+   ASSERT(0);
 #if 0
    dst->f[0] = (float) floor( (double) src->f[0] );
    dst->f[1] = (float) floor( (double) src->f[1] );
@@ -355,6 +361,7 @@ micro_frc(
    union spu_exec_channel *dst,
    const union spu_exec_channel *src )
 {
+   ASSERT(0);
 #if 0
    dst->f[0] = src->f[0] - (float) floor( (double) src->f[0] );
    dst->f[1] = src->f[1] - (float) floor( (double) src->f[1] );
@@ -393,6 +400,7 @@ micro_lg2(
    union spu_exec_channel *dst,
    const union spu_exec_channel *src )
 {
+   ASSERT(0);
 #if 0
    dst->f[0] = (float) log( (double) src->f[0] ) * 1.442695f;
    dst->f[1] = (float) log( (double) src->f[1] ) * 1.442695f;
@@ -649,12 +657,18 @@ micro_pow(
    const union spu_exec_channel *src0,
    const union spu_exec_channel *src1 )
 {
-#if 0
-   dst->f[0] = (float) pow( (double) src0->f[0], (double) src1->f[0] );
-   dst->f[1] = (float) pow( (double) src0->f[1], (double) src1->f[1] );
-   dst->f[2] = (float) pow( (double) src0->f[2], (double) src1->f[2] );
-   dst->f[3] = (float) pow( (double) src0->f[3], (double) src1->f[3] );
-#endif
+   vec_float4 s0 = (vec_float4) {
+      src0->f[0], src0->f[1], src0->f[2], src0->f[3]
+   };
+   vec_float4 s1 = (vec_float4) {
+      src1->f[0], src1->f[1], src1->f[2], src1->f[3]
+   };
+   vec_float4 d = _powf4(s0, s1);
+
+   dst->f[0] = spu_extract(d, 0);
+   dst->f[1] = spu_extract(d, 1);
+   dst->f[2] = spu_extract(d, 2);
+   dst->f[3] = spu_extract(d, 3);
 }
 
 static void
@@ -662,6 +676,7 @@ micro_rnd(
    union spu_exec_channel *dst,
    const union spu_exec_channel *src )
 {
+   ASSERT(0);
 #if 0
    dst->f[0] = (float) floor( (double) (src->f[0] + 0.5f) );
    dst->f[1] = (float) floor( (double) (src->f[1] + 0.5f) );
@@ -722,6 +737,7 @@ micro_sin(
    union spu_exec_channel *dst,
    const union spu_exec_channel *src )
 {
+   ASSERT(0);
 #if 0
    dst->f[0] = (float) sin( (double) src->f[0] );
    dst->f[1] = (float) sin( (double) src->f[1] );
@@ -734,12 +750,15 @@ static void
 micro_sqrt( union spu_exec_channel *dst,
             const union spu_exec_channel *src )
 {
-#if 0
-   dst->f[0] = (float) sqrt( (double) src->f[0] );
-   dst->f[1] = (float) sqrt( (double) src->f[1] );
-   dst->f[2] = (float) sqrt( (double) src->f[2] );
-   dst->f[3] = (float) sqrt( (double) src->f[3] );
-#endif
+   vec_float4 s = (vec_float4) {
+      src->f[0], src->f[1], src->f[2], src->f[3]
+   };
+   vec_float4 d = _sqrtf4(s);
+
+   dst->f[0] = spu_extract(d, 0);
+   dst->f[1] = spu_extract(d, 1);
+   dst->f[2] = spu_extract(d, 2);
+   dst->f[3] = spu_extract(d, 3);
 }
 
 static void
-- 
cgit v1.2.3


From 137cb72284a115d8f5ffadf2154b6f5eb5323a7d Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 19:24:40 -0800
Subject: Elts are always ints, pass vOut pointers in-line in command

---
 src/mesa/pipe/cell/common.h                |  6 +++---
 src/mesa/pipe/cell/spu/spu_vertex_shader.c | 14 ++------------
 2 files changed, 5 insertions(+), 15 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index 80a1425ec7..fbbdf728a1 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -144,13 +144,13 @@ struct cell_shader_info
 } ALIGN16_ATTRIB;
 
 
+#define SPU_VERTS_PER_BATCH 64
 struct cell_command_vs
 {
    struct cell_shader_info   shader;
-   void *elts;
    unsigned num_elts;
-   unsigned bytes_per_elt;
-   void *vOut;
+   unsigned elts[SPU_VERTS_PER_BATCH];
+   uint64_t vOut[SPU_VERTS_PER_BATCH];
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.c b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
index 595f54b0eb..82165501c5 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_shader.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
@@ -81,7 +81,7 @@ compute_clipmask(const float *clip, /*const*/ float plane[][4], unsigned nr)
 static void
 run_vertex_program(struct spu_vs_context *draw,
                    unsigned elts[4], unsigned count,
-                   struct vertex_header *vOut[])
+                   const uint64_t *vOut)
 {
    struct spu_exec_machine *machine = &draw->machine;
    unsigned int j;
@@ -206,17 +206,7 @@ spu_execute_vertex_shader(struct spu_vs_context *draw,
    
    for (i = 0; i < vs->num_elts; i += 4) {
       const unsigned batch_size = MIN2(vs->num_elts - i, 4);
-      unsigned elts[4];
-
-      for (j = 0; j < batch_size; j++) {
-	 switch (vs->bytes_per_elt) {
-	 case 1: elts[j] = ((unsigned char *) vs->elts)[i + j]; break;
-	 case 2: elts[j] = ((unsigned short *)vs->elts)[i + j]; break;
-	 case 4: elts[j] = ((unsigned int *)  vs->elts)[i + j]; break;
-	 }
-      }
 
-      run_vertex_program(draw, elts, batch_size,
-			 (struct vertex_header (*)[]) vs->vOut);
+      run_vertex_program(draw, & vs->elts[i], batch_size, &vs->vOut[i]);
    }
 }
-- 
cgit v1.2.3


From fb348c2cb16d0bc216d29889474972d5c14d0980 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 19:25:47 -0800
Subject: Set machine->Processor

The default value is 0, which is TGSI_PROCESSOR_FRAGMENT...not correct
for a vertex shader!
---
 src/mesa/pipe/cell/spu/spu_vertex_shader.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.c b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
index 82165501c5..125b2c3a43 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_shader.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
@@ -93,7 +93,8 @@ run_vertex_program(struct spu_vs_context *draw,
 
    assert(count <= 4);
 
-   /* Consts does not require 16 byte alignment. */
+   machine->Processor = TGSI_PROCESSOR_VERTEX;
+
    ASSERT_ALIGN16(draw->constants);
    machine->Consts = (float (*)[4]) draw->constants;
 
-- 
cgit v1.2.3


From 193491cbd3ad2ad95243181c201da4640f3a29c2 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 19:30:15 -0800
Subject: Handle CELL_CMD_VS_EXECUTE *only* outside batch commands.

---
 src/mesa/pipe/cell/common.h       | 3 ++-
 src/mesa/pipe/cell/spu/spu_main.c | 8 +++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index fbbdf728a1..a40cfb8210 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -133,7 +133,6 @@ struct cell_array_info
 
 struct cell_shader_info
 {
-   unsigned processor;
    unsigned num_outputs;
 
    void *declarations;
@@ -147,6 +146,7 @@ struct cell_shader_info
 #define SPU_VERTS_PER_BATCH 64
 struct cell_command_vs
 {
+   uint opcode;       /**< CELL_CMD_VS_EXECUTE */
    struct cell_shader_info   shader;
    unsigned num_elts;
    unsigned elts[SPU_VERTS_PER_BATCH];
@@ -190,6 +190,7 @@ struct cell_command
    struct cell_command_framebuffer fb;
    struct cell_command_clear_surface clear;
    struct cell_command_render render;
+   struct cell_command_vs vs;
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 9daa3ec735..7105c0f897 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -397,11 +397,6 @@ cmd_batch(uint opcode)
          cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
          pos += (1 + sizeof(struct cell_array_info) / 4);
          break;
-      case CELL_CMD_VS_EXECUTE:
-         spu_execute_vertex_shader(&draw,
-                                   (struct cell_command_vs *) &buffer[pos+1]);
-         pos += (1 + sizeof(struct cell_command_vs) / 4);
-         break;
       default:
          printf("SPU %u: bad opcode: 0x%x\n", spu.init.id, buffer[pos]);
          ASSERT(0);
@@ -470,6 +465,9 @@ main_loop(void)
             assert(pos_incr == 0);
          }
          break;
+      case CELL_CMD_VS_EXECUTE:
+         spu_execute_vertex_shader(&draw, &cmd.vs);
+         break;
       case CELL_CMD_BATCH:
          cmd_batch(opcode);
          break;
-- 
cgit v1.2.3


From 10270fbe2d362fe8f27384b9a5423381e2882460 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 19:33:30 -0800
Subject: Correctly read / write vertex header from / to main memory

---
 src/mesa/pipe/cell/spu/spu_vertex_shader.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.c b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
index 125b2c3a43..ea5ffae6bc 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_shader.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
@@ -112,11 +112,16 @@ run_vertex_program(struct spu_vs_context *draw,
       unsigned slot;
       float x, y, z, w;
       unsigned char buffer[sizeof(struct vertex_header)
-			   + MAX_VERTEX_SIZE] ALIGN16_ATTRIB;
+          + MAX_VERTEX_SIZE] ALIGN16_ATTRIB;
       struct vertex_header *const tmpOut =
-	  (struct vertex_header *) buffer;
-      const unsigned vert_size = sizeof(struct vertex_header)
-	  + (sizeof(float) * 4 * draw->num_vs_outputs);
+          (struct vertex_header *) buffer;
+      const unsigned vert_size = ROUNDUP16(sizeof(struct vertex_header)
+                                           + (sizeof(float) * 4 
+                                              * draw->num_vs_outputs));
+
+      mfc_get(tmpOut, vOut[j], vert_size, TAG_VERTEX_BUFFER, 0, 0);
+      wait_on_mask(1 << TAG_VERTEX_BUFFER);
+
 
       /* Handle attr[0] (position) specially:
        *
@@ -155,12 +160,8 @@ run_vertex_program(struct spu_vs_context *draw,
          tmpOut->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
       }
 
-      wait_on_mask(1 << TAG_VERTEX_BUFFER);
       mfc_put(tmpOut, vOut[j], vert_size, TAG_VERTEX_BUFFER, 0, 0);
-
    } /* loop over vertices */
-
-   wait_on_mask(1 << TAG_VERTEX_BUFFER);
 }
 
 
-- 
cgit v1.2.3


From de949a471ed66f0e6db0819bf55b2ec74b7e4048 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 19:34:22 -0800
Subject: cell_array_info should not be 16-byte aligned

Forcing cell_array_info to be 16-byte aligned makes it more difficult
to stuff that state in batch commands.
---
 src/mesa/pipe/cell/common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index a40cfb8210..533ad2cf6e 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -128,7 +128,7 @@ struct cell_array_info
     uint attr;                /**< Attribute that this state if for. */
     uint pitch;               /**< Byte pitch from one entry to the next. */
     enum pipe_format format;  /**< Pipe format of each entry. */
-} ALIGN16_ATTRIB;
+};
 
 
 struct cell_shader_info
-- 
cgit v1.2.3


From 9ad986b88763f6baefa73830dcd5762156ab9b20 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 19:40:24 -0800
Subject: Numerous small fixes to PPU-SPU vertex shader protocol

---
 src/mesa/pipe/cell/common.h                | 19 ++++++++++++-------
 src/mesa/pipe/cell/spu/spu_vertex_shader.c | 27 ++++++++++++++++++++++-----
 2 files changed, 34 insertions(+), 12 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index 533ad2cf6e..28b0c59a0a 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -124,10 +124,10 @@ struct cell_command_clear_surface
  */
 struct cell_array_info
 {
-    void *base;               /**< Base address of the 0th element. */
-    uint attr;                /**< Attribute that this state if for. */
-    uint pitch;               /**< Byte pitch from one entry to the next. */
-    enum pipe_format format;  /**< Pipe format of each entry. */
+    uint64_t base;      /**< Base address of the 0th element. */
+    uint attr;          /**< Attribute that this state if for. */
+    uint pitch;         /**< Byte pitch from one entry to the next. */
+    uint format;        /**< Pipe format of each entry. */
 };
 
 
@@ -135,11 +135,13 @@ struct cell_shader_info
 {
    unsigned num_outputs;
 
-   void *declarations;
+   uint64_t declarations;
    unsigned num_declarations;
-   void *instructions;
+   uint64_t instructions;
    unsigned num_instructions;
-   void *uniforms;
+   uint64_t uniforms;
+   uint64_t  immediates;
+   unsigned num_immediates;
 } ALIGN16_ATTRIB;
 
 
@@ -151,6 +153,9 @@ struct cell_command_vs
    unsigned num_elts;
    unsigned elts[SPU_VERTS_PER_BATCH];
    uint64_t vOut[SPU_VERTS_PER_BATCH];
+   float plane[12][4];
+   unsigned nr_planes;
+   unsigned nr_attrs;
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.c b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
index ea5ffae6bc..c1cbbb6d1e 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_shader.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
@@ -187,12 +187,22 @@ spu_bind_vertex_shader(struct spu_vs_context *draw,
 }
 
 
+unsigned char immediates[(sizeof(float) * 4 * TGSI_EXEC_NUM_IMMEDIATES) + 32]
+    ALIGN16_ATTRIB;
+
 void
 spu_execute_vertex_shader(struct spu_vs_context *draw,
-			  const struct cell_command_vs *vs)
+                          const struct cell_command_vs *vs)
 {
    unsigned i;
-   unsigned j;
+
+   const uint64_t immediate_addr = vs->shader.immediates;
+   const unsigned immediate_size = 
+       ROUNDUP16((sizeof(float) * 4 * vs->shader.num_immediates)
+                 + (immediate_addr & 0x0f));
+
+   mfc_get(immediates, immediate_addr & ~0x0f, immediate_size,
+           TAG_VERTEX_BUFFER, 0, 0);
 
    draw->machine.Instructions = (struct tgsi_full_instruction *)
        vs->shader.instructions;
@@ -202,10 +212,17 @@ spu_execute_vertex_shader(struct spu_vs_context *draw,
        vs->shader.declarations;
    draw->machine.NumDeclarations = vs->shader.num_declarations;
 
+   draw->vertex_fetch.nr_attrs = vs->nr_attrs;
+
+   wait_on_mask(1 << TAG_VERTEX_BUFFER);
+
+   (void) memcpy(& draw->machine.Imms, &immediates[immediate_addr & 0x0f],
+                 sizeof(float) * 4 * vs->shader.num_immediates);
+
    spu_bind_vertex_shader(draw, vs->shader.uniforms,
-			  NULL, -1,
-			  vs->shader.num_outputs);
-   
+                          vs->plane, vs->nr_planes,
+                          vs->shader.num_outputs);
+
    for (i = 0; i < vs->num_elts; i += 4) {
       const unsigned batch_size = MIN2(vs->num_elts - i, 4);
 
-- 
cgit v1.2.3


From a89ee8a96db1ac7674a5ae82e518ce5c380d9195 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 20:10:45 -0800
Subject: Add driver_private field for drivers that hook shader_queue_flush.

---
 src/mesa/pipe/draw/draw_private.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'src')

diff --git a/src/mesa/pipe/draw/draw_private.h b/src/mesa/pipe/draw/draw_private.h
index fea6d94ed8..7782db0477 100644
--- a/src/mesa/pipe/draw/draw_private.h
+++ b/src/mesa/pipe/draw/draw_private.h
@@ -259,6 +259,8 @@ struct draw_context
 #ifdef MESA_LLVM
    struct gallivm_cpu_engine *engine;
 #endif
+   
+   void *driver_private;
 };
 
 
-- 
cgit v1.2.3


From 5a6fd9393021b9476273b7831bcda2186c9324a1 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 20:12:00 -0800
Subject: Use SPUs for vertex shader processing

---
 src/mesa/pipe/cell/ppu/Makefile             |   1 +
 src/mesa/pipe/cell/ppu/cell_context.c       |  12 ++-
 src/mesa/pipe/cell/ppu/cell_context.h       |   2 +
 src/mesa/pipe/cell/ppu/cell_vertex_shader.c | 118 ++++++++++++++++++++++++++++
 4 files changed, 132 insertions(+), 1 deletion(-)
 create mode 100644 src/mesa/pipe/cell/ppu/cell_vertex_shader.c

(limited to 'src')

diff --git a/src/mesa/pipe/cell/ppu/Makefile b/src/mesa/pipe/cell/ppu/Makefile
index e7f2562da7..50060f5cd3 100644
--- a/src/mesa/pipe/cell/ppu/Makefile
+++ b/src/mesa/pipe/cell/ppu/Makefile
@@ -34,6 +34,7 @@ SOURCES = \
 	cell_surface.c \
 	cell_texture.c \
 	cell_vbuf.c \
+	cell_vertex_shader.c \
 	cell_winsys.c
 
 
diff --git a/src/mesa/pipe/cell/ppu/cell_context.c b/src/mesa/pipe/cell/ppu/cell_context.c
index e8020a49bc..4885cd0d2c 100644
--- a/src/mesa/pipe/cell/ppu/cell_context.c
+++ b/src/mesa/pipe/cell/ppu/cell_context.c
@@ -39,6 +39,7 @@
 #include "pipe/p_winsys.h"
 #include "pipe/cell/common.h"
 #include "pipe/draw/draw_context.h"
+#include "pipe/draw/draw_private.h"
 #include "cell_clear.h"
 #include "cell_context.h"
 #include "cell_draw_arrays.h"
@@ -156,6 +157,15 @@ cell_destroy_context( struct pipe_context *pipe )
 }
 
 
+static struct draw_context *
+cell_draw_create(struct cell_context *cell)
+{
+   struct draw_context *draw = draw_create();
+
+   draw->shader_queue_flush = cell_vertex_shader_queue_flush;
+   draw->driver_private = cell;
+   return draw;
+}
 
 
 struct pipe_context *
@@ -242,7 +252,7 @@ cell_create_context(struct pipe_winsys *winsys, struct cell_winsys *cws)
 
    cell_init_surface_functions(cell);
 
-   cell->draw = draw_create();
+   cell->draw = cell_draw_create(cell);
 
    cell_init_vbuf(cell);
    draw_set_rasterize_stage(cell->draw, cell->vbuf);
diff --git a/src/mesa/pipe/cell/ppu/cell_context.h b/src/mesa/pipe/cell/ppu/cell_context.h
index 65b89518ad..3b63419b5e 100644
--- a/src/mesa/pipe/cell/ppu/cell_context.h
+++ b/src/mesa/pipe/cell/ppu/cell_context.h
@@ -126,6 +126,8 @@ cell_context(struct pipe_context *pipe)
 extern struct pipe_context *
 cell_create_context(struct pipe_winsys *ws, struct cell_winsys *cws);
 
+extern void
+cell_vertex_shader_queue_flush(struct draw_context *draw);
 
 
diff --git a/src/mesa/pipe/cell/ppu/cell_vertex_shader.c b/src/mesa/pipe/cell/ppu/cell_vertex_shader.c
new file mode 100644
index 0000000000..aef329a902
--- /dev/null
+++ b/src/mesa/pipe/cell/ppu/cell_vertex_shader.c
@@ -0,0 +1,118 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file cell_vertex_shader.c
+ * Vertex shader interface routines for Cell.
+ *
+ * \author Ian Romanick <idr@us.ibm.com>
+ */
+
+#include "pipe/p_defines.h"
+#include "pipe/p_context.h"
+#include "pipe/p_winsys.h"
+
+#include "cell_context.h"
+#include "cell_draw_arrays.h"
+#include "cell_spu.h"
+#include "cell_batch.h"
+
+#include "pipe/cell/common.h"
+#include "pipe/draw/draw_context.h"
+#include "pipe/draw/draw_private.h"
+
+/**
+ * Run the vertex shader on all vertices in the vertex queue (via SPE 0).
+ * Called by the draw module when the vertex cache needs to be flushed.
+ */
+void
+cell_vertex_shader_queue_flush(struct draw_context *draw)
+{
+   struct cell_context *const cell =
+       (struct cell_context *) draw->driver_private;
+   struct cell_command_vs *const vs = &cell_global.command[0].vs;
+   unsigned *batch;
+   struct cell_array_info array_info;
+   unsigned i, j;
+
+   assert(draw->vs.queue_nr != 0);
+
+   /* XXX: do this on statechange rather than on every flush:
+    */
+   draw_update_vertex_fetch(draw);
+   /* Queue one CELL_CMD_STATE_VS_ARRAY_INFO record per fetched attribute. */
+   for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) {
+      array_info.opcode = CELL_CMD_STATE_VS_ARRAY_INFO;
+      assert(draw->vertex_fetch.src_ptr[i] != NULL);
+      array_info.base = (uintptr_t) draw->vertex_fetch.src_ptr[i];
+      array_info.attr = i;
+      array_info.pitch = draw->vertex_fetch.pitch[i];
+      array_info.format = draw->vertex_element[i].src_format;
+
+      cell_batch_append(cell, & array_info, sizeof(array_info));
+   }
+   /* Queue the viewport: one opcode word followed by the raw state. */
+   batch = cell_batch_alloc(cell, sizeof(unsigned)
+                            + sizeof(struct pipe_viewport_state));
+   batch[0] = CELL_CMD_STATE_VIEWPORT;
+   (void) memcpy(&batch[1], &draw->viewport,
+                 sizeof(struct pipe_viewport_state));
+
+   cell_batch_flush(cell);
+   /* Fill in the global VS command (slot 0) from the draw context. */
+   vs->opcode = CELL_CMD_VS_EXECUTE;
+   vs->shader.num_outputs = draw->num_vs_outputs;
+   vs->shader.declarations = (uintptr_t) draw->machine.Declarations;
+   vs->shader.num_declarations = draw->machine.NumDeclarations;
+   vs->shader.instructions = (uintptr_t) draw->machine.Instructions;
+   vs->shader.num_instructions = draw->machine.NumInstructions;
+   vs->shader.uniforms = (uintptr_t) draw->user.constants;
+   vs->shader.immediates = (uintptr_t) draw->machine.Imms;
+   vs->shader.num_immediates = draw->machine.ImmLimit / 4;
+   vs->nr_attrs = draw->vertex_fetch.nr_attrs;
+
+   (void) memcpy(vs->plane, draw->plane, sizeof(draw->plane));
+   vs->nr_planes = draw->nr_planes;
+   /* Execute in chunks of at most SPU_VERTS_PER_BATCH vertices. */
+   for (i = 0; i < draw->vs.queue_nr; i += SPU_VERTS_PER_BATCH) {
+      const unsigned n = MIN2(SPU_VERTS_PER_BATCH, draw->vs.queue_nr - i);
+
+      for (j = 0; j < n; j++) {
+         vs->elts[j] = draw->vs.queue[i + j].elt;
+         vs->vOut[j] = (uintptr_t) draw->vs.queue[i + j].dest;
+      }
+      /* Pad unused slots of a short final chunk with copies of entry 0. */
+      for (/* empty */; j < SPU_VERTS_PER_BATCH; j++) {
+         vs->elts[j] = vs->elts[0];
+         vs->vOut[j] = vs->vOut[0];
+      }
+
+      vs->num_elts = n;
+      send_mbox_message(cell_global.spe_contexts[0], CELL_CMD_VS_EXECUTE);
+      /* NOTE(review): presumably blocks until the SPE is done -- confirm. */
+      cell_flush_int(& cell->pipe, PIPE_FLUSH_WAIT);
+   }
+
+   draw->vs.queue_nr = 0;
+}
-- 
cgit v1.2.3


From 62d11b98c4a4904b56fab153407f49619d6d331d Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Wed, 30 Jan 2008 20:14:14 -0800
Subject: I don't know why using uint64_t for "base" doesn't work.  Ugh.

---
 src/mesa/pipe/cell/common.h       | 5 +++--
 src/mesa/pipe/cell/spu/spu_main.c | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index 28b0c59a0a..05aeed83ab 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -124,11 +124,12 @@ struct cell_command_clear_surface
  */
 struct cell_array_info
 {
-    uint64_t base;      /**< Base address of the 0th element. */
+    uint opcode;
+    uint base;          /**< Base address of the 0th element. */
     uint attr;          /**< Attribute that this state if for. */
     uint pitch;         /**< Byte pitch from one entry to the next. */
     uint format;        /**< Pipe format of each entry. */
-};
+} ALIGN16_ATTRIB;
 
 
 struct cell_shader_info
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 7105c0f897..d6393048f5 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -394,8 +394,8 @@ cmd_batch(uint opcode)
          pos += (1 + sizeof(struct pipe_viewport_state) / 4);
          break;
       case CELL_CMD_STATE_VS_ARRAY_INFO:
-         cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
-         pos += (1 + sizeof(struct cell_array_info) / 4);
+         cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos]);
+         pos += (sizeof(struct cell_array_info) / 4);
          break;
       default:
          printf("SPU %u: bad opcode: 0x%x\n", spu.init.id, buffer[pos]);
-- 
cgit v1.2.3


From c42e6254cffb8ef480868e9c1942f73129fc4f80 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Thu, 31 Jan 2008 13:14:35 +0900
Subject: gallium: Add SCons as alternative build system for Gallium.

---
 .gitignore                                   |   2 +
 SConstruct                                   | 214 +++++++++++++
 src/mesa/SConscript                          | 435 +++++++++++++++++++++++++++
 src/mesa/drivers/dri/SConscript              |  48 +++
 src/mesa/drivers/dri/intel_winsys/SConscript |  41 +++
 src/mesa/pipe/SConscript                     |   9 +
 src/mesa/pipe/i915simple/SConscript          |  29 ++
 src/mesa/pipe/i965simple/SConscript          |  55 ++++
 src/mesa/pipe/softpipe/SConscript            |  42 +++
 9 files changed, 875 insertions(+)
 create mode 100644 SConstruct
 create mode 100644 src/mesa/SConscript
 create mode 100644 src/mesa/drivers/dri/SConscript
 create mode 100644 src/mesa/drivers/dri/intel_winsys/SConscript
 create mode 100644 src/mesa/pipe/SConscript
 create mode 100644 src/mesa/pipe/i915simple/SConscript
 create mode 100644 src/mesa/pipe/i965simple/SConscript
 create mode 100644 src/mesa/pipe/softpipe/SConscript

(limited to 'src')

diff --git a/.gitignore b/.gitignore
index 033e6e10bd..bf50291fc1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,5 @@ depend
 depend.bak
 lib
 lib64
+.sconsign*
+config.py
diff --git a/SConstruct b/SConstruct
new file mode 100644
index 0000000000..db6161ed51
--- /dev/null
+++ b/SConstruct
@@ -0,0 +1,214 @@
+#######################################################################
+# Top-level SConstruct
+
+import os
+import sys
+
+
+#######################################################################
+# Configuration options
+#
+# For example, invoke scons as 
+#
+#   scons debug=1 dri=0 machine=x86
+#
+# to set configuration variables. Or you can write those options to a file
+# named config.py:
+#
+#   # config.py
+#   debug=1
+#   dri=0
+#   machine='x86'
+# 
+# Invoke
+#
+#   scons -h
+#
+# to get the full list of options. See scons manpage for more info.
+#  
+
+# TODO: auto-detect defaults
+opts = Options('config.py')
+opts.Add(BoolOption('debug', 'build debug version', False))
+opts.Add(BoolOption('dri', 'build dri drivers', False))
+opts.Add(EnumOption('machine', 'use machine-specific assembly code', 'x86',
+                     allowed_values=('generic', 'x86', 'x86-64')))
+
+env = Environment(options = opts)
+Help(opts.GenerateHelpText(env))
+
+# for debugging
+#print env.Dump()
+
+if 1:
+	# platform will be typically 'posix' or 'win32' 
+	platform = env['PLATFORM']
+else:
+	# platform will be one of 'linux', 'freebsd', 'win32', 'darwin', etc.
+	platform = sys.platform
+	if platform == 'linux2':
+		platform = 'linux' 
+
+# replicate options values in local variables
+debug = env['debug']
+dri = env['dri']
+machine = env['machine']
+
+# derived options
+x86 = machine == 'x86'
+gcc = platform == 'posix'
+msvc = platform == 'win32'
+
+Export([
+	'debug', 
+	'x86', 
+	'dri', 
+	'platform',
+	'gcc',
+	'msvc',
+])
+
+
+#######################################################################
+# Environment setup
+#
+# TODO: put the compiler specific settings in separate files
+# TODO: auto-detect as much as possible
+
+         
+# Optimization flags
+if gcc:
+	if debug:
+		env.Append(CFLAGS = '-O0 -g3')
+		env.Append(CXXFLAGS = '-O0 -g3')
+	else:
+		env.Append(CFLAGS = '-O3 -g3')
+		env.Append(CXXFLAGS = '-O3 -g3')
+
+	env.Append(CFLAGS = '-Wall -Wmissing-prototypes -std=c99 -ffast-math -pedantic')
+	env.Append(CXXFLAGS = '-Wall -pedantic')
+	
+	# Be nice to Eclipse
+	env.Append(CFLAGS = '-fmessage-length=0')
+	env.Append(CXXFLAGS = '-fmessage-length=0')
+
+# Defines
+env.Append(CPPDEFINES = [
+	'_POSIX_SOURCE',
+	('_POSIX_C_SOURCE', '199309L'), 
+	'_SVID_SOURCE',
+	'_BSD_SOURCE', 
+	'_GNU_SOURCE',
+	
+	'PTHREADS',
+	'HAVE_ALIAS', 
+	'HAVE_POSIX_MEMALIGN',
+])
+
+if debug:
+	env.Append(CPPDEFINES = ['DEBUG'])
+else:
+	env.Append(CPPDEFINES = ['NDEBUG'])
+
+
+# Includes
+env.Append(CPPPATH = [
+	'#/include',
+	'#/src/mesa',
+	'#/src/mesa/main',
+	'#/src/mesa/pipe',
+	
+	'/usr/X11R6/include',
+])
+
+
+# x86 assembly
+if x86:
+	env.Append(CPPDEFINES = [
+		'USE_X86_ASM', 
+		'USE_MMX_ASM',
+		'USE_3DNOW_ASM',
+		'USE_SSE_ASM',
+	])
+	if gcc:	
+		env.Append(CFLAGS = '-m32')
+		env.Append(CXXFLAGS = '-m32')
+
+env.Append(LIBPATH = ['/usr/X11R6/lib'])
+
+env.Append(LIBS = [
+	'm',
+	'pthread',
+	'expat',
+	'dl',
+])
+
+# DRI
+if dri:
+	env.ParseConfig('pkg-config --cflags --libs libdrm')
+	env.Append(CPPDEFINES = [
+		('USE_EXTERNAL_DXTN_LIB', '1'), 
+		'IN_DRI_DRIVER',
+		'GLX_DIRECT_RENDERING',
+		'GLX_INDIRECT_RENDERING',
+	])
+
+# libGL
+if 1:
+	env.Append(LIBS = [
+		'X11',
+		'Xext',
+		'Xxf86vm',
+		'Xdamage',
+		'Xfixes',
+	])
+
+Export('env')
+
+
+#######################################################################
+# Convenience Library Builder
+# based on the stock StaticLibrary and SharedLibrary builders
+
+def createConvenienceLibBuilder(env):
+    """Return the ConvenienceLibrary Builder of an Environment, creating and
+    registering it first if it is not there already.
+
+    When created, the same builder also replaces the 'Library' builder.
+    """
+
+    try:
+        convenience_lib = env['BUILDERS']['ConvenienceLibrary']
+    except KeyError:
+        action_list = [ Action("$ARCOM", "$ARCOMSTR") ]
+        if env.Detect('ranlib'):
+            ranlib_action = Action("$RANLIBCOM", "$RANLIBCOMSTR")
+            action_list.append(ranlib_action)
+        # Archive like a static library, but build sources as SharedObject.
+        convenience_lib = Builder(action = action_list,
+                                  emitter = '$LIBEMITTER',
+                                  prefix = '$LIBPREFIX',
+                                  suffix = '$LIBSUFFIX',
+                                  src_suffix = '$SHOBJSUFFIX',
+                                  src_builder = 'SharedObject')
+        env['BUILDERS']['ConvenienceLibrary'] = convenience_lib
+        env['BUILDERS']['Library'] = convenience_lib
+
+    return convenience_lib
+
+createConvenienceLibBuilder(env)
+
+
+#######################################################################
+# Invoke SConscripts
+
+# Put build output in a separate dir
+# TODO: make build_dir depend on platform and build type (check  
+#       http://www.scons.org/wiki/AdvancedBuildExample for an example)
+build_dir = 'build'
+
+SConscript(
+	'src/mesa/SConscript',
+	build_dir = build_dir,
+	duplicate = 0 # http://www.scons.org/doc/0.97/HTML/scons-user/x2261.html
+)
diff --git a/src/mesa/SConscript b/src/mesa/SConscript
new file mode 100644
index 0000000000..70a98f3129
--- /dev/null
+++ b/src/mesa/SConscript
@@ -0,0 +1,435 @@
+#######################################################################
+# SConscript for mesa
+#
+# TODO: Split this into per-module SConscripts 
+
+
+Import('*')
+
+
+#######################################################################
+# Core sources
+
+MAIN_SOURCES = [
+	'main/api_arrayelt.c',
+	'main/api_loopback.c',
+	'main/api_noop.c',
+	'main/api_validate.c',
+	'main/accum.c',
+	'main/attrib.c',
+	'main/arrayobj.c',
+	'main/blend.c',
+	'main/bufferobj.c',
+	'main/buffers.c',
+	'main/clip.c',
+	'main/colortab.c',
+	'main/context.c',
+	'main/convolve.c',
+	'main/debug.c',
+	'main/depth.c',
+	'main/depthstencil.c',
+	'main/dlist.c',
+	'main/drawpix.c',
+	'main/enable.c',
+	'main/enums.c',
+	'main/eval.c',
+	'main/execmem.c',
+	'main/extensions.c',
+	'main/fbobject.c',
+	'main/feedback.c',
+	'main/ffvertex_prog.c',
+	'main/fog.c',
+	'main/framebuffer.c',
+	'main/get.c',
+	'main/getstring.c',
+	'main/hash.c',
+	'main/hint.c',
+	'main/histogram.c',
+	'main/image.c',
+	'main/imports.c',
+	'main/light.c',
+	'main/lines.c',
+	'main/matrix.c',
+	'main/mipmap.c',
+	'main/mm.c',
+	'main/pixel.c',
+	'main/points.c',
+	'main/polygon.c',
+	'main/queryobj.c',
+	'main/rastpos.c',
+	'main/rbadaptors.c',
+	'main/renderbuffer.c',
+	'main/shaders.c',
+	'main/state.c',
+	'main/stencil.c',
+	'main/texcompress.c',
+	'main/texcompress_s3tc.c',
+	'main/texcompress_fxt1.c',
+	'main/texenvprogram.c',
+	'main/texformat.c',
+	'main/teximage.c',
+	'main/texobj.c',
+	'main/texrender.c',
+	'main/texstate.c',
+	'main/texstore.c',
+	'main/varray.c',
+	'main/vtxfmt.c',
+]
+
+GLAPI_SOURCES = [
+	'main/dispatch.c',
+	'glapi/glapi.c',
+	'glapi/glthread.c',
+]
+
+MATH_SOURCES = [
+	'math/m_debug_clip.c',
+	'math/m_debug_norm.c',
+	'math/m_debug_xform.c',
+	'math/m_eval.c',
+	'math/m_matrix.c',
+	'math/m_translate.c',
+	'math/m_vector.c',
+	'math/m_xform.c',
+]
+
+VBO_SOURCES = [
+	'vbo/vbo_context.c',
+	'vbo/vbo_exec.c',
+	'vbo/vbo_exec_api.c',
+	'vbo/vbo_exec_array.c',
+	'vbo/vbo_exec_draw.c',
+	'vbo/vbo_exec_eval.c',
+	'vbo/vbo_rebase.c',
+	'vbo/vbo_split.c',
+	'vbo/vbo_split_copy.c',
+	'vbo/vbo_split_inplace.c',
+	'vbo/vbo_save.c',
+	'vbo/vbo_save_api.c',
+	'vbo/vbo_save_draw.c',
+	'vbo/vbo_save_loopback.c',
+]
+
+VF_SOURCES = [
+	'vf/vf.c',
+	'vf/vf_generic.c',
+	'vf/vf_sse.c',
+]
+
+DRAW_SOURCES = [
+	'pipe/draw/draw_clip.c',
+	'pipe/draw/draw_context.c',
+	'pipe/draw/draw_cull.c',
+	'pipe/draw/draw_debug.c',
+	'pipe/draw/draw_flatshade.c',
+	'pipe/draw/draw_offset.c',
+	'pipe/draw/draw_prim.c',
+	'pipe/draw/draw_stipple.c',
+	'pipe/draw/draw_twoside.c',
+	'pipe/draw/draw_unfilled.c',
+	'pipe/draw/draw_validate.c',
+	'pipe/draw/draw_vbuf.c',
+	'pipe/draw/draw_vertex.c',
+	'pipe/draw/draw_vertex_cache.c',
+	'pipe/draw/draw_vertex_fetch.c',
+	'pipe/draw/draw_vertex_shader.c',
+	'pipe/draw/draw_vertex_shader_llvm.c',
+	'pipe/draw/draw_vf.c',
+	'pipe/draw/draw_vf_generic.c',
+	'pipe/draw/draw_vf_sse.c',
+	'pipe/draw/draw_wide_prims.c',
+]
+
+TGSIEXEC_SOURCES = [
+	'pipe/tgsi/exec/tgsi_exec.c',
+	'pipe/tgsi/exec/tgsi_sse2.c',
+]
+
+TGSIUTIL_SOURCES = [
+	'pipe/tgsi/util/tgsi_build.c',
+	'pipe/tgsi/util/tgsi_dump.c',
+	'pipe/tgsi/util/tgsi_parse.c',
+	'pipe/tgsi/util/tgsi_util.c',
+]
+
+STATECACHE_SOURCES = [
+	'pipe/cso_cache/cso_hash.c',
+	'pipe/cso_cache/cso_cache.c',
+]
+
+PIPEUTIL_SOURCES = [
+	'pipe/util/p_tile.c',
+	'pipe/util/p_util.c',
+]
+
+STATETRACKER_SOURCES = [
+	'state_tracker/st_atom.c',
+	'state_tracker/st_atom_blend.c',
+	'state_tracker/st_atom_clip.c',
+	'state_tracker/st_atom_constbuf.c',
+	'state_tracker/st_atom_depth.c',
+	'state_tracker/st_atom_framebuffer.c',
+	'state_tracker/st_atom_pixeltransfer.c',
+	'state_tracker/st_atom_sampler.c',
+	'state_tracker/st_atom_scissor.c',
+	'state_tracker/st_atom_shader.c',
+	'state_tracker/st_atom_rasterizer.c',
+	'state_tracker/st_atom_stipple.c',
+	'state_tracker/st_atom_texture.c',
+	'state_tracker/st_atom_viewport.c',
+	'state_tracker/st_cb_accum.c',
+	'state_tracker/st_cb_bufferobjects.c',
+	'state_tracker/st_cb_clear.c',
+	'state_tracker/st_cb_flush.c',
+	'state_tracker/st_cb_drawpixels.c',
+	'state_tracker/st_cb_fbo.c',
+	'state_tracker/st_cb_feedback.c',
+	'state_tracker/st_cb_program.c',
+	'state_tracker/st_cb_queryobj.c',
+	'state_tracker/st_cb_rasterpos.c',
+	'state_tracker/st_cb_readpixels.c',
+	'state_tracker/st_cb_strings.c',
+	'state_tracker/st_cb_texture.c',
+	'state_tracker/st_cache.c',
+	'state_tracker/st_context.c',
+	'state_tracker/st_debug.c',
+	'state_tracker/st_draw.c',
+	'state_tracker/st_extensions.c',
+	'state_tracker/st_format.c',
+	'state_tracker/st_framebuffer.c',
+	'state_tracker/st_mesa_to_tgsi.c',
+	'state_tracker/st_program.c',
+	'state_tracker/st_texture.c',
+]
+
+SHADER_SOURCES = [
+	'shader/arbprogparse.c',
+	'shader/arbprogram.c',
+	'shader/atifragshader.c',
+	'shader/grammar/grammar_mesa.c',
+	'shader/nvfragparse.c',
+	'shader/nvprogram.c',
+	'shader/nvvertparse.c',
+	'shader/program.c',
+	'shader/prog_cache.c',
+	'shader/prog_debug.c',
+	'shader/prog_execute.c',
+	'shader/prog_instruction.c',
+	'shader/prog_parameter.c',
+	'shader/prog_print.c',
+	'shader/prog_statevars.c',
+	'shader/programopt.c',
+	'shader/shader_api.c',
+]
+
+SLANG_SOURCES = [
+	'shader/slang/slang_builtin.c',
+	'shader/slang/slang_codegen.c',
+	'shader/slang/slang_compile.c',
+	'shader/slang/slang_compile_function.c',
+	'shader/slang/slang_compile_operation.c',
+	'shader/slang/slang_compile_struct.c',
+	'shader/slang/slang_compile_variable.c',
+	'shader/slang/slang_emit.c',
+	'shader/slang/slang_ir.c',
+	'shader/slang/slang_label.c',
+	'shader/slang/slang_library_noise.c',
+	'shader/slang/slang_link.c',
+	'shader/slang/slang_log.c',
+	'shader/slang/slang_mem.c',
+	'shader/slang/slang_preprocess.c',
+	'shader/slang/slang_print.c',
+	'shader/slang/slang_simplify.c',
+	'shader/slang/slang_storage.c',
+	'shader/slang/slang_typeinfo.c',
+	'shader/slang/slang_vartable.c',
+	'shader/slang/slang_utility.c',
+]
+
+
+#######################################################################
+# Assembly sources
+
+ASM_C_SOURCES = [
+	'x86/common_x86.c',
+	'x86/x86.c',
+	'x86/3dnow.c',
+	'x86/sse.c',
+	'x86/rtasm/x86sse.c',
+	'sparc/sparc.c',
+	'ppc/common_ppc.c',
+	'x86-64/x86-64.c',
+]
+
+X86_SOURCES = [
+	'x86/common_x86_asm.S',
+	'x86/x86_xform2.S',
+	'x86/x86_xform3.S',
+	'x86/x86_xform4.S',
+	'x86/x86_cliptest.S',
+	'x86/mmx_blend.S',
+	'x86/3dnow_xform1.S',
+	'x86/3dnow_xform2.S',
+	'x86/3dnow_xform3.S',
+	'x86/3dnow_xform4.S',
+	'x86/3dnow_normal.S',
+	'x86/sse_xform1.S',
+	'x86/sse_xform2.S',
+	'x86/sse_xform3.S',
+	'x86/sse_xform4.S',
+	'x86/sse_normal.S',
+	'x86/read_rgba_span_x86.S',
+]
+
+X86_API = [
+	'x86/glapi_x86.S',
+]
+
+X86_64_SOURCES = [
+	'x86-64/xform4.S',
+]
+
+X86_64_API = [
+	'x86-64/glapi_x86-64.S',
+]
+
+SPARC_SOURCES = [
+	'sparc/clip.S',
+	'sparc/norm.S',
+	'sparc/xform.S',
+]
+
+SPARC_API = [
+	'sparc/glapi_sparc.S',
+]
+
+if x86:
+	ASM_SOURCES = ASM_C_SOURCES + X86_SOURCES 
+	API_SOURCES = X86_API
+else:
+	ASM_SOURCES = []
+	API_SOURCES = []
+
+
+#######################################################################
+# Driver sources
+
+
+X11_DRIVER_SOURCES = [
+	'pipe/xlib/glxapi.c',
+	'pipe/xlib/fakeglx.c',
+	'pipe/xlib/xfonts.c',
+	'pipe/xlib/xm_api.c',
+	'pipe/xlib/xm_winsys.c',
+	'pipe/xlib/xm_winsys_aub.c',
+	'pipe/xlib/brw_aub.c',
+]
+
+OSMESA_DRIVER_SOURCES = [
+	'drivers/osmesa/osmesa.c',
+]
+
+GLIDE_DRIVER_SOURCES = [
+	'drivers/glide/fxapi.c',
+	'drivers/glide/fxdd.c',
+	'drivers/glide/fxddspan.c',
+	'drivers/glide/fxddtex.c',
+	'drivers/glide/fxsetup.c',
+	'drivers/glide/fxtexman.c',
+	'drivers/glide/fxtris.c',
+	'drivers/glide/fxvb.c',
+	'drivers/glide/fxglidew.c',
+	'drivers/glide/fxg.c',
+]
+
+SVGA_DRIVER_SOURCES = [
+	'drivers/svga/svgamesa.c',
+	'drivers/svga/svgamesa8.c',
+	'drivers/svga/svgamesa15.c',
+	'drivers/svga/svgamesa16.c',
+	'drivers/svga/svgamesa24.c',
+	'drivers/svga/svgamesa32.c',
+]
+
+FBDEV_DRIVER_SOURCES = [
+	'drivers/fbdev/glfbdev.c',
+]
+
+
+### All the core C sources
+
+SOLO_SOURCES = \
+	MAIN_SOURCES + \
+	MATH_SOURCES + \
+	VBO_SOURCES + \
+	VF_SOURCES + \
+	DRAW_SOURCES + \
+	TGSIEXEC_SOURCES + \
+	TGSIUTIL_SOURCES + \
+	PIPEUTIL_SOURCES + \
+	STATECACHE_SOURCES + \
+	STATETRACKER_SOURCES + \
+	SHADER_SOURCES + \
+	ASM_SOURCES + \
+	SLANG_SOURCES
+
+CORE_SOURCES = \
+	GLAPI_SOURCES + API_SOURCES + \
+	SOLO_SOURCES
+
+ALL_SOURCES = \
+	GLAPI_SOURCES + API_SOURCES + \
+	SOLO_SOURCES + \
+	ASM_SOURCES + \
+	X11_DRIVER_SOURCES + \
+	FBDEV_DRIVER_SOURCES + \
+	OSMESA_DRIVER_SOURCES
+
+
+######################################################################
+# Gallium sources
+
+SConscript([
+	'pipe/SConscript',
+])
+
+
+######################################################################
+# libGL
+
+if not dri:
+	STAND_ALONE_DRIVER_SOURCES = \
+		CORE_SOURCES + \
+		X11_DRIVER_SOURCES
+	
+	Import(
+		'softpipe', 
+		'i915simple',
+		'i965simple'
+	)
+	
+	pipe_drivers = [
+		softpipe,
+		i965simple
+	]
+	
+	env.SharedLibrary(
+		target ='GL',
+		source = STAND_ALONE_DRIVER_SOURCES,
+		LIBS = [softpipe, i965simple] + env['LIBS'],
+	)
+
+
+######################################################################
+# Driver sources
+
+if dri:
+	mesa = env.ConvenienceLibrary(
+		target = 'mesa',
+		source = SOLO_SOURCES,
+	)
+	env.Prepend(LIBS = [mesa])
+
+	SConscript([
+		'drivers/dri/SConscript',
+	])
diff --git a/src/mesa/drivers/dri/SConscript b/src/mesa/drivers/dri/SConscript
new file mode 100644
index 0000000000..d32bd08669
--- /dev/null
+++ b/src/mesa/drivers/dri/SConscript
@@ -0,0 +1,48 @@
+Import('*')
+
+drienv = env.Clone()
+
+drienv.Replace(CPPPATH = [
+	'#src/mesa/drivers/dri/common',
+	'#include',
+	'#include/GL/internal',
+	'#src/mesa',
+	'#src/mesa/main',
+	'#src/mesa/glapi',
+	'#src/mesa/math',
+	'#src/mesa/transform',
+	'#src/mesa/shader',
+	'#src/mesa/swrast',
+	'#src/mesa/swrast_setup',
+	'#src/egl/main',
+	'#src/egl/drivers/dri',
+])
+
+drienv.ParseConfig('pkg-config --cflags --libs libdrm')
+
+COMMON_GALLIUM_SOURCES = [
+	'../common/utils.c',
+	'../common/vblank.c',
+	'../common/dri_util.c',
+	'../common/xmlconfig.c',
+]
+
+COMMON_BM_SOURCES = [
+	'../common/dri_bufmgr.c',
+	'../common/dri_drmpool.c',
+]
+
+Export([
+	'drienv',
+	'COMMON_GALLIUM_SOURCES',
+	'COMMON_BM_SOURCES',
+])
+
+# TODO: Installation
+#install: $(LIBNAME)
+#	$(INSTALL) -d $(DRI_DRIVER_INSTALL_DIR)
+#	$(INSTALL) -m 755 $(LIBNAME) $(DRI_DRIVER_INSTALL_DIR)
+
+SConscript([
+	'intel_winsys/SConscript',
+])
diff --git a/src/mesa/drivers/dri/intel_winsys/SConscript b/src/mesa/drivers/dri/intel_winsys/SConscript
new file mode 100644
index 0000000000..a7cc10450e
--- /dev/null
+++ b/src/mesa/drivers/dri/intel_winsys/SConscript
@@ -0,0 +1,41 @@
+Import('*')
+
+env = drienv.Clone()
+
+env.Append(CPPPATH = [
+	'../intel',
+	'server'
+])
+
+#MINIGLX_SOURCES = server/intel_dri.c
+
+pipe_drivers = [
+	softpipe,
+	i915simple
+]
+
+DRIVER_SOURCES = [
+	'intel_winsys_pipe.c',
+	'intel_winsys_softpipe.c',
+	'intel_winsys_i915.c',
+	'intel_batchbuffer.c',
+	'intel_swapbuffers.c',
+	'intel_context.c',
+	'intel_lock.c',
+	'intel_screen.c',
+	'intel_batchpool.c',
+]
+
+sources = \
+	COMMON_GALLIUM_SOURCES + \
+	COMMON_BM_SOURCES + \
+	DRIVER_SOURCES
+
+# DRIVER_DEFINES = -I../intel $(shell pkg-config libdrm --atleast-version=2.3.1 \
+#				&& echo "-DDRM_VBLANK_FLIP=DRM_VBLANK_FLIP")
+
+env.SharedLibrary(
+	target ='i915tex_dri.so',
+	source = sources,
+	LIBS = pipe_drivers + env['LIBS'],
+)
\ No newline at end of file
diff --git a/src/mesa/pipe/SConscript b/src/mesa/pipe/SConscript
new file mode 100644
index 0000000000..d9c20e0100
--- /dev/null
+++ b/src/mesa/pipe/SConscript
@@ -0,0 +1,9 @@
+Import('*')
+
+#env = env.Clone()
+
+SConscript([
+	'softpipe/SConscript',
+	'i915simple/SConscript',
+	'i965simple/SConscript',
+])
diff --git a/src/mesa/pipe/i915simple/SConscript b/src/mesa/pipe/i915simple/SConscript
new file mode 100644
index 0000000000..f5fb96b995
--- /dev/null
+++ b/src/mesa/pipe/i915simple/SConscript
@@ -0,0 +1,29 @@
+Import('*')
+
+env = env.Clone()
+
+i915simple = env.ConvenienceLibrary(
+	target = 'i915simple',
+	source = [
+		'i915_blit.c',
+		'i915_clear.c',
+		'i915_context.c',
+		'i915_debug.c',
+		'i915_debug_fp.c',
+		'i915_flush.c',
+		'i915_fpc_emit.c',
+		'i915_fpc_translate.c',
+		'i915_prim_emit.c',
+		'i915_prim_vbuf.c',
+		'i915_state.c',
+		'i915_state_derived.c',
+		'i915_state_dynamic.c',
+		'i915_state_emit.c',
+		'i915_state_immediate.c',
+		'i915_state_sampler.c',
+		'i915_strings.c',
+		'i915_surface.c',
+		'i915_texture.c',
+	])
+
+Export('i915simple')
diff --git a/src/mesa/pipe/i965simple/SConscript b/src/mesa/pipe/i965simple/SConscript
new file mode 100644
index 0000000000..74621de84c
--- /dev/null
+++ b/src/mesa/pipe/i965simple/SConscript
@@ -0,0 +1,55 @@
+Import('*')
+
+env = env.Clone()
+
+i965simple = env.ConvenienceLibrary(
+	target = 'i965simple',
+	source = [
+		'brw_blit.c',
+		'brw_cc.c',
+		'brw_clip.c',
+		'brw_clip_line.c',
+		'brw_clip_point.c',
+		'brw_clip_state.c',
+		'brw_clip_tri.c',
+		'brw_clip_util.c',
+		'brw_context.c',
+		'brw_curbe.c',
+		'brw_draw.c',
+		'brw_draw_upload.c',
+		'brw_eu.c',
+		'brw_eu_debug.c',
+		'brw_eu_emit.c',
+		'brw_eu_util.c',
+		'brw_flush.c',
+		'brw_gs.c',
+		'brw_gs_emit.c',
+		'brw_gs_state.c',
+		'brw_misc_state.c',
+		'brw_sf.c',
+		'brw_sf_emit.c',
+		'brw_sf_state.c',
+		'brw_shader_info.c',
+		'brw_state.c',
+		'brw_state_batch.c',
+		'brw_state_cache.c',
+		'brw_state_pool.c',
+		'brw_state_upload.c',
+		'brw_strings.c',
+		'brw_surface.c',
+		'brw_tex_layout.c',
+		'brw_urb.c',
+		'brw_util.c',
+		'brw_vs.c',
+		'brw_vs_emit.c',
+		'brw_vs_state.c',
+		'brw_wm.c',
+		'brw_wm_decl.c',
+		'brw_wm_glsl.c',
+		'brw_wm_iz.c',
+		'brw_wm_sampler_state.c',
+		'brw_wm_state.c',
+		'brw_wm_surface_state.c',
+	])
+
+Export('i965simple')
diff --git a/src/mesa/pipe/softpipe/SConscript b/src/mesa/pipe/softpipe/SConscript
new file mode 100644
index 0000000000..d581ee8d3c
--- /dev/null
+++ b/src/mesa/pipe/softpipe/SConscript
@@ -0,0 +1,42 @@
+Import('*')
+
+env = env.Clone()
+
+softpipe = env.ConvenienceLibrary(
+	target = 'softpipe',
+	source = [
+		'sp_clear.c',
+		'sp_context.c',
+		'sp_draw_arrays.c',
+		'sp_flush.c',
+		'sp_prim_setup.c',
+		'sp_prim_vbuf.c',
+		'sp_quad_alpha_test.c',
+		'sp_quad_blend.c',
+		'sp_quad_bufloop.c',
+		'sp_quad.c',
+		'sp_quad_colormask.c',
+		'sp_quad_coverage.c',
+		'sp_quad_depth_test.c',
+		'sp_quad_earlyz.c',
+		'sp_quad_fs.c',
+		'sp_quad_occlusion.c',
+		'sp_quad_output.c',
+		'sp_quad_stencil.c',
+		'sp_quad_stipple.c',
+		'sp_query.c',
+		'sp_state_blend.c',
+		'sp_state_clip.c',
+		'sp_state_derived.c',
+		'sp_state_fs.c',
+		'sp_state_rasterizer.c',
+		'sp_state_sampler.c',
+		'sp_state_surface.c',
+		'sp_state_vertex.c',
+		'sp_surface.c',
+		'sp_tex_sample.c',
+		'sp_texture.c',
+		'sp_tile_cache.c',
+	])
+
+Export('softpipe')
\ No newline at end of file
-- 
cgit v1.2.3


From 50d5f304ad9eba8fe95a6cedfc56cd7213b33fea Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Thu, 31 Jan 2008 14:26:39 +0900
Subject: gallium: Portability fixes.

---
 src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c b/src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c
index a2657dac59..b6af7cdedc 100644
--- a/src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c
+++ b/src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c
@@ -367,7 +367,7 @@ struct mm_pb_manager
 };
 
 
-static inline struct mm_pb_manager *
+static INLINE struct mm_pb_manager *
 mm_pb_manager(struct pb_manager *mgr)
 {
    assert(mgr);
@@ -385,7 +385,7 @@ struct mm_buffer
 };
 
 
-static inline struct mm_buffer *
+static INLINE struct mm_buffer *
 mm_buffer(struct pb_buffer *buf)
 {
    assert(buf);
-- 
cgit v1.2.3


From e82c9b867cc18af5da7375871a685f98d1c1527d Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Thu, 31 Jan 2008 11:57:15 +0000
Subject: tgsi: Use ESI instead of EBX as temp reg on non-win32

---
 src/mesa/pipe/tgsi/exec/tgsi_sse2.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'src')

diff --git a/src/mesa/pipe/tgsi/exec/tgsi_sse2.c b/src/mesa/pipe/tgsi/exec/tgsi_sse2.c
index f8660e7ad1..df0c698301 100755
--- a/src/mesa/pipe/tgsi/exec/tgsi_sse2.c
+++ b/src/mesa/pipe/tgsi/exec/tgsi_sse2.c
@@ -198,9 +198,15 @@ get_output_base( void )
 static struct x86_reg
 get_temp_base( void )
 {
+#ifdef WIN32
    return x86_make_reg(
       file_REG32,
       reg_BX );
+#else
+   return x86_make_reg(
+      file_REG32,
+      reg_SI );
+#endif
 }
 
 static struct x86_reg
-- 
cgit v1.2.3


From 256486829f0bc2be7a986a6bdc08df5fc16b77d8 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Thu, 31 Jan 2008 08:12:47 -0700
Subject: Cell: set GALLIUM_CELL_VS env var to enable SPU-based vertex
 transformation

---
 src/mesa/pipe/cell/ppu/cell_context.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/ppu/cell_context.c b/src/mesa/pipe/cell/ppu/cell_context.c
index 4885cd0d2c..bbe1fd7a11 100644
--- a/src/mesa/pipe/cell/ppu/cell_context.c
+++ b/src/mesa/pipe/cell/ppu/cell_context.c
@@ -162,8 +162,12 @@ cell_draw_create(struct cell_context *cell)
 {
    struct draw_context *draw = draw_create();
 
-   draw->shader_queue_flush = cell_vertex_shader_queue_flush;
-   draw->driver_private = cell;
+   if (getenv("GALLIUM_CELL_VS")) {
+      /* plug in SPU-based vertex transformation code */
+      draw->shader_queue_flush = cell_vertex_shader_queue_flush;
+      draw->driver_private = cell;
+   }
+
    return draw;
 }
 
-- 
cgit v1.2.3


From cd53eb0db19daf1c9aac94011a54e902eb10fe75 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Thu, 31 Jan 2008 08:21:38 -0700
Subject: Cell: SIMD-ize const_coeff()

---
 src/mesa/pipe/cell/spu/spu_tri.c | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index e436e153ec..08b8bf0c9c 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -723,24 +723,18 @@ static boolean setup_sort_vertices(const struct vertex_header *v0,
 
 /**
  * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
- * The value value comes from vertex->data[slot][i].
- * The result will be put into setup.coef[slot].a0[i].
+ * The value comes from vertex->data[slot].
+ * The result will be put into setup.coef[slot].a0.
  * \param slot  which attribute slot 
- * \param i  which component of the slot (0..3)
  */
-static void const_coeff(uint slot)
+static INLINE void const_coeff(uint slot)
 {
-   uint i;
-   ASSERT(slot < PIPE_MAX_SHADER_INPUTS);
-
-   for (i = 0; i < 4; i++) {
-      setup.coef[slot].dadx.f[i] = 0;
-      setup.coef[slot].dady.f[i] = 0;
-
-      /* need provoking vertex info!
-       */
-      setup.coef[slot].a0.f[i] = setup.vprovoke->data[slot][i];
-   }
+   setup.coef[slot].dadx.v = (vector float) {0.0, 0.0, 0.0, 0.0};
+   setup.coef[slot].dady.v = (vector float) {0.0, 0.0, 0.0, 0.0};
+   setup.coef[slot].a0.f[0] = setup.vprovoke->data[slot][0];
+   setup.coef[slot].a0.f[1] = setup.vprovoke->data[slot][1];
+   setup.coef[slot].a0.f[2] = setup.vprovoke->data[slot][2];
+   setup.coef[slot].a0.f[3] = setup.vprovoke->data[slot][3];
 }
 
 
-- 
cgit v1.2.3


From c36cdc61561fee21108f0a68ca661f0d8c7f5d94 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Thu, 31 Jan 2008 13:32:29 -0700
Subject: gallium: fix get/put typo regression

This came from commit f3aa4de034b0d791ce2e38e8aeb3b3abdb4e3b50 on 1/22/08.
Fixes strange Z buffer glitches seen in progs/glsl/texdemo1.c
---
 src/mesa/pipe/softpipe/sp_tile_cache.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/softpipe/sp_tile_cache.c b/src/mesa/pipe/softpipe/sp_tile_cache.c
index 451e157abf..ccf367a5e4 100644
--- a/src/mesa/pipe/softpipe/sp_tile_cache.c
+++ b/src/mesa/pipe/softpipe/sp_tile_cache.c
@@ -415,8 +415,8 @@ sp_get_cached_tile(struct softpipe_context *softpipe,
          /* put dirty tile back in framebuffer */
          if (tc->depth_stencil) {
             pipe_put_tile_raw(pipe, ps,
-                           tile->x, tile->y, TILE_SIZE, TILE_SIZE,
-                           tile->data.depth32, 0/*STRIDE*/);
+                              tile->x, tile->y, TILE_SIZE, TILE_SIZE,
+                              tile->data.depth32, 0/*STRIDE*/);
          }
          else {
             pipe_put_tile_rgba(pipe, ps,
@@ -441,9 +441,9 @@ sp_get_cached_tile(struct softpipe_context *softpipe,
       else {
          /* get new tile data from surface */
          if (tc->depth_stencil) {
-            pipe_put_tile_raw(pipe, ps,
-                           tile->x, tile->y, TILE_SIZE, TILE_SIZE,
-                           tile->data.depth32, 0/*STRIDE*/);
+            pipe_get_tile_raw(pipe, ps,
+                              tile->x, tile->y, TILE_SIZE, TILE_SIZE,
+                              tile->data.depth32, 0/*STRIDE*/);
          }
          else {
             pipe_get_tile_rgba(pipe, ps,
-- 
cgit v1.2.3


From 9536314a6c99d0acc249180034865b5cfb927e9d Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Thu, 31 Jan 2008 13:36:00 -0700
Subject: gallium: Fix z clear bug when TILE_CLEAR_OPTIMIZATION==0

---
 src/mesa/pipe/softpipe/sp_clear.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'src')

diff --git a/src/mesa/pipe/softpipe/sp_clear.c b/src/mesa/pipe/softpipe/sp_clear.c
index 571f64b38d..8d295a30ca 100644
--- a/src/mesa/pipe/softpipe/sp_clear.c
+++ b/src/mesa/pipe/softpipe/sp_clear.c
@@ -55,7 +55,9 @@ softpipe_clear(struct pipe_context *pipe, struct pipe_surface *ps,
 
    if (ps == sp_tile_cache_get_surface(softpipe->zsbuf_cache)) {
       sp_tile_cache_clear(softpipe->zsbuf_cache, clearValue);
+#if TILE_CLEAR_OPTIMIZATION
       return;
+#endif
    }
 
    for (i = 0; i < softpipe->framebuffer.num_cbufs; i++) {
-- 
cgit v1.2.3


From acb81374c1d476ebffbcfc8405db7fff6cc6d6c3 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Thu, 31 Jan 2008 13:37:01 -0700
Subject: gallium: comments about fragment Z computation

---
 src/mesa/pipe/softpipe/sp_quad_fs.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'src')

diff --git a/src/mesa/pipe/softpipe/sp_quad_fs.c b/src/mesa/pipe/softpipe/sp_quad_fs.c
index c9cc8afa0c..90691c6065 100644
--- a/src/mesa/pipe/softpipe/sp_quad_fs.c
+++ b/src/mesa/pipe/softpipe/sp_quad_fs.c
@@ -168,6 +168,11 @@ shade_quad(
              sizeof( quad->outputs.color ) );
    }
 
+   /*
+    * XXX the following code for updating quad->outputs.depth
+    * isn't really needed if we did early z testing.
+    */
+
    /* store result Z */
    if (qss->depthOutSlot >= 0) {
       /* output[slot] is new Z */
@@ -181,6 +186,10 @@ shade_quad(
       uint i;
       for (i = 0; i < 4; i++) {
          quad->outputs.depth[i] = machine->Inputs[0].xyzw[2].f[i];
+         /* XXX not sure the above line is always correct.  The following
+          * might be better:
+         quad->outputs.depth[i] = machine->QuadPos.xyzw[2].f[i];
+          */
       }
    }
 
-- 
cgit v1.2.3


From 86787043fae59869133180474cb09dac4f2e619a Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Thu, 31 Jan 2008 14:05:04 -0700
Subject: Fix problem in mapping vertex program outputs (found with "spring"
 game engine)

If the vertex program writes to an output that's not consumed by the
fragment program, map the vp output to an unused slot.
---
 src/mesa/state_tracker/st_atom_shader.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'src')

diff --git a/src/mesa/state_tracker/st_atom_shader.c b/src/mesa/state_tracker/st_atom_shader.c
index 1ed9333556..9196918509 100644
--- a/src/mesa/state_tracker/st_atom_shader.c
+++ b/src/mesa/state_tracker/st_atom_shader.c
@@ -226,9 +226,11 @@ find_translated_vp(struct st_context *st,
             GLint fpInAttrib = vp_out_to_fp_in(outAttr);
             if (fpInAttrib >= 0) {
                GLuint fpInSlot = stfp->input_to_slot[fpInAttrib];
-               GLuint vpOutSlot = stfp->fs->state.input_map[fpInSlot];
-               xvp->output_to_slot[outAttr] = vpOutSlot;
-               numVpOuts++;
+               if (fpInSlot != ~0) {
+                  GLuint vpOutSlot = stfp->fs->state.input_map[fpInSlot];
+                  xvp->output_to_slot[outAttr] = vpOutSlot;
+                  numVpOuts++;
+               }
             }
             else if (outAttr == VERT_RESULT_PSIZ ||
                      outAttr == VERT_RESULT_BFC0 ||
@@ -247,7 +249,7 @@ find_translated_vp(struct st_context *st,
        * We could use this info to do dead code elimination in the
        * vertex program.
        */
-      dummySlot = stfp->num_input_slots;
+      dummySlot = numVpOuts;
 
       /* Map vert program outputs that aren't used to the dummy slot */
       for (outAttr = 0; outAttr < VERT_RESULT_MAX; outAttr++) {
-- 
cgit v1.2.3


From 633e1133aeed04df650b97c8d25a041014fd6f5e Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Thu, 31 Jan 2008 14:35:25 -0700
Subject: gallium: fix problem in which texcoords and varying vars got mapped
 to the same slot

This fixes the glsl/bump.c and glsl/texdemo1.c programs
---
 src/mesa/state_tracker/st_program.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

(limited to 'src')

diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index 1f1e6500e0..84a9094001 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -33,6 +33,7 @@
 
 #include "main/imports.h"
 #include "main/mtypes.h"
+#include "shader/prog_print.h"
 
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
@@ -68,6 +69,7 @@ st_translate_vertex_program(struct st_context *st,
    struct pipe_shader_state vs;
    const struct cso_vertex_shader *cso;
    GLuint attr, i;
+   GLuint num_generic = 0;
 
    memset(&vs, 0, sizeof(vs));
 
@@ -117,7 +119,7 @@ st_translate_vertex_program(struct st_context *st,
          case VERT_ATTRIB_TEX6:
          case VERT_ATTRIB_TEX7:
             vs.input_semantic_name[slot] = TGSI_SEMANTIC_GENERIC;
-            vs.input_semantic_index[slot] = attr - VERT_ATTRIB_TEX0;
+            vs.input_semantic_index[slot] = num_generic++;
             break;
          case VERT_ATTRIB_GENERIC0:
          case VERT_ATTRIB_GENERIC1:
@@ -129,7 +131,7 @@ st_translate_vertex_program(struct st_context *st,
          case VERT_ATTRIB_GENERIC7:
             assert(attr < VERT_ATTRIB_MAX);
             vs.input_semantic_name[slot] = TGSI_SEMANTIC_GENERIC;
-            vs.input_semantic_index[slot] = attr - VERT_ATTRIB_GENERIC0;
+            vs.input_semantic_index[slot] = num_generic++;
             break;
          default:
             assert(0);
@@ -143,6 +145,7 @@ st_translate_vertex_program(struct st_context *st,
       vs.output_semantic_index[i] = 0;
    }
 
+   num_generic = 0;
    /*
     * Determine number of outputs, the (default) output register
     * mapping and the semantic information for each output.
@@ -207,14 +210,14 @@ st_translate_vertex_program(struct st_context *st,
          case VERT_RESULT_TEX6:
          case VERT_RESULT_TEX7:
             vs.output_semantic_name[slot] = TGSI_SEMANTIC_GENERIC;
-            vs.output_semantic_index[slot] = attr - VERT_RESULT_TEX0;
+            vs.output_semantic_index[slot] = num_generic++;
             break;
          case VERT_RESULT_VAR0:
             /* fall-through */
          default:
             assert(attr - VERT_RESULT_VAR0 < MAX_VARYING);
             vs.output_semantic_name[slot] = TGSI_SEMANTIC_GENERIC;
-            vs.output_semantic_index[slot] = attr - VERT_RESULT_VAR0;
+            vs.output_semantic_index[slot] = num_generic++;
          }
       }
    }
@@ -258,6 +261,9 @@ st_translate_vertex_program(struct st_context *st,
    cso = st_cached_vs_state(st, &vs);
    stvp->cso = cso;
 
+   if (0)
+      _mesa_print_program(&stvp->Base.Base);
+
    if (TGSI_DEBUG)
       tgsi_dump( tokensOut, 0 );
 }
@@ -286,6 +292,7 @@ st_translate_fragment_program(struct st_context *st,
    GLuint attr;
    const GLbitfield inputsRead = stfp->Base.Base.InputsRead;
    GLuint vslot = 0;
+   GLuint num_generic = 0;
 
    memset(&fs, 0, sizeof(fs));
 
@@ -338,14 +345,14 @@ st_translate_fragment_program(struct st_context *st,
          case FRAG_ATTRIB_TEX6:
          case FRAG_ATTRIB_TEX7:
             fs.input_semantic_name[slot] = TGSI_SEMANTIC_GENERIC;
-            fs.input_semantic_index[slot] = attr - FRAG_ATTRIB_TEX0;
+            fs.input_semantic_index[slot] = num_generic++;
             interpMode[slot] = TGSI_INTERPOLATE_PERSPECTIVE;
             break;
          case FRAG_ATTRIB_VAR0:
             /* fall-through */
          default:
             fs.input_semantic_name[slot] = TGSI_SEMANTIC_GENERIC;
-            fs.input_semantic_index[slot] = attr - FRAG_ATTRIB_VAR0;
+            fs.input_semantic_index[slot] = num_generic++;
             interpMode[slot] = TGSI_INTERPOLATE_PERSPECTIVE;
          }
       }
@@ -415,6 +422,9 @@ st_translate_fragment_program(struct st_context *st,
    cso = st_cached_fs_state(st, &fs);
    stfp->fs = cso;
 
+   if (0)
+      _mesa_print_program(&stfp->Base.Base);
+
    if (TGSI_DEBUG)
       tgsi_dump( tokensOut, 0/*TGSI_DUMP_VERBOSE*/ );
 
-- 
cgit v1.2.3


From 635341ec5b06b3db453e88f44663d2ad711c3f7d Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Thu, 31 Jan 2008 17:05:43 -0700
Subject: fix typo

---
 src/mesa/pipe/p_shader_tokens.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/mesa/pipe/p_shader_tokens.h b/src/mesa/pipe/p_shader_tokens.h
index e9d1d66bda..3ce35310f6 100644
--- a/src/mesa/pipe/p_shader_tokens.h
+++ b/src/mesa/pipe/p_shader_tokens.h
@@ -626,7 +626,7 @@ struct tgsi_src_register_ext
 
 /*
  * If tgsi_src_register_ext::Type is TGSI_SRC_REGISTER_EXT_TYPE_SWZ,
- * it should be cast to tgsi_src_register_ext_extswz.
+ * it should be cast to tgsi_src_register_ext_swz.
  * 
  * If tgsi_src_register_ext::Type is TGSI_SRC_REGISTER_EXT_TYPE_MOD,
  * it should be cast to tgsi_src_register_ext_mod.
-- 
cgit v1.2.3


From 20df285b14bc655d5429c7d2b82446204f9a1f2e Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Thu, 31 Jan 2008 17:22:07 -0800
Subject: Fix using "ccache ppu-gcc" for CC and fix parallel builds

CC wasn't quoted in a couple places in src/mesa/Makefile.  Also, the
OSMesa link was missing a dependency.
---
 src/mesa/Makefile | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'src')

diff --git a/src/mesa/Makefile b/src/mesa/Makefile
index b16d74bf49..720f1b2e02 100644
--- a/src/mesa/Makefile
+++ b/src/mesa/Makefile
@@ -125,24 +125,25 @@ osmesa-only: depend subdirs $(TOP)/$(LIB_DIR)/$(OSMESA_LIB_NAME)
 # Make the GL library
 $(TOP)/$(LIB_DIR)/$(GL_LIB_NAME): $(STAND_ALONE_OBJECTS) $(PIPE_LIB) $(CELL_LIB) $(CELL_LIB_SPU) $(LLVM_LIB)
 	@ $(TOP)/bin/mklib -o $(GL_LIB) \
-		-linker $(CC) \
+		-linker "$(CC)" \
 		-major $(GL_MAJOR) -minor $(GL_MINOR) -patch $(GL_TINY) \
 		-install $(TOP)/$(LIB_DIR) \
 		$(MKLIB_OPTIONS) $(STAND_ALONE_OBJECTS) \
 		$(PIPE_LIB) $(CELL_LIB) $(CELL_LIB_SPU) $(LLVM_LIB) $(GL_LIB_DEPS)
 
 # Make the OSMesa library
-$(TOP)/$(LIB_DIR)/$(OSMESA_LIB_NAME): $(OSMESA_DRIVER_OBJECTS) $(OSMESA16_OBJECTS)
+$(TOP)/$(LIB_DIR)/$(OSMESA_LIB_NAME): $(OSMESA_DRIVER_OBJECTS) \
+		$(OSMESA16_OBJECTS) $(TOP)/$(LIB_DIR)/$(GL_LIB_NAME)
 	@ if [ "${DRIVER_DIRS}" = "osmesa" ] ; then \
 		$(TOP)/bin/mklib -o $(OSMESA_LIB) \
-			-linker $(CC) \
+			-linker "$(CC)" \
 			-major $(MESA_MAJOR) \
 			-minor $(MESA_MINOR) -patch $(MESA_TINY) \
 			-install $(TOP)/$(LIB_DIR) $(MKLIB_OPTIONS) \
 			$(OSMESA_LIB_DEPS) $(OSMESA16_OBJECTS) ; \
 	else \
 		$(TOP)/bin/mklib -o $(OSMESA_LIB) \
-			-linker $(CC) \
+			-linker "$(CC)" \
 			-major $(MESA_MAJOR) \
 			-minor $(MESA_MINOR) -patch $(GL_TINY) \
 			-install $(TOP)/$(LIB_DIR) $(MKLIB_OPTIONS) \
-- 
cgit v1.2.3


From b108bea6b44c1abc6d61e3e47096e5122de89cd1 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 09:27:57 -0700
Subject: Cell: store current tile status in cur_tile_status_c/z, add
 TILE_STATUS_GETTING

---
 src/mesa/pipe/cell/spu/spu_render.c | 36 ++++++++++++++++-----
 src/mesa/pipe/cell/spu/spu_tile.c   |  1 +
 src/mesa/pipe/cell/spu/spu_tile.h   |  8 +++--
 src/mesa/pipe/cell/spu/spu_tri.c    | 62 ++++++++++++++++++++++++++++++-------
 src/mesa/pipe/cell/spu/spu_tri.h    |  2 +-
 5 files changed, 87 insertions(+), 22 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_render.c b/src/mesa/pipe/cell/spu/spu_render.c
index f506095116..ca54a103bd 100644
--- a/src/mesa/pipe/cell/spu/spu_render.c
+++ b/src/mesa/pipe/cell/spu/spu_render.c
@@ -95,13 +95,15 @@ static INLINE void
 get_cz_tiles(uint tx, uint ty)
 {
    if (spu.depth_stencil.depth.enabled) {
-      if (tile_status_z[ty][tx] != TILE_STATUS_CLEAR) {
+      if (cur_tile_status_z != TILE_STATUS_CLEAR) {
          get_tile(tx, ty, &ztile, TAG_READ_TILE_Z, 1);
+         cur_tile_status_z = TILE_STATUS_GETTING;
       }
    }
 
-   if (tile_status[ty][tx] != TILE_STATUS_CLEAR) {
+   if (cur_tile_status_c != TILE_STATUS_CLEAR) {
       get_tile(tx, ty, &ctile, TAG_READ_TILE_COLOR, 0);
+      cur_tile_status_c = TILE_STATUS_GETTING;
    }
 }
 
@@ -112,14 +114,24 @@ get_cz_tiles(uint tx, uint ty)
 static INLINE void
 put_cz_tiles(uint tx, uint ty)
 {
-   if (tile_status_z[ty][tx] == TILE_STATUS_DIRTY) {
+   if (cur_tile_status_z == TILE_STATUS_DIRTY) {
+      /* tile was modified and needs to be written back */
       put_tile(tx, ty, &ztile, TAG_WRITE_TILE_Z, 1);
-      tile_status_z[ty][tx] = TILE_STATUS_DEFINED;
+      cur_tile_status_z = TILE_STATUS_DEFINED;
+   }
+   else if (cur_tile_status_z == TILE_STATUS_GETTING) {
+      /* tile was never used */
+      cur_tile_status_z = TILE_STATUS_DEFINED;
    }
 
-   if (tile_status[ty][tx] == TILE_STATUS_DIRTY) {
+   if (cur_tile_status_c == TILE_STATUS_DIRTY) {
+      /* tile was modified and needs to be written back */
       put_tile(tx, ty, &ctile, TAG_WRITE_TILE_COLOR, 0);
-      tile_status[ty][tx] = TILE_STATUS_DEFINED;
+      cur_tile_status_c = TILE_STATUS_DEFINED;
+   }
+   else if (cur_tile_status_c == TILE_STATUS_GETTING) {
+      /* tile was never used */
+      cur_tile_status_c = TILE_STATUS_DEFINED;
    }
 }
 
@@ -238,8 +250,13 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       if (!my_tile(tx, ty))
          continue;
 
+      cur_tile_status_c = tile_status[ty][tx];
+      cur_tile_status_z = tile_status_z[ty][tx];
+
       get_cz_tiles(tx, ty);
 
+      uint drawn = 0;
+
       /* loop over tris */
       for (j = 0; j < render->num_indexes; j += 3) {
          const float *v0, *v1, *v2;
@@ -248,13 +265,18 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
          v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
          v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
 
-         tri_draw(v0, v1, v2, tx, ty);
+         drawn += tri_draw(v0, v1, v2, tx, ty);
       }
 
+      //printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3);
+
       /* write color/z tiles back to main framebuffer, if dirtied */
       put_cz_tiles(tx, ty);
 
       wait_put_cz_tiles(); /* XXX seems unnecessary... */
+
+      tile_status[ty][tx] = cur_tile_status_c;
+      tile_status_z[ty][tx] = cur_tile_status_z;
    }
 
    if (Debug)
diff --git a/src/mesa/pipe/cell/spu/spu_tile.c b/src/mesa/pipe/cell/spu/spu_tile.c
index ca1352f9f8..aea4785bc2 100644
--- a/src/mesa/pipe/cell/spu/spu_tile.c
+++ b/src/mesa/pipe/cell/spu/spu_tile.c
@@ -37,6 +37,7 @@ tile_t ztile ALIGN16_ATTRIB;
 ubyte tile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 ubyte tile_status_z[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
+ubyte cur_tile_status_c, cur_tile_status_z;
 
 
 void
diff --git a/src/mesa/pipe/cell/spu/spu_tile.h b/src/mesa/pipe/cell/spu/spu_tile.h
index 18d1b3c117..1f123a2b7b 100644
--- a/src/mesa/pipe/cell/spu/spu_tile.h
+++ b/src/mesa/pipe/cell/spu/spu_tile.h
@@ -51,12 +51,16 @@ extern tile_t ztile ALIGN16_ATTRIB;
 
 
 #define TILE_STATUS_CLEAR   1
-#define TILE_STATUS_DEFINED 2  /**< defined pixel data */
-#define TILE_STATUS_DIRTY   3  /**< modified, but not put back yet */
+#define TILE_STATUS_DEFINED 2  /**< defined in FB, but not in local store */
+#define TILE_STATUS_CLEAN   3  /**< in local store, but not changed */
+#define TILE_STATUS_DIRTY   4  /**< modified locally, but not put back yet */
+#define TILE_STATUS_GETTING 5  /**< mfc_get() called but not yet arrived */
 
 extern ubyte tile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 extern ubyte tile_status_z[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
+extern ubyte cur_tile_status_c, cur_tile_status_z;
+
 
 void
 get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf);
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 08b8bf0c9c..a32878d917 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -299,16 +299,23 @@ do_depth_test(int x, int y, unsigned int mask)
 
    zvals.v = eval_z((float) x, (float) y);
 
-   if (tile_status_z[setup.ty][setup.tx] == TILE_STATUS_CLEAR) {
+   if (cur_tile_status_z == TILE_STATUS_CLEAR) {
       /* now, _really_ clear the tile */
       clear_z_tile(&ztile);
+      cur_tile_status_z = TILE_STATUS_DIRTY;
    }
-   else if (tile_status_z[setup.ty][setup.tx] != TILE_STATUS_DIRTY) {
+
+#if 0
+   if (cur_tile_status_z == TILE_STATUS_CLEAR) {
+      /* now, _really_ clear the tile */
+      clear_z_tile(&ztile);
+   }
+   else if (cur_tile_status_z != TILE_STATUS_DIRTY) {
       /* make sure we've got the tile from main mem */
       wait_on_mask(1 << TAG_READ_TILE_Z);
    }
-   tile_status_z[setup.ty][setup.tx] = TILE_STATUS_DIRTY;
-
+   cur_tile_status_z = TILE_STATUS_DIRTY;
+#endif
 
    if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) {
       zvals.v = spu_mul(zvals.v, zscale16.v);
@@ -380,6 +387,9 @@ do_depth_test(int x, int y, unsigned int mask)
       }
    }
 
+   if (mask)
+      cur_tile_status_z = TILE_STATUS_DIRTY;
+
    return mask;
 }
 
@@ -397,15 +407,15 @@ do_depth_test_simd(int x, int y, vector unsigned int quadmask)
 
    zvals.v = eval_z((float) x, (float) y);
 
-   if (tile_status_z[setup.ty][setup.tx] == TILE_STATUS_CLEAR) {
+   if (cur_tile_status_z == TILE_STATUS_CLEAR) {
       /* now, _really_ clear the tile */
       clear_z_tile(&ztile);
    }
-   else if (tile_status_z[setup.ty][setup.tx] != TILE_STATUS_DIRTY) {
+   else if (cur_tile_status_z != TILE_STATUS_DIRTY) {
       /* make sure we've got the tile from main mem */
       wait_on_mask(1 << TAG_READ_TILE_Z);
    }
-   tile_status_z[setup.ty][setup.tx] = TILE_STATUS_DIRTY;
+   cur_tile_status_z = TILE_STATUS_DIRTY;
 
    /* XXX fetch Z value sooner to hide latency here */
    zmask = spu_cmpgt(ztile.f4[ix][iy].v, zvals.v);
@@ -462,15 +472,23 @@ emit_quad( int x, int y, mask_t mask )
    if (mask)
 #endif
    {
-      if (tile_status[setup.ty][setup.tx] == TILE_STATUS_CLEAR) {
+      if (cur_tile_status_c == TILE_STATUS_CLEAR) {
          /* now, _really_ clear the tile */
          clear_c_tile(&ctile);
       }
-      else if (tile_status[setup.ty][setup.tx] != TILE_STATUS_DIRTY) {
+
+#if 0
+      if (cur_tile_status_c == TILE_STATUS_CLEAR) {
+         /* now, _really_ clear the tile */
+         clear_c_tile(&ctile);
+         cur_tile_status_c = TILE_STATUS_DIRTY;
+      }
+      else if (cur_tile_status_c != TILE_STATUS_DIRTY) {
          /* make sure we've got the tile from main mem */
          wait_on_mask(1 << TAG_READ_TILE_COLOR);
       }
-      tile_status[setup.ty][setup.tx] = TILE_STATUS_DIRTY;
+#endif
+      cur_tile_status_c = TILE_STATUS_DIRTY;
 
 #if SIMD_Z
       if (spu_extract(mask, 0))
@@ -970,7 +988,7 @@ static void subtriangle( struct edge *eleft,
  * Draw triangle into tile at (tx, ty) (tile coords)
  * The tile data should have already been fetched.
  */
-void
+boolean
 tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
 {
    setup.tx = tx;
@@ -985,7 +1003,7 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
    if (!setup_sort_vertices((struct vertex_header *) v0,
                             (struct vertex_header *) v1,
                             (struct vertex_header *) v2)) {
-      return; /* totally clipped */
+      return FALSE; /* totally clipped */
    }
 
    setup_tri_coefficients();
@@ -999,6 +1017,24 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
 
    /*   init_constant_attribs( setup ); */
       
+   if (cur_tile_status_c == TILE_STATUS_GETTING) {
+      /* wait for mfc_get() to complete */
+      wait_on_mask(1 << TAG_READ_TILE_COLOR);
+      cur_tile_status_c = TILE_STATUS_CLEAN;
+   }
+
+   ASSERT(cur_tile_status_c != TILE_STATUS_DEFINED);
+
+   if (spu.depth_stencil.depth.enabled) {
+      if (cur_tile_status_z == TILE_STATUS_GETTING) {
+         /* wait for mfc_get() to complete */
+         wait_on_mask(1 << TAG_READ_TILE_Z);
+         cur_tile_status_z = TILE_STATUS_CLEAN;
+      }
+   ASSERT(cur_tile_status_z != TILE_STATUS_DEFINED);
+   }
+
+
    if (setup.oneoverarea < 0.0) {
       /* emaj on left:
        */
@@ -1013,4 +1049,6 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
    }
 
    flush_spans();
+
+   return TRUE;
 }
diff --git a/src/mesa/pipe/cell/spu/spu_tri.h b/src/mesa/pipe/cell/spu/spu_tri.h
index 86c42b6339..aa694dd7c9 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.h
+++ b/src/mesa/pipe/cell/spu/spu_tri.h
@@ -30,7 +30,7 @@
 #define SPU_TRI_H
 
 
-extern void
+extern boolean
 tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty);
 
 
-- 
cgit v1.2.3


From 59be082909de6021ec7d08476253bd4c9920e137 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 13:45:58 -0700
Subject: Cell: implement Z16 and Z32 testing with SIMD instructions.

---
 src/mesa/pipe/cell/spu/spu_tile.h  |   3 +-
 src/mesa/pipe/cell/spu/spu_tri.c   | 222 +++++--------------------------------
 src/mesa/pipe/cell/spu/spu_ztest.h | 135 ++++++++++++++++++++++
 3 files changed, 163 insertions(+), 197 deletions(-)
 create mode 100644 src/mesa/pipe/cell/spu/spu_ztest.h

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_tile.h b/src/mesa/pipe/cell/spu/spu_tile.h
index 1f123a2b7b..4b1ef2a4c8 100644
--- a/src/mesa/pipe/cell/spu/spu_tile.h
+++ b/src/mesa/pipe/cell/spu/spu_tile.h
@@ -42,7 +42,8 @@
 typedef union {
    ushort t16[TILE_SIZE][TILE_SIZE];
    uint   t32[TILE_SIZE][TILE_SIZE];
-   float4 f4[TILE_SIZE/2][TILE_SIZE/2];
+   vector unsigned short us8[TILE_SIZE/2][TILE_SIZE/4];
+   vector unsigned int ui4[TILE_SIZE/2][TILE_SIZE/2];
 } tile_t;
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index a32878d917..a26a4f098d 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -39,18 +39,11 @@
 #include "spu_tile.h"
 #include "spu_tri.h"
 
+#include "spu_ztest.h"
 
-/*
- * If SIMD_Z=1 the Z buffer is floating point and we use vector instructions
- * to do Z testing/updating.
- */
-#define SIMD_Z 0
 
-#if SIMD_Z
+/** Masks are uint[4] vectors with each element being 0 or 0xffffffff */
 typedef vector unsigned int mask_t;
-#else
-typedef uint mask_t;
-#endif
 
 
 /**
@@ -282,20 +275,11 @@ pack_colors(uint uicolors[4], const float4 fcolors[4])
 }
 
 
-
-static unsigned int
-do_depth_test(int x, int y, unsigned int mask)
+static INLINE mask_t
+do_depth_test(int x, int y, mask_t quadmask)
 {
-   static const float4 zscale16
-      = {.f={65535.0, 65535.0, 65535.0, 65535.0}};
-   static const float4 zscale32
-      = {.f={(float)0xffffffff,
-             (float)0xffffffff,
-             (float)0xffffffff,
-             (float)0xffffffff}};
-   int ix = x - setup.cliprect_minx;
-   int iy = y - setup.cliprect_miny;
    float4 zvals;
+   mask_t mask;
 
    zvals.v = eval_z((float) x, (float) y);
 
@@ -305,129 +289,20 @@ do_depth_test(int x, int y, unsigned int mask)
       cur_tile_status_z = TILE_STATUS_DIRTY;
    }
 
-#if 0
-   if (cur_tile_status_z == TILE_STATUS_CLEAR) {
-      /* now, _really_ clear the tile */
-      clear_z_tile(&ztile);
-   }
-   else if (cur_tile_status_z != TILE_STATUS_DIRTY) {
-      /* make sure we've got the tile from main mem */
-      wait_on_mask(1 << TAG_READ_TILE_Z);
-   }
-   cur_tile_status_z = TILE_STATUS_DIRTY;
-#endif
-
    if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) {
-      zvals.v = spu_mul(zvals.v, zscale16.v);
-      if (mask & MASK_TOP_LEFT) {
-         uint z = (uint) zvals.f[0];
-         if (z < ztile.t16[iy][ix])
-            ztile.t16[iy][ix] = z;
-         else
-            mask &= ~MASK_TOP_LEFT;
-      }
-
-      if (mask & MASK_TOP_RIGHT) {
-         uint z = (uint) zvals.f[1];
-         if (z < ztile.t16[iy][ix+1])
-            ztile.t16[iy][ix+1] = z;
-         else
-            mask &= ~MASK_TOP_RIGHT;
-      }
-
-      if (mask & MASK_BOTTOM_LEFT) {
-         uint z = (uint) zvals.f[2];
-         if (z < ztile.t16[iy+1][ix])
-            ztile.t16[iy+1][ix] = z;
-         else
-            mask &= ~MASK_BOTTOM_LEFT;
-      }
-
-      if (mask & MASK_BOTTOM_RIGHT) {
-         uint z = (uint) zvals.f[3];
-         if (z < ztile.t16[iy+1][ix+1])
-            ztile.t16[iy+1][ix+1] = z;
-         else
-            mask &= ~MASK_BOTTOM_RIGHT;
-      }
+      int ix = (x - setup.cliprect_minx) / 4;
+      int iy = (y - setup.cliprect_miny) / 2;
+      mask = spu_z16_test_less(zvals.v, &ztile.us8[iy][ix], x>>1, quadmask);
    }
    else {
-      zvals.v = spu_mul(zvals.v, zscale32.v);
-      ASSERT(spu.fb.depth_format == PIPE_FORMAT_Z32_UNORM);
-      if (mask & MASK_TOP_LEFT) {
-         uint z = (uint) zvals.f[0];
-         if (z < ztile.t32[iy][ix])
-            ztile.t32[iy][ix] = z;
-         else
-            mask &= ~MASK_TOP_LEFT;
-      }
-
-      if (mask & MASK_TOP_RIGHT) {
-         uint z = (uint) zvals.f[1];
-         if (z < ztile.t32[iy][ix+1])
-            ztile.t32[iy][ix+1] = z;
-         else
-            mask &= ~MASK_TOP_RIGHT;
-      }
-
-      if (mask & MASK_BOTTOM_LEFT) {
-         uint z = (uint) zvals.f[2];
-         if (z < ztile.t32[iy+1][ix])
-            ztile.t32[iy+1][ix] = z;
-         else
-            mask &= ~MASK_BOTTOM_LEFT;
-      }
-
-      if (mask & MASK_BOTTOM_RIGHT) {
-         uint z = (uint) zvals.f[3];
-         if (z < ztile.t32[iy+1][ix+1])
-            ztile.t32[iy+1][ix+1] = z;
-         else
-            mask &= ~MASK_BOTTOM_RIGHT;
-      }
+      int ix = (x - setup.cliprect_minx) / 2;
+      int iy = (y - setup.cliprect_miny) / 2;
+      mask = spu_z32_test_less(zvals.v, &ztile.ui4[iy][ix], quadmask);
    }
-
-   if (mask)
-      cur_tile_status_z = TILE_STATUS_DIRTY;
-
    return mask;
 }
 
 
-
-
-static vector unsigned int
-do_depth_test_simd(int x, int y, vector unsigned int quadmask)
-{
-   int ix = (x - setup.cliprect_minx) / 2;
-   int iy = (y - setup.cliprect_miny) / 2;
-   float4 zvals;
-
-   vector unsigned int zmask;
-
-   zvals.v = eval_z((float) x, (float) y);
-
-   if (cur_tile_status_z == TILE_STATUS_CLEAR) {
-      /* now, _really_ clear the tile */
-      clear_z_tile(&ztile);
-   }
-   else if (cur_tile_status_z != TILE_STATUS_DIRTY) {
-      /* make sure we've got the tile from main mem */
-      wait_on_mask(1 << TAG_READ_TILE_Z);
-   }
-   cur_tile_status_z = TILE_STATUS_DIRTY;
-
-   /* XXX fetch Z value sooner to hide latency here */
-   zmask = spu_cmpgt(ztile.f4[ix][iy].v, zvals.v);
-   zmask = spu_and(zmask, quadmask);
-
-   ztile.f4[ix][iy].v = spu_sel(ztile.f4[ix][iy].v, zvals.v, zmask);
-   //ztile.f4[ix][iy].v = spu_sel(zvals.v, ztile.f4[ix][iy].v, mask4);
-
-   return zmask;
-}
-
-
 /**
  * Emit a quad (pass to next stage).  No clipping is done.
  */
@@ -461,36 +336,18 @@ emit_quad( int x, int y, mask_t mask )
    }
 
    if (spu.depth_stencil.depth.enabled) {
-#if SIMD_Z
-      mask = do_depth_test_simd(x, y, mask);
-#else
       mask = do_depth_test(x, y, mask);
-#endif
    }
 
-#if !SIMD_Z
-   if (mask)
-#endif
-   {
-      if (cur_tile_status_c == TILE_STATUS_CLEAR) {
-         /* now, _really_ clear the tile */
-         clear_c_tile(&ctile);
-      }
+   /* If any bits in mask are set... */
+   if (spu_extract(spu_orx(mask), 0)) {
 
-#if 0
       if (cur_tile_status_c == TILE_STATUS_CLEAR) {
          /* now, _really_ clear the tile */
          clear_c_tile(&ctile);
-         cur_tile_status_c = TILE_STATUS_DIRTY;
       }
-      else if (cur_tile_status_c != TILE_STATUS_DIRTY) {
-         /* make sure we've got the tile from main mem */
-         wait_on_mask(1 << TAG_READ_TILE_COLOR);
-      }
-#endif
       cur_tile_status_c = TILE_STATUS_DIRTY;
 
-#if SIMD_Z
       if (spu_extract(mask, 0))
          ctile.t32[iy][ix] = colors[QUAD_TOP_LEFT];
       if (spu_extract(mask, 1))
@@ -499,20 +356,11 @@ emit_quad( int x, int y, mask_t mask )
          ctile.t32[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
       if (spu_extract(mask, 3))
          ctile.t32[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
-#elif 0
+
+#if 0
       /* SIMD_Z with swizzled color buffer (someday) */
       vector float icolors = *((vector float *) &colors);
       ctile.f4[iy/2][ix/2].v = spu_sel(ctile.f4[iy/2][ix/2].v, icolors, mask);
-
-#else
-      if (mask & MASK_TOP_LEFT)
-         ctile.t32[iy][ix] = colors[QUAD_TOP_LEFT];
-      if (mask & MASK_TOP_RIGHT)
-         ctile.t32[iy][ix+1] = colors[QUAD_TOP_RIGHT];
-      if (mask & MASK_BOTTOM_LEFT)
-         ctile.t32[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
-      if (mask & MASK_BOTTOM_RIGHT)
-         ctile.t32[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
 #endif
    }
 
@@ -533,38 +381,20 @@ static INLINE int block( int x )
 /**
  * Compute mask which indicates which pixels in the 2x2 quad are actually inside
  * the triangle's bounds.
- *
- * this is pretty nasty...  may need to rework flush_spans again to
- * fix it, if possible.
+ * The mask is a uint4 vector and each element will be 0 or 0xffffffff.
  */
-static mask_t calculate_mask( int x )
+static INLINE mask_t calculate_mask( int x )
 {
-#if SIMD_Z
-   uint m0, m1, m2, m3;
-
-   m0 = (x >= setup.span.left[0] && x < setup.span.right[0]) * ~0;
-   m1 = (x+1 >= setup.span.left[0] && x+1 < setup.span.right[0]) * ~0;
-   m2 = (x >= setup.span.left[1] && x < setup.span.right[1]) * ~0;
-   m3 = (x+1 >= setup.span.left[1] && x+1 < setup.span.right[1]) * ~0;
-
-   return (vector unsigned int) {m0, m1, m2, m3};
-#else
-   unsigned mask = 0x0;
-
-   if (x >= setup.span.left[0] && x < setup.span.right[0]) 
-      mask |= MASK_TOP_LEFT;
-
-   if (x >= setup.span.left[1] && x < setup.span.right[1]) 
-      mask |= MASK_BOTTOM_LEFT;
-      
-   if (x+1 >= setup.span.left[0] && x+1 < setup.span.right[0]) 
-      mask |= MASK_TOP_RIGHT;
-
-   if (x+1 >= setup.span.left[1] && x+1 < setup.span.right[1]) 
-      mask |= MASK_BOTTOM_RIGHT;
-
+   /* This is a little tricky.
+    * Use & instead of && to avoid branches.
+    * Use negation to convert true/false to ~0/0 values.
+    */
+   mask_t mask;
+   mask = spu_insert(-((x   >= setup.span.left[0]) & (x   < setup.span.right[0])), mask, 0);
+   mask = spu_insert(-((x+1 >= setup.span.left[0]) & (x+1 < setup.span.right[0])), mask, 1);
+   mask = spu_insert(-((x   >= setup.span.left[1]) & (x   < setup.span.right[1])), mask, 2);
+   mask = spu_insert(-((x+1 >= setup.span.left[1]) & (x+1 < setup.span.right[1])), mask, 3);
    return mask;
-#endif
 }
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_ztest.h b/src/mesa/pipe/cell/spu/spu_ztest.h
new file mode 100644
index 0000000000..5fefb15176
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_ztest.h
@@ -0,0 +1,135 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+/**
+ * Zbuffer/depth test code.
+ */
+
+
+#ifndef SPU_ZTEST_H
+#define SPU_ZTEST_H
+
+
+#ifdef __SPU__
+#include <spu_intrinsics.h>
+#endif
+
+
+
+/**
+ * Perform Z testing for a 16-bit/value Z buffer.
+ *
+ * \param zvals  vector of four fragment zvalues as floats
+ * \param zbuf   ptr to vector of ushort[8] zbuffer values.  Note that this
+ *               contains the Z values for 2 quads, 8 pixels.
+ * \param x      x coordinate of quad (only lsbit is significant)
+ * \param inMask indicates which fragments in the quad are alive
+ * \return new mask indicating which fragments are alive after ztest
+ */
+static INLINE vector unsigned int
+spu_z16_test_less(vector float zvals, vector unsigned short *zbuf,
+                  uint x, vector unsigned int inMask)
+{
+#define ZERO 0x80
+   vector unsigned int zvals_ui4, zbuf_ui4, mask;
+
+   /* convert floats to uints in [0, 65535] */
+   zvals_ui4 = spu_convtu(zvals, 32); /* convert to [0, 2^32] */
+   zvals_ui4 = spu_rlmask(zvals_ui4, -16);  /* right shift 16 */
+
+   /* XXX this conditional could be removed with a bit of work */
+   if (x & 1) {
+      /* convert zbuffer values from ushorts to uints */
+      /* gather lower four ushorts */
+      zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf,
+                             (vector unsigned int) *zbuf,
+                             VEC_LITERAL(vector unsigned char,
+                                      ZERO, ZERO,  8,  9, ZERO, ZERO, 10, 11,
+                                      ZERO, ZERO, 12, 13, ZERO, ZERO, 14, 15));
+      /* mask = (zbuf_ui4 > zvals_ui4) ? ~0 : 0 */
+      mask = spu_cmpgt(zbuf_ui4, zvals_ui4);
+      /* mask &= inMask */
+      mask = spu_and(mask, inMask);
+      /* zbuf = mask ? zval : zbuf */
+      zbuf_ui4 = spu_sel(zbuf_ui4, zvals_ui4, mask);
+      /* convert uints back to ushorts; bytes 0..7 of *zbuf are preserved */
+      *zbuf = (vector unsigned short)
+         spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf,
+                     VEC_LITERAL(vector unsigned char,
+                                 16, 17, 18, 19, 20, 21, 22, 23,
+                                 2, 3, 6, 7, 10, 11, 14, 15));
+   }
+   else {
+      /* convert zbuffer values from ushorts to uints */
+      /* gather upper four ushorts */
+      zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf,
+                             (vector unsigned int) *zbuf,
+                             VEC_LITERAL(vector unsigned char,
+                                         ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3,
+                                         ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7));
+      /* mask = (zbuf_ui4 > zvals_ui4) ? ~0 : 0 */
+      mask = spu_cmpgt(zbuf_ui4, zvals_ui4);
+      /* mask &= inMask */
+      mask = spu_and(mask, inMask);
+      /* zbuf = mask ? zval : zbuf */
+      zbuf_ui4 = spu_sel(zbuf_ui4, zvals_ui4, mask);
+      /* convert uints back to ushorts; bytes 8..15 of *zbuf are preserved */
+      *zbuf = (vector unsigned short)
+         spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf,
+                     VEC_LITERAL(vector unsigned char,
+                                 2, 3, 6, 7, 10, 11, 14, 15,
+                                 24, 25, 26, 27, 28, 29, 30, 31));
+   }
+   return mask;
+#undef ZERO
+}
+
+
+/**
+ * As above, but Zbuffer values as 32-bit uints
+ */
+static INLINE vector unsigned int
+spu_z32_test_less(vector float zvals, vector unsigned int *zbuf_ptr,
+                  vector unsigned int inMask)
+{
+   vector unsigned int zvals_ui4, mask, zbuf = *zbuf_ptr;
+
+   /* convert floats to uints in [0, 0xffffffff] */
+   zvals_ui4 = spu_convtu(zvals, 32);
+   /* mask = (zbuf > zvals_ui4) ? ~0 : 0 */
+   mask = spu_cmpgt(zbuf, zvals_ui4);
+   /* mask &= inMask */
+   mask = spu_and(mask, inMask);
+   /* zbuf = mask ? zval : zbuf */
+   *zbuf_ptr = spu_sel(zbuf, zvals_ui4, mask);
+
+   return mask;
+}
+
+
+#endif /* SPU_ZTEST_H */
-- 
cgit v1.2.3


From c392cc8f1bcaaecc2cc723fc5550e5f6462602f3 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 13:49:51 -0700
Subject: Cell: rename fields of the tile_t union

---
 src/mesa/pipe/cell/spu/spu_main.c    |  8 ++++++++
 src/mesa/pipe/cell/spu/spu_texture.c |  6 +++---
 src/mesa/pipe/cell/spu/spu_tile.c    |  4 ++--
 src/mesa/pipe/cell/spu/spu_tile.h    | 18 +++++-------------
 src/mesa/pipe/cell/spu/spu_tri.c     |  8 ++++----
 5 files changed, 22 insertions(+), 22 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index d6393048f5..7d6e910ad5 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -36,6 +36,7 @@
 #include "spu_render.h"
 #include "spu_texture.h"
 #include "spu_tile.h"
+//#include "spu_test.h"
 #include "spu_vertex_shader.h"
 #include "pipe/cell/common.h"
 #include "pipe/p_defines.h"
@@ -495,6 +496,7 @@ one_time_init(void)
 }
 
 
+
 /* In some versions of the SDK the SPE main takes 'unsigned long' as a
  * parameter.  In others it takes 'unsigned long long'.  Use a define to
  * select between the two.
@@ -515,6 +517,8 @@ main(main_param_t speid, main_param_t argp)
 
    (void) speid;
 
+   ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4);
+
    one_time_init();
 
    if (Debug)
@@ -528,6 +532,10 @@ main(main_param_t speid, main_param_t argp)
            0  /* rid */);
    wait_on_mask( 1 << tag );
 
+#if 0
+   if (spu.init.id==0)
+      spu_test_misc();
+#endif
 
    main_loop();
 
diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
index 7a1ca097c0..c1dc6bfe90 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.c
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -97,10 +97,10 @@ get_tex_tile(uint i, uint j)
              spu.init.id, src, tex_tiles[pos].t32);
 #endif
 
-      ASSERT_ALIGN16(tex_tiles[pos].t32);
+      ASSERT_ALIGN16(tex_tiles[pos].ui);
       ASSERT_ALIGN16(src);
 
-      mfc_get(tex_tiles[pos].t32,  /* dest */
+      mfc_get(tex_tiles[pos].ui,  /* dest */
               (unsigned int) src,
               bytes_per_tile,      /* size */
               TAG_TEXTURE_TILE,
@@ -134,6 +134,6 @@ sample_texture(float4 texcoord)
    uint i = (uint) (texcoord.f[0] * spu.texture.width) % spu.texture.width;
    uint j = (uint) (texcoord.f[1] * spu.texture.height) % spu.texture.height;
    uint pos = get_tex_tile(i, j);
-   uint texel = tex_tiles[pos].t32[j % TILE_SIZE][i % TILE_SIZE];
+   uint texel = tex_tiles[pos].ui[j % TILE_SIZE][i % TILE_SIZE];
    return texel;
 }
diff --git a/src/mesa/pipe/cell/spu/spu_tile.c b/src/mesa/pipe/cell/spu/spu_tile.c
index aea4785bc2..fd65c2b49c 100644
--- a/src/mesa/pipe/cell/spu/spu_tile.c
+++ b/src/mesa/pipe/cell/spu/spu_tile.c
@@ -56,7 +56,7 @@ get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf)
    printf("get_tile:  dest: %p  src: 0x%x  size: %d\n",
           tile, (unsigned int) src, bytesPerTile);
    */
-   mfc_get(tile->t32,  /* dest in local memory */
+   mfc_get(tile->ui,  /* dest in local memory */
            (unsigned int) src, /* src in main memory */
            bytesPerTile,
            tag,
@@ -82,7 +82,7 @@ put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf)
           spu.init.id,
           tile, (unsigned int) dst, bytesPerTile);
    */
-   mfc_put((void *) tile->t32,  /* src in local memory */
+   mfc_put((void *) tile->ui,  /* src in local memory */
            (unsigned int) dst,  /* dst in main memory */
            bytesPerTile,
            tag,
diff --git a/src/mesa/pipe/cell/spu/spu_tile.h b/src/mesa/pipe/cell/spu/spu_tile.h
index 4b1ef2a4c8..85a0d55807 100644
--- a/src/mesa/pipe/cell/spu/spu_tile.h
+++ b/src/mesa/pipe/cell/spu/spu_tile.h
@@ -40,8 +40,8 @@
 
 
 typedef union {
-   ushort t16[TILE_SIZE][TILE_SIZE];
-   uint   t32[TILE_SIZE][TILE_SIZE];
+   ushort us[TILE_SIZE][TILE_SIZE];
+   uint   ui[TILE_SIZE][TILE_SIZE];
    vector unsigned short us8[TILE_SIZE/2][TILE_SIZE/4];
    vector unsigned int ui4[TILE_SIZE/2][TILE_SIZE/2];
 } tile_t;
@@ -74,7 +74,7 @@ put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf);
 static INLINE void
 clear_c_tile(tile_t *ctile)
 {
-   memset32((uint*) ctile->t32,
+   memset32((uint*) ctile->ui,
             spu.fb.color_clear_value,
             TILE_SIZE * TILE_SIZE);
 }
@@ -84,23 +84,15 @@ static INLINE void
 clear_z_tile(tile_t *ztile)
 {
    if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) {
-      memset16((ushort*) ztile->t16,
+      memset16((ushort*) ztile->us,
                spu.fb.depth_clear_value,
                TILE_SIZE * TILE_SIZE);
    }
    else {
       ASSERT(spu.fb.depth_format == PIPE_FORMAT_Z32_UNORM);
-#if SIMD_Z
-      union fi z;
-      z.f = 1.0;
-      memset32((uint*) ztile->t32,
-               z.i,/*spu.fb.depth_clear_value,*/
-               TILE_SIZE * TILE_SIZE);
-#else
-      memset32((uint*) ztile->t32,
+      memset32((uint*) ztile->ui,
                spu.fb.depth_clear_value,
                TILE_SIZE * TILE_SIZE);
-#endif
    }
 }
 
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index a26a4f098d..b04b6841c0 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -349,13 +349,13 @@ emit_quad( int x, int y, mask_t mask )
       cur_tile_status_c = TILE_STATUS_DIRTY;
 
       if (spu_extract(mask, 0))
-         ctile.t32[iy][ix] = colors[QUAD_TOP_LEFT];
+         ctile.ui[iy][ix] = colors[QUAD_TOP_LEFT];
       if (spu_extract(mask, 1))
-         ctile.t32[iy][ix+1] = colors[QUAD_TOP_RIGHT];
+         ctile.ui[iy][ix+1] = colors[QUAD_TOP_RIGHT];
       if (spu_extract(mask, 2))
-         ctile.t32[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
+         ctile.ui[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
       if (spu_extract(mask, 3))
-         ctile.t32[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
+         ctile.ui[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
 
 #if 0
       /* SIMD_Z with swizzled color buffer (someday) */
-- 
cgit v1.2.3


From 0e9a370ae2fa7a6d8bbc7d236e63dae1e3dcac37 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 14:02:22 -0700
Subject: Cell: move ztest before color interp/packing

---
 src/mesa/pipe/cell/spu/spu_tri.c | 43 ++++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 22 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index b04b6841c0..ae8fd17cc6 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -316,24 +316,6 @@ emit_quad( int x, int y, mask_t mask )
    setup.quad.mask = mask;
    sp->quad.first->run(sp->quad.first, &setup.quad);
 #else
-   /* Cell: "write" quad fragments to the tile by setting prim color */
-   const int ix = x - setup.cliprect_minx;
-   const int iy = y - setup.cliprect_miny;
-   uint colors[4];  /* indexed by QUAD_x */
-
-   if (spu.texture.start) {
-      float4 texcoords[4];
-      uint i;
-      eval_coeff(2, (float) x, (float) y, texcoords);
-      for (i = 0; i < 4; i++) {
-         colors[i] = sample_texture(texcoords[i]);
-      }
-   }
-   else {
-      float4 fcolors[4];
-      eval_coeff(1, (float) x, (float) y, fcolors);
-      pack_colors(colors, fcolors);
-   }
 
    if (spu.depth_stencil.depth.enabled) {
       mask = do_depth_test(x, y, mask);
@@ -341,6 +323,23 @@ emit_quad( int x, int y, mask_t mask )
 
    /* If any bits in mask are set... */
    if (spu_extract(spu_orx(mask), 0)) {
+      const int ix = x - setup.cliprect_minx;
+      const int iy = y - setup.cliprect_miny;
+      uint colors[4];  /* indexed by QUAD_x */
+
+      if (spu.texture.start) {
+         float4 texcoords[4];
+         uint i;
+         eval_coeff(2, (float) x, (float) y, texcoords);
+         for (i = 0; i < 4; i++) {
+            colors[i] = sample_texture(texcoords[i]);
+         }
+      }
+      else {
+         float4 fcolors[4];
+         eval_coeff(1, (float) x, (float) y, fcolors);
+         pack_colors(colors, fcolors);
+      }
 
       if (cur_tile_status_c == TILE_STATUS_CLEAR) {
          /* now, _really_ clear the tile */
@@ -348,6 +347,7 @@ emit_quad( int x, int y, mask_t mask )
       }
       cur_tile_status_c = TILE_STATUS_DIRTY;
 
+#if 1
       if (spu_extract(mask, 0))
          ctile.ui[iy][ix] = colors[QUAD_TOP_LEFT];
       if (spu_extract(mask, 1))
@@ -356,11 +356,10 @@ emit_quad( int x, int y, mask_t mask )
          ctile.ui[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
       if (spu_extract(mask, 3))
          ctile.ui[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
-
-#if 0
+#else
       /* SIMD_Z with swizzled color buffer (someday) */
-      vector float icolors = *((vector float *) &colors);
-      ctile.f4[iy/2][ix/2].v = spu_sel(ctile.f4[iy/2][ix/2].v, icolors, mask);
+      vector unsigned int uicolors = *((vector unsigned int *) &colors);
+      ctile.ui4[iy/2][ix/2] = spu_sel(ctile.ui4[iy/2][ix/2], uicolors, mask);
 #endif
    }
 
-- 
cgit v1.2.3


From b1a472bfb7df5ba273574e1799c5b8e85ca5f2d9 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 15:20:07 -0700
Subject: Cell: remove commands from top-level while loop which should only
 appear in batch buffers

---
 src/mesa/pipe/cell/spu/spu_main.c | 20 ++++----------------
 1 file changed, 4 insertions(+), 16 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 7d6e910ad5..1760de02b7 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -447,34 +447,22 @@ main_loop(void)
               0  /* rid */);
       wait_on_mask( 1 << tag );
 
+      /*
+       * NOTE: most commands should be contained in a batch buffer
+       */
+
       switch (opcode & CELL_CMD_OPCODE_MASK) {
       case CELL_CMD_EXIT:
          if (Debug)
             printf("SPU %u: EXIT\n", spu.init.id);
          exitFlag = 1;
          break;
-      case CELL_CMD_STATE_FRAMEBUFFER:
-         cmd_state_framebuffer(&cmd.fb);
-         break;
-      case CELL_CMD_CLEAR_SURFACE:
-         cmd_clear_surface(&cmd.clear);
-         break;
-      case CELL_CMD_RENDER:
-         {
-            uint pos_incr;
-            cmd_render(&cmd.render, &pos_incr);
-            assert(pos_incr == 0);
-         }
-         break;
       case CELL_CMD_VS_EXECUTE:
          spu_execute_vertex_shader(&draw, &cmd.vs);
          break;
       case CELL_CMD_BATCH:
          cmd_batch(opcode);
          break;
-      case CELL_CMD_FINISH:
-         cmd_finish();
-         break;
       default:
          printf("Bad opcode!\n");
       }
-- 
cgit v1.2.3


From 17305489f0d2a0681d4c0d4952957af517019ab6 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 15:24:00 -0700
Subject: Cell: deprecate some use of struct cell_command - it should go away
 completely

Also, remove ALIGN16_ATTRIB from structs that no longer need it.
---
 src/mesa/pipe/cell/common.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index 05aeed83ab..7e193f31be 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -105,7 +105,7 @@ struct cell_command_framebuffer
    int width, height;
    void *color_start, *depth_start;
    enum pipe_format color_format, depth_format;
-} ALIGN16_ATTRIB;
+};
 
 
 /**
@@ -116,7 +116,7 @@ struct cell_command_clear_surface
    uint opcode;
    uint surface; /**< Temporary: 0=color, 1=Z */
    uint value;
-} ALIGN16_ATTRIB;
+};
 
 
 /**
@@ -173,7 +173,7 @@ struct cell_command_render
    uint dummy3;
    uint min_index;
    boolean inline_verts;
-} ALIGN16_ATTRIB;
+};
 
 
 struct cell_command_release_verts
@@ -191,11 +191,14 @@ struct cell_command_texture
 
 
 /** XXX unions don't seem to work */
+/* XXX this should go away; all commands should be placed in batch buffers */
 struct cell_command
 {
+#if 0
    struct cell_command_framebuffer fb;
    struct cell_command_clear_surface clear;
    struct cell_command_render render;
+#endif
    struct cell_command_vs vs;
 } ALIGN16_ATTRIB;
 
-- 
cgit v1.2.3


From 42201d7574ebb1582563988820c248680081c42f Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 15:33:53 -0700
Subject: Cell: rename/move global vars

Put tile-related globals into spu_global struct.
Rename c/ztile fields to be more consistent.
---
 src/mesa/pipe/cell/spu/spu_main.c   | 28 +++++++++++++-------------
 src/mesa/pipe/cell/spu/spu_main.h   | 32 +++++++++++++++++++++++++++++
 src/mesa/pipe/cell/spu/spu_render.c | 40 ++++++++++++++++++-------------------
 src/mesa/pipe/cell/spu/spu_tile.c   | 11 +---------
 src/mesa/pipe/cell/spu/spu_tile.h   | 27 -------------------------
 src/mesa/pipe/cell/spu/spu_tri.c    | 38 +++++++++++++++++------------------
 6 files changed, 86 insertions(+), 90 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 1760de02b7..8e3987f6ef 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -92,24 +92,24 @@ really_clear_tiles(uint surfaceIndex)
    uint i;
 
    if (surfaceIndex == 0) {
-      clear_c_tile(&ctile);
+      clear_c_tile(&spu.ctile);
 
       for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
          uint tx = i % spu.fb.width_tiles;
          uint ty = i / spu.fb.width_tiles;
-         if (tile_status[ty][tx] == TILE_STATUS_CLEAR) {
-            put_tile(tx, ty, &ctile, TAG_SURFACE_CLEAR, 0);
+         if (spu.ctile_status[ty][tx] == TILE_STATUS_CLEAR) {
+            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
          }
       }
    }
    else {
-      clear_z_tile(&ztile);
+      clear_z_tile(&spu.ztile);
 
       for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
          uint tx = i % spu.fb.width_tiles;
          uint ty = i / spu.fb.width_tiles;
-         if (tile_status_z[ty][tx] == TILE_STATUS_CLEAR)
-            put_tile(tx, ty, &ctile, TAG_SURFACE_CLEAR, 1);
+         if (spu.ztile_status[ty][tx] == TILE_STATUS_CLEAR)
+            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 1);
       }
    }
 
@@ -133,11 +133,11 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear)
 #if CLEAR_OPT
    /* set all tile's status to CLEAR */
    if (clear->surface == 0) {
-      memset(tile_status, TILE_STATUS_CLEAR, sizeof(tile_status));
+      memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status));
       spu.fb.color_clear_value = clear->value;
    }
    else {
-      memset(tile_status_z, TILE_STATUS_CLEAR, sizeof(tile_status_z));
+      memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status));
       spu.fb.depth_clear_value = clear->value;
    }
    return;
@@ -145,11 +145,11 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear)
 
    if (clear->surface == 0) {
       spu.fb.color_clear_value = clear->value;
-      clear_c_tile(&ctile);
+      clear_c_tile(&spu.ctile);
    }
    else {
       spu.fb.depth_clear_value = clear->value;
-      clear_z_tile(&ztile);
+      clear_z_tile(&spu.ztile);
    }
 
    /*
@@ -161,9 +161,9 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear)
       uint tx = i % spu.fb.width_tiles;
       uint ty = i / spu.fb.width_tiles;
       if (clear->surface == 0)
-         put_tile(tx, ty, &ctile, TAG_SURFACE_CLEAR, 0);
+         put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
       else
-         put_tile(tx, ty, &ztile, TAG_SURFACE_CLEAR, 1);
+         put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1);
       /* XXX we don't want this here, but it fixes bad tile results */
    }
 
@@ -478,8 +478,8 @@ main_loop(void)
 static void
 one_time_init(void)
 {
-   memset(tile_status, TILE_STATUS_DEFINED, sizeof(tile_status));
-   memset(tile_status_z, TILE_STATUS_DEFINED, sizeof(tile_status_z));
+   memset(spu.ctile_status, TILE_STATUS_DEFINED, sizeof(spu.ctile_status));
+   memset(spu.ztile_status, TILE_STATUS_DEFINED, sizeof(spu.ztile_status));
    invalidate_tex_cache();
 }
 
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index 8be5268f52..cce5e70802 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -36,6 +36,11 @@
 #include "pipe/p_state.h"
 
 
+
+#define MAX_WIDTH 1024
+#define MAX_HEIGHT 1024
+
+
 typedef union
 {
    vector float v;
@@ -43,6 +48,21 @@ typedef union
 } float4;
 
 
+typedef union {
+   ushort us[TILE_SIZE][TILE_SIZE];
+   uint   ui[TILE_SIZE][TILE_SIZE];
+   vector unsigned short us8[TILE_SIZE/2][TILE_SIZE/4];
+   vector unsigned int ui4[TILE_SIZE/2][TILE_SIZE/2];
+} tile_t;
+
+
+#define TILE_STATUS_CLEAR   1
+#define TILE_STATUS_DEFINED 2  /**< defined in FB, but not in local store */
+#define TILE_STATUS_CLEAN   3  /**< in local store, but not changed */
+#define TILE_STATUS_DIRTY   4  /**< modified locally, but not put back yet */
+#define TILE_STATUS_GETTING 5  /**< mfc_get() called but not yet arrived */
+
+
 struct spu_framebuffer {
    void *color_start;              /**< addr of color surface in main memory */
    void *depth_start;              /**< addr of depth surface in main memory */
@@ -75,6 +95,18 @@ struct spu_global
 
    /* XXX more state to come */
 
+
+   /** current color and Z tiles */
+   tile_t ctile ALIGN16_ATTRIB;
+   tile_t ztile ALIGN16_ATTRIB;
+
+   /** Current tiles' status */
+   ubyte cur_ctile_status, cur_ztile_status;
+
+   /** Status of all tiles in framebuffer */
+   ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
+   ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
+
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_render.c b/src/mesa/pipe/cell/spu/spu_render.c
index ca54a103bd..ab711d67fe 100644
--- a/src/mesa/pipe/cell/spu/spu_render.c
+++ b/src/mesa/pipe/cell/spu/spu_render.c
@@ -95,15 +95,15 @@ static INLINE void
 get_cz_tiles(uint tx, uint ty)
 {
    if (spu.depth_stencil.depth.enabled) {
-      if (cur_tile_status_z != TILE_STATUS_CLEAR) {
-         get_tile(tx, ty, &ztile, TAG_READ_TILE_Z, 1);
-         cur_tile_status_z = TILE_STATUS_GETTING;
+      if (spu.cur_ztile_status != TILE_STATUS_CLEAR) {
+         get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1);
+         spu.cur_ztile_status = TILE_STATUS_GETTING;
       }
    }
 
-   if (cur_tile_status_c != TILE_STATUS_CLEAR) {
-      get_tile(tx, ty, &ctile, TAG_READ_TILE_COLOR, 0);
-      cur_tile_status_c = TILE_STATUS_GETTING;
+   if (spu.cur_ctile_status != TILE_STATUS_CLEAR) {
+      get_tile(tx, ty, &spu.ctile, TAG_READ_TILE_COLOR, 0);
+      spu.cur_ctile_status = TILE_STATUS_GETTING;
    }
 }
 
@@ -114,24 +114,24 @@ get_cz_tiles(uint tx, uint ty)
 static INLINE void
 put_cz_tiles(uint tx, uint ty)
 {
-   if (cur_tile_status_z == TILE_STATUS_DIRTY) {
+   if (spu.cur_ztile_status == TILE_STATUS_DIRTY) {
       /* tile was modified and needs to be written back */
-      put_tile(tx, ty, &ztile, TAG_WRITE_TILE_Z, 1);
-      cur_tile_status_z = TILE_STATUS_DEFINED;
+      put_tile(tx, ty, &spu.ztile, TAG_WRITE_TILE_Z, 1);
+      spu.cur_ztile_status = TILE_STATUS_DEFINED;
    }
-   else if (cur_tile_status_z == TILE_STATUS_GETTING) {
+   else if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
       /* tile was never used */
-      cur_tile_status_z = TILE_STATUS_DEFINED;
+      spu.cur_ztile_status = TILE_STATUS_DEFINED;
    }
 
-   if (cur_tile_status_c == TILE_STATUS_DIRTY) {
+   if (spu.cur_ctile_status == TILE_STATUS_DIRTY) {
       /* tile was modified and needs to be written back */
-      put_tile(tx, ty, &ctile, TAG_WRITE_TILE_COLOR, 0);
-      cur_tile_status_c = TILE_STATUS_DEFINED;
+      put_tile(tx, ty, &spu.ctile, TAG_WRITE_TILE_COLOR, 0);
+      spu.cur_ctile_status = TILE_STATUS_DEFINED;
    }
-   else if (cur_tile_status_c == TILE_STATUS_GETTING) {
+   else if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
       /* tile was never used */
-      cur_tile_status_c = TILE_STATUS_DEFINED;
+      spu.cur_ctile_status = TILE_STATUS_DEFINED;
    }
 }
 
@@ -250,8 +250,8 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       if (!my_tile(tx, ty))
          continue;
 
-      cur_tile_status_c = tile_status[ty][tx];
-      cur_tile_status_z = tile_status_z[ty][tx];
+      spu.cur_ctile_status = spu.ctile_status[ty][tx];
+      spu.cur_ztile_status = spu.ztile_status[ty][tx];
 
       get_cz_tiles(tx, ty);
 
@@ -275,8 +275,8 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
 
       wait_put_cz_tiles(); /* XXX seems unnecessary... */
 
-      tile_status[ty][tx] = cur_tile_status_c;
-      tile_status_z[ty][tx] = cur_tile_status_z;
+      spu.ctile_status[ty][tx] = spu.cur_ctile_status;
+      spu.ztile_status[ty][tx] = spu.cur_ztile_status;
    }
 
    if (Debug)
diff --git a/src/mesa/pipe/cell/spu/spu_tile.c b/src/mesa/pipe/cell/spu/spu_tile.c
index fd65c2b49c..12dc246328 100644
--- a/src/mesa/pipe/cell/spu/spu_tile.c
+++ b/src/mesa/pipe/cell/spu/spu_tile.c
@@ -28,16 +28,7 @@
 
 
 #include "spu_tile.h"
-
-
-
-tile_t ctile ALIGN16_ATTRIB;
-tile_t ztile ALIGN16_ATTRIB;
-
-ubyte tile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
-ubyte tile_status_z[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
-
-ubyte cur_tile_status_c, cur_tile_status_z;
+#include "spu_main.h"
 
 
 void
diff --git a/src/mesa/pipe/cell/spu/spu_tile.h b/src/mesa/pipe/cell/spu/spu_tile.h
index 85a0d55807..e53340a55a 100644
--- a/src/mesa/pipe/cell/spu/spu_tile.h
+++ b/src/mesa/pipe/cell/spu/spu_tile.h
@@ -35,33 +35,6 @@
 #include "pipe/cell/common.h"
 
 
-#define MAX_WIDTH 1024
-#define MAX_HEIGHT 1024
-
-
-typedef union {
-   ushort us[TILE_SIZE][TILE_SIZE];
-   uint   ui[TILE_SIZE][TILE_SIZE];
-   vector unsigned short us8[TILE_SIZE/2][TILE_SIZE/4];
-   vector unsigned int ui4[TILE_SIZE/2][TILE_SIZE/2];
-} tile_t;
-
-
-extern tile_t ctile ALIGN16_ATTRIB;
-extern tile_t ztile ALIGN16_ATTRIB;
-
-
-#define TILE_STATUS_CLEAR   1
-#define TILE_STATUS_DEFINED 2  /**< defined in FB, but not in local store */
-#define TILE_STATUS_CLEAN   3  /**< in local store, but not changed */
-#define TILE_STATUS_DIRTY   4  /**< modified locally, but not put back yet */
-#define TILE_STATUS_GETTING 5  /**< mfc_get() called but not yet arrived */
-
-extern ubyte tile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
-extern ubyte tile_status_z[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
-
-extern ubyte cur_tile_status_c, cur_tile_status_z;
-
 
 void
 get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf);
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index ae8fd17cc6..6f61a3d816 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -283,21 +283,21 @@ do_depth_test(int x, int y, mask_t quadmask)
 
    zvals.v = eval_z((float) x, (float) y);
 
-   if (cur_tile_status_c == TILE_STATUS_CLEAR) {
+   if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
       /* now, _really_ clear the tile */
-      clear_z_tile(&ztile);
-      cur_tile_status_z = TILE_STATUS_DIRTY;
+      clear_z_tile(&spu.ztile);
+      spu.cur_ztile_status = TILE_STATUS_DIRTY;
    }
 
    if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) {
       int ix = (x - setup.cliprect_minx) / 4;
       int iy = (y - setup.cliprect_miny) / 2;
-      mask = spu_z16_test_less(zvals.v, &ztile.us8[iy][ix], x>>1, quadmask);
+      mask = spu_z16_test_less(zvals.v, &spu.ztile.us8[iy][ix], x>>1, quadmask);
    }
    else {
       int ix = (x - setup.cliprect_minx) / 2;
       int iy = (y - setup.cliprect_miny) / 2;
-      mask = spu_z32_test_less(zvals.v, &ztile.ui4[iy][ix], quadmask);
+      mask = spu_z32_test_less(zvals.v, &spu.ztile.ui4[iy][ix], quadmask);
    }
    return mask;
 }
@@ -341,25 +341,25 @@ emit_quad( int x, int y, mask_t mask )
          pack_colors(colors, fcolors);
       }
 
-      if (cur_tile_status_c == TILE_STATUS_CLEAR) {
+      if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
          /* now, _really_ clear the tile */
-         clear_c_tile(&ctile);
+         clear_c_tile(&spu.ctile);
       }
-      cur_tile_status_c = TILE_STATUS_DIRTY;
+      spu.cur_ctile_status = TILE_STATUS_DIRTY;
 
 #if 1
       if (spu_extract(mask, 0))
-         ctile.ui[iy][ix] = colors[QUAD_TOP_LEFT];
+         spu.ctile.ui[iy][ix] = colors[QUAD_TOP_LEFT];
       if (spu_extract(mask, 1))
-         ctile.ui[iy][ix+1] = colors[QUAD_TOP_RIGHT];
+         spu.ctile.ui[iy][ix+1] = colors[QUAD_TOP_RIGHT];
       if (spu_extract(mask, 2))
-         ctile.ui[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
+         spu.ctile.ui[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
       if (spu_extract(mask, 3))
-         ctile.ui[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
+         spu.ctile.ui[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
 #else
       /* SIMD_Z with swizzled color buffer (someday) */
       vector unsigned int uicolors = *((vector unsigned int *) &colors);
-      ctile.ui4[iy/2][ix/2] = spu_sel(ctile.ui4[iy/2][ix/2], uicolors, mask);
+      spu.ctile.ui4[iy/2][ix/2] = spu_sel(spu.ctile.ui4[iy/2][ix/2], uicolors, mask);
 #endif
    }
 
@@ -846,21 +846,21 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
 
    /*   init_constant_attribs( setup ); */
       
-   if (cur_tile_status_c == TILE_STATUS_GETTING) {
+   if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
       /* wait for mfc_get() to complete */
       wait_on_mask(1 << TAG_READ_TILE_COLOR);
-      cur_tile_status_c = TILE_STATUS_CLEAN;
+      spu.cur_ctile_status = TILE_STATUS_CLEAN;
    }
 
-   ASSERT(cur_tile_status_c != TILE_STATUS_DEFINED);
+   ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
 
    if (spu.depth_stencil.depth.enabled) {
-      if (cur_tile_status_z == TILE_STATUS_GETTING) {
+      if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
          /* wait for mfc_get() to complete */
          wait_on_mask(1 << TAG_READ_TILE_Z);
-         cur_tile_status_z = TILE_STATUS_CLEAN;
+         spu.cur_ztile_status = TILE_STATUS_CLEAN;
       }
-   ASSERT(cur_tile_status_z != TILE_STATUS_DEFINED);
+      ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
    }
 
 
-- 
cgit v1.2.3


From d7c2eb0df47bd79291172727539b99331a3c6724 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 15:45:02 -0700
Subject: Cell: New color packing functions (A8R8G8B8 and B8G8R8A8)

---
 src/mesa/pipe/cell/spu/spu_colorpack.h | 60 ++++++++++++++++++++++++++++++++++
 src/mesa/pipe/cell/spu/spu_tri.c       | 22 +++++--------
 2 files changed, 69 insertions(+), 13 deletions(-)
 create mode 100644 src/mesa/pipe/cell/spu/spu_colorpack.h

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_colorpack.h b/src/mesa/pipe/cell/spu/spu_colorpack.h
new file mode 100644
index 0000000000..56709bd9f3
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_colorpack.h
@@ -0,0 +1,60 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+
+#ifndef SPU_COLORPACK_H
+#define SPU_COLORPACK_H
+
+
+#include <vec_literal.h>
+#include <spu_intrinsics.h>
+
+
+static INLINE unsigned int
+spu_pack_A8R8G8B8(vector float rgba)
+{
+  vector unsigned int out = spu_convtu(rgba, 32);
+  out = spu_shuffle(out, out, VEC_LITERAL(vector unsigned char,
+					  12, 0, 4, 8, 0, 0, 0, 0, 
+                                          0, 0, 0, 0, 0, 0, 0, 0));
+  return spu_extract(out, 0);
+}
+
+
+static INLINE unsigned int
+spu_pack_B8G8R8A8(vector float rgba)
+{
+  vector unsigned int out = spu_convtu(rgba, 32);
+  out = spu_shuffle(out, out, VEC_LITERAL(vector unsigned char,
+					  8, 4, 0, 12, 0, 0, 0, 0, 
+                                          0, 0, 0, 0, 0, 0, 0, 0));
+  return spu_extract(out, 0);
+}
+
+
+#endif /* SPU_COLORPACK_H */
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 6f61a3d816..c82ca51000 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -29,11 +29,10 @@
  * Triangle rendering within a tile.
  */
 
-#include <pack_rgba8.h>
-
 #include "pipe/p_compiler.h"
 #include "pipe/p_format.h"
 #include "pipe/p_util.h"
+#include "spu_colorpack.h"
 #include "spu_main.h"
 #include "spu_texture.h"
 #include "spu_tile.h"
@@ -253,21 +252,18 @@ eval_z(float x, float y)
 static INLINE void
 pack_colors(uint uicolors[4], const float4 fcolors[4])
 {
-   /* XXX grab the code for _pack_rgba8() and use the shuffle
-    * command to do the swizzling seen here.
-    */
    switch (spu.fb.color_format) {
    case PIPE_FORMAT_A8R8G8B8_UNORM:
-      uicolors[0] = _pack_rgba8(fcolors[0].f[3], fcolors[0].f[0], fcolors[0].f[1], fcolors[0].f[2]);
-      uicolors[1] = _pack_rgba8(fcolors[1].f[3], fcolors[1].f[0], fcolors[1].f[1], fcolors[1].f[2]);
-      uicolors[2] = _pack_rgba8(fcolors[2].f[3], fcolors[2].f[0], fcolors[2].f[1], fcolors[2].f[2]);
-      uicolors[3] = _pack_rgba8(fcolors[3].f[3], fcolors[0].f[0], fcolors[3].f[1], fcolors[3].f[2]);
+      uicolors[0] = spu_pack_A8R8G8B8(fcolors[0].v);
+      uicolors[1] = spu_pack_A8R8G8B8(fcolors[1].v);
+      uicolors[2] = spu_pack_A8R8G8B8(fcolors[2].v);
+      uicolors[3] = spu_pack_A8R8G8B8(fcolors[3].v);
       break;
    case PIPE_FORMAT_B8G8R8A8_UNORM:
-      uicolors[0] = _pack_rgba8(fcolors[0].f[2], fcolors[0].f[1], fcolors[0].f[0], fcolors[0].f[3]);
-      uicolors[1] = _pack_rgba8(fcolors[1].f[2], fcolors[1].f[1], fcolors[1].f[0], fcolors[1].f[3]);
-      uicolors[2] = _pack_rgba8(fcolors[2].f[2], fcolors[2].f[1], fcolors[2].f[0], fcolors[2].f[3]);
-      uicolors[3] = _pack_rgba8(fcolors[3].f[2], fcolors[3].f[1], fcolors[3].f[0], fcolors[3].f[3]);
+      uicolors[0] = spu_pack_B8G8R8A8(fcolors[0].v);
+      uicolors[1] = spu_pack_B8G8R8A8(fcolors[1].v);
+      uicolors[2] = spu_pack_B8G8R8A8(fcolors[2].v);
+      uicolors[3] = spu_pack_B8G8R8A8(fcolors[3].v);
       break;
    default:
       ASSERT(0);
-- 
cgit v1.2.3


From 8bd566a9cb8bb01ef5ce9c526047bafc0fbf0aef Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 16:25:42 -0700
Subject: Cell: use global color_shuffle to remove a switch statement

---
 src/mesa/pipe/cell/spu/Makefile        |  2 +
 src/mesa/pipe/cell/spu/spu_colorpack.h |  9 ++++
 src/mesa/pipe/cell/spu/spu_main.c      | 12 ++++++
 src/mesa/pipe/cell/spu/spu_main.h      |  3 ++
 src/mesa/pipe/cell/spu/spu_tri.c       | 76 ++++++++++++++--------------------
 5 files changed, 56 insertions(+), 46 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/Makefile b/src/mesa/pipe/cell/spu/Makefile
index 2d031bfbc6..91a631b699 100644
--- a/src/mesa/pipe/cell/spu/Makefile
+++ b/src/mesa/pipe/cell/spu/Makefile
@@ -8,6 +8,8 @@ TOP = ../../../../..
 include $(TOP)/configs/linux-cell
 
 
+OPT_FLAGS=-g
+OPT_FLAGS=-O3
 PROG = g3d
 
 PROG_SPU = $(PROG)_spu
diff --git a/src/mesa/pipe/cell/spu/spu_colorpack.h b/src/mesa/pipe/cell/spu/spu_colorpack.h
index 56709bd9f3..9977a6ece0 100644
--- a/src/mesa/pipe/cell/spu/spu_colorpack.h
+++ b/src/mesa/pipe/cell/spu/spu_colorpack.h
@@ -57,4 +57,13 @@ spu_pack_B8G8R8A8(vector float rgba)
 }
 
 
+static INLINE unsigned int
+spu_pack_color_shuffle(vector float rgba, vector unsigned char shuffle)
+{
+  vector unsigned int out = spu_convtu(rgba, 32);
+  out = spu_shuffle(out, out, shuffle);
+  return spu_extract(out, 0);
+}
+
+
 #endif /* SPU_COLORPACK_H */
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 8e3987f6ef..ba4d180cc0 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -31,6 +31,7 @@
 
 #include <stdio.h>
 #include <libmisc.h>
+#include <vec_literal.h>
 
 #include "spu_main.h"
 #include "spu_render.h"
@@ -217,6 +218,17 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
       spu.fb.zsize = 2;
    else
       spu.fb.zsize = 0;
+
+   if (spu.fb.color_format == PIPE_FORMAT_A8R8G8B8_UNORM)
+      spu.color_shuffle = VEC_LITERAL(vector unsigned char,
+                                      12, 0, 4, 8, 0, 0, 0, 0, 
+                                      0, 0, 0, 0, 0, 0, 0, 0);
+   else if (spu.fb.color_format == PIPE_FORMAT_B8G8R8A8_UNORM)
+      spu.color_shuffle = VEC_LITERAL(vector unsigned char,
+                                      8, 4, 0, 12, 0, 0, 0, 0, 
+                                      0, 0, 0, 0, 0, 0, 0, 0);
+   else
+      ASSERT(0);
 }
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index cce5e70802..7a12715b0b 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -107,6 +107,9 @@ struct spu_global
    ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
    ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
+
+   /** for converting RGBA to PIPE_FORMAT_x colors */
+   vector unsigned char color_shuffle;
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index c82ca51000..165e41a781 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -249,28 +249,6 @@ eval_z(float x, float y)
 }
 
 
-static INLINE void
-pack_colors(uint uicolors[4], const float4 fcolors[4])
-{
-   switch (spu.fb.color_format) {
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
-      uicolors[0] = spu_pack_A8R8G8B8(fcolors[0].v);
-      uicolors[1] = spu_pack_A8R8G8B8(fcolors[1].v);
-      uicolors[2] = spu_pack_A8R8G8B8(fcolors[2].v);
-      uicolors[3] = spu_pack_A8R8G8B8(fcolors[3].v);
-      break;
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
-      uicolors[0] = spu_pack_B8G8R8A8(fcolors[0].v);
-      uicolors[1] = spu_pack_B8G8R8A8(fcolors[1].v);
-      uicolors[2] = spu_pack_B8G8R8A8(fcolors[2].v);
-      uicolors[3] = spu_pack_B8G8R8A8(fcolors[3].v);
-      break;
-   default:
-      ASSERT(0);
-   }
-}
-
-
 static INLINE mask_t
 do_depth_test(int x, int y, mask_t quadmask)
 {
@@ -321,38 +299,44 @@ emit_quad( int x, int y, mask_t mask )
    if (spu_extract(spu_orx(mask), 0)) {
       const int ix = x - setup.cliprect_minx;
       const int iy = y - setup.cliprect_miny;
-      uint colors[4];  /* indexed by QUAD_x */
+
+      if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
+         /* now, _really_ clear the tile */
+         clear_c_tile(&spu.ctile);
+      }
+      spu.cur_ctile_status = TILE_STATUS_DIRTY;
 
       if (spu.texture.start) {
+         /* texture mapping */
          float4 texcoords[4];
-         uint i;
          eval_coeff(2, (float) x, (float) y, texcoords);
-         for (i = 0; i < 4; i++) {
-            colors[i] = sample_texture(texcoords[i]);
-         }
+
+         if (spu_extract(mask, 0))
+            spu.ctile.ui[iy][ix] = sample_texture(texcoords[0]);
+         if (spu_extract(mask, 1))
+            spu.ctile.ui[iy][ix+1] = sample_texture(texcoords[1]);
+         if (spu_extract(mask, 2))
+            spu.ctile.ui[iy+1][ix] = sample_texture(texcoords[2]);
+         if (spu_extract(mask, 3))
+            spu.ctile.ui[iy+1][ix+1] = sample_texture(texcoords[3]);
       }
       else {
-         float4 fcolors[4];
-         eval_coeff(1, (float) x, (float) y, fcolors);
-         pack_colors(colors, fcolors);
+         /* simple shading */
+         const vector unsigned char shuffle = spu.color_shuffle;
+         float4 colors[4];
+         eval_coeff(1, (float) x, (float) y, colors);
+
+         if (spu_extract(mask, 0))
+            spu.ctile.ui[iy][ix] = spu_pack_color_shuffle(colors[0].v, shuffle);
+         if (spu_extract(mask, 1))
+            spu.ctile.ui[iy][ix+1] = spu_pack_color_shuffle(colors[1].v, shuffle);
+         if (spu_extract(mask, 2))
+            spu.ctile.ui[iy+1][ix] = spu_pack_color_shuffle(colors[2].v, shuffle);
+         if (spu_extract(mask, 3))
+            spu.ctile.ui[iy+1][ix+1] = spu_pack_color_shuffle(colors[3].v, shuffle);
       }
 
-      if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
-         /* now, _really_ clear the tile */
-         clear_c_tile(&spu.ctile);
-      }
-      spu.cur_ctile_status = TILE_STATUS_DIRTY;
-
-#if 1
-      if (spu_extract(mask, 0))
-         spu.ctile.ui[iy][ix] = colors[QUAD_TOP_LEFT];
-      if (spu_extract(mask, 1))
-         spu.ctile.ui[iy][ix+1] = colors[QUAD_TOP_RIGHT];
-      if (spu_extract(mask, 2))
-         spu.ctile.ui[iy+1][ix] = colors[QUAD_BOTTOM_LEFT];
-      if (spu_extract(mask, 3))
-         spu.ctile.ui[iy+1][ix+1] = colors[QUAD_BOTTOM_RIGHT];
-#else
+#if 0
       /* SIMD_Z with swizzled color buffer (someday) */
       vector unsigned int uicolors = *((vector unsigned int *) &colors);
       spu.ctile.ui4[iy/2][ix/2] = spu_sel(spu.ctile.ui4[iy/2][ix/2], uicolors, mask);
-- 
cgit v1.2.3


From 44d32693562e2fb83572bd10e4d489a7cb6f74f3 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 16:42:09 -0700
Subject: Cell: move some tile get/clear code

Also, we weren't marking the ztile as dirty after Z testing; this fixes the gears glitches.
---
 src/mesa/pipe/cell/spu/spu_tri.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 165e41a781..f0758c42e7 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -257,12 +257,6 @@ do_depth_test(int x, int y, mask_t quadmask)
 
    zvals.v = eval_z((float) x, (float) y);
 
-   if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
-      /* now, _really_ clear the tile */
-      clear_z_tile(&spu.ztile);
-      spu.cur_ztile_status = TILE_STATUS_DIRTY;
-   }
-
    if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) {
       int ix = (x - setup.cliprect_minx) / 4;
       int iy = (y - setup.cliprect_miny) / 2;
@@ -273,6 +267,10 @@ do_depth_test(int x, int y, mask_t quadmask)
       int iy = (y - setup.cliprect_miny) / 2;
       mask = spu_z32_test_less(zvals.v, &spu.ztile.ui4[iy][ix], quadmask);
    }
+
+   if (spu_extract(spu_orx(mask), 0))
+      spu.cur_ztile_status = TILE_STATUS_DIRTY;
+
    return mask;
 }
 
@@ -300,10 +298,6 @@ emit_quad( int x, int y, mask_t mask )
       const int ix = x - setup.cliprect_minx;
       const int iy = y - setup.cliprect_miny;
 
-      if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
-         /* now, _really_ clear the tile */
-         clear_c_tile(&spu.ctile);
-      }
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
 
       if (spu.texture.start) {
@@ -408,6 +402,18 @@ static void flush_spans( void )
       return;
    }
 
+
+   /* _really_ clear tiles now if needed */
+   if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
+      clear_c_tile(&spu.ctile);
+      spu.cur_ctile_status = TILE_STATUS_DIRTY;
+   }
+   if (spu.depth_stencil.depth.enabled &&
+       spu.cur_ztile_status == TILE_STATUS_CLEAR) {
+      clear_z_tile(&spu.ztile);
+      spu.cur_ztile_status = TILE_STATUS_DIRTY;
+   }
+
    /* XXX this loop could be moved into the above switch cases and
     * calculate_mask() could be simplified a bit...
     */
@@ -831,7 +837,6 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
       wait_on_mask(1 << TAG_READ_TILE_COLOR);
       spu.cur_ctile_status = TILE_STATUS_CLEAN;
    }
-
    ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
 
    if (spu.depth_stencil.depth.enabled) {
-- 
cgit v1.2.3


From aa761b160520479efcf09d12ae4a161fc2f872f7 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 1 Feb 2008 16:54:46 -0700
Subject: Cell: comment about emit_quad() mask

---
 src/mesa/pipe/cell/spu/spu_tri.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index f0758c42e7..83bb247b22 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -277,6 +277,9 @@ do_depth_test(int x, int y, mask_t quadmask)
 
 /**
  * Emit a quad (pass to next stage).  No clipping is done.
+ * Note: about 1/5 to 1/7 of the time, mask is zero and this function
+ * should be skipped.  But adding the test for that slows things down
+ * overall.
  */
 static INLINE void
 emit_quad( int x, int y, mask_t mask )
-- 
cgit v1.2.3


From 69cc19751dd0122116cab03d808d5a1f5d0ade84 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 08:45:33 -0700
Subject: Cell: insert some draw_flush() calls

---
 src/mesa/pipe/cell/ppu/cell_state_blend.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/ppu/cell_state_blend.c b/src/mesa/pipe/cell/ppu/cell_state_blend.c
index 34ae0128ea..2c19aa3971 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_blend.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_blend.c
@@ -29,6 +29,7 @@
  */
 
 #include "pipe/p_util.h"
+#include "pipe/draw/draw_context.h"
 #include "cell_context.h"
 #include "cell_state.h"
 
@@ -49,6 +50,8 @@ cell_bind_blend_state(struct pipe_context *pipe, void *blend)
 {
    struct cell_context *cell = cell_context(pipe);
 
+   draw_flush(cell->draw);
+
    cell->blend = (const struct pipe_blend_state *)blend;
 
    cell->dirty |= CELL_NEW_BLEND;
@@ -68,6 +71,8 @@ cell_set_blend_color(struct pipe_context *pipe,
 {
    struct cell_context *cell = cell_context(pipe);
 
+   draw_flush(cell->draw);
+
    cell->blend_color = *blend_color;
 
    cell->dirty |= CELL_NEW_BLEND;
@@ -93,6 +98,8 @@ cell_bind_depth_stencil_alpha_state(struct pipe_context *pipe,
 {
    struct cell_context *cell = cell_context(pipe);
 
+   draw_flush(cell->draw);
+
    cell->depth_stencil
       = (const struct pipe_depth_stencil_alpha_state *) depth_stencil;
 
-- 
cgit v1.2.3


From 6023311c7ce336f727d7aa6d5266e88a55b88d36 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 08:46:44 -0700
Subject: Cell: clamp txmax, tymax in tile_bounding_box()

Also, added some debug printfs
---
 src/mesa/pipe/cell/spu/spu_render.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_render.c b/src/mesa/pipe/cell/spu/spu_render.c
index ab711d67fe..e8705eeeba 100644
--- a/src/mesa/pipe/cell/spu/spu_render.c
+++ b/src/mesa/pipe/cell/spu/spu_render.c
@@ -65,6 +65,10 @@ tile_bounding_box(const struct cell_command_render *render,
    *tymin = (uint) render->ymin / TILE_SIZE;
    txmax = (uint) render->xmax / TILE_SIZE;
    tymax = (uint) render->ymax / TILE_SIZE;
+   if (txmax >= spu.fb.width_tiles)
+      txmax = spu.fb.width_tiles-1;
+   if (tymax >= spu.fb.height_tiles)
+      tymax = spu.fb.height_tiles-1;
    *box_width_tiles = txmax - *txmin + 1;
    box_height_tiles = tymax - *tymin + 1;
    *box_num_tiles = *box_width_tiles * box_height_tiles;
@@ -96,12 +100,14 @@ get_cz_tiles(uint tx, uint ty)
 {
    if (spu.depth_stencil.depth.enabled) {
       if (spu.cur_ztile_status != TILE_STATUS_CLEAR) {
+         //printf("SPU %u: getting Z tile %u, %u\n", spu.init.id, tx, ty);
          get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1);
          spu.cur_ztile_status = TILE_STATUS_GETTING;
       }
    }
 
    if (spu.cur_ctile_status != TILE_STATUS_CLEAR) {
+      //printf("SPU %u: getting C tile %u, %u\n", spu.init.id, tx, ty);
       get_tile(tx, ty, &spu.ctile, TAG_READ_TILE_COLOR, 0);
       spu.cur_ctile_status = TILE_STATUS_GETTING;
    }
@@ -116,22 +122,26 @@ put_cz_tiles(uint tx, uint ty)
 {
    if (spu.cur_ztile_status == TILE_STATUS_DIRTY) {
       /* tile was modified and needs to be written back */
+      //printf("SPU %u: put dirty Z tile %u, %u\n", spu.init.id, tx, ty);
       put_tile(tx, ty, &spu.ztile, TAG_WRITE_TILE_Z, 1);
       spu.cur_ztile_status = TILE_STATUS_DEFINED;
    }
    else if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
       /* tile was never used */
       spu.cur_ztile_status = TILE_STATUS_DEFINED;
+      //printf("SPU %u: put getting Z tile %u, %u\n", spu.init.id, tx, ty);
    }
 
    if (spu.cur_ctile_status == TILE_STATUS_DIRTY) {
       /* tile was modified and needs to be written back */
+      //printf("SPU %u: put dirty C tile %u, %u\n", spu.init.id, tx, ty);
       put_tile(tx, ty, &spu.ctile, TAG_WRITE_TILE_COLOR, 0);
       spu.cur_ctile_status = TILE_STATUS_DEFINED;
    }
    else if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
       /* tile was never used */
       spu.cur_ctile_status = TILE_STATUS_DEFINED;
+      //printf("SPU %u: put getting C tile %u, %u\n", spu.init.id, tx, ty);
    }
 }
 
-- 
cgit v1.2.3


From e967a5c746f340a76b27181b4ead1035101cece3 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 08:53:18 -0700
Subject: Cell: move tile clear code to flush_spans()

---
 src/mesa/pipe/cell/spu/spu_tri.c | 51 ++++++++++++++++++++++------------------
 1 file changed, 28 insertions(+), 23 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 83bb247b22..3f46e75d7c 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -406,22 +406,44 @@ static void flush_spans( void )
    }
 
 
-   /* _really_ clear tiles now if needed */
-   if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
+   /* OK, we're very likely to need the tile data now.
+    * clear or finish waiting if needed.
+    */
+   if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
+      /* wait for mfc_get() to complete */
+      //printf("SPU: %u: waiting for ctile\n", spu.init.id);
+      wait_on_mask(1 << TAG_READ_TILE_COLOR);
+      spu.cur_ctile_status = TILE_STATUS_CLEAN;
+   }
+   else if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
+      //printf("SPU %u: clearing C tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
       clear_c_tile(&spu.ctile);
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
    }
-   if (spu.depth_stencil.depth.enabled &&
-       spu.cur_ztile_status == TILE_STATUS_CLEAR) {
-      clear_z_tile(&spu.ztile);
-      spu.cur_ztile_status = TILE_STATUS_DIRTY;
+   ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
+
+   if (spu.depth_stencil.depth.enabled) {
+      if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
+         /* wait for mfc_get() to complete */
+         //printf("SPU: %u: waiting for ztile\n", spu.init.id);
+         wait_on_mask(1 << TAG_READ_TILE_Z);
+         spu.cur_ztile_status = TILE_STATUS_CLEAN;
+      }
+      else if (spu.cur_ztile_status == TILE_STATUS_CLEAR) {
+         //printf("SPU %u: clearing Z tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
+         clear_z_tile(&spu.ztile);
+         spu.cur_ztile_status = TILE_STATUS_DIRTY;
+      }
+      ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
    }
 
    /* XXX this loop could be moved into the above switch cases and
     * calculate_mask() could be simplified a bit...
     */
    for (x = block(minleft); x <= block(maxright); x += 2) {
+#if 1
       emit_quad( x, setup.span.y, calculate_mask( x ) );
+#endif
    }
 
    setup.span.y = 0;
@@ -835,23 +857,6 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
 
    /*   init_constant_attribs( setup ); */
       
-   if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
-      /* wait for mfc_get() to complete */
-      wait_on_mask(1 << TAG_READ_TILE_COLOR);
-      spu.cur_ctile_status = TILE_STATUS_CLEAN;
-   }
-   ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
-
-   if (spu.depth_stencil.depth.enabled) {
-      if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
-         /* wait for mfc_get() to complete */
-         wait_on_mask(1 << TAG_READ_TILE_Z);
-         spu.cur_ztile_status = TILE_STATUS_CLEAN;
-      }
-      ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
-   }
-
-
    if (setup.oneoverarea < 0.0) {
       /* emaj on left:
        */
-- 
cgit v1.2.3


From 18105195a86b8294b578462febf47692832e8705 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 09:54:21 -0700
Subject: Cell: checkpoint: start to SIMD-ize texture sampling

---
 src/mesa/pipe/cell/spu/spu_main.c    | 10 ++++++++++
 src/mesa/pipe/cell/spu/spu_main.h    |  4 ++++
 src/mesa/pipe/cell/spu/spu_texture.c | 17 ++++++++++++++---
 src/mesa/pipe/cell/spu/spu_texture.h |  2 +-
 src/mesa/pipe/cell/spu/spu_tri.c     |  8 ++++----
 5 files changed, 33 insertions(+), 8 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index ba4d180cc0..412661061a 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -263,6 +263,16 @@ cmd_state_texture(const struct cell_command_texture *texture)
              spu.init.id, texture->start, texture->width, texture->height);
 
    memcpy(&spu.texture, texture, sizeof(*texture));
+   spu.tex_size = VEC_LITERAL(vector float,
+                              spu.texture.width,
+                              spu.texture.height,
+                              0.0,
+                              0.0);
+   spu.tex_size_mask = VEC_LITERAL(vector unsigned int,
+                                   spu.texture.width - 1,
+                                   spu.texture.height - 1,
+                                   0,
+                                   0);
 }
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index 7a12715b0b..02b62ee5cd 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -110,6 +110,10 @@ struct spu_global
 
    /** for converting RGBA to PIPE_FORMAT_x colors */
    vector unsigned char color_shuffle;
+
+   vector float tex_size;
+   vector unsigned int tex_size_mask; /**< == int(size - 1) */
+
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
index c1dc6bfe90..1cf958806f 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.c
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -128,12 +128,23 @@ get_tex_tile(uint i, uint j)
  * XXX this is extremely primitive for now.
  */
 uint
-sample_texture(float4 texcoord)
+sample_texture(vector float texcoord)
 {
+#if 0
    /* wrap/repeat */
-   uint i = (uint) (texcoord.f[0] * spu.texture.width) % spu.texture.width;
-   uint j = (uint) (texcoord.f[1] * spu.texture.height) % spu.texture.height;
+   uint i = (uint) (spu_extract(texcoord, 0) * spu.texture.width) % spu.texture.width;
+   uint j = (uint) (spu_extract(texcoord, 1) * spu.texture.height) % spu.texture.height;
    uint pos = get_tex_tile(i, j);
    uint texel = tex_tiles[pos].ui[j % TILE_SIZE][i % TILE_SIZE];
    return texel;
+#else
+   vector float tc = spu_mul(texcoord, spu.tex_size);
+   vector unsigned int itc = spu_convtu(tc, 0);
+   itc = spu_and(itc, spu.tex_size_mask);
+   uint i = spu_extract(itc, 0);
+   uint j = spu_extract(itc, 1);
+   uint pos = get_tex_tile(i, j);
+   uint texel = tex_tiles[pos].ui[j % TILE_SIZE][i % TILE_SIZE];
+   return texel;
+#endif
 }
diff --git a/src/mesa/pipe/cell/spu/spu_texture.h b/src/mesa/pipe/cell/spu/spu_texture.h
index 938a42b549..5bc8e71879 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.h
+++ b/src/mesa/pipe/cell/spu/spu_texture.h
@@ -37,7 +37,7 @@ invalidate_tex_cache(void);
 
 
 extern uint
-sample_texture(float4 texcoord);
+sample_texture(vector float texcoord);
 
 
 #endif /* SPU_TEXTURE_H */
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 3f46e75d7c..c148c75dd6 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -309,13 +309,13 @@ emit_quad( int x, int y, mask_t mask )
          eval_coeff(2, (float) x, (float) y, texcoords);
 
          if (spu_extract(mask, 0))
-            spu.ctile.ui[iy][ix] = sample_texture(texcoords[0]);
+            spu.ctile.ui[iy][ix] = sample_texture(texcoords[0].v);
          if (spu_extract(mask, 1))
-            spu.ctile.ui[iy][ix+1] = sample_texture(texcoords[1]);
+            spu.ctile.ui[iy][ix+1] = sample_texture(texcoords[1].v);
          if (spu_extract(mask, 2))
-            spu.ctile.ui[iy+1][ix] = sample_texture(texcoords[2]);
+            spu.ctile.ui[iy+1][ix] = sample_texture(texcoords[2].v);
          if (spu_extract(mask, 3))
-            spu.ctile.ui[iy+1][ix+1] = sample_texture(texcoords[3]);
+            spu.ctile.ui[iy+1][ix+1] = sample_texture(texcoords[3].v);
       }
       else {
          /* simple shading */
-- 
cgit v1.2.3


From 703a8691553386242bf3d6662c314fc35b617194 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 11:02:47 -0700
Subject: Cell: SIMD-ize more of texture sampling

---
 src/mesa/pipe/cell/spu/spu_texture.c | 66 ++++++++++++++++--------------------
 1 file changed, 29 insertions(+), 37 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
index 1cf958806f..b52df970d0 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.c
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -26,6 +26,8 @@
  **************************************************************************/
 
 
+#include <vec_literal.h>
+
 #include "pipe/p_compiler.h"
 #include "spu_main.h"
 #include "spu_texture.h"
@@ -41,7 +43,7 @@
 
 static tile_t tex_tiles[CACHE_SIZE]  ALIGN16_ATTRIB;
 
-static int tex_tile_x[CACHE_SIZE], tex_tile_y[CACHE_SIZE];
+static vector unsigned int tex_tile_xy[CACHE_SIZE];
 
 
@@ -53,20 +55,19 @@ invalidate_tex_cache(void)
 {
    /* XXX memset? */
    uint i;
-   for (i = 0; i < CACHE_SIZE; i++)
-      tex_tile_x[i] = tex_tile_y[i] = -1;
+   for (i = 0; i < CACHE_SIZE; i++) {
+      tex_tile_xy[i] = VEC_LITERAL(vector unsigned int, ~0U, ~0U, ~0U, ~0U);
+   }
 }
 
 
 /**
- * Return the cache pos/index which corresponds to texel (i,j)
+ * Return the cache pos/index which corresponds to tile (tx,ty)
  */
 static INLINE uint
-cache_pos(uint i, uint j)
+cache_pos(vector unsigned int txty)
 {
-   uint tx = i / TILE_SIZE;
-   uint ty = j / TILE_SIZE;
-   uint pos = (tx + ty * 4) % CACHE_SIZE;
+   uint pos = (spu_extract(txty,0) + spu_extract(txty,1) * 4) % CACHE_SIZE;
    return pos;
 }
 
@@ -76,26 +77,28 @@ cache_pos(uint i, uint j)
  * in the cache.
  */
 static uint
-get_tex_tile(uint i, uint j)
+get_tex_tile(vector unsigned int ij)
 {
-   const int tx = i / TILE_SIZE;
-   const int ty = j / TILE_SIZE;
-   const uint pos = cache_pos(i, j);
+   /* tile address: tx,ty */
+   const vector unsigned int txty = spu_rlmask(ij, -5);  /* divide by 32 */
+   const uint pos = cache_pos(txty);
+
+   if ((spu_extract(tex_tile_xy[pos], 0) != spu_extract(txty, 0)) ||
+       (spu_extract(tex_tile_xy[pos], 1) != spu_extract(txty, 1))) {
 
-   if (tex_tile_x[pos] != tx || tex_tile_y[pos] != ty) {
       /* texture cache miss, fetch tile from main memory */
       const uint tiles_per_row = spu.texture.width / TILE_SIZE;
       const uint bytes_per_tile = sizeof(tile_t);
       const void *src = (const ubyte *) spu.texture.start
-         + (ty * tiles_per_row + tx) * bytes_per_tile;
+         + (spu_extract(txty,1) * tiles_per_row + spu_extract(txty,0)) * bytes_per_tile;
 
       printf("SPU %u: tex cache miss at %d, %d  pos=%u  old=%d,%d\n",
-             spu.init.id, tx, ty, pos,
-             tex_tile_x[pos], tex_tile_y[pos]);
-#if 0
-      printf("SPU %u: get tex tile from %p to %p\n",
-             spu.init.id, src, tex_tiles[pos].t32);
-#endif
+             spu.init.id,
+             spu_extract(txty,0),
+             spu_extract(txty,1),
+             pos,
+             spu_extract(tex_tile_xy[pos],0),
+             spu_extract(tex_tile_xy[pos],1));
 
       ASSERT_ALIGN16(tex_tiles[pos].ui);
       ASSERT_ALIGN16(src);
@@ -109,8 +112,7 @@ get_tex_tile(uint i, uint j)
 
       wait_on_mask(1 << TAG_TEXTURE_TILE);
 
-      tex_tile_x[pos] = tx;
-      tex_tile_y[pos] = ty;
+      tex_tile_xy[pos] = txty;
    }
    else {
 #if 0
@@ -130,21 +132,11 @@ get_tex_tile(uint i, uint j)
 uint
 sample_texture(vector float texcoord)
 {
-#if 0
-   /* wrap/repeat */
-   uint i = (uint) (spu_extract(texcoord, 0) * spu.texture.width) % spu.texture.width;
-   uint j = (uint) (spu_extract(texcoord, 1) * spu.texture.height) % spu.texture.height;
-   uint pos = get_tex_tile(i, j);
-   uint texel = tex_tiles[pos].ui[j % TILE_SIZE][i % TILE_SIZE];
-   return texel;
-#else
    vector float tc = spu_mul(texcoord, spu.tex_size);
-   vector unsigned int itc = spu_convtu(tc, 0);
-   itc = spu_and(itc, spu.tex_size_mask);
-   uint i = spu_extract(itc, 0);
-   uint j = spu_extract(itc, 1);
-   uint pos = get_tex_tile(i, j);
-   uint texel = tex_tiles[pos].ui[j % TILE_SIZE][i % TILE_SIZE];
+   vector unsigned int itc = spu_convtu(tc, 0);  /* convert to int */
+   itc = spu_and(itc, spu.tex_size_mask);        /* mask (GL_REPEAT) */
+   vector unsigned int ij = spu_and(itc, TILE_SIZE-1); /* intra tile addr */
+   uint pos = get_tex_tile(itc);
+   uint texel = tex_tiles[pos].ui[spu_extract(ij, 1)][spu_extract(ij, 0)];
    return texel;
-#endif
 }
-- 
cgit v1.2.3


From 9a5074217fd3be8feff2be597bb124a2a3637d0a Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 12:50:16 -0700
Subject: Cell: added spu_unpack_color(), spu_pack_R8G8B8A8()

---
 src/mesa/pipe/cell/spu/spu_colorpack.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_colorpack.h b/src/mesa/pipe/cell/spu/spu_colorpack.h
index 9977a6ece0..0c93c06562 100644
--- a/src/mesa/pipe/cell/spu/spu_colorpack.h
+++ b/src/mesa/pipe/cell/spu/spu_colorpack.h
@@ -35,6 +35,17 @@
 #include <spu_intrinsics.h>
 
 
+static INLINE unsigned int
+spu_pack_R8G8B8A8(vector float rgba)
+{
+  vector unsigned int out = spu_convtu(rgba, 32);
+  out = spu_shuffle(out, out, VEC_LITERAL(vector unsigned char,
+					  0, 4, 8, 12, 0, 0, 0, 0, 
+                                          0, 0, 0, 0, 0, 0, 0, 0));
+  return spu_extract(out, 0);
+}
+
+
 static INLINE unsigned int
 spu_pack_A8R8G8B8(vector float rgba)
 {
@@ -66,4 +77,18 @@ spu_pack_color_shuffle(vector float rgba, vector unsigned char shuffle)
 }
 
 
+static INLINE vector float
+spu_unpack_color(uint color)
+{
+   vector unsigned int color_u4 = spu_splats(color);
+   color_u4 = spu_shuffle(color_u4, color_u4,
+                          VEC_LITERAL(vector unsigned char,
+                                      0, 0, 0, 0,
+                                      5, 5, 5, 5,
+                                      10, 10, 10, 10,
+                                      15, 15, 15, 15));
+   return spu_convtf(color_u4, 32);
+}
+
+
 #endif /* SPU_COLORPACK_H */
-- 
cgit v1.2.3


From 0a45f7594870cb7296100fb5f5d5dc82888a467d Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 12:50:42 -0700
Subject: Cell: implement basic bilinear texture sampler

---
 src/mesa/pipe/cell/spu/spu_texture.c | 67 ++++++++++++++++++++++++++++++++++++
 src/mesa/pipe/cell/spu/spu_texture.h |  4 +++
 2 files changed, 71 insertions(+)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
index b52df970d0..26a5eefc48 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.c
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -32,6 +32,7 @@
 #include "spu_main.h"
 #include "spu_texture.h"
 #include "spu_tile.h"
+#include "spu_colorpack.h"
 
 
 /**
@@ -140,3 +141,69 @@ sample_texture(vector float texcoord)
    uint texel = tex_tiles[pos].ui[spu_extract(ij, 1)][spu_extract(ij, 0)];
    return texel;
 }
+
+
+uint
+sample_texture_bilinear(vector float texcoord)
+{
+   static const vector unsigned int offset10 = {1, 0, 0, 0};
+   static const vector unsigned int offset01 = {0, 1, 0, 0};
+
+   vector float tc = spu_mul(texcoord, spu.tex_size);
+   /* itcST */
+   vector unsigned int itc00 = spu_convtu(tc, 0);  /* convert to int */
+   vector unsigned int itc01 = spu_add(itc00, offset01);
+   vector unsigned int itc10 = spu_add(itc00, offset10);
+   vector unsigned int itc11 = spu_add(itc10, offset01);
+
+   itc00 = spu_and(itc00, spu.tex_size_mask);        /* mask (GL_REPEAT) */
+   itc01 = spu_and(itc01, spu.tex_size_mask);        /* mask (GL_REPEAT) */
+   itc10 = spu_and(itc10, spu.tex_size_mask);        /* mask (GL_REPEAT) */
+   itc11 = spu_and(itc11, spu.tex_size_mask);        /* mask (GL_REPEAT) */
+
+   /* intra tile addr */
+   vector unsigned int ij00 = spu_and(itc00, TILE_SIZE-1);
+   vector unsigned int ij01 = spu_and(itc01, TILE_SIZE-1);
+   vector unsigned int ij10 = spu_and(itc10, TILE_SIZE-1);
+   vector unsigned int ij11 = spu_and(itc11, TILE_SIZE-1);
+
+   uint pos00 = get_tex_tile(itc00);
+   uint pos01 = get_tex_tile(itc01);
+   uint pos10 = get_tex_tile(itc10);
+   uint pos11 = get_tex_tile(itc11);
+
+   vector float texel00 = spu_unpack_color(tex_tiles[pos00].ui[spu_extract(ij00, 1)][spu_extract(ij00, 0)]);
+   vector float texel01 = spu_unpack_color(tex_tiles[pos01].ui[spu_extract(ij01, 1)][spu_extract(ij01, 0)]);
+   vector float texel10 = spu_unpack_color(tex_tiles[pos10].ui[spu_extract(ij10, 1)][spu_extract(ij10, 0)]);
+   vector float texel11 = spu_unpack_color(tex_tiles[pos11].ui[spu_extract(ij11, 1)][spu_extract(ij11, 0)]);
+
+   /* Compute weighting factors in [0,1]
+    * Multiply texcoord by 1024, AND with 1023, convert back to float.
+    */
+   vector float tc1024 = spu_mul(tc, spu_splats(1024.0f));
+   vector signed int itc1024 = spu_convts(tc1024, 0);
+   itc1024 = spu_and(itc1024, spu_splats((1 << 10) - 1));
+   vector float weight = spu_convtf(itc1024, 10);
+
+   /* smeared frac and 1-frac */
+   vector float sfrac = spu_splats(spu_extract(weight, 0));
+   vector float tfrac = spu_splats(spu_extract(weight, 1));
+   vector float sfrac1 = spu_sub(spu_splats(1.0f), sfrac);
+   vector float tfrac1 = spu_sub(spu_splats(1.0f), tfrac);
+
+   /* multiply the samples (colors) by the S/T weights */
+   texel00 = spu_mul(spu_mul(texel00, sfrac1), tfrac1);
+   texel10 = spu_mul(spu_mul(texel10, sfrac ), tfrac1);
+   texel01 = spu_mul(spu_mul(texel01, sfrac1), tfrac );
+   texel11 = spu_mul(spu_mul(texel11, sfrac ), tfrac );
+
+   /* compute sum of weighted samples */
+   vector float texel_sum = spu_add(texel00, texel01);
+   texel_sum = spu_add(texel_sum, texel10);
+   texel_sum = spu_add(texel_sum, texel11);
+
+   /* convert to uint color */
+   uint texel = spu_pack_R8G8B8A8(texel_sum);
+
+   return texel;
+}
diff --git a/src/mesa/pipe/cell/spu/spu_texture.h b/src/mesa/pipe/cell/spu/spu_texture.h
index 5bc8e71879..25cbe9b3c6 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.h
+++ b/src/mesa/pipe/cell/spu/spu_texture.h
@@ -40,4 +40,8 @@ extern uint
 sample_texture(vector float texcoord);
 
 
+extern uint
+sample_texture_bilinear(vector float texcoord);
+
+
 #endif /* SPU_TEXTURE_H */
-- 
cgit v1.2.3


From ca1d2fc5f6fb138025f6848591e3494e4b881930 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 13:16:10 -0700
Subject: Cell: improved bilinear filtering

avoid calling get_tex_tile() if all texels are in same tile
---
 src/mesa/pipe/cell/spu/spu_texture.c | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
index 26a5eefc48..6e243f7fa3 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.c
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -150,16 +150,17 @@ sample_texture_bilinear(vector float texcoord)
    static const vector unsigned int offset01 = {0, 1, 0, 0};
 
    vector float tc = spu_mul(texcoord, spu.tex_size);
-   /* itcST */
+   /* integer texcoords S,T: */
    vector unsigned int itc00 = spu_convtu(tc, 0);  /* convert to int */
    vector unsigned int itc01 = spu_add(itc00, offset01);
    vector unsigned int itc10 = spu_add(itc00, offset10);
    vector unsigned int itc11 = spu_add(itc10, offset01);
 
-   itc00 = spu_and(itc00, spu.tex_size_mask);        /* mask (GL_REPEAT) */
-   itc01 = spu_and(itc01, spu.tex_size_mask);        /* mask (GL_REPEAT) */
-   itc10 = spu_and(itc10, spu.tex_size_mask);        /* mask (GL_REPEAT) */
-   itc11 = spu_and(itc11, spu.tex_size_mask);        /* mask (GL_REPEAT) */
+   /* mask (GL_REPEAT) */
+   itc00 = spu_and(itc00, spu.tex_size_mask);
+   itc01 = spu_and(itc01, spu.tex_size_mask);
+   itc10 = spu_and(itc10, spu.tex_size_mask);
+   itc11 = spu_and(itc11, spu.tex_size_mask);
 
    /* intra tile addr */
    vector unsigned int ij00 = spu_and(itc00, TILE_SIZE-1);
@@ -167,11 +168,21 @@ sample_texture_bilinear(vector float texcoord)
    vector unsigned int ij10 = spu_and(itc10, TILE_SIZE-1);
    vector unsigned int ij11 = spu_and(itc11, TILE_SIZE-1);
 
+   /* get tile cache positions */
    uint pos00 = get_tex_tile(itc00);
-   uint pos01 = get_tex_tile(itc01);
-   uint pos10 = get_tex_tile(itc10);
-   uint pos11 = get_tex_tile(itc11);
+   uint pos01, pos10, pos11;
+   if ((spu_extract(ij00, 0) < TILE_SIZE-1) &&
+       (spu_extract(ij00, 1) < TILE_SIZE-1)) {
+      /* all texels are in the same tile */
+      pos01 = pos10 = pos11 = pos00;
+   }
+   else {
+      pos01 = get_tex_tile(itc01);
+      pos10 = get_tex_tile(itc10);
+      pos11 = get_tex_tile(itc11);
+   }
 
+   /* get texels from tiles and convert to float[4] */
    vector float texel00 = spu_unpack_color(tex_tiles[pos00].ui[spu_extract(ij00, 1)][spu_extract(ij00, 0)]);
    vector float texel01 = spu_unpack_color(tex_tiles[pos01].ui[spu_extract(ij01, 1)][spu_extract(ij01, 0)]);
    vector float texel10 = spu_unpack_color(tex_tiles[pos10].ui[spu_extract(ij10, 1)][spu_extract(ij10, 0)]);
-- 
cgit v1.2.3


From 8f924e4df06a5d45dda338e7a0a87308e48df57e Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 13:23:07 -0700
Subject: Cell: choose bilinear vs. nearest filtering according to sampler
 state

---
 src/mesa/pipe/cell/spu/spu_main.c    | 4 ++++
 src/mesa/pipe/cell/spu/spu_main.h    | 2 ++
 src/mesa/pipe/cell/spu/spu_texture.c | 2 +-
 src/mesa/pipe/cell/spu/spu_texture.h | 2 +-
 src/mesa/pipe/cell/spu/spu_tri.c     | 8 ++++----
 5 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 412661061a..48e016fc8b 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -252,6 +252,10 @@ cmd_state_sampler(const struct pipe_sampler_state *state)
              spu.init.id);
 
    memcpy(&spu.sampler[0], state, sizeof(*state));
+   if (spu.sampler[0].min_img_filter == PIPE_TEX_FILTER_LINEAR)
+      spu.sample_texture = sample_texture_bilinear;
+   else
+      spu.sample_texture = sample_texture_nearest;
 }
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index 02b62ee5cd..fb98b0d889 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -114,6 +114,8 @@ struct spu_global
    vector float tex_size;
    vector unsigned int tex_size_mask; /**< == int(size - 1) */
 
+   uint (*sample_texture)(vector float texcoord);
+
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
index 6e243f7fa3..ecacf2ec88 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.c
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -131,7 +131,7 @@ get_tex_tile(vector unsigned int ij)
  * XXX this is extremely primitive for now.
  */
 uint
-sample_texture(vector float texcoord)
+sample_texture_nearest(vector float texcoord)
 {
    vector float tc = spu_mul(texcoord, spu.tex_size);
    vector unsigned int itc = spu_convtu(tc, 0);  /* convert to int */
diff --git a/src/mesa/pipe/cell/spu/spu_texture.h b/src/mesa/pipe/cell/spu/spu_texture.h
index 25cbe9b3c6..0e000bfebf 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.h
+++ b/src/mesa/pipe/cell/spu/spu_texture.h
@@ -37,7 +37,7 @@ invalidate_tex_cache(void);
 
 
 extern uint
-sample_texture(vector float texcoord);
+sample_texture_nearest(vector float texcoord);
 
 
 extern uint
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index c148c75dd6..7b422f71a8 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -309,13 +309,13 @@ emit_quad( int x, int y, mask_t mask )
          eval_coeff(2, (float) x, (float) y, texcoords);
 
          if (spu_extract(mask, 0))
-            spu.ctile.ui[iy][ix] = sample_texture(texcoords[0].v);
+            spu.ctile.ui[iy][ix] = spu.sample_texture(texcoords[0].v);
          if (spu_extract(mask, 1))
-            spu.ctile.ui[iy][ix+1] = sample_texture(texcoords[1].v);
+            spu.ctile.ui[iy][ix+1] = spu.sample_texture(texcoords[1].v);
          if (spu_extract(mask, 2))
-            spu.ctile.ui[iy+1][ix] = sample_texture(texcoords[2].v);
+            spu.ctile.ui[iy+1][ix] = spu.sample_texture(texcoords[2].v);
          if (spu_extract(mask, 3))
-            spu.ctile.ui[iy+1][ix+1] = sample_texture(texcoords[3].v);
+            spu.ctile.ui[iy+1][ix+1] = spu.sample_texture(texcoords[3].v);
       }
       else {
          /* simple shading */
-- 
cgit v1.2.3


From 7a1d01f2a0d8f0875a265e7d4e31e1348fd82677 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 15:06:10 -0700
Subject: Cell: emit blend state to SPUs

---
 src/mesa/pipe/cell/common.h              |  3 ++-
 src/mesa/pipe/cell/ppu/cell_state_emit.c |  6 ++++++
 src/mesa/pipe/cell/spu/spu_main.c        | 17 +++++++++++++++++
 src/mesa/pipe/cell/spu/spu_main.h        |  1 +
 4 files changed, 26 insertions(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index 7e193f31be..d861e82d33 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -85,7 +85,8 @@
 #define CELL_CMD_STATE_VERTEX_INFO   14
 #define CELL_CMD_STATE_VIEWPORT      15
 #define CELL_CMD_STATE_VS_ARRAY_INFO 16
-#define CELL_CMD_VS_EXECUTE          17
+#define CELL_CMD_STATE_BLEND         17
+#define CELL_CMD_VS_EXECUTE          18
 
 
 #define CELL_NUM_BUFFERS 4
diff --git a/src/mesa/pipe/cell/ppu/cell_state_emit.c b/src/mesa/pipe/cell/ppu/cell_state_emit.c
index 702184416b..3b2670f786 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_emit.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_emit.c
@@ -61,6 +61,12 @@ cell_emit_state(struct cell_context *cell)
       fb->height = cell->framebuffer.cbufs[0]->height;
    }
 
+   if (cell->dirty & CELL_NEW_BLEND) {
+      emit_state_cmd(cell, CELL_CMD_STATE_BLEND,
+                     cell->blend,
+                     sizeof(struct pipe_blend_state));
+   }
+
    if (cell->dirty & CELL_NEW_DEPTH_STENCIL) {
       emit_state_cmd(cell, CELL_CMD_STATE_DEPTH_STENCIL,
                      cell->depth_stencil,
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 48e016fc8b..9d8e6df0e3 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -232,6 +232,18 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 }
 
 
+static void
+cmd_state_blend(const struct pipe_blend_state *state)
+{
+   if (Debug)
+      printf("SPU %u: BLEND: ztest %d\n",
+             spu.init.id,
+             state->blend_enable);
+
+   memcpy(&spu.blend, state, sizeof(*state));
+}
+
+
 static void
 cmd_state_depth_stencil(const struct pipe_depth_stencil_alpha_state *state)
 {
@@ -398,6 +410,11 @@ cmd_batch(uint opcode)
          cmd_finish();
          pos += 1;
          break;
+      case CELL_CMD_STATE_BLEND:
+         cmd_state_blend((struct pipe_blend_state *)
+                                 &buffer[pos+1]);
+         pos += (1 + sizeof(struct pipe_blend_state) / 4);
+         break;
       case CELL_CMD_STATE_DEPTH_STENCIL:
          cmd_state_depth_stencil((struct pipe_depth_stencil_alpha_state *)
                                  &buffer[pos+1]);
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index fb98b0d889..b22d563551 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -86,6 +86,7 @@ struct spu_global
    struct cell_init_info init;
 
    struct spu_framebuffer fb;
+   struct pipe_blend_state blend_stencil;
    struct pipe_depth_stencil_alpha_state depth_stencil;
    struct pipe_blend_state blend;
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
-- 
cgit v1.2.3


From 168247d1caee28ef577ad4c3c4308451f1193062 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 15:10:35 -0700
Subject: Cell: replace float4 with vector float in eval_coeff()

---
 src/mesa/pipe/cell/spu/spu_tri.c | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 7b422f71a8..199afa1aa6 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -32,6 +32,7 @@
 #include "pipe/p_compiler.h"
 #include "pipe/p_format.h"
 #include "pipe/p_util.h"
+#include "spu_blend.h"
 #include "spu_colorpack.h"
 #include "spu_main.h"
 #include "spu_texture.h"
@@ -206,14 +207,14 @@ clip_emit_quad(struct setup_stage *setup)
  * Eg: four colors will be compute.
  */
 static INLINE void
-eval_coeff(uint slot, float x, float y, float4 result[4])
+eval_coeff(uint slot, float x, float y, vector float result[4])
 {
    switch (spu.vertex_info.interp_mode[slot]) {
    case INTERP_CONSTANT:
       result[QUAD_TOP_LEFT] =
       result[QUAD_TOP_RIGHT] =
       result[QUAD_BOTTOM_LEFT] =
-      result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0;
+      result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0.v;
       break;
 
    case INTERP_LINEAR:
@@ -227,10 +228,10 @@ eval_coeff(uint slot, float x, float y, float4 result[4])
                       spu_add(spu_mul(spu_splats(x), dadx),
                               spu_mul(spu_splats(y), dady)));
 
-         result[QUAD_TOP_LEFT].v = topLeft;
-         result[QUAD_TOP_RIGHT].v = spu_add(topLeft, dadx);
-         result[QUAD_BOTTOM_LEFT].v = spu_add(topLeft, dady);
-         result[QUAD_BOTTOM_RIGHT].v = spu_add(spu_add(topLeft, dadx), dady);
+         result[QUAD_TOP_LEFT] = topLeft;
+         result[QUAD_TOP_RIGHT] = spu_add(topLeft, dadx);
+         result[QUAD_BOTTOM_LEFT] = spu_add(topLeft, dady);
+         result[QUAD_BOTTOM_RIGHT] = spu_add(spu_add(topLeft, dadx), dady);
       }
    }
 }
@@ -305,32 +306,32 @@ emit_quad( int x, int y, mask_t mask )
 
       if (spu.texture.start) {
          /* texture mapping */
-         float4 texcoords[4];
+         vector float texcoords[4];
          eval_coeff(2, (float) x, (float) y, texcoords);
 
          if (spu_extract(mask, 0))
-            spu.ctile.ui[iy][ix] = spu.sample_texture(texcoords[0].v);
+            spu.ctile.ui[iy][ix] = spu.sample_texture(texcoords[0]);
          if (spu_extract(mask, 1))
-            spu.ctile.ui[iy][ix+1] = spu.sample_texture(texcoords[1].v);
+            spu.ctile.ui[iy][ix+1] = spu.sample_texture(texcoords[1]);
          if (spu_extract(mask, 2))
-            spu.ctile.ui[iy+1][ix] = spu.sample_texture(texcoords[2].v);
+            spu.ctile.ui[iy+1][ix] = spu.sample_texture(texcoords[2]);
          if (spu_extract(mask, 3))
-            spu.ctile.ui[iy+1][ix+1] = spu.sample_texture(texcoords[3].v);
+            spu.ctile.ui[iy+1][ix+1] = spu.sample_texture(texcoords[3]);
       }
       else {
          /* simple shading */
          const vector unsigned char shuffle = spu.color_shuffle;
-         float4 colors[4];
+         vector float colors[4];
          eval_coeff(1, (float) x, (float) y, colors);
 
          if (spu_extract(mask, 0))
-            spu.ctile.ui[iy][ix] = spu_pack_color_shuffle(colors[0].v, shuffle);
+            spu.ctile.ui[iy][ix] = spu_pack_color_shuffle(colors[0], shuffle);
          if (spu_extract(mask, 1))
-            spu.ctile.ui[iy][ix+1] = spu_pack_color_shuffle(colors[1].v, shuffle);
+            spu.ctile.ui[iy][ix+1] = spu_pack_color_shuffle(colors[1], shuffle);
          if (spu_extract(mask, 2))
-            spu.ctile.ui[iy+1][ix] = spu_pack_color_shuffle(colors[2].v, shuffle);
+            spu.ctile.ui[iy+1][ix] = spu_pack_color_shuffle(colors[2], shuffle);
          if (spu_extract(mask, 3))
-            spu.ctile.ui[iy+1][ix+1] = spu_pack_color_shuffle(colors[3].v, shuffle);
+            spu.ctile.ui[iy+1][ix+1] = spu_pack_color_shuffle(colors[3], shuffle);
       }
 
 #if 0
-- 
cgit v1.2.3


From bc1ad6bcbd5c63da9c10d0276c9d7535b6139437 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 15:17:50 -0700
Subject: Cell: some basic blending code

---
 src/mesa/pipe/cell/spu/Makefile    |  1 +
 src/mesa/pipe/cell/spu/spu_blend.c | 62 ++++++++++++++++++++++++++++++++++++++
 src/mesa/pipe/cell/spu/spu_blend.h | 37 +++++++++++++++++++++++
 src/mesa/pipe/cell/spu/spu_tri.c   |  5 +++
 4 files changed, 105 insertions(+)
 create mode 100644 src/mesa/pipe/cell/spu/spu_blend.c
 create mode 100644 src/mesa/pipe/cell/spu/spu_blend.h

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/Makefile b/src/mesa/pipe/cell/spu/Makefile
index 91a631b699..66f16cde9b 100644
--- a/src/mesa/pipe/cell/spu/Makefile
+++ b/src/mesa/pipe/cell/spu/Makefile
@@ -19,6 +19,7 @@ PROG_SPU_EMBED_O = $(PROG)_spu-embed.o
 
 SOURCES = \
 	spu_main.c \
+	spu_blend.c \
 	spu_render.c \
 	spu_texture.c \
 	spu_tile.c \
diff --git a/src/mesa/pipe/cell/spu/spu_blend.c b/src/mesa/pipe/cell/spu/spu_blend.c
new file mode 100644
index 0000000000..23ec0eeb45
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_blend.c
@@ -0,0 +1,62 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "spu_main.h"
+#include "spu_blend.h"
+#include "spu_colorpack.h"
+
+
+void
+blend_quad(uint itx, uint ity, vector float colors[4])
+{
+   /* simple SRC_ALPHA, ONE_MINUS_SRC_ALPHA blending */
+   vector float fbc00 = spu_unpack_color(spu.ctile.ui[ity][itx]);
+   vector float fbc01 = spu_unpack_color(spu.ctile.ui[ity][itx+1]);
+   vector float fbc10 = spu_unpack_color(spu.ctile.ui[ity+1][itx]);
+   vector float fbc11 = spu_unpack_color(spu.ctile.ui[ity+1][itx+1]);
+
+   vector float alpha00 = spu_splats(spu_extract(colors[0], 3));
+   vector float alpha01 = spu_splats(spu_extract(colors[1], 3));
+   vector float alpha10 = spu_splats(spu_extract(colors[2], 3));
+   vector float alpha11 = spu_splats(spu_extract(colors[3], 3));
+
+   vector float one_minus_alpha00 = spu_sub(spu_splats(1.0f), alpha00);
+   vector float one_minus_alpha01 = spu_sub(spu_splats(1.0f), alpha01);
+   vector float one_minus_alpha10 = spu_sub(spu_splats(1.0f), alpha10);
+   vector float one_minus_alpha11 = spu_sub(spu_splats(1.0f), alpha11);
+
+   colors[0] = spu_add(spu_mul(colors[0], alpha00),
+                       spu_mul(fbc00, one_minus_alpha00));
+   colors[1] = spu_add(spu_mul(colors[1], alpha01),
+                       spu_mul(fbc01, one_minus_alpha01));
+   colors[2] = spu_add(spu_mul(colors[2], alpha10),
+                       spu_mul(fbc10, one_minus_alpha10));
+   colors[3] = spu_add(spu_mul(colors[3], alpha11),
+                       spu_mul(fbc11, one_minus_alpha11));
+}
+
diff --git a/src/mesa/pipe/cell/spu/spu_blend.h b/src/mesa/pipe/cell/spu/spu_blend.h
new file mode 100644
index 0000000000..2b594b578b
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_blend.h
@@ -0,0 +1,37 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef SPU_BLEND_H
+#define SPU_BLEND_H
+
+
+extern void
+blend_quad(uint itx, uint ity, vector float colors[4]);
+
+
+#endif /* SPU_BLEND_H */
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 199afa1aa6..89aaca9a72 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -324,6 +324,11 @@ emit_quad( int x, int y, mask_t mask )
          vector float colors[4];
          eval_coeff(1, (float) x, (float) y, colors);
 
+#if 0
+         if (spu.blend.blend_enable)
+            blend_quad(ix % TILE_SIZE, iy % TILE_SIZE, colors);
+#endif
+
          if (spu_extract(mask, 0))
             spu.ctile.ui[iy][ix] = spu_pack_color_shuffle(colors[0], shuffle);
          if (spu_extract(mask, 1))
-- 
cgit v1.2.3


From 76c1a10eb121f040ef510124bf6aa24c4c5c3f8f Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 16:04:50 -0700
Subject: Cell: fix typo

---
 src/mesa/pipe/cell/spu/spu_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 9d8e6df0e3..b0311db1aa 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -236,7 +236,7 @@ static void
 cmd_state_blend(const struct pipe_blend_state *state)
 {
    if (Debug)
-      printf("SPU %u: BLEND: ztest %d\n",
+      printf("SPU %u: BLEND: enabled %d\n",
              spu.init.id,
              state->blend_enable);
 
-- 
cgit v1.2.3


From 5068b573c417bdb317e1938585bebfe931bda049 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 16:05:13 -0700
Subject: Cell: added spu_unpack_A8R8G8B8()

---
 src/mesa/pipe/cell/spu/spu_colorpack.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_colorpack.h b/src/mesa/pipe/cell/spu/spu_colorpack.h
index 0c93c06562..57ea3525c2 100644
--- a/src/mesa/pipe/cell/spu/spu_colorpack.h
+++ b/src/mesa/pipe/cell/spu/spu_colorpack.h
@@ -91,4 +91,19 @@ spu_unpack_color(uint color)
 }
 
 
+static INLINE vector float
+spu_unpack_A8R8G8B8(uint color)
+{
+   vector unsigned int color_u4 = spu_splats(color);
+   color_u4 = spu_shuffle(color_u4, color_u4,
+                          VEC_LITERAL(vector unsigned char,
+                                      5, 5, 5, 5,
+                                      10, 10, 10, 10,
+                                      15, 15, 15, 15,
+                                      0, 0, 0, 0));
+
+   return spu_convtf(color_u4, 32);
+}
+
+
 #endif /* SPU_COLORPACK_H */
-- 
cgit v1.2.3


From efa8e03a6f3f7c27b019d20cca93bf7e624d7035 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 16:06:51 -0700
Subject: Cell: texture sampler functions always return vector float now

Texture colors look the same now, regardless of X display/pixel format
---
 src/mesa/pipe/cell/spu/spu_main.h    |  2 +-
 src/mesa/pipe/cell/spu/spu_texture.c | 19 ++++++++-----------
 src/mesa/pipe/cell/spu/spu_texture.h |  4 ++--
 src/mesa/pipe/cell/spu/spu_tri.c     | 36 ++++++++++++++++++------------------
 4 files changed, 29 insertions(+), 32 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index b22d563551..cfd4d72729 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -115,7 +115,7 @@ struct spu_global
    vector float tex_size;
    vector unsigned int tex_size_mask; /**< == int(size - 1) */
 
-   uint (*sample_texture)(vector float texcoord);
+   vector float (*sample_texture)(vector float texcoord);
 
 } ALIGN16_ATTRIB;
 
diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
index ecacf2ec88..9ee2b45e24 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.c
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -130,7 +130,7 @@ get_tex_tile(vector unsigned int ij)
  * Get texture sample at texcoord.
  * XXX this is extremely primitive for now.
  */
-uint
+vector float
 sample_texture_nearest(vector float texcoord)
 {
    vector float tc = spu_mul(texcoord, spu.tex_size);
@@ -139,11 +139,11 @@ sample_texture_nearest(vector float texcoord)
    vector unsigned int ij = spu_and(itc, TILE_SIZE-1); /* intra tile addr */
    uint pos = get_tex_tile(itc);
    uint texel = tex_tiles[pos].ui[spu_extract(ij, 1)][spu_extract(ij, 0)];
-   return texel;
+   return spu_unpack_A8R8G8B8(texel);
 }
 
 
-uint
+vector float
 sample_texture_bilinear(vector float texcoord)
 {
    static const vector unsigned int offset10 = {1, 0, 0, 0};
@@ -183,10 +183,10 @@ sample_texture_bilinear(vector float texcoord)
    }
 
    /* get texels from tiles and convert to float[4] */
-   vector float texel00 = spu_unpack_color(tex_tiles[pos00].ui[spu_extract(ij00, 1)][spu_extract(ij00, 0)]);
-   vector float texel01 = spu_unpack_color(tex_tiles[pos01].ui[spu_extract(ij01, 1)][spu_extract(ij01, 0)]);
-   vector float texel10 = spu_unpack_color(tex_tiles[pos10].ui[spu_extract(ij10, 1)][spu_extract(ij10, 0)]);
-   vector float texel11 = spu_unpack_color(tex_tiles[pos11].ui[spu_extract(ij11, 1)][spu_extract(ij11, 0)]);
+   vector float texel00 = spu_unpack_A8R8G8B8(tex_tiles[pos00].ui[spu_extract(ij00, 1)][spu_extract(ij00, 0)]);
+   vector float texel01 = spu_unpack_A8R8G8B8(tex_tiles[pos01].ui[spu_extract(ij01, 1)][spu_extract(ij01, 0)]);
+   vector float texel10 = spu_unpack_A8R8G8B8(tex_tiles[pos10].ui[spu_extract(ij10, 1)][spu_extract(ij10, 0)]);
+   vector float texel11 = spu_unpack_A8R8G8B8(tex_tiles[pos11].ui[spu_extract(ij11, 1)][spu_extract(ij11, 0)]);
 
    /* Compute weighting factors in [0,1]
     * Multiply texcoord by 1024, AND with 1023, convert back to float.
@@ -213,8 +213,5 @@ sample_texture_bilinear(vector float texcoord)
    texel_sum = spu_add(texel_sum, texel10);
    texel_sum = spu_add(texel_sum, texel11);
 
-   /* convert to uint color */
-   uint texel = spu_pack_R8G8B8A8(texel_sum);
-
-   return texel;
+   return texel_sum;
 }
diff --git a/src/mesa/pipe/cell/spu/spu_texture.h b/src/mesa/pipe/cell/spu/spu_texture.h
index 0e000bfebf..95eb87080f 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.h
+++ b/src/mesa/pipe/cell/spu/spu_texture.h
@@ -36,11 +36,11 @@ extern void
 invalidate_tex_cache(void);
 
 
-extern uint
+extern vector float
 sample_texture_nearest(vector float texcoord);
 
 
-extern uint
+extern vector float
 sample_texture_bilinear(vector float texcoord);
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 89aaca9a72..4c6de56eda 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -301,6 +301,8 @@ emit_quad( int x, int y, mask_t mask )
    if (spu_extract(spu_orx(mask), 0)) {
       const int ix = x - setup.cliprect_minx;
       const int iy = y - setup.cliprect_miny;
+      const vector unsigned char shuffle = spu.color_shuffle;
+      vector float colors[4];
 
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
 
@@ -310,34 +312,32 @@ emit_quad( int x, int y, mask_t mask )
          eval_coeff(2, (float) x, (float) y, texcoords);
 
          if (spu_extract(mask, 0))
-            spu.ctile.ui[iy][ix] = spu.sample_texture(texcoords[0]);
+            colors[0] = spu.sample_texture(texcoords[0]);
          if (spu_extract(mask, 1))
-            spu.ctile.ui[iy][ix+1] = spu.sample_texture(texcoords[1]);
+            colors[1] = spu.sample_texture(texcoords[1]);
          if (spu_extract(mask, 2))
-            spu.ctile.ui[iy+1][ix] = spu.sample_texture(texcoords[2]);
+            colors[2] = spu.sample_texture(texcoords[2]);
          if (spu_extract(mask, 3))
-            spu.ctile.ui[iy+1][ix+1] = spu.sample_texture(texcoords[3]);
+            colors[3] = spu.sample_texture(texcoords[3]);
       }
       else {
          /* simple shading */
-         const vector unsigned char shuffle = spu.color_shuffle;
-         vector float colors[4];
          eval_coeff(1, (float) x, (float) y, colors);
+      }
 
-#if 0
-         if (spu.blend.blend_enable)
-            blend_quad(ix % TILE_SIZE, iy % TILE_SIZE, colors);
+#if 1
+      if (spu.blend.blend_enable)
+         blend_quad(ix % TILE_SIZE, iy % TILE_SIZE, colors);
 #endif
 
-         if (spu_extract(mask, 0))
-            spu.ctile.ui[iy][ix] = spu_pack_color_shuffle(colors[0], shuffle);
-         if (spu_extract(mask, 1))
-            spu.ctile.ui[iy][ix+1] = spu_pack_color_shuffle(colors[1], shuffle);
-         if (spu_extract(mask, 2))
-            spu.ctile.ui[iy+1][ix] = spu_pack_color_shuffle(colors[2], shuffle);
-         if (spu_extract(mask, 3))
-            spu.ctile.ui[iy+1][ix+1] = spu_pack_color_shuffle(colors[3], shuffle);
-      }
+      if (spu_extract(mask, 0))
+         spu.ctile.ui[iy][ix] = spu_pack_color_shuffle(colors[0], shuffle);
+      if (spu_extract(mask, 1))
+         spu.ctile.ui[iy][ix+1] = spu_pack_color_shuffle(colors[1], shuffle);
+      if (spu_extract(mask, 2))
+         spu.ctile.ui[iy+1][ix] = spu_pack_color_shuffle(colors[2], shuffle);
+      if (spu_extract(mask, 3))
+         spu.ctile.ui[iy+1][ix+1] = spu_pack_color_shuffle(colors[3], shuffle);
 
 #if 0
       /* SIMD_Z with swizzled color buffer (someday) */
-- 
cgit v1.2.3


From 1a75464cdc12a1e83f1452707cd624c53f808308 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 16:48:00 -0700
Subject: Cell: fix small sampling error in sample_texture_bilinear()

---
 src/mesa/pipe/cell/spu/spu_texture.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
index 9ee2b45e24..01ff33a857 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.c
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -150,6 +150,8 @@ sample_texture_bilinear(vector float texcoord)
    static const vector unsigned int offset01 = {0, 1, 0, 0};
 
    vector float tc = spu_mul(texcoord, spu.tex_size);
+   tc = spu_add(tc, spu_splats(-0.5f));  /* half texel bias */
+
    /* integer texcoords S,T: */
    vector unsigned int itc00 = spu_convtu(tc, 0);  /* convert to int */
    vector unsigned int itc01 = spu_add(itc00, offset01);
-- 
cgit v1.2.3


From 93d061b217e31d27a1c54e50a14538e94f1404d6 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 16:48:36 -0700
Subject: Cell: move float4 typedef (temporary datatype)

---
 src/mesa/pipe/cell/spu/spu_main.h | 7 -------
 src/mesa/pipe/cell/spu/spu_tri.c  | 6 ++++++
 2 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index cfd4d72729..1710a17512 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -41,13 +41,6 @@
 #define MAX_HEIGHT 1024
 
 
-typedef union
-{
-   vector float v;
-   float f[4];
-} float4;
-
-
 typedef union {
    ushort us[TILE_SIZE][TILE_SIZE];
    uint   ui[TILE_SIZE][TILE_SIZE];
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 4c6de56eda..688c8646ab 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -45,6 +45,12 @@
 /** Masks are uint[4] vectors with each element being 0 or 0xffffffff */
 typedef vector unsigned int mask_t;
 
+typedef union
+{
+   vector float v;
+   float f[4];
+} float4;
+
 
 /**
  * Simplified types taken from other parts of Gallium
-- 
cgit v1.2.3


From 7cbe5cf212d296c19ccf8e1b74d3a5b1bcb2d9e9 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 18:02:21 -0700
Subject: Cell: don't use VEC_LITERAL macro, doesn't work w/ SDK 3.0

---
 src/mesa/pipe/cell/spu/spu_colorpack.h | 41 +++++++++++++++++-----------------
 src/mesa/pipe/cell/spu/spu_ztest.h     | 24 ++++++++++----------
 2 files changed, 33 insertions(+), 32 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_colorpack.h b/src/mesa/pipe/cell/spu/spu_colorpack.h
index 57ea3525c2..e9fee8a3a6 100644
--- a/src/mesa/pipe/cell/spu/spu_colorpack.h
+++ b/src/mesa/pipe/cell/spu/spu_colorpack.h
@@ -31,7 +31,6 @@
 #define SPU_COLORPACK_H
 
 
-#include <vec_literal.h>
 #include <spu_intrinsics.h>
 
 
@@ -39,9 +38,11 @@ static INLINE unsigned int
 spu_pack_R8G8B8A8(vector float rgba)
 {
   vector unsigned int out = spu_convtu(rgba, 32);
-  out = spu_shuffle(out, out, VEC_LITERAL(vector unsigned char,
-					  0, 4, 8, 12, 0, 0, 0, 0, 
-                                          0, 0, 0, 0, 0, 0, 0, 0));
+
+  out = spu_shuffle(out, out, ((vector unsigned char) {
+                                  0, 4, 8, 12, 0, 0, 0, 0, 
+                                  0, 0, 0, 0, 0, 0, 0, 0 }) );
+
   return spu_extract(out, 0);
 }
 
@@ -50,9 +51,9 @@ static INLINE unsigned int
 spu_pack_A8R8G8B8(vector float rgba)
 {
   vector unsigned int out = spu_convtu(rgba, 32);
-  out = spu_shuffle(out, out, VEC_LITERAL(vector unsigned char,
-					  12, 0, 4, 8, 0, 0, 0, 0, 
-                                          0, 0, 0, 0, 0, 0, 0, 0));
+  out = spu_shuffle(out, out, ((vector unsigned char) {
+                                  12, 0, 4, 8, 0, 0, 0, 0, 
+                                  0, 0, 0, 0, 0, 0, 0, 0}) );
   return spu_extract(out, 0);
 }
 
@@ -61,9 +62,9 @@ static INLINE unsigned int
 spu_pack_B8G8R8A8(vector float rgba)
 {
   vector unsigned int out = spu_convtu(rgba, 32);
-  out = spu_shuffle(out, out, VEC_LITERAL(vector unsigned char,
-					  8, 4, 0, 12, 0, 0, 0, 0, 
-                                          0, 0, 0, 0, 0, 0, 0, 0));
+  out = spu_shuffle(out, out, ((vector unsigned char) {
+                                  8, 4, 0, 12, 0, 0, 0, 0, 
+                                  0, 0, 0, 0, 0, 0, 0, 0}) );
   return spu_extract(out, 0);
 }
 
@@ -82,11 +83,11 @@ spu_unpack_color(uint color)
 {
    vector unsigned int color_u4 = spu_splats(color);
    color_u4 = spu_shuffle(color_u4, color_u4,
-                          VEC_LITERAL(vector unsigned char,
-                                      0, 0, 0, 0,
-                                      5, 5, 5, 5,
-                                      10, 10, 10, 10,
-                                      15, 15, 15, 15));
+                          ((vector unsigned char) {
+                             0, 0, 0, 0,
+                             5, 5, 5, 5,
+                             10, 10, 10, 10,
+                             15, 15, 15, 15}) );
    return spu_convtf(color_u4, 32);
 }
 
@@ -96,11 +97,11 @@ spu_unpack_A8R8G8B8(uint color)
 {
    vector unsigned int color_u4 = spu_splats(color);
    color_u4 = spu_shuffle(color_u4, color_u4,
-                          VEC_LITERAL(vector unsigned char,
-                                      5, 5, 5, 5,
-                                      10, 10, 10, 10,
-                                      15, 15, 15, 15,
-                                      0, 0, 0, 0));
+                          ((vector unsigned char) {
+                             5, 5, 5, 5,
+                             10, 10, 10, 10,
+                             15, 15, 15, 15,
+                             0, 0, 0, 0}) );
 
    return spu_convtf(color_u4, 32);
 }
diff --git a/src/mesa/pipe/cell/spu/spu_ztest.h b/src/mesa/pipe/cell/spu/spu_ztest.h
index 5fefb15176..ce8ad00339 100644
--- a/src/mesa/pipe/cell/spu/spu_ztest.h
+++ b/src/mesa/pipe/cell/spu/spu_ztest.h
@@ -68,9 +68,9 @@ spu_z16_test_less(vector float zvals, vector unsigned short *zbuf,
       /* gather lower four ushorts */
       zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf,
                              (vector unsigned int) *zbuf,
-                             VEC_LITERAL(vector unsigned char,
-                                      ZERO, ZERO,  8,  9, ZERO, ZERO, 10, 11,
-                                      ZERO, ZERO, 12, 13, ZERO, ZERO, 14, 15));
+                             ((vector unsigned char) {
+                                ZERO, ZERO,  8,  9, ZERO, ZERO, 10, 11,
+                                ZERO, ZERO, 12, 13, ZERO, ZERO, 14, 15}));
       /* mask = (zbuf_ui4 < zvals_ui4) ? ~0 : 0 */
       mask = spu_cmpgt(zbuf_ui4, zvals_ui4);
       /* mask &= inMask */
@@ -80,18 +80,18 @@ spu_z16_test_less(vector float zvals, vector unsigned short *zbuf,
       /* convert zbuffer values from uints back to ushorts, preserve lower 4 */
       *zbuf = (vector unsigned short)
          spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf,
-                     VEC_LITERAL(vector unsigned char,
-                                 16, 17, 18, 19, 20, 21, 22, 23,
-                                 2, 3, 6, 7, 10, 11, 14, 15));
+                     ((vector unsigned char) {
+                        16, 17, 18, 19, 20, 21, 22, 23,
+                        2, 3, 6, 7, 10, 11, 14, 15}));
    }
    else {
       /* convert zbuffer values from ushorts to uints */
       /* gather upper four ushorts */
       zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf,
                              (vector unsigned int) *zbuf,
-                             VEC_LITERAL(vector unsigned char,
-                                         ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3,
-                                         ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7));
+                             ((vector unsigned char) {
+                                ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3,
+                                ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7}));
       /* mask = (zbuf_ui4 < zvals_ui4) ? ~0 : 0 */
       mask = spu_cmpgt(zbuf_ui4, zvals_ui4);
       /* mask &= inMask */
@@ -101,9 +101,9 @@ spu_z16_test_less(vector float zvals, vector unsigned short *zbuf,
       /* convert zbuffer values from uints back to ushorts, preserve upper 4 */
       *zbuf = (vector unsigned short)
          spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf,
-                     VEC_LITERAL(vector unsigned char,
-                                 2, 3, 6, 7, 10, 11, 14, 15,
-                                 24, 25, 26, 27, 28, 29, 30, 31));
+                     ((vector unsigned char) {
+                        2, 3, 6, 7, 10, 11, 14, 15,
+                        24, 25, 26, 27, 28, 29, 30, 31}));
    }
    return mask;
 #undef ZERO
-- 
cgit v1.2.3


From 684d320ea2e7ec03d01275a544068cc6b45e1e9a Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 18:03:05 -0700
Subject: Cell: don't use VEC_LITERAL macro, doesn't work w/ SDK 3.0

---
 src/mesa/pipe/cell/spu/spu_texture.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
index 01ff33a857..3962aaa4a9 100644
--- a/src/mesa/pipe/cell/spu/spu_texture.c
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -26,8 +26,6 @@
  **************************************************************************/
 
 
-#include <vec_literal.h>
-
 #include "pipe/p_compiler.h"
 #include "spu_main.h"
 #include "spu_texture.h"
@@ -57,7 +55,7 @@ invalidate_tex_cache(void)
    /* XXX memset? */
    uint i;
    for (i = 0; i < CACHE_SIZE; i++) {
-      tex_tile_xy[i] = VEC_LITERAL(vector unsigned int, ~0U, ~0U, ~0U, ~0U);
+      tex_tile_xy[i] = ((vector unsigned int) { ~0U, ~0U, ~0U, ~0U });
    }
 }
 
-- 
cgit v1.2.3


From 5db1593c78192b764ad2ef7bdc5182d8ec4aed7c Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 4 Feb 2008 18:05:37 -0700
Subject: Cell: fix some alignment issues by aligning commands to 8-byte
 boundaries

Contributed by Ian Romanick.
Also, temporarily disable inlined vertex buffers.  They need to be 16-byte
aligned...
---
 src/mesa/pipe/cell/common.h                 | 16 ++++----
 src/mesa/pipe/cell/ppu/cell_batch.c         |  4 +-
 src/mesa/pipe/cell/ppu/cell_flush.c         |  2 +-
 src/mesa/pipe/cell/ppu/cell_state_emit.c    |  3 +-
 src/mesa/pipe/cell/ppu/cell_vbuf.c          |  4 +-
 src/mesa/pipe/cell/ppu/cell_vertex_shader.c | 22 ++++++-----
 src/mesa/pipe/cell/spu/spu_main.c           | 58 +++++++++++++----------------
 src/mesa/pipe/cell/spu/spu_vertex_fetch.c   |  7 ++--
 src/mesa/pipe/cell/spu/spu_vertex_shader.h  |  2 +-
 9 files changed, 57 insertions(+), 61 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index d861e82d33..cf8fc94ebf 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -57,6 +57,9 @@
 /** round up value to next multiple of 4 */
 #define ROUNDUP4(k)  (((k) + 0x3) & ~0x3)
 
+/** round up value to next multiple of 8 */
+#define ROUNDUP8(k)  (((k) + 0x7) & ~0x7)
+
 /** round up value to next multiple of 16 */
 #define ROUNDUP16(k)  (((k) + 0xf) & ~0xf)
 
@@ -102,7 +105,7 @@
  */
 struct cell_command_framebuffer
 {
-   uint opcode;
+   uint64_t opcode;
    int width, height;
    void *color_start, *depth_start;
    enum pipe_format color_format, depth_format;
@@ -114,7 +117,7 @@ struct cell_command_framebuffer
  */
 struct cell_command_clear_surface
 {
-   uint opcode;
+   uint64_t opcode;
    uint surface; /**< Temporary: 0=color, 1=Z */
    uint value;
 };
@@ -125,8 +128,7 @@ struct cell_command_clear_surface
  */
 struct cell_array_info
 {
-    uint opcode;
-    uint base;          /**< Base address of the 0th element. */
+    uint64_t base;          /**< Base address of the 0th element. */
     uint attr;          /**< Attribute that this state if for. */
     uint pitch;         /**< Byte pitch from one entry to the next. */
     uint format;        /**< Pipe format of each entry. */
@@ -150,7 +152,7 @@ struct cell_shader_info
 #define SPU_VERTS_PER_BATCH 64
 struct cell_command_vs
 {
-   uint opcode;       /**< CELL_CMD_VS_EXECUTE */
+   uint64_t opcode;       /**< CELL_CMD_VS_EXECUTE */
    struct cell_shader_info   shader;
    unsigned num_elts;
    unsigned elts[SPU_VERTS_PER_BATCH];
@@ -163,7 +165,7 @@ struct cell_command_vs
 
 struct cell_command_render
 {
-   uint opcode;       /**< CELL_CMD_RENDER */
+   uint64_t opcode;   /**< CELL_CMD_RENDER */
    uint prim_type;    /**< PIPE_PRIM_x */
    uint num_verts;
    uint vertex_size;  /**< bytes per vertex */
@@ -179,7 +181,7 @@ struct cell_command_render
 
 struct cell_command_release_verts
 {
-   int opcode;         /**< CELL_CMD_RELEASE_VERTS */
+   uint64_t opcode;         /**< CELL_CMD_RELEASE_VERTS */
    uint vertex_buf;    /**< in [0, CELL_NUM_BUFFERS-1] */
 };
 
diff --git a/src/mesa/pipe/cell/ppu/cell_batch.c b/src/mesa/pipe/cell/ppu/cell_batch.c
index 2d032fc902..2fb49711b2 100644
--- a/src/mesa/pipe/cell/ppu/cell_batch.c
+++ b/src/mesa/pipe/cell/ppu/cell_batch.c
@@ -136,7 +136,7 @@ cell_batch_append(struct cell_context *cell, const void *data, uint bytes)
 {
    uint size;
 
-   ASSERT(bytes % 4 == 0);
+   ASSERT(bytes % 8 == 0);
    ASSERT(bytes <= CELL_BUFFER_SIZE);
    ASSERT(cell->cur_batch >= 0);
 
@@ -171,7 +171,7 @@ cell_batch_alloc(struct cell_context *cell, uint bytes)
    void *pos;
    uint size;
 
-   ASSERT(bytes % 4 == 0);
+   ASSERT(bytes % 8 == 0);
    ASSERT(bytes <= CELL_BUFFER_SIZE);
 
    assert(cell->cur_batch >= 0);
diff --git a/src/mesa/pipe/cell/ppu/cell_flush.c b/src/mesa/pipe/cell/ppu/cell_flush.c
index cf4e676645..f62bc4650c 100644
--- a/src/mesa/pipe/cell/ppu/cell_flush.c
+++ b/src/mesa/pipe/cell/ppu/cell_flush.c
@@ -59,7 +59,7 @@ cell_flush_int(struct pipe_context *pipe, unsigned flags)
    flushing = TRUE;
 
    if (flags & PIPE_FLUSH_WAIT) {
-      uint *cmd = (uint *) cell_batch_alloc(cell, sizeof(uint));
+      uint64_t *cmd = (uint64_t *) cell_batch_alloc(cell, sizeof(uint64_t));
       *cmd = CELL_CMD_FINISH;
    }
 
diff --git a/src/mesa/pipe/cell/ppu/cell_state_emit.c b/src/mesa/pipe/cell/ppu/cell_state_emit.c
index 3b2670f786..5d2a786449 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_emit.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_emit.c
@@ -37,7 +37,8 @@ static void
 emit_state_cmd(struct cell_context *cell, uint cmd,
                const void *state, uint state_size)
 {
-   uint *dst = (uint *) cell_batch_alloc(cell, sizeof(uint) + state_size);
+   uint64_t *dst = (uint64_t *) 
+       cell_batch_alloc(cell, ROUNDUP8(sizeof(uint64_t) + state_size));
    *dst = cmd;
    memcpy(dst + 1, state, state_size);
 }
diff --git a/src/mesa/pipe/cell/ppu/cell_vbuf.c b/src/mesa/pipe/cell/ppu/cell_vbuf.c
index e63b34cf52..0fee61821a 100644
--- a/src/mesa/pipe/cell/ppu/cell_vbuf.c
+++ b/src/mesa/pipe/cell/ppu/cell_vbuf.c
@@ -40,7 +40,7 @@
 
 
 /** Allow vertex data to be inlined after RENDER command */
-#define ALLOW_INLINE_VERTS 1
+#define ALLOW_INLINE_VERTS 0
 
 
 /**
@@ -197,7 +197,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
 
    /* build/insert batch RENDER command */
    {
-      const uint index_bytes = ROUNDUP4(nr_indices * 2);
+      const uint index_bytes = ROUNDUP8(nr_indices * 2);
       const uint vertex_bytes = nr_vertices * 4 * cell->vertex_info.size;
 
       const uint batch_size = sizeof(struct cell_command_render)
diff --git a/src/mesa/pipe/cell/ppu/cell_vertex_shader.c b/src/mesa/pipe/cell/ppu/cell_vertex_shader.c
index aef329a902..80dd500b34 100644
--- a/src/mesa/pipe/cell/ppu/cell_vertex_shader.c
+++ b/src/mesa/pipe/cell/ppu/cell_vertex_shader.c
@@ -52,8 +52,8 @@ cell_vertex_shader_queue_flush(struct draw_context *draw)
    struct cell_context *const cell =
        (struct cell_context *) draw->driver_private;
    struct cell_command_vs *const vs = &cell_global.command[0].vs;
-   unsigned *batch;
-   struct cell_array_info array_info;
+   uint64_t *batch;
+   struct cell_array_info *array_info;
    unsigned i, j;
 
    assert(draw->vs.queue_nr != 0);
@@ -63,17 +63,19 @@ cell_vertex_shader_queue_flush(struct draw_context *draw)
    draw_update_vertex_fetch(draw);
 
    for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) {
-      array_info.opcode = CELL_CMD_STATE_VS_ARRAY_INFO;
-      assert(draw->vertex_fetch.src_ptr[i] != NULL);
-      array_info.base = (uintptr_t) draw->vertex_fetch.src_ptr[i];
-      array_info.attr = i;
-      array_info.pitch = draw->vertex_fetch.pitch[i];
-      array_info.format = draw->vertex_element[i].src_format;
+      batch = cell_batch_alloc(cell, sizeof(batch[0]) + sizeof(*array_info));
+
+      batch[0] = CELL_CMD_STATE_VS_ARRAY_INFO;
 
-      cell_batch_append(cell, & array_info, sizeof(array_info));
+      array_info = (struct cell_array_info *) &batch[1];
+      assert(draw->vertex_fetch.src_ptr[i] != NULL);
+      array_info->base = (uintptr_t) draw->vertex_fetch.src_ptr[i];
+      array_info->attr = i;
+      array_info->pitch = draw->vertex_fetch.pitch[i];
+      array_info->format = draw->vertex_element[i].src_format;
    }
 
-   batch = cell_batch_alloc(cell, sizeof(unsigned)
+   batch = cell_batch_alloc(cell, sizeof(batch[0])
                             + sizeof(struct pipe_viewport_state));
    batch[0] = CELL_CMD_STATE_VIEWPORT;
    (void) memcpy(&batch[1], &draw->viewport,
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index b0311db1aa..4f126d5e5b 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -31,7 +31,6 @@
 
 #include <stdio.h>
 #include <libmisc.h>
-#include <vec_literal.h>
 
 #include "spu_main.h"
 #include "spu_render.h"
@@ -220,13 +219,13 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
       spu.fb.zsize = 0;
 
    if (spu.fb.color_format == PIPE_FORMAT_A8R8G8B8_UNORM)
-      spu.color_shuffle = VEC_LITERAL(vector unsigned char,
-                                      12, 0, 4, 8, 0, 0, 0, 0, 
-                                      0, 0, 0, 0, 0, 0, 0, 0);
+      spu.color_shuffle = ((vector unsigned char) {
+                              12, 0, 4, 8, 0, 0, 0, 0, 
+                              0, 0, 0, 0, 0, 0, 0, 0});
    else if (spu.fb.color_format == PIPE_FORMAT_B8G8R8A8_UNORM)
-      spu.color_shuffle = VEC_LITERAL(vector unsigned char,
-                                      8, 4, 0, 12, 0, 0, 0, 0, 
-                                      0, 0, 0, 0, 0, 0, 0, 0);
+      spu.color_shuffle = ((vector unsigned char) {
+                              8, 4, 0, 12, 0, 0, 0, 0, 
+                              0, 0, 0, 0, 0, 0, 0, 0});
    else
       ASSERT(0);
 }
@@ -279,16 +278,10 @@ cmd_state_texture(const struct cell_command_texture *texture)
              spu.init.id, texture->start, texture->width, texture->height);
 
    memcpy(&spu.texture, texture, sizeof(*texture));
-   spu.tex_size = VEC_LITERAL(vector float,
-                              spu.texture.width,
-                              spu.texture.height,
-                              0.0,
-                              0.0);
-   spu.tex_size_mask = VEC_LITERAL(vector unsigned int,
-                                   spu.texture.width - 1,
-                                   spu.texture.height - 1,
-                                   0,
-                                   0);
+   spu.tex_size = (vector float)
+      { spu.texture.width, spu.texture.height, 0.0, 0.0};
+   spu.tex_size_mask = (vector unsigned int)
+      { spu.texture.width - 1, spu.texture.height - 1, 0, 0 };
 }
 
 
@@ -341,8 +334,8 @@ cmd_batch(uint opcode)
 {
    const uint buf = (opcode >> 8) & 0xff;
    uint size = (opcode >> 16);
-   uint buffer[CELL_BUFFER_SIZE / 4] ALIGN16_ATTRIB;
-   const uint usize = size / sizeof(uint);
+   uint64_t buffer[CELL_BUFFER_SIZE / 8] ALIGN16_ATTRIB;
+   const unsigned usize = size / sizeof(buffer[0]);
    uint pos;
 
    if (Debug)
@@ -377,7 +370,7 @@ cmd_batch(uint opcode)
             struct cell_command_framebuffer *fb
                = (struct cell_command_framebuffer *) &buffer[pos];
             cmd_state_framebuffer(fb);
-            pos += sizeof(*fb) / 4;
+            pos += sizeof(*fb) / 8;
          }
          break;
       case CELL_CMD_CLEAR_SURFACE:
@@ -385,7 +378,7 @@ cmd_batch(uint opcode)
             struct cell_command_clear_surface *clr
                = (struct cell_command_clear_surface *) &buffer[pos];
             cmd_clear_surface(clr);
-            pos += sizeof(*clr) / 4;
+            pos += sizeof(*clr) / 8;
          }
          break;
       case CELL_CMD_RENDER:
@@ -394,7 +387,7 @@ cmd_batch(uint opcode)
                = (struct cell_command_render *) &buffer[pos];
             uint pos_incr;
             cmd_render(render, &pos_incr);
-            pos += sizeof(*render) / 4 + pos_incr;
+            pos += sizeof(*render) / 8 + ((pos_incr + 1) / 2);
          }
          break;
       case CELL_CMD_RELEASE_VERTS:
@@ -402,8 +395,7 @@ cmd_batch(uint opcode)
             struct cell_command_release_verts *release
                = (struct cell_command_release_verts *) &buffer[pos];
             cmd_release_verts(release);
-            ASSERT(sizeof(*release) == 8);
-            pos += sizeof(*release) / 4;
+            pos += sizeof(*release) / 8;
          }
          break;
       case CELL_CMD_FINISH:
@@ -413,36 +405,36 @@ cmd_batch(uint opcode)
       case CELL_CMD_STATE_BLEND:
          cmd_state_blend((struct pipe_blend_state *)
                                  &buffer[pos+1]);
-         pos += (1 + sizeof(struct pipe_blend_state) / 4);
+         pos += (1 + ROUNDUP8(sizeof(struct pipe_blend_state)) / 8);
          break;
       case CELL_CMD_STATE_DEPTH_STENCIL:
          cmd_state_depth_stencil((struct pipe_depth_stencil_alpha_state *)
                                  &buffer[pos+1]);
-         pos += (1 + sizeof(struct pipe_depth_stencil_alpha_state) / 4);
+         pos += (1 + ROUNDUP8(sizeof(struct pipe_depth_stencil_alpha_state)) / 8);
          break;
       case CELL_CMD_STATE_SAMPLER:
          cmd_state_sampler((struct pipe_sampler_state *) &buffer[pos+1]);
-         pos += (1 + sizeof(struct pipe_sampler_state) / 4);
+         pos += (1 + ROUNDUP8(sizeof(struct pipe_sampler_state)) / 8);
          break;
       case CELL_CMD_STATE_TEXTURE:
          cmd_state_texture((struct cell_command_texture *) &buffer[pos+1]);
-         pos += (1 + sizeof(struct cell_command_texture) / 4);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_command_texture)) / 8);
          break;
       case CELL_CMD_STATE_VERTEX_INFO:
          cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]);
-         pos += (1 + sizeof(struct vertex_info) / 4);
+         pos += (1 + ROUNDUP8(sizeof(struct vertex_info)) / 8);
          break;
       case CELL_CMD_STATE_VIEWPORT:
          (void) memcpy(& draw.viewport, &buffer[pos+1],
                        sizeof(struct pipe_viewport_state));
-         pos += (1 + sizeof(struct pipe_viewport_state) / 4);
+         pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8);
          break;
       case CELL_CMD_STATE_VS_ARRAY_INFO:
-         cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos]);
-         pos += (sizeof(struct cell_array_info) / 4);
+         cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
          break;
       default:
-         printf("SPU %u: bad opcode: 0x%x\n", spu.init.id, buffer[pos]);
+         printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]);
          ASSERT(0);
          break;
       }
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
index 1e846868e3..5b0f2a6470 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -431,9 +431,8 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
    /* loop over vertex attributes (vertex shader inputs)
     */
    for (attr = 0; attr < nr_attrs; attr++) {
-
-      const unsigned pitch   = draw->vertex_fetch.pitch[attr];
-      const ubyte *src = draw->vertex_fetch.src_ptr[attr];
+      const unsigned pitch = draw->vertex_fetch.pitch[attr];
+      const uint64_t src = draw->vertex_fetch.src_ptr[attr];
       const spu_fetch_func fetch = draw->vertex_fetch.fetch[attr];
       unsigned i;
       float p[4][4];
@@ -447,7 +446,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
        */
       for (i = 0; i < count; i++) {
          uint8_t buffer[32] ALIGN16_ATTRIB;
-         const unsigned long addr = src + (elts[i] * pitch);
+         const uint64_t addr = src + (elts[i] * pitch);
          const unsigned size = ((addr & 0x0f) == 0) ? 16 : 32;
 
          mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0);
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.h b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
index c52f38fd02..b261ab44a2 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_shader.h
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
@@ -16,7 +16,7 @@ struct spu_vs_context {
    struct pipe_viewport_state viewport;
 
    struct {
-      const ubyte *src_ptr[PIPE_ATTRIB_MAX];
+      uint64_t src_ptr[PIPE_ATTRIB_MAX];
       unsigned pitch[PIPE_ATTRIB_MAX];
       enum pipe_format format[PIPE_ATTRIB_MAX];
       unsigned nr_attrs;
-- 
cgit v1.2.3


From 45b18c51c0f49731cb8fc0144d678da5fa814992 Mon Sep 17 00:00:00 2001
From: Michal Krol <michal@tungstengraphics.com>
Date: Tue, 5 Feb 2008 07:50:56 -0700
Subject: gallium: Use align_free to free aligned memory.

---
 src/mesa/pipe/draw/draw_context.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/mesa/pipe/draw/draw_context.c b/src/mesa/pipe/draw/draw_context.c
index 87f4969983..b15f57c824 100644
--- a/src/mesa/pipe/draw/draw_context.c
+++ b/src/mesa/pipe/draw/draw_context.c
@@ -106,7 +106,7 @@ void draw_destroy( struct draw_context *draw )
    if (draw->pipeline.rasterize)
       draw->pipeline.rasterize->destroy( draw->pipeline.rasterize );
    tgsi_exec_machine_free_data(&draw->machine);
-   FREE( draw->vcache.vertex[0] ); /* Frees all the vertices. */
+   align_free( draw->vcache.vertex[0] ); /* Frees all the vertices. */
    FREE( draw );
 }
 
-- 
cgit v1.2.3


From 8fc2355949b67cd99403c1184ce711a344877375 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Fri, 1 Feb 2008 14:58:38 -0800
Subject: Vectorize all micro ops

Fold single instruction micro ops inline.  Remove unused micro ops.
---
 src/mesa/pipe/cell/spu/spu_exec.c | 912 ++++++++++----------------------------
 src/mesa/pipe/cell/spu/spu_exec.h |   1 +
 2 files changed, 230 insertions(+), 683 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_exec.c b/src/mesa/pipe/cell/spu/spu_exec.c
index 168bada3bb..1ac9c031e3 100644
--- a/src/mesa/pipe/cell/spu/spu_exec.c
+++ b/src/mesa/pipe/cell/spu/spu_exec.c
@@ -52,8 +52,15 @@
 
 #include <libmisc.h>
 #include <spu_mfcio.h>
-#include <simdmath/sqrtf4.h>
+#include <simdmath/ceilf4.h>
+#include <simdmath/cosf4.h>
+#include <simdmath/divf4.h>
+#include <simdmath/floorf4.h>
+#include <simdmath/log2f4.h>
 #include <simdmath/powf4.h>
+#include <simdmath/sinf4.h>
+#include <simdmath/sqrtf4.h>
+#include <simdmath/truncf4.h>
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_state.h"
@@ -157,643 +164,175 @@ spu_exec_machine_init(struct spu_exec_machine *mach,
 }
 
 
-static void
-micro_abs(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   dst->f[0] = (float) fabs( (double) src->f[0] );
-   dst->f[1] = (float) fabs( (double) src->f[1] );
-   dst->f[2] = (float) fabs( (double) src->f[2] );
-   dst->f[3] = (float) fabs( (double) src->f[3] );
-}
-
-static void
-micro_add(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst->f[0] = src0->f[0] + src1->f[0];
-   dst->f[1] = src0->f[1] + src1->f[1];
-   dst->f[2] = src0->f[2] + src1->f[2];
-   dst->f[3] = src0->f[3] + src1->f[3];
-}
-
-static void
-micro_iadd(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] + src1->i[0];
-   dst->i[1] = src0->i[1] + src1->i[1];
-   dst->i[2] = src0->i[2] + src1->i[2];
-   dst->i[3] = src0->i[3] + src1->i[3];
-}
-
-static void
-micro_and(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] & src1->u[0];
-   dst->u[1] = src0->u[1] & src1->u[1];
-   dst->u[2] = src0->u[2] & src1->u[2];
-   dst->u[3] = src0->u[3] & src1->u[3];
-}
-
-static void
-micro_ceil(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   ASSERT(0);
-#if 0
-   dst->f[0] = (float) ceil( (double) src->f[0] );
-   dst->f[1] = (float) ceil( (double) src->f[1] );
-   dst->f[2] = (float) ceil( (double) src->f[2] );
-   dst->f[3] = (float) ceil( (double) src->f[3] );
-#endif
-}
-
-static void
-micro_cos(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   ASSERT(0);
-#if 0
-   dst->f[0] = (float) cos( (double) src->f[0] );
-   dst->f[1] = (float) cos( (double) src->f[1] );
-   dst->f[2] = (float) cos( (double) src->f[2] );
-   dst->f[3] = (float) cos( (double) src->f[3] );
-#endif
-}
-
-static void
-micro_ddx(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   dst->f[0] =
-   dst->f[1] =
-   dst->f[2] =
-   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
-}
-
-static void
-micro_ddy(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   dst->f[0] =
-   dst->f[1] =
-   dst->f[2] =
-   dst->f[3] = src->f[TILE_TOP_LEFT] - src->f[TILE_BOTTOM_LEFT];
-}
-
-static void
-micro_div(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst->f[0] = src0->f[0] / src1->f[0];
-   dst->f[1] = src0->f[1] / src1->f[1];
-   dst->f[2] = src0->f[2] / src1->f[2];
-   dst->f[3] = src0->f[3] / src1->f[3];
-}
-
-static void
-micro_udiv(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] / src1->u[0];
-   dst->u[1] = src0->u[1] / src1->u[1];
-   dst->u[2] = src0->u[2] / src1->u[2];
-   dst->u[3] = src0->u[3] / src1->u[3];
-}
-
-static void
-micro_eq(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1,
-   const union spu_exec_channel *src2,
-   const union spu_exec_channel *src3 )
-{
-   dst->f[0] = src0->f[0] == src1->f[0] ? src2->f[0] : src3->f[0];
-   dst->f[1] = src0->f[1] == src1->f[1] ? src2->f[1] : src3->f[1];
-   dst->f[2] = src0->f[2] == src1->f[2] ? src2->f[2] : src3->f[2];
-   dst->f[3] = src0->f[3] == src1->f[3] ? src2->f[3] : src3->f[3];
-}
-
-static void
-micro_ieq(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1,
-   const union spu_exec_channel *src2,
-   const union spu_exec_channel *src3 )
-{
-   dst->i[0] = src0->i[0] == src1->i[0] ? src2->i[0] : src3->i[0];
-   dst->i[1] = src0->i[1] == src1->i[1] ? src2->i[1] : src3->i[1];
-   dst->i[2] = src0->i[2] == src1->i[2] ? src2->i[2] : src3->i[2];
-   dst->i[3] = src0->i[3] == src1->i[3] ? src2->i[3] : src3->i[3];
-}
-
-static void
-micro_exp2(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src)
-{
-   ASSERT(0);
-#if 0
-   dst->f[0] = (float) pow( 2.0, (double) src->f[0] );
-   dst->f[1] = (float) pow( 2.0, (double) src->f[1] );
-   dst->f[2] = (float) pow( 2.0, (double) src->f[2] );
-   dst->f[3] = (float) pow( 2.0, (double) src->f[3] );
-#endif
-}
-
-static void
-micro_f2it(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   dst->i[0] = (int) src->f[0];
-   dst->i[1] = (int) src->f[1];
-   dst->i[2] = (int) src->f[2];
-   dst->i[3] = (int) src->f[3];
-}
-
-static void
-micro_f2ut(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   dst->u[0] = (uint) src->f[0];
-   dst->u[1] = (uint) src->f[1];
-   dst->u[2] = (uint) src->f[2];
-   dst->u[3] = (uint) src->f[3];
-}
-
-static void
-micro_flr(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   ASSERT(0);
-#if 0
-   dst->f[0] = (float) floor( (double) src->f[0] );
-   dst->f[1] = (float) floor( (double) src->f[1] );
-   dst->f[2] = (float) floor( (double) src->f[2] );
-   dst->f[3] = (float) floor( (double) src->f[3] );
-#endif
-}
-
-static void
-micro_frc(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   ASSERT(0);
-#if 0
-   dst->f[0] = src->f[0] - (float) floor( (double) src->f[0] );
-   dst->f[1] = src->f[1] - (float) floor( (double) src->f[1] );
-   dst->f[2] = src->f[2] - (float) floor( (double) src->f[2] );
-   dst->f[3] = src->f[3] - (float) floor( (double) src->f[3] );
-#endif
-}
-
-static void
-micro_ge(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1,
-   const union spu_exec_channel *src2,
-   const union spu_exec_channel *src3 )
-{
-   dst->f[0] = src0->f[0] >= src1->f[0] ? src2->f[0] : src3->f[0];
-   dst->f[1] = src0->f[1] >= src1->f[1] ? src2->f[1] : src3->f[1];
-   dst->f[2] = src0->f[2] >= src1->f[2] ? src2->f[2] : src3->f[2];
-   dst->f[3] = src0->f[3] >= src1->f[3] ? src2->f[3] : src3->f[3];
-}
-
-static void
-micro_i2f(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   dst->f[0] = (float) src->i[0];
-   dst->f[1] = (float) src->i[1];
-   dst->f[2] = (float) src->i[2];
-   dst->f[3] = (float) src->i[3];
-}
-
-static void
-micro_lg2(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   ASSERT(0);
-#if 0
-   dst->f[0] = (float) log( (double) src->f[0] ) * 1.442695f;
-   dst->f[1] = (float) log( (double) src->f[1] ) * 1.442695f;
-   dst->f[2] = (float) log( (double) src->f[2] ) * 1.442695f;
-   dst->f[3] = (float) log( (double) src->f[3] ) * 1.442695f;
-#endif
-}
-
-static void
-micro_lt(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1,
-   const union spu_exec_channel *src2,
-   const union spu_exec_channel *src3 )
+static INLINE qword
+micro_abs(qword src)
 {
-   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
-   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
-   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
-   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
+   return si_rotmi(si_shli(src, 1), -1); /* shift the sign bit out, then shift a zero back in: clears bit 31 of every word */
 }
 
-static void
-micro_ilt(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1,
-   const union spu_exec_channel *src2,
-   const union spu_exec_channel *src3 )
+static INLINE qword
+micro_ceil(qword src)
 {
-   dst->i[0] = src0->i[0] < src1->i[0] ? src2->i[0] : src3->i[0];
-   dst->i[1] = src0->i[1] < src1->i[1] ? src2->i[1] : src3->i[1];
-   dst->i[2] = src0->i[2] < src1->i[2] ? src2->i[2] : src3->i[2];
-   dst->i[3] = src0->i[3] < src1->i[3] ? src2->i[3] : src3->i[3];
+   return (qword) _ceilf4((vec_float4) src);
 }
 
-static void
-micro_ult(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1,
-   const union spu_exec_channel *src2,
-   const union spu_exec_channel *src3 )
+static INLINE qword
+micro_cos(qword src)
 {
-   dst->u[0] = src0->u[0] < src1->u[0] ? src2->u[0] : src3->u[0];
-   dst->u[1] = src0->u[1] < src1->u[1] ? src2->u[1] : src3->u[1];
-   dst->u[2] = src0->u[2] < src1->u[2] ? src2->u[2] : src3->u[2];
-   dst->u[3] = src0->u[3] < src1->u[3] ? src2->u[3] : src3->u[3];
+   return (qword) _cosf4((vec_float4) src);
 }
 
-static void
-micro_max(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static const qword br_shuf = { /* shufb control: byte offsets (4 * lane + k) of float lane TILE_BOTTOM_RIGHT, replicated to all four words */
+   4 * (TILE_BOTTOM_RIGHT) + 0, 4 * (TILE_BOTTOM_RIGHT) + 1,
+   4 * (TILE_BOTTOM_RIGHT) + 2, 4 * (TILE_BOTTOM_RIGHT) + 3,
+   4 * (TILE_BOTTOM_RIGHT) + 0, 4 * (TILE_BOTTOM_RIGHT) + 1,
+   4 * (TILE_BOTTOM_RIGHT) + 2, 4 * (TILE_BOTTOM_RIGHT) + 3,
+   4 * (TILE_BOTTOM_RIGHT) + 0, 4 * (TILE_BOTTOM_RIGHT) + 1,
+   4 * (TILE_BOTTOM_RIGHT) + 2, 4 * (TILE_BOTTOM_RIGHT) + 3,
+   4 * (TILE_BOTTOM_RIGHT) + 0, 4 * (TILE_BOTTOM_RIGHT) + 1,
+   4 * (TILE_BOTTOM_RIGHT) + 2, 4 * (TILE_BOTTOM_RIGHT) + 3,
+};
+
+static const qword bl_shuf = { /* shufb control: byte offsets (4 * lane + k) of float lane TILE_BOTTOM_LEFT, replicated to all four words */
+   4 * (TILE_BOTTOM_LEFT) + 0, 4 * (TILE_BOTTOM_LEFT) + 1,
+   4 * (TILE_BOTTOM_LEFT) + 2, 4 * (TILE_BOTTOM_LEFT) + 3,
+   4 * (TILE_BOTTOM_LEFT) + 0, 4 * (TILE_BOTTOM_LEFT) + 1,
+   4 * (TILE_BOTTOM_LEFT) + 2, 4 * (TILE_BOTTOM_LEFT) + 3,
+   4 * (TILE_BOTTOM_LEFT) + 0, 4 * (TILE_BOTTOM_LEFT) + 1,
+   4 * (TILE_BOTTOM_LEFT) + 2, 4 * (TILE_BOTTOM_LEFT) + 3,
+   4 * (TILE_BOTTOM_LEFT) + 0, 4 * (TILE_BOTTOM_LEFT) + 1,
+   4 * (TILE_BOTTOM_LEFT) + 2, 4 * (TILE_BOTTOM_LEFT) + 3,
+};
+
+static const qword tl_shuf = { /* shufb control: byte offsets (4 * lane + k) of float lane TILE_TOP_LEFT, replicated to all four words */
+   4 * (TILE_TOP_LEFT) + 0, 4 * (TILE_TOP_LEFT) + 1,
+   4 * (TILE_TOP_LEFT) + 2, 4 * (TILE_TOP_LEFT) + 3,
+   4 * (TILE_TOP_LEFT) + 0, 4 * (TILE_TOP_LEFT) + 1,
+   4 * (TILE_TOP_LEFT) + 2, 4 * (TILE_TOP_LEFT) + 3,
+   4 * (TILE_TOP_LEFT) + 0, 4 * (TILE_TOP_LEFT) + 1,
+   4 * (TILE_TOP_LEFT) + 2, 4 * (TILE_TOP_LEFT) + 3,
+   4 * (TILE_TOP_LEFT) + 0, 4 * (TILE_TOP_LEFT) + 1,
+   4 * (TILE_TOP_LEFT) + 2, 4 * (TILE_TOP_LEFT) + 3,
+};
+
+static qword
+micro_ddx(qword src)
 {
-   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
-   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
-   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
-   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
-}
+   qword bottom_right = si_shufb(src, src, br_shuf);
+   qword bottom_left = si_shufb(src, src, bl_shuf);
 
-static void
-micro_imax(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
-   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
-   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
-   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
+   return si_fs(bottom_right, bottom_left);
 }
 
-static void
-micro_umax(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static qword
+micro_ddy(qword src)
 {
-   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
-   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
-   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
-   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
-}
+   qword top_left = si_shufb(src, src, tl_shuf);
+   qword bottom_left = si_shufb(src, src, bl_shuf);
 
-static void
-micro_min(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
-   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
-   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
-   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
+   return si_fs(top_left, bottom_left);
 }
 
-static void
-micro_imin(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static INLINE qword
+micro_div(qword src0, qword src1)
 {
-   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
-   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
-   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
-   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
+   return (qword) _divf4((vec_float4) src0, (vec_float4) src1);
 }
 
-static void
-micro_umin(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static qword
+micro_flr(qword src)
 {
-   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
-   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
-   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
-   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
+   return (qword) _floorf4((vec_float4) src);
 }
 
-static void
-micro_umod(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static qword
+micro_frc(qword src)
 {
-   dst->u[0] = src0->u[0] % src1->u[0];
-   dst->u[1] = src0->u[1] % src1->u[1];
-   dst->u[2] = src0->u[2] % src1->u[2];
-   dst->u[3] = src0->u[3] % src1->u[3];
+   return si_fs(src, (qword) _floorf4((vec_float4) src));
 }
 
-static void
-micro_mul(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst->f[0] = src0->f[0] * src1->f[0];
-   dst->f[1] = src0->f[1] * src1->f[1];
-   dst->f[2] = src0->f[2] * src1->f[2];
-   dst->f[3] = src0->f[3] * src1->f[3];
-}
-
-static void
-micro_imul(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] * src1->i[0];
-   dst->i[1] = src0->i[1] * src1->i[1];
-   dst->i[2] = src0->i[2] * src1->i[2];
-   dst->i[3] = src0->i[3] * src1->i[3];
-}
-
-static void
-micro_imul64(
-   union spu_exec_channel *dst0,
-   union spu_exec_channel *dst1,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst1->i[0] = src0->i[0] * src1->i[0];
-   dst1->i[1] = src0->i[1] * src1->i[1];
-   dst1->i[2] = src0->i[2] * src1->i[2];
-   dst1->i[3] = src0->i[3] * src1->i[3];
-   dst0->i[0] = 0;
-   dst0->i[1] = 0;
-   dst0->i[2] = 0;
-   dst0->i[3] = 0;
-}
-
-static void
-micro_umul64(
-   union spu_exec_channel *dst0,
-   union spu_exec_channel *dst1,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst1->u[0] = src0->u[0] * src1->u[0];
-   dst1->u[1] = src0->u[1] * src1->u[1];
-   dst1->u[2] = src0->u[2] * src1->u[2];
-   dst1->u[3] = src0->u[3] * src1->u[3];
-   dst0->u[0] = 0;
-   dst0->u[1] = 0;
-   dst0->u[2] = 0;
-   dst0->u[3] = 0;
-}
-
-static void
-micro_movc(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1,
-   const union spu_exec_channel *src2 )
-{
-   dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
-   dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
-   dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
-   dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
-}
-
-static void
-micro_neg(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
+static INLINE qword
+micro_ge(qword src0, qword src1)
 {
-   dst->f[0] = -src->f[0];
-   dst->f[1] = -src->f[1];
-   dst->f[2] = -src->f[2];
-   dst->f[3] = -src->f[3];
+   return si_or(si_fceq(src0, src1), si_fcgt(src0, src1));
 }
 
-static void
-micro_ineg(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
+static qword
+micro_lg2(qword src)
 {
-   dst->i[0] = -src->i[0];
-   dst->i[1] = -src->i[1];
-   dst->i[2] = -src->i[2];
-   dst->i[3] = -src->i[3];
+   return (qword) _log2f4((vec_float4) src);
 }
 
-static void
-micro_not(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
+static INLINE qword
+micro_lt(qword src0, qword src1) /* per-lane mask: all ones where src0 < src1, else zero */
 {
-   dst->u[0] = ~src->u[0];
-   dst->u[1] = ~src->u[1];
-   dst->u[2] = ~src->u[2];
-   dst->u[3] = ~src->u[3];
-}
+   const qword ge = si_or(si_fceq(src0, src1), si_fcgt(src0, src1));
 
-static void
-micro_or(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] | src1->u[0];
-   dst->u[1] = src0->u[1] | src1->u[1];
-   dst->u[2] = src0->u[2] | src1->u[2];
-   dst->u[3] = src0->u[3] | src1->u[3];
+   return si_xori(ge, -1); /* full NOT: the xori immediate is sign-extended, so 0xff would flip only the low byte */
 }
 
-static void
-micro_pow(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static INLINE qword
+micro_max(qword src0, qword src1)
 {
-   vec_float4 s0 = (vec_float4) {
-      src0->f[0], src0->f[1], src0->f[2], src0->f[3]
-   };
-   vec_float4 s1 = (vec_float4) {
-      src1->f[0], src1->f[1], src1->f[2], src1->f[3]
-   };
-   vec_float4 d = _powf4(s0, s1);
-
-   dst->f[0] = spu_extract(d, 0);
-   dst->f[1] = spu_extract(d, 1);
-   dst->f[2] = spu_extract(d, 2);
-   dst->f[3] = spu_extract(d, 3);
+   return si_selb(src1, src0, si_fcgt(src0, src1));
 }
 
-static void
-micro_rnd(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
+static INLINE qword
+micro_min(qword src0, qword src1)
 {
-   ASSERT(0);
-#if 0
-   dst->f[0] = (float) floor( (double) (src->f[0] + 0.5f) );
-   dst->f[1] = (float) floor( (double) (src->f[1] + 0.5f) );
-   dst->f[2] = (float) floor( (double) (src->f[2] + 0.5f) );
-   dst->f[3] = (float) floor( (double) (src->f[3] + 0.5f) );
-#endif
+   return si_selb(src0, src1, si_fcgt(src0, src1));
 }
 
-static void
-micro_shl(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static qword
+micro_neg(qword src)
 {
-   dst->i[0] = src0->i[0] << src1->i[0];
-   dst->i[1] = src0->i[1] << src1->i[1];
-   dst->i[2] = src0->i[2] << src1->i[2];
-   dst->i[3] = src0->i[3] << src1->i[3];
+   return si_xor(src, (qword) spu_splats(0x80000000));
 }
 
-static void
-micro_ishr(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static qword
+micro_set_sign(qword src)
 {
-   dst->i[0] = src0->i[0] >> src1->i[0];
-   dst->i[1] = src0->i[1] >> src1->i[1];
-   dst->i[2] = src0->i[2] >> src1->i[2];
-   dst->i[3] = src0->i[3] >> src1->i[3];
+   return si_or(src, (qword) spu_splats(0x80000000));
 }
 
-static void
-micro_trunc(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0 )
+static qword
+micro_pow(qword src0, qword src1)
 {
-   dst->f[0] = (float) (int) src0->f[0];
-   dst->f[1] = (float) (int) src0->f[1];
-   dst->f[2] = (float) (int) src0->f[2];
-   dst->f[3] = (float) (int) src0->f[3];
+   return (qword) _powf4((vec_float4) src0, (vec_float4) src1);
 }
 
-static void
-micro_ushr(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static qword
+micro_rnd(qword src)
 {
-   dst->u[0] = src0->u[0] >> src1->u[0];
-   dst->u[1] = src0->u[1] >> src1->u[1];
-   dst->u[2] = src0->u[2] >> src1->u[2];
-   dst->u[3] = src0->u[3] >> src1->u[3];
-}
+   const qword half = (qword) spu_splats(0.5f);
 
-static void
-micro_sin(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
-{
-   ASSERT(0);
-#if 0
-   dst->f[0] = (float) sin( (double) src->f[0] );
-   dst->f[1] = (float) sin( (double) src->f[1] );
-   dst->f[2] = (float) sin( (double) src->f[2] );
-   dst->f[3] = (float) sin( (double) src->f[3] );
-#endif
+   /* May be able to use _roundf4.  There may be some difference, though.
+    */
+   return (qword) _floorf4((vec_float4) si_fa(src, half));
 }
 
-static void
-micro_sqrt( union spu_exec_channel *dst,
-            const union spu_exec_channel *src )
+static INLINE qword
+micro_ishr(qword src0, qword src1)
 {
-   vec_float4 s = (vec_float4) {
-      src->f[0], src->f[1], src->f[2], src->f[3]
-   };
-   vec_float4 d = _sqrtf4(s);
-
-   dst->f[0] = spu_extract(d, 0);
-   dst->f[1] = spu_extract(d, 1);
-   dst->f[2] = spu_extract(d, 2);
-   dst->f[3] = spu_extract(d, 3);
+   return si_rotma(src0, si_sfi(src1, 0)); /* rotma shifts right (arithmetic) by the negated count, hence 0 - src1 */
 }
 
-static void
-micro_sub(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static qword
+micro_trunc(qword src)
 {
-   dst->f[0] = src0->f[0] - src1->f[0];
-   dst->f[1] = src0->f[1] - src1->f[1];
-   dst->f[2] = src0->f[2] - src1->f[2];
-   dst->f[3] = src0->f[3] - src1->f[3];
+   return (qword) _truncf4((vec_float4) src);
 }
 
-static void
-micro_u2f(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src )
+static qword
+micro_sin(qword src)
 {
-   dst->f[0] = (float) src->u[0];
-   dst->f[1] = (float) src->u[1];
-   dst->f[2] = (float) src->u[2];
-   dst->f[3] = (float) src->u[3];
+   return (qword) _sinf4((vec_float4) src);
 }
 
-static void
-micro_xor(
-   union spu_exec_channel *dst,
-   const union spu_exec_channel *src0,
-   const union spu_exec_channel *src1 )
+static INLINE qword
+micro_sqrt(qword src)
 {
-   dst->u[0] = src0->u[0] ^ src1->u[0];
-   dst->u[1] = src0->u[1] ^ src1->u[1];
-   dst->u[2] = src0->u[2] ^ src1->u[2];
-   dst->u[3] = src0->u[3] ^ src1->u[3];
+   return (qword) _sqrtf4((vec_float4) src);
 }
 
 static void
@@ -983,16 +522,15 @@ fetch_source(
 
    switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
    case TGSI_UTIL_SIGN_CLEAR:
-      micro_abs( chan, chan );
+      chan->q = micro_abs(chan->q);
       break;
 
    case TGSI_UTIL_SIGN_SET:
-      micro_abs( chan, chan );
-      micro_neg( chan, chan );
+      chan->q = micro_set_sign(chan->q);
       break;
 
    case TGSI_UTIL_SIGN_TOGGLE:
-      micro_neg( chan, chan );
+      chan->q = micro_neg(chan->q);
       break;
 
    case TGSI_UTIL_SIGN_KEEP:
@@ -1000,7 +538,7 @@ fetch_source(
    }
 
    if (reg->SrcRegisterExtMod.Complement) {
-      micro_sub( chan, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], chan );
+      chan->q = si_fs(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, chan->q);
    }
 }
 
@@ -1051,8 +589,8 @@ store_dest(
 
    case TGSI_SAT_ZERO_ONE:
       /* XXX need to obey ExecMask here */
-      micro_max(dst, chan, &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
-      micro_min(dst, dst, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
+      dst->q = micro_max(chan->q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
+      dst->q = micro_min(dst->q, mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q);
       break;
 
    case TGSI_SAT_MINUS_PLUS_ONE:
@@ -1162,7 +700,7 @@ exec_tex(struct spu_exec_machine *mach,
       switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
       case TGSI_EXTSWIZZLE_W:
          FETCH(&r[1], 0, CHAN_W);
-         micro_div( &r[0], &r[0], &r[1] );
+         r[0].q = micro_div(r[0].q, r[1].q);
          break;
 
       case TGSI_EXTSWIZZLE_ONE:
@@ -1194,9 +732,9 @@ exec_tex(struct spu_exec_machine *mach,
       switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
       case TGSI_EXTSWIZZLE_W:
          FETCH(&r[3], 0, CHAN_W);
-         micro_div( &r[0], &r[0], &r[3] );
-         micro_div( &r[1], &r[1], &r[3] );
-         micro_div( &r[2], &r[2], &r[3] );
+         r[0].q = micro_div(r[0].q, r[3].q);
+         r[1].q = micro_div(r[1].q, r[3].q);
+         r[2].q = micro_div(r[2].q, r[3].q);
          break;
 
       case TGSI_EXTSWIZZLE_ONE:
@@ -1228,9 +766,9 @@ exec_tex(struct spu_exec_machine *mach,
       switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
       case TGSI_EXTSWIZZLE_W:
          FETCH(&r[3], 0, CHAN_W);
-         micro_div( &r[0], &r[0], &r[3] );
-         micro_div( &r[1], &r[1], &r[3] );
-         micro_div( &r[2], &r[2], &r[3] );
+         r[0].q = micro_div(r[0].q, r[3].q);
+         r[1].q = micro_div(r[1].q, r[3].q);
+         r[2].q = micro_div(r[2].q, r[3].q);
          break;
 
       case TGSI_EXTSWIZZLE_ONE:
@@ -1389,7 +927,7 @@ exec_instruction(
    case TGSI_OPCODE_ARL:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
 	 FETCH( &r[0], 0, chan_index );
-	 micro_f2it( &r[0], &r[0] );
+         r[0].q = si_cflts(r[0].q, 0);
 	 STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1409,22 +947,27 @@ exec_instruction(
 
       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
 	 FETCH( &r[0], 0, CHAN_X );
-	 if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-	    micro_max( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+         if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+            r[0].q = micro_max(r[0].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
 	    STORE( &r[0], 0, CHAN_Y );
 	 }
 
-	 if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-	    FETCH( &r[1], 0, CHAN_Y );
-	    micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
-
-	    FETCH( &r[2], 0, CHAN_W );
-	    micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
-	    micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
-	    micro_pow( &r[1], &r[1], &r[2] );
-	    micro_lt( &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
-	    STORE( &r[0], 0, CHAN_Z );
-	 }
+         if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+            FETCH( &r[1], 0, CHAN_Y );
+            r[1].q = micro_max(r[1].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
+
+            FETCH( &r[2], 0, CHAN_W );
+            r[2].q = micro_min(r[2].q, mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].q);
+            r[2].q = micro_max(r[2].q, mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].q);
+            r[1].q = micro_pow(r[1].q, r[2].q);
+
+            /* r0 = (r0 > 0.0) ? r1 : 0.0
+             */
+            r[0].q = si_fcgt(r[0].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
+            r[0].q = si_selb(mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q, r[1].q,
+                             r[0].q);
+            STORE( &r[0], 0, CHAN_Z );
+         }
       }
 
       if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
@@ -1435,7 +978,7 @@ exec_instruction(
    case TGSI_OPCODE_RCP:
    /* TGSI_OPCODE_RECIP */
       FETCH( &r[0], 0, CHAN_X );
-      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
+      r[0].q = micro_div(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q);
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
 	 STORE( &r[0], 0, chan_index );
       }
@@ -1444,8 +987,8 @@ exec_instruction(
    case TGSI_OPCODE_RSQ:
    /* TGSI_OPCODE_RECIPSQRT */
       FETCH( &r[0], 0, CHAN_X );
-      micro_sqrt( &r[0], &r[0] );
-      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
+      r[0].q = micro_sqrt(r[0].q);
+      r[0].q = micro_div(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q);
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
 	 STORE( &r[0], 0, chan_index );
       }
@@ -1465,7 +1008,7 @@ exec_instruction(
          FETCH(&r[0], 0, chan_index);
          FETCH(&r[1], 1, chan_index);
 
-         micro_mul( &r[0], &r[0], &r[1] );
+         r[0].q = si_fm(r[0].q, r[1].q);
 
          STORE(&r[0], 0, chan_index);
       }
@@ -1475,7 +1018,7 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_add( &r[0], &r[0], &r[1] );
+         r[0].q = si_fa(r[0].q, r[1].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1484,17 +1027,16 @@ exec_instruction(
    /* TGSI_OPCODE_DOT3 */
       FETCH( &r[0], 0, CHAN_X );
       FETCH( &r[1], 1, CHAN_X );
-      micro_mul( &r[0], &r[0], &r[1] );
+      r[0].q = si_fm(r[0].q, r[1].q);
 
       FETCH( &r[1], 0, CHAN_Y );
       FETCH( &r[2], 1, CHAN_Y );
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
 
       FETCH( &r[1], 0, CHAN_Z );
       FETCH( &r[2], 1, CHAN_Z );
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
 
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( &r[0], 0, chan_index );
@@ -1506,25 +1048,22 @@ exec_instruction(
        FETCH(&r[0], 0, CHAN_X);
        FETCH(&r[1], 1, CHAN_X);
 
-       micro_mul( &r[0], &r[0], &r[1] );
+      r[0].q = si_fm(r[0].q, r[1].q);
 
        FETCH(&r[1], 0, CHAN_Y);
        FETCH(&r[2], 1, CHAN_Y);
 
-       micro_mul( &r[1], &r[1], &r[2] );
-       micro_add( &r[0], &r[0], &r[1] );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
 
        FETCH(&r[1], 0, CHAN_Z);
        FETCH(&r[2], 1, CHAN_Z);
 
-       micro_mul( &r[1], &r[1], &r[2] );
-       micro_add( &r[0], &r[0], &r[1] );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
 
        FETCH(&r[1], 0, CHAN_W);
        FETCH(&r[2], 1, CHAN_W);
 
-       micro_mul( &r[1], &r[1], &r[2] );
-       micro_add( &r[0], &r[0], &r[1] );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
 
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
 	 STORE( &r[0], 0, chan_index );
@@ -1539,7 +1078,7 @@ exec_instruction(
       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
 	 FETCH( &r[0], 0, CHAN_Y );
 	 FETCH( &r[1], 1, CHAN_Y);
-	 micro_mul( &r[0], &r[0], &r[1] );
+      r[0].q = si_fm(r[0].q, r[1].q);
 	 STORE( &r[0], 0, CHAN_Y );
       }
 
@@ -1559,8 +1098,7 @@ exec_instruction(
          FETCH(&r[0], 0, chan_index);
          FETCH(&r[1], 1, chan_index);
 
-         /* XXX use micro_min()?? */
-         micro_lt( &r[0], &r[0], &r[1], &r[0], &r[1] );
+         r[0].q = micro_min(r[0].q, r[1].q);
 
          STORE(&r[0], 0, chan_index);
       }
@@ -1571,8 +1109,7 @@ exec_instruction(
          FETCH(&r[0], 0, chan_index);
          FETCH(&r[1], 1, chan_index);
 
-         /* XXX use micro_max()?? */
-         micro_lt( &r[0], &r[0], &r[1], &r[1], &r[0] );
+         r[0].q = micro_max(r[0].q, r[1].q);
 
          STORE(&r[0], 0, chan_index );
       }
@@ -1583,7 +1120,10 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+         /* r0 = (r0 < r1) ? 1.0 : 0.0 -- fcgt gives a lane mask, selb turns it into the float result */
+         r[0].q = si_fcgt(r[1].q, r[0].q);
+         r[0].q = si_selb(mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q,
+                          mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1593,7 +1133,7 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_ge( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+         r[0].q = si_selb(mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q, mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, micro_ge(r[0].q, r[1].q)); /* (r0 >= r1) ? 1.0 : 0.0, not the raw mask */
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1603,9 +1143,8 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_mul( &r[0], &r[0], &r[1] );
-         FETCH( &r[1], 2, chan_index );
-         micro_add( &r[0], &r[0], &r[1] );
+         FETCH( &r[2], 2, chan_index );
+         r[0].q = si_fma(r[0].q, r[1].q, r[2].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1615,7 +1154,7 @@ exec_instruction(
          FETCH(&r[0], 0, chan_index);
          FETCH(&r[1], 1, chan_index);
 
-         micro_sub( &r[0], &r[0], &r[1] );
+         r[0].q = si_fs(r[0].q, r[1].q);
 
          STORE(&r[0], 0, chan_index);
       }
@@ -1628,9 +1167,8 @@ exec_instruction(
          FETCH(&r[1], 1, chan_index);
          FETCH(&r[2], 2, chan_index);
 
-         micro_sub( &r[1], &r[1], &r[2] );
-         micro_mul( &r[0], &r[0], &r[1] );
-         micro_add( &r[0], &r[0], &r[2] );
+         r[1].q = si_fs(r[1].q, r[2].q);
+         r[0].q = si_fma(r[0].q, r[1].q, r[2].q);
 
          STORE(&r[0], 0, chan_index);
       }
@@ -1661,7 +1199,7 @@ exec_instruction(
    /* TGSI_OPCODE_FRC */
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
-         micro_frc( &r[0], &r[0] );
+         r[0].q = micro_frc(r[0].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1674,7 +1212,7 @@ exec_instruction(
    /* TGSI_OPCODE_FLR */
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
-         micro_flr( &r[0], &r[0] );
+         r[0].q = micro_flr(r[0].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1682,7 +1220,7 @@ exec_instruction(
    case TGSI_OPCODE_ROUND:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
-         micro_rnd( &r[0], &r[0] );
+         r[0].q = micro_rnd(r[0].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1691,7 +1229,7 @@ exec_instruction(
     /* TGSI_OPCODE_EX2 */
       FETCH(&r[0], 0, CHAN_X);
 
-      micro_pow( &r[0], &mach->Temps[TEMP_2_I].xyzw[TEMP_2_C], &r[0] );
+      r[0].q = micro_pow(mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].q, r[0].q);
 
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
 	 STORE( &r[0], 0, chan_index );
@@ -1701,7 +1239,7 @@ exec_instruction(
    case TGSI_OPCODE_LOGBASE2:
    /* TGSI_OPCODE_LG2 */
       FETCH( &r[0], 0, CHAN_X );
-      micro_lg2( &r[0], &r[0] );
+      r[0].q = micro_lg2(r[0].q);
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( &r[0], 0, chan_index );
       }
@@ -1712,7 +1250,7 @@ exec_instruction(
       FETCH(&r[0], 0, CHAN_X);
       FETCH(&r[1], 1, CHAN_X);
 
-      micro_pow( &r[0], &r[0], &r[1] );
+      r[0].q = micro_pow(r[0].q, r[1].q);
 
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
 	 STORE( &r[0], 0, chan_index );
@@ -1723,35 +1261,34 @@ exec_instruction(
       /* TGSI_OPCODE_XPD */
       FETCH(&r[0], 0, CHAN_Y);
       FETCH(&r[1], 1, CHAN_Z);
-
-      micro_mul( &r[2], &r[0], &r[1] );
-
       FETCH(&r[3], 0, CHAN_Z);
       FETCH(&r[4], 1, CHAN_Y);
 
-      micro_mul( &r[5], &r[3], &r[4] );
-      micro_sub( &r[2], &r[2], &r[5] );
+      /* r2 = (r0 * r1) - (r3 * r4)
+       * XXX the si_fm() below reads r[5], which is not fetched yet. */
+      r[2].q = si_fm(r[3].q, r[5].q);
+      r[2].q = si_fms(r[0].q, r[1].q, r[2].q);
 
       if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
          STORE( &r[2], 0, CHAN_X );
       }
 
       FETCH(&r[2], 1, CHAN_X);
-
-      micro_mul( &r[3], &r[3], &r[2] );
-
       FETCH(&r[5], 0, CHAN_X);
 
-      micro_mul( &r[1], &r[1], &r[5] );
-      micro_sub( &r[3], &r[3], &r[1] );
+      /* r3 = (r3 * r2) - (r1 * r5)
+       */
+      r[1].q = si_fm(r[1].q, r[5].q);
+      r[3].q = si_fms(r[3].q, r[2].q, r[1].q);
 
       if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
          STORE( &r[3], 0, CHAN_Y );
       }
 
-      micro_mul( &r[5], &r[5], &r[4] );
-      micro_mul( &r[0], &r[0], &r[2] );
-      micro_sub( &r[5], &r[5], &r[0] );
+      /* r5 = (r5 * r4) - (r0 * r2)
+       */
+      r[0].q = si_fm(r[0].q, r[2].q);
+      r[5].q = si_fms(r[5].q, r[4].q, r[0].q);
 
       if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
          STORE( &r[5], 0, CHAN_Z );
@@ -1770,7 +1307,7 @@ exec_instruction(
        FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
           FETCH(&r[0], 0, chan_index);
 
-          micro_abs( &r[0], &r[0] );
+          r[0].q = micro_abs(r[0].q);
 
           STORE(&r[0], 0, chan_index);
        }
@@ -1784,23 +1321,21 @@ exec_instruction(
       FETCH(&r[0], 0, CHAN_X);
       FETCH(&r[1], 1, CHAN_X);
 
-      micro_mul( &r[0], &r[0], &r[1] );
+      r[0].q = si_fm(r[0].q, r[1].q);
 
       FETCH(&r[1], 0, CHAN_Y);
       FETCH(&r[2], 1, CHAN_Y);
 
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
 
       FETCH(&r[1], 0, CHAN_Z);
       FETCH(&r[2], 1, CHAN_Z);
 
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
 
       FETCH(&r[1], 1, CHAN_W);
 
-      micro_add( &r[0], &r[0], &r[1] );
+      r[0].q = si_fa(r[0].q, r[1].q);
 
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
 	 STORE( &r[0], 0, chan_index );
@@ -1810,7 +1345,7 @@ exec_instruction(
    case TGSI_OPCODE_COS:
       FETCH(&r[0], 0, CHAN_X);
 
-      micro_cos( &r[0], &r[0] );
+      r[0].q = micro_cos(r[0].q);
 
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
 	 STORE( &r[0], 0, chan_index );
@@ -1820,7 +1355,7 @@ exec_instruction(
    case TGSI_OPCODE_DDX:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
-         micro_ddx( &r[0], &r[0] );
+         r[0].q = micro_ddx(r[0].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1828,7 +1363,7 @@ exec_instruction(
    case TGSI_OPCODE_DDY:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
-         micro_ddy( &r[0], &r[0] );
+         r[0].q = micro_ddy(r[0].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1866,9 +1401,9 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_eq( &r[0], &r[0], &r[1],
-                   &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C],
-                   &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+
+         r[0].q = si_fceq(r[0].q, r[1].q);
+
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1881,14 +1416,14 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
+         r[0].q = si_fcgt(r[0].q, r[1].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
 
    case TGSI_OPCODE_SIN:
       FETCH( &r[0], 0, CHAN_X );
-      micro_sin( &r[0], &r[0] );
+      r[0].q = micro_sin(r[0].q);
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( &r[0], 0, chan_index );
       }
@@ -1898,7 +1433,10 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_ge( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
+
+         r[0].q = si_fcgt(r[0].q, r[1].q);
+         r[0].q = si_xori(r[0].q, 0xff);
+
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -1907,7 +1445,10 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_eq( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
+
+         r[0].q = si_fceq(r[0].q, r[1].q);
+         r[0].q = si_xori(r[0].q, 0xff);
+
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -2038,7 +1579,11 @@ exec_instruction(
          FETCH(&r[1], 1, chan_index);
          FETCH(&r[2], 2, chan_index);
 
-         micro_lt( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2] );
+         /* r0 = (r0 < 0.0) ? r1 : r2
+          */
+         r[3].q = si_xor(r[3].q, r[3].q);
+         r[0].q = micro_lt(r[0].q, r[3].q);
+         r[0].q = si_selb(r[1].q, r[2].q, r[0].q);
 
          STORE(&r[0], 0, chan_index);
       }
@@ -2049,11 +1594,11 @@ exec_instruction(
          FETCH( &r[0], 0, CHAN_X );
       }
       if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
-         micro_cos( &r[1], &r[0] );
+         r[1].q = micro_cos(r[0].q);
          STORE( &r[1], 0, CHAN_X );
       }
       if( IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
-         micro_sin( &r[1], &r[0] );
+         r[1].q = micro_sin(r[0].q);
          STORE( &r[1], 0, CHAN_Y );
       }
       if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
@@ -2075,12 +1620,11 @@ exec_instruction(
    case TGSI_OPCODE_DP2:
       FETCH( &r[0], 0, CHAN_X );
       FETCH( &r[1], 1, CHAN_X );
-      micro_mul( &r[0], &r[0], &r[1] );
+      r[0].q = si_fm(r[0].q, r[1].q);
 
       FETCH( &r[1], 0, CHAN_Y );
       FETCH( &r[2], 1, CHAN_Y );
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
 
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( &r[0], 0, chan_index );
@@ -2152,7 +1696,7 @@ exec_instruction(
    case TGSI_OPCODE_CEIL:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
-         micro_ceil( &r[0], &r[0] );
+         r[0].q = micro_ceil(r[0].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -2160,7 +1704,7 @@ exec_instruction(
    case TGSI_OPCODE_I2F:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
-         micro_i2f( &r[0], &r[0] );
+         r[0].q = si_csflt(r[0].q, 0);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -2168,7 +1712,7 @@ exec_instruction(
    case TGSI_OPCODE_NOT:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
-         micro_not( &r[0], &r[0] );
+         r[0].q = si_xorbi(r[0].q, 0xff);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -2176,7 +1720,7 @@ exec_instruction(
    case TGSI_OPCODE_TRUNC:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
-         micro_trunc( &r[0], &r[0] );
+         r[0].q = micro_trunc(r[0].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -2185,7 +1729,9 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_shl( &r[0], &r[0], &r[1] );
+
+         r[0].q = si_shl(r[0].q, r[1].q);
+
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -2194,7 +1740,7 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_ishr( &r[0], &r[0], &r[1] );
+         r[0].q = micro_ishr(r[0].q, r[1].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -2203,7 +1749,7 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_and( &r[0], &r[0], &r[1] );
+         r[0].q = si_and(r[0].q, r[1].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -2212,7 +1758,7 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_or( &r[0], &r[0], &r[1] );
+         r[0].q = si_or(r[0].q, r[1].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
@@ -2225,7 +1771,7 @@ exec_instruction(
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
-         micro_xor( &r[0], &r[0], &r[1] );
+         r[0].q = si_xor(r[0].q, r[1].q);
          STORE( &r[0], 0, chan_index );
       }
       break;
diff --git a/src/mesa/pipe/cell/spu/spu_exec.h b/src/mesa/pipe/cell/spu/spu_exec.h
index 89e422ba48..b4c7661ef6 100644
--- a/src/mesa/pipe/cell/spu/spu_exec.h
+++ b/src/mesa/pipe/cell/spu/spu_exec.h
@@ -43,6 +43,7 @@ union spu_exec_channel
    float    f[QUAD_SIZE];
    int      i[QUAD_SIZE];
    unsigned u[QUAD_SIZE];
+   qword    q;
 };
 
 /**
-- 
cgit v1.2.3


From 490a7b1c73babd528b6d883471a8636157c5853a Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Fri, 1 Feb 2008 17:12:20 -0800
Subject: Vectorize vertex puller

---
 src/mesa/pipe/cell/spu/spu_vertex_fetch.c  | 186 +++++++++--------------------
 src/mesa/pipe/cell/spu/spu_vertex_shader.h |   4 +-
 2 files changed, 61 insertions(+), 129 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
index 5b0f2a6470..4133fbba17 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -42,6 +42,8 @@
 #define DRAW_DBG 0
 
 
+static const vec_float4 defaults = { 0.0, 0.0, 0.0, 1.0 };
+
 /**
  * Fetch a float[4] vertex attribute from memory, doing format/type
  * conversion as needed.
@@ -50,19 +52,16 @@
  * conversion, texture sampling etc.
  */
 #define FETCH_ATTRIB( NAME, SZ, CVT )			\
-static void						\
-fetch_##NAME(const void *ptr, float *attrib)		\
+static qword						\
+fetch_##NAME(const void *ptr)				\
 {							\
-   static const float defaults[4] = { 0,0,0,1 };	\
+   vec_float4 attrib = defaults;			\
    int i;						\
 							\
    for (i = 0; i < SZ; i++) {				\
-      attrib[i] = CVT;					\
-   }							\
-							\
-   for (; i < 4; i++) {					\
-      attrib[i] = defaults[i];				\
+      attrib = spu_insert(CVT, attrib, i);		\
    }							\
+   return (qword) attrib;				\
 }
 
 #define CVT_64_FLOAT   (float) ((double *) ptr)[i]
@@ -309,106 +308,59 @@ static spu_fetch_func get_fetch_func( enum pipe_format format )
 }
 
 
-static void 
-transpose_4x4( float *out, const float *in )
-{
-   /* This can be achieved in 12 sse instructions, plus the final
-    * stores I guess.  This is probably a bit more than that - maybe
-    * 32 or so?
-    */
-   out[0] = in[0];  out[1] = in[4];  out[2] = in[8];   out[3] = in[12];
-   out[4] = in[1];  out[5] = in[5];  out[6] = in[9];   out[7] = in[13];
-   out[8] = in[2];  out[9] = in[6];  out[10] = in[10]; out[11] = in[14];
-   out[12] = in[3]; out[13] = in[7]; out[14] = in[11]; out[15] = in[15];
-}
-
-
-
-static void fetch_xyz_rgb( struct spu_vs_context *draw,
-			   struct spu_exec_machine *machine,
-			   const unsigned *elts,
-			   unsigned count )
+void
+spu_transpose_4x4(qword *out, const qword *in)
 {
-   assert(count <= 4);
-
-//   _mesa_printf("%s\n", __FUNCTION__);
-
-   /* loop over vertex attributes (vertex shader inputs)
-    */
-
-   const unsigned *pitch   = draw->vertex_fetch.pitch;
-   const ubyte **src       = draw->vertex_fetch.src_ptr;
-   int i;
-
-   for (i = 0; i < 4; i++) {
+   static const qword masks[8] = {
       {
-	 const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
-	 float *out = &machine->Inputs[0].xyzw[0].f[i];
-	 out[0] = in[0];
-	 out[4] = in[1];
-	 out[8] = in[2];
- 	 out[12] = 1.0f;
-      }
-
+         0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+      },
       {
-	 const float *in = (const float *)(src[1] + elts[i] * pitch[1]);
-	 float *out = &machine->Inputs[1].xyzw[0].f[i];
-	 out[0] = in[0];
-	 out[4] = in[1];
-	 out[8] = in[2];
- 	 out[12] = 1.0f;
-      }
-   }
-}
-
-
-
-
-static void fetch_xyz_rgb_st( struct spu_vs_context *draw,
-			      struct spu_exec_machine *machine,
-			      const unsigned *elts,
-			      unsigned count )
-{
-   assert(count <= 4);
-
-   /* loop over vertex attributes (vertex shader inputs)
-    */
-
-   const unsigned *pitch   = draw->vertex_fetch.pitch;
-   const ubyte **src       = draw->vertex_fetch.src_ptr;
-   int i;
-
-   for (i = 0; i < 4; i++) {
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+         0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+      },
+
+      { 
+         0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+      },
+      { 
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+         0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
+      },
+
+      { 
+         0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b,
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+      },
+      { 
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+         0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b,
+      },
+
+      { 
+         0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f,
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+      },
       {
-	 const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
-	 float *out = &machine->Inputs[0].xyzw[0].f[i];
-	 out[0] = in[0];
-	 out[4] = in[1];
-	 out[8] = in[2];
- 	 out[12] = 1.0f;
-      }
+         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+         0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f,
+      },
+   };
 
-      {
-	 const float *in = (const float *)(src[1] + elts[i] * pitch[1]);
-	 float *out = &machine->Inputs[1].xyzw[0].f[i];
-	 out[0] = in[0];
-	 out[4] = in[1];
-	 out[8] = in[2];
- 	 out[12] = 1.0f;
-      }
+   out[0] = si_shufb(in[0], in[1], masks[0]);
+   out[0] = si_or(out[0], si_shufb(in[2], in[3], masks[1]));
 
-      {
-	 const float *in = (const float *)(src[2] + elts[i] * pitch[2]);
-	 float *out = &machine->Inputs[1].xyzw[0].f[i];
-	 out[0] = in[0];
-	 out[4] = in[1];
-	 out[8] = 0.0f;
- 	 out[12] = 1.0f;
-      }
-   }
-}
+   out[1] = si_shufb(in[0], in[1], masks[2]);
+   out[1] = si_or(out[1], si_shufb(in[2], in[3], masks[3]));
 
+   out[2] = si_shufb(in[0], in[1], masks[4]);
+   out[2] = si_or(out[2], si_shufb(in[2], in[3], masks[5]));
 
+   out[3] = si_shufb(in[0], in[1], masks[6]);
+   out[3] = si_or(out[3], si_shufb(in[2], in[3], masks[7]));
+}
 
 
 /**
@@ -435,7 +387,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
       const uint64_t src = draw->vertex_fetch.src_ptr[attr];
       const spu_fetch_func fetch = draw->vertex_fetch.fetch[attr];
       unsigned i;
-      float p[4][4];
+      qword p[4];
 
 
       /* Fetch four attributes for four vertices.  
@@ -452,17 +404,15 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
          mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0);
          wait_on_mask(1 << TAG_VERTEX_BUFFER);
 
-         memmove(& buffer, buffer + (addr & 0x0f), 16);
-
-         fetch(buffer, p[i]);
+         p[i] = (*fetch)(buffer + (addr & 0x0f));
       }
 
       /* Be nice and zero out any missing vertices: 
        */
       for (/* empty */; i < 4; i++) 
-          p[i][0] = p[i][1] = p[i][2] = p[i][3] = 0;
-      
-      /* Transpose/swizzle into sse-friendly format.  Currently
+          p[i] = si_xor(p[i], p[i]);
+
+      /* Transpose/swizzle into vector-friendly format.  Currently
        * assuming that all vertex shader inputs are float[4], but this
        * isn't true -- if the vertex shader only wants tex0.xy, we
        * could optimize for that.
@@ -471,7 +421,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
        * excessive number of fetch functions, but we could at least
        * minimize the transpose step:
        */
-      transpose_4x4( (float *)&machine->Inputs[attr].xyzw[0].f[0], (float *)p );
+      spu_transpose_4x4(&machine->Inputs[attr].xyzw[0].q, p);
    }
 }
 
@@ -487,24 +437,4 @@ void spu_update_vertex_fetch( struct spu_vs_context *draw )
    }
 
    draw->vertex_fetch.fetch_func = generic_vertex_fetch;
-
-   /* Disable the fast path because they don't use mfc_get yet.
-    */
-#if 0
-   switch (draw->vertex_fetch.nr_attrs) {
-   case 2:
-      if (draw->vertex_fetch.format[0] == PIPE_FORMAT_R32G32B32_FLOAT &&
-          draw->vertex_fetch.format[1] == PIPE_FORMAT_R32G32B32_FLOAT)
-          draw->vertex_fetch.fetch_func = fetch_xyz_rgb;
-      break;
-   case 3:
-      if (draw->vertex_fetch.format[0] == PIPE_FORMAT_R32G32B32_FLOAT &&
-          draw->vertex_fetch.format[1] == PIPE_FORMAT_R32G32B32_FLOAT &&
-          draw->vertex_fetch.format[2] == PIPE_FORMAT_R32G32_FLOAT)
-          draw->vertex_fetch.fetch_func = fetch_xyz_rgb_st;
-      break;
-   default:
-      break;
-   }
-#endif
 }
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.h b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
index b261ab44a2..2435b7ddae 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_shader.h
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
@@ -6,7 +6,7 @@
 
 struct spu_vs_context;
 
-typedef void (*spu_fetch_func)(const void *ptr, float *attrib);
+typedef qword (*spu_fetch_func)(const void *ptr);
 typedef void (*spu_full_fetch_func)( struct spu_vs_context *draw,
 				     struct spu_exec_machine *machine,
 				     const unsigned *elts,
@@ -39,6 +39,8 @@ struct spu_vs_context {
 
 extern void spu_update_vertex_fetch(struct spu_vs_context *draw);
 
+extern void spu_transpose_4x4(qword *out, const qword *in);
+
 static INLINE void spu_vertex_fetch(struct spu_vs_context *draw,
 				    struct spu_exec_machine *machine,
 				    const unsigned *elts,
-- 
cgit v1.2.3


From e8a80c8627972537c595f06fb28cd383569e7ea0 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Fri, 1 Feb 2008 17:14:09 -0800
Subject: More semi-trivial vectorization in the shader VM

---
 src/mesa/pipe/cell/spu/spu_exec.c | 62 +++++++++++++++++----------------------
 1 file changed, 27 insertions(+), 35 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_exec.c b/src/mesa/pipe/cell/spu/spu_exec.c
index 1ac9c031e3..1bd8687d41 100644
--- a/src/mesa/pipe/cell/spu/spu_exec.c
+++ b/src/mesa/pipe/cell/spu/spu_exec.c
@@ -70,6 +70,7 @@
 #include "pipe/tgsi/util/tgsi_util.h"
 #include "spu_exec.h"
 #include "spu_main.h"
+#include "spu_vertex_shader.h"
 
 #define TILE_TOP_LEFT     0
 #define TILE_TOP_RIGHT    1
@@ -144,23 +145,27 @@ spu_exec_machine_init(struct spu_exec_machine *mach,
                       struct spu_sampler *samplers,
                       unsigned processor)
 {
+   qword zero;
+   qword not_zero;
    uint i;
 
    mach->Samplers = samplers;
    mach->Processor = processor;
    mach->Addrs = &mach->Temps[TGSI_EXEC_NUM_TEMPS];
 
+   zero = si_xor(zero, zero);
+   not_zero = si_xori(zero, 0xff);
+
    /* Setup constants. */
-   for( i = 0; i < 4; i++ ) {
-      mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
-      mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
-      mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
-      mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
-      mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
-      mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
-      mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
-      mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
-   }
+   mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q = zero;
+   mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].q = not_zero;
+   mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].q = si_shli(not_zero, -1);
+   mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].q = si_shli(not_zero, 31);
+
+   mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q = (qword) spu_splats(1.0f);
+   mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].q = (qword) spu_splats(2.0f);
+   mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].q = (qword) spu_splats(128.0f);
+   mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].q = (qword) spu_splats(-128.0f);
 }
 
 
@@ -459,25 +464,16 @@ fetch_source(
          &index2,
          &indir_index );
 
-      index.i[0] += indir_index.i[0];
-      index.i[1] += indir_index.i[1];
-      index.i[2] += indir_index.i[2];
-      index.i[3] += indir_index.i[3];
+      index.q = si_a(index.q, indir_index.q);
    }
 
    if( reg->SrcRegister.Dimension ) {
       switch( reg->SrcRegister.File ) {
       case TGSI_FILE_INPUT:
-         index.i[0] *= 17;
-         index.i[1] *= 17;
-         index.i[2] *= 17;
-         index.i[3] *= 17;
+         index.q = si_mpyi(index.q, 17);
          break;
       case TGSI_FILE_CONSTANT:
-         index.i[0] *= 4096;
-         index.i[1] *= 4096;
-         index.i[2] *= 4096;
-         index.i[3] *= 4096;
+         index.q = si_shli(index.q, 12);
          break;
       default:
          assert( 0 );
@@ -505,10 +501,7 @@ fetch_source(
             &index2,
             &indir_index );
 
-         index.i[0] += indir_index.i[0];
-         index.i[1] += indir_index.i[1];
-         index.i[2] += indir_index.i[2];
-         index.i[3] += indir_index.i[3];
+         index.q = si_a(index.q, indir_index.q);
       }
    }
 
@@ -666,17 +659,16 @@ fetch_texel( struct spu_sampler *sampler,
              union spu_exec_channel *b,
              union spu_exec_channel *a )
 {
-   uint j;
-   float rgba[NUM_CHANNELS][QUAD_SIZE];
+   qword rgba[4];
+   qword out[4];
 
-   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, rgba);
+   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, (float *) rgba);
 
-   for (j = 0; j < 4; j++) {
-      r->f[j] = rgba[0][j];
-      g->f[j] = rgba[1][j];
-      b->f[j] = rgba[2][j];
-      a->f[j] = rgba[3][j];
-   }
+   spu_transpose_4x4(out, rgba);
+   r->q = out[0];
+   g->q = out[1];
+   b->q = out[2];
+   a->q = out[3];
 }
 
 
-- 
cgit v1.2.3


From 45f4125fa83c4e43a01d44cb8eb2a4c97b72181f Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Mon, 4 Feb 2008 16:03:55 -0800
Subject: Add some debug messages

---
 src/mesa/pipe/cell/spu/spu_vertex_fetch.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
index 4133fbba17..cfa449e813 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -378,7 +378,10 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
 
    wait_on_mask(1 << TAG_VERTEX_BUFFER);
 
-//   _mesa_printf("%s %d\n", __FUNCTION__, count);
+#if DRAW_DBG
+   printf("SPU: %s count = %u, nr_attrs = %u\n", 
+          __FUNCTION__, count, nr_attrs);
+#endif
 
    /* loop over vertex attributes (vertex shader inputs)
     */
@@ -401,6 +404,9 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
          const uint64_t addr = src + (elts[i] * pitch);
          const unsigned size = ((addr & 0x0f) == 0) ? 16 : 32;
 
+#if DRAW_DBG
+         printf("SPU: fetching = 0x%llx\n", addr);
+#endif
          mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0);
          wait_on_mask(1 << TAG_VERTEX_BUFFER);
 
-- 
cgit v1.2.3


From c9f98142b6a47825c49aea72a79c1be62c2b7d89 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Tue, 5 Feb 2008 09:43:52 -0800
Subject: Use _transpose_matrix4x4 from Cell SDK instead of my own version

---
 src/mesa/pipe/cell/spu/spu_exec.c          |  3 +-
 src/mesa/pipe/cell/spu/spu_vertex_fetch.c  | 59 ++----------------------------
 src/mesa/pipe/cell/spu/spu_vertex_shader.h |  2 -
 3 files changed, 5 insertions(+), 59 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_exec.c b/src/mesa/pipe/cell/spu/spu_exec.c
index 1bd8687d41..e51008b9b3 100644
--- a/src/mesa/pipe/cell/spu/spu_exec.c
+++ b/src/mesa/pipe/cell/spu/spu_exec.c
@@ -52,6 +52,7 @@
 
 #include <libmisc.h>
 #include <spu_mfcio.h>
+#include <transpose_matrix4x4.h>
 #include <simdmath/ceilf4.h>
 #include <simdmath/cosf4.h>
 #include <simdmath/divf4.h>
@@ -664,7 +665,7 @@ fetch_texel( struct spu_sampler *sampler,
 
    sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, (float *) rgba);
 
-   spu_transpose_4x4(out, rgba);
+   _transpose_matrix4x4(out, rgba);
    r->q = out[0];
    g->q = out[1];
    b->q = out[2];
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
index cfa449e813..6e86a919ce 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -31,6 +31,8 @@
   */
 
 #include <spu_mfcio.h>
+#include <transpose_matrix4x4.h>
+
 #include "pipe/p_util.h"
 #include "pipe/p_state.h"
 #include "pipe/p_shader_tokens.h"
@@ -308,61 +310,6 @@ static spu_fetch_func get_fetch_func( enum pipe_format format )
 }
 
 
-void
-spu_transpose_4x4(qword *out, const qword *in)
-{
-   static const qword masks[8] = {
-      {
-         0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-      },
-      {
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-         0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
-      },
-
-      { 
-         0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-      },
-      { 
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-         0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
-      },
-
-      { 
-         0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b,
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-      },
-      { 
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-         0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b,
-      },
-
-      { 
-         0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f,
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-      },
-      {
-         0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-         0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f,
-      },
-   };
-
-   out[0] = si_shufb(in[0], in[1], masks[0]);
-   out[0] = si_or(out[0], si_shufb(in[2], in[3], masks[1]));
-
-   out[1] = si_shufb(in[0], in[1], masks[2]);
-   out[1] = si_or(out[1], si_shufb(in[2], in[3], masks[3]));
-
-   out[2] = si_shufb(in[0], in[1], masks[4]);
-   out[2] = si_or(out[2], si_shufb(in[2], in[3], masks[5]));
-
-   out[3] = si_shufb(in[0], in[1], masks[6]);
-   out[3] = si_or(out[3], si_shufb(in[2], in[3], masks[7]));
-}
-
-
 /**
  * Fetch vertex attributes for 'count' vertices.
  */
@@ -427,7 +374,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
        * excessive number of fetch functions, but we could at least
        * minimize the transpose step:
        */
-      spu_transpose_4x4(&machine->Inputs[attr].xyzw[0].q, p);
+      _transpose_matrix4x4(&machine->Inputs[attr].xyzw[0].q, p);
    }
 }
 
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.h b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
index 2435b7ddae..c96b93ff0a 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_shader.h
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
@@ -39,8 +39,6 @@ struct spu_vs_context {
 
 extern void spu_update_vertex_fetch(struct spu_vs_context *draw);
 
-extern void spu_transpose_4x4(qword *out, const qword *in);
-
 static INLINE void spu_vertex_fetch(struct spu_vs_context *draw,
 				    struct spu_exec_machine *machine,
 				    const unsigned *elts,
-- 
cgit v1.2.3


From b0974420f4dab55d398f4015cf71a62fa643f713 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 5 Feb 2008 14:21:01 -0700
Subject: Cell: added cell_batch_alloc_aligned()

---
 src/mesa/pipe/cell/ppu/cell_batch.c | 26 ++++++++++++++++++++------
 src/mesa/pipe/cell/ppu/cell_batch.h |  4 ++++
 2 files changed, 24 insertions(+), 6 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/ppu/cell_batch.c b/src/mesa/pipe/cell/ppu/cell_batch.c
index 2fb49711b2..f45e5f25b6 100644
--- a/src/mesa/pipe/cell/ppu/cell_batch.c
+++ b/src/mesa/pipe/cell/ppu/cell_batch.c
@@ -157,7 +157,7 @@ cell_batch_append(struct cell_context *cell, const void *data, uint bytes)
       size = 0;
    }
 
-   assert(size + bytes <= CELL_BUFFER_SIZE);
+   ASSERT(size + bytes <= CELL_BUFFER_SIZE);
 
    memcpy(cell->buffer[cell->cur_batch] + size, data, bytes);
 
@@ -167,14 +167,22 @@ cell_batch_append(struct cell_context *cell, const void *data, uint bytes)
 
 void *
 cell_batch_alloc(struct cell_context *cell, uint bytes)
+{
+   return cell_batch_alloc_aligned(cell, bytes, 1);
+}
+
+
+void *
+cell_batch_alloc_aligned(struct cell_context *cell, uint bytes,
+                         uint alignment)
 {
    void *pos;
-   uint size;
+   uint size, padbytes;
 
    ASSERT(bytes % 8 == 0);
    ASSERT(bytes <= CELL_BUFFER_SIZE);
-
-   assert(cell->cur_batch >= 0);
+   ASSERT(alignment > 0);
+   ASSERT(cell->cur_batch >= 0);
 
 #ifdef ASSERT
    {
@@ -188,12 +196,18 @@ cell_batch_alloc(struct cell_context *cell, uint bytes)
 
    size = cell->buffer_size[cell->cur_batch];
 
-   if (size + bytes > CELL_BUFFER_SIZE) {
+   padbytes = (alignment - (size % alignment)) % alignment;
+
+   if (padbytes + size + bytes > CELL_BUFFER_SIZE) {
       cell_batch_flush(cell);
       size = 0;
    }
+   else {
+      size += padbytes;
+   }
 
-   assert(size + bytes <= CELL_BUFFER_SIZE);
+   ASSERT(size % alignment == 0);
+   ASSERT(size + bytes <= CELL_BUFFER_SIZE);
 
    pos = (void *) (cell->buffer[cell->cur_batch] + size);
 
diff --git a/src/mesa/pipe/cell/ppu/cell_batch.h b/src/mesa/pipe/cell/ppu/cell_batch.h
index f4f37314a4..a6eee0a8b1 100644
--- a/src/mesa/pipe/cell/ppu/cell_batch.h
+++ b/src/mesa/pipe/cell/ppu/cell_batch.h
@@ -50,5 +50,9 @@ cell_batch_append(struct cell_context *cell, const void *data, uint bytes);
 extern void *
 cell_batch_alloc(struct cell_context *cell, uint bytes);
 
+extern void *
+cell_batch_alloc_aligned(struct cell_context *cell, uint bytes,
+                         uint alignment);
+
 
 #endif /* CELL_BATCH_H */
-- 
cgit v1.2.3


From 2174890ed030bde8494b7f13b7090e27771695fa Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 5 Feb 2008 14:21:48 -0700
Subject: Cell: remove dummy fields, update/add some comments

---
 src/mesa/pipe/cell/common.h | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index cf8fc94ebf..4de514c358 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -51,7 +51,7 @@
 
 /** for sanity checking */
 #define ASSERT_ALIGN16(ptr) \
-   assert((((unsigned long) (ptr)) & 0xf) == 0);
+  ASSERT((((unsigned long) (ptr)) & 0xf) == 0);
 
 
 /** round up value to next multiple of 4 */
@@ -105,7 +105,7 @@
  */
 struct cell_command_framebuffer
 {
-   uint64_t opcode;
+   uint64_t opcode;     /**< CELL_CMD_FRAMEBUFFER */
    int width, height;
    void *color_start, *depth_start;
    enum pipe_format color_format, depth_format;
@@ -117,7 +117,7 @@ struct cell_command_framebuffer
  */
 struct cell_command_clear_surface
 {
-   uint64_t opcode;
+   uint64_t opcode;     /**< CELL_CMD_CLEAR_SURFACE */
    uint surface; /**< Temporary: 0=color, 1=Z */
    uint value;
 };
@@ -128,8 +128,8 @@ struct cell_command_clear_surface
  */
 struct cell_array_info
 {
-    uint64_t base;          /**< Base address of the 0th element. */
-    uint attr;          /**< Attribute that this state if for. */
+    uint64_t base;      /**< Base address of the 0th element. */
+    uint attr;          /**< Attribute that this state is for. */
     uint pitch;         /**< Byte pitch from one entry to the next. */
     uint format;        /**< Pipe format of each entry. */
 } ALIGN16_ATTRIB;
@@ -169,11 +169,9 @@ struct cell_command_render
    uint prim_type;    /**< PIPE_PRIM_x */
    uint num_verts;
    uint vertex_size;  /**< bytes per vertex */
-   uint dummy;        /* XXX this dummy field works around a compiler bug */
    uint num_indexes;
    uint vertex_buf;  /**< which cell->buffer[] contains the vertex data */
-   float xmin, dummy2, ymin, xmax, ymax;  /* XXX another dummy field */
-   uint dummy3;
+   float xmin, ymin, xmax, ymax;
    uint min_index;
    boolean inline_verts;
 };
-- 
cgit v1.2.3


From 4da82fd5c5e0a7535e30aa81f08dcbe1a26358b7 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 5 Feb 2008 14:23:34 -0700
Subject: Cell: re-enable inlined vertex buffers

Vertex data must be on a 16-byte address/offset so SIMD operations will work
properly in the SPU code.
---
 src/mesa/pipe/cell/ppu/cell_vbuf.c  | 12 +++++-------
 src/mesa/pipe/cell/spu/spu_main.c   |  3 ++-
 src/mesa/pipe/cell/spu/spu_render.c | 12 ++++++++----
 3 files changed, 15 insertions(+), 12 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/ppu/cell_vbuf.c b/src/mesa/pipe/cell/ppu/cell_vbuf.c
index 0fee61821a..e9fafe492e 100644
--- a/src/mesa/pipe/cell/ppu/cell_vbuf.c
+++ b/src/mesa/pipe/cell/ppu/cell_vbuf.c
@@ -40,7 +40,7 @@
 
 
 /** Allow vertex data to be inlined after RENDER command */
-#define ALLOW_INLINE_VERTS 0
+#define ALLOW_INLINE_VERTS 1
 
 
 /**
@@ -199,9 +199,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
    {
       const uint index_bytes = ROUNDUP8(nr_indices * 2);
       const uint vertex_bytes = nr_vertices * 4 * cell->vertex_info.size;
-
-      const uint batch_size = sizeof(struct cell_command_render)
-         + index_bytes;
+      const uint batch_size = sizeof(struct cell_command_render) + index_bytes;
 
       struct cell_command_render *render
          = (struct cell_command_render *)
@@ -223,9 +221,9 @@ cell_vbuf_draw(struct vbuf_render *vbr,
       render->num_verts = nr_vertices;
       if (ALLOW_INLINE_VERTS &&
           min_index == 0 &&
-          vertex_bytes <= cell_batch_free_space(cell)) {
-         /* vertex data inlined, after indices */
-         void *dst = cell_batch_alloc(cell, vertex_bytes);
+          vertex_bytes + 16 <= cell_batch_free_space(cell)) {
+         /* vertex data inlined, after indices, at 16-byte boundary */
+         void *dst = cell_batch_alloc_aligned(cell, vertex_bytes, 16);
          memcpy(dst, vertices, vertex_bytes);
          render->inline_verts = TRUE;
          render->vertex_buf = ~0;
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 4f126d5e5b..e375197fe6 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -387,7 +387,7 @@ cmd_batch(uint opcode)
                = (struct cell_command_render *) &buffer[pos];
             uint pos_incr;
             cmd_render(render, &pos_incr);
-            pos += sizeof(*render) / 8 + ((pos_incr + 1) / 2);
+            pos += pos_incr;
          }
          break;
       case CELL_CMD_RELEASE_VERTS:
@@ -541,6 +541,7 @@ main(main_param_t speid, main_param_t argp)
    (void) speid;
 
    ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4);
+   ASSERT(sizeof(struct cell_command_render) % 8 == 0);
 
    one_time_init();
 
diff --git a/src/mesa/pipe/cell/spu/spu_render.c b/src/mesa/pipe/cell/spu/spu_render.c
index e8705eeeba..932fb500b3 100644
--- a/src/mesa/pipe/cell/spu/spu_render.c
+++ b/src/mesa/pipe/cell/spu/spu_render.c
@@ -171,6 +171,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
    ubyte vertex_data[CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
    const uint vertex_size = render->vertex_size; /* in bytes */
    /*const*/ uint total_vertex_bytes = render->num_verts * vertex_size;
+   uint index_bytes;
    const ubyte *vertices;
    const ushort *indexes;
    uint i, j;
@@ -199,13 +200,16 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
 
    /* indexes are right after the render command in the batch buffer */
    indexes = (const ushort *) (render + 1);
-   *pos_incr = (render->num_indexes * 2 + 3) / 4;
+   index_bytes = ROUNDUP8(render->num_indexes * 2);
+   *pos_incr = index_bytes / 8 + sizeof(*render) / 8;
 
 
    if (render->inline_verts) {
-      /* Vertices are right after indexes in batch buffer */
-      vertices = (const ubyte *) (render + 1) + *pos_incr * 4;
-      *pos_incr = *pos_incr + total_vertex_bytes / 4;
+      /* Vertices are after indexes in batch buffer at next 16-byte addr */
+      vertices = (const ubyte *) render + (*pos_incr * 8);
+      vertices = (const ubyte *) align_pointer((void *) vertices, 16);
+      ASSERT_ALIGN16(vertices);
+      *pos_incr = ((vertices + total_vertex_bytes) - (ubyte *) render) / 8;
    }
    else {
       /* Begin DMA fetch of vertex buffer */
-- 
cgit v1.2.3


From 1730f7bad462ac7f29857b8b2347e38c1b6c9820 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 5 Feb 2008 15:07:36 -0700
Subject: Cell: SIMD-ize tri_linear_coeff(), use vector float for vertex
 attributes in struct vertex_header

---
 src/mesa/pipe/cell/spu/spu_tri.c | 112 ++++++++++++++++++++++++++-------------
 1 file changed, 75 insertions(+), 37 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 688c8646ab..be9624cf7d 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -56,7 +56,7 @@ typedef union
  * Simplified types taken from other parts of Gallium
  */
 struct vertex_header {
-   float data[0][4];
+   vector float data[1];
 };
 
 
@@ -476,6 +476,7 @@ static void print_vertex(const struct vertex_header *v)
 }
 #endif
 
+
 static boolean setup_sort_vertices(const struct vertex_header *v0,
                                    const struct vertex_header *v1,
                                    const struct vertex_header *v2)
@@ -492,9 +493,9 @@ static boolean setup_sort_vertices(const struct vertex_header *v0,
 
    /* determine bottom to top order of vertices */
    {
-      float y0 = v0->data[0][1];
-      float y1 = v1->data[0][1];
-      float y2 = v2->data[0][1];
+      float y0 = spu_extract(v0->data[0], 1);
+      float y1 = spu_extract(v1->data[0], 1);
+      float y2 = spu_extract(v2->data[0], 1);
       if (y0 <= y1) {
 	 if (y1 <= y2) {
 	    /* y0<=y1<=y2 */
@@ -538,25 +539,25 @@ static boolean setup_sort_vertices(const struct vertex_header *v0,
    }
 
    /* Check if triangle is completely outside the tile bounds */
-   if (setup.vmin->data[0][1] > setup.cliprect_maxy)
+   if (spu_extract(setup.vmin->data[0], 1) > setup.cliprect_maxy)
       return FALSE;
-   if (setup.vmax->data[0][1] < setup.cliprect_miny)
+   if (spu_extract(setup.vmax->data[0], 1) < setup.cliprect_miny)
       return FALSE;
-   if (setup.vmin->data[0][0] < setup.cliprect_minx &&
-       setup.vmid->data[0][0] < setup.cliprect_minx &&
-       setup.vmax->data[0][0] < setup.cliprect_minx)
+   if (spu_extract(setup.vmin->data[0], 0) < setup.cliprect_minx &&
+       spu_extract(setup.vmid->data[0], 0) < setup.cliprect_minx &&
+       spu_extract(setup.vmax->data[0], 0) < setup.cliprect_minx)
       return FALSE;
-   if (setup.vmin->data[0][0] > setup.cliprect_maxx &&
-       setup.vmid->data[0][0] > setup.cliprect_maxx &&
-       setup.vmax->data[0][0] > setup.cliprect_maxx)
+   if (spu_extract(setup.vmin->data[0], 0) > setup.cliprect_maxx &&
+       spu_extract(setup.vmid->data[0], 0) > setup.cliprect_maxx &&
+       spu_extract(setup.vmax->data[0], 0) > setup.cliprect_maxx)
       return FALSE;
 
-   setup.ebot.dx = setup.vmid->data[0][0] - setup.vmin->data[0][0];
-   setup.ebot.dy = setup.vmid->data[0][1] - setup.vmin->data[0][1];
-   setup.emaj.dx = setup.vmax->data[0][0] - setup.vmin->data[0][0];
-   setup.emaj.dy = setup.vmax->data[0][1] - setup.vmin->data[0][1];
-   setup.etop.dx = setup.vmax->data[0][0] - setup.vmid->data[0][0];
-   setup.etop.dy = setup.vmax->data[0][1] - setup.vmid->data[0][1];
+   setup.ebot.dx = spu_extract(setup.vmid->data[0], 0) - spu_extract(setup.vmin->data[0], 0);
+   setup.ebot.dy = spu_extract(setup.vmid->data[0], 1) - spu_extract(setup.vmin->data[0], 1);
+   setup.emaj.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmin->data[0], 0);
+   setup.emaj.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmin->data[0], 1);
+   setup.etop.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmid->data[0], 0);
+   setup.etop.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmid->data[0], 1);
 
    /*
     * Compute triangle's area.  Use 1/area to compute partial
@@ -597,14 +598,12 @@ static boolean setup_sort_vertices(const struct vertex_header *v0,
  * The result will be put into setup.coef[slot].a0.
  * \param slot  which attribute slot 
  */
-static INLINE void const_coeff(uint slot)
+static INLINE void
+const_coeff(uint slot)
 {
    setup.coef[slot].dadx.v = (vector float) {0.0, 0.0, 0.0, 0.0};
    setup.coef[slot].dady.v = (vector float) {0.0, 0.0, 0.0, 0.0};
-   setup.coef[slot].a0.f[0] = setup.vprovoke->data[slot][0];
-   setup.coef[slot].a0.f[1] = setup.vprovoke->data[slot][1];
-   setup.coef[slot].a0.f[2] = setup.vprovoke->data[slot][2];
-   setup.coef[slot].a0.f[3] = setup.vprovoke->data[slot][3];
+   setup.coef[slot].a0.v = setup.vprovoke->data[slot];
 }
 
 
@@ -612,12 +611,19 @@ static INLINE void const_coeff(uint slot)
  * Compute a0, dadx and dady for a linearly interpolated coefficient,
  * for a triangle.
  */
-static void tri_linear_coeff( uint slot, uint firstComp, uint lastComp )
+static INLINE void
+tri_linear_coeff(uint slot, uint firstComp, uint lastComp)
 {
    uint i;
+   const float *vmin_d = (float *) &setup.vmin->data[slot];
+   const float *vmid_d = (float *) &setup.vmid->data[slot];
+   const float *vmax_d = (float *) &setup.vmax->data[slot];
+   const float x = spu_extract(setup.vmin->data[0], 0) - 0.5f;
+   const float y = spu_extract(setup.vmin->data[0], 1) - 0.5f;
+
    for (i = firstComp; i < lastComp; i++) {
-      float botda = setup.vmid->data[slot][i] - setup.vmin->data[slot][i];
-      float majda = setup.vmax->data[slot][i] - setup.vmin->data[slot][i];
+      float botda = vmid_d[i] - vmin_d[i];
+      float majda = vmax_d[i] - vmin_d[i];
       float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
       float b = setup.emaj.dx * botda - majda * setup.ebot.dx;
    
@@ -638,9 +644,9 @@ static void tri_linear_coeff( uint slot, uint firstComp, uint lastComp )
        * to define a0 as the sample at a pixel center somewhere near vmin
        * instead - i'll switch to this later.
        */
-      setup.coef[slot].a0.f[i] = (setup.vmin->data[slot][i] - 
-                                 (setup.coef[slot].dadx.f[i] * (setup.vmin->data[0][0] - 0.5f) + 
-                                  setup.coef[slot].dady.f[i] * (setup.vmin->data[0][1] - 0.5f)));
+      setup.coef[slot].a0.f[i] = (vmin_d[i] - 
+                                 (setup.coef[slot].dadx.f[i] * x + 
+                                  setup.coef[slot].dady.f[i] * y));
    }
 
    /*
@@ -653,6 +659,37 @@ static void tri_linear_coeff( uint slot, uint firstComp, uint lastComp )
 }
 
 
+/** Vector version of tri_linear_coeff(): computes setup.coef[slot] a0, dadx
+ * and dady for all four components of the attribute at once.
+ */
+static INLINE void
+tri_linear_coeff4(uint slot)
+{
+   const vector float vmin_d = setup.vmin->data[slot];
+   const vector float vmid_d = setup.vmid->data[slot];
+   const vector float vmax_d = setup.vmax->data[slot];
+   const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f);
+   const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f);
+   /* attribute deltas from vmin to vmid (bot) and from vmin to vmax (maj) */
+   vector float botda = vmid_d - vmin_d;
+   vector float majda = vmax_d - vmin_d;
+   /* unscaled x (a) and y (b) gradients of the attribute */
+   vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda),
+                            spu_mul(botda, spu_splats(setup.emaj.dy)));
+   vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
+                            spu_mul(majda, spu_splats(setup.ebot.dx)));
+   /* scale by setup.oneoverarea to obtain dadx and dady */
+   setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneoverarea));
+   setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneoverarea));
+   /* gradient contribution at (vmin.x - 0.5, vmin.y - 0.5) */
+   vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx);
+   vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy);
+   /* a0 = attribute value at vmin minus that contribution */
+   setup.coef[slot].a0.v = spu_sub(vmin_d, spu_add(tempx, tempy));
+}
+
+
+
 #if 0
 /**
  * Compute a0, dadx and dady for a perspective-corrected interpolant,
@@ -710,17 +747,18 @@ static void setup_tri_coefficients(void)
       case INTERP_NONE:
          break;
       case INTERP_POS:
-         tri_linear_coeff(i, 2, 3);
+         /*tri_linear_coeff(i, 2, 3);*/
          /* XXX interp W if PERSPECTIVE... */
+         tri_linear_coeff4(i);
          break;
       case INTERP_CONSTANT:
          const_coeff(i);
          break;
       case INTERP_LINEAR:
-         tri_linear_coeff(i, 0, 4);
+         tri_linear_coeff4(i);
          break;
       case INTERP_PERSPECTIVE:
-         tri_linear_coeff(i, 0, 4); /* XXX temporary */
+         tri_linear_coeff4(i);  /* temporary */
          break;
       default:
          ASSERT(0);
@@ -738,12 +776,12 @@ static void setup_tri_coefficients(void)
 
 static void setup_tri_edges(void)
 {
-   float vmin_x = setup.vmin->data[0][0] + 0.5f;
-   float vmid_x = setup.vmid->data[0][0] + 0.5f;
+   float vmin_x = spu_extract(setup.vmin->data[0], 0) + 0.5f;
+   float vmid_x = spu_extract(setup.vmid->data[0], 0) + 0.5f;
 
-   float vmin_y = setup.vmin->data[0][1] - 0.5f;
-   float vmid_y = setup.vmid->data[0][1] - 0.5f;
-   float vmax_y = setup.vmax->data[0][1] - 0.5f;
+   float vmin_y = spu_extract(setup.vmin->data[0], 1) - 0.5f;
+   float vmid_y = spu_extract(setup.vmid->data[0], 1) - 0.5f;
+   float vmax_y = spu_extract(setup.vmax->data[0], 1) - 0.5f;
 
    setup.emaj.sy = CEILF(vmin_y);
    setup.emaj.lines = (int) CEILF(vmax_y - setup.emaj.sy);
-- 
cgit v1.2.3


From e39fccc34c07a015d8713841a69037e32187dd6d Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 5 Feb 2008 15:12:18 -0700
Subject: Cell: remove accidentally added OPT_FLAGS lines

---
 src/mesa/pipe/cell/spu/Makefile | 2 --
 1 file changed, 2 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/spu/Makefile b/src/mesa/pipe/cell/spu/Makefile
index 66f16cde9b..f202971d73 100644
--- a/src/mesa/pipe/cell/spu/Makefile
+++ b/src/mesa/pipe/cell/spu/Makefile
@@ -8,8 +8,6 @@ TOP = ../../../../..
 include $(TOP)/configs/linux-cell
 
 
-OPT_FLAGS=-g
-OPT_FLAGS=-O3
 PROG = g3d
 
 PROG_SPU = $(PROG)_spu
-- 
cgit v1.2.3


From 5e2d0517b1b42c6f94fa69bf4e32a19d00fd519f Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Mon, 4 Feb 2008 17:02:08 +0900
Subject: gallium: Portability guidelines.

---
 src/mesa/pipe/README.portability | 43 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)
 create mode 100644 src/mesa/pipe/README.portability

(limited to 'src')

diff --git a/src/mesa/pipe/README.portability b/src/mesa/pipe/README.portability
new file mode 100644
index 0000000000..c70ca774da
--- /dev/null
+++ b/src/mesa/pipe/README.portability
@@ -0,0 +1,43 @@
+	      CROSS-PLATFORM PORTABILITY GUIDELINES FOR GALLIUM3D 
+
+
+= General Considerations =
+
+The state tracker and winsys driver support a rather limited number of
+platforms. However, the pipe drivers are meant to run on a wide range of
+platforms. Hence the pipe drivers, the auxiliary modules, and all public
+headers in general, should strictly follow these guidelines to ensure
+portability.
+
+= Compiler Support =
+
+* Include p_compiler.h.
+
+* Don't use the 'inline' keyword, use the INLINE macro in p_compiler.h instead.
+
+* Cast explicitly when converting to integer types of smaller sizes.
+
+* Cast explicitly when converting between float, double and integral types.
+
+* Don't use named struct initializers.
+
+* Don't use variable number of macro arguments. Use static inline functions
+instead.
+
+
+= Standard Library =
+
+* Avoid including standard library headers. Most standard library functions are
+not available in Windows Kernel Mode. Use the appropriate p_*.h include.
+
+== Memory Allocation ==
+
+* Use MALLOC, CALLOC, FREE instead of the malloc, calloc, free functions.
+
+* Use align_pointer() function defined in p_util.h for aligning pointers in a
+portable way.
+
+== Debugging ==
+
+TODO
+
-- 
cgit v1.2.3


From a31d289de6091987e6b9da6af1b2e56eb79a96fb Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Wed, 6 Feb 2008 13:27:49 +0900
Subject: gallium: Cross-platform debugging helpers.

---
 src/mesa/SConscript          |  1 +
 src/mesa/pipe/p_debug.h      | 79 ++++++++++++++++++++++++++++++++++++++++++++
 src/mesa/pipe/util/p_debug.c | 70 +++++++++++++++++++++++++++++++++++++++
 src/mesa/sources             |  1 +
 4 files changed, 151 insertions(+)
 create mode 100644 src/mesa/pipe/p_debug.h
 create mode 100644 src/mesa/pipe/util/p_debug.c

(limited to 'src')

diff --git a/src/mesa/SConscript b/src/mesa/SConscript
index 70a98f3129..faf8c84872 100644
--- a/src/mesa/SConscript
+++ b/src/mesa/SConscript
@@ -158,6 +158,7 @@ STATECACHE_SOURCES = [
 ]
 
 PIPEUTIL_SOURCES = [
+	'pipe/util/p_debug.c',
 	'pipe/util/p_tile.c',
 	'pipe/util/p_util.c',
 ]
diff --git a/src/mesa/pipe/p_debug.h b/src/mesa/pipe/p_debug.h
new file mode 100644
index 0000000000..b037eba2a3
--- /dev/null
+++ b/src/mesa/pipe/p_debug.h
@@ -0,0 +1,79 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * @file
+ * Cross-platform debugging helpers.
+ * 
+ * For now it just has assert and printf replacements, but it might be extended 
+ * with stack trace reports and more advanced logging in the near future. 
+ * 
+ * @author Jose Fonseca <jrfonseca@tungstengraphics.com>
+ */
+
+#ifndef P_DEBUG_H_
+#define P_DEBUG_H_
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+
+#ifdef DBG
+#ifndef DEBUG
+#define DEBUG 1
+#endif
+#else
+#ifndef NDEBUG
+#define NDEBUG 1
+#endif
+#endif
+
+
+void debug_printf(const char *format, ...);
+void debug_assert_fail(const char *expr, const char *file, unsigned line);
+
+/** Assert macro */
+#ifdef DEBUG
+#define debug_assert(expr) ((expr) ? (void)0 : debug_assert_fail(#expr, __FILE__, __LINE__))
+#else
+#define debug_assert(expr) ((void)0)
+#endif
+
+
+#ifdef assert
+#warning Standard C Library assert macro usage detected. 
+#undef assert
+#endif
+#define assert(expr) debug_assert(expr)
+
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* P_DEBUG_H_ */
diff --git a/src/mesa/pipe/util/p_debug.c b/src/mesa/pipe/util/p_debug.c
new file mode 100644
index 0000000000..faa093e57c
--- /dev/null
+++ b/src/mesa/pipe/util/p_debug.c
@@ -0,0 +1,70 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include <stdarg.h>
+
+#ifdef WIN32
+#include <windows.h>
+#include <winddi.h>
+#else
+#include <stdio.h>
+#include <stdlib.h>
+#endif
+
+#include "pipe/p_debug.h" 
+#include "pipe/p_compiler.h" 
+
+/** printf-style debug output: EngDebugPrint() on WIN32, stderr otherwise. */
+void debug_printf(const char *format, ...)
+{
+   va_list ap;
+   va_start( ap, format );  
+#ifdef WIN32
+   EngDebugPrint("Gallium3D: ", (PCHAR)format, ap);
+#else
+   vfprintf(stderr, format, ap);
+#endif
+   va_end( ap );
+}
+
+
+static INLINE debug_abort(void) 
+{
+#ifdef WIN32
+   EngDebugBreak();
+#else
+   abort();
+#endif
+}
+
+/** Report a failed assertion as "file:line: Assertion `expr' failed." and abort. */
+void debug_assert_fail(const char *expr, const char *file, unsigned line) 
+{
+   debug_printf("%s:%u: Assertion `%s' failed.\n", file, line, expr);
+   debug_abort();
+}
diff --git a/src/mesa/sources b/src/mesa/sources
index e31d8cc466..c0087f76e6 100644
--- a/src/mesa/sources
+++ b/src/mesa/sources
@@ -195,6 +195,7 @@ STATECACHE_SOURCES = \
 	pipe/cso_cache/cso_cache.c
 
 PIPEUTIL_SOURCES = \
+	pipe/util/p_debug.c \
 	pipe/util/p_tile.c \
 	pipe/util/p_util.c
 
-- 
cgit v1.2.3


From 78bce9c2dcd45d1d8706bb9bab3b3a73943de990 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Wed, 6 Feb 2008 14:37:24 +0900
Subject: gallium: Add forgotten return type.

---
 src/mesa/pipe/util/p_debug.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/mesa/pipe/util/p_debug.c b/src/mesa/pipe/util/p_debug.c
index faa093e57c..9303c970cc 100644
--- a/src/mesa/pipe/util/p_debug.c
+++ b/src/mesa/pipe/util/p_debug.c
@@ -53,7 +53,7 @@ void debug_printf(const char *format, ...)
 }
 
 
-static INLINE debug_abort(void) 
+static INLINE void debug_abort(void) 
 {
 #ifdef WIN32
    EngDebugBreak();
-- 
cgit v1.2.3


From 9791d7f64c5a58b9c1bf32d00c71e0e031f54f70 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Wed, 6 Feb 2008 14:37:49 +0900
Subject: gallium: Use p_debug.h instead of non-portable stdio.h/assert.h
 functions.

---
 src/mesa/pipe/draw/draw_prim.c                  | 18 +++----
 src/mesa/pipe/draw/draw_vbuf.c                  | 63 ++++++++++++-------------
 src/mesa/pipe/draw/draw_vertex_fetch.c          |  8 ++--
 src/mesa/pipe/draw/draw_vertex_shader.c         |  6 +--
 src/mesa/pipe/draw/draw_vertex_shader_llvm.c    |  4 +-
 src/mesa/pipe/draw/draw_vf.c                    |  4 +-
 src/mesa/pipe/draw/draw_vf_generic.c            |  3 +-
 src/mesa/pipe/draw/draw_vf_sse.c                |  6 +--
 src/mesa/pipe/i915simple/i915_fpc_translate.c   | 14 +++---
 src/mesa/pipe/i915simple/i915_prim_vbuf.c       |  3 +-
 src/mesa/pipe/i915simple/i915_state_derived.c   |  2 +-
 src/mesa/pipe/i915simple/i915_state_emit.c      |  2 +-
 src/mesa/pipe/i915simple/i915_state_immediate.c |  2 +-
 src/mesa/pipe/i915simple/i915_state_sampler.c   |  2 +-
 src/mesa/pipe/i965simple/brw_cc.c               |  2 +-
 src/mesa/pipe/i965simple/brw_curbe.c            |  6 +--
 src/mesa/pipe/i965simple/brw_eu_debug.c         | 10 ++--
 src/mesa/pipe/i965simple/brw_eu_emit.c          |  4 +-
 src/mesa/pipe/i965simple/brw_sf.c               |  8 ++--
 src/mesa/pipe/i965simple/brw_sf_emit.c          | 14 +++---
 src/mesa/pipe/i965simple/brw_state.c            |  2 +-
 src/mesa/pipe/i965simple/brw_state_cache.c      |  6 +--
 src/mesa/pipe/i965simple/brw_state_pool.c       |  4 +-
 src/mesa/pipe/i965simple/brw_urb.c              |  6 +--
 src/mesa/pipe/i965simple/brw_vs_emit.c          |  2 +-
 src/mesa/pipe/i965simple/brw_wm.c               |  2 +-
 src/mesa/pipe/i965simple/brw_wm_glsl.c          |  2 +-
 src/mesa/pipe/i965simple/brw_wm_sampler_state.c |  2 +-
 src/mesa/pipe/p_compiler.h                      |  3 +-
 src/mesa/pipe/p_format.h                        |  3 ++
 src/mesa/pipe/p_util.h                          |  4 --
 src/mesa/pipe/pipebuffer/pb_buffer.h            |  4 +-
 src/mesa/pipe/pipebuffer/pb_buffer_fenced.c     |  4 +-
 src/mesa/pipe/pipebuffer/pb_buffer_fenced.h     |  2 +-
 src/mesa/pipe/pipebuffer/pb_buffer_malloc.c     |  4 +-
 src/mesa/pipe/pipebuffer/pb_bufmgr_fenced.c     |  4 +-
 src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c         | 21 ++++-----
 src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c       |  8 ++--
 src/mesa/pipe/softpipe/sp_prim_setup.c          | 24 +++++-----
 src/mesa/pipe/softpipe/sp_quad_fs.c             | 10 ++--
 src/mesa/pipe/softpipe/sp_tile_cache.c          |  4 +-
 src/mesa/pipe/tgsi/exec/tgsi_exec.c             |  6 +--
 src/mesa/pipe/tgsi/exec/tgsi_sse2.c             | 44 ++++++++---------
 src/mesa/pipe/tgsi/util/tgsi_build.c            |  1 +
 src/mesa/pipe/tgsi/util/tgsi_dump.c             |  3 ++
 src/mesa/pipe/tgsi/util/tgsi_parse.c            |  1 +
 src/mesa/pipe/tgsi/util/tgsi_util.c             |  1 +
 47 files changed, 176 insertions(+), 182 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/draw/draw_prim.c b/src/mesa/pipe/draw/draw_prim.c
index 58400213d7..51e2242719 100644
--- a/src/mesa/pipe/draw/draw_prim.c
+++ b/src/mesa/pipe/draw/draw_prim.c
@@ -30,6 +30,8 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
+#include "pipe/p_debug.h"
+
 #include "draw_private.h"
 #include "draw_context.h"
 
@@ -60,8 +62,8 @@ static void draw_prim_queue_flush( struct draw_context *draw )
    unsigned i;
 
    if (0)
-      fprintf(stdout,"Flushing with %d prims, %d verts\n",
-             draw->pq.queue_nr, draw->vs.queue_nr);
+      debug_printf("Flushing with %d prims, %d verts\n",
+                   draw->pq.queue_nr, draw->vs.queue_nr);
 
    assert (draw->pq.queue_nr != 0);
 
@@ -120,9 +122,9 @@ static void draw_prim_queue_flush( struct draw_context *draw )
 void draw_do_flush( struct draw_context *draw, unsigned flags )
 {
    if (0)
-      fprintf(stdout,"Flushing with %d verts, %d prims\n",
-	      draw->vs.queue_nr,
-	      draw->pq.queue_nr );
+      debug_printf("Flushing with %d verts, %d prims\n",
+                   draw->vs.queue_nr,
+                   draw->pq.queue_nr );
 
 
    if (flags >= DRAW_FLUSH_SHADER_QUEUE) {
@@ -157,11 +159,11 @@ static struct prim_header *get_queued_prim( struct draw_context *draw,
 					    unsigned nr_verts )
 {
    if (!draw_vertex_cache_check_space( draw, nr_verts )) {
-//      fprintf(stderr, "v");
+//      debug_printf("v");
       draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE );
    }
    else if (draw->pq.queue_nr == PRIM_QUEUE_LENGTH) {
-//      fprintf(stderr, "p");
+//      debug_printf("p");
       draw_do_flush( draw, DRAW_FLUSH_PRIM_QUEUE );
    }
 
@@ -283,7 +285,7 @@ draw_prim( struct draw_context *draw,
    boolean unfilled = (draw->rasterizer->fill_cw != PIPE_POLYGON_MODE_FILL ||
 		       draw->rasterizer->fill_ccw != PIPE_POLYGON_MODE_FILL);
 
-//   _mesa_printf("%s (%d) %d/%d\n", __FUNCTION__, draw->prim, start, count );
+//   debug_printf("%s (%d) %d/%d\n", __FUNCTION__, draw->prim, start, count );
 
    switch (prim) {
    case PIPE_PRIM_POINTS:
diff --git a/src/mesa/pipe/draw/draw_vbuf.c b/src/mesa/pipe/draw/draw_vbuf.c
index ac03001d8f..be96c8fdeb 100644
--- a/src/mesa/pipe/draw/draw_vbuf.c
+++ b/src/mesa/pipe/draw/draw_vbuf.c
@@ -34,8 +34,7 @@
  */
 
 
-#include <assert.h>
-
+#include "pipe/p_debug.h"
 #include "pipe/p_util.h"
 
 #include "draw_vbuf.h"
@@ -125,55 +124,55 @@ dump_emitted_vertex(const struct vertex_info *vinfo, const uint8_t *data)
       j = vinfo->src_index[i];
       switch (vinfo->emit[i]) {
       case EMIT_OMIT:
-         fprintf(stderr, "EMIT_OMIT:");
+         debug_printf("EMIT_OMIT:");
          break;
       case EMIT_ALL:
          assert(i == 0);
          assert(j == 0);
-         fprintf(stderr, "EMIT_ALL:\t");
+         debug_printf("EMIT_ALL:\t");
          for(k = 0; k < vinfo->size*4; ++k)
-            fprintf(stderr, "%02x ", *data++);
+            debug_printf("%02x ", *data++);
          break;
       case EMIT_1F:
-         fprintf(stderr, "EMIT_1F:\t");
-         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
+         debug_printf("EMIT_1F:\t");
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
          break;
       case EMIT_1F_PSIZE:
-         fprintf(stderr, "EMIT_1F_PSIZE:\t");
-         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
+         debug_printf("EMIT_1F_PSIZE:\t");
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
          break;
       case EMIT_2F:
-         fprintf(stderr, "EMIT_2F:\t");
-         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
-         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
+         debug_printf("EMIT_2F:\t");
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
          break;
       case EMIT_3F:
-         fprintf(stderr, "EMIT_3F:\t");
-         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
-         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
-         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
+         debug_printf("EMIT_3F:\t");
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
          data += sizeof(float);
          break;
       case EMIT_4F:
-         fprintf(stderr, "EMIT_4F:\t");
-         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
-         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
-         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
-         fprintf(stderr, "%f ", *(float *)data); data += sizeof(float);
+         debug_printf("EMIT_4F:\t");
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
          break;
       case EMIT_4UB:
-         fprintf(stderr, "EMIT_4UB:\t");
-         fprintf(stderr, "%u ", *data++);
-         fprintf(stderr, "%u ", *data++);
-         fprintf(stderr, "%u ", *data++);
-         fprintf(stderr, "%u ", *data++);
+         debug_printf("EMIT_4UB:\t");
+         debug_printf("%u ", *data++);
+         debug_printf("%u ", *data++);
+         debug_printf("%u ", *data++);
+         debug_printf("%u ", *data++);
          break;
       default:
          assert(0);
       }
-      fprintf(stderr, "\n");
+      debug_printf("\n");
    }
-   fprintf(stderr, "\n");
+   debug_printf("\n");
 }
 #endif
 
@@ -190,7 +189,7 @@ emit_vertex( struct vbuf_stage *vbuf,
              struct vertex_header *vertex )
 {
 #if 0
-   fprintf(stderr, "emit vertex %d to %p\n", 
+   debug_printf("emit vertex %d to %p\n", 
            vbuf->nr_vertices, vbuf->vertex_ptr);
 #endif
 
@@ -198,7 +197,7 @@ emit_vertex( struct vbuf_stage *vbuf,
       if(vertex->vertex_id < vbuf->nr_vertices)
 	 return;
       else
-	 fprintf(stderr, "Bad vertex id 0x%04x (>= 0x%04x)\n", 
+	 debug_printf("Bad vertex id 0x%04x (>= 0x%04x)\n", 
 	         vertex->vertex_id, vbuf->nr_vertices);
       return;
    }
@@ -269,9 +268,9 @@ emit_vertex( struct vbuf_stage *vbuf,
 	 static float data[256]; 
 	 draw_vf_emit_vertex(vbuf->vf, vertex, data);
 	 if(memcmp((uint8_t *)vbuf->vertex_ptr - vbuf->vertex_size, data, vbuf->vertex_size)) {
-            fprintf(stderr, "With VF:\n");
+            debug_printf("With VF:\n");
             dump_emitted_vertex(vbuf->vinfo, (uint8_t *)data);
-	    fprintf(stderr, "Without VF:\n");
+	    debug_printf("Without VF:\n");
 	    dump_emitted_vertex(vbuf->vinfo, (uint8_t *)vbuf->vertex_ptr - vbuf->vertex_size);
 	    assert(0);
 	 }
diff --git a/src/mesa/pipe/draw/draw_vertex_fetch.c b/src/mesa/pipe/draw/draw_vertex_fetch.c
index b23f487e74..e13df04605 100644
--- a/src/mesa/pipe/draw/draw_vertex_fetch.c
+++ b/src/mesa/pipe/draw/draw_vertex_fetch.c
@@ -162,7 +162,7 @@ static fetch_func get_fetch_func( enum pipe_format format )
    {
       char tmp[80];
       pf_sprint_name(tmp, format);
-      _mesa_printf("%s: %s\n", __FUNCTION__, tmp);
+      debug_printf("%s: %s\n", __FUNCTION__, tmp);
    }
 #endif
 
@@ -332,7 +332,7 @@ static void fetch_xyz_rgb( struct draw_context *draw,
 
    assert(count <= 4);
 
-//   _mesa_printf("%s\n", __FUNCTION__);
+//   debug_printf("%s\n", __FUNCTION__);
 
    /* loop over vertex attributes (vertex shader inputs)
     */
@@ -421,7 +421,7 @@ static void generic_vertex_fetch( struct draw_context *draw,
 
    assert(count <= 4);
 
-//   _mesa_printf("%s %d\n", __FUNCTION__, count);
+//   debug_printf("%s %d\n", __FUNCTION__, count);
 
    /* loop over vertex attributes (vertex shader inputs)
     */
@@ -467,7 +467,7 @@ void draw_update_vertex_fetch( struct draw_context *draw )
 {
    unsigned nr_attrs, i;
 
-//   _mesa_printf("%s\n", __FUNCTION__);
+//   debug_printf("%s\n", __FUNCTION__);
    
    /* this may happend during context init */
    if (!draw->vertex_shader)
diff --git a/src/mesa/pipe/draw/draw_vertex_shader.c b/src/mesa/pipe/draw/draw_vertex_shader.c
index b851da845f..e6590eafcc 100644
--- a/src/mesa/pipe/draw/draw_vertex_shader.c
+++ b/src/mesa/pipe/draw/draw_vertex_shader.c
@@ -166,7 +166,7 @@ run_vertex_program(struct draw_context *draw,
       vOut[j]->data[0][3] = w;
 
 #if DBG_VS
-      printf("output[%d]win: %f %f %f %f\n", j,
+      debug_printf("output[%d]win: %f %f %f %f\n", j,
              vOut[j]->data[0][0],
              vOut[j]->data[0][1],
              vOut[j]->data[0][2],
@@ -181,7 +181,7 @@ run_vertex_program(struct draw_context *draw,
          vOut[j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
          vOut[j]->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
 #if DBG_VS
-         printf("output[%d][%d]: %f %f %f %f\n", j, slot,
+         debug_printf("output[%d][%d]: %f %f %f %f\n", j, slot,
                 vOut[j]->data[slot][0],
                 vOut[j]->data[slot][1],
                 vOut[j]->data[slot][2],
@@ -207,7 +207,7 @@ draw_vertex_shader_queue_flush(struct draw_context *draw)
     */
    draw_update_vertex_fetch( draw );
 
-//   fprintf(stderr, " q(%d) ", draw->vs.queue_nr );
+//   debug_printf( " q(%d) ", draw->vs.queue_nr );
 #ifdef MESA_LLVM
    if (draw->vertex_shader->llvm_prog) {
       draw_vertex_shader_queue_flush_llvm(draw);
diff --git a/src/mesa/pipe/draw/draw_vertex_shader_llvm.c b/src/mesa/pipe/draw/draw_vertex_shader_llvm.c
index 4228c4f388..63551c993e 100644
--- a/src/mesa/pipe/draw/draw_vertex_shader_llvm.c
+++ b/src/mesa/pipe/draw/draw_vertex_shader_llvm.c
@@ -152,7 +152,7 @@ void draw_vertex_shader_queue_flush_llvm(struct draw_context *draw)
       z = vOut->clip[2] = dests[0][2];
       w = vOut->clip[3] = dests[0][3];
 #if DBG
-      printf("output %d: %f %f %f %f\n", 0, x, y, z, w);
+      debug_printf("output %d: %f %f %f %f\n", 0, x, y, z, w);
 #endif
 
       vOut->clipmask = compute_clipmask(vOut->clip, draw->plane, draw->nr_planes);
@@ -179,7 +179,7 @@ void draw_vertex_shader_queue_flush_llvm(struct draw_context *draw)
          vOut->data[slot][3] = dests[slot][3];
 
 #if DBG
-         printf("output %d: %f %f %f %f\n", slot,
+         debug_printf("output %d: %f %f %f %f\n", slot,
                 vOut->data[slot][0],
                 vOut->data[slot][1],
                 vOut->data[slot][2],
diff --git a/src/mesa/pipe/draw/draw_vf.c b/src/mesa/pipe/draw/draw_vf.c
index 0da8e59ad6..f23d7fcec5 100644
--- a/src/mesa/pipe/draw/draw_vf.c
+++ b/src/mesa/pipe/draw/draw_vf.c
@@ -168,7 +168,7 @@ draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
       const unsigned format = map[i].format;
       if (format == DRAW_EMIT_PAD) {
 #if (DRAW_VF_DBG)
-	    _mesa_printf("%d: pad %d, offset %d\n", i,  
+	    debug_printf("%d: pad %d, offset %d\n", i,  
 			 map[i].offset, offset);  
 #endif
 
@@ -186,7 +186,7 @@ draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
 	    memcpy(vf->attr[j].data, &map[i].data, vf->attr[j].vertattrsize);
 	 
 #if (DRAW_VF_DBG)
-	    _mesa_printf("%d: %s, offset %d\n", i,  
+	    debug_printf("%d: %s, offset %d\n", i,  
 			 draw_vf_format_info[format].name,
 			 vf->attr[j].vertoffset);   
 #endif
diff --git a/src/mesa/pipe/draw/draw_vf_generic.c b/src/mesa/pipe/draw/draw_vf_generic.c
index 7f5f56ef9c..7a60a9db9c 100644
--- a/src/mesa/pipe/draw/draw_vf_generic.c
+++ b/src/mesa/pipe/draw/draw_vf_generic.c
@@ -27,9 +27,8 @@
  */
 
 
-#include <assert.h>
-
 #include "pipe/p_compiler.h"
+#include "pipe/p_debug.h"
 #include "pipe/p_util.h"
 
 #include "draw_vf.h"
diff --git a/src/mesa/pipe/draw/draw_vf_sse.c b/src/mesa/pipe/draw/draw_vf_sse.c
index 1389e6cfb9..1ad2ae756d 100644
--- a/src/mesa/pipe/draw/draw_vf_sse.c
+++ b/src/mesa/pipe/draw/draw_vf_sse.c
@@ -453,7 +453,7 @@ static boolean build_vertex_emit( struct x86_program *p )
 	    update_src_ptr(p, srcECX, vfESI, a);
 	 }
 	 else {
-	    fprintf(stderr, "Can't emit 1ub %x %x %d\n", 
+	    debug_printf("Can't emit 1ub %x %x %d\n", 
 	            a->vertoffset, a[-1].vertoffset, a[-1].vertattrsize );
 	    return FALSE;
 	 }
@@ -499,7 +499,7 @@ static boolean build_vertex_emit( struct x86_program *p )
 	    j++;		/* NOTE: two attrs consumed */
 	 }
 	 else {
-	    fprintf(stderr, "Can't emit 3ub\n");
+	    debug_printf("Can't emit 3ub\n");
 	 }
 	 return FALSE;	/* add this later */
 	 break;
@@ -532,7 +532,7 @@ static boolean build_vertex_emit( struct x86_program *p )
 	 update_src_ptr(p, srcECX, vfESI, a);
 	 break;
       default:
-	 fprintf(stderr, "unknown a[%d].format %d\n", j, a->format);
+	 debug_printf("unknown a[%d].format %d\n", j, a->format);
 	 return FALSE;	/* catch any new opcodes */
       }
       
diff --git a/src/mesa/pipe/i915simple/i915_fpc_translate.c b/src/mesa/pipe/i915simple/i915_fpc_translate.c
index 0185512aeb..868f0c7e04 100644
--- a/src/mesa/pipe/i915simple/i915_fpc_translate.c
+++ b/src/mesa/pipe/i915simple/i915_fpc_translate.c
@@ -100,7 +100,7 @@ negate(int reg, int x, int y, int z, int w)
 static void
 i915_use_passthrough_shader(struct i915_context *i915)
 {
-   fprintf(stderr, "**** Using i915 pass-through fragment shader\n");
+   debug_printf("**** Using i915 pass-through fragment shader\n");
 
    i915->current.program = (uint *) MALLOC(sizeof(passthrough));
    if (i915->current.program) {
@@ -119,12 +119,12 @@ i915_program_error(struct i915_fp_compile *p, const char *msg, ...)
    va_list args;
    char buffer[1024];
 
-   fprintf(stderr, "i915_program_error: ");
+   debug_printf("i915_program_error: ");
    va_start( args, msg );  
    vsprintf( buffer, msg, args );
    va_end( args );
-   fprintf(stderr, buffer);
-   fprintf(stderr, "\n");
+   /* buffer is already-expanded text: pass it as data, never as the format */
+   debug_printf("%s\n", buffer);
 
    p->error = 1;
 }
@@ -169,7 +169,7 @@ src_vector(struct i915_fp_compile *p,
 
       switch (sem_name) {
       case TGSI_SEMANTIC_POSITION:
-         fprintf(stderr, "SKIP SEM POS\n");
+         debug_printf("SKIP SEM POS\n");
          /*
          assert(p->wpos_tex != -1);
          src = i915_emit_decl(p, REG_TYPE_T, p->wpos_tex, D0_CHANNEL_ALL);
@@ -913,7 +913,7 @@ i915_translate_instructions(struct i915_fp_compile *p,
             ind = parse.FullToken.FullDeclaration.u.DeclarationRange.First;
             sem = parse.FullToken.FullDeclaration.Semantic.SemanticName;
             semi = parse.FullToken.FullDeclaration.Semantic.SemanticIndex;
-            /*printf("FS Input DECL [%u] sem %u\n", ind, sem);*/
+            /*debug_printf("FS Input DECL [%u] sem %u\n", ind, sem);*/
             p->input_semantic_name[ind] = sem;
             p->input_semantic_index[ind] = semi;
          }
@@ -924,7 +924,7 @@ i915_translate_instructions(struct i915_fp_compile *p,
             ind = parse.FullToken.FullDeclaration.u.DeclarationRange.First;
             sem = parse.FullToken.FullDeclaration.Semantic.SemanticName;
             semi = parse.FullToken.FullDeclaration.Semantic.SemanticIndex;
-            /*printf("FS Output DECL [%u] sem %u\n", ind, sem);*/
+            /*debug_printf("FS Output DECL [%u] sem %u\n", ind, sem);*/
             p->output_semantic_name[ind] = sem;
             p->output_semantic_index[ind] = semi;
          }
diff --git a/src/mesa/pipe/i915simple/i915_prim_vbuf.c b/src/mesa/pipe/i915simple/i915_prim_vbuf.c
index 39154b2488..e069773fd4 100644
--- a/src/mesa/pipe/i915simple/i915_prim_vbuf.c
+++ b/src/mesa/pipe/i915simple/i915_prim_vbuf.c
@@ -38,9 +38,8 @@
  */
 
 
-#include <assert.h>
-
 #include "pipe/draw/draw_vbuf.h"
+#include "pipe/p_debug.h"
 #include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
diff --git a/src/mesa/pipe/i915simple/i915_state_derived.c b/src/mesa/pipe/i915simple/i915_state_derived.c
index 62741e30f8..653983e4a9 100644
--- a/src/mesa/pipe/i915simple/i915_state_derived.c
+++ b/src/mesa/pipe/i915simple/i915_state_derived.c
@@ -87,7 +87,7 @@ static void calculate_vertex_layout( struct i915_context *i915 )
          }
          break;
       case TGSI_SEMANTIC_FOG:
-         fprintf(stderr, "i915 fogcoord not implemented yet\n");
+         debug_printf("i915 fogcoord not implemented yet\n");
          draw_emit_vertex_attr(&vinfo, EMIT_1F, INTERP_PERSPECTIVE, src++);
          break;
       default:
diff --git a/src/mesa/pipe/i915simple/i915_state_emit.c b/src/mesa/pipe/i915simple/i915_state_emit.c
index 657f523893..3339287f49 100644
--- a/src/mesa/pipe/i915simple/i915_state_emit.c
+++ b/src/mesa/pipe/i915simple/i915_state_emit.c
@@ -107,7 +107,7 @@ i915_emit_hardware_state(struct i915_context *i915 )
                            ) * 3/2; /* plus 50% margin */
 
 #if 0
-   fprintf (stderr, "i915_emit_hardware_state: %d dwords, %d relocs\n", dwords, relocs);
+   debug_printf("i915_emit_hardware_state: %d dwords, %d relocs\n", dwords, relocs);
 #endif
    
    if(!BEGIN_BATCH(dwords, relocs)) {
diff --git a/src/mesa/pipe/i915simple/i915_state_immediate.c b/src/mesa/pipe/i915simple/i915_state_immediate.c
index 752d25f233..07031fc6c5 100644
--- a/src/mesa/pipe/i915simple/i915_state_immediate.c
+++ b/src/mesa/pipe/i915simple/i915_state_immediate.c
@@ -97,7 +97,7 @@ static void upload_S2S4(struct i915_context *i915)
       LIS2 = i915->current.vertex_info.hwfmt[1];
       LIS4 = i915->current.vertex_info.hwfmt[0];
       /*
-      printf("LIS2: 0x%x  LIS4: 0x%x\n", LIS2, LIS4);
+      debug_printf("LIS2: 0x%x  LIS4: 0x%x\n", LIS2, LIS4);
       */
       assert(LIS4); /* should never be zero? */
    }
diff --git a/src/mesa/pipe/i915simple/i915_state_sampler.c b/src/mesa/pipe/i915simple/i915_state_sampler.c
index 59408b6ba0..0dbbc5241d 100644
--- a/src/mesa/pipe/i915simple/i915_state_sampler.c
+++ b/src/mesa/pipe/i915simple/i915_state_sampler.c
@@ -169,7 +169,7 @@ translate_texture_format(enum pipe_format pipeFormat)
    case PIPE_FORMAT_S8Z24_UNORM:
       return (MAPSURF_32BIT | MT_32BIT_xL824);
    default:
-      fprintf(stderr, "i915: translate_texture_format() bad image format %x\n",
+      debug_printf("i915: translate_texture_format() bad image format %x\n",
               pipeFormat);
       assert(0);
       return 0;
diff --git a/src/mesa/pipe/i965simple/brw_cc.c b/src/mesa/pipe/i965simple/brw_cc.c
index dcee731895..337e4f95f6 100644
--- a/src/mesa/pipe/i965simple/brw_cc.c
+++ b/src/mesa/pipe/i965simple/brw_cc.c
@@ -58,7 +58,7 @@ static int brw_translate_compare_func(int func)
       return BRW_COMPAREFUNCTION_ALWAYS;
    }
 
-   fprintf(stderr, "Unknown value in %s: %x\n", __FUNCTION__, func);
+   debug_printf("Unknown value in %s: %x\n", __FUNCTION__, func);
    return BRW_COMPAREFUNCTION_ALWAYS;
 }
 
diff --git a/src/mesa/pipe/i965simple/brw_curbe.c b/src/mesa/pipe/i965simple/brw_curbe.c
index 2733eb4e75..52bbd525c1 100644
--- a/src/mesa/pipe/i965simple/brw_curbe.c
+++ b/src/mesa/pipe/i965simple/brw_curbe.c
@@ -273,10 +273,10 @@ static void upload_constant_buffer(struct brw_context *brw)
 
    if (1) {
       for (i = 0; i < sz; i+=4)
-	 _mesa_printf("curbe %d.%d: %f %f %f %f\n", i/8, i&4,
+	 debug_printf("curbe %d.%d: %f %f %f %f\n", i/8, i&4,
 		      buf[i+0], buf[i+1], buf[i+2], buf[i+3]);
 
-      _mesa_printf("last_buf %p buf %p sz %d/%d cmp %d\n",
+      debug_printf("last_buf %p buf %p sz %d/%d cmp %d\n",
 		   brw->curbe.last_buf, buf,
 		   bufsz, brw->curbe.last_bufsz,
 		   brw->curbe.last_buf ? memcmp(buf, brw->curbe.last_buf, bufsz) : -1);
@@ -299,7 +299,7 @@ static void upload_constant_buffer(struct brw_context *brw)
 			  bufsz,
 			  1 << 6,
 			  &brw->curbe.gs_offset)) {
-	 _mesa_printf("out of GS memory for curbe\n");
+	 debug_printf("out of GS memory for curbe\n");
 	 assert(0);
 	 return;
       }
diff --git a/src/mesa/pipe/i965simple/brw_eu_debug.c b/src/mesa/pipe/i965simple/brw_eu_debug.c
index be692f6502..4a94ddefa6 100644
--- a/src/mesa/pipe/i965simple/brw_eu_debug.c
+++ b/src/mesa/pipe/i965simple/brw_eu_debug.c
@@ -30,6 +30,8 @@
   */
     
 
+#include "pipe/p_debug.h"
+
 #include "brw_eu.h"
 
 void brw_print_reg( struct brw_reg hwreg )
@@ -52,7 +54,7 @@ void brw_print_reg( struct brw_reg hwreg )
       "f"
    };
 
-   _mesa_printf("%s%s", 
+   debug_printf("%s%s", 
 		hwreg.abs ? "abs/" : "",
 		hwreg.negate ? "-" : "");
      
@@ -63,17 +65,17 @@ void brw_print_reg( struct brw_reg hwreg )
        hwreg.width == BRW_WIDTH_8 &&
        hwreg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
        hwreg.type == BRW_REGISTER_TYPE_F) {
-      _mesa_printf("vec%d", hwreg.nr);
+      debug_printf("vec%d", hwreg.nr);
    }
    else if (hwreg.file == BRW_GENERAL_REGISTER_FILE &&
 	    hwreg.vstride == BRW_VERTICAL_STRIDE_0 &&
 	    hwreg.width == BRW_WIDTH_1 &&
 	    hwreg.hstride == BRW_HORIZONTAL_STRIDE_0 &&
 	    hwreg.type == BRW_REGISTER_TYPE_F) {      
-      _mesa_printf("scl%d.%d", hwreg.nr, hwreg.subnr / 4);
+      debug_printf("scl%d.%d", hwreg.nr, hwreg.subnr / 4);
    }
    else {
-      _mesa_printf("%s%d.%d<%d;%d,%d>:%s", 
+      debug_printf("%s%d.%d<%d;%d,%d>:%s", 
 		   file[hwreg.file],
 		   hwreg.nr,
 		   hwreg.subnr / type_sz(hwreg.type),
diff --git a/src/mesa/pipe/i965simple/brw_eu_emit.c b/src/mesa/pipe/i965simple/brw_eu_emit.c
index 2423536dd1..400a80b6fb 100644
--- a/src/mesa/pipe/i965simple/brw_eu_emit.c
+++ b/src/mesa/pipe/i965simple/brw_eu_emit.c
@@ -953,7 +953,7 @@ void brw_SAMPLE(struct brw_compile *p,
    boolean need_stall = 0;
 
    if(writemask == 0) {
-/*       _mesa_printf("%s: zero writemask??\n", __FUNCTION__); */
+/*       debug_printf("%s: zero writemask??\n", __FUNCTION__); */
       return;
    }
 
@@ -985,7 +985,7 @@ void brw_SAMPLE(struct brw_compile *p,
 
       if (newmask != writemask) {
 	 need_stall = 1;
-/* 	 _mesa_printf("need stall %x %x\n", newmask , writemask); */
+/* 	 debug_printf("need stall %x %x\n", newmask , writemask); */
       }
       else {
 	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
diff --git a/src/mesa/pipe/i965simple/brw_sf.c b/src/mesa/pipe/i965simple/brw_sf.c
index b89b2e4087..7c83b81c85 100644
--- a/src/mesa/pipe/i965simple/brw_sf.c
+++ b/src/mesa/pipe/i965simple/brw_sf.c
@@ -175,7 +175,7 @@ static void upload_sf_prog( struct brw_context *brw )
 	    //int semantic = parse.FullToken.FullDeclaration.Semantic.SemanticName;
 	    //int semantic_index = parse.FullToken.FullDeclaration.Semantic.SemanticIndex;
 
-	    fprintf(stderr, "fs input %d..%d interp mode %d\n", first, last, interp_mode);
+	    debug_printf("fs input %d..%d interp mode %d\n", first, last, interp_mode);
 	    
 	    switch (interp_mode) {
 	    case TGSI_INTERPOLATE_CONSTANT:
@@ -213,9 +213,9 @@ static void upload_sf_prog( struct brw_context *brw )
    key.linear_mask |= 1;
    key.const_mask <<= 1;
 
-   fprintf(stderr, "key.persp_mask: %x\n", key.persp_mask);
-   fprintf(stderr, "key.linear_mask: %x\n", key.linear_mask);
-   fprintf(stderr, "key.const_mask: %x\n", key.const_mask);
+   debug_printf("key.persp_mask: %x\n", key.persp_mask);
+   debug_printf("key.linear_mask: %x\n", key.linear_mask);
+   debug_printf("key.const_mask: %x\n", key.const_mask);
 
 
 //   key.do_point_sprite = brw->attribs.Point->PointSprite;
diff --git a/src/mesa/pipe/i965simple/brw_sf_emit.c b/src/mesa/pipe/i965simple/brw_sf_emit.c
index 6ff5254ff7..78d6fa5e9e 100644
--- a/src/mesa/pipe/i965simple/brw_sf_emit.c
+++ b/src/mesa/pipe/i965simple/brw_sf_emit.c
@@ -137,8 +137,8 @@ static boolean calculate_masks( struct brw_sf_compile *c,
    unsigned persp_mask = c->key.persp_mask;
    unsigned linear_mask = c->key.linear_mask;
 
-   fprintf(stderr, "persp_mask: %x\n", persp_mask);
-   fprintf(stderr, "linear_mask: %x\n", linear_mask);
+   debug_printf("persp_mask: %x\n", persp_mask);
+   debug_printf("linear_mask: %x\n", linear_mask);
 
    *pc_persp = 0;
    *pc_linear = 0;
@@ -162,9 +162,9 @@ static boolean calculate_masks( struct brw_sf_compile *c,
 	 *pc_linear |= 0xf0;
    }
 
-   fprintf(stderr, "pc: %x\n", *pc);
-   fprintf(stderr, "pc_persp: %x\n", *pc_persp);
-   fprintf(stderr, "pc_linear: %x\n", *pc_linear);
+   debug_printf("pc: %x\n", *pc);
+   debug_printf("pc_persp: %x\n", *pc_persp);
+   debug_printf("pc_linear: %x\n", *pc_linear);
    
 
    return is_last_attr;
@@ -177,7 +177,7 @@ void brw_emit_tri_setup( struct brw_sf_compile *c )
    struct brw_compile *p = &c->func;
    unsigned i;
 
-   fprintf(stderr, "%s START ==============\n", __FUNCTION__);
+   debug_printf("%s START ==============\n", __FUNCTION__);
 
    c->nr_verts = 3;
    alloc_regs(c);
@@ -250,7 +250,7 @@ void brw_emit_tri_setup( struct brw_sf_compile *c )
       }
    }
 
-   fprintf(stderr, "%s DONE ==============\n", __FUNCTION__);
+   debug_printf("%s DONE ==============\n", __FUNCTION__);
 
 }
 
diff --git a/src/mesa/pipe/i965simple/brw_state.c b/src/mesa/pipe/i965simple/brw_state.c
index daf14ff4ff..95dfce88e4 100644
--- a/src/mesa/pipe/i965simple/brw_state.c
+++ b/src/mesa/pipe/i965simple/brw_state.c
@@ -225,7 +225,7 @@ static void brw_bind_vs_state(struct pipe_context *pipe, void *vs)
    brw->attribs.VertexProgram = (struct brw_vertex_program *)vs;
    brw->state.dirty.brw |= BRW_NEW_VS;
 
-   printf("YYYYYYYYYYYYY BINDING VERTEX SHADER\n");
+   debug_printf("YYYYYYYYYYYYY BINDING VERTEX SHADER\n");
 }
 
 static void brw_delete_vs_state(struct pipe_context *pipe, void *shader)
diff --git a/src/mesa/pipe/i965simple/brw_state_cache.c b/src/mesa/pipe/i965simple/brw_state_cache.c
index c5738733f4..b3a5124461 100644
--- a/src/mesa/pipe/i965simple/brw_state_cache.c
+++ b/src/mesa/pipe/i965simple/brw_state_cache.c
@@ -149,7 +149,7 @@ unsigned brw_upload_cache( struct brw_cache *cache,
    if (!brw_pool_alloc(cache->pool, data_size, 1 << 6, &offset)) {
       /* Should not be possible:
        */
-      printf("brw_pool_alloc failed\n");
+      debug_printf("brw_pool_alloc failed\n");
       exit(1);
    }
 
@@ -177,7 +177,7 @@ unsigned brw_upload_cache( struct brw_cache *cache,
    }
 
    if (BRW_DEBUG & DEBUG_STATE)
-      printf("upload %s: %d bytes to pool buffer %p offset %x\n",
+      debug_printf("upload %s: %d bytes to pool buffer %p offset %x\n",
              cache->name, 
 	     data_size,
              (void*)cache->pool->buffer,
@@ -416,7 +416,7 @@ void brw_clear_all_caches( struct brw_context *brw )
    int i;
 
    if (BRW_DEBUG & DEBUG_STATE)
-      fprintf(stderr, "%s\n", __FUNCTION__);
+      debug_printf("%s\n", __FUNCTION__);
 
    for (i = 0; i < BRW_MAX_CACHE; i++)
       clear_cache(&brw->cache[i]);
diff --git a/src/mesa/pipe/i965simple/brw_state_pool.c b/src/mesa/pipe/i965simple/brw_state_pool.c
index 7c67f0ee25..f3174bfe0a 100644
--- a/src/mesa/pipe/i965simple/brw_state_pool.c
+++ b/src/mesa/pipe/i965simple/brw_state_pool.c
@@ -58,7 +58,7 @@ boolean brw_pool_alloc( struct brw_mem_pool *pool,
    size = align(size, 4);
 
    if (pool->offset + fixup + size >= pool->size) {
-      printf("%s failed\n", __FUNCTION__);
+      debug_printf("%s failed\n", __FUNCTION__);
       assert(0);
       exit(0);
    }
@@ -74,7 +74,7 @@ static
 void brw_invalidate_pool( struct brw_mem_pool *pool )
 {
    if (BRW_DEBUG & DEBUG_STATE)
-      printf("\n\n\n %s \n\n\n", __FUNCTION__);
+      debug_printf("\n\n\n %s \n\n\n", __FUNCTION__);
 
    pool->offset = 0;
 
diff --git a/src/mesa/pipe/i965simple/brw_urb.c b/src/mesa/pipe/i965simple/brw_urb.c
index b284526aa6..101a4367b9 100644
--- a/src/mesa/pipe/i965simple/brw_urb.c
+++ b/src/mesa/pipe/i965simple/brw_urb.c
@@ -120,18 +120,18 @@ static void recalculate_urb_fence( struct brw_context *brw )
 	     * entries and the values for minimum nr of entries
 	     * provided above.
 	     */
-	    fprintf(stderr, "couldn't calculate URB layout!\n");
+	    debug_printf("couldn't calculate URB layout!\n");
 	    exit(1);
 	 }
 
 	 if (BRW_DEBUG & (DEBUG_URB|DEBUG_FALLBACKS))
-	    printf("URB CONSTRAINED\n");
+	    debug_printf("URB CONSTRAINED\n");
       }
       else
 	 brw->urb.constrained = 0;
 
       if (BRW_DEBUG & DEBUG_URB)
-	 printf("URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
+	 debug_printf("URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
 		      brw->urb.vs_start,
 		      brw->urb.gs_start,
 		      brw->urb.clip_start,
diff --git a/src/mesa/pipe/i965simple/brw_vs_emit.c b/src/mesa/pipe/i965simple/brw_vs_emit.c
index b32c233dd2..98915ba101 100644
--- a/src/mesa/pipe/i965simple/brw_vs_emit.c
+++ b/src/mesa/pipe/i965simple/brw_vs_emit.c
@@ -1228,7 +1228,7 @@ static void process_instruction(struct brw_vs_compile *c,
    case TGSI_OPCODE_ENDSUB:
       break;
    default:
-      printf("Unsupport opcode %d in vertex shader\n", inst->Instruction.Opcode);
+      debug_printf("Unsupport opcode %d in vertex shader\n", inst->Instruction.Opcode);
       break;
    }
 
diff --git a/src/mesa/pipe/i965simple/brw_wm.c b/src/mesa/pipe/i965simple/brw_wm.c
index 0ee0fbed51..539b170744 100644
--- a/src/mesa/pipe/i965simple/brw_wm.c
+++ b/src/mesa/pipe/i965simple/brw_wm.c
@@ -57,7 +57,7 @@ static void do_wm_prog( struct brw_context *brw,
    c->pixel_w = brw_null_reg();
 
 
-   fprintf(stderr, "XXXXXXXX FP\n");
+   debug_printf("XXXXXXXX FP\n");
    
    brw_wm_glsl_emit(c);
 
diff --git a/src/mesa/pipe/i965simple/brw_wm_glsl.c b/src/mesa/pipe/i965simple/brw_wm_glsl.c
index f4b5c13c06..d95645d108 100644
--- a/src/mesa/pipe/i965simple/brw_wm_glsl.c
+++ b/src/mesa/pipe/i965simple/brw_wm_glsl.c
@@ -982,7 +982,7 @@ static void brw_wm_emit_instruction( struct brw_wm_compile *c,
       break;
 
    default:
-      _mesa_printf("unsupported IR in fragment shader %d\n",
+      debug_printf("unsupported IR in fragment shader %d\n",
 		   inst->Instruction.Opcode);
    }
 #if 0
diff --git a/src/mesa/pipe/i965simple/brw_wm_sampler_state.c b/src/mesa/pipe/i965simple/brw_wm_sampler_state.c
index cfb430eb09..de42ffc5b1 100644
--- a/src/mesa/pipe/i965simple/brw_wm_sampler_state.c
+++ b/src/mesa/pipe/i965simple/brw_wm_sampler_state.c
@@ -71,7 +71,7 @@ static int intel_translate_shadow_compare_func(unsigned func)
        return COMPAREFUNC_NEVER;
    }
 
-   fprintf(stderr, "Unknown value in %s: %x\n", __FUNCTION__, func);
+   debug_printf("Unknown value in %s: %x\n", __FUNCTION__, func);
    return COMPAREFUNC_NEVER;
 }
 
diff --git a/src/mesa/pipe/p_compiler.h b/src/mesa/pipe/p_compiler.h
index e939d9cd9b..30cd729c56 100644
--- a/src/mesa/pipe/p_compiler.h
+++ b/src/mesa/pipe/p_compiler.h
@@ -28,10 +28,9 @@
 #ifndef P_COMPILER_H
 #define P_COMPILER_H
 
-#include <assert.h>
+
 #include <stdlib.h>
 #include <string.h>
-#include <stdio.h>
 
 
 #if defined(_WIN32) && !defined(__WIN32__)
diff --git a/src/mesa/pipe/p_format.h b/src/mesa/pipe/p_format.h
index 9f60cdbb04..c9ad324315 100644
--- a/src/mesa/pipe/p_format.h
+++ b/src/mesa/pipe/p_format.h
@@ -28,7 +28,10 @@
 #ifndef PIPE_FORMAT_H
 #define PIPE_FORMAT_H
 
+#include <stdio.h> // for sprintf
+
 #include "p_compiler.h"
+#include "p_debug.h"
 
 /**
  * The PIPE_FORMAT is a 32-bit wide bitfield that encodes all the information
diff --git a/src/mesa/pipe/p_util.h b/src/mesa/pipe/p_util.h
index 059528787d..4780ed7818 100644
--- a/src/mesa/pipe/p_util.h
+++ b/src/mesa/pipe/p_util.h
@@ -381,10 +381,6 @@ static INLINE int align(int value, int alignment)
    return (value + alignment - 1) & ~(alignment - 1);
 }
 
-/* Convenient...
- */
-extern void _mesa_printf(const char *str, ...);
-
 
 /* util/p_util.c
  */
diff --git a/src/mesa/pipe/pipebuffer/pb_buffer.h b/src/mesa/pipe/pipebuffer/pb_buffer.h
index 17551b3b50..97beb5f72a 100644
--- a/src/mesa/pipe/pipebuffer/pb_buffer.h
+++ b/src/mesa/pipe/pipebuffer/pb_buffer.h
@@ -44,10 +44,8 @@
 #define PB_BUFFER_H_
 
 
-#include <assert.h>
-#include <stdlib.h>
-
 #include "pipe/p_compiler.h"
+#include "pipe/p_debug.h"
 #include "pipe/p_state.h"
 #include "pipe/p_inlines.h"
 
diff --git a/src/mesa/pipe/pipebuffer/pb_buffer_fenced.c b/src/mesa/pipe/pipebuffer/pb_buffer_fenced.c
index 4cf4222db9..f4fc3f6d71 100644
--- a/src/mesa/pipe/pipebuffer/pb_buffer_fenced.c
+++ b/src/mesa/pipe/pipebuffer/pb_buffer_fenced.c
@@ -34,12 +34,10 @@
  */
 
 
-#include <assert.h>
-#include <stdlib.h>
-
 #include "linked_list.h"
 
 #include "p_compiler.h"
+#include "p_debug.h"
 #include "p_winsys.h"
 #include "p_thread.h"
 #include "p_util.h"
diff --git a/src/mesa/pipe/pipebuffer/pb_buffer_fenced.h b/src/mesa/pipe/pipebuffer/pb_buffer_fenced.h
index 09082a5390..c40b9c75e1 100644
--- a/src/mesa/pipe/pipebuffer/pb_buffer_fenced.h
+++ b/src/mesa/pipe/pipebuffer/pb_buffer_fenced.h
@@ -51,7 +51,7 @@
 #define PB_BUFFER_FENCED_H_
 
 
-#include <assert.h>
+#include "pipe/p_debug.h"
 
 
 struct pipe_winsys;
diff --git a/src/mesa/pipe/pipebuffer/pb_buffer_malloc.c b/src/mesa/pipe/pipebuffer/pb_buffer_malloc.c
index 2151f1d691..c1b7759874 100644
--- a/src/mesa/pipe/pipebuffer/pb_buffer_malloc.c
+++ b/src/mesa/pipe/pipebuffer/pb_buffer_malloc.c
@@ -34,9 +34,7 @@
  */
 
 
-#include <assert.h>
-#include <stdlib.h>
-
+#include "pipe/p_debug.h"
 #include "pipe/p_util.h"
 #include "pb_buffer.h"
 
diff --git a/src/mesa/pipe/pipebuffer/pb_bufmgr_fenced.c b/src/mesa/pipe/pipebuffer/pb_bufmgr_fenced.c
index 3b341c64c2..c535d3276c 100644
--- a/src/mesa/pipe/pipebuffer/pb_bufmgr_fenced.c
+++ b/src/mesa/pipe/pipebuffer/pb_bufmgr_fenced.c
@@ -34,9 +34,7 @@
  */
 
 
-#include <assert.h>
-#include <stdlib.h>
-
+#include "p_debug.h"
 #include "p_util.h"
 
 #include "pb_buffer.h"
diff --git a/src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c b/src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c
index b6af7cdedc..8b1b51c0e2 100644
--- a/src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c
+++ b/src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c
@@ -34,11 +34,10 @@
  */
 
 
-#include <assert.h>
-
 #include "linked_list.h"
 
 #include "p_defines.h"
+#include "p_debug.h"
 #include "p_thread.h"
 #include "p_util.h"
 #include "pb_buffer.h"
@@ -69,28 +68,28 @@ struct mem_block
 static void
 mmDumpMemInfo(const struct mem_block *heap)
 {
-   fprintf(stderr, "Memory heap %p:\n", (void *)heap);
+   debug_printf("Memory heap %p:\n", (void *)heap);
    if (heap == 0) {
-      fprintf(stderr, "  heap == 0\n");
+      debug_printf("  heap == 0\n");
    } else {
       const struct mem_block *p;
 
       for(p = heap->next; p != heap; p = p->next) {
-	 fprintf(stderr, "  Offset:%08x, Size:%08x, %c%c\n",p->ofs,p->size,
+	 debug_printf("  Offset:%08x, Size:%08x, %c%c\n",p->ofs,p->size,
 		 p->free ? 'F':'.',
 		 p->reserved ? 'R':'.');
       }
 
-      fprintf(stderr, "\nFree list:\n");
+      debug_printf("\nFree list:\n");
 
       for(p = heap->next_free; p != heap; p = p->next_free) {
-	 fprintf(stderr, " FREE Offset:%08x, Size:%08x, %c%c\n",p->ofs,p->size,
+	 debug_printf(" FREE Offset:%08x, Size:%08x, %c%c\n",p->ofs,p->size,
 		 p->free ? 'F':'.',
 		 p->reserved ? 'R':'.');
       }
 
    }
-   fprintf(stderr, "End of memory blocks\n");
+   debug_printf("End of memory blocks\n");
 }
 #endif
 
@@ -308,11 +307,11 @@ mmFreeMem(struct mem_block *b)
       return 0;
 
    if (b->free) {
-      fprintf(stderr, "block already free\n");
+      debug_printf("block already free\n");
       return -1;
    }
    if (b->reserved) {
-      fprintf(stderr, "block is reserved\n");
+      debug_printf("block is reserved\n");
       return -1;
    }
 
@@ -479,7 +478,7 @@ mm_bufmgr_create_buffer(struct pb_manager *mgr,
    
    mm_buf->block = mmAllocMem(mm->heap, size, mm->align2, 0);
    if(!mm_buf->block) {
-      fprintf(stderr, "warning: heap full\n");
+      debug_printf("warning: heap full\n");
 #if 0
       mmDumpMemInfo(mm->heap);
 #endif
diff --git a/src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c b/src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c
index f80c7e34c0..bcd4b3e257 100644
--- a/src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c
+++ b/src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c
@@ -35,12 +35,10 @@
  */
 
 
-#include <assert.h>
-#include <stdlib.h>
-
 #include "linked_list.h"
 
 #include "p_compiler.h"
+#include "p_debug.h"
 #include "p_thread.h"
 #include "p_defines.h"
 #include "p_util.h"
@@ -178,7 +176,7 @@ pool_bufmgr_create_buffer(struct pb_manager *mgr,
 
    if (pool->numFree == 0) {
       _glthread_UNLOCK_MUTEX(pool->mutex);
-      fprintf(stderr, "warning: out of fixed size buffer objects\n");
+      debug_printf("warning: out of fixed size buffer objects\n");
       return NULL;
    }
 
@@ -186,7 +184,7 @@ pool_bufmgr_create_buffer(struct pb_manager *mgr,
 
    if (item == &pool->free) {
       _glthread_UNLOCK_MUTEX(pool->mutex);
-      fprintf(stderr, "error: fixed size buffer pool corruption\n");
+      debug_printf("error: fixed size buffer pool corruption\n");
       return NULL;
    }
 
diff --git a/src/mesa/pipe/softpipe/sp_prim_setup.c b/src/mesa/pipe/softpipe/sp_prim_setup.c
index b17801d13d..7478b2336b 100644
--- a/src/mesa/pipe/softpipe/sp_prim_setup.c
+++ b/src/mesa/pipe/softpipe/sp_prim_setup.c
@@ -251,9 +251,9 @@ static void print_vertex(const struct setup_stage *setup,
                          const struct vertex_header *v)
 {
    int i;
-   fprintf(stderr, "Vertex: (%p)\n", v);
+   debug_printf("Vertex: (%p)\n", v);
    for (i = 0; i < setup->quad.nr_attrs; i++) {
-      fprintf(stderr, "  %d: %f %f %f %f\n",  i, 
+      debug_printf("  %d: %f %f %f %f\n",  i, 
               v->data[i][0], v->data[i][1], v->data[i][2], v->data[i][3]);
    }
 }
@@ -267,7 +267,7 @@ static boolean setup_sort_vertices( struct setup_stage *setup,
    const struct vertex_header *v2 = prim->v[2];
 
 #if DEBUG_VERTS
-   fprintf(stderr, "Triangle:\n");
+   debug_printf("Triangle:\n");
    print_vertex(setup, v0);
    print_vertex(setup, v1);
    print_vertex(setup, v2);
@@ -345,7 +345,7 @@ static boolean setup_sort_vertices( struct setup_stage *setup,
 
       setup->oneoverarea = 1.0f / area;
       /*
-      _mesa_printf("%s one-over-area %f  area %f  det %f\n",
+      debug_printf("%s one-over-area %f  area %f  det %f\n",
                    __FUNCTION__, setup->oneoverarea, area, prim->det );
       */
    }
@@ -419,7 +419,7 @@ static void tri_linear_coeff( struct setup_stage *setup,
                    dady * (setup->vmin->data[0][1] - 0.5f)));
 
    /*
-   _mesa_printf("attr[%d].%c: %f dx:%f dy:%f\n",
+   debug_printf("attr[%d].%c: %f dx:%f dy:%f\n",
 		slot, "xyzw"[i], 
 		setup->coef[slot].a0[i],
 		setup->coef[slot].dadx[i],
@@ -453,10 +453,10 @@ static void tri_persp_coeff( struct setup_stage *setup,
    float dady = b * setup->oneoverarea;
       
    /*
-   printf("tri persp %d,%d: %f %f %f\n", vertSlot, i,
-          setup->vmin->data[vertSlot][i],
-          setup->vmid->data[vertSlot][i],
-          setup->vmax->data[vertSlot][i]
+   debug_printf("tri persp %d,%d: %f %f %f\n", vertSlot, i,
+          	setup->vmin->data[vertSlot][i],
+          	setup->vmid->data[vertSlot][i],
+       		setup->vmax->data[vertSlot][i]
           );
    */
    assert(i <= 3);
@@ -619,7 +619,7 @@ static void subtriangle( struct setup_stage *setup,
    finish_y -= sy;
 
    /*
-   _mesa_printf("%s %d %d\n", __FUNCTION__, start_y, finish_y);  
+   debug_printf("%s %d %d\n", __FUNCTION__, start_y, finish_y);  
    */
 
    for (y = start_y; y < finish_y; y++) {
@@ -671,7 +671,7 @@ static void setup_tri( struct draw_stage *stage,
    struct setup_stage *setup = setup_stage( stage );
 
    /*
-   _mesa_printf("%s\n", __FUNCTION__ );
+   debug_printf("%s\n", __FUNCTION__ );
    */
 
    setup_sort_vertices( setup, prim );
@@ -1124,7 +1124,7 @@ setup_point(struct draw_stage *stage, struct prim_header *prim)
          int ix, iy;
 
          /*
-         printf("(%f, %f) -> X:%d..%d Y:%d..%d\n", x, y, xmin, xmax,ymin,ymax);
+         debug_printf("(%f, %f) -> X:%d..%d Y:%d..%d\n", x, y, xmin, xmax,ymin,ymax);
          */
          for (iy = iymin; iy <= iymax; iy += 2) {
             uint rowMask = 0xf;
diff --git a/src/mesa/pipe/softpipe/sp_quad_fs.c b/src/mesa/pipe/softpipe/sp_quad_fs.c
index 90691c6065..b5d7dfca1c 100644
--- a/src/mesa/pipe/softpipe/sp_quad_fs.c
+++ b/src/mesa/pipe/softpipe/sp_quad_fs.c
@@ -223,13 +223,13 @@ shade_quad_llvm(struct quad_stage *qs,
    inputs[2][0][1] = fy + 1.0f;
    inputs[3][0][1] = fy + 1.0f;
 #if DLLVM
-   printf("MASK = %d\n", quad->mask);
+   debug_printf("MASK = %d\n", quad->mask);
 #endif
    gallivm_prog_inputs_interpolate(llvm, inputs, quad->coef);
 #if DLLVM
    for (int i = 0; i < 4; ++i) {
       for (int j = 0; j < 2; ++j) {
-         printf("IN(%d,%d) [%f %f %f %f]\n", i, j, 
+         debug_printf("IN(%d,%d) [%f %f %f %f]\n", i, j, 
                 inputs[i][j][0], inputs[i][j][1], inputs[i][j][2], inputs[i][j][3]);
       }
    }
@@ -240,7 +240,7 @@ shade_quad_llvm(struct quad_stage *qs,
                                    softpipe->mapped_constants[PIPE_SHADER_FRAGMENT],
                                    qss->samplers);
 #if DLLVM
-   printf("OUT LLVM = 1[%f %f %f %f], 2[%f %f %f %f]\n",
+   debug_printf("OUT LLVM = 1[%f %f %f %f], 2[%f %f %f %f]\n",
           dests[0][0][0], dests[0][0][1], dests[0][0][2], dests[0][0][3], 
           dests[0][1][0], dests[0][1][1], dests[0][1][2], dests[0][1][3]);
 #endif
@@ -260,7 +260,7 @@ shade_quad_llvm(struct quad_stage *qs,
    }
 #if DLLVM
    for (int i = 0; i < QUAD_SIZE; ++i) {
-      printf("QLLVM%d(%d) [%f, %f, %f, %f]\n", i, qss->colorOutSlot,
+      debug_printf("QLLVM%d(%d) [%f, %f, %f, %f]\n", i, qss->colorOutSlot,
              quad->outputs.color[0][i],
              quad->outputs.color[1][i],
              quad->outputs.color[2][i],
@@ -284,7 +284,7 @@ shade_quad_llvm(struct quad_stage *qs,
       }
    }
 #if DLLVM
-   printf("D [%f, %f, %f, %f] mask = %d\n",
+   debug_printf("D [%f, %f, %f, %f] mask = %d\n",
              quad->outputs.depth[0],
              quad->outputs.depth[1],
              quad->outputs.depth[2],
diff --git a/src/mesa/pipe/softpipe/sp_tile_cache.c b/src/mesa/pipe/softpipe/sp_tile_cache.c
index ccf367a5e4..1597361b82 100644
--- a/src/mesa/pipe/softpipe/sp_tile_cache.c
+++ b/src/mesa/pipe/softpipe/sp_tile_cache.c
@@ -341,7 +341,7 @@ sp_tile_cache_flush_clear(struct pipe_context *pipe,
       }
    }
 #if 0
-   printf("num cleared: %u\n", numCleared);
+   debug_printf("num cleared: %u\n", numCleared);
 #endif
 }
 
@@ -384,7 +384,7 @@ sp_flush_tile_cache(struct softpipe_context *softpipe,
 #endif
 
 #if 0
-   printf("flushed tiles in use: %d\n", inuse);
+   debug_printf("flushed tiles in use: %d\n", inuse);
 #endif
 }
 
diff --git a/src/mesa/pipe/tgsi/exec/tgsi_exec.c b/src/mesa/pipe/tgsi/exec/tgsi_exec.c
index dcc39362a9..463ff0d9da 100644
--- a/src/mesa/pipe/tgsi/exec/tgsi_exec.c
+++ b/src/mesa/pipe/tgsi/exec/tgsi_exec.c
@@ -143,7 +143,7 @@ tgsi_exec_prepare( struct tgsi_exec_machine *mach )
 
    k = tgsi_parse_init( &parse, mach->Tokens );
    if (k != TGSI_PARSE_OK) {
-      fprintf(stderr, "Problem parsing!\n");
+      debug_printf("Problem parsing!\n");
       return;
    }
 
@@ -249,7 +249,7 @@ tgsi_exec_machine_init(
 
    k = tgsi_parse_init (&parse, mach->Tokens);
    if (k != TGSI_PARSE_OK) {
-      fprintf( stderr, "Problem parsing!\n" );
+      debug_printf( "Problem parsing!\n" );
       return;
    }
 
@@ -1236,7 +1236,7 @@ exec_tex(struct tgsi_exec_machine *mach,
    uint chan_index;
    float lodBias;
 
-   /*   printf("Sampler %u unit %u\n", sampler, unit); */
+   /*   debug_printf("Sampler %u unit %u\n", sampler, unit); */
 
    switch (inst->InstructionExtTexture.Texture) {
    case TGSI_TEXTURE_1D:
diff --git a/src/mesa/pipe/tgsi/exec/tgsi_sse2.c b/src/mesa/pipe/tgsi/exec/tgsi_sse2.c
index df0c698301..f2180082f1 100755
--- a/src/mesa/pipe/tgsi/exec/tgsi_sse2.c
+++ b/src/mesa/pipe/tgsi/exec/tgsi_sse2.c
@@ -48,28 +48,28 @@ _print_reg(
    case file_REG32:
       switch( reg.idx ) {
       case reg_AX:
-         printf( "EAX" );
+         debug_printf( "EAX" );
          break;
       case reg_CX:
-         printf( "ECX" );
+         debug_printf( "ECX" );
          break;
       case reg_DX:
-         printf( "EDX" );
+         debug_printf( "EDX" );
          break;
       case reg_BX:
-         printf( "EBX" );
+         debug_printf( "EBX" );
          break;
       case reg_SP:
-         printf( "ESP" );
+         debug_printf( "ESP" );
          break;
       case reg_BP:
-         printf( "EBP" );
+         debug_printf( "EBP" );
          break;
       case reg_SI:
-         printf( "ESI" );
+         debug_printf( "ESI" );
          break;
       case reg_DI:
-         printf( "EDI" );
+         debug_printf( "EDI" );
          break;
       }
       break;
@@ -77,7 +77,7 @@ _print_reg(
       assert( 0 );
       break;
    case file_XMM:
-      printf( "XMM%u", reg.idx );
+      debug_printf( "XMM%u", reg.idx );
       break;
    case file_x87:
       assert( 0 );
@@ -92,35 +92,35 @@ _fill(
    unsigned count = 10 - strlen( op );
 
    while( count-- ) {
-      printf( " " );
+      debug_printf( " " );
    }
 }
 
-#define DUMP_START() printf( "\nsse-dump start ----------------" )
-#define DUMP_END() printf( "\nsse-dump end ----------------\n" )
-#define DUMP( OP ) printf( "\n%s", OP )
+#define DUMP_START() debug_printf( "\nsse-dump start ----------------" )
+#define DUMP_END() debug_printf( "\nsse-dump end ----------------\n" )
+#define DUMP( OP ) debug_printf( "\n%s", OP )
 #define DUMP_I( OP, I ) do {\
-   printf( "\n%s", OP );\
+   debug_printf( "\n%s", OP );\
    _fill( OP );\
-   printf( "%u", I ); } while( 0 )
+   debug_printf( "%u", I ); } while( 0 )
 #define DUMP_R( OP, R0 ) do {\
-   printf( "\n%s", OP );\
+   debug_printf( "\n%s", OP );\
    _fill( OP );\
    _print_reg( R0 ); } while( 0 )
 #define DUMP_RR( OP, R0, R1 ) do {\
-   printf( "\n%s", OP );\
+   debug_printf( "\n%s", OP );\
    _fill( OP );\
    _print_reg( R0 );\
-   printf( ", " );\
+   debug_printf( ", " );\
    _print_reg( R1 ); } while( 0 )
 #define DUMP_RRI( OP, R0, R1, I ) do {\
-   printf( "\n%s", OP );\
+   debug_printf( "\n%s", OP );\
    _fill( OP );\
    _print_reg( R0 );\
-   printf( ", " );\
+   debug_printf( ", " );\
    _print_reg( R1 );\
-   printf( ", " );\
-   printf( "%u", I ); } while( 0 )
+   debug_printf( ", " );\
+   debug_printf( "%u", I ); } while( 0 )
 
 #else
 
diff --git a/src/mesa/pipe/tgsi/util/tgsi_build.c b/src/mesa/pipe/tgsi/util/tgsi_build.c
index 67f7d2c2c2..a00ff1c2a5 100644
--- a/src/mesa/pipe/tgsi/util/tgsi_build.c
+++ b/src/mesa/pipe/tgsi/util/tgsi_build.c
@@ -1,3 +1,4 @@
+#include "pipe/p_debug.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi_build.h"
diff --git a/src/mesa/pipe/tgsi/util/tgsi_dump.c b/src/mesa/pipe/tgsi/util/tgsi_dump.c
index cdbc0dbc9c..b5c54847e0 100644
--- a/src/mesa/pipe/tgsi/util/tgsi_dump.c
+++ b/src/mesa/pipe/tgsi/util/tgsi_dump.c
@@ -25,6 +25,9 @@
  * 
  **************************************************************************/
 
+#include <stdio.h> 
+
+#include "pipe/p_debug.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi_dump.h"
diff --git a/src/mesa/pipe/tgsi/util/tgsi_parse.c b/src/mesa/pipe/tgsi/util/tgsi_parse.c
index f0f8d44ac2..bf6b89ce56 100644
--- a/src/mesa/pipe/tgsi/util/tgsi_parse.c
+++ b/src/mesa/pipe/tgsi/util/tgsi_parse.c
@@ -25,6 +25,7 @@
  * 
  **************************************************************************/
 
+#include "pipe/p_debug.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi_parse.h"
diff --git a/src/mesa/pipe/tgsi/util/tgsi_util.c b/src/mesa/pipe/tgsi/util/tgsi_util.c
index 1e76b0f133..4cdd89182a 100644
--- a/src/mesa/pipe/tgsi/util/tgsi_util.c
+++ b/src/mesa/pipe/tgsi/util/tgsi_util.c
@@ -1,3 +1,4 @@
+#include "pipe/p_debug.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi_parse.h"
-- 
cgit v1.2.3


From af6b12cc76b40c86f3b144a7f5cd3ef1278863d0 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Thu, 7 Feb 2008 01:07:49 +0900
Subject: gallium: Bring latest fixes.

---
 src/mesa/pipe/p_debug.h      |  9 ++++++++-
 src/mesa/pipe/util/p_debug.c | 16 +++++++++++-----
 2 files changed, 19 insertions(+), 6 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/p_debug.h b/src/mesa/pipe/p_debug.h
index b037eba2a3..2a11627b36 100644
--- a/src/mesa/pipe/p_debug.h
+++ b/src/mesa/pipe/p_debug.h
@@ -38,6 +38,10 @@
 #ifndef P_DEBUG_H_
 #define P_DEBUG_H_
 
+
+#include <stdarg.h>
+
+
 #ifdef	__cplusplus
 extern "C" {
 #endif
@@ -55,8 +59,12 @@ extern "C" {
 
 
 void debug_printf(const char *format, ...);
+
+void debug_vprintf(const char *format, va_list ap);
+
 void debug_assert_fail(const char *expr, const char *file, unsigned line);
 
+
 /** Assert macro */
 #ifdef DEBUG
 #define debug_assert(expr) ((expr) ? (void)0 : debug_assert_fail(#expr, __FILE__, __LINE__))
@@ -66,7 +74,6 @@ void debug_assert_fail(const char *expr, const char *file, unsigned line);
 
 
 #ifdef assert
-#warning Standard C Library assert macro usage detected. 
 #undef assert
 #endif
 #define assert(expr) debug_assert(expr)
diff --git a/src/mesa/pipe/util/p_debug.c b/src/mesa/pipe/util/p_debug.c
index 9303c970cc..b9607a6ba7 100644
--- a/src/mesa/pipe/util/p_debug.c
+++ b/src/mesa/pipe/util/p_debug.c
@@ -40,16 +40,22 @@
 #include "pipe/p_compiler.h" 
 
 
-void debug_printf(const char *format, ...)
+void debug_vprintf(const char *format, va_list ap)
 {
-   va_list ap;
-   va_start( ap, format );  
 #ifdef WIN32
    EngDebugPrint("Gallium3D: ", (PCHAR)format, ap);
 #else
    vfprintf(stderr, format, ap);
 #endif
-   va_end( ap );
+}
+
+
+void debug_printf(const char *format, ...)
+{
+   va_list ap;
+   va_start(ap, format);
+   debug_vprintf(format, ap);
+   va_end(ap);
 }
 
 
@@ -65,6 +71,6 @@ static INLINE void debug_abort(void)
 
 void debug_assert_fail(const char *expr, const char *file, unsigned line) 
 {
-   debug_printf("%s:%i: Assertion `%s' failed.");
+   debug_printf("%s:%i: Assertion `%s' failed.\n", file, line, expr);
    debug_abort();
 }
-- 
cgit v1.2.3


From 4650b35846e8e87fb0d74573a5f66452bb449b4b Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 5 Feb 2008 16:32:15 -0700
Subject: gallium: add bitmap/drawpixels texcoord bias support

The state tracker will call pipe->get_paramf(PIPE_CAP_BITMAP_TEXCOORD_BIAS)
to get a bias factor for adjusting the texcoords used in bitmap/drawpixels.
This allows us to compensate for small differences in rasterization from
one device to another.
---
 src/mesa/pipe/p_defines.h                 | 2 +-
 src/mesa/state_tracker/st_cb_drawpixels.c | 7 +++++--
 src/mesa/state_tracker/st_context.h       | 2 ++
 src/mesa/state_tracker/st_extensions.c    | 3 +++
 4 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/p_defines.h b/src/mesa/pipe/p_defines.h
index 85adf2d61d..0bf53ecb79 100644
--- a/src/mesa/pipe/p_defines.h
+++ b/src/mesa/pipe/p_defines.h
@@ -265,6 +265,6 @@ enum pipe_texture_target {
 #define PIPE_CAP_MAX_POINT_WIDTH_AA      17
 #define PIPE_CAP_MAX_TEXTURE_ANISOTROPY  18
 #define PIPE_CAP_MAX_TEXTURE_LOD_BIAS    19
-
+#define PIPE_CAP_BITMAP_TEXCOORD_BIAS    20
 
 #endif
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index 6b44cba2e4..34d420fcff 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -581,10 +581,13 @@ draw_quad_colored(GLcontext *ctx, GLfloat x0, GLfloat y0, GLfloat z,
                   GLfloat x1, GLfloat y1, const GLfloat *color,
                   GLboolean invertTex)
 {
+   GLfloat bias = ctx->st->bitmap_texcoord_bias;
    GLfloat verts[4][3][4]; /* four verts, three attribs, XYZW */
    GLuint i;
-   GLfloat sLeft = 0.0, sRight = 1.0;
-   GLfloat tTop = invertTex, tBot = 1.0 - tTop;
+   GLfloat xBias = bias / (x1-x0);
+   GLfloat yBias = bias / (y1-y0);
+   GLfloat sLeft = 0.0 + xBias, sRight = 1.0 + xBias;
+   GLfloat tTop = invertTex - yBias, tBot = 1.0 - tTop - yBias;
 
    /* upper-left */
    verts[0][0][0] = x0;    /* attr[0].x */
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index 2b6f8743f3..a756055898 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -143,6 +143,8 @@ struct st_context
 
    GLfloat polygon_offset_scale; /* ?? */
 
+   GLfloat bitmap_texcoord_bias;
+
    /** Mapping from VERT_RESULT_x to post-transformed vertex slot */
    const GLuint *vertex_result_to_slot;
 
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index 0157bdd6b3..97d28d77c4 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -106,6 +106,9 @@ void st_init_limits(struct st_context *st)
 
    c->MaxTextureLodBias
       = pipe->get_paramf(pipe, PIPE_CAP_MAX_TEXTURE_LOD_BIAS);
+
+   st->bitmap_texcoord_bias
+      = pipe->get_paramf(pipe, PIPE_CAP_BITMAP_TEXCOORD_BIAS);
 }
 
 
-- 
cgit v1.2.3


From 71984d76aae937274f6dd08c24f995d3c0c06357 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 5 Feb 2008 17:46:31 -0700
Subject: clean-up

---
 src/mesa/state_tracker/st_atom_texture.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

(limited to 'src')

diff --git a/src/mesa/state_tracker/st_atom_texture.c b/src/mesa/state_tracker/st_atom_texture.c
index fb21d29c40..b3859f18cb 100644
--- a/src/mesa/state_tracker/st_atom_texture.c
+++ b/src/mesa/state_tracker/st_atom_texture.c
@@ -36,7 +36,6 @@
 #include "st_atom.h"
 #include "st_cb_texture.h"
 #include "pipe/p_context.h"
-#include "pipe/p_defines.h"
 
 
 /**
@@ -46,18 +45,14 @@
 static void 
 update_textures(struct st_context *st)
 {
-   GLuint s;
-
    /* ST_NEW_FRAGMENT_PROGRAM
     */
    struct gl_fragment_program *fprog = st->ctx->FragmentProgram._Current;
+   GLuint unit;
 
-   for (s = 0; s < st->ctx->Const.MaxTextureCoordUnits; s++) {
-      GLuint su = fprog->Base.SamplerUnits[s];
-      
-      struct gl_texture_object *texObj
-         = st->ctx->Texture.Unit[su]._Current;
-
+   for (unit = 0; unit < st->ctx->Const.MaxTextureCoordUnits; unit++) {
+      const GLuint su = fprog->Base.SamplerUnits[unit];
+      struct gl_texture_object *texObj = st->ctx->Texture.Unit[su]._Current;
       struct pipe_texture *pt;
 
       if (texObj) {
@@ -75,9 +70,9 @@ update_textures(struct st_context *st)
        * this table before being deleted, otherwise the pointer
        * comparison below could fail.
        */
-      if (st->state.sampler_texture[s] != pt) {
-	 st->state.sampler_texture[s] = pt;
-	 st->pipe->set_sampler_texture(st->pipe, s, pt);
+      if (st->state.sampler_texture[unit] != pt) {
+	 st->state.sampler_texture[unit] = pt;
+	 st->pipe->set_sampler_texture(st->pipe, unit, pt);
       }
    }
 }
-- 
cgit v1.2.3


From a4fbf096734efca2100aff41e988cce26ced5f6f Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 5 Feb 2008 17:55:16 -0700
Subject: comments, clean-ups, consts

---
 src/mesa/state_tracker/st_texture.c | 30 ++++++++++++++++++------------
 src/mesa/state_tracker/st_texture.h |  8 ++++----
 2 files changed, 22 insertions(+), 16 deletions(-)

(limited to 'src')

diff --git a/src/mesa/state_tracker/st_texture.c b/src/mesa/state_tracker/st_texture.c
index 15cc458be8..741f36c2a7 100644
--- a/src/mesa/state_tracker/st_texture.c
+++ b/src/mesa/state_tracker/st_texture.c
@@ -59,6 +59,10 @@ target_to_target(GLenum target)
 }
 #endif
 
+
+/**
+ * Allocate a new pipe_texture object
+ */
 struct pipe_texture *
 st_texture_create(struct st_context *st,
                   enum pipe_texture_target target,
@@ -100,17 +104,16 @@ st_texture_create(struct st_context *st,
 }
 
 
-
-
-/* Can the image be pulled into a unified mipmap texture.  This mirrors
- * the completeness test in a lot of ways.
+/**
+ * Check if a texture image can be pulled into a unified mipmap texture.
+ * This mirrors the completeness test in a lot of ways.
  *
  * Not sure whether I want to pass gl_texture_image here.
  */
 GLboolean
-st_texture_match_image(struct pipe_texture *pt,
-                          struct gl_texture_image *image,
-                          GLuint face, GLuint level)
+st_texture_match_image(const struct pipe_texture *pt,
+                       const struct gl_texture_image *image,
+                       GLuint face, GLuint level)
 {
    /* Images with borders are never pulled into mipmap textures. 
     */
@@ -189,6 +192,7 @@ st_texture_image_map(struct st_context *st, struct st_texture_image *stImage,
    return pipe_surface_map(stImage->surface);
 }
 
+
 void
 st_texture_image_unmap(struct st_texture_image *stImage)
 {
@@ -201,7 +205,8 @@ st_texture_image_unmap(struct st_texture_image *stImage)
 
 
-/* Upload data to a rectangular sub-region.  Lots of choices how to do this:
+/**
+ * Upload data to a rectangular sub-region.  Lots of choices how to do this:
  *
  * - memcpy by span to current destination
  * - upload data as new buffer and blit
@@ -261,13 +266,14 @@ st_texture_image_data(struct pipe_context *pipe,
    }
 }
 
+
 /* Copy mipmap image between textures
  */
 void
 st_texture_image_copy(struct pipe_context *pipe,
-                         struct pipe_texture *dst,
-                         GLuint face, GLuint level,
-                         struct pipe_texture *src)
+                      struct pipe_texture *dst,
+                      GLuint face, GLuint level,
+                      struct pipe_texture *src)
 {
    GLuint width = src->width[level];
    GLuint height = src->height[level];
@@ -278,6 +284,7 @@ st_texture_image_copy(struct pipe_context *pipe,
 
    if (dst->compressed)
       height /= 4;
+
    for (i = 0; i < depth; i++) {
       dst_surface = pipe->get_tex_surface(pipe, dst, face, level, i);
       src_surface = pipe->get_tex_surface(pipe, src, face, level, i);
@@ -292,5 +299,4 @@ st_texture_image_copy(struct pipe_context *pipe,
       pipe_surface_reference(&dst_surface, NULL);
       pipe_surface_reference(&src_surface, NULL);
    }
-
 }
diff --git a/src/mesa/state_tracker/st_texture.h b/src/mesa/state_tracker/st_texture.h
index d8b1bcad9d..0b87a494c3 100644
--- a/src/mesa/state_tracker/st_texture.h
+++ b/src/mesa/state_tracker/st_texture.h
@@ -47,11 +47,11 @@ st_texture_create(struct st_context *st,
                   GLuint compress_byte);
 
 
-/* Check if an image fits an existing texture
+/* Check if an image fits into an existing texture object.
  */
 extern GLboolean
-st_texture_match_image(struct pipe_texture *pt,
-                       struct gl_texture_image *image,
+st_texture_match_image(const struct pipe_texture *pt,
+                       const struct gl_texture_image *image,
                        GLuint face, GLuint level);
 
 /* Return a pointer to an image within a texture.  Return image stride as
@@ -73,7 +73,7 @@ extern const GLuint *
 st_texture_depth_offsets(struct pipe_texture *pt, GLuint level);
 
 
-/* Return the linear offset of an image relative to the start of its region:
+/* Return the linear offset of an image relative to the start of its region.
  */
 extern GLuint
 st_texture_image_offset(const struct pipe_texture *pt,
-- 
cgit v1.2.3


From 105b3596be6c0644e3aaa0823ab7e27aa76fa909 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 5 Feb 2008 18:01:18 -0700
Subject: added comment

---
 src/mesa/state_tracker/st_atom_texture.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'src')

diff --git a/src/mesa/state_tracker/st_atom_texture.c b/src/mesa/state_tracker/st_atom_texture.c
index b3859f18cb..2a836d630b 100644
--- a/src/mesa/state_tracker/st_atom_texture.c
+++ b/src/mesa/state_tracker/st_atom_texture.c
@@ -59,6 +59,7 @@ update_textures(struct st_context *st)
          GLboolean flush, retval;
 
          retval = st_finalize_texture(st->ctx, st->pipe, texObj, &flush);
+         /* XXX retval indicates whether there's a texture border */
 
          pt = st_get_texobj_texture(texObj);
       }
-- 
cgit v1.2.3


From afc54983370033b65e3a7cbb29bd9c87156f0881 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 5 Feb 2008 18:13:28 -0700
Subject: clean-ups

---
 src/mesa/state_tracker/st_cb_texture.c | 67 +++++++++++++++-------------------
 1 file changed, 29 insertions(+), 38 deletions(-)

(limited to 'src')

diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index ba0950e295..eee94baa20 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -76,13 +76,13 @@ struct st_texture_object
 
 
-
 static INLINE struct st_texture_object *
 st_texture_object(struct gl_texture_object *obj)
 {
    return (struct st_texture_object *) obj;
 }
 
+
 static INLINE struct st_texture_image *
 st_texture_image(struct gl_texture_image *img)
 {
@@ -122,32 +122,28 @@ gl_target_to_pipe(GLenum target)
 }
 
 
+/**
+ * Return nominal bytes per texel for a compressed format, 0 for non-compressed
+ * format.
+ */
 static int
 compressed_num_bytes(GLuint mesaFormat)
 {
-   int bytes = 0;
    switch(mesaFormat) {
-     
    case MESA_FORMAT_RGB_FXT1:
    case MESA_FORMAT_RGBA_FXT1:
    case MESA_FORMAT_RGB_DXT1:
    case MESA_FORMAT_RGBA_DXT1:
-     bytes = 2;
-     break;
-     
+      return 2;
    case MESA_FORMAT_RGBA_DXT3:
    case MESA_FORMAT_RGBA_DXT5:
-     bytes = 4;
+      return 4;
    default:
-     break;
+      return 0;
    }
-   
-   return bytes;
 }
 
 
-
-
 static GLboolean
 st_IsTextureResident(GLcontext * ctx, struct gl_texture_object *texObj)
 {
@@ -164,7 +160,6 @@ st_IsTextureResident(GLcontext * ctx, struct gl_texture_object *texObj)
 }
 
 
-
 static struct gl_texture_image *
 st_NewTextureImage(GLcontext * ctx)
 {
@@ -216,8 +211,6 @@ st_FreeTextureImageData(GLcontext * ctx, struct gl_texture_image *texImage)
 }
 
 
-
-
 /* ================================================================
  * From linux kernel i386 header files, copes with odd sizes better
  * than COPY_DWORDS would:
@@ -302,7 +295,7 @@ logbase2(int n)
 static void
 guess_and_alloc_texture(struct st_context *st,
 			struct st_texture_object *stObj,
-			struct st_texture_image *stImage)
+			const struct st_texture_image *stImage)
 {
    GLuint firstLevel;
    GLuint lastLevel;
@@ -487,21 +480,18 @@ try_pbo_upload(GLcontext *ctx,
 
 
-
-
-
-
 static void
 st_TexImage(GLcontext * ctx,
-              GLint dims,
-              GLenum target, GLint level,
-              GLint internalFormat,
-              GLint width, GLint height, GLint depth,
-              GLint border,
-              GLenum format, GLenum type, const void *pixels,
-              const struct gl_pixelstore_attrib *unpack,
-              struct gl_texture_object *texObj,
-              struct gl_texture_image *texImage, GLsizei imageSize, int compressed)
+            GLint dims,
+            GLenum target, GLint level,
+            GLint internalFormat,
+            GLint width, GLint height, GLint depth,
+            GLint border,
+            GLenum format, GLenum type, const void *pixels,
+            const struct gl_pixelstore_attrib *unpack,
+            struct gl_texture_object *texObj,
+            struct gl_texture_image *texImage,
+            GLsizei imageSize, int compressed)
 {
    struct st_texture_object *stObj = st_texture_object(texObj);
    struct st_texture_image *stImage = st_texture_image(texImage);
@@ -524,7 +514,7 @@ st_TexImage(GLcontext * ctx,
 
    /* choose the texture format */
    texImage->TexFormat = st_ChooseTextureFormat(ctx, internalFormat,
-                                                  format, type);
+                                                format, type);
 
    _mesa_set_fetch_functions(texImage, dims);
 
@@ -536,7 +526,8 @@ st_TexImage(GLcontext * ctx,
 	 ctx->Driver.CompressedTextureSize(ctx, texImage->Width,
 					   texImage->Height, texImage->Depth,
 					   texImage->TexFormat->MesaFormat);
-   } else {
+   }
+   else {
       texelBytes = texImage->TexFormat->TexelBytes;
       
       /* Minimum pitch of 32 bytes */
@@ -669,7 +660,7 @@ st_TexImage(GLcontext * ctx,
     * conversion and copy:
     */
    if (compressed) {
-     memcpy(texImage->Data, pixels, imageSize);
+      memcpy(texImage->Data, pixels, imageSize);
    }
    else {
       GLuint srcImageStride = _mesa_image_image_stride(unpack, width, height,
@@ -1401,7 +1392,10 @@ copy_image_data_to_texture(struct st_context *st,
 }
 
 
-/*  
+/**
+ * Called during state validation.  When this function is finished,
+ * the texture object should be ready for rendering.
+ * \return GL_FALSE if a texture border is present, GL_TRUE otherwise
  */
 GLboolean
 st_finalize_texture(GLcontext *ctx,
@@ -1410,11 +1404,10 @@ st_finalize_texture(GLcontext *ctx,
 		    GLboolean *needFlush)
 {
    struct st_texture_object *stObj = st_texture_object(tObj);
+   const GLuint nr_faces = (stObj->base.Target == GL_TEXTURE_CUBE_MAP) ? 6 : 1;
    int comp_byte = 0;
    int cpp;
-
    GLuint face, i;
-   GLuint nr_faces = 0;
    struct st_texture_image *firstImage;
 
    *needFlush = GL_FALSE;
@@ -1426,8 +1419,7 @@ st_finalize_texture(GLcontext *ctx,
    /* What levels must the texture include at a minimum?
     */
    calculate_first_last_level(stObj);
-   firstImage =
-      st_texture_image(stObj->base.Image[0][stObj->firstLevel]);
+   firstImage = st_texture_image(stObj->base.Image[0][stObj->firstLevel]);
 
    /* Fallback case:
     */
@@ -1503,7 +1495,6 @@ st_finalize_texture(GLcontext *ctx,
 
    /* Pull in any images not in the object's texture:
     */
-   nr_faces = (stObj->base.Target == GL_TEXTURE_CUBE_MAP) ? 6 : 1;
    for (face = 0; face < nr_faces; face++) {
       for (i = stObj->firstLevel; i <= stObj->lastLevel; i++) {
          struct st_texture_image *stImage =
-- 
cgit v1.2.3


From 4c2f3dbca940f289e67248682b84a3516d5a3031 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 5 Feb 2008 18:15:03 -0700
Subject: Added ctx->Driver.GenerateMipmap() driver hook

---
 src/mesa/drivers/common/driverfuncs.c |  2 ++
 src/mesa/main/dd.h                    |  7 +++++
 src/mesa/main/fbobject.c              |  2 +-
 src/mesa/main/texstore.c              | 48 +++++++++++++++++------------------
 4 files changed, 34 insertions(+), 25 deletions(-)

(limited to 'src')

diff --git a/src/mesa/drivers/common/driverfuncs.c b/src/mesa/drivers/common/driverfuncs.c
index 33caf7dae1..b5b383b4e4 100644
--- a/src/mesa/drivers/common/driverfuncs.c
+++ b/src/mesa/drivers/common/driverfuncs.c
@@ -28,6 +28,7 @@
 #include "buffers.h"
 #include "context.h"
 #include "framebuffer.h"
+#include "mipmap.h"
 #include "program.h"
 #include "prog_execute.h"
 #include "queryobj.h"
@@ -99,6 +100,7 @@ _mesa_init_driver_functions(struct dd_function_table *driver)
    driver->CopyTexSubImage1D = _swrast_copy_texsubimage1d;
    driver->CopyTexSubImage2D = _swrast_copy_texsubimage2d;
    driver->CopyTexSubImage3D = _swrast_copy_texsubimage3d;
+   driver->GenerateMipmap = _mesa_generate_mipmap;
    driver->TestProxyTexImage = _mesa_test_proxy_teximage;
    driver->CompressedTexImage1D = _mesa_store_compressed_teximage1d;
    driver->CompressedTexImage2D = _mesa_store_compressed_teximage2d;
diff --git a/src/mesa/main/dd.h b/src/mesa/main/dd.h
index 3bec3bd433..c2ef67ba6d 100644
--- a/src/mesa/main/dd.h
+++ b/src/mesa/main/dd.h
@@ -332,6 +332,13 @@ struct dd_function_table {
                               GLint x, GLint y,
                               GLsizei width, GLsizei height );
 
+   /**
+    * Called by glGenerateMipmap() or when GL_GENERATE_MIPMAP_SGIS is enabled.
+    */
+   void (*GenerateMipmap)(GLcontext *ctx,  GLenum target,
+                          const struct gl_texture_unit *texUnit,
+                          struct gl_texture_object *texObj);
+
    /**
     * Called by glTexImage[123]D when user specifies a proxy texture
     * target.  
diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c
index 963e35d678..13cbd35424 100644
--- a/src/mesa/main/fbobject.c
+++ b/src/mesa/main/fbobject.c
@@ -1560,7 +1560,7 @@ _mesa_GenerateMipmapEXT(GLenum target)
 
    /* XXX this might not handle cube maps correctly */
    _mesa_lock_texture(ctx, texObj);
-   _mesa_generate_mipmap(ctx, target, texUnit, texObj);
+   ctx->Driver.GenerateMipmap(ctx, target, texUnit, texObj);
    _mesa_unlock_texture(ctx, texObj);
 }
 
diff --git a/src/mesa/main/texstore.c b/src/mesa/main/texstore.c
index 30be65525e..26ca4f1bd5 100644
--- a/src/mesa/main/texstore.c
+++ b/src/mesa/main/texstore.c
@@ -2917,9 +2917,9 @@ _mesa_store_teximage1d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      _mesa_generate_mipmap(ctx, target,
-                            &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                            texObj);
+      ctx->Driver.GenerateMipmap(ctx, target,
+                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
+                                 texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -3003,9 +3003,9 @@ _mesa_store_teximage2d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      _mesa_generate_mipmap(ctx, target,
-                            &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                            texObj);
+      ctx->Driver.GenerateMipmap(ctx, target,
+                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
+                                 texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -3079,9 +3079,9 @@ _mesa_store_teximage3d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      _mesa_generate_mipmap(ctx, target,
-                            &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                            texObj);
+      ctx->Driver.GenerateMipmap(ctx, target,
+                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
+                                 texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -3127,9 +3127,9 @@ _mesa_store_texsubimage1d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      _mesa_generate_mipmap(ctx, target,
-                            &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                            texObj);
+      ctx->Driver.GenerateMipmap(ctx, target,
+                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
+                                 texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -3182,9 +3182,9 @@ _mesa_store_texsubimage2d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      _mesa_generate_mipmap(ctx, target,
-                            &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                            texObj);
+      ctx->Driver.GenerateMipmap(ctx, target,
+                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
+                                 texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -3237,9 +3237,9 @@ _mesa_store_texsubimage3d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      _mesa_generate_mipmap(ctx, target,
-                            &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                            texObj);
+      ctx->Driver.GenerateMipmap(ctx, target,
+                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
+                                 texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -3313,9 +3313,9 @@ _mesa_store_compressed_teximage2d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      _mesa_generate_mipmap(ctx, target,
-                            &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                            texObj);
+      ctx->Driver.GenerateMipmap(ctx, target,
+                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
+                                 texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, &ctx->Unpack);
@@ -3425,9 +3425,9 @@ _mesa_store_compressed_texsubimage2d(GLcontext *ctx, GLenum target,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      _mesa_generate_mipmap(ctx, target,
-                            &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                            texObj);
+      ctx->Driver.GenerateMipmap(ctx, target,
+                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
+                                 texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, &ctx->Unpack);
-- 
cgit v1.2.3


From 2440ff74d69a8caf49b05a960b4c7e282a96565e Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Tue, 5 Feb 2008 18:15:39 -0700
Subject: plug in ctx->Driver.GenerateMipmap function

---
 src/mesa/state_tracker/st_cb_texture.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'src')

diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index eee94baa20..15c5359360 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -30,6 +30,7 @@
 #include "main/enums.h"
 #include "main/image.h"
 #include "main/macros.h"
+#include "main/mipmap.h"
 #include "main/texcompress.h"
 #include "main/texformat.h"
 #include "main/teximage.h"
@@ -1531,6 +1532,7 @@ st_init_texture_functions(struct dd_function_table *functions)
    functions->CopyTexSubImage1D = st_CopyTexSubImage1D;
    functions->CopyTexSubImage2D = st_CopyTexSubImage2D;
    functions->CopyTexSubImage3D = st_CopyTexSubImage3D;
+   functions->GenerateMipmap = _mesa_generate_mipmap;
 
    functions->GetTexImage = st_GetTexImage;
 
-- 
cgit v1.2.3


From f52f5136e6eed23e55098681e5b082cc452136d6 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 6 Feb 2008 08:44:19 -0700
Subject: clean-ups in guess_and_alloc_texture()

---
 src/mesa/state_tracker/st_cb_texture.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'src')

diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 15c5359360..8db4a804ab 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -284,7 +284,12 @@ logbase2(int n)
 }
 
 
-/* Otherwise, store it in memory if (Border != 0) or (any dimension ==
+/**
+ * Allocate a pipe_texture object for the given st_texture_object using
+ * the given st_texture_image to guess the mipmap size/levels.
+ *
+ * [comments...]
+ * Otherwise, store it in memory if (Border != 0) or (any dimension ==
  * 1).
  *    
  * Otherwise, if max_level >= level >= min_level, create texture with
@@ -303,11 +308,12 @@ guess_and_alloc_texture(struct st_context *st,
    GLuint width = stImage->base.Width;
    GLuint height = stImage->base.Height;
    GLuint depth = stImage->base.Depth;
-   GLuint l2width, l2height, l2depth;
    GLuint i, comp_byte = 0;
 
    DBG("%s\n", __FUNCTION__);
 
+   assert(!stObj->pt);
+
    if (stImage->base.Border)
       return;
 
@@ -349,15 +355,15 @@ guess_and_alloc_texture(struct st_context *st,
       lastLevel = firstLevel;
    }
    else {
-      l2width = logbase2(width);
-      l2height = logbase2(height);
-      l2depth = logbase2(depth);
+      GLuint l2width = logbase2(width);
+      GLuint l2height = logbase2(height);
+      GLuint l2depth = logbase2(depth);
       lastLevel = firstLevel + MAX2(MAX2(l2width, l2height), l2depth);
    }
 
-   assert(!stObj->pt);
    if (stImage->base.IsCompressed)
       comp_byte = compressed_num_bytes(stImage->base.TexFormat->MesaFormat);
+
    stObj->pt = st_texture_create(st,
                                  gl_target_to_pipe(stObj->base.Target),
                                  st_mesa_format_to_pipe_format(stImage->base.TexFormat->MesaFormat),
-- 
cgit v1.2.3


From 31c98eafb043cbc82e5de206ceecc5888174b5e6 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 6 Feb 2008 09:24:30 -0700
Subject: gallium: change pipe->texture_create() to operate like the CSO
 functions

Now, pass in a template object and return a new object.
---
 src/mesa/pipe/cell/ppu/cell_texture.c     | 31 ++++++++++++++---------------
 src/mesa/pipe/cell/ppu/cell_texture.h     |  5 +++--
 src/mesa/pipe/i915simple/i915_texture.c   | 17 ++++++++--------
 src/mesa/pipe/i915simple/i915_texture.h   |  5 +++--
 src/mesa/pipe/i965simple/brw_tex_layout.c | 15 +++++++-------
 src/mesa/pipe/i965simple/brw_tex_layout.h |  4 ++--
 src/mesa/pipe/p_context.h                 |  4 ++--
 src/mesa/pipe/softpipe/sp_texture.c       | 33 +++++++++++++++----------------
 src/mesa/pipe/softpipe/sp_texture.h       |  5 +++--
 src/mesa/state_tracker/st_texture.c       | 31 ++++++++++++-----------------
 10 files changed, 73 insertions(+), 77 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/ppu/cell_texture.c b/src/mesa/pipe/cell/ppu/cell_texture.c
index 2cf6022939..df178d9ca2 100644
--- a/src/mesa/pipe/cell/ppu/cell_texture.c
+++ b/src/mesa/pipe/cell/ppu/cell_texture.c
@@ -79,31 +79,30 @@ cell_texture_layout(struct cell_texture * spt)
 }
 
 
-void
-cell_texture_create(struct pipe_context *pipe, struct pipe_texture **pt)
+struct pipe_texture *
+cell_texture_create(struct pipe_context *pipe, const struct pipe_texture *templat)
 {
-   struct cell_texture *spt = REALLOC(*pt, sizeof(struct pipe_texture),
-					  sizeof(struct cell_texture));
+   struct cell_texture *spt = CALLOC_STRUCT(cell_texture);
+   if (!spt)
+      return NULL;
 
-   if (spt) {
-      memset(&spt->base + 1, 0,
-	     sizeof(struct cell_texture) - sizeof(struct pipe_texture));
+   spt->base = *templat;
 
-      cell_texture_layout(spt);
+   cell_texture_layout(spt);
 
-      spt->buffer = pipe->winsys->buffer_create(pipe->winsys, 32,
-                                                PIPE_BUFFER_USAGE_PIXEL,
-                                                spt->buffer_size);
+   spt->buffer = pipe->winsys->buffer_create(pipe->winsys, 32,
+                                             PIPE_BUFFER_USAGE_PIXEL,
+                                             spt->buffer_size);
 
-      if (!spt->buffer) {
-	 FREE(spt);
-	 spt = NULL;
-      }
+   if (!spt->buffer) {
+      FREE(spt);
+      return NULL;
    }
 
-   *pt = &spt->base;
+   return &spt->base;
 }
 
+
 void
 cell_texture_release(struct pipe_context *pipe, struct pipe_texture **pt)
 {
diff --git a/src/mesa/pipe/cell/ppu/cell_texture.h b/src/mesa/pipe/cell/ppu/cell_texture.h
index bd434c8776..0264fed88e 100644
--- a/src/mesa/pipe/cell/ppu/cell_texture.h
+++ b/src/mesa/pipe/cell/ppu/cell_texture.h
@@ -60,8 +60,9 @@ cell_texture(struct pipe_texture *pt)
 
 
-extern void
-cell_texture_create(struct pipe_context *pipe, struct pipe_texture **pt);
+extern struct pipe_texture *
+cell_texture_create(struct pipe_context *pipe,
+                    const struct pipe_texture *templat);
 
 extern void
 cell_texture_release(struct pipe_context *pipe, struct pipe_texture **pt);
diff --git a/src/mesa/pipe/i915simple/i915_texture.c b/src/mesa/pipe/i915simple/i915_texture.c
index 61944fe7d9..6faeab134a 100644
--- a/src/mesa/pipe/i915simple/i915_texture.c
+++ b/src/mesa/pipe/i915simple/i915_texture.c
@@ -477,17 +477,17 @@ i945_miptree_layout(struct pipe_context *pipe, struct i915_texture * tex)
    return TRUE;
 }
 
-void
-i915_texture_create(struct pipe_context *pipe, struct pipe_texture **pt)
+
+struct pipe_texture *
+i915_texture_create(struct pipe_context *pipe,
+                    const struct pipe_texture *templat)
 {
-   struct i915_texture *tex = REALLOC(*pt, sizeof(struct pipe_texture),
-				      sizeof(struct i915_texture));
+   struct i915_texture *tex = CALLOC_STRUCT(i915_texture);
 
    if (tex) {
       struct i915_context *i915 = i915_context(pipe);
 
-      memset(&tex->base + 1, 0,
-	     sizeof(struct i915_texture) - sizeof(struct pipe_texture));
+      tex->base = *templat;
 
       if (i915->flags.is_i945 ? i945_miptree_layout(pipe, tex) :
 	  i915_miptree_layout(pipe, tex))
@@ -498,13 +498,14 @@ i915_texture_create(struct pipe_context *pipe, struct pipe_texture **pt)
 
       if (!tex->buffer) {
 	 FREE(tex);
-	 tex = NULL;
+	 return NULL;
       }
    }
 
-   *pt = &tex->base;
+   return &tex->base;
 }
 
+
 void
 i915_texture_release(struct pipe_context *pipe, struct pipe_texture **pt)
 {
diff --git a/src/mesa/pipe/i915simple/i915_texture.h b/src/mesa/pipe/i915simple/i915_texture.h
index 84a0502e81..330d111dc7 100644
--- a/src/mesa/pipe/i915simple/i915_texture.h
+++ b/src/mesa/pipe/i915simple/i915_texture.h
@@ -6,8 +6,9 @@ struct pipe_context;
 struct pipe_texture;
 
 
-extern void
-i915_texture_create(struct pipe_context *pipe, struct pipe_texture **pt);
+struct pipe_texture *
+i915_texture_create(struct pipe_context *pipe,
+                    const struct pipe_texture *templat);
 
 extern void
 i915_texture_release(struct pipe_context *pipe, struct pipe_texture **pt);
diff --git a/src/mesa/pipe/i965simple/brw_tex_layout.c b/src/mesa/pipe/i965simple/brw_tex_layout.c
index b8b6b579e2..405fd1f794 100644
--- a/src/mesa/pipe/i965simple/brw_tex_layout.c
+++ b/src/mesa/pipe/i965simple/brw_tex_layout.c
@@ -299,15 +299,14 @@ static boolean brw_miptree_layout(struct pipe_context *pipe, struct brw_texture
    return TRUE;
 }
 
-void
-brw_texture_create(struct pipe_context *pipe, struct pipe_texture **pt)
+
+struct pipe_texture *
+brw_texture_create(struct pipe_context *pipe, const struct pipe_texture *templat)
 {
-   struct brw_texture *tex = REALLOC(*pt, sizeof(struct pipe_texture),
-                                     sizeof(struct brw_texture));
+   struct brw_texture *tex = CALLOC_STRUCT(brw_texture);
 
    if (tex) {
-      memset(&tex->base + 1, 0,
-	     sizeof(struct brw_texture) - sizeof(struct pipe_texture));
+      tex->base = *templat;
 
       if (brw_miptree_layout(pipe, tex))
 	 tex->buffer = pipe->winsys->buffer_create(pipe->winsys, 64,
@@ -317,11 +316,11 @@ brw_texture_create(struct pipe_context *pipe, struct pipe_texture **pt)
 
       if (!tex->buffer) {
 	 FREE(tex);
-	 tex = NULL;
+         return NULL;
       }
    }
 
-   *pt = &tex->base;
+   return &tex->base;
 }
 
 void
diff --git a/src/mesa/pipe/i965simple/brw_tex_layout.h b/src/mesa/pipe/i965simple/brw_tex_layout.h
index 15e275058a..cfd6b1ef3a 100644
--- a/src/mesa/pipe/i965simple/brw_tex_layout.h
+++ b/src/mesa/pipe/i965simple/brw_tex_layout.h
@@ -6,8 +6,8 @@
 struct pipe_context;
 struct pipe_texture;
 
-extern void
-brw_texture_create(struct pipe_context *pipe, struct pipe_texture **pt);
+extern struct pipe_texture *
+brw_texture_create(struct pipe_context *pipe, const struct pipe_texture *templat);
 
 extern void
 brw_texture_release(struct pipe_context *pipe, struct pipe_texture **pt);
diff --git a/src/mesa/pipe/p_context.h b/src/mesa/pipe/p_context.h
index 0dda06c53b..92a1cd70c4 100644
--- a/src/mesa/pipe/p_context.h
+++ b/src/mesa/pipe/p_context.h
@@ -199,8 +199,8 @@ struct pipe_context {
    /*
     * Texture functions
     */
-   void (*texture_create)(struct pipe_context *pipe,
-			  struct pipe_texture **pt);
+   struct pipe_texture * (*texture_create)(struct pipe_context *pipe,
+                                           const struct pipe_texture *templat);
 
    void (*texture_release)(struct pipe_context *pipe,
 			   struct pipe_texture **pt);
diff --git a/src/mesa/pipe/softpipe/sp_texture.c b/src/mesa/pipe/softpipe/sp_texture.c
index 172234843d..fd2cc3dbbb 100644
--- a/src/mesa/pipe/softpipe/sp_texture.c
+++ b/src/mesa/pipe/softpipe/sp_texture.c
@@ -79,31 +79,30 @@ softpipe_texture_layout(struct softpipe_texture * spt)
 }
 
 
-void
-softpipe_texture_create(struct pipe_context *pipe, struct pipe_texture **pt)
+struct pipe_texture *
+softpipe_texture_create(struct pipe_context *pipe,
+                        const struct pipe_texture *templat)
 {
-   struct softpipe_texture *spt = REALLOC(*pt, sizeof(struct pipe_texture),
-					  sizeof(struct softpipe_texture));
-
-   if (spt) {
-      memset(&spt->base + 1, 0,
-	     sizeof(struct softpipe_texture) - sizeof(struct pipe_texture));
+   struct softpipe_texture *spt = CALLOC_STRUCT(softpipe_texture);
+   if (!spt)
+      return NULL;
 
-      softpipe_texture_layout(spt);
+   spt->base = *templat;
 
-      spt->buffer = pipe->winsys->buffer_create(pipe->winsys, 32,
-                                                PIPE_BUFFER_USAGE_PIXEL,
-                                                spt->buffer_size);
+   softpipe_texture_layout(spt);
 
-      if (!spt->buffer) {
-	 FREE(spt);
-	 spt = NULL;
-      }
+   spt->buffer = pipe->winsys->buffer_create(pipe->winsys, 32,
+                                             PIPE_BUFFER_USAGE_PIXEL,
+                                             spt->buffer_size);
+   if (!spt->buffer) {
+      FREE(spt);
+      return NULL;
    }
 
-   *pt = &spt->base;
+   return &spt->base;
 }
 
+
 void
 softpipe_texture_release(struct pipe_context *pipe, struct pipe_texture **pt)
 {
diff --git a/src/mesa/pipe/softpipe/sp_texture.h b/src/mesa/pipe/softpipe/sp_texture.h
index c6cf370351..fa646c0de9 100644
--- a/src/mesa/pipe/softpipe/sp_texture.h
+++ b/src/mesa/pipe/softpipe/sp_texture.h
@@ -55,8 +55,9 @@ softpipe_texture(struct pipe_texture *pt)
 
 
-extern void
-softpipe_texture_create(struct pipe_context *pipe, struct pipe_texture **pt);
+extern struct pipe_texture *
+softpipe_texture_create(struct pipe_context *pipe,
+                        const struct pipe_texture *templat);
 
 extern void
 softpipe_texture_release(struct pipe_context *pipe, struct pipe_texture **pt);
diff --git a/src/mesa/state_tracker/st_texture.c b/src/mesa/state_tracker/st_texture.c
index 741f36c2a7..844a9f80d8 100644
--- a/src/mesa/state_tracker/st_texture.c
+++ b/src/mesa/state_tracker/st_texture.c
@@ -74,7 +74,7 @@ st_texture_create(struct st_context *st,
 		  GLuint depth0,
 		  GLuint compress_byte)
 {
-   struct pipe_texture *pt = CALLOC_STRUCT(pipe_texture);
+   struct pipe_texture pt = { 0 };  /* zeroed: drivers copy the whole template */
 
    assert(target <= PIPE_TEXTURE_CUBE);
 
@@ -82,25 +82,20 @@ st_texture_create(struct st_context *st,
        _mesa_lookup_enum_by_nr(target),
        _mesa_lookup_enum_by_nr(format), first_level, last_level);
 
-   if (!pt)
-      return NULL;
-
    assert(format);
 
-   pt->target = target;
-   pt->format = format;
-   pt->first_level = first_level;
-   pt->last_level = last_level;
-   pt->width[0] = width0;
-   pt->height[0] = height0;
-   pt->depth[0] = depth0;
-   pt->compressed = compress_byte ? 1 : 0;
-   pt->cpp = pt->compressed ? compress_byte : st_sizeof_format(format);
-   pt->refcount = 1; 
-
-   st->pipe->texture_create(st->pipe, &pt);
-
-   return pt;
+   pt.target = target;
+   pt.format = format;
+   pt.first_level = first_level;
+   pt.last_level = last_level;
+   pt.width[0] = width0;
+   pt.height[0] = height0;
+   pt.depth[0] = depth0;
+   pt.compressed = compress_byte ? 1 : 0;
+   pt.cpp = pt.compressed ? compress_byte : st_sizeof_format(format);
+   pt.refcount = 1; 
+
+   return st->pipe->texture_create(st->pipe, &pt);
 }
 
 
-- 
cgit v1.2.3


From c8af89cf722830ec16d594afd99d717aed71d44c Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 6 Feb 2008 09:28:20 -0700
Subject: gallium: added mem_dup()

---
 src/mesa/pipe/p_util.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'src')

diff --git a/src/mesa/pipe/p_util.h b/src/mesa/pipe/p_util.h
index 4780ed7818..991ac447ba 100644
--- a/src/mesa/pipe/p_util.h
+++ b/src/mesa/pipe/p_util.h
@@ -183,6 +183,20 @@ align_free(void *ptr)
 
 
+/**
+ * Return a malloc'ed copy of \p size bytes at \p src (NULL on failure).
+ */
+static INLINE void *
+mem_dup(const void *src, uint size)
+{
+   void *dup = malloc(size);
+   if (dup)
+      memcpy(dup, src, size);
+   return dup;
+}
+
+
+
 #define CLAMP( X, MIN, MAX )  ( (X)<(MIN) ? (MIN) : ((X)>(MAX) ? (MAX) : (X)) )
 #define MIN2( A, B )   ( (A)<(B) ? (A) : (B) )
 #define MAX2( A, B )   ( (A)>(B) ? (A) : (B) )
-- 
cgit v1.2.3


From f12d641ab2bafe20f876dddb90ada76c83732757 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 6 Feb 2008 09:29:00 -0700
Subject: gallium: use mem_dup()

---
 src/mesa/pipe/softpipe/sp_state_blend.c      | 10 +++-------
 src/mesa/pipe/softpipe/sp_state_rasterizer.c |  7 ++-----
 src/mesa/pipe/softpipe/sp_state_sampler.c    |  4 +---
 3 files changed, 6 insertions(+), 15 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/softpipe/sp_state_blend.c b/src/mesa/pipe/softpipe/sp_state_blend.c
index 160ca5cbc0..2d40d6bd8f 100644
--- a/src/mesa/pipe/softpipe/sp_state_blend.c
+++ b/src/mesa/pipe/softpipe/sp_state_blend.c
@@ -32,13 +32,12 @@
 #include "sp_context.h"
 #include "sp_state.h"
 
+
 void *
 softpipe_create_blend_state(struct pipe_context *pipe,
                             const struct pipe_blend_state *blend)
 {
-   struct pipe_blend_state *state = MALLOC( sizeof(struct pipe_blend_state) );
-   memcpy(state, blend, sizeof(struct pipe_blend_state));
-   return state;
+   return mem_dup(blend, sizeof(*blend));
 }
 
 void softpipe_bind_blend_state( struct pipe_context *pipe,
@@ -78,10 +77,7 @@ void *
 softpipe_create_depth_stencil_state(struct pipe_context *pipe,
 				    const struct pipe_depth_stencil_alpha_state *depth_stencil)
 {
-   struct pipe_depth_stencil_alpha_state *state =
-      MALLOC( sizeof(struct pipe_depth_stencil_alpha_state) );
-   memcpy(state, depth_stencil, sizeof(struct pipe_depth_stencil_alpha_state));
-   return state;
+   return mem_dup(depth_stencil, sizeof(*depth_stencil));
 }
 
 void
diff --git a/src/mesa/pipe/softpipe/sp_state_rasterizer.c b/src/mesa/pipe/softpipe/sp_state_rasterizer.c
index ce8fa4f2b8..53755099dd 100644
--- a/src/mesa/pipe/softpipe/sp_state_rasterizer.c
+++ b/src/mesa/pipe/softpipe/sp_state_rasterizer.c
@@ -35,12 +35,9 @@
 
 void *
 softpipe_create_rasterizer_state(struct pipe_context *pipe,
-                                 const struct pipe_rasterizer_state *setup)
+                                 const struct pipe_rasterizer_state *rast)
 {
-   struct pipe_rasterizer_state *state =
-      MALLOC( sizeof(struct pipe_rasterizer_state) );
-   memcpy(state, setup, sizeof(struct pipe_rasterizer_state));
-   return state;
+   return mem_dup(rast, sizeof(*rast));
 }
 
 void softpipe_bind_rasterizer_state(struct pipe_context *pipe,
diff --git a/src/mesa/pipe/softpipe/sp_state_sampler.c b/src/mesa/pipe/softpipe/sp_state_sampler.c
index 3842e71503..51b4b78287 100644
--- a/src/mesa/pipe/softpipe/sp_state_sampler.c
+++ b/src/mesa/pipe/softpipe/sp_state_sampler.c
@@ -40,9 +40,7 @@ void *
 softpipe_create_sampler_state(struct pipe_context *pipe,
                               const struct pipe_sampler_state *sampler)
 {
-   struct pipe_sampler_state *state = MALLOC( sizeof(struct pipe_sampler_state) );
-   memcpy(state, sampler, sizeof(struct pipe_sampler_state));
-   return state;
+   return mem_dup(sampler, sizeof(*sampler));
 }
 
 void
-- 
cgit v1.2.3


From c0235d0a24da82304f7f23936c71032c0a9a7ce1 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 6 Feb 2008 09:35:33 -0700
Subject: Cell: use mem_dup()

---
 src/mesa/pipe/cell/ppu/cell_state_blend.c   | 9 ++-------
 src/mesa/pipe/cell/ppu/cell_state_sampler.c | 4 +---
 2 files changed, 3 insertions(+), 10 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/ppu/cell_state_blend.c b/src/mesa/pipe/cell/ppu/cell_state_blend.c
index 2c19aa3971..4fc60548c8 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_blend.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_blend.c
@@ -39,9 +39,7 @@ void *
 cell_create_blend_state(struct pipe_context *pipe,
                         const struct pipe_blend_state *blend)
 {
-   struct pipe_blend_state *state = MALLOC(sizeof(struct pipe_blend_state));
-   memcpy(state, blend, sizeof(struct pipe_blend_state));
-   return state;
+   return mem_dup(blend, sizeof(*blend));
 }
 
 
@@ -85,10 +83,7 @@ void *
 cell_create_depth_stencil_alpha_state(struct pipe_context *pipe,
                  const struct pipe_depth_stencil_alpha_state *depth_stencil)
 {
-   struct pipe_depth_stencil_alpha_state *state =
-      MALLOC(sizeof(struct pipe_depth_stencil_alpha_state));
-   memcpy(state, depth_stencil, sizeof(struct pipe_depth_stencil_alpha_state));
-   return state;
+   return mem_dup(depth_stencil, sizeof(*depth_stencil));
 }
 
 
diff --git a/src/mesa/pipe/cell/ppu/cell_state_sampler.c b/src/mesa/pipe/cell/ppu/cell_state_sampler.c
index 317f7603bb..ade6cc8338 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_sampler.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_sampler.c
@@ -40,9 +40,7 @@ void *
 cell_create_sampler_state(struct pipe_context *pipe,
                           const struct pipe_sampler_state *sampler)
 {
-   struct pipe_sampler_state *state = MALLOC( sizeof(struct pipe_sampler_state) );
-   memcpy(state, sampler, sizeof(struct pipe_sampler_state));
-   return state;
+   return mem_dup(sampler, sizeof(*sampler));
 }
 
 void
-- 
cgit v1.2.3


From 2ed6604f50bc06cfd44b429f476588381113db9c Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 6 Feb 2008 09:48:37 -0700
Subject: gallium: #include p_debug.h since we use assert

---
 src/mesa/pipe/p_util.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'src')

diff --git a/src/mesa/pipe/p_util.h b/src/mesa/pipe/p_util.h
index 991ac447ba..469920efee 100644
--- a/src/mesa/pipe/p_util.h
+++ b/src/mesa/pipe/p_util.h
@@ -29,6 +29,7 @@
 #define P_UTIL_H
 
 #include "p_compiler.h"
+#include "p_debug.h"
 #include <math.h>
 
 
-- 
cgit v1.2.3


From c4e0d725dc9f18aed2babed344bb4e42df9e481f Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Wed, 6 Feb 2008 09:48:56 -0700
Subject: Cell: silence unused var warnings

---
 src/mesa/pipe/cell/ppu/cell_state_fs.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/cell/ppu/cell_state_fs.c b/src/mesa/pipe/cell/ppu/cell_state_fs.c
index 81c2ac14dd..96a52273b0 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_fs.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_fs.c
@@ -45,7 +45,7 @@ void *
 cell_create_fs_state(struct pipe_context *pipe,
                      const struct pipe_shader_state *templ)
 {
-   struct cell_context *cell = cell_context(pipe);
+   /*struct cell_context *cell = cell_context(pipe);*/
    struct cell_fragment_shader_state *state;
 
    state = CALLOC_STRUCT(cell_fragment_shader_state);
@@ -94,8 +94,6 @@ cell_bind_fs_state(struct pipe_context *pipe, void *fs)
 void
 cell_delete_fs_state(struct pipe_context *pipe, void *fs)
 {
-   struct cell_context *cell = cell_context(pipe);
-
    struct cell_fragment_shader_state *state =
       (struct cell_fragment_shader_state *) fs;
 
-- 
cgit v1.2.3


From 28ecb986d958d52c9a996453e53418d1f98d7f75 Mon Sep 17 00:00:00 2001
From: Michel Dänzer <michel@tungstengraphics.com>
Date: Thu, 7 Feb 2008 19:43:34 +0000
Subject: pipebuffer: Fix reversed assertion.

---
 src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c b/src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c
index bcd4b3e257..04477a865a 100644
--- a/src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c
+++ b/src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c
@@ -170,7 +170,7 @@ pool_bufmgr_create_buffer(struct pb_manager *mgr,
    struct list_head *item;
 
    assert(size == pool->bufSize);
-   assert(desc->alignment % pool->bufAlign == 0);
+   assert(pool->bufAlign % desc->alignment == 0);
    
    _glthread_LOCK_MUTEX(pool->mutex);
 
-- 
cgit v1.2.3


From 909c703bfbf7404414befaa0a94b76d78ba3cb4c Mon Sep 17 00:00:00 2001
From: Michel Dänzer <michel@tungstengraphics.com>
Date: Thu, 7 Feb 2008 19:44:42 +0000
Subject: tgsi: Fall back to interpreter instead of assert(0) on unimplemented
 SSE code.

---
 src/mesa/pipe/draw/draw_vertex_shader.c | 22 ++++++++++++++++------
 src/mesa/pipe/tgsi/exec/tgsi_sse2.c     |  3 +--
 src/mesa/x86/rtasm/x86sse.c             |  1 +
 3 files changed, 18 insertions(+), 8 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/draw/draw_vertex_shader.c b/src/mesa/pipe/draw/draw_vertex_shader.c
index e6590eafcc..5ca93aa615 100644
--- a/src/mesa/pipe/draw/draw_vertex_shader.c
+++ b/src/mesa/pipe/draw/draw_vertex_shader.c
@@ -121,11 +121,16 @@ run_vertex_program(struct draw_context *draw,
          = (struct draw_vertex_shader *)draw->vertex_shader;
       codegen_function func
          = (codegen_function) x86_get_func( &shader->sse2_program );
-      func(
-         machine->Inputs,
-         machine->Outputs,
-         machine->Consts,
-         machine->Temps );
+
+      if (func)
+         func(
+            machine->Inputs,
+            machine->Outputs,
+            machine->Consts,
+            machine->Temps );
+      else
+         /* interpreter */
+         tgsi_exec_machine_run( machine );
    }
    else
 #endif
@@ -269,7 +274,12 @@ draw_create_vertex_shader(struct draw_context *draw,
       struct pipe_shader_state *sh = (struct pipe_shader_state *) shader;
 
       x86_init_func( &vs->sse2_program );
-      tgsi_emit_sse2( (struct tgsi_token *) sh->tokens, &vs->sse2_program );
+      if (!tgsi_emit_sse2( (struct tgsi_token *) sh->tokens,
+                           &vs->sse2_program )) {
+         x86_release_func( (struct x86_function *) &vs->sse2_program );
+	 fprintf(stdout /*err*/,
+		 "tgsi_emit_sse2() failed, falling back to interpreter\n");
+      }
    }
 #endif
 
diff --git a/src/mesa/pipe/tgsi/exec/tgsi_sse2.c b/src/mesa/pipe/tgsi/exec/tgsi_sse2.c
index f2180082f1..40bacf8552 100755
--- a/src/mesa/pipe/tgsi/exec/tgsi_sse2.c
+++ b/src/mesa/pipe/tgsi/exec/tgsi_sse2.c
@@ -2254,8 +2254,7 @@ tgsi_emit_sse2(
 
       case TGSI_TOKEN_TYPE_IMMEDIATE:
          /* XXX implement this */
-         assert(0);
-         break;
+         return 0;
 
       default:
          assert( 0 );
diff --git a/src/mesa/x86/rtasm/x86sse.c b/src/mesa/x86/rtasm/x86sse.c
index 56c211eee0..f8da6e405f 100644
--- a/src/mesa/x86/rtasm/x86sse.c
+++ b/src/mesa/x86/rtasm/x86sse.c
@@ -1137,6 +1137,7 @@ void x86_init_func_size( struct x86_function *p, unsigned code_size )
 void x86_release_func( struct x86_function *p )
 {
    _mesa_exec_free(p->store);
+   p->store = NULL;
 }
 
 
-- 
cgit v1.2.3


From 2b6a31bfda319975b728930f019175611145ebb9 Mon Sep 17 00:00:00 2001
From: Jerome Glisse <glisse@freedesktop.org>
Date: Fri, 8 Feb 2008 18:25:49 +0100
Subject: failover: several fixes to failover pipe module

---
 src/mesa/pipe/failover/fo_context.c |  2 +
 src/mesa/pipe/failover/fo_state.c   | 79 ++++++++++++++++++++++++++-----------
 2 files changed, 57 insertions(+), 24 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/failover/fo_context.c b/src/mesa/pipe/failover/fo_context.c
index cf6c9fed50..7ce4a7df17 100644
--- a/src/mesa/pipe/failover/fo_context.c
+++ b/src/mesa/pipe/failover/fo_context.c
@@ -114,6 +114,8 @@ struct pipe_context *failover_create( struct pipe_context *hw,
    if (failover == NULL)
       return NULL;
 
+   failover->hw = hw;
+   failover->sw = sw;
    failover->pipe.winsys = hw->winsys;
    failover->pipe.destroy = failover_destroy;
    failover->pipe.is_format_supported = hw->is_format_supported;
diff --git a/src/mesa/pipe/failover/fo_state.c b/src/mesa/pipe/failover/fo_state.c
index fa700b9674..0fc5568da1 100644
--- a/src/mesa/pipe/failover/fo_state.c
+++ b/src/mesa/pipe/failover/fo_state.c
@@ -54,8 +54,8 @@ failover_create_blend_state( struct pipe_context *pipe,
    struct fo_state *state = malloc(sizeof(struct fo_state));
    struct failover_context *failover = failover_context(pipe);
 
-   state->sw_state = failover->sw->create_blend_state(pipe, blend);
-   state->hw_state = failover->hw->create_blend_state(pipe, blend);
+   state->sw_state = failover->sw->create_blend_state(failover->sw, blend);
+   state->hw_state = failover->hw->create_blend_state(failover->hw, blend);
 
    return state;
 }
@@ -68,6 +68,7 @@ failover_bind_blend_state( struct pipe_context *pipe,
    struct fo_state *state = (struct fo_state *)blend;
    failover->blend = state;
    failover->dirty |= FO_NEW_BLEND;
+   failover->sw->bind_blend_state( failover->sw, state->sw_state );
    failover->hw->bind_blend_state( failover->hw, state->hw_state );
 }
 
@@ -78,8 +79,8 @@ failover_delete_blend_state( struct pipe_context *pipe,
    struct fo_state *state = (struct fo_state*)blend;
    struct failover_context *failover = failover_context(pipe);
 
-   failover->sw->delete_blend_state(pipe, state->sw_state);
-   failover->hw->delete_blend_state(pipe, state->hw_state);
+   failover->sw->delete_blend_state(failover->sw, state->sw_state);
+   failover->hw->delete_blend_state(failover->hw, state->hw_state);
    state->sw_state = 0;
    state->hw_state = 0;
    free(state);
@@ -93,6 +94,7 @@ failover_set_blend_color( struct pipe_context *pipe,
 
    failover->blend_color = *blend_color;
    failover->dirty |= FO_NEW_BLEND_COLOR;
+   failover->sw->set_blend_color( failover->sw, blend_color );
    failover->hw->set_blend_color( failover->hw, blend_color );
 }
 
@@ -104,6 +106,7 @@ failover_set_clip_state( struct pipe_context *pipe,
 
    failover->clip = *clip;
    failover->dirty |= FO_NEW_CLIP;
+   failover->sw->set_clip_state( failover->sw, clip );
    failover->hw->set_clip_state( failover->hw, clip );
 }
 
@@ -115,8 +118,8 @@ failover_create_depth_stencil_state(struct pipe_context *pipe,
    struct fo_state *state = malloc(sizeof(struct fo_state));
    struct failover_context *failover = failover_context(pipe);
 
-   state->sw_state = failover->sw->create_depth_stencil_alpha_state(pipe, templ);
-   state->hw_state = failover->hw->create_depth_stencil_alpha_state(pipe, templ);
+   state->sw_state = failover->sw->create_depth_stencil_alpha_state(failover->sw, templ);
+   state->hw_state = failover->hw->create_depth_stencil_alpha_state(failover->hw, templ);
 
    return state;
 }
@@ -129,6 +132,7 @@ failover_bind_depth_stencil_state(struct pipe_context *pipe,
    struct fo_state *state = (struct fo_state *)depth_stencil;
    failover->depth_stencil = state;
    failover->dirty |= FO_NEW_DEPTH_STENCIL;
+   failover->sw->bind_depth_stencil_alpha_state(failover->sw, state->sw_state);
    failover->hw->bind_depth_stencil_alpha_state(failover->hw, state->hw_state);
 }
 
@@ -139,8 +143,8 @@ failover_delete_depth_stencil_state(struct pipe_context *pipe,
    struct fo_state *state = (struct fo_state*)ds;
    struct failover_context *failover = failover_context(pipe);
 
-   failover->sw->delete_depth_stencil_alpha_state(pipe, state->sw_state);
-   failover->hw->delete_depth_stencil_alpha_state(pipe, state->hw_state);
+   failover->sw->delete_depth_stencil_alpha_state(failover->sw, state->sw_state);
+   failover->hw->delete_depth_stencil_alpha_state(failover->hw, state->hw_state);
    state->sw_state = 0;
    state->hw_state = 0;
    free(state);
@@ -154,6 +158,7 @@ failover_set_framebuffer_state(struct pipe_context *pipe,
 
    failover->framebuffer = *framebuffer;
    failover->dirty |= FO_NEW_FRAMEBUFFER;
+   failover->sw->set_framebuffer_state( failover->sw, framebuffer );
    failover->hw->set_framebuffer_state( failover->hw, framebuffer );
 }
 
@@ -165,8 +170,8 @@ failover_create_fs_state(struct pipe_context *pipe,
    struct fo_state *state = malloc(sizeof(struct fo_state));
    struct failover_context *failover = failover_context(pipe);
 
-   state->sw_state = failover->sw->create_fs_state(pipe, templ);
-   state->hw_state = failover->hw->create_fs_state(pipe, templ);
+   state->sw_state = failover->sw->create_fs_state(failover->sw, templ);
+   state->hw_state = failover->hw->create_fs_state(failover->hw, templ);
 
    return state;
 }
@@ -178,6 +183,7 @@ failover_bind_fs_state(struct pipe_context *pipe, void *fs)
    struct fo_state *state = (struct fo_state*)fs;
    failover->fragment_shader = state;
    failover->dirty |= FO_NEW_FRAGMENT_SHADER;
+   failover->sw->bind_fs_state(failover->sw, state->sw_state);
    failover->hw->bind_fs_state(failover->hw, state->hw_state);
 }
 
@@ -188,8 +194,8 @@ failover_delete_fs_state(struct pipe_context *pipe,
    struct fo_state *state = (struct fo_state*)fs;
    struct failover_context *failover = failover_context(pipe);
 
-   failover->sw->delete_fs_state(pipe, state->sw_state);
-   failover->hw->delete_fs_state(pipe, state->hw_state);
+   failover->sw->delete_fs_state(failover->sw, state->sw_state);
+   failover->hw->delete_fs_state(failover->hw, state->hw_state);
    state->sw_state = 0;
    state->hw_state = 0;
    free(state);
@@ -202,8 +208,8 @@ failover_create_vs_state(struct pipe_context *pipe,
    struct fo_state *state = malloc(sizeof(struct fo_state));
    struct failover_context *failover = failover_context(pipe);
 
-   state->sw_state = failover->sw->create_vs_state(pipe, templ);
-   state->hw_state = failover->hw->create_vs_state(pipe, templ);
+   state->sw_state = failover->sw->create_vs_state(failover->sw, templ);
+   state->hw_state = failover->hw->create_vs_state(failover->hw, templ);
 
    return state;
 }
@@ -217,6 +223,7 @@ failover_bind_vs_state(struct pipe_context *pipe,
    struct fo_state *state = (struct fo_state*)vs;
    failover->vertex_shader = state;
    failover->dirty |= FO_NEW_VERTEX_SHADER;
+   failover->sw->bind_vs_state(failover->sw, state->sw_state);
    failover->hw->bind_vs_state(failover->hw, state->hw_state);
 }
 
@@ -227,8 +234,8 @@ failover_delete_vs_state(struct pipe_context *pipe,
    struct fo_state *state = (struct fo_state*)vs;
    struct failover_context *failover = failover_context(pipe);
 
-   failover->sw->delete_vs_state(pipe, state->sw_state);
-   failover->hw->delete_vs_state(pipe, state->hw_state);
+   failover->sw->delete_vs_state(failover->sw, state->sw_state);
+   failover->hw->delete_vs_state(failover->hw, state->hw_state);
    state->sw_state = 0;
    state->hw_state = 0;
    free(state);
@@ -242,6 +249,7 @@ failover_set_polygon_stipple( struct pipe_context *pipe,
 
    failover->poly_stipple = *stipple;
    failover->dirty |= FO_NEW_STIPPLE;
+   failover->sw->set_polygon_stipple( failover->sw, stipple );
    failover->hw->set_polygon_stipple( failover->hw, stipple );
 }
 
@@ -253,8 +261,8 @@ failover_create_rasterizer_state(struct pipe_context *pipe,
    struct fo_state *state = malloc(sizeof(struct fo_state));
    struct failover_context *failover = failover_context(pipe);
 
-   state->sw_state = failover->sw->create_rasterizer_state(pipe, templ);
-   state->hw_state = failover->hw->create_rasterizer_state(pipe, templ);
+   state->sw_state = failover->sw->create_rasterizer_state(failover->sw, templ);
+   state->hw_state = failover->hw->create_rasterizer_state(failover->hw, templ);
 
    return state;
 }
@@ -268,6 +276,7 @@ failover_bind_rasterizer_state(struct pipe_context *pipe,
    struct fo_state *state = (struct fo_state*)raster;
    failover->rasterizer = state;
    failover->dirty |= FO_NEW_RASTERIZER;
+   failover->sw->bind_rasterizer_state(failover->sw, state->sw_state);
    failover->hw->bind_rasterizer_state(failover->hw, state->hw_state);
 }
 
@@ -278,8 +287,8 @@ failover_delete_rasterizer_state(struct pipe_context *pipe,
    struct fo_state *state = (struct fo_state*)raster;
    struct failover_context *failover = failover_context(pipe);
 
-   failover->sw->delete_rasterizer_state(pipe, state->sw_state);
-   failover->hw->delete_rasterizer_state(pipe, state->hw_state);
+   failover->sw->delete_rasterizer_state(failover->sw, state->sw_state);
+   failover->hw->delete_rasterizer_state(failover->hw, state->hw_state);
    state->sw_state = 0;
    state->hw_state = 0;
    free(state);
@@ -294,6 +303,7 @@ failover_set_scissor_state( struct pipe_context *pipe,
 
    failover->scissor = *scissor;
    failover->dirty |= FO_NEW_SCISSOR;
+   failover->sw->set_scissor_state( failover->sw, scissor );
    failover->hw->set_scissor_state( failover->hw, scissor );
 }
 
@@ -305,8 +315,8 @@ failover_create_sampler_state(struct pipe_context *pipe,
    struct fo_state *state = malloc(sizeof(struct fo_state));
    struct failover_context *failover = failover_context(pipe);
 
-   state->sw_state = failover->sw->create_sampler_state(pipe, templ);
-   state->hw_state = failover->hw->create_sampler_state(pipe, templ);
+   state->sw_state = failover->sw->create_sampler_state(failover->sw, templ);
+   state->hw_state = failover->hw->create_sampler_state(failover->hw, templ);
 
    return state;
 }
@@ -320,6 +330,8 @@ failover_bind_sampler_state(struct pipe_context *pipe,
    failover->sampler[unit] = state;
    failover->dirty |= FO_NEW_SAMPLER;
    failover->dirty_sampler |= (1<<unit);
+   failover->sw->bind_sampler_state(failover->sw, unit,
+                                    state->sw_state);
    failover->hw->bind_sampler_state(failover->hw, unit,
                                     state->hw_state);
 }
@@ -330,8 +342,8 @@ failover_delete_sampler_state(struct pipe_context *pipe, void *sampler)
    struct fo_state *state = (struct fo_state*)sampler;
    struct failover_context *failover = failover_context(pipe);
 
-   failover->sw->delete_sampler_state(pipe, state->sw_state);
-   failover->hw->delete_sampler_state(pipe, state->hw_state);
+   failover->sw->delete_sampler_state(failover->sw, state->sw_state);
+   failover->hw->delete_sampler_state(failover->hw, state->hw_state);
    state->sw_state = 0;
    state->hw_state = 0;
    free(state);
@@ -348,6 +360,7 @@ failover_set_sampler_texture(struct pipe_context *pipe,
    failover->texture[unit] = texture;
    failover->dirty |= FO_NEW_TEXTURE;
    failover->dirty_texture |= (1<<unit);
+   failover->sw->set_sampler_texture( failover->sw, unit, texture );
    failover->hw->set_sampler_texture( failover->hw, unit, texture );
 }
 
@@ -360,6 +373,7 @@ failover_set_viewport_state( struct pipe_context *pipe,
 
    failover->viewport = *viewport; 
    failover->dirty |= FO_NEW_VIEWPORT;
+   failover->sw->set_viewport_state( failover->sw, viewport );
    failover->hw->set_viewport_state( failover->hw, viewport );
 }
 
@@ -374,6 +388,7 @@ failover_set_vertex_buffer(struct pipe_context *pipe,
    failover->vertex_buffer[unit] = *vertex_buffer;
    failover->dirty |= FO_NEW_VERTEX_BUFFER;
    failover->dirty_vertex_buffer |= (1<<unit);
+   failover->sw->set_vertex_buffer( failover->sw, unit, vertex_buffer );
    failover->hw->set_vertex_buffer( failover->hw, unit, vertex_buffer );
 }
 
@@ -388,9 +403,24 @@ failover_set_vertex_element(struct pipe_context *pipe,
    failover->vertex_element[unit] = *vertex_element;
    failover->dirty |= FO_NEW_VERTEX_ELEMENT;
    failover->dirty_vertex_element |= (1<<unit);
+   failover->sw->set_vertex_element( failover->sw, unit, vertex_element );
    failover->hw->set_vertex_element( failover->hw, unit, vertex_element );
 }
 
+void
+failover_set_constant_buffer(struct pipe_context *pipe,
+                             uint shader, uint index,
+                             const struct pipe_constant_buffer *buf)
+{
+   struct failover_context *failover = failover_context(pipe);
+
+   assert(shader < PIPE_SHADER_TYPES);
+   assert(index == 0);
+
+   failover->sw->set_constant_buffer(failover->sw, shader, index, buf);
+   failover->hw->set_constant_buffer(failover->hw, shader, index, buf);
+}
+
 
 void
 failover_init_state_functions( struct failover_context *failover )
@@ -423,4 +453,5 @@ failover_init_state_functions( struct failover_context *failover )
    failover->pipe.set_viewport_state = failover_set_viewport_state;
    failover->pipe.set_vertex_buffer = failover_set_vertex_buffer;
    failover->pipe.set_vertex_element = failover_set_vertex_element;
+   failover->pipe.set_constant_buffer = failover_set_constant_buffer;
 }
-- 
cgit v1.2.3


From 21e9396e650d23084bfeae0d2670b5ffcf731a85 Mon Sep 17 00:00:00 2001
From: Jerome Glisse <glisse@freedesktop.org>
Date: Fri, 8 Feb 2008 18:47:25 +0100
Subject: intel_winsys: remove leftover code

---
 src/mesa/drivers/dri/intel_winsys/intel_winsys_pipe.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'src')

diff --git a/src/mesa/drivers/dri/intel_winsys/intel_winsys_pipe.c b/src/mesa/drivers/dri/intel_winsys/intel_winsys_pipe.c
index 910c0d2cc5..789a386500 100644
--- a/src/mesa/drivers/dri/intel_winsys/intel_winsys_pipe.c
+++ b/src/mesa/drivers/dri/intel_winsys/intel_winsys_pipe.c
@@ -224,11 +224,6 @@ intel_i915_surface_alloc_storage(struct pipe_winsys *winsys,
    if(!surf->buffer)
       return -1;
 
-   if(ret) {
-      pipe_buffer_reference(winsys, &surf->buffer, NULL);
-      return ret;
-   }
-   
    return 0;
 }
 
-- 
cgit v1.2.3


From c3395f4473c8fdf75d04c0dd72e687bc8d8127a7 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 8 Feb 2008 14:45:58 -0700
Subject: Remove unused texunit parameter to ctx->Driver.GenerateMipmap()

---
 src/mesa/main/dd.h       |  3 +--
 src/mesa/main/fbobject.c |  2 +-
 src/mesa/main/mipmap.c   |  1 -
 src/mesa/main/mipmap.h   |  1 -
 src/mesa/main/texstore.c | 32 ++++++++------------------------
 5 files changed, 10 insertions(+), 29 deletions(-)

(limited to 'src')

diff --git a/src/mesa/main/dd.h b/src/mesa/main/dd.h
index c2ef67ba6d..37ef2a865b 100644
--- a/src/mesa/main/dd.h
+++ b/src/mesa/main/dd.h
@@ -335,8 +335,7 @@ struct dd_function_table {
    /**
     * Called by glGenerateMipmap() or when GL_GENERATE_MIPMAP_SGIS is enabled.
     */
-   void (*GenerateMipmap)(GLcontext *ctx,  GLenum target,
-                          const struct gl_texture_unit *texUnit,
+   void (*GenerateMipmap)(GLcontext *ctx, GLenum target,
                           struct gl_texture_object *texObj);
 
    /**
diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c
index 13cbd35424..6a8cba4d8a 100644
--- a/src/mesa/main/fbobject.c
+++ b/src/mesa/main/fbobject.c
@@ -1560,7 +1560,7 @@ _mesa_GenerateMipmapEXT(GLenum target)
 
    /* XXX this might not handle cube maps correctly */
    _mesa_lock_texture(ctx, texObj);
-   ctx->Driver.GenerateMipmap(ctx, target, texUnit, texObj);
+   ctx->Driver.GenerateMipmap(ctx, target, texObj);
    _mesa_unlock_texture(ctx, texObj);
 }
 
diff --git a/src/mesa/main/mipmap.c b/src/mesa/main/mipmap.c
index 9f3db22b75..1e61829e8f 100644
--- a/src/mesa/main/mipmap.c
+++ b/src/mesa/main/mipmap.c
@@ -933,7 +933,6 @@ make_2d_stack_mipmap(const struct gl_texture_format *format, GLint border,
  */
 void
 _mesa_generate_mipmap(GLcontext *ctx, GLenum target,
-                      const struct gl_texture_unit *texUnit,
                       struct gl_texture_object *texObj)
 {
    const struct gl_texture_image *srcImage;
diff --git a/src/mesa/main/mipmap.h b/src/mesa/main/mipmap.h
index df78603283..46e16902c8 100644
--- a/src/mesa/main/mipmap.h
+++ b/src/mesa/main/mipmap.h
@@ -30,7 +30,6 @@
 
 extern void
 _mesa_generate_mipmap(GLcontext *ctx, GLenum target,
-                      const struct gl_texture_unit *texUnit,
                       struct gl_texture_object *texObj);
 
 
diff --git a/src/mesa/main/texstore.c b/src/mesa/main/texstore.c
index 26ca4f1bd5..a6a18910fc 100644
--- a/src/mesa/main/texstore.c
+++ b/src/mesa/main/texstore.c
@@ -2917,9 +2917,7 @@ _mesa_store_teximage1d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      ctx->Driver.GenerateMipmap(ctx, target,
-                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                                 texObj);
+      ctx->Driver.GenerateMipmap(ctx, target, texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -3003,9 +3001,7 @@ _mesa_store_teximage2d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      ctx->Driver.GenerateMipmap(ctx, target,
-                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                                 texObj);
+      ctx->Driver.GenerateMipmap(ctx, target, texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -3079,9 +3075,7 @@ _mesa_store_teximage3d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      ctx->Driver.GenerateMipmap(ctx, target,
-                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                                 texObj);
+      ctx->Driver.GenerateMipmap(ctx, target, texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -3127,9 +3121,7 @@ _mesa_store_texsubimage1d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      ctx->Driver.GenerateMipmap(ctx, target,
-                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                                 texObj);
+      ctx->Driver.GenerateMipmap(ctx, target, texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -3182,9 +3174,7 @@ _mesa_store_texsubimage2d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      ctx->Driver.GenerateMipmap(ctx, target,
-                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                                 texObj);
+      ctx->Driver.GenerateMipmap(ctx, target, texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -3237,9 +3227,7 @@ _mesa_store_texsubimage3d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      ctx->Driver.GenerateMipmap(ctx, target,
-                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                                 texObj);
+      ctx->Driver.GenerateMipmap(ctx, target, texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -3313,9 +3301,7 @@ _mesa_store_compressed_teximage2d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      ctx->Driver.GenerateMipmap(ctx, target,
-                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                                 texObj);
+      ctx->Driver.GenerateMipmap(ctx, target, texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, &ctx->Unpack);
@@ -3425,9 +3411,7 @@ _mesa_store_compressed_texsubimage2d(GLcontext *ctx, GLenum target,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      ctx->Driver.GenerateMipmap(ctx, target,
-                                 &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                                 texObj);
+      ctx->Driver.GenerateMipmap(ctx, target, texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, &ctx->Unpack);
-- 
cgit v1.2.3


From 864abce57d3b81d0f92673472959b71e09c4f245 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 8 Feb 2008 14:46:47 -0700
Subject: gallium: added draw_flush() call in softpipe_bind_sampler_state()

---
 src/mesa/pipe/softpipe/sp_state_sampler.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'src')

diff --git a/src/mesa/pipe/softpipe/sp_state_sampler.c b/src/mesa/pipe/softpipe/sp_state_sampler.c
index 51b4b78287..291bbc40ad 100644
--- a/src/mesa/pipe/softpipe/sp_state_sampler.c
+++ b/src/mesa/pipe/softpipe/sp_state_sampler.c
@@ -49,6 +49,8 @@ softpipe_bind_sampler_state(struct pipe_context *pipe,
 {
    struct softpipe_context *softpipe = softpipe_context(pipe);
 
+   draw_flush(softpipe->draw);
+
    assert(unit < PIPE_MAX_SAMPLERS);
    softpipe->sampler[unit] = (struct pipe_sampler_state *)sampler;
 
-- 
cgit v1.2.3


From 0b64ee6960f9e099bc1a6ca6fa10720fee875b3a Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 8 Feb 2008 14:51:32 -0700
Subject: gallium: added inClipCoords param to st_draw_vertices() to indicate
 coord system of vertices

Also, export st_make_passthrough_vertex_shader() from st_cb_drawpixels.c
---
 src/mesa/state_tracker/st_cb_clear.c      |  2 +-
 src/mesa/state_tracker/st_cb_drawpixels.c | 18 +++++++++---------
 src/mesa/state_tracker/st_cb_drawpixels.h |  4 ++++
 src/mesa/state_tracker/st_draw.c          | 21 ++++++++++++---------
 src/mesa/state_tracker/st_draw.h          |  3 ++-
 5 files changed, 28 insertions(+), 20 deletions(-)

(limited to 'src')

diff --git a/src/mesa/state_tracker/st_cb_clear.c b/src/mesa/state_tracker/st_cb_clear.c
index 0cd469c156..ab98b54bab 100644
--- a/src/mesa/state_tracker/st_cb_clear.c
+++ b/src/mesa/state_tracker/st_cb_clear.c
@@ -251,7 +251,7 @@ draw_quad(GLcontext *ctx,
       verts[i][1][3] = color[3];
    }
 
-   st_draw_vertices(ctx, PIPE_PRIM_QUADS, 4, (float *) verts, 2);
+   st_draw_vertices(ctx, PIPE_PRIM_QUADS, 4, (float *) verts, 2, GL_FALSE);
 }
 
 
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index 34d420fcff..07886e7982 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -355,8 +355,8 @@ make_fragment_shader_z(struct st_context *st)
  * Create a simple vertex shader that just passes through the
  * vertex position and texcoord (and optionally, color).
  */
-static struct st_vertex_program *
-make_vertex_shader(struct st_context *st, GLboolean passColor)
+struct st_vertex_program *
+st_make_passthrough_vertex_shader(struct st_context *st, GLboolean passColor)
 {
    /* only make programs once and re-use */
    static struct st_vertex_program *progs[2] = { NULL, NULL };
@@ -572,7 +572,7 @@ draw_quad(GLcontext *ctx, GLfloat x0, GLfloat y0, GLfloat z,
       verts[i][1][3] = 1.0; /*Q*/
    }
 
-   st_draw_vertices(ctx, PIPE_PRIM_QUADS, 4, (float *) verts, 2);
+   st_draw_vertices(ctx, PIPE_PRIM_QUADS, 4, (float *) verts, 2, GL_FALSE);
 }
 
 
@@ -625,7 +625,7 @@ draw_quad_colored(GLcontext *ctx, GLfloat x0, GLfloat y0, GLfloat z,
       verts[i][2][3] = 1.0; /*Q*/
    }
 
-   st_draw_vertices(ctx, PIPE_PRIM_QUADS, 4, (float *) verts, 3);
+   st_draw_vertices(ctx, PIPE_PRIM_QUADS, 4, (float *) verts, 3, GL_FALSE);
 }
 
 
@@ -945,7 +945,7 @@ st_DrawPixels(GLcontext *ctx, GLint x, GLint y, GLsizei width, GLsizei height,
    if (format == GL_DEPTH_COMPONENT) {
       ps = st->state.framebuffer.zsbuf;
       stfp = make_fragment_shader_z(ctx->st);
-      stvp = make_vertex_shader(ctx->st, GL_TRUE);
+      stvp = st_make_passthrough_vertex_shader(ctx->st, GL_TRUE);
       color = ctx->Current.RasterColor;
    }
    else if (format == GL_STENCIL_INDEX) {
@@ -956,7 +956,7 @@ st_DrawPixels(GLcontext *ctx, GLint x, GLint y, GLsizei width, GLsizei height,
    else {
       ps = st->state.framebuffer.cbufs[0];
       stfp = combined_drawpix_fragment_program(ctx);
-      stvp = make_vertex_shader(ctx->st, GL_FALSE);
+      stvp = st_make_passthrough_vertex_shader(ctx->st, GL_FALSE);
       color = NULL;
    }
 
@@ -1111,7 +1111,7 @@ st_Bitmap(GLcontext *ctx, GLint x, GLint y, GLsizei width, GLsizei height,
    struct st_context *st = ctx->st;
    struct pipe_texture *pt;
 
-   stvp = make_vertex_shader(ctx->st, GL_TRUE);
+   stvp = st_make_passthrough_vertex_shader(ctx->st, GL_TRUE);
    stfp = combined_bitmap_fragment_program(ctx);
 
    st_validate_state(st);
@@ -1229,13 +1229,13 @@ st_CopyPixels(GLcontext *ctx, GLint srcx, GLint srcy,
       rbRead = st_renderbuffer(ctx->ReadBuffer->_ColorReadBuffer);
       color = NULL;
       stfp = combined_drawpix_fragment_program(ctx);
-      stvp = make_vertex_shader(ctx->st, GL_FALSE);
+      stvp = st_make_passthrough_vertex_shader(ctx->st, GL_FALSE);
    }
    else {
       rbRead = st_renderbuffer(ctx->ReadBuffer->_DepthBuffer);
       color = ctx->Current.Attrib[VERT_ATTRIB_COLOR0];
       stfp = make_fragment_shader_z(ctx->st);
-      stvp = make_vertex_shader(ctx->st, GL_TRUE);
+      stvp = st_make_passthrough_vertex_shader(ctx->st, GL_TRUE);
    }
 
    psRead = rbRead->surface;
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.h b/src/mesa/state_tracker/st_cb_drawpixels.h
index 71ba487020..b8b906f06b 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.h
+++ b/src/mesa/state_tracker/st_cb_drawpixels.h
@@ -30,6 +30,10 @@
 #define ST_CB_DRAWPIXELS_H
 
 
+extern struct st_vertex_program *
+st_make_passthrough_vertex_shader(struct st_context *st, GLboolean passColor);
+
+
 extern void st_init_drawpixels_functions(struct dd_function_table *functions);
 
 
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index c9b8e78485..ae9f5c8b11 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -354,7 +354,8 @@ st_draw_vbo(GLcontext *ctx,
 void 
 st_draw_vertices(GLcontext *ctx, unsigned prim,
                  unsigned numVertex, float *verts,
-                 unsigned numAttribs)
+                 unsigned numAttribs,
+                 GLboolean inClipCoords)
 {
    const float width = ctx->DrawBuffer->Width;
    const float height = ctx->DrawBuffer->Height;
@@ -367,14 +368,16 @@ st_draw_vertices(GLcontext *ctx, unsigned prim,
 
    assert(numAttribs > 0);
 
-   /* convert to clip coords */
-   for (i = 0; i < numVertex; i++) {
-      float x = verts[i * numAttribs * 4 + 0];
-      float y = verts[i * numAttribs * 4 + 1];
-      x = x / width * 2.0 - 1.0;
-      y = y / height * 2.0 - 1.0;
-      verts[i * numAttribs * 4 + 0] = x;
-      verts[i * numAttribs * 4 + 1] = y;
+   if (!inClipCoords) {
+      /* convert to clip coords */
+      for (i = 0; i < numVertex; i++) {
+         float x = verts[i * numAttribs * 4 + 0];
+         float y = verts[i * numAttribs * 4 + 1];
+         x = x / width * 2.0 - 1.0;
+         y = y / height * 2.0 - 1.0;
+         verts[i * numAttribs * 4 + 0] = x;
+         verts[i * numAttribs * 4 + 1] = y;
+      }
    }
 
    /* XXX create one-time */
diff --git a/src/mesa/state_tracker/st_draw.h b/src/mesa/state_tracker/st_draw.h
index 89ee790c57..171bde57e5 100644
--- a/src/mesa/state_tracker/st_draw.h
+++ b/src/mesa/state_tracker/st_draw.h
@@ -62,7 +62,8 @@ st_feedback_draw_vbo(GLcontext *ctx,
 void 
 st_draw_vertices(GLcontext *ctx, unsigned prim,
                  unsigned numVertex, float *verts,
-                 unsigned numAttribs);
+                 unsigned numAttribs,
+                 GLboolean inClipCoords);
 
 
 #endif
-- 
cgit v1.2.3


From 62abcb9aacc33218d0143a743c738435794b32a9 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 8 Feb 2008 14:54:18 -0700
Subject: gallium: initial implementation of auto mipmap generation in state
 tracker

Use hardware rendering to compute/render mipmap levels.
The fallback path (which will be used for non-renderable texture formats)
isn't working yet.
---
 src/mesa/sources                       |   1 +
 src/mesa/state_tracker/st_cb_texture.c |  11 +-
 src/mesa/state_tracker/st_context.c    |   2 +
 src/mesa/state_tracker/st_gen_mipmap.c | 362 +++++++++++++++++++++++++++++++++
 src/mesa/state_tracker/st_gen_mipmap.h |  46 +++++
 5 files changed, 415 insertions(+), 7 deletions(-)
 create mode 100644 src/mesa/state_tracker/st_gen_mipmap.c
 create mode 100644 src/mesa/state_tracker/st_gen_mipmap.h

(limited to 'src')

diff --git a/src/mesa/sources b/src/mesa/sources
index c0087f76e6..84492c91ac 100644
--- a/src/mesa/sources
+++ b/src/mesa/sources
@@ -234,6 +234,7 @@ STATETRACKER_SOURCES = \
 	state_tracker/st_extensions.c \
 	state_tracker/st_format.c \
 	state_tracker/st_framebuffer.c \
+	state_tracker/st_gen_mipmap.c \
 	state_tracker/st_mesa_to_tgsi.c \
 	state_tracker/st_program.c \
 	state_tracker/st_texture.c
diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 8db4a804ab..3350254654 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -42,6 +42,7 @@
 #include "state_tracker/st_cb_texture.h"
 #include "state_tracker/st_format.h"
 #include "state_tracker/st_texture.h"
+#include "state_tracker/st_gen_mipmap.h"
 
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
@@ -703,13 +704,9 @@ st_TexImage(GLcontext * ctx,
       texImage->Data = NULL;
    }
 
-#if 0
-   /* GL_SGIS_generate_mipmap -- this can be accelerated now.
-    */
+#if 01
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      intel_generate_mipmap(ctx, target,
-                            &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                            texObj);
+      ctx->Driver.GenerateMipmap(ctx, target, texObj);
    }
 #endif
 }
@@ -1538,7 +1535,7 @@ st_init_texture_functions(struct dd_function_table *functions)
    functions->CopyTexSubImage1D = st_CopyTexSubImage1D;
    functions->CopyTexSubImage2D = st_CopyTexSubImage2D;
    functions->CopyTexSubImage3D = st_CopyTexSubImage3D;
-   functions->GenerateMipmap = _mesa_generate_mipmap;
+   functions->GenerateMipmap = st_generate_mipmap;
 
    functions->GetTexImage = st_GetTexImage;
 
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index 9c206c057a..bf4618bed8 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -49,6 +49,7 @@
 #include "st_atom.h"
 #include "st_draw.h"
 #include "st_extensions.h"
+#include "st_gen_mipmap.h"
 #include "st_program.h"
 #include "pipe/p_context.h"
 #include "pipe/p_winsys.h"
@@ -96,6 +97,7 @@ st_create_context_priv( GLcontext *ctx, struct pipe_context *pipe )
 
    st_init_atoms( st );
    st_init_draw( st );
+   st_init_generate_mipmap(st);
 
    /* we want all vertex data to be placed in buffer objects */
    vbo_use_buffer_objects(ctx);
diff --git a/src/mesa/state_tracker/st_gen_mipmap.c b/src/mesa/state_tracker/st_gen_mipmap.c
new file mode 100644
index 0000000000..16f9e4cd27
--- /dev/null
+++ b/src/mesa/state_tracker/st_gen_mipmap.c
@@ -0,0 +1,362 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "main/imports.h"
+#include "main/mipmap.h"
+#include "main/teximage.h"
+
+#include "shader/prog_instruction.h"
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/cso_cache/cso_cache.h"
+
+#include "st_context.h"
+#include "st_draw.h"
+#include "st_gen_mipmap.h"
+#include "st_program.h"
+#include "st_cb_texture.h"
+
+
+
+static void *blend_cso = NULL;
+static void *depthstencil_cso = NULL;
+static void *rasterizer_cso = NULL;
+static void *sampler_cso = NULL;
+
+static struct st_fragment_program *stfp = NULL;
+static struct st_vertex_program *stvp = NULL;
+
+
+
+/**
+ * Build the fragment program used to render mipmap levels:
+ *    TEX result.color, fragment.texcoord[0], texture[0], 2D; END;
+ * \return the new program, or NULL if allocation failed
+ */
+static struct st_fragment_program *
+make_tex_fragment_program(GLcontext *ctx)
+{
+   struct st_fragment_program *texProg;
+   struct gl_program *prog;
+   GLuint n = 0;
+
+   prog = ctx->Driver.NewProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, 0);
+   if (prog == NULL)
+      return NULL;
+
+   prog->NumInstructions = 2;   /* TEX + END */
+   prog->Instructions = _mesa_alloc_instructions(prog->NumInstructions);
+   if (prog->Instructions == NULL) {
+      ctx->Driver.DeleteProgram(ctx, prog);
+      return NULL;
+   }
+   _mesa_init_instructions(prog->Instructions, prog->NumInstructions);
+
+   /* [0]: sample the 2D texture on unit 0 at texcoord 0, write the color */
+   prog->Instructions[n].Opcode = OPCODE_TEX;
+   prog->Instructions[n].DstReg.File = PROGRAM_OUTPUT;
+   prog->Instructions[n].DstReg.Index = FRAG_RESULT_COLR;
+   prog->Instructions[n].SrcReg[0].File = PROGRAM_INPUT;
+   prog->Instructions[n].SrcReg[0].Index = FRAG_ATTRIB_TEX0;
+   prog->Instructions[n].TexSrcUnit = 0;
+   prog->Instructions[n].TexSrcTarget = TEXTURE_2D_INDEX;
+   n++;
+   /* [1]: end of program */
+   prog->Instructions[n++].Opcode = OPCODE_END;
+   assert(n == prog->NumInstructions);
+
+   prog->InputsRead = FRAG_BIT_TEX0;
+   prog->OutputsWritten = (1 << FRAG_RESULT_COLR);
+
+   texProg = (struct st_fragment_program *) prog;
+   st_translate_fragment_program(ctx->st, texProg, NULL,
+                                 texProg->tokens, ST_MAX_SHADER_TOKENS);
+   return texProg;
+}
+
+
+
+
+/**
+ * One-time init for generate mipmap: creates the CSOs and shaders that
+ * st_render_mipmap() binds.  XXX Note: other code may need no-op/simple
+ * state like this too; in that case some refactoring would be good.
+ */
+void
+st_init_generate_mipmap(struct st_context *st)
+{
+   struct pipe_context *pipe = st->pipe;
+   struct pipe_depth_stencil_alpha_state dsaState;
+   struct pipe_rasterizer_state rastState;
+   struct pipe_sampler_state samplerState;
+   struct pipe_blend_state blendState;
+
+   assert(!blend_cso);   /* the CSOs must not exist yet */
+
+   /* zeroed blend state with colormask = RGBA */
+   memset(&blendState, 0, sizeof blendState);
+   blendState.colormask = PIPE_MASK_RGBA;
+   blend_cso = pipe->create_blend_state(pipe, &blendState);
+   /* all-zero depth/stencil/alpha and rasterizer state */
+   memset(&dsaState, 0, sizeof dsaState);
+   depthstencil_cso = pipe->create_depth_stencil_alpha_state(pipe, &dsaState);
+   memset(&rastState, 0, sizeof rastState);
+   rasterizer_cso = pipe->create_rasterizer_state(pipe, &rastState);
+   /* sampler: clamp to edge, linear min/mag filter, no mip filter */
+   memset(&samplerState, 0, sizeof samplerState);
+   samplerState.normalized_coords = 1;
+   samplerState.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   samplerState.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   samplerState.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   samplerState.min_img_filter = PIPE_TEX_FILTER_LINEAR;
+   samplerState.mag_img_filter = PIPE_TEX_FILTER_LINEAR;
+   samplerState.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
+   sampler_cso = pipe->create_sampler_state(pipe, &samplerState);
+
+   stfp = make_tex_fragment_program(st->ctx);
+   stvp = st_make_passthrough_vertex_shader(st, GL_FALSE);
+}
+
+
+/** Delete the CSOs created by st_init_generate_mipmap(). */
+void
+st_destroy_generate_mipmpap(struct st_context *st)
+{
+   struct pipe_context *pipe = st->pipe;
+
+   pipe->delete_blend_state(pipe, blend_cso);
+   blend_cso = NULL;
+   pipe->delete_depth_stencil_alpha_state(pipe, depthstencil_cso);
+   depthstencil_cso = NULL;
+   pipe->delete_rasterizer_state(pipe, rasterizer_cso);
+   rasterizer_cso = NULL;
+   pipe->delete_sampler_state(pipe, sampler_cso);
+   sampler_cso = NULL;
+
+   /* XXX free stfp, stvp */
+}
+
+
+/** Set a viewport mapping [-1,1]x[-1,1] onto width x height, Y axis flipped. */
+static void
+simple_viewport(struct pipe_context *pipe, uint width, uint height)
+{
+   const double halfWidth = 0.5 * width, halfHeight = 0.5 * height;
+   struct pipe_viewport_state vp;
+
+   vp.scale[0] = halfWidth;
+   vp.scale[1] = -halfHeight;
+   vp.scale[2] = vp.scale[3] = 1.0;
+   vp.translate[0] = halfWidth;
+   vp.translate[1] = halfHeight;
+   vp.translate[2] = vp.translate[3] = 0.0;
+
+   pipe->set_viewport_state(pipe, &vp);
+}
+
+
+
+/**
+ * Draw a quad covering [-1,1]x[-1,1] with st_draw_vertices().
+ *
+ * Every vertex carries two 4-component attributes:
+ *   attr[0] = position (x, y, 0, 1)
+ *   attr[1] = texcoord (s, t, 0, 1)
+ * s runs 0..1 as x runs -1..1, and t runs 1..0 as y runs -1..1.
+ * The vertex array is a local that is filled in on every call.
+ */
+static void
+draw_quad(GLcontext *ctx)
+{
+   /* the four corners, in the order the vertices are emitted */
+   static const struct {
+      GLfloat x, y;   /* position */
+      GLfloat s, t;   /* texcoord */
+   } corners[4] = {
+      { -1.0, -1.0,  0.0, 1.0 },
+      {  1.0, -1.0,  1.0, 1.0 },
+      {  1.0,  1.0,  1.0, 0.0 },
+      { -1.0,  1.0,  0.0, 0.0 }
+   };
+
+   GLfloat verts[4][2][4]; /* four verts, two attribs, XYZW */
+   GLuint v;
+
+   for (v = 0; v < 4; v++) {
+      GLfloat *pos = verts[v][0];
+      GLfloat *tex = verts[v][1];
+
+      /* attr[0]: position */
+      pos[0] = corners[v].x;
+      pos[1] = corners[v].y;
+      pos[2] = 0.0; /*Z*/
+      pos[3] = 1.0; /*W*/
+
+      /* attr[1]: texcoord */
+      tex[0] = corners[v].s;
+      tex[1] = corners[v].t;
+      tex[2] = 0.0; /*R*/
+      tex[3] = 1.0; /*Q*/
+   }
+
+   /* hand the whole array to st_draw_vertices() as one quad primitive */
+   st_draw_vertices(ctx, PIPE_PRIM_QUADS, 4,
+                    (float *) verts, 2, GL_TRUE);
+}
+
+
+
+/**
+ * Generate mipmap levels using hardware rendering: each level in
+ * baseLevel+1..lastLevel is drawn as a quad sampling the previous level.
+ * \return TRUE if successful, FALSE if pt's format can't be rendered to
+ */
+static boolean
+st_render_mipmap(struct st_context *st,
+                 struct pipe_texture *pt,
+                 uint baseLevel, uint lastLevel)
+{
+   struct pipe_context *pipe = st->pipe;
+   struct pipe_framebuffer_state fb;
+   const uint face = 0, zslice = 0;
+   const uint first_level_save = pt->first_level;
+   uint dstLevel;
+
+   /* check if we can render in the texture's format */
+   if (!pipe->is_format_supported(pipe, pt->format, PIPE_SURFACE)) {
+      return FALSE;
+   }
+
+   /* init framebuffer state */
+   memset(&fb, 0, sizeof(fb));
+   fb.num_cbufs = 1;
+
+   /* bind CSOs */
+   pipe->bind_blend_state(pipe, blend_cso);
+   pipe->bind_depth_stencil_alpha_state(pipe, depthstencil_cso);
+   pipe->bind_rasterizer_state(pipe, rasterizer_cso);
+   pipe->bind_sampler_state(pipe, 0, sampler_cso);
+
+   /* bind shaders */
+   pipe->bind_fs_state(pipe, stfp->fs->data);
+   pipe->bind_vs_state(pipe, stvp->cso->data);
+
+   for (dstLevel = baseLevel + 1; dstLevel <= lastLevel; dstLevel++) {
+      const uint srcLevel = dstLevel - 1;
+      /* Setup framebuffer / dest surface */
+      /* NOTE(review): the surface is never released here - confirm owner */
+      fb.cbufs[0] = pipe->get_tex_surface(pipe, pt, face, dstLevel, zslice);
+      pipe->set_framebuffer_state(pipe, &fb);
+      simple_viewport(pipe, pt->width[dstLevel], pt->height[dstLevel]);
+
+      /* sample from srcLevel by overriding pt->first_level */
+      pt->first_level = srcLevel;
+      pipe->set_sampler_texture(pipe, 0, pt);
+
+      draw_quad(st->ctx);
+   }
+
+   /* restore first_level */
+   pt->first_level = first_level_save;
+
+   /* restore all pipe state overridden above (incl. blend, depth, fb) */
+   if (st->state.blend)
+      pipe->bind_blend_state(pipe, st->state.blend->data);
+   if (st->state.depth_stencil)
+      pipe->bind_depth_stencil_alpha_state(pipe, st->state.depth_stencil->data);
+   if (st->state.rasterizer)
+      pipe->bind_rasterizer_state(pipe, st->state.rasterizer->data);
+   if (st->state.fs)
+      pipe->bind_fs_state(pipe, st->state.fs->data);
+   if (st->state.vs)
+      pipe->bind_vs_state(pipe, st->state.vs->cso->data);
+   if (st->state.sampler[0])
+      pipe->bind_sampler_state(pipe, 0, st->state.sampler[0]->data);
+   pipe->set_sampler_texture(pipe, 0, st->state.sampler_texture[0]);
+   pipe->set_framebuffer_state(pipe, &st->state.framebuffer);
+   pipe->set_viewport_state(pipe, &st->state.viewport);
+
+   return TRUE;
+}
+
+
+
+/**
+ * Driver hook for GenerateMipmap: render the levels with st_render_mipmap(),
+ * then re-init each generated gl_texture_image and point it at the texture.
+ */
+void
+st_generate_mipmap(GLcontext *ctx, GLenum target,
+                   struct gl_texture_object *texObj)
+{
+   struct st_context *st = ctx->st;
+   struct pipe_texture *pt = st_get_texobj_texture(texObj);
+   const uint baseLevel = texObj->BaseLevel;
+   const uint lastLevel = pt->last_level;
+   uint dstLevel;
+
+   if (!st_render_mipmap(st, pt, baseLevel, lastLevel)) {
+      abort();
+      /* XXX the following won't really work at this time */
+      _mesa_generate_mipmap(ctx, target, texObj);
+      return;
+   }
+
+   for (dstLevel = baseLevel + 1; dstLevel <= lastLevel; dstLevel++) {
+      const uint srcLevel = dstLevel - 1;
+      const struct gl_texture_image *srcImage
+         = _mesa_get_tex_image(ctx, texObj, target, srcLevel);
+      struct gl_texture_image *dstImage
+         = _mesa_get_tex_image(ctx, texObj, target, dstLevel);
+      struct st_texture_image *stImage;
+      uint dstWidth = pt->width[dstLevel];
+      uint dstHeight = pt->height[dstLevel];
+      uint dstDepth = pt->depth[dstLevel];
+
+      /* either lookup may return NULL; both images are dereferenced below */
+      if (!srcImage || !dstImage) {
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "generating mipmaps");
+         return;
+      }
+
+      if (dstImage->ImageOffsets)
+         _mesa_free(dstImage->ImageOffsets);
+      /* Free old image data */
+      if (dstImage->Data)
+         ctx->Driver.FreeTexImageData(ctx, dstImage);
+
+      /* initialize new image */
+      _mesa_init_teximage_fields(ctx, target, dstImage, dstWidth, dstHeight,
+                                 dstDepth, srcImage->Border,
+                                 srcImage->InternalFormat);
+      dstImage->TexFormat = srcImage->TexFormat;
+      stImage = (struct st_texture_image *) dstImage;
+      stImage->pt = pt;
+   }
+}
diff --git a/src/mesa/state_tracker/st_gen_mipmap.h b/src/mesa/state_tracker/st_gen_mipmap.h
new file mode 100644
index 0000000000..7668c1e44e
--- /dev/null
+++ b/src/mesa/state_tracker/st_gen_mipmap.h
@@ -0,0 +1,46 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef ST_GEN_MIPMAP_H
+#define ST_GEN_MIPMAP_H
+
+/** One-time init of the state objects and shaders for mipmap generation. */
+extern void
+st_init_generate_mipmap(struct st_context *st);
+
+/** Delete the state objects created by st_init_generate_mipmap(). */
+extern void
+st_destroy_generate_mipmpap(struct st_context *st);
+
+/** Driver.GenerateMipmap hook: generate the mipmap levels of texObj. */
+extern void
+st_generate_mipmap(GLcontext *ctx, GLenum target,
+                   struct gl_texture_object *texObj);
+
+
+#endif /* ST_GEN_MIPMAP_H */
-- 
cgit v1.2.3


From 9f9c3b752a82d216d4655d6dcd07361f66f8a4c8 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 8 Feb 2008 14:55:33 -0700
Subject: fix comment typos

---
 src/mesa/pipe/tgsi/exec/tgsi_exec.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src')

diff --git a/src/mesa/pipe/tgsi/exec/tgsi_exec.c b/src/mesa/pipe/tgsi/exec/tgsi_exec.c
index 463ff0d9da..336ae1c8b6 100644
--- a/src/mesa/pipe/tgsi/exec/tgsi_exec.c
+++ b/src/mesa/pipe/tgsi/exec/tgsi_exec.c
@@ -2010,7 +2010,7 @@ exec_instruction(
 
    case TGSI_OPCODE_TXB:
       /* Texture lookup with lod bias */
-      /* src[0] = texcoord (src[0].w = load bias) */
+      /* src[0] = texcoord (src[0].w = LOD bias) */
       /* src[1] = sampler unit */
       exec_tex(mach, inst, TRUE);
       break;
@@ -2026,7 +2026,7 @@ exec_instruction(
 
    case TGSI_OPCODE_TXL:
       /* Texture lookup with explit LOD */
-      /* src[0] = texcoord (src[0].w = load bias) */
+      /* src[0] = texcoord (src[0].w = LOD) */
       /* src[1] = sampler unit */
       exec_tex(mach, inst, TRUE);
       break;
-- 
cgit v1.2.3


From 479b5e9b5d9e0e387332c6fbeaffffa7612a0c52 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Fri, 8 Feb 2008 14:56:38 -0700
Subject: gallium: include st_cb_drawpixels.h

---
 src/mesa/state_tracker/st_gen_mipmap.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'src')

diff --git a/src/mesa/state_tracker/st_gen_mipmap.c b/src/mesa/state_tracker/st_gen_mipmap.c
index 16f9e4cd27..a6ac9a55fb 100644
--- a/src/mesa/state_tracker/st_gen_mipmap.c
+++ b/src/mesa/state_tracker/st_gen_mipmap.c
@@ -40,6 +40,7 @@
 #include "st_draw.h"
 #include "st_gen_mipmap.h"
 #include "st_program.h"
+#include "st_cb_drawpixels.h"
 #include "st_cb_texture.h"
 
 
-- 
cgit v1.2.3