From 1ca512c643553bd3504abd258ab80b7a550ab292 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 25 Sep 2008 11:46:27 -0600
Subject: mesa: fix default buffer object access value

---
 src/mesa/main/bufferobj.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index ecdb4d219c..dd4ac4679e 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -38,6 +38,13 @@
 #include "bufferobj.h"
 
 
+#ifdef FEATURE_OES_mapbuffer
+#define DEFAULT_ACCESS GL_WRITE_ONLY   /* initial/reset value of gl_buffer_object::Access; no trailing ';' so it expands to a plain expression */
+#else
+#define DEFAULT_ACCESS GL_READ_WRITE   /* initial/reset value of gl_buffer_object::Access; no trailing ';' so it expands to a plain expression */
+#endif
+
+
 /**
  * Get the buffer object bound to the specified target in a GL context.
  *
@@ -255,7 +262,7 @@ _mesa_initialize_buffer_object( struct gl_buffer_object *obj,
    obj->RefCount = 1;
    obj->Name = name;
    obj->Usage = GL_STATIC_DRAW_ARB;
-   obj->Access = GL_READ_WRITE_ARB;
+   obj->Access = DEFAULT_ACCESS;
 }
 
 
@@ -1037,7 +1044,7 @@ _mesa_UnmapBufferARB(GLenum target)
       status = ctx->Driver.UnmapBuffer( ctx, target, bufObj );
    }
 
-   bufObj->Access = GL_READ_WRITE_ARB; /* initial value, OK? */
+   bufObj->Access = DEFAULT_ACCESS;
    bufObj->Pointer = NULL;
 
    return status;
-- 
cgit v1.2.3


From 006fb638188f083d64a2427cd28979b432622f3e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 25 Sep 2008 18:27:22 -0600
Subject: mesa: fix swizzle failure, fix typo

---
 src/mesa/main/texenvprogram.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/mesa/main/texenvprogram.c b/src/mesa/main/texenvprogram.c
index 6877ef96f2..c699c43429 100644
--- a/src/mesa/main/texenvprogram.c
+++ b/src/mesa/main/texenvprogram.c
@@ -375,7 +375,7 @@ static struct ureg get_tex_temp( struct texenv_fragment_program *p )
 {
    int bit;
    
-   /* First try to find availble temp not previously used (to avoid
+   /* First try to find available temp not previously used (to avoid
     * starting a new texture indirection).  According to the spec, the
     * ~p->temps_output isn't necessary, but will keep it there for
     * now:
@@ -575,14 +575,16 @@ static struct ureg register_const4f( struct texenv_fragment_program *p,
 {
    GLfloat values[4];
    GLuint idx, swizzle;
+   struct ureg r;
    values[0] = s0;
    values[1] = s1;
    values[2] = s2;
    values[3] = s3;
    idx = _mesa_add_unnamed_constant( p->program->Base.Parameters, values, 4,
                                      &swizzle );
-   ASSERT(swizzle == SWIZZLE_NOOP);
-   return make_ureg(PROGRAM_CONSTANT, idx);
+   r = make_ureg(PROGRAM_CONSTANT, idx);
+   r.swz = swizzle;
+   return r;
 }
 
 #define register_scalar_const(p, s0)    register_const4f(p, s0, s0, s0, s0)
-- 
cgit v1.2.3


From 3f99f501db2582e241851e63e432c18e2de415be Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 25 Sep 2008 18:40:16 -0600
Subject: mesa: increase MAX_INSTRUCTIONS

---
 src/mesa/main/texenvprogram.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/mesa/main/texenvprogram.c b/src/mesa/main/texenvprogram.c
index c699c43429..c64d88faf9 100644
--- a/src/mesa/main/texenvprogram.c
+++ b/src/mesa/main/texenvprogram.c
@@ -37,11 +37,9 @@
 #include "texenvprogram.h"
 
 /**
- * This MAX is probably a bit generous, but that's OK.  There can be
- * up to four instructions per texture unit (TEX + 3 for combine),
- * then there's fog and specular add.
+ * Up to nine instructions per tex unit, plus fog, specular color.
  */
-#define MAX_INSTRUCTIONS ((MAX_TEXTURE_UNITS * 4) + 12)
+#define MAX_INSTRUCTIONS ((MAX_TEXTURE_UNITS * 9) + 12)
 
 #define DISASSEM (MESA_VERBOSE & VERBOSE_DISASSEM)
 
-- 
cgit v1.2.3


From 092748990f75a0348f24a40e92872f08a9958e66 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 25 Sep 2008 19:22:29 -0600
Subject: mesa: fix/simplify initialization of vertex/fragment program limits

Defaults for program length, num ALU instructions, num indirections, etc.
basically indicate no limit for software rendering.  Driver should override
as needed.
---
 src/mesa/main/config.h  |  4 +---
 src/mesa/main/context.c | 53 ++++++++++++++++++++++++-------------------------
 2 files changed, 27 insertions(+), 30 deletions(-)

diff --git a/src/mesa/main/config.h b/src/mesa/main/config.h
index f8109ec755..5e9a4f8939 100644
--- a/src/mesa/main/config.h
+++ b/src/mesa/main/config.h
@@ -176,13 +176,11 @@
 /** For GL_ARB_fragment_program */
 /*@{*/
 #define MAX_FRAGMENT_PROGRAM_ADDRESS_REGS 0
-#define MAX_FRAGMENT_PROGRAM_ALU_INSTRUCTIONS 48
-#define MAX_FRAGMENT_PROGRAM_TEX_INSTRUCTIONS 24
-#define MAX_FRAGMENT_PROGRAM_TEX_INDIRECTIONS  4
 /*@}*/
 
 /** For any program target/extension */
 /*@{*/
+#define MAX_PROGRAM_INSTRUCTIONS  (16 * 1024)
 #define MAX_PROGRAM_LOCAL_PARAMS 128 /* KW: power of two */
 #define MAX_PROGRAM_ENV_PARAMS 128
 #define MAX_PROGRAM_MATRICES 8
diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c
index ed3faecf0d..144da61384 100644
--- a/src/mesa/main/context.c
+++ b/src/mesa/main/context.c
@@ -819,11 +819,33 @@ _mesa_init_current(GLcontext *ctx)
 
 
 /**
- * Init vertex/fragment program native limits from logical limits.
+ * Init vertex/fragment program limits.
+ * Important: drivers should override these with actual limits.
  */
 static void
-init_natives(struct gl_program_constants *prog)
+init_program_limits(GLenum type, struct gl_program_constants *prog)
 {
+   prog->MaxInstructions = MAX_PROGRAM_INSTRUCTIONS;
+   prog->MaxAluInstructions = MAX_PROGRAM_INSTRUCTIONS;
+   prog->MaxTexInstructions = MAX_PROGRAM_INSTRUCTIONS;
+   prog->MaxTexIndirections = MAX_PROGRAM_INSTRUCTIONS;
+   prog->MaxTemps = MAX_PROGRAM_TEMPS;
+   prog->MaxEnvParams = MAX_PROGRAM_ENV_PARAMS;
+   prog->MaxLocalParams = MAX_PROGRAM_LOCAL_PARAMS;
+   prog->MaxUniformComponents = 4 * MAX_UNIFORMS;
+
+   if (type == GL_VERTEX_PROGRAM_ARB) {
+      prog->MaxParameters = MAX_NV_VERTEX_PROGRAM_PARAMS;
+      prog->MaxAttribs = MAX_NV_VERTEX_PROGRAM_INPUTS;
+      prog->MaxAddressRegs = MAX_VERTEX_PROGRAM_ADDRESS_REGS;
+   }
+   else {
+      prog->MaxParameters = MAX_NV_FRAGMENT_PROGRAM_PARAMS;
+      prog->MaxAttribs = MAX_NV_FRAGMENT_PROGRAM_INPUTS;
+      prog->MaxAddressRegs = MAX_FRAGMENT_PROGRAM_ADDRESS_REGS;
+   }
+
+   /* copy the above limits to init native limits */
    prog->MaxNativeInstructions = prog->MaxInstructions;
    prog->MaxNativeAluInstructions = prog->MaxAluInstructions;
    prog->MaxNativeTexInstructions = prog->MaxTexInstructions;
@@ -885,33 +907,10 @@ _mesa_init_constants(GLcontext *ctx)
    ctx->Const.MaxViewportWidth = MAX_WIDTH;
    ctx->Const.MaxViewportHeight = MAX_HEIGHT;
 #if FEATURE_ARB_vertex_program
-   ctx->Const.VertexProgram.MaxInstructions = MAX_NV_VERTEX_PROGRAM_INSTRUCTIONS;
-   ctx->Const.VertexProgram.MaxAluInstructions = 0;
-   ctx->Const.VertexProgram.MaxTexInstructions = 0;
-   ctx->Const.VertexProgram.MaxTexIndirections = 0;
-   ctx->Const.VertexProgram.MaxAttribs = MAX_NV_VERTEX_PROGRAM_INPUTS;
-   ctx->Const.VertexProgram.MaxTemps = MAX_PROGRAM_TEMPS;
-   ctx->Const.VertexProgram.MaxParameters = MAX_NV_VERTEX_PROGRAM_PARAMS;
-   ctx->Const.VertexProgram.MaxLocalParams = MAX_PROGRAM_LOCAL_PARAMS;
-   ctx->Const.VertexProgram.MaxEnvParams = MAX_PROGRAM_ENV_PARAMS;
-   ctx->Const.VertexProgram.MaxAddressRegs = MAX_VERTEX_PROGRAM_ADDRESS_REGS;
-   ctx->Const.VertexProgram.MaxUniformComponents = 4 * MAX_UNIFORMS;
-   init_natives(&ctx->Const.VertexProgram);
+   init_program_limits(GL_VERTEX_PROGRAM_ARB, &ctx->Const.VertexProgram);
 #endif
-
 #if FEATURE_ARB_fragment_program
-   ctx->Const.FragmentProgram.MaxInstructions = MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS;
-   ctx->Const.FragmentProgram.MaxAluInstructions = MAX_FRAGMENT_PROGRAM_ALU_INSTRUCTIONS;
-   ctx->Const.FragmentProgram.MaxTexInstructions = MAX_FRAGMENT_PROGRAM_TEX_INSTRUCTIONS;
-   ctx->Const.FragmentProgram.MaxTexIndirections = MAX_FRAGMENT_PROGRAM_TEX_INDIRECTIONS;
-   ctx->Const.FragmentProgram.MaxAttribs = MAX_NV_FRAGMENT_PROGRAM_INPUTS;
-   ctx->Const.FragmentProgram.MaxTemps = MAX_PROGRAM_TEMPS;
-   ctx->Const.FragmentProgram.MaxParameters = MAX_NV_FRAGMENT_PROGRAM_PARAMS;
-   ctx->Const.FragmentProgram.MaxLocalParams = MAX_PROGRAM_LOCAL_PARAMS;
-   ctx->Const.FragmentProgram.MaxEnvParams = MAX_PROGRAM_ENV_PARAMS;
-   ctx->Const.FragmentProgram.MaxAddressRegs = MAX_FRAGMENT_PROGRAM_ADDRESS_REGS;
-   ctx->Const.FragmentProgram.MaxUniformComponents = 4 * MAX_UNIFORMS;
-   init_natives(&ctx->Const.FragmentProgram);
+   init_program_limits(GL_FRAGMENT_PROGRAM_ARB, &ctx->Const.FragmentProgram);
 #endif
    ctx->Const.MaxProgramMatrices = MAX_PROGRAM_MATRICES;
    ctx->Const.MaxProgramMatrixStackDepth = MAX_PROGRAM_MATRIX_STACK_DEPTH;
-- 
cgit v1.2.3


From 1ef90b3f9af12102101e5d30b2d73cbdabd86f24 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 18 Sep 2008 18:31:47 -0600
Subject: mesa: add some braces

---
 src/mesa/shader/prog_execute.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/mesa/shader/prog_execute.c b/src/mesa/shader/prog_execute.c
index 5afd9eb153..768e936d5f 100644
--- a/src/mesa/shader/prog_execute.c
+++ b/src/mesa/shader/prog_execute.c
@@ -81,11 +81,12 @@ get_register_pointer(const struct prog_src_register *source,
 {
    if (source->RelAddr) {
       const GLint reg = source->Index + machine->AddressReg[0][0];
-      if (source->File == PROGRAM_ENV_PARAM)
+      if (source->File == PROGRAM_ENV_PARAM) {
          if (reg < 0 || reg >= MAX_PROGRAM_ENV_PARAMS)
             return ZeroVec;
          else
             return machine->EnvParams[reg];
+      }
       else {
          const struct gl_program_parameter_list *params;
          ASSERT(source->File == PROGRAM_LOCAL_PARAM ||
-- 
cgit v1.2.3


From 13a8c18d3d0ca505f053a15fc664a705cbc8be84 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 19 Sep 2008 09:25:32 -0600
Subject: gallium: rename tgsi_translate_mesa_program() to
 st_translate_mesa_program()

---
 src/mesa/state_tracker/st_mesa_to_tgsi.c | 2 +-
 src/mesa/state_tracker/st_mesa_to_tgsi.h | 2 +-
 src/mesa/state_tracker/st_program.c      | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mesa/state_tracker/st_mesa_to_tgsi.c b/src/mesa/state_tracker/st_mesa_to_tgsi.c
index 524d8890b5..50e638df46 100644
--- a/src/mesa/state_tracker/st_mesa_to_tgsi.c
+++ b/src/mesa/state_tracker/st_mesa_to_tgsi.c
@@ -713,7 +713,7 @@ find_temporaries(const struct gl_program *program,
  * \return number of tokens placed in 'tokens' buffer, or zero if error
  */
 GLuint
-tgsi_translate_mesa_program(
+st_translate_mesa_program(
    uint procType,
    const struct gl_program *program,
    GLuint numInputs,
diff --git a/src/mesa/state_tracker/st_mesa_to_tgsi.h b/src/mesa/state_tracker/st_mesa_to_tgsi.h
index f17f2eac96..77c74644b8 100644
--- a/src/mesa/state_tracker/st_mesa_to_tgsi.h
+++ b/src/mesa/state_tracker/st_mesa_to_tgsi.h
@@ -40,7 +40,7 @@ struct tgsi_token;
 struct gl_program;
 
 GLuint
-tgsi_translate_mesa_program(
+st_translate_mesa_program(
    uint procType,
    const struct gl_program *program,
    GLuint numInputs,
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index b2abf0286e..55b52c3745 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -307,7 +307,7 @@ st_translate_vertex_program(struct st_context *st,
 
    /* XXX: fix static allocation of tokens:
     */
-   num_tokens = tgsi_translate_mesa_program( TGSI_PROCESSOR_VERTEX,
+   num_tokens = st_translate_mesa_program( TGSI_PROCESSOR_VERTEX,
                                 &stvp->Base.Base,
                                 /* inputs */
                                 vs_num_inputs,
@@ -481,7 +481,7 @@ st_translate_fragment_program(struct st_context *st,
 
    /* XXX: fix static allocation of tokens:
     */
-   num_tokens = tgsi_translate_mesa_program( TGSI_PROCESSOR_FRAGMENT,
+   num_tokens = st_translate_mesa_program( TGSI_PROCESSOR_FRAGMENT,
                                 &stfp->Base.Base,
                                 /* inputs */
                                 fs_num_inputs,
-- 
cgit v1.2.3


From ee80c64be8ebeebc1cf4c4913049eb07b9326b96 Mon Sep 17 00:00:00 2001
From: Brad King <brad.king@kitware.com>
Date: Fri, 26 Sep 2008 07:40:05 -0600
Subject: mesa: fix param indexing

---
 src/mesa/main/texparam.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/mesa/main/texparam.c b/src/mesa/main/texparam.c
index 664adadfb9..acddb6663b 100644
--- a/src/mesa/main/texparam.c
+++ b/src/mesa/main/texparam.c
@@ -903,9 +903,9 @@ _mesa_GetTexParameterfv( GLenum target, GLenum pname, GLfloat *params )
 #ifdef FEATURE_OES_draw_texture
       case GL_TEXTURE_CROP_RECT_OES:
          params[0] = obj->CropRect[0];
-         params[0] = obj->CropRect[1];
-         params[0] = obj->CropRect[2];
-         params[0] = obj->CropRect[3];
+         params[1] = obj->CropRect[1];
+         params[2] = obj->CropRect[2];
+         params[3] = obj->CropRect[3];
          break;
 #endif
       default:
@@ -1053,9 +1053,9 @@ _mesa_GetTexParameteriv( GLenum target, GLenum pname, GLint *params )
 #ifdef FEATURE_OES_draw_texture
       case GL_TEXTURE_CROP_RECT_OES:
          params[0] = obj->CropRect[0];
-         params[0] = obj->CropRect[1];
-         params[0] = obj->CropRect[2];
-         params[0] = obj->CropRect[3];
+         params[1] = obj->CropRect[1];
+         params[2] = obj->CropRect[2];
+         params[3] = obj->CropRect[3];
          break;
 #endif
       default:
-- 
cgit v1.2.3


From 4bc39c58eb7fdf3a0be62bed666998a0d1789dcf Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 26 Sep 2008 07:40:45 -0600
Subject: mesa: fix assertion in _mesa_reference_program()

---
 src/mesa/shader/program.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/mesa/shader/program.c b/src/mesa/shader/program.c
index b3618641e5..738891a029 100644
--- a/src/mesa/shader/program.c
+++ b/src/mesa/shader/program.c
@@ -372,7 +372,11 @@ _mesa_reference_program(GLcontext *ctx,
    assert(ptr);
    if (*ptr && prog) {
       /* sanity check */
-      ASSERT((*ptr)->Target == prog->Target);
+      if ((*ptr)->Target == GL_VERTEX_PROGRAM_ARB)
+         ASSERT(prog->Target == GL_VERTEX_PROGRAM_ARB);
+      else if ((*ptr)->Target == GL_FRAGMENT_PROGRAM_ARB)
+         ASSERT(prog->Target == GL_FRAGMENT_PROGRAM_ARB ||
+                prog->Target == GL_FRAGMENT_PROGRAM_NV);
    }
    if (*ptr == prog) {
       return;  /* no change */
-- 
cgit v1.2.3


From a4a5a37f2760eca97b85f699c932c746da4d8e6c Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 26 Sep 2008 07:45:06 -0600
Subject: mesa: remove invalid assertions that programs have parameters

Fixes failure with demos/fplight.c
---
 src/mesa/main/state.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/mesa/main/state.c b/src/mesa/main/state.c
index d233201b0b..48656bd35e 100644
--- a/src/mesa/main/state.c
+++ b/src/mesa/main/state.c
@@ -258,12 +258,6 @@ update_program(GLcontext *ctx)
       }
    }
 
-   if (ctx->VertexProgram._Current)
-      assert(ctx->VertexProgram._Current->Base.Parameters);
-   if (ctx->FragmentProgram._Current)
-      assert(ctx->FragmentProgram._Current->Base.Parameters);
-
-
    /* XXX: get rid of _Active flag.
     */
 #if 1
-- 
cgit v1.2.3


From 8338cc25f913c809bec20f190a0e9f9bf8129aea Mon Sep 17 00:00:00 2001
From: Jeremy Huddleston <jeremyhu@freedesktop.org>
Date: Fri, 26 Sep 2008 12:36:42 -0700
Subject: configs: darwin: Don't build GLw (cherry picked from commit
 ef688ba1ee366a8937a41075cbe8b76a9bf75013)

---
 configs/darwin | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/configs/darwin b/configs/darwin
index aa9efb66f5..42897ac033 100644
--- a/configs/darwin
+++ b/configs/darwin
@@ -12,10 +12,10 @@ CXX = gcc
 PIC_FLAGS = -fPIC
 DEFINES =  -D_DARWIN_C_SOURCE -D_POSIX_SOURCE -D_POSIX_C_SOURCE=199309L \
 	   -D_BSD_SOURCE -D_SVID_SOURCE -D_GNU_SOURCE -DPTHREADS \
-	   -DGLX_INDIRECT_RENDERING \
-	   -DGLX_ALIAS_UNSUPPORTED
+	   -DGLX_ALIAS_UNSUPPORTED -DGLX_INDIRECT_RENDERING
 
 # -DGLX_DIRECT_RENDERING - pulls in libdrm stuff in glx/x11
+# -DGLX_USE_APPLEGL      - supposed to be used with GLX_DIRECT_RENDERING to use AGL rather than DRM, but doesn't compile
 # -DIN_DRI_DRIVER
 
 ARCH_FLAGS += $(RC_CFLAGS)
@@ -47,7 +47,7 @@ GLW_LIB_DEPS = -L$(TOP)/$(LIB_DIR) -l$(GL_LIB) -L$(INSTALL_DIR)/$(LIB_DIR) -lX11
 APP_LIB_DEPS = -L$(TOP)/lib -l$(GLUT_LIB) -l$(GLU_LIB) -l$(GL_LIB) -L$(INSTALL_DIR)/$(LIB_DIR) -lX11 -lXmu -lXt -lXi -lm
 
 # omit glw lib for now:
-SRC_DIRS = glx/x11 mesa glu glw glut/glx
+SRC_DIRS = glx/x11 mesa glu glut/glx
 GLU_DIRS = sgi
 DRIVER_DIRS = osmesa
 #DRIVER_DIRS = dri
-- 
cgit v1.2.3


From 7d99ddcb2bb09f1f54d91e6e20e42d217a5bccdf Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 26 Sep 2008 12:48:23 -0700
Subject: intel: Fix a number of memory leaks on context destroy.

---
 src/mesa/drivers/dri/i915/i830_vtbl.c       |  7 ++++++
 src/mesa/drivers/dri/i915/i915_vtbl.c       |  7 ++++++
 src/mesa/drivers/dri/i965/brw_draw.c        | 10 +++++++++
 src/mesa/drivers/dri/i965/brw_state_cache.c |  5 +++--
 src/mesa/drivers/dri/i965/brw_vtbl.c        | 34 +++++++++++++++++++++++++++++
 src/mesa/drivers/dri/intel/intel_context.c  | 12 ++++++++++
 src/mesa/drivers/dri/intel/intel_context.h  |  1 -
 src/mesa/drivers/dri/intel/intel_regions.c  | 10 +++++++++
 8 files changed, 83 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i915/i830_vtbl.c b/src/mesa/drivers/dri/i915/i830_vtbl.c
index 0ab27704d5..773a8b4dd0 100644
--- a/src/mesa/drivers/dri/i915/i830_vtbl.c
+++ b/src/mesa/drivers/dri/i915/i830_vtbl.c
@@ -566,6 +566,13 @@ i830_destroy_context(struct intel_context *intel)
    GLuint i;
    struct i830_context *i830 = i830_context(&intel->ctx);
 
+   intel_region_release(&i830->state.draw_region);
+   intel_region_release(&i830->state.depth_region);
+   intel_region_release(&i830->meta.draw_region);
+   intel_region_release(&i830->meta.depth_region);
+   intel_region_release(&i830->initial.draw_region);
+   intel_region_release(&i830->initial.depth_region);
+
    for (i = 0; i < I830_TEX_UNITS; i++) {
       if (i830->state.tex_buffer[i] != NULL) {
 	 dri_bo_unreference(i830->state.tex_buffer[i]);
diff --git a/src/mesa/drivers/dri/i915/i915_vtbl.c b/src/mesa/drivers/dri/i915/i915_vtbl.c
index edbbe23e09..7431a9cf76 100644
--- a/src/mesa/drivers/dri/i915/i915_vtbl.c
+++ b/src/mesa/drivers/dri/i915/i915_vtbl.c
@@ -490,6 +490,13 @@ i915_destroy_context(struct intel_context *intel)
    GLuint i;
    struct i915_context *i915 = i915_context(&intel->ctx);
 
+   intel_region_release(&i915->state.draw_region);
+   intel_region_release(&i915->state.depth_region);
+   intel_region_release(&i915->meta.draw_region);
+   intel_region_release(&i915->meta.depth_region);
+   intel_region_release(&i915->initial.draw_region);
+   intel_region_release(&i915->initial.depth_region);
+
    for (i = 0; i < I915_TEX_UNITS; i++) {
       if (i915->state.tex_buffer[i] != NULL) {
 	 dri_bo_unreference(i915->state.tex_buffer[i]);
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index 9a353fc7b6..39ce8eb4b6 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -409,8 +409,18 @@ void brw_draw_init( struct brw_context *brw )
 
 void brw_draw_destroy( struct brw_context *brw )
 {
+   int i;
+
    if (brw->vb.upload.bo != NULL) {
       dri_bo_unreference(brw->vb.upload.bo);
       brw->vb.upload.bo = NULL;
    }
+
+   for (i = 0; i < VERT_ATTRIB_MAX; i++) {
+      dri_bo_unreference(brw->vb.inputs[i].bo);
+      brw->vb.inputs[i].bo = NULL;
+   }
+
+   dri_bo_unreference(brw->ib.bo);
+   brw->ib.bo = NULL;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_state_cache.c b/src/mesa/drivers/dri/i965/brw_state_cache.c
index 1318dea594..d5b5166406 100644
--- a/src/mesa/drivers/dri/i965/brw_state_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_state_cache.c
@@ -497,9 +497,10 @@ void brw_destroy_cache( struct brw_context *brw )
    GLuint i;
 
    brw_clear_cache(brw);
-   for (i = 0; i < BRW_MAX_CACHE; i++)
+   for (i = 0; i < BRW_MAX_CACHE; i++) {
+      dri_bo_unreference(brw->cache.last_bo[i]);
       free(brw->cache.name[i]);
-
+   }
    free(brw->cache.items);
    brw->cache.items = NULL;
    brw->cache.size = 0;
diff --git a/src/mesa/drivers/dri/i965/brw_vtbl.c b/src/mesa/drivers/dri/i965/brw_vtbl.c
index 2a03fc59f3..3780d3dad2 100644
--- a/src/mesa/drivers/dri/i965/brw_vtbl.c
+++ b/src/mesa/drivers/dri/i965/brw_vtbl.c
@@ -51,6 +51,12 @@
 #include "brw_vs.h"
 #include <stdarg.h>
 
+static void
+dri_bo_release(dri_bo **bo)      /* drop the reference held in *bo, then clear the caller's pointer */
+{
+   dri_bo_unreference(*bo);      /* NOTE(review): called unconditionally — presumably NULL-safe; confirm */
+   *bo = NULL;                   /* caller's slot no longer points at the released bo */
+}
 
 /* called from intelDestroyContext()
  */
@@ -58,6 +64,7 @@ static void brw_destroy_context( struct intel_context *intel )
 {
    GLcontext *ctx = &intel->ctx;
    struct brw_context *brw = brw_context(&intel->ctx);
+   int i;
 
    brw_destroy_metaops(brw);
    brw_destroy_state(brw);
@@ -65,6 +72,33 @@ static void brw_destroy_context( struct intel_context *intel )
 
    brw_ProgramCacheDestroy( ctx );
    brw_FrameBufferTexDestroy( brw );
+
+   for (i = 0; i < brw->state.nr_draw_regions; i++)
+       intel_region_release(&brw->state.draw_regions[i]);
+   brw->state.nr_draw_regions = 0;
+   intel_region_release(&brw->state.depth_region);
+
+   dri_bo_release(&brw->curbe.curbe_bo);
+   dri_bo_release(&brw->vs.prog_bo);
+   dri_bo_release(&brw->vs.state_bo);
+   dri_bo_release(&brw->gs.prog_bo);
+   dri_bo_release(&brw->gs.state_bo);
+   dri_bo_release(&brw->clip.prog_bo);
+   dri_bo_release(&brw->clip.state_bo);
+   dri_bo_release(&brw->clip.vp_bo);
+   dri_bo_release(&brw->sf.prog_bo);
+   dri_bo_release(&brw->sf.state_bo);
+   dri_bo_release(&brw->sf.vp_bo);
+   for (i = 0; i < BRW_MAX_TEX_UNIT; i++)
+      dri_bo_release(&brw->wm.sdc_bo[i]);
+   dri_bo_release(&brw->wm.bind_bo);
+   for (i = 0; i < BRW_WM_MAX_SURF; i++)
+      dri_bo_release(&brw->wm.surf_bo[i]);
+   dri_bo_release(&brw->wm.prog_bo);
+   dri_bo_release(&brw->wm.state_bo);
+   dri_bo_release(&brw->cc.prog_bo);
+   dri_bo_release(&brw->cc.state_bo);
+   dri_bo_release(&brw->cc.vp_bo);
 }
 
 /* called from intelDrawBuffer()
diff --git a/src/mesa/drivers/dri/intel/intel_context.c b/src/mesa/drivers/dri/intel/intel_context.c
index 57e574447a..ccd74baa7c 100644
--- a/src/mesa/drivers/dri/intel/intel_context.c
+++ b/src/mesa/drivers/dri/intel/intel_context.c
@@ -810,7 +810,12 @@ intelDestroyContext(__DRIcontextPrivate * driContextPriv)
       intel->Fallback = 0;      /* don't call _swrast_Flush later */
 
       intel_batchbuffer_free(intel->batch);
+      intel->batch = NULL;
+
       free(intel->prim.vb);
+      intel->prim.vb = NULL;
+      dri_bo_unreference(intel->prim.vb_bo);
+      intel->prim.vb_bo = NULL;
 
       if (release_texture_heaps) {
          /* This share group is about to go away, free our private
@@ -820,6 +825,13 @@ intelDestroyContext(__DRIcontextPrivate * driContextPriv)
             fprintf(stderr, "do something to free texture heaps\n");
       }
 
+      intel_region_release(&intel->front_region);
+      intel_region_release(&intel->back_region);
+      intel_region_release(&intel->third_region);
+      intel_region_release(&intel->depth_region);
+
+      driDestroyOptionCache(&intel->optionCache);
+
       /* free the Mesa context */
       _mesa_free_context_data(&intel->ctx);
    }
diff --git a/src/mesa/drivers/dri/intel/intel_context.h b/src/mesa/drivers/dri/intel/intel_context.h
index 4af4cb9c96..554159ac44 100644
--- a/src/mesa/drivers/dri/intel/intel_context.h
+++ b/src/mesa/drivers/dri/intel/intel_context.h
@@ -226,7 +226,6 @@ struct intel_context
    GLenum reduced_primitive;
    GLuint vertex_size;
    GLubyte *verts;              /* points to tnl->clipspace.vertex_buf */
-   struct intel_region *draw_region;
 
    /* Fallback rasterization functions 
     */
diff --git a/src/mesa/drivers/dri/intel/intel_regions.c b/src/mesa/drivers/dri/intel/intel_regions.c
index cb0f4ba083..45faf64c71 100644
--- a/src/mesa/drivers/dri/intel/intel_regions.c
+++ b/src/mesa/drivers/dri/intel/intel_regions.c
@@ -478,6 +478,11 @@ intel_recreate_static(struct intel_context *intel,
    region->pitch = intelScreen->pitch;
    region->height = intelScreen->height;     /* needed? */
 
+   if (region->buffer != NULL) {
+      dri_bo_unreference(region->buffer);
+      region->buffer = NULL;
+   }
+
    if (intel->ttm) {
       assert(region_desc->bo_handle != -1);
       region->buffer = intel_bo_gem_create_from_name(intel->bufmgr,
@@ -486,6 +491,11 @@ intel_recreate_static(struct intel_context *intel,
 
       intel_set_region_tiling_gem(intel, region, region_desc->bo_handle);
    } else {
+      if (region->classic_map != NULL) {
+	 drmUnmap(region->classic_map,
+		  region->pitch * region->cpp * region->height);
+	 region->classic_map = NULL;
+      }
       ret = drmMap(intel->driFd, region_desc->handle,
 		   region->pitch * region->cpp * region->height,
 		   &region->classic_map);
-- 
cgit v1.2.3


From cce2ec2754cec6b407827717d01a5b3ad7fc5f6b Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 26 Sep 2008 11:16:44 -0600
Subject: egl: remove space after -L flag

---
 src/egl/drivers/xdri/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/egl/drivers/xdri/Makefile b/src/egl/drivers/xdri/Makefile
index afd551dea5..a721b997e6 100644
--- a/src/egl/drivers/xdri/Makefile
+++ b/src/egl/drivers/xdri/Makefile
@@ -48,7 +48,7 @@ $(TOP)/$(LIB_DIR)/$(DRIVER_NAME): $(OBJECTS)
 	$(TOP)/bin/mklib -o $(DRIVER_NAME) \
 		-noprefix \
 		-major 1 -minor 0 \
-		-L $(TOP)/$(LIB_DIR) \
+		-L$(TOP)/$(LIB_DIR) \
 		-install $(TOP)/$(LIB_DIR) \
 		$(OBJECTS) $(DRM_LIB) $(MISC_LIBS)
 
-- 
cgit v1.2.3


From 7944efffff837e5945b2493392a05b87f431cc19 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 26 Sep 2008 11:17:09 -0600
Subject: egl: check for null ptr/name

---
 src/egl/drivers/xdri/egl_xdri.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/egl/drivers/xdri/egl_xdri.c b/src/egl/drivers/xdri/egl_xdri.c
index 9ff71588df..bb38513e7d 100644
--- a/src/egl/drivers/xdri/egl_xdri.c
+++ b/src/egl/drivers/xdri/egl_xdri.c
@@ -654,7 +654,10 @@ xdri_eglInitialize(_EGLDriver *drv, EGLDisplay dpy,
 
    xdri_drv->Base.Initialized = EGL_TRUE;
 
-   snprintf(name, sizeof(name), "X/DRI:%s", xdri_drv->dri_driver_name);
+   if (xdri_drv->dri_driver_name)
+      snprintf(name, sizeof(name), "X/DRI:%s", xdri_drv->dri_driver_name);
+   else
+      snprintf(name, sizeof(name), "X/DRI");
    xdri_drv->Base.Name = name;
 
    /* we're supporting EGL 1.4 */
-- 
cgit v1.2.3


From 8fd329d04885eba7587bbe7604d3a1088e35de40 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 26 Sep 2008 11:18:06 -0600
Subject: mesa: fix temp register allocation problems.

Complex texcombine modes were running out of registers (>32 registers for 8 tex units).
---
 src/mesa/main/texenvprogram.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/mesa/main/texenvprogram.c b/src/mesa/main/texenvprogram.c
index c64d88faf9..97aa87e58c 100644
--- a/src/mesa/main/texenvprogram.c
+++ b/src/mesa/main/texenvprogram.c
@@ -398,6 +398,14 @@ static struct ureg get_tex_temp( struct texenv_fragment_program *p )
 }
 
 
+/** Mark temp reg 'r' as no longer allocatable by setting bit r.idx in p->temps_output; non-temporary registers are ignored. */
+static void reserve_temp( struct texenv_fragment_program *p, struct ureg r )
+{
+   if (r.file == PROGRAM_TEMPORARY)
+      p->temps_output |= (1 << r.idx);   /* NOTE(review): assumes r.idx is small enough for an int shift — confirm */
+}
+
+
 static void release_temps(GLcontext *ctx, struct texenv_fragment_program *p )
 {
    GLuint max_temp = ctx->Const.FragmentProgram.MaxTemps;
@@ -491,10 +499,12 @@ emit_op(struct texenv_fragment_program *p,
 
    emit_dst( &inst->DstReg, dest, mask );
 
+#if 0
    /* Accounting for indirection tracking:
     */
    if (dest.file == PROGRAM_TEMPORARY)
       p->temps_output |= 1 << dest.idx;
+#endif
 
    return inst;
 }
@@ -549,6 +559,10 @@ static struct ureg emit_texld( struct texenv_fragment_program *p,
 
    p->program->Base.NumTexInstructions++;
 
+   /* Accounting for indirection tracking:
+    */
+   reserve_temp(p, dest);
+
    /* Is this a texture indirection?
     */
    if ((coord.file == PROGRAM_TEMPORARY &&
@@ -1062,6 +1076,7 @@ create_new_program(GLcontext *ctx, struct state_key *key,
       for (unit = 0 ; unit < ctx->Const.MaxTextureUnits; unit++)
 	 if (key->enabled_units & (1<<unit)) {
 	    p.src_previous = emit_texenv( &p, unit );
+            reserve_temp(&p, p.src_previous); /* don't re-use this temp reg */
 	    release_temps(ctx, &p);	/* release all temps */
 	 }
    }
-- 
cgit v1.2.3


From a23026effca921a77fbaa0a7effdc2826212b11e Mon Sep 17 00:00:00 2001
From: Tobias Jakobi <liquid.acid@gmx.net>
Date: Tue, 23 Sep 2008 21:38:19 -0500
Subject: glapi: add gl_dispatch_functions_start and end

---
 src/mesa/glapi/glapi.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/mesa/glapi/glapi.c b/src/mesa/glapi/glapi.c
index 09aaea3d93..c92b096f68 100644
--- a/src/mesa/glapi/glapi.c
+++ b/src/mesa/glapi/glapi.c
@@ -290,6 +290,16 @@ _glapi_get_context(void)
 #endif
 }
 
+#ifdef USE_X86_ASM
+
+#if defined( GLX_USE_TLS )
+extern       GLubyte gl_dispatch_functions_start[];   /* non-const in the TLS build; presumably provided by the x86 asm dispatch stubs — confirm */
+extern       GLubyte gl_dispatch_functions_end[];     /* end marker is declared only for the TLS build */
+#else
+extern const GLubyte gl_dispatch_functions_start[];   /* const variant for non-TLS builds */
+#endif
+
+#endif /* USE_X86_ASM */
 
 
 #if defined(PTHREADS) || defined(GLX_USE_TLS)
-- 
cgit v1.2.3


From 2e5d717007ba6515b094b9af8ed869130185a308 Mon Sep 17 00:00:00 2001
From: Tobias Jakobi <liquid.acid@gmx.net>
Date: Sat, 27 Sep 2008 08:51:45 +0100
Subject: glapi: add DISPATCH_FUNCTION_SIZE

---
 src/mesa/glapi/glapi.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/mesa/glapi/glapi.c b/src/mesa/glapi/glapi.c
index c92b096f68..c3ebf60719 100644
--- a/src/mesa/glapi/glapi.c
+++ b/src/mesa/glapi/glapi.c
@@ -302,6 +302,20 @@ extern const GLubyte gl_dispatch_functions_start[];
 #endif /* USE_X86_ASM */
 
 
+#if defined(USE_X64_64_ASM) && defined(GLX_USE_TLS)   /* NOTE(review): "X64_64" looks like a typo for USE_X86_64_ASM; fixing it would enable this branch, so confirm the x86-64 build supports it first */
+# define DISPATCH_FUNCTION_SIZE  16
+#elif defined(USE_X86_ASM)
+# if defined(THREADS) && !defined(GLX_USE_TLS)
+#  define DISPATCH_FUNCTION_SIZE  32   /* x86, threaded, non-TLS build */
+# else
+#  define DISPATCH_FUNCTION_SIZE  16   /* every other x86 build */
+# endif
+#endif
+
+#if !defined(DISPATCH_FUNCTION_SIZE) && !defined(XFree86Server) && !defined(XGLServer)
+# define NEED_FUNCTION_POINTER         /* set when no DISPATCH_FUNCTION_SIZE was selected above and this is not an XFree86/Xgl server build */
+#endif
+
 #if defined(PTHREADS) || defined(GLX_USE_TLS)
 /**
  * Perform platform-specific GL API entry-point fixups.
-- 
cgit v1.2.3


From ab74b8e3549838c0c480555134f5451949bac59f Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Sun, 28 Sep 2008 18:33:23 +0200
Subject: Gallivm: make it compile again, add some opcodes.

---
 src/gallium/auxiliary/draw/draw_vs_llvm.c      |    1 +
 src/gallium/auxiliary/gallivm/gallivm_cpu.cpp  |    1 +
 src/gallium/auxiliary/gallivm/instructions.cpp | 1210 ++++++++++++++----------
 src/gallium/auxiliary/gallivm/instructions.h   |   26 +-
 src/gallium/auxiliary/gallivm/tgsitollvm.cpp   |   60 +-
 5 files changed, 792 insertions(+), 506 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_vs_llvm.c b/src/gallium/auxiliary/draw/draw_vs_llvm.c
index 2ce30b9a02..727977bc3a 100644
--- a/src/gallium/auxiliary/draw/draw_vs_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_vs_llvm.c
@@ -32,6 +32,7 @@
   *   Brian Paul
   */
 
+#include "util/u_memory.h"
 #include "pipe/p_shader_tokens.h"
 #include "draw_private.h"
 #include "draw_context.h"
diff --git a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
index e64bfb1c6c..3a4a41e544 100644
--- a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
+++ b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
@@ -46,6 +46,7 @@
 #include "tgsi/tgsi_dump.h"
 
 #include "util/u_memory.h"
+#include "util/u_math.h"
 
 #include <llvm/Module.h>
 #include <llvm/CallingConv.h>
diff --git a/src/gallium/auxiliary/gallivm/instructions.cpp b/src/gallium/auxiliary/gallivm/instructions.cpp
index a82dc30306..5fdfe09d18 100644
--- a/src/gallium/auxiliary/gallivm/instructions.cpp
+++ b/src/gallium/auxiliary/gallivm/instructions.cpp
@@ -83,6 +83,7 @@ Instructions::Instructions(llvm::Module *mod, llvm::Function *func, llvm::BasicB
    m_llvmPow   = 0;
    m_llvmFloor = 0;
    m_llvmFlog  = 0;
+   m_llvmFexp  = 0;
    m_llvmLit  = 0;
    m_fmtPtr = 0;
 
@@ -92,194 +93,247 @@ Instructions::Instructions(llvm::Module *mod, llvm::Function *func, llvm::BasicB
    m_mod = ParseBitcodeFile(buffer);
 }
 
-llvm::Value * Instructions::add(llvm::Value *in1, llvm::Value *in2)
+llvm::BasicBlock * Instructions::currentBlock() const
 {
-   return m_builder.CreateAdd(in1, in2, name("add"));
+   return m_builder.GetInsertBlock();
 }
 
-llvm::Value * Instructions::madd(llvm::Value *in1, llvm::Value *in2,
-                                 llvm::Value *in3)
+llvm::Value * Instructions::abs(llvm::Value *in)
 {
-   Value *mulRes = mul(in1, in2);
-   return add(mulRes, in3);
+   std::vector<llvm::Value*> vec = extractVector(in);
+   Value *xabs  = callFAbs(vec[0]);
+   Value *yabs  = callFAbs(vec[1]);
+   Value *zabs  = callFAbs(vec[2]);
+   Value *wabs  = callFAbs(vec[3]);
+   return vectorFromVals(xabs, yabs, zabs, wabs);
 }
- 
-llvm::Value * Instructions::mul(llvm::Value *in1, llvm::Value *in2)
+
+llvm::Value * Instructions::add(llvm::Value *in1, llvm::Value *in2)
 {
-   return m_builder.CreateMul(in1, in2, name("mul"));
+   return m_builder.CreateAdd(in1, in2, name("add"));
 }
 
-const char * Instructions::name(const char *prefix)
+llvm::Value * Instructions::arl(llvm::Value *in)
 {
-   ++m_idx;
-   snprintf(m_name, 32, "%s%d", prefix, m_idx);
-   return m_name;
+   return floor(in);
 }
 
-llvm::Value * Instructions::dp3(llvm::Value *in1, llvm::Value *in2)
+void Instructions::beginLoop()
 {
-   Value *mulRes = mul(in1, in2);
-   Value *x = m_builder.CreateExtractElement(mulRes,
-                                                          m_storage->constantInt(0),
-                                                          name("extractx"));
-   Value *y = m_builder.CreateExtractElement(mulRes,
-                                                          m_storage->constantInt(1),
-                                                          name("extracty"));
-   Value *z = m_builder.CreateExtractElement(mulRes,
-                                                          m_storage->constantInt(2),
-                                                          name("extractz"));
-   Value *xy = m_builder.CreateAdd(x, y,name("xy"));
-   Value *dot3 = m_builder.CreateAdd(xy, z, name("dot3"));
-   return vectorFromVals(dot3, dot3, dot3, dot3);
+   BasicBlock *begin = BasicBlock::Create(name("loop"), m_func,0);
+   BasicBlock *end = BasicBlock::Create(name("endloop"), m_func,0);
+
+   m_builder.CreateBr(begin);
+   Loop loop;
+   loop.begin = begin;
+   loop.end   = end;
+   m_builder.SetInsertPoint(begin);
+   m_loopStack.push(loop);
 }
 
-llvm::Value *Instructions::callFSqrt(llvm::Value *val)
+void Instructions::bgnSub(unsigned label)
 {
-   if (!m_llvmFSqrt) {
-      // predeclare the intrinsic
-      std::vector<const Type*> fsqrtArgs;
-      fsqrtArgs.push_back(Type::FloatTy);
-      PAListPtr fsqrtPal;
-      FunctionType* fsqrtType = FunctionType::get(
-         /*Result=*/Type::FloatTy,
-         /*Params=*/fsqrtArgs,
-         /*isVarArg=*/false);
-      m_llvmFSqrt = Function::Create(
-         /*Type=*/fsqrtType,
-         /*Linkage=*/GlobalValue::ExternalLinkage,
-         /*Name=*/"llvm.sqrt.f32", m_mod);
-      m_llvmFSqrt->setCallingConv(CallingConv::C);
-      m_llvmFSqrt->setParamAttrs(fsqrtPal);
-   }
-   CallInst *call = m_builder.CreateCall(m_llvmFSqrt, val,
-                                         name("sqrt"));
-   call->setCallingConv(CallingConv::C);
-   call->setTailCall(false);
-   return call;
+   llvm::Function *func = findFunction(label);
+
+   Function::arg_iterator args = func->arg_begin();
+   Value *ptr_INPUT = args++;
+   ptr_INPUT->setName("INPUT");
+   m_storage->pushArguments(ptr_INPUT);
+
+   llvm::BasicBlock *entry = BasicBlock::Create("entry", func, 0);
+
+   m_func = func;
+   m_builder.SetInsertPoint(entry);
 }
 
-llvm::Value * Instructions::rsq(llvm::Value *in1)
+void Instructions::brk()
 {
-   Value *x = m_builder.CreateExtractElement(in1,
-                                             m_storage->constantInt(0),
-                                             name("extractx"));
-   Value *abs  = callFAbs(x);
-   Value *sqrt = callFSqrt(abs);
+   assert(!m_loopStack.empty());
+   BasicBlock *unr = BasicBlock::Create(name("unreachable"), m_func,0);
+   m_builder.CreateBr(m_loopStack.top().end);
+   m_builder.SetInsertPoint(unr);
+}
 
-   Value *rsqrt = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)),
-                                       sqrt,
-                                       name("rsqrt"));
-   return vectorFromVals(rsqrt, rsqrt, rsqrt, rsqrt);
+void Instructions::cal(int label, llvm::Value *input)
+{
+   std::vector<Value*> params;
+   params.push_back(input);
+   llvm::Function *func = findFunction(label);
+
+   m_builder.CreateCall(func, params.begin(), params.end());
 }
 
-llvm::Value * Instructions::vectorFromVals(llvm::Value *x, llvm::Value *y,
-                                           llvm::Value *z, llvm::Value *w)
+llvm::Value * Instructions::clamp(llvm::Value *in1)
 {
-   Constant *const_vec = Constant::getNullValue(m_floatVecType);
-   Value *res = m_builder.CreateInsertElement(const_vec, x,
-                                              m_storage->constantInt(0),
-                                              name("vecx"));
-   res = m_builder.CreateInsertElement(res, y, m_storage->constantInt(1),
-                               name("vecxy"));
-   res = m_builder.CreateInsertElement(res, z, m_storage->constantInt(2),
-                               name("vecxyz"));
-   if (w)
-      res = m_builder.CreateInsertElement(res, w, m_storage->constantInt(3),
-                                          name("vecxyzw"));
-   return res;
+   return in1; // FIXME: clamping is not implemented; pass the input through so the function never falls off its end (UB)
 }
 
-llvm::Value *Instructions::callFAbs(llvm::Value *val)
+llvm::Value * Instructions::cmp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
 {
-   if (!m_llvmFAbs) {
-      // predeclare the intrinsic
-      std::vector<const Type*> fabsArgs;
-      fabsArgs.push_back(Type::FloatTy);
-      PAListPtr fabsPal;
-      FunctionType* fabsType = FunctionType::get(
-         /*Result=*/Type::FloatTy,
-         /*Params=*/fabsArgs,
-         /*isVarArg=*/false);
-      m_llvmFAbs = Function::Create(
-         /*Type=*/fabsType,
-         /*Linkage=*/GlobalValue::ExternalLinkage,
-         /*Name=*/"fabs", m_mod);
-      m_llvmFAbs->setCallingConv(CallingConv::C);
-      m_llvmFAbs->setParamAttrs(fabsPal);
-   }
-   CallInst *call = m_builder.CreateCall(m_llvmFAbs, val,
-                                         name("fabs"));
-   call->setCallingConv(CallingConv::C);
+   llvm::Function *func = m_mod->getFunction("cmp");
+   assert(func);
+
+   std::vector<Value*> params;
+   params.push_back(in1);
+   params.push_back(in2);
+   params.push_back(in3);
+   CallInst *call = m_builder.CreateCall(func, params.begin(), params.end(), name("cmpres"));
    call->setTailCall(false);
    return call;
 }
 
-llvm::Value * Instructions::lit(llvm::Value *in)
+llvm::Value * Instructions::cnd(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
 {
-   if (!m_llvmLit) {
-      m_llvmLit = m_mod->getFunction("lit");
-   }
-   CallInst *call = m_builder.CreateCall(m_llvmLit, in, name("litres"));
-   call->setCallingConv(CallingConv::C);
-   call->setTailCall(false);
-   return call;
+   std::vector<llvm::Value*> vec1 = extractVector(in1);
+   std::vector<llvm::Value*> vec2 = extractVector(in2);
+   std::vector<llvm::Value*> vec3 = extractVector(in3);
+   Constant *half = ConstantFP::get(APFloat(0.5f));
+
+   Value *xcmp  = m_builder.CreateFCmpOGT(vec1[0], half, name("xcmp"));
+   Value *selx = m_builder.CreateSelect(xcmp, vec2[0], vec3[0],
+                                        name("selx"));
+
+   Value *ycmp  = m_builder.CreateFCmpOGT(vec1[1], half, name("ycmp"));
+   Value *sely = m_builder.CreateSelect(ycmp, vec2[1], vec3[1],
+                                        name("sely"));
+
+   Value *zcmp  = m_builder.CreateFCmpOGT(vec1[2], half, name("zcmp"));
+   Value *selz = m_builder.CreateSelect(zcmp, vec2[2], vec3[2],
+                                        name("selz"));
+
+   Value *wcmp  = m_builder.CreateFCmpOGT(vec1[3], half, name("wcmp"));
+   Value *selw = m_builder.CreateSelect(wcmp, vec2[3], vec3[3],
+                                        name("selw"));
+
+   return vectorFromVals(selx, sely, selz, selw);
 }
 
-llvm::Value * Instructions::sub(llvm::Value *in1, llvm::Value *in2)
+llvm::Value * Instructions::cnd0(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
 {
-   Value *res = m_builder.CreateSub(in1, in2, name("sub"));
-   return res;
+   std::vector<llvm::Value*> vec1 = extractVector(in1);
+   std::vector<llvm::Value*> vec2 = extractVector(in2);
+   std::vector<llvm::Value*> vec3 = extractVector(in3);
+   Constant *zero = Constant::getNullValue(Type::FloatTy);
+
+   Value *xcmp  = m_builder.CreateFCmpOGE(vec1[0], zero, name("xcmp"));
+   Value *selx = m_builder.CreateSelect(xcmp, vec2[0], vec3[0],
+                                        name("selx"));
+
+   Value *ycmp  = m_builder.CreateFCmpOGE(vec1[1], zero, name("ycmp"));
+   Value *sely = m_builder.CreateSelect(ycmp, vec2[1], vec3[1],
+                                        name("sely"));
+
+   Value *zcmp  = m_builder.CreateFCmpOGE(vec1[2], zero, name("zcmp"));
+   Value *selz = m_builder.CreateSelect(zcmp, vec2[2], vec3[2],
+                                        name("selz"));
+
+   Value *wcmp  = m_builder.CreateFCmpOGE(vec1[3], zero, name("wcmp"));
+   Value *selw = m_builder.CreateSelect(wcmp, vec2[3], vec3[3],
+                                        name("selw"));
+
+   return vectorFromVals(selx, sely, selz, selw);
 }
 
-llvm::Value * Instructions::callPow(llvm::Value *val1, llvm::Value *val2)
+llvm::Value * Instructions::cos(llvm::Value *in)
 {
-   if (!m_llvmPow) {
-      // predeclare the intrinsic
-      std::vector<const Type*> powArgs;
-      powArgs.push_back(Type::FloatTy);
-      powArgs.push_back(Type::FloatTy);
-      PAListPtr powPal;
-      FunctionType* powType = FunctionType::get(
-         /*Result=*/Type::FloatTy,
-         /*Params=*/powArgs,
-         /*isVarArg=*/false);
-      m_llvmPow = Function::Create(
-         /*Type=*/powType,
-         /*Linkage=*/GlobalValue::ExternalLinkage,
-         /*Name=*/"llvm.pow.f32", m_mod);
-      m_llvmPow->setCallingConv(CallingConv::C);
-      m_llvmPow->setParamAttrs(powPal);
-   }
-   std::vector<Value*> params;
-   params.push_back(val1);
-   params.push_back(val2);
-   CallInst *call = m_builder.CreateCall(m_llvmPow, params.begin(), params.end(),
-                                         name("pow"));
-   call->setCallingConv(CallingConv::C);
+#if 0
+   llvm::Function *func = m_mod->getFunction("vcos");
+   assert(func);
+
+   CallInst *call = m_builder.CreateCall(func, in, name("cosres"));
    call->setTailCall(false);
    return call;
+#else
+   std::vector<llvm::Value*> elems = extractVector(in);
+   Function *func = m_mod->getFunction("cosf");
+   assert(func);
+   CallInst *cos = m_builder.CreateCall(func, elems[0], name("cosres"));
+   cos->setCallingConv(CallingConv::C);
+   cos->setTailCall(true);
+   return vectorFromVals(cos, cos, cos, cos);
+#endif
 }
 
-llvm::Value * Instructions::pow(llvm::Value *in1, llvm::Value *in2)
+llvm::Value * Instructions::cross(llvm::Value *in1, llvm::Value *in2)
 {
    Value *x1 = m_builder.CreateExtractElement(in1,
                                               m_storage->constantInt(0),
                                               name("x1"));
+   Value *y1 = m_builder.CreateExtractElement(in1,
+                                              m_storage->constantInt(1),
+                                              name("y1"));
+   Value *z1 = m_builder.CreateExtractElement(in1,
+                                              m_storage->constantInt(2),
+                                              name("z1"));
+
    Value *x2 = m_builder.CreateExtractElement(in2,
                                               m_storage->constantInt(0),
                                               name("x2"));
-   llvm::Value *val = callPow(x1, x2);
-   return vectorFromVals(val, val, val, val);
+   Value *y2 = m_builder.CreateExtractElement(in2,
+                                              m_storage->constantInt(1),
+                                              name("y2"));
+   Value *z2 = m_builder.CreateExtractElement(in2,
+                                              m_storage->constantInt(2),
+                                              name("z2"));
+   Value *y1z2 = mul(y1, z2);
+   Value *z1y2 = mul(z1, y2);
+
+   Value *z1x2 = mul(z1, x2);
+   Value *x1z2 = mul(x1, z2);
+
+   Value *x1y2 = mul(x1, y2);
+   Value *y1x2 = mul(y1, x2);
+
+   return vectorFromVals(sub(y1z2, z1y2), sub(z1x2, x1z2), sub(x1y2, y1x2));
 }
 
-llvm::Value * Instructions::rcp(llvm::Value *in1)
+llvm::Value * Instructions::ddx(llvm::Value *in)
 {
-   Value *x1 = m_builder.CreateExtractElement(in1,
-                                              m_storage->constantInt(0),
-                                              name("x1"));
-   Value *res = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)),
-                                     x1, name("rcp"));
-   return vectorFromVals(res, res, res, res);
+   return constVector(0.f, 0.f, 0.f, 0.f); // FIXME: derivative not implemented; zero vector placeholder instead of falling off the end (UB)
+}
+
+llvm::Value * Instructions::ddy(llvm::Value *in)
+{
+   return constVector(0.f, 0.f, 0.f, 0.f); // FIXME: derivative not implemented; zero vector placeholder instead of falling off the end (UB)
+}
+
+llvm::Value * Instructions::div(llvm::Value *in1, llvm::Value *in2)
+{
+   return m_builder.CreateFDiv(in1, in2, name("div"));
+}
+
+llvm::Value * Instructions::dot2add(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
+{
+   // Computes in1.x*in2.x + in1.y*in2.y + in3.x and replicates the sum into
+   // all four channels of the returned vector.
+   Value *mulRes = mul(in1, in2);
+   Value *x = m_builder.CreateExtractElement(mulRes, m_storage->constantInt(0),
+                                             name("extractx"));
+   Value *y = m_builder.CreateExtractElement(mulRes, m_storage->constantInt(1),
+                                             name("extracty"));
+   // The addend is channel 0 (x) of in3, not channel 2 (z) as in dp3.
+   Value *addend = m_builder.CreateExtractElement(in3, m_storage->constantInt(0),
+                                                  name("addend"));
+   Value *xy = m_builder.CreateAdd(x, y,name("xy"));
+   Value *dot2add = m_builder.CreateAdd(xy, addend, name("dot2add"));
+   return vectorFromVals(dot2add, dot2add, dot2add, dot2add);
+}
+
+llvm::Value * Instructions::dp3(llvm::Value *in1, llvm::Value *in2)
+{
+   Value *mulRes = mul(in1, in2);
+   Value *x = m_builder.CreateExtractElement(mulRes,
+                                                          m_storage->constantInt(0),
+                                                          name("extractx"));
+   Value *y = m_builder.CreateExtractElement(mulRes,
+                                                          m_storage->constantInt(1),
+                                                          name("extracty"));
+   Value *z = m_builder.CreateExtractElement(mulRes,
+                                                          m_storage->constantInt(2),
+                                                          name("extractz"));
+   Value *xy = m_builder.CreateAdd(x, y,name("xy"));
+   Value *dot3 = m_builder.CreateAdd(xy, z, name("dot3"));
+   return vectorFromVals(dot3, dot3, dot3, dot3);
 }
 
 llvm::Value * Instructions::dp4(llvm::Value *in1, llvm::Value *in2)
@@ -302,23 +356,70 @@ llvm::Value * Instructions::dph(llvm::Value *in1, llvm::Value *in2)
    return vectorFromVals(dph, dph, dph, dph);
 }
 
-llvm::Value * Instructions::dst(llvm::Value *in1, llvm::Value *in2)
+llvm::Value * Instructions::dst(llvm::Value *in1, llvm::Value *in2)
+{
+   Value *y1 = m_builder.CreateExtractElement(in1,
+                                              m_storage->constantInt(1),
+                                              name("y1"));
+   Value *z = m_builder.CreateExtractElement(in1,
+                                             m_storage->constantInt(2),
+                                             name("z"));
+   Value *y2 = m_builder.CreateExtractElement(in2,
+                                              m_storage->constantInt(1),
+                                              name("y2"));
+   Value *w = m_builder.CreateExtractElement(in2,
+                                             m_storage->constantInt(3),
+                                             name("w"));
+   Value *ry = m_builder.CreateMul(y1, y2, name("tyuy"));
+   return vectorFromVals(ConstantFP::get(APFloat(1.f)),
+                         ry, z, w);
+}
+
+void Instructions::elseop()
+{
+   assert(!m_ifStack.empty());
+   BasicBlock *ifend = BasicBlock::Create(name("ifend"), m_func,0);
+   m_builder.CreateBr(ifend);
+   m_builder.SetInsertPoint(m_ifStack.top());
+   currentBlock()->setName(name("ifelse"));
+   m_ifStack.pop();
+   m_ifStack.push(ifend);
+}
+
+void Instructions::endif()
+{
+   assert(!m_ifStack.empty());
+   m_builder.CreateBr(m_ifStack.top());
+   m_builder.SetInsertPoint(m_ifStack.top());
+   m_ifStack.pop();
+}
+
+void Instructions::endLoop()
+{
+   assert(!m_loopStack.empty());
+   Loop loop = m_loopStack.top();
+   m_builder.CreateBr(loop.begin);
+   loop.end->moveAfter(currentBlock());
+   m_builder.SetInsertPoint(loop.end);
+   m_loopStack.pop();
+}
+
+void Instructions::end()
+{
+   m_builder.CreateRetVoid();
+}
+
+void Instructions::endSub()
+{
+   m_func = 0;
+   m_builder.SetInsertPoint(0);
+}
+
+llvm::Value * Instructions::exp(llvm::Value *in)
 {
-   Value *y1 = m_builder.CreateExtractElement(in1,
-                                              m_storage->constantInt(1),
-                                              name("y1"));
-   Value *z = m_builder.CreateExtractElement(in1,
-                                             m_storage->constantInt(2),
-                                             name("z"));
-   Value *y2 = m_builder.CreateExtractElement(in2,
-                                              m_storage->constantInt(1),
-                                              name("y2"));
-   Value *w = m_builder.CreateExtractElement(in2,
-                                             m_storage->constantInt(3),
-                                             name("w"));
-   Value *ry = m_builder.CreateMul(y1, y2, name("tyuy"));
-   return vectorFromVals(ConstantFP::get(APFloat(1.f)),
-                         ry, z, w);
+   std::vector<llvm::Value*> vec = extractVector(in);
+   return vectorFromVals(callFExp(vec[0]), callFExp(vec[1]),
+                             callFExp(vec[2]), callFExp(vec[3]));
 }
 
 llvm::Value * Instructions::ex2(llvm::Value *in)
@@ -330,31 +431,6 @@ llvm::Value * Instructions::ex2(llvm::Value *in)
    return vectorFromVals(val, val, val, val);
 }
 
-llvm::Value * Instructions::callFloor(llvm::Value *val)
-{
-   if (!m_llvmFloor) {
-      // predeclare the intrinsic
-      std::vector<const Type*> floorArgs;
-      floorArgs.push_back(Type::FloatTy);
-      PAListPtr floorPal;
-      FunctionType* floorType = FunctionType::get(
-         /*Result=*/Type::FloatTy,
-         /*Params=*/floorArgs,
-         /*isVarArg=*/false);
-      m_llvmFloor = Function::Create(
-         /*Type=*/floorType,
-         /*Linkage=*/GlobalValue::ExternalLinkage,
-         /*Name=*/"floorf", m_mod);
-      m_llvmFloor->setCallingConv(CallingConv::C);
-      m_llvmFloor->setParamAttrs(floorPal);
-   }
-   CallInst *call =  m_builder.CreateCall(m_llvmFloor, val,
-                                          name("floorf"));
-   call->setCallingConv(CallingConv::C);
-   call->setTailCall(false);
-   return call;
-}
-
 llvm::Value * Instructions::floor(llvm::Value *in)
 {
    std::vector<llvm::Value*> vec = extractVector(in);
@@ -362,42 +438,52 @@ llvm::Value * Instructions::floor(llvm::Value *in)
                          callFloor(vec[2]), callFloor(vec[3]));
 }
 
-llvm::Value * Instructions::arl(llvm::Value *in)
-{
-   return floor(in);
-}
-
 llvm::Value * Instructions::frc(llvm::Value *in)
 {
    llvm::Value *flr = floor(in);
    return sub(in, flr);
 }
 
-llvm::Value * Instructions::callFLog(llvm::Value *val)
+void Instructions::ifop(llvm::Value *in)
 {
-   if (!m_llvmFlog) {
-      // predeclare the intrinsic
-      std::vector<const Type*> flogArgs;
-      flogArgs.push_back(Type::FloatTy);
-      PAListPtr flogPal;
-      FunctionType* flogType = FunctionType::get(
-         /*Result=*/Type::FloatTy,
-         /*Params=*/flogArgs,
-         /*isVarArg=*/false);
-      m_llvmFlog = Function::Create(
-         /*Type=*/flogType,
-         /*Linkage=*/GlobalValue::ExternalLinkage,
-         /*Name=*/"logf", m_mod);
-      m_llvmFlog->setCallingConv(CallingConv::C);
-      m_llvmFlog->setParamAttrs(flogPal);
-   }
-   CallInst *call = m_builder.CreateCall(m_llvmFlog, val,
-                                         name("logf"));
-   call->setCallingConv(CallingConv::C);
+   BasicBlock *ifthen = BasicBlock::Create(name("ifthen"), m_func,0);
+   BasicBlock *ifend = BasicBlock::Create(name("ifthenend"), m_func,0);
+
+   //BasicBlock *yblock = new BasicBlock(name("yblock"), m_func,0);
+   //BasicBlock *zblock = new BasicBlock(name("zblock"), m_func,0);
+   //BasicBlock *wblock = new BasicBlock(name("wblock"), m_func,0);
+
+   Constant *float0 = Constant::getNullValue(Type::FloatTy);
+
+   Value *x = m_builder.CreateExtractElement(in, m_storage->constantInt(0),
+                                             name("extractx"));
+   Value *xcmp = m_builder.CreateFCmpUNE(x, float0, name("xcmp"));
+   m_builder.CreateCondBr(xcmp, ifthen, ifend);
+   //m_builder.SetInsertPoint(yblock);
+
+   m_builder.SetInsertPoint(ifthen);
+   m_ifStack.push(ifend);
+}
+
+llvm::Value * Instructions::kil(llvm::Value *in)
+{
+   llvm::Function *func = m_mod->getFunction("kil");
+   assert(func);
+
+   CallInst *call = m_builder.CreateCall(func, in, name("kilpres"));
    call->setTailCall(false);
    return call;
 }
 
+llvm::Value * Instructions::lerp(llvm::Value *in1, llvm::Value *in2,
+                                 llvm::Value *in3)
+{
+   llvm::Value *m = mul(in1, in2);
+   llvm::Value *vec1 = constVector(1.f, 1.f, 1.f, 1.f);
+   llvm::Value *s = sub(vec1, in1);
+   return add(m, mul(s, in3));
+}
+
 llvm::Value * Instructions::lg2(llvm::Value *in)
 {
    std::vector<llvm::Value*> vec = extractVector(in);
@@ -407,120 +493,192 @@ llvm::Value * Instructions::lg2(llvm::Value *in)
                              callFLog(vec[2]), callFLog(vec[3])), const_vec);
 }
 
-llvm::Value * Instructions::min(llvm::Value *in1, llvm::Value *in2)
+llvm::Value * Instructions::lit(llvm::Value *in)
+{
+   if (!m_llvmLit) {
+      m_llvmLit = m_mod->getFunction("lit");
+   }
+   CallInst *call = m_builder.CreateCall(m_llvmLit, in, name("litres"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
+}
+
+llvm::Value * Instructions::log(llvm::Value *in)
+{
+   std::vector<llvm::Value*> vec = extractVector(in);
+   return vectorFromVals(callFLog(vec[0]), callFLog(vec[1]),
+                             callFLog(vec[2]), callFLog(vec[3]));
+}
+
+llvm::Value * Instructions::madd(llvm::Value *in1, llvm::Value *in2,
+                                 llvm::Value *in3)
+{
+   Value *mulRes = mul(in1, in2);
+   return add(mulRes, in3);
+}
+
+llvm::Value * Instructions::max(llvm::Value *in1, llvm::Value *in2)
 {
    std::vector<llvm::Value*> vec1 = extractVector(in1);
    std::vector<llvm::Value*> vec2 = extractVector(in2);
 
-   Value *xcmp  = m_builder.CreateFCmpOLT(vec1[0], vec2[0], name("xcmp"));
+   Value *xcmp  = m_builder.CreateFCmpOGT(vec1[0], vec2[0],
+                                          name("xcmp"));
    Value *selx = m_builder.CreateSelect(xcmp, vec1[0], vec2[0],
                                         name("selx"));
 
-   Value *ycmp  = m_builder.CreateFCmpOLT(vec1[1], vec2[1], name("ycmp"));
+   Value *ycmp  = m_builder.CreateFCmpOGT(vec1[1], vec2[1],
+                                          name("ycmp"));
    Value *sely = m_builder.CreateSelect(ycmp, vec1[1], vec2[1],
                                         name("sely"));
 
-   Value *zcmp  = m_builder.CreateFCmpOLT(vec1[2], vec2[2], name("zcmp"));
+   Value *zcmp  = m_builder.CreateFCmpOGT(vec1[2], vec2[2],
+                                          name("zcmp"));
    Value *selz = m_builder.CreateSelect(zcmp, vec1[2], vec2[2],
                                         name("selz"));
 
-   Value *wcmp  = m_builder.CreateFCmpOLT(vec1[3], vec2[3], name("wcmp"));
+   Value *wcmp  = m_builder.CreateFCmpOGT(vec1[3], vec2[3],
+                                          name("wcmp"));
    Value *selw = m_builder.CreateSelect(wcmp, vec1[3], vec2[3],
                                         name("selw"));
 
    return vectorFromVals(selx, sely, selz, selw);
 }
 
-llvm::Value * Instructions::max(llvm::Value *in1, llvm::Value *in2)
+llvm::Value * Instructions::min(llvm::Value *in1, llvm::Value *in2)
 {
    std::vector<llvm::Value*> vec1 = extractVector(in1);
    std::vector<llvm::Value*> vec2 = extractVector(in2);
 
-   Value *xcmp  = m_builder.CreateFCmpOGT(vec1[0], vec2[0],
-                                          name("xcmp"));
+   Value *xcmp  = m_builder.CreateFCmpOLT(vec1[0], vec2[0], name("xcmp"));
    Value *selx = m_builder.CreateSelect(xcmp, vec1[0], vec2[0],
                                         name("selx"));
 
-   Value *ycmp  = m_builder.CreateFCmpOGT(vec1[1], vec2[1],
-                                          name("ycmp"));
+   Value *ycmp  = m_builder.CreateFCmpOLT(vec1[1], vec2[1], name("ycmp"));
    Value *sely = m_builder.CreateSelect(ycmp, vec1[1], vec2[1],
                                         name("sely"));
 
-   Value *zcmp  = m_builder.CreateFCmpOGT(vec1[2], vec2[2],
-                                          name("zcmp"));
+   Value *zcmp  = m_builder.CreateFCmpOLT(vec1[2], vec2[2], name("zcmp"));
    Value *selz = m_builder.CreateSelect(zcmp, vec1[2], vec2[2],
                                         name("selz"));
 
-   Value *wcmp  = m_builder.CreateFCmpOGT(vec1[3], vec2[3],
-                                          name("wcmp"));
+   Value *wcmp  = m_builder.CreateFCmpOLT(vec1[3], vec2[3], name("wcmp"));
    Value *selw = m_builder.CreateSelect(wcmp, vec1[3], vec2[3],
                                         name("selw"));
 
    return vectorFromVals(selx, sely, selz, selw);
 }
 
-void Instructions::printVector(llvm::Value *val)
+llvm::Value * Instructions::mul(llvm::Value *in1, llvm::Value *in2)
 {
-   static const char *frmt = "Vector is [%f, %f, %f, %f]\x0A";
+   return m_builder.CreateMul(in1, in2, name("mul"));
+}
 
-   if (!m_fmtPtr) {
-      Constant *format = ConstantArray::get(frmt, true);
-      ArrayType *arrayTy = ArrayType::get(IntegerType::get(8), strlen(frmt) + 1);
-      GlobalVariable* globalFormat = new GlobalVariable(
-         /*Type=*/arrayTy,
-         /*isConstant=*/true,
-         /*Linkage=*/GlobalValue::InternalLinkage,
-         /*Initializer=*/0, // has initializer, specified below
-         /*Name=*/name(".str"),
-         m_mod);
-      globalFormat->setInitializer(format);
+llvm::Value * Instructions::neg(llvm::Value *in)
+{
+   Value *neg = m_builder.CreateNeg(in, name("neg"));
+   return neg;
+}
 
-      Constant* const_int0 = Constant::getNullValue(IntegerType::get(32));
-      std::vector<Constant*> const_ptr_21_indices;
-      const_ptr_21_indices.push_back(const_int0);
-      const_ptr_21_indices.push_back(const_int0);
-      m_fmtPtr = ConstantExpr::getGetElementPtr(globalFormat,
-                                                &const_ptr_21_indices[0], const_ptr_21_indices.size());
-   }
+llvm::Value * Instructions::pow(llvm::Value *in1, llvm::Value *in2)
+{
+   Value *x1 = m_builder.CreateExtractElement(in1,
+                                              m_storage->constantInt(0),
+                                              name("x1"));
+   Value *x2 = m_builder.CreateExtractElement(in2,
+                                              m_storage->constantInt(0),
+                                              name("x2"));
+   llvm::Value *val = callPow(x1, x2);
+   return vectorFromVals(val, val, val, val);
+}
 
-   Function *func_printf = m_mod->getFunction("printf");
-   if (!func_printf)
-      func_printf = declarePrintf();
-   assert(func_printf);
-   std::vector<llvm::Value*> vec = extractVector(val);
-   Value *dx = m_builder.CreateFPExt(vec[0], Type::DoubleTy, name("dx"));
-   Value *dy = m_builder.CreateFPExt(vec[1], Type::DoubleTy, name("dy"));
-   Value *dz = m_builder.CreateFPExt(vec[2], Type::DoubleTy, name("dz"));
-   Value *dw = m_builder.CreateFPExt(vec[3], Type::DoubleTy, name("dw"));
-   std::vector<Value*> params;
-   params.push_back(m_fmtPtr);
-   params.push_back(dx);
-   params.push_back(dy);
-   params.push_back(dz);
-   params.push_back(dw);
-   CallInst *call = m_builder.CreateCall(func_printf, params.begin(), params.end(),
-                                         name("printf"));
-   call->setCallingConv(CallingConv::C);
-   call->setTailCall(true);
+llvm::Value * Instructions::rcp(llvm::Value *in1)
+{
+   Value *x1 = m_builder.CreateExtractElement(in1,
+                                              m_storage->constantInt(0),
+                                              name("x1"));
+   Value *res = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)),
+                                     x1, name("rcp"));
+   return vectorFromVals(res, res, res, res);
+}
+
+llvm::Value * Instructions::rsq(llvm::Value *in1)
+{
+   Value *x = m_builder.CreateExtractElement(in1,
+                                             m_storage->constantInt(0),
+                                             name("extractx"));
+   Value *abs  = callFAbs(x);
+   Value *sqrt = callFSqrt(abs);
+
+   Value *rsqrt = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)),
+                                       sqrt,
+                                       name("rsqrt"));
+   return vectorFromVals(rsqrt, rsqrt, rsqrt, rsqrt);
+}
+
+llvm::Value * Instructions::scs(llvm::Value *in)
+{
+   llvm::Function *func = m_mod->getFunction("scs");
+   assert(func);
+
+   CallInst *call = m_builder.CreateCall(func, in, name("scsres"));
+   call->setTailCall(false);
+   return call;
+}
+
+llvm::Value * Instructions::seq(llvm::Value *in1, llvm::Value *in2)
+{
+   Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
+   Constant *const0f = Constant::getNullValue(Type::FloatTy);
+
+   std::vector<llvm::Value*> vec1 = extractVector(in1);
+   std::vector<llvm::Value*> vec2 = extractVector(in2);
+
+   Value *xcmp = m_builder.CreateFCmpOEQ(vec1[0], vec2[0], name("xcmp"));
+   Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
+
+   Value *ycmp = m_builder.CreateFCmpOEQ(vec1[1], vec2[1], name("ycmp"));
+   Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
+
+   Value *zcmp = m_builder.CreateFCmpOEQ(vec1[2], vec2[2], name("zcmp"));
+   Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
+
+   Value *wcmp = m_builder.CreateFCmpOEQ(vec1[3], vec2[3], name("wcmp"));
+   Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
+
+   return vectorFromVals(x, y, z, w);
 }
 
-llvm::Function * Instructions::declarePrintf()
+llvm::Value * Instructions::sfl(llvm::Value *in1, llvm::Value *in2)
 {
-   std::vector<const Type*> args;
-   PAListPtr params;
-   FunctionType* funcTy = FunctionType::get(
-      /*Result=*/IntegerType::get(32),
-      /*Params=*/args,
-      /*isVarArg=*/true);
-   Function* func_printf = Function::Create(
-      /*Type=*/funcTy,
-      /*Linkage=*/GlobalValue::ExternalLinkage,
-      /*Name=*/"printf", m_mod);
-   func_printf->setCallingConv(CallingConv::C);
-   func_printf->setParamAttrs(params);
-   return func_printf;
+   Constant *const0f = Constant::getNullValue(Type::FloatTy);
+
+   return vectorFromVals(const0f, const0f, const0f, const0f);
 }
 
+llvm::Value * Instructions::sge(llvm::Value *in1, llvm::Value *in2)
+{
+   Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
+   Constant *const0f = Constant::getNullValue(Type::FloatTy);
+
+   std::vector<llvm::Value*> vec1 = extractVector(in1);
+   std::vector<llvm::Value*> vec2 = extractVector(in2);
+
+   Value *xcmp = m_builder.CreateFCmpOGE(vec1[0], vec2[0], name("xcmp"));
+   Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
+
+   Value *ycmp = m_builder.CreateFCmpOGE(vec1[1], vec2[1], name("ycmp"));
+   Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
+
+   Value *zcmp = m_builder.CreateFCmpOGE(vec1[2], vec2[2], name("zcmp"));
+   Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
+
+   Value *wcmp = m_builder.CreateFCmpOGE(vec1[3], vec2[3], name("wcmp"));
+   Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
+
+   return vectorFromVals(x, y, z, w);
+}
 
 llvm::Value * Instructions::sgt(llvm::Value *in1, llvm::Value *in2)
 {
@@ -543,7 +701,18 @@ llvm::Value * Instructions::sgt(llvm::Value *in1, llvm::Value *in2)
 
    return vectorFromVals(x, y, z, w);
 }
-llvm::Value * Instructions::sge(llvm::Value *in1, llvm::Value *in2)
+
+llvm::Value * Instructions::sin(llvm::Value *in)
+{
+   // "vsin" must already be present in the module; this only emits the call.
+   llvm::Function *vsin = m_mod->getFunction("vsin");
+   assert(vsin);
+   CallInst *res = m_builder.CreateCall(vsin, in, name("sinres"));
+   res->setTailCall(false);
+   return res;
+}
+
+llvm::Value * Instructions::sle(llvm::Value *in1, llvm::Value *in2)
 {
    Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
    Constant *const0f = Constant::getNullValue(Type::FloatTy);
@@ -551,22 +720,21 @@ llvm::Value * Instructions::sge(llvm::Value *in1, llvm::Value *in2)
    std::vector<llvm::Value*> vec1 = extractVector(in1);
    std::vector<llvm::Value*> vec2 = extractVector(in2);
 
-   Value *xcmp = m_builder.CreateFCmpOGE(vec1[0], vec2[0], name("xcmp"));
+   Value *xcmp = m_builder.CreateFCmpOLE(vec1[0], vec2[0], name("xcmp"));
    Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
 
-   Value *ycmp = m_builder.CreateFCmpOGE(vec1[1], vec2[1], name("ycmp"));
+   Value *ycmp = m_builder.CreateFCmpOLE(vec1[1], vec2[1], name("ycmp"));
    Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
 
-   Value *zcmp = m_builder.CreateFCmpOGE(vec1[2], vec2[2], name("zcmp"));
+   Value *zcmp = m_builder.CreateFCmpOLE(vec1[2], vec2[2], name("zcmp"));
    Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
 
-   Value *wcmp = m_builder.CreateFCmpOGE(vec1[3], vec2[3], name("wcmp"));
+   Value *wcmp = m_builder.CreateFCmpOLE(vec1[3], vec2[3], name("wcmp"));
    Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
 
    return vectorFromVals(x, y, z, w);
 }
 
-
 llvm::Value * Instructions::slt(llvm::Value *in1, llvm::Value *in2)
 {
    Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
@@ -590,169 +758,331 @@ llvm::Value * Instructions::slt(llvm::Value *in1, llvm::Value *in2)
    return vectorFromVals(x, y, z, w);
 }
 
-llvm::Value * Instructions::cross(llvm::Value *in1, llvm::Value *in2)
+llvm::Value * Instructions::sne(llvm::Value *in1, llvm::Value *in2)
 {
-   Value *x1 = m_builder.CreateExtractElement(in1,
-                                              m_storage->constantInt(0),
-                                              name("x1"));
-   Value *y1 = m_builder.CreateExtractElement(in1,
-                                              m_storage->constantInt(1),
-                                              name("y1"));
-   Value *z1 = m_builder.CreateExtractElement(in1,
-                                              m_storage->constantInt(2),
-                                              name("z1"));
+   Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
+   Constant *const0f = Constant::getNullValue(Type::FloatTy);
 
-   Value *x2 = m_builder.CreateExtractElement(in2,
-                                              m_storage->constantInt(0),
-                                              name("x2"));
-   Value *y2 = m_builder.CreateExtractElement(in2,
-                                              m_storage->constantInt(1),
-                                              name("y2"));
-   Value *z2 = m_builder.CreateExtractElement(in2,
-                                              m_storage->constantInt(2),
-                                              name("z2"));
-   Value *y1z2 = mul(y1, z2);
-   Value *z1y2 = mul(z1, y2);
+   std::vector<llvm::Value*> vec1 = extractVector(in1);
+   std::vector<llvm::Value*> vec2 = extractVector(in2);
 
-   Value *z1x2 = mul(z1, x2);
-   Value *x1z2 = mul(x1, z2);
+   Value *xcmp = m_builder.CreateFCmpONE(vec1[0], vec2[0], name("xcmp"));
+   Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
 
-   Value *x1y2 = mul(x1, y2);
-   Value *y1x2 = mul(y1, x2);
+   Value *ycmp = m_builder.CreateFCmpONE(vec1[1], vec2[1], name("ycmp"));
+   Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
 
-   return vectorFromVals(sub(y1z2, z1y2), sub(z1x2, x1z2), sub(x1y2, y1x2));
+   Value *zcmp = m_builder.CreateFCmpONE(vec1[2], vec2[2], name("zcmp"));
+   Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
+
+   Value *wcmp = m_builder.CreateFCmpONE(vec1[3], vec2[3], name("wcmp"));
+   Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
+
+   return vectorFromVals(x, y, z, w);
 }
 
+llvm::Value * Instructions::str(llvm::Value *in1, llvm::Value *in2)
+{
+   Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
 
-llvm::Value * Instructions::abs(llvm::Value *in)
+   return vectorFromVals(const1f, const1f, const1f, const1f);
+}
+
+llvm::Value * Instructions::sub(llvm::Value *in1, llvm::Value *in2)
+{
+   // Single subtract instruction computing in1 - in2 on the operands as given.
+   return m_builder.CreateSub(in1, in2, name("sub"));
+}
+
+llvm::Value * Instructions::trunc(llvm::Value *in)
 {
    std::vector<llvm::Value*> vec = extractVector(in);
-   Value *xabs  = callFAbs(vec[0]);
-   Value *yabs  = callFAbs(vec[1]);
-   Value *zabs  = callFAbs(vec[2]);
-   Value *wabs  = callFAbs(vec[3]);
-   return vectorFromVals(xabs, yabs, zabs, wabs);
+   Value *icastx = m_builder.CreateFPToSI(vec[0], IntegerType::get(32),
+                                          name("ftoix"));
+   Value *icasty = m_builder.CreateFPToSI(vec[1], IntegerType::get(32),
+                                          name("ftoiy"));
+   Value *icastz = m_builder.CreateFPToSI(vec[2], IntegerType::get(32),
+                                          name("ftoiz"));
+   Value *icastw = m_builder.CreateFPToSI(vec[3], IntegerType::get(32),
+                                          name("ftoiw"));
+   Value *fx = m_builder.CreateSIToFP(icastx, Type::FloatTy,
+                                      name("fx"));
+   Value *fy = m_builder.CreateSIToFP(icasty, Type::FloatTy,
+                                      name("fy"));
+   Value *fz = m_builder.CreateSIToFP(icastz, Type::FloatTy,
+                                      name("fz"));
+   Value *fw = m_builder.CreateSIToFP(icastw, Type::FloatTy,
+                                      name("fw"));
+   return vectorFromVals(fx, fy, fz, fw);
 }
 
-void Instructions::ifop(llvm::Value *in)
+llvm::Value * Instructions::x2d(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
 {
-   BasicBlock *ifthen = BasicBlock::Create(name("ifthen"), m_func,0);
-   BasicBlock *ifend = BasicBlock::Create(name("ifthenend"), m_func,0);
+   std::vector<llvm::Value*> vec1 = extractVector(in1);
+   std::vector<llvm::Value*> vec2 = extractVector(in2);
+   std::vector<llvm::Value*> vec3 = extractVector(in3);
 
-   //BasicBlock *yblock = new BasicBlock(name("yblock"), m_func,0);
-   //BasicBlock *zblock = new BasicBlock(name("zblock"), m_func,0);
-   //BasicBlock *wblock = new BasicBlock(name("wblock"), m_func,0);
+   Value *x2x3 = m_builder.CreateMul( vec2[0], vec3[0], name("x2x3"));
+   Value *y2y3 = m_builder.CreateMul( vec2[1], vec3[1], name("y2y3"));
+   Value *x1px2x3 = m_builder.CreateAdd (vec1[0], x2x3, name("x1 + x2x3"));
+   Value *x1px2x3py2y3 = m_builder.CreateAdd (x1px2x3, y2y3, name("x1 + x2x3 + y2y3"));
 
-   Constant *float0 = Constant::getNullValue(Type::FloatTy);
+   Value *x2z3 = m_builder.CreateMul( vec2[0], vec3[2], name("x2z3"));
+   Value *y2w3 = m_builder.CreateMul( vec2[1], vec3[3], name("y2w3"));
+   Value *y1px2z3 = m_builder.CreateAdd (vec1[1], x2z3, name("y1 + x2z3"));
+   Value *y1px2z3py2w3 = m_builder.CreateAdd (y1px2z3, y2w3, name("y1 + x2z3 + y2w3"));
 
-   Value *x = m_builder.CreateExtractElement(in, m_storage->constantInt(0),
-                                             name("extractx"));
-   Value *xcmp = m_builder.CreateFCmpUNE(x, float0, name("xcmp"));
-   m_builder.CreateCondBr(xcmp, ifthen, ifend);
-   //m_builder.SetInsertPoint(yblock);
+   return vectorFromVals(x1px2x3py2y3, y1px2z3py2w3, x1px2x3py2y3, y1px2z3py2w3);
+}
 
-   m_builder.SetInsertPoint(ifthen);
-   m_ifStack.push(ifend);
+void Instructions::printVector(llvm::Value *val)
+{
+   static const char *frmt = "Vector is [%f, %f, %f, %f]\x0A";
+   // Emits a call to printf that prints the four components of val using frmt.
+   if (!m_fmtPtr) { // first use only: build the format-string global and cache a pointer to it
+      Constant *format = ConstantArray::get(frmt, true);
+      ArrayType *arrayTy = ArrayType::get(IntegerType::get(8), strlen(frmt) + 1); // i8 array; the +1 presumably covers the terminating NUL
+      GlobalVariable* globalFormat = new GlobalVariable(
+         /*Type=*/arrayTy,
+         /*isConstant=*/true,
+         /*Linkage=*/GlobalValue::InternalLinkage,
+         /*Initializer=*/0, // has initializer, specified below
+         /*Name=*/name(".str"),
+         m_mod);
+      globalFormat->setInitializer(format);
+      // Constant element pointer with indices (0, 0) into the global; stored in m_fmtPtr for later calls.
+      Constant* const_int0 = Constant::getNullValue(IntegerType::get(32));
+      std::vector<Constant*> const_ptr_21_indices;
+      const_ptr_21_indices.push_back(const_int0);
+      const_ptr_21_indices.push_back(const_int0);
+      m_fmtPtr = ConstantExpr::getGetElementPtr(globalFormat,
+                                                &const_ptr_21_indices[0], const_ptr_21_indices.size());
+   }
+   // Reuse the module's printf declaration if there is one, otherwise declare it now.
+   Function *func_printf = m_mod->getFunction("printf");
+   if (!func_printf)
+      func_printf = declarePrintf();
+   assert(func_printf);
+   std::vector<llvm::Value*> vec = extractVector(val);
+   Value *dx = m_builder.CreateFPExt(vec[0], Type::DoubleTy, name("dx")); // each component is widened to double before being passed
+   Value *dy = m_builder.CreateFPExt(vec[1], Type::DoubleTy, name("dy"));
+   Value *dz = m_builder.CreateFPExt(vec[2], Type::DoubleTy, name("dz"));
+   Value *dw = m_builder.CreateFPExt(vec[3], Type::DoubleTy, name("dw"));
+   std::vector<Value*> params; // argument list: format pointer first, then x, y, z, w
+   params.push_back(m_fmtPtr);
+   params.push_back(dx);
+   params.push_back(dy);
+   params.push_back(dz);
+   params.push_back(dw);
+   CallInst *call = m_builder.CreateCall(func_printf, params.begin(), params.end(),
+                                         name("printf"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(true); // NOTE(review): the call* helpers in this file pass false here -- confirm true is intended
 }
 
-llvm::BasicBlock * Instructions::currentBlock() const
+const char * Instructions::name(const char *prefix)
 {
-   return m_builder.GetInsertBlock();
+   ++m_idx;
+   snprintf(m_name, 32, "%s%d", prefix, m_idx);
+   return m_name;
 }
 
-void Instructions::elseop()
+llvm::Value *Instructions::callFAbs(llvm::Value *val)
 {
-   assert(!m_ifStack.empty());
-   BasicBlock *ifend = BasicBlock::Create(name("ifend"), m_func,0);
-   m_builder.CreateBr(ifend);
-   m_builder.SetInsertPoint(m_ifStack.top());
-   currentBlock()->setName(name("ifelse"));
-   m_ifStack.pop();
-   m_ifStack.push(ifend);
+   if (!m_llvmFAbs) {
+      // predeclare the intrinsic
+      std::vector<const Type*> fabsArgs;
+      fabsArgs.push_back(Type::FloatTy);
+      PAListPtr fabsPal;
+      FunctionType* fabsType = FunctionType::get(
+         /*Result=*/Type::FloatTy,
+         /*Params=*/fabsArgs,
+         /*isVarArg=*/false);
+      m_llvmFAbs = Function::Create(
+         /*Type=*/fabsType,
+         /*Linkage=*/GlobalValue::ExternalLinkage,
+         /*Name=*/"fabs", m_mod);
+      m_llvmFAbs->setCallingConv(CallingConv::C);
+      m_llvmFAbs->setParamAttrs(fabsPal);
+   }
+   CallInst *call = m_builder.CreateCall(m_llvmFAbs, val,
+                                         name("fabs"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
 }
 
-void Instructions::endif()
+llvm::Value * Instructions::callFExp(llvm::Value *val)
 {
-   assert(!m_ifStack.empty());
-   m_builder.CreateBr(m_ifStack.top());
-   m_builder.SetInsertPoint(m_ifStack.top());
-   m_ifStack.pop();
+   if (!m_llvmFexp) {
+      // predeclare the intrinsic
+      std::vector<const Type*> fexpArgs;
+      fexpArgs.push_back(Type::FloatTy);
+      PAListPtr fexpPal;
+      FunctionType* fexpType = FunctionType::get(
+         /*Result=*/Type::FloatTy,
+         /*Params=*/fexpArgs,
+         /*isVarArg=*/false);
+      m_llvmFexp = Function::Create(
+         /*Type=*/fexpType,
+         /*Linkage=*/GlobalValue::ExternalLinkage,
+         /*Name=*/"expf", m_mod);
+      m_llvmFexp->setCallingConv(CallingConv::C);
+      m_llvmFexp->setParamAttrs(fexpPal);
+   }
+   CallInst *call = m_builder.CreateCall(m_llvmFexp, val,
+                                         name("expf"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
 }
 
-llvm::Value * Instructions::lerp(llvm::Value *in1, llvm::Value *in2,
-                                 llvm::Value *in3)
+llvm::Value * Instructions::callFLog(llvm::Value *val)
 {
-   llvm::Value *m = mul(in1, in2);
-   llvm::Value *vec1 = constVector(1.f, 1.f, 1.f, 1.f);
-   llvm::Value *s = sub(vec1, in1);
-   return add(m, mul(s, in3));
+   if (!m_llvmFlog) {
+      // predeclare the intrinsic
+      std::vector<const Type*> flogArgs;
+      flogArgs.push_back(Type::FloatTy);
+      PAListPtr flogPal;
+      FunctionType* flogType = FunctionType::get(
+         /*Result=*/Type::FloatTy,
+         /*Params=*/flogArgs,
+         /*isVarArg=*/false);
+      m_llvmFlog = Function::Create(
+         /*Type=*/flogType,
+         /*Linkage=*/GlobalValue::ExternalLinkage,
+         /*Name=*/"logf", m_mod);
+      m_llvmFlog->setCallingConv(CallingConv::C);
+      m_llvmFlog->setParamAttrs(flogPal);
+   }
+   CallInst *call = m_builder.CreateCall(m_llvmFlog, val,
+                                         name("logf"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
 }
 
-void Instructions::beginLoop()
+llvm::Value * Instructions::callFloor(llvm::Value *val)
 {
-   BasicBlock *begin = BasicBlock::Create(name("loop"), m_func,0);
-   BasicBlock *end = BasicBlock::Create(name("endloop"), m_func,0);
-
-   m_builder.CreateBr(begin);
-   Loop loop;
-   loop.begin = begin;
-   loop.end   = end;
-   m_builder.SetInsertPoint(begin);
-   m_loopStack.push(loop);
+   if (!m_llvmFloor) {
+      // predeclare the intrinsic
+      std::vector<const Type*> floorArgs;
+      floorArgs.push_back(Type::FloatTy);
+      PAListPtr floorPal;
+      FunctionType* floorType = FunctionType::get(
+         /*Result=*/Type::FloatTy,
+         /*Params=*/floorArgs,
+         /*isVarArg=*/false);
+      m_llvmFloor = Function::Create(
+         /*Type=*/floorType,
+         /*Linkage=*/GlobalValue::ExternalLinkage,
+         /*Name=*/"floorf", m_mod);
+      m_llvmFloor->setCallingConv(CallingConv::C);
+      m_llvmFloor->setParamAttrs(floorPal);
+   }
+   CallInst *call =  m_builder.CreateCall(m_llvmFloor, val,
+                                          name("floorf"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
 }
 
-void Instructions::endLoop()
+llvm::Value *Instructions::callFSqrt(llvm::Value *val)
 {
-   assert(!m_loopStack.empty());
-   Loop loop = m_loopStack.top();
-   m_builder.CreateBr(loop.begin);
-   loop.end->moveAfter(currentBlock());
-   m_builder.SetInsertPoint(loop.end);
-   m_loopStack.pop();
+   if (!m_llvmFSqrt) {
+      // predeclare the intrinsic
+      std::vector<const Type*> fsqrtArgs;
+      fsqrtArgs.push_back(Type::FloatTy);
+      PAListPtr fsqrtPal;
+      FunctionType* fsqrtType = FunctionType::get(
+         /*Result=*/Type::FloatTy,
+         /*Params=*/fsqrtArgs,
+         /*isVarArg=*/false);
+      m_llvmFSqrt = Function::Create(
+         /*Type=*/fsqrtType,
+         /*Linkage=*/GlobalValue::ExternalLinkage,
+         /*Name=*/"llvm.sqrt.f32", m_mod);
+      m_llvmFSqrt->setCallingConv(CallingConv::C);
+      m_llvmFSqrt->setParamAttrs(fsqrtPal);
+   }
+   CallInst *call = m_builder.CreateCall(m_llvmFSqrt, val,
+                                         name("sqrt"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
 }
 
-void Instructions::brk()
+llvm::Value * Instructions::callPow(llvm::Value *val1, llvm::Value *val2)
 {
-   assert(!m_loopStack.empty());
-   BasicBlock *unr = BasicBlock::Create(name("unreachable"), m_func,0);
-   m_builder.CreateBr(m_loopStack.top().end);
-   m_builder.SetInsertPoint(unr);
+   if (!m_llvmPow) {
+      // predeclare the intrinsic
+      std::vector<const Type*> powArgs;
+      powArgs.push_back(Type::FloatTy);
+      powArgs.push_back(Type::FloatTy);
+      PAListPtr powPal;
+      FunctionType* powType = FunctionType::get(
+         /*Result=*/Type::FloatTy,
+         /*Params=*/powArgs,
+         /*isVarArg=*/false);
+      m_llvmPow = Function::Create(
+         /*Type=*/powType,
+         /*Linkage=*/GlobalValue::ExternalLinkage,
+         /*Name=*/"llvm.pow.f32", m_mod);
+      m_llvmPow->setCallingConv(CallingConv::C);
+      m_llvmPow->setParamAttrs(powPal);
+   }
+   std::vector<Value*> params;
+   params.push_back(val1);
+   params.push_back(val2);
+   CallInst *call = m_builder.CreateCall(m_llvmPow, params.begin(), params.end(),
+                                         name("pow"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
 }
 
-llvm::Value * Instructions::trunc(llvm::Value *in)
+llvm::Value * Instructions::vectorFromVals(llvm::Value *x, llvm::Value *y,
+                                           llvm::Value *z, llvm::Value *w)
 {
-   std::vector<llvm::Value*> vec = extractVector(in);
-   Value *icastx = m_builder.CreateFPToSI(vec[0], IntegerType::get(32),
-                                          name("ftoix"));
-   Value *icasty = m_builder.CreateFPToSI(vec[1], IntegerType::get(32),
-                                          name("ftoiy"));
-   Value *icastz = m_builder.CreateFPToSI(vec[2], IntegerType::get(32),
-                                          name("ftoiz"));
-   Value *icastw = m_builder.CreateFPToSI(vec[3], IntegerType::get(32),
-                                          name("ftoiw"));
-   Value *fx = m_builder.CreateSIToFP(icastx, Type::FloatTy,
-                                      name("fx"));
-   Value *fy = m_builder.CreateSIToFP(icasty, Type::FloatTy,
-                                      name("fy"));
-   Value *fz = m_builder.CreateSIToFP(icastz, Type::FloatTy,
-                                      name("fz"));
-   Value *fw = m_builder.CreateSIToFP(icastw, Type::FloatTy,
-                                      name("fw"));
-   return vectorFromVals(fx, fy, fz, fw);
+   Constant *const_vec = Constant::getNullValue(m_floatVecType);
+   Value *res = m_builder.CreateInsertElement(const_vec, x,
+                                              m_storage->constantInt(0),
+                                              name("vecx"));
+   res = m_builder.CreateInsertElement(res, y, m_storage->constantInt(1),
+                               name("vecxy"));
+   res = m_builder.CreateInsertElement(res, z, m_storage->constantInt(2),
+                               name("vecxyz"));
+   if (w)
+      res = m_builder.CreateInsertElement(res, w, m_storage->constantInt(3),
+                                          name("vecxyzw"));
+   return res;
 }
 
-void Instructions::end()
+llvm::Value * Instructions::constVector(float x, float y, float z, float w)
 {
-   m_builder.CreateRetVoid();
+   std::vector<Constant*> vec(4);
+   vec[0] = ConstantFP::get(APFloat(x));
+   vec[1] = ConstantFP::get(APFloat(y));
+   vec[2] = ConstantFP::get(APFloat(z));
+   vec[3] = ConstantFP::get(APFloat(w));
+   return ConstantVector::get(m_floatVecType, vec);
 }
 
-void Instructions::cal(int label, llvm::Value *input)
+llvm::Function * Instructions::declarePrintf()
 {
-   std::vector<Value*> params;
-   params.push_back(input);
-   llvm::Function *func = findFunction(label);
-
-   m_builder.CreateCall(func, params.begin(), params.end());
+   std::vector<const Type*> args;
+   PAListPtr params;
+   FunctionType* funcTy = FunctionType::get(
+      /*Result=*/IntegerType::get(32),
+      /*Params=*/args,
+      /*isVarArg=*/true);
+   Function* func_printf = Function::Create(
+      /*Type=*/funcTy,
+      /*Linkage=*/GlobalValue::ExternalLinkage,
+      /*Name=*/"printf", m_mod);
+   func_printf->setCallingConv(CallingConv::C);
+   func_printf->setParamAttrs(params);
+   return func_printf;
 }
 
 llvm::Function * Instructions::declareFunc(int label)
@@ -778,27 +1108,6 @@ llvm::Function * Instructions::declareFunc(int label)
    return func;
 }
 
-void Instructions::bgnSub(unsigned label)
-{
-   llvm::Function *func = findFunction(label);
-
-   Function::arg_iterator args = func->arg_begin();
-   Value *ptr_INPUT = args++;
-   ptr_INPUT->setName("INPUT");
-   m_storage->pushArguments(ptr_INPUT);
-
-   llvm::BasicBlock *entry = BasicBlock::Create("entry", func, 0);
-
-   m_func = func;
-   m_builder.SetInsertPoint(entry);
-}
-
-void Instructions::endSub()
-{
-   m_func = 0;
-   m_builder.SetInsertPoint(0);
-}
-
 llvm::Function * Instructions::findFunction(int label)
 {
    llvm::Function *func = m_functions[label];
@@ -809,17 +1118,6 @@ llvm::Function * Instructions::findFunction(int label)
    return func;
 }
 
-llvm::Value * Instructions::constVector(float x, float y, float z, float w)
-{
-   std::vector<Constant*> vec(4);
-   vec[0] = ConstantFP::get(APFloat(x));
-   vec[1] = ConstantFP::get(APFloat(y));
-   vec[2] = ConstantFP::get(APFloat(z));
-   vec[3] = ConstantFP::get(APFloat(w));
-   return ConstantVector::get(m_floatVecType, vec);
-}
-
-
 std::vector<llvm::Value*> Instructions::extractVector(llvm::Value *vec)
 {
    std::vector<llvm::Value*> elems(4);
@@ -834,69 +1132,7 @@ std::vector<llvm::Value*> Instructions::extractVector(llvm::Value *vec)
    return elems;
 }
 
-llvm::Value * Instructions::cmp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
-{
-   llvm::Function *func = m_mod->getFunction("cmp");
-   assert(func);
-
-   std::vector<Value*> params;
-   params.push_back(in1);
-   params.push_back(in2);
-   params.push_back(in3);
-   CallInst *call = m_builder.CreateCall(func, params.begin(), params.end(), name("cmpres"));
-   call->setTailCall(false);
-   return call;
-}
-
-llvm::Value * Instructions::cos(llvm::Value *in)
-{
-#if 0
-   llvm::Function *func = m_mod->getFunction("vcos");
-   assert(func);
-
-   CallInst *call = m_builder.CreateCall(func, in, name("cosres"));
-   call->setTailCall(false);
-   return call;
-#else
-   std::vector<llvm::Value*> elems = extractVector(in);
-   Function *func = m_mod->getFunction("cosf");
-   assert(func);
-   CallInst *cos = m_builder.CreateCall(func, elems[0], name("cosres"));
-   cos->setCallingConv(CallingConv::C);
-   cos->setTailCall(true);
-   return vectorFromVals(cos, cos, cos, cos);
-#endif
-}
-
-llvm::Value * Instructions::scs(llvm::Value *in)
-{
-   llvm::Function *func = m_mod->getFunction("scs");
-   assert(func);
-
-   CallInst *call = m_builder.CreateCall(func, in, name("scsres"));
-   call->setTailCall(false);
-   return call;
-}
-
-llvm::Value * Instructions::kil(llvm::Value *in)
-{
-   llvm::Function *func = m_mod->getFunction("kil");
-   assert(func);
-
-   CallInst *call = m_builder.CreateCall(func, in, name("kilpres"));
-   call->setTailCall(false);
-   return call;
-}
-
-llvm::Value * Instructions::sin(llvm::Value *in)
-{
-   llvm::Function *func = m_mod->getFunction("vsin");
-   assert(func);
 
-   CallInst *call = m_builder.CreateCall(func, in, name("sinres"));
-   call->setTailCall(false);
-   return call;
-}
 #endif //MESA_LLVM
 
 
diff --git a/src/gallium/auxiliary/gallivm/instructions.h b/src/gallium/auxiliary/gallivm/instructions.h
index d286ce80c7..8df30f62c8 100644
--- a/src/gallium/auxiliary/gallivm/instructions.h
+++ b/src/gallium/auxiliary/gallivm/instructions.h
@@ -57,15 +57,22 @@ public:
    llvm::BasicBlock *currentBlock() const;
 
    llvm::Value *abs(llvm::Value *in1);
-   llvm::Value *arl(llvm::Value *in1);
    llvm::Value *add(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *arl(llvm::Value *in1);
    void         beginLoop();
    void         bgnSub(unsigned);
    void         brk();
    void         cal(int label, llvm::Value *input);
+   llvm::Value *clamp(llvm::Value *in);
    llvm::Value *cmp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3);
+   llvm::Value *cnd(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3);
+   llvm::Value *cnd0(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3);
    llvm::Value *cos(llvm::Value *in);
    llvm::Value *cross(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *ddx(llvm::Value *in);
+   llvm::Value *ddy(llvm::Value *in);
+   llvm::Value *div(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *dot2add(llvm::Value *in, llvm::Value *in2, llvm::Value *in3);
    llvm::Value *dp3(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *dp4(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *dph(llvm::Value *in1, llvm::Value *in2);
@@ -75,6 +82,7 @@ public:
    void         endLoop();
    void         end();
    void         endSub();
+   llvm::Value *exp(llvm::Value *in);
    llvm::Value *ex2(llvm::Value *in);
    llvm::Value *floor(llvm::Value *in);
    llvm::Value *frc(llvm::Value *in);
@@ -82,32 +90,41 @@ public:
    llvm::Value *kil(llvm::Value *in);
    llvm::Value *lerp(llvm::Value *in1, llvm::Value *in2,
                      llvm::Value *in3);
-   llvm::Value *lit(llvm::Value *in);
    llvm::Value *lg2(llvm::Value *in);
+   llvm::Value *lit(llvm::Value *in);
+   llvm::Value *log(llvm::Value *in);
    llvm::Value *madd(llvm::Value *in1, llvm::Value *in2,
                      llvm::Value *in3);
-   llvm::Value *min(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *max(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *min(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *mul(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *neg(llvm::Value *in);
    llvm::Value *pow(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *rcp(llvm::Value *in);
    llvm::Value *rsq(llvm::Value *in);
    llvm::Value *scs(llvm::Value *in);
+   llvm::Value *seq(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *sfl(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *sge(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *sgt(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *sin(llvm::Value *in);
+   llvm::Value *sle(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *slt(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *sne(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *str(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *sub(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *trunc(llvm::Value *in);
+   llvm::Value *x2d(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3);
 
    void printVector(llvm::Value *val);
 private:
    const char *name(const char *prefix);
 
    llvm::Value *callFAbs(llvm::Value *val);
+   llvm::Value *callFExp(llvm::Value *val);
+   llvm::Value *callFLog(llvm::Value *val);
    llvm::Value *callFloor(llvm::Value *val);
    llvm::Value *callFSqrt(llvm::Value *val);
-   llvm::Value *callFLog(llvm::Value *val);
    llvm::Value *callPow(llvm::Value *val1, llvm::Value *val2);
 
    llvm::Value *vectorFromVals(llvm::Value *x, llvm::Value *y,
@@ -135,6 +152,7 @@ private:
    llvm::Function   *m_llvmPow;
    llvm::Function   *m_llvmFloor;
    llvm::Function   *m_llvmFlog;
+   llvm::Function   *m_llvmFexp;
    llvm::Function   *m_llvmLit;
 
    llvm::Constant   *m_fmtPtr;
diff --git a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
index cc1516a45e..398fbd67bd 100644
--- a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
+++ b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
@@ -286,9 +286,13 @@ translate_instruction(llvm::Module *module,
       out = instr->rsq(inputs[0]);
    }
       break;
-   case TGSI_OPCODE_EXP:
+   case TGSI_OPCODE_EXP: {
+      out = instr->exp(inputs[0]);
+   }
       break;
-   case TGSI_OPCODE_LOG:
+   case TGSI_OPCODE_LOG: {
+      out = instr->log(inputs[0]);
+   }
       break;
    case TGSI_OPCODE_MUL: {
       out = instr->mul(inputs[0], inputs[1]);
@@ -338,21 +342,31 @@ translate_instruction(llvm::Module *module,
       out = instr->lerp(inputs[0], inputs[1], inputs[2]);
    }
       break;
-   case TGSI_OPCODE_CND:
+   case TGSI_OPCODE_CND: {
+      out = instr->cnd(inputs[0], inputs[1], inputs[2]);
+   }
       break;
-   case TGSI_OPCODE_CND0:
+   case TGSI_OPCODE_CND0: {
+      out = instr->cnd0(inputs[0], inputs[1], inputs[2]);
+   }
       break;
-   case TGSI_OPCODE_DOT2ADD:
+   case TGSI_OPCODE_DOT2ADD: {
+      out = instr->dot2add(inputs[0], inputs[1], inputs[2]);
+   }
       break;
    case TGSI_OPCODE_INDEX:
       break;
-   case TGSI_OPCODE_NEGATE:
+   case TGSI_OPCODE_NEGATE: {
+      out = instr->neg(inputs[0]);
+   }
       break;
    case TGSI_OPCODE_FRAC: {
       out = instr->frc(inputs[0]);
    }
       break;
-   case TGSI_OPCODE_CLAMP:
+   case TGSI_OPCODE_CLAMP: {
+      out = instr->clamp(inputs[0]);
+   }
       break;
    case TGSI_OPCODE_FLOOR: {
       out = instr->floor(inputs[0]);
@@ -392,9 +406,13 @@ translate_instruction(llvm::Module *module,
       out = instr->cos(inputs[0]);
    }
       break;
-   case TGSI_OPCODE_DDX:
+   case TGSI_OPCODE_DDX: {
+      out = instr->ddx(inputs[0]);
+   }
       break;
-   case TGSI_OPCODE_DDY:
+   case TGSI_OPCODE_DDY: {
+      out = instr->ddy(inputs[0]);
+   }
       break;
    case TGSI_OPCODE_KILP:
       break;
@@ -408,9 +426,13 @@ translate_instruction(llvm::Module *module,
       break;
    case TGSI_OPCODE_RFL:
       break;
-   case TGSI_OPCODE_SEQ:
+   case TGSI_OPCODE_SEQ: {
+      out = instr->seq(inputs[0], inputs[1]);
+   }
       break;
-   case TGSI_OPCODE_SFL:
+   case TGSI_OPCODE_SFL: {
+      out = instr->sfl(inputs[0], inputs[1]);
+   }
       break;
    case TGSI_OPCODE_SGT: {
       out = instr->sgt(inputs[0], inputs[1]);
@@ -420,11 +442,17 @@ translate_instruction(llvm::Module *module,
       out = instr->sin(inputs[0]);
    }
       break;
-   case TGSI_OPCODE_SLE:
+   case TGSI_OPCODE_SLE: {
+      out = instr->sle(inputs[0], inputs[1]);
+   }
       break;
-   case TGSI_OPCODE_SNE:
+   case TGSI_OPCODE_SNE: {
+      out = instr->sne(inputs[0], inputs[1]);
+   }
       break;
-   case TGSI_OPCODE_STR:
+   case TGSI_OPCODE_STR: {
+      out = instr->str(inputs[0], inputs[1]);
+   }
       break;
    case TGSI_OPCODE_TEX:
       break;
@@ -438,7 +466,9 @@ translate_instruction(llvm::Module *module,
       break;
    case TGSI_OPCODE_UP4UB:
       break;
-   case TGSI_OPCODE_X2D:
+   case TGSI_OPCODE_X2D: {
+      out = instr->x2d(inputs[0], inputs[1], inputs[2]);
+   }
       break;
    case TGSI_OPCODE_ARA:
       break;
-- 
cgit v1.2.3


From a0a06cbc5b26d7530bd5066f09efe3c1f980d35d Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Sun, 28 Sep 2008 19:48:26 +0200
Subject: Gallivm: more instructions.

---
 src/gallium/auxiliary/gallivm/instructions.cpp | 61 ++++++++++++++++++++++++--
 src/gallium/auxiliary/gallivm/instructions.h   |  5 +++
 src/gallium/auxiliary/gallivm/tgsitollvm.cpp   | 15 ++++---
 3 files changed, 73 insertions(+), 8 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/instructions.cpp b/src/gallium/auxiliary/gallivm/instructions.cpp
index 5fdfe09d18..3eaf9aacf6 100644
--- a/src/gallium/auxiliary/gallivm/instructions.cpp
+++ b/src/gallium/auxiliary/gallivm/instructions.cpp
@@ -163,9 +163,18 @@ void Instructions::cal(int label, llvm::Value *input)
    m_builder.CreateCall(func, params.begin(), params.end());
 }
 
+llvm::Value * Instructions::ceil(llvm::Value *in)
+{
+   std::vector<llvm::Value*> vec = extractVector(in); // split the input into its four scalar components
+   return vectorFromVals(callCeil(vec[0]), callCeil(vec[1]), // callCeil is applied to each component separately
+                         callCeil(vec[2]), callCeil(vec[3])); // and the four results are packed back into one vector
+}
+
 llvm::Value * Instructions::clamp(llvm::Value *in1)
 {
-	// FIXME
+   llvm::Value *zero = constVector(0.0f, 0.0f, 0.0f, 0.0f); // lower bound: 0.0 in all four components
+   llvm::Value *one = constVector(1.0f, 1.0f, 1.0f, 1.0f); // upper bound: 1.0 in all four components
+   return min( max(zero, in1), one); // max against 0 first, then min against 1; presumably per-component -- confirm in min()/max()
 }
 
 llvm::Value * Instructions::cmp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
@@ -289,12 +298,14 @@ llvm::Value * Instructions::cross(llvm::Value *in1, llvm::Value *in2)
 
 llvm::Value * Instructions::ddx(llvm::Value *in)
 {
-	// FIXME
+   // FIXME
+   assert(0);
 }
 
 llvm::Value * Instructions::ddy(llvm::Value *in)
 {
-	// FIXME
+   // FIXME
+   assert(0);
 }
 
 llvm::Value * Instructions::div(llvm::Value *in1, llvm::Value *in2)
@@ -319,6 +330,19 @@ llvm::Value * Instructions::dot2add(llvm::Value *in1, llvm::Value *in2, llvm::Va
    return vectorFromVals(dot2add, dot2add, dot2add, dot2add);
 }
 
+llvm::Value * Instructions::dp2(llvm::Value *in1, llvm::Value *in2)
+{
+   Value *mulRes = mul(in1, in2);
+   Value *x = m_builder.CreateExtractElement(mulRes,
+                                                          m_storage->constantInt(0),
+                                                          name("extractx"));
+   Value *y = m_builder.CreateExtractElement(mulRes,
+                                                          m_storage->constantInt(1),
+                                                          name("extracty"));
+   Value *xy = m_builder.CreateAdd(x, y,name("xy"));
+   return vectorFromVals(xy, xy, xy, xy);
+}
+
 llvm::Value * Instructions::dp3(llvm::Value *in1, llvm::Value *in2)
 {
    Value *mulRes = mul(in1, in2);
@@ -581,6 +605,12 @@ llvm::Value * Instructions::neg(llvm::Value *in)
    return neg;
 }
 
+llvm::Value * Instructions::nrm(llvm::Value *in)
+{
+   llvm::Value *v = rsq(in);
+   return mul(v, in);
+}
+
 llvm::Value * Instructions::pow(llvm::Value *in1, llvm::Value *in2)
 {
    Value *x1 = m_builder.CreateExtractElement(in1,
@@ -887,6 +917,31 @@ const char * Instructions::name(const char *prefix)
    return m_name;
 }
 
+llvm::Value * Instructions::callCeil(llvm::Value *val)
+{
+   if (!m_llvmCeil) {
+      // predeclare the intrinsic
+      std::vector<const Type*> ceilArgs;
+      ceilArgs.push_back(Type::FloatTy);
+      PAListPtr ceilPal;
+      FunctionType* ceilType = FunctionType::get(
+         /*Result=*/Type::FloatTy,
+         /*Params=*/ceilArgs,
+         /*isVarArg=*/false);
+      m_llvmCeil = Function::Create(
+         /*Type=*/ceilType,
+         /*Linkage=*/GlobalValue::ExternalLinkage,
+         /*Name=*/"ceilf", m_mod);
+      m_llvmCeil->setCallingConv(CallingConv::C);
+      m_llvmCeil->setParamAttrs(ceilPal);
+   }
+   CallInst *call =  m_builder.CreateCall(m_llvmCeil, val,
+                                          name("ceilf"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
+}
+
 llvm::Value *Instructions::callFAbs(llvm::Value *val)
 {
    if (!m_llvmFAbs) {
diff --git a/src/gallium/auxiliary/gallivm/instructions.h b/src/gallium/auxiliary/gallivm/instructions.h
index 8df30f62c8..c3b28e9746 100644
--- a/src/gallium/auxiliary/gallivm/instructions.h
+++ b/src/gallium/auxiliary/gallivm/instructions.h
@@ -63,6 +63,7 @@ public:
    void         bgnSub(unsigned);
    void         brk();
    void         cal(int label, llvm::Value *input);
+   llvm::Value *ceil(llvm::Value *in);
    llvm::Value *clamp(llvm::Value *in);
    llvm::Value *cmp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3);
    llvm::Value *cnd(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3);
@@ -73,6 +74,7 @@ public:
    llvm::Value *ddy(llvm::Value *in);
    llvm::Value *div(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *dot2add(llvm::Value *in, llvm::Value *in2, llvm::Value *in3);
+   llvm::Value *dp2(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *dp3(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *dp4(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *dph(llvm::Value *in1, llvm::Value *in2);
@@ -99,6 +101,7 @@ public:
    llvm::Value *min(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *mul(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *neg(llvm::Value *in);
+   llvm::Value *nrm(llvm::Value *in);
    llvm::Value *pow(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *rcp(llvm::Value *in);
    llvm::Value *rsq(llvm::Value *in);
@@ -120,6 +123,7 @@ public:
 private:
    const char *name(const char *prefix);
 
+   llvm::Value *callCeil(llvm::Value *val);
    llvm::Value *callFAbs(llvm::Value *val);
    llvm::Value *callFExp(llvm::Value *val);
    llvm::Value *callFLog(llvm::Value *val);
@@ -147,6 +151,7 @@ private:
 
    llvm::VectorType *m_floatVecType;
 
+   llvm::Function   *m_llvmCeil;
    llvm::Function   *m_llvmFSqrt;
    llvm::Function   *m_llvmFAbs;
    llvm::Function   *m_llvmPow;
diff --git a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
index 398fbd67bd..fdfbb76c16 100644
--- a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
+++ b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
@@ -498,11 +498,18 @@ translate_instruction(llvm::Module *module,
       break;
    case TGSI_OPCODE_TXB:
       break;
-   case TGSI_OPCODE_NRM:
+   case TGSI_OPCODE_NRM4:
+   case TGSI_OPCODE_NRM: {
+      out = instr->nrm(inputs[0]);
+   }
       break;
-   case TGSI_OPCODE_DIV:
+   case TGSI_OPCODE_DIV: {
+      out = instr->div(inputs[0], inputs[1]);
+   }
       break;
-   case TGSI_OPCODE_DP2:
+   case TGSI_OPCODE_DP2: {
+      out = instr->dp2(inputs[0], inputs[1]);
+   }
       break;
    case TGSI_OPCODE_TXL:
       break;
@@ -620,8 +627,6 @@ translate_instruction(llvm::Module *module,
       break;
    case TGSI_OPCODE_M3X2:
       break;
-   case TGSI_OPCODE_NRM4:
-      break;
    case TGSI_OPCODE_CALLNZ:
       break;
    case TGSI_OPCODE_IFC:
-- 
cgit v1.2.3


From 6dacc942e158211a1f8be77cd7ba52947e504e7c Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Sun, 28 Sep 2008 21:45:48 +0200
Subject: Gallivm: need to link with libstdc++ for llvm.

---
 configs/linux-llvm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/linux-llvm b/configs/linux-llvm
index 44e200e856..3b32db34d8 100644
--- a/configs/linux-llvm
+++ b/configs/linux-llvm
@@ -31,4 +31,4 @@ else
   LLVM_CXXFLAGS=
 endif
 
-GL_LIB_DEPS = $(LLVM_LDFLAGS) $(LLVM_LIBS) $(EXTRA_LIB_PATH) -lX11 -lXext -lm -lpthread
+GL_LIB_DEPS = $(LLVM_LDFLAGS) $(LLVM_LIBS) $(EXTRA_LIB_PATH) -lX11 -lXext -lm -lpthread -lstdc++
-- 
cgit v1.2.3


From 7379d0ef8f533b0aa760cd21b219223602002a56 Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Sun, 28 Sep 2008 23:18:55 +0200
Subject: Gallivm: fix off-by-one.

---
 src/gallium/auxiliary/gallivm/instructionssoa.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.cpp b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
index efddc04e81..9a3ed9f538 100644
--- a/src/gallium/auxiliary/gallivm/instructionssoa.cpp
+++ b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
@@ -259,7 +259,7 @@ void InstructionsSoa::createBuiltins()
 {
    MemoryBuffer *buffer = MemoryBuffer::getMemBuffer(
       (const char*)&soabuiltins_data[0],
-      (const char*)&soabuiltins_data[Elements(soabuiltins_data)]);
+      (const char*)&soabuiltins_data[Elements(soabuiltins_data)-1]);
    m_builtins = ParseBitcodeFile(buffer);
    std::cout<<"Builtins created at "<<m_builtins<<std::endl;
    assert(m_builtins);
-- 
cgit v1.2.3


From b5d59222ccbec9db23b6847737765a4dc0d8c47b Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Thu, 25 Sep 2008 17:19:47 -0700
Subject: Remove TNL-to-VP tracking from i965

The i965 driver previously had its own set of code to convert
fixed-function TNL state to a vertex program.  Core Mesa has code to
do this, so there is no reason to duplicate that effort in the driver.
In fact, this duplication leads to bugs when other aspects of the Mesa
infrastructure change.
---
 src/mesa/drivers/dri/i965/brw_context.c      |    2 -
 src/mesa/drivers/dri/i965/brw_context.h      |    5 -
 src/mesa/drivers/dri/i965/brw_state.h        |    1 -
 src/mesa/drivers/dri/i965/brw_state_upload.c |    1 -
 src/mesa/drivers/dri/i965/brw_vs.h           |    4 -
 src/mesa/drivers/dri/i965/brw_vs_tnl.c       | 1653 +-------------------------
 src/mesa/drivers/dri/i965/brw_vtbl.c         |    1 -
 7 files changed, 2 insertions(+), 1665 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 92629016d9..5f60477176 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -152,8 +152,6 @@ GLboolean brwCreateContext( const __GLcontextModes *mesaVis,
 
    brw_draw_init( brw );
 
-   brw_ProgramCacheInit( ctx );
-
    return GL_TRUE;
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index b04487ecee..8bddc9da40 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -130,7 +130,6 @@ struct brw_context;
 #define BRW_NEW_CONTEXT                 0x80
 #define BRW_NEW_WM_INPUT_DIMENSIONS     0x100
 #define BRW_NEW_INPUT_VARYING           0x200
-#define BRW_NEW_TNL_PROGRAM             0x400
 #define BRW_NEW_PSP                     0x800
 #define BRW_NEW_METAOPS                 0x1000
 #define BRW_NEW_FENCE                   0x2000
@@ -488,10 +487,6 @@ struct brw_context
       GLboolean active;
    } metaops;
 
-   /* Track fixed function t&l in a vertex program:
-    */
-   struct gl_vertex_program *tnl_program;
-   struct brw_tnl_cache tnl_program_cache;
 
    /* Active vertex program: 
     */
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 3ea6151ae9..4c04036ef0 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -74,7 +74,6 @@ const struct brw_tracked_state brw_wm_unit;
 const struct brw_tracked_state brw_psp_urb_cbs;
 
 const struct brw_tracked_state brw_active_vertprog;
-const struct brw_tracked_state brw_tnl_vertprog;
 const struct brw_tracked_state brw_pipe_control;
 
 const struct brw_tracked_state brw_clear_surface_cache;
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 7d4fd467b1..b6a52843a8 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -45,7 +45,6 @@ const struct brw_tracked_state *atoms[] =
 {
    &brw_check_fallback,
 
-   &brw_tnl_vertprog,
    &brw_active_vertprog,
    &brw_wm_input_sizes,
    &brw_vs_prog,
diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h
index 41a33ffe38..22388ec99d 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.h
+++ b/src/mesa/drivers/dri/i965/brw_vs.h
@@ -80,8 +80,4 @@ struct brw_vs_compile {
 
 void brw_vs_emit( struct brw_vs_compile *c );
 
-
-void brw_ProgramCacheDestroy( GLcontext *ctx );
-void brw_ProgramCacheInit( GLcontext *ctx );
-
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_vs_tnl.c b/src/mesa/drivers/dri/i965/brw_vs_tnl.c
index 9b04f19112..eacc289f1f 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_tnl.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_tnl.c
@@ -33,1635 +33,15 @@
 #include "main/glheader.h"
 #include "main/macros.h"
 #include "main/enums.h"
-#include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
-#include "shader/prog_statevars.h"
 #include "brw_vs.h"
 #include "brw_state.h"
 
 
-struct state_key {
-   unsigned light_global_enabled:1;
-   unsigned light_local_viewer:1;
-   unsigned light_twoside:1;
-   unsigned light_color_material:1;
-   unsigned light_color_material_mask:12;
-   unsigned light_material_mask:12;
-   unsigned normalize:1;
-   unsigned rescale_normals:1;
-   unsigned fog_source_is_depth:1;
-   unsigned tnl_do_vertex_fog:1;
-   unsigned separate_specular:1;
-   unsigned fog_option:2;
-   unsigned point_attenuated:1;
-   unsigned texture_enabled_global:1;
-   unsigned fragprog_inputs_read:12;
-
-   struct {
-      unsigned light_enabled:1;
-      unsigned light_eyepos3_is_zero:1;
-      unsigned light_spotcutoff_is_180:1;
-      unsigned light_attenuated:1;      
-      unsigned texunit_really_enabled:1;
-      unsigned texmat_enabled:1;
-      unsigned texgen_enabled:4;
-      unsigned texgen_mode0:4;
-      unsigned texgen_mode1:4;
-      unsigned texgen_mode2:4;
-      unsigned texgen_mode3:4;
-   } unit[8];
-};
-
-
-
-#define FOG_NONE   0
-#define FOG_LINEAR 1
-#define FOG_EXP    2
-#define FOG_EXP2   3
-
-static GLuint translate_fog_mode( GLenum mode )
-{
-   switch (mode) {
-   case GL_LINEAR: return FOG_LINEAR;
-   case GL_EXP: return FOG_EXP;
-   case GL_EXP2: return FOG_EXP2;
-   default: return FOG_NONE;
-   }
-}
-
-#define TXG_NONE           0
-#define TXG_OBJ_LINEAR     1
-#define TXG_EYE_LINEAR     2
-#define TXG_SPHERE_MAP     3
-#define TXG_REFLECTION_MAP 4
-#define TXG_NORMAL_MAP     5
-
-static GLuint translate_texgen( GLboolean enabled, GLenum mode )
-{
-   if (!enabled)
-      return TXG_NONE;
-
-   switch (mode) {
-   case GL_OBJECT_LINEAR: return TXG_OBJ_LINEAR;
-   case GL_EYE_LINEAR: return TXG_EYE_LINEAR;
-   case GL_SPHERE_MAP: return TXG_SPHERE_MAP;
-   case GL_REFLECTION_MAP_NV: return TXG_REFLECTION_MAP;
-   case GL_NORMAL_MAP_NV: return TXG_NORMAL_MAP;
-   default: return TXG_NONE;
-   }
-}
-
-static void make_state_key( GLcontext *ctx, struct state_key *key )
-{
-   struct brw_context *brw = brw_context(ctx);
-   const struct gl_fragment_program *fp = brw->fragment_program;
-   GLuint i;
-
-   /* This now relies on texenvprogram.c being active:
-    */
-   assert(fp);
-
-   memset(key, 0, sizeof(*key));
-
-   /* BRW_NEW_FRAGMENT_PROGRAM */
-   key->fragprog_inputs_read = fp->Base.InputsRead;
-
-   /* _NEW_LIGHT */
-   key->separate_specular = (brw->attribs.Light->Model.ColorControl ==
-			     GL_SEPARATE_SPECULAR_COLOR);
-
-   /* _NEW_LIGHT */
-   if (brw->attribs.Light->Enabled) {
-      key->light_global_enabled = 1;
-
-      if (brw->attribs.Light->Model.LocalViewer)
-	 key->light_local_viewer = 1;
-
-      if (brw->attribs.Light->Model.TwoSide)
-	 key->light_twoside = 1;
-
-      if (brw->attribs.Light->ColorMaterialEnabled) {
-	 key->light_color_material = 1;
-	 key->light_color_material_mask = brw->attribs.Light->ColorMaterialBitmask;
-      }
-
-      /* BRW_NEW_INPUT_VARYING */
-
-      /* For these programs, material values are stuffed into the
-       * generic slots:
-       */
-      for (i = 0 ; i < MAT_ATTRIB_MAX ; i++) 
-	 if (brw->vb.info.varying & (1<<(VERT_ATTRIB_GENERIC0 + i))) 
-	    key->light_material_mask |= 1<<i;
-
-      for (i = 0; i < MAX_LIGHTS; i++) {
-	 struct gl_light *light = &brw->attribs.Light->Light[i];
-
-	 if (light->Enabled) {
-	    key->unit[i].light_enabled = 1;
-
-	    if (light->EyePosition[3] == 0.0)
-	       key->unit[i].light_eyepos3_is_zero = 1;
-	    
-	    if (light->SpotCutoff == 180.0)
-	       key->unit[i].light_spotcutoff_is_180 = 1;
-
-	    if (light->ConstantAttenuation != 1.0 ||
-		light->LinearAttenuation != 0.0 ||
-		light->QuadraticAttenuation != 0.0)
-	       key->unit[i].light_attenuated = 1;
-	 }
-      }
-   }
-
-   /* _NEW_TRANSFORM */
-   if (brw->attribs.Transform->Normalize)
-      key->normalize = 1;
-
-   if (brw->attribs.Transform->RescaleNormals)
-      key->rescale_normals = 1;
-
-   /* BRW_NEW_FRAGMENT_PROGRAM */
-   key->fog_option = translate_fog_mode(fp->FogOption);
-   if (key->fog_option)
-      key->fragprog_inputs_read |= FRAG_BIT_FOGC;
-   
-   /* _NEW_FOG */
-   if (brw->attribs.Fog->FogCoordinateSource == GL_FRAGMENT_DEPTH_EXT)
-      key->fog_source_is_depth = 1;
-   
-   /* _NEW_HINT, ??? */
-   if (1)
-      key->tnl_do_vertex_fog = 1;
-
-   /* _NEW_POINT */
-   if (brw->attribs.Point->_Attenuated)
-      key->point_attenuated = 1;
-
-   /* _NEW_TEXTURE */
-   if (brw->attribs.Texture->_TexGenEnabled ||
-       brw->attribs.Texture->_TexMatEnabled ||
-       brw->attribs.Texture->_EnabledUnits)
-      key->texture_enabled_global = 1;
-      
-   for (i = 0; i < MAX_TEXTURE_UNITS; i++) {
-      struct gl_texture_unit *texUnit = &brw->attribs.Texture->Unit[i];
-
-      if (texUnit->_ReallyEnabled)
- 	 key->unit[i].texunit_really_enabled = 1;
-
-      if (brw->attribs.Texture->_TexMatEnabled & ENABLE_TEXMAT(i))      
-	 key->unit[i].texmat_enabled = 1;
-      
-      if (texUnit->TexGenEnabled) {
-	 key->unit[i].texgen_enabled = 1;
-      
-	 key->unit[i].texgen_mode0 = 
-	    translate_texgen( texUnit->TexGenEnabled & (1<<0),
-			      texUnit->GenModeS );
-	 key->unit[i].texgen_mode1 = 
-	    translate_texgen( texUnit->TexGenEnabled & (1<<1),
-			      texUnit->GenModeT );
-	 key->unit[i].texgen_mode2 = 
-	    translate_texgen( texUnit->TexGenEnabled & (1<<2),
-			      texUnit->GenModeR );
-	 key->unit[i].texgen_mode3 = 
-	    translate_texgen( texUnit->TexGenEnabled & (1<<3),
-			      texUnit->GenModeQ );
-      }
-   }
-}
-
-
-   
-/* Very useful debugging tool - produces annotated listing of
- * generated program with line/function references for each
- * instruction back into this file:
- */
-#define DISASSEM 0
-
-/* Should be tunable by the driver - do we want to do matrix
- * multiplications with DP4's or with MUL/MAD's?  SSE works better
- * with the latter, drivers may differ.
- */
-#define PREFER_DP4 1
-
-
-/* Use uregs to represent registers internally, translate to Mesa's
- * expected formats on emit.  
- *
- * NOTE: These are passed by value extensively in this file rather
- * than as usual by pointer reference.  If this disturbs you, try
- * remembering they are just 32bits in size.
- *
- * GCC is smart enough to deal with these dword-sized structures in
- * much the same way as if I had defined them as dwords and was using
- * macros to access and set the fields.  This is much nicer and easier
- * to evolve.
- */
-struct ureg {
-   GLuint file:4;
-   GLint idx:8;      /* relative addressing may be negative */
-   GLuint negate:1;
-   GLuint swz:12;
-   GLuint pad:7;
-};
-
-
-struct tnl_program {
-   const struct state_key *state;
-   struct gl_vertex_program *program;
-   
-   GLuint nr_instructions;
-   GLuint temp_in_use;
-   GLuint temp_reserved;
-   
-   struct ureg eye_position;
-   struct ureg eye_position_normalized;
-   struct ureg eye_normal;
-   struct ureg identity;
-
-   GLuint materials;
-   GLuint color_materials;
-};
-
-
-const static struct ureg undef = { 
-   PROGRAM_UNDEFINED,
-   ~0,
-   0,
-   0,
-   0
-};
-
-/* Local shorthand:
- */
-#define X    SWIZZLE_X
-#define Y    SWIZZLE_Y
-#define Z    SWIZZLE_Z
-#define W    SWIZZLE_W
-
-
-/* Construct a ureg:
- */
-static struct ureg make_ureg(GLuint file, GLint idx)
-{
-   struct ureg reg;
-   reg.file = file;
-   reg.idx = idx;
-   reg.negate = 0;
-   reg.swz = SWIZZLE_NOOP;
-   reg.pad = 0;
-   return reg;
-}
-
-
-
-static struct ureg ureg_negate( struct ureg reg )
-{
-   reg.negate ^= 1;
-   return reg;
-} 
-
-
-static struct ureg swizzle( struct ureg reg, int x, int y, int z, int w )
-{
-   reg.swz = MAKE_SWIZZLE4(GET_SWZ(reg.swz, x),
-			   GET_SWZ(reg.swz, y),
-			   GET_SWZ(reg.swz, z),
-			   GET_SWZ(reg.swz, w));
-
-   return reg;
-}
-
-static struct ureg swizzle1( struct ureg reg, int x )
-{
-   return swizzle(reg, x, x, x, x);
-}
-
-static struct ureg get_temp( struct tnl_program *p )
-{
-   int bit = ffs( ~p->temp_in_use );
-   if (!bit) {
-      fprintf(stderr, "%s: out of temporaries\n", __FILE__);
-      assert(0);
-   }
-
-   if (bit > p->program->Base.NumTemporaries)
-      p->program->Base.NumTemporaries = bit;
-
-   p->temp_in_use |= 1<<(bit-1);
-   return make_ureg(PROGRAM_TEMPORARY, bit-1);
-}
-
-static struct ureg reserve_temp( struct tnl_program *p )
-{
-   struct ureg temp = get_temp( p );
-   p->temp_reserved |= 1<<temp.idx;
-   return temp;
-}
-
-static void release_temp( struct tnl_program *p, struct ureg reg )
-{
-   if (reg.file == PROGRAM_TEMPORARY) {
-      p->temp_in_use &= ~(1<<reg.idx);
-      p->temp_in_use |= p->temp_reserved; /* can't release reserved temps */
-   }
-}
-
-static void release_temps( struct tnl_program *p )
-{
-   p->temp_in_use = p->temp_reserved;
-}
-
-
-
-static struct ureg register_input( struct tnl_program *p, GLuint input )
-{
-   assert(input < 32);
-
-   p->program->Base.InputsRead |= (1<<input);
-   return make_ureg(PROGRAM_INPUT, input);
-}
-
-static struct ureg register_output( struct tnl_program *p, GLuint output )
-{
-   p->program->Base.OutputsWritten |= (1<<output);
-   return make_ureg(PROGRAM_OUTPUT, output);
-}
-
-static struct ureg register_const4f( struct tnl_program *p, 
-			      GLfloat s0,
-			      GLfloat s1,
-			      GLfloat s2,
-			      GLfloat s3)
-{
-   GLfloat values[4];
-   GLint idx;
-   GLuint swizzle;
-   values[0] = s0;
-   values[1] = s1;
-   values[2] = s2;
-   values[3] = s3;
-   idx = _mesa_add_unnamed_constant( p->program->Base.Parameters, values, 4,
-                                     &swizzle);
-   assert(swizzle == SWIZZLE_NOOP); /* Need to handle swizzle in reg setup */
-   return make_ureg(PROGRAM_STATE_VAR, idx);
-}
-
-#define register_const1f(p, s0)         register_const4f(p, s0, 0, 0, 1)
-#define register_scalar_const(p, s0)    register_const4f(p, s0, s0, s0, s0)
-#define register_const2f(p, s0, s1)     register_const4f(p, s0, s1, 0, 1)
-#define register_const3f(p, s0, s1, s2) register_const4f(p, s0, s1, s2, 1)
-
-static GLboolean is_undef( struct ureg reg )
-{
-   return reg.file == PROGRAM_UNDEFINED;
-}
-
-static struct ureg get_identity_param( struct tnl_program *p )
-{
-   if (is_undef(p->identity)) 
-      p->identity = register_const4f(p, 0,0,0,1);
-
-   return p->identity;
-}
-
-static struct ureg register_param5( struct tnl_program *p, 
-                                    GLint s0,
-                                    GLint s1,
-                                    GLint s2,
-                                    GLint s3,
-                                    GLint s4)
-{
-   gl_state_index tokens[STATE_LENGTH];
-   GLint idx;
-   tokens[0] = s0;
-   tokens[1] = s1;
-   tokens[2] = s2;
-   tokens[3] = s3;
-   tokens[4] = s4;
-   idx = _mesa_add_state_reference( p->program->Base.Parameters, tokens );
-   return make_ureg(PROGRAM_STATE_VAR, idx);
-}
-
-
-#define register_param1(p,s0)          register_param5(p,s0,0,0,0,0)
-#define register_param2(p,s0,s1)       register_param5(p,s0,s1,0,0,0)
-#define register_param3(p,s0,s1,s2)    register_param5(p,s0,s1,s2,0,0)
-#define register_param4(p,s0,s1,s2,s3) register_param5(p,s0,s1,s2,s3,0)
-
-
-static void register_matrix_param5( struct tnl_program *p,
-				    GLint s0, /* matrix name */
-				    GLint s1, /* texture matrix number */
-				    GLint s2, /* first row */
-				    GLint s3, /* last row */
-				    GLint s4, /* modifier */
-				    struct ureg *matrix )
-{
-   GLint i;
-
-   /* This is a bit sad as the support is there to pull the whole
-    * matrix out in one go:
-    */
-   for (i = 0; i <= s3 - s2; i++) 
-      matrix[i] = register_param5( p, s0, s1, i, i, s4 );
-}
-
-
-static void emit_arg( struct prog_src_register *src,
-		      struct ureg reg )
-{
-   src->File = reg.file;
-   src->Index = reg.idx;
-   src->Swizzle = reg.swz;
-   src->RelAddr = 0;
-   src->NegateBase = reg.negate;
-   src->Abs = 0;
-   src->NegateAbs = 0;
-}
-
-static void emit_dst( struct prog_dst_register *dst,
-		      struct ureg reg, GLuint mask )
-{
-   dst->File = reg.file;
-   dst->Index = reg.idx;
-   /* allow zero as a shorthand for xyzw */
-   dst->WriteMask = mask ? mask : WRITEMASK_XYZW; 
-   dst->CondMask = 0;
-   dst->CondSwizzle = 0;
-   dst->CondSrc = 0;
-   dst->pad = 0;
-}
-
-static void debug_insn( struct prog_instruction *inst, const char *fn,
-			GLuint line )
-{
-   if (DISASSEM) {
-      static const char *last_fn;
-   
-      if (fn != last_fn) {
-	 last_fn = fn;
-	 _mesa_printf("%s:\n", fn);
-      }
-	 
-      _mesa_printf("%d:\t", line);
-      _mesa_print_instruction(inst);
-   }
-}
-
-
-static void emit_op3fn(struct tnl_program *p,
-		       GLuint op,
-		       struct ureg dest,
-		       GLuint mask,
-		       struct ureg src0,
-		       struct ureg src1,
-		       struct ureg src2,
-		       const char *fn,
-		       GLuint line)
-{
-   GLuint nr = p->program->Base.NumInstructions++;
-      
-   if (nr >= p->nr_instructions) {
-      int new_nr_instructions = p->nr_instructions * 2;
-
-      p->program->Base.Instructions = 
-	 _mesa_realloc(p->program->Base.Instructions,
-		       sizeof(struct prog_instruction) * p->nr_instructions,
-		       sizeof(struct prog_instruction) * new_nr_instructions);
-      p->nr_instructions = new_nr_instructions;
-   }
-
-   {      
-      struct prog_instruction *inst = &p->program->Base.Instructions[nr];
-      memset(inst, 0, sizeof(*inst));
-      inst->Opcode = op; 
-      inst->StringPos = 0;
-      inst->Data = 0;
-   
-      emit_arg( &inst->SrcReg[0], src0 );
-      emit_arg( &inst->SrcReg[1], src1 );
-      emit_arg( &inst->SrcReg[2], src2 );   
-
-      emit_dst( &inst->DstReg, dest, mask );
-
-      debug_insn(inst, fn, line);
-   }
-}
-
-   
-
-#define emit_op3(p, op, dst, mask, src0, src1, src2) \
-   emit_op3fn(p, op, dst, mask, src0, src1, src2, __FUNCTION__, __LINE__)
-
-#define emit_op2(p, op, dst, mask, src0, src1) \
-    emit_op3fn(p, op, dst, mask, src0, src1, undef, __FUNCTION__, __LINE__)
-
-#define emit_op1(p, op, dst, mask, src0) \
-    emit_op3fn(p, op, dst, mask, src0, undef, undef, __FUNCTION__, __LINE__)
-
-
-static struct ureg make_temp( struct tnl_program *p, struct ureg reg )
-{
-   if (reg.file == PROGRAM_TEMPORARY && 
-       !(p->temp_reserved & (1<<reg.idx)))
-      return reg;
-   else {
-      struct ureg temp = get_temp(p);
-      emit_op1(p, OPCODE_MOV, temp, 0, reg);
-      return temp;
-   }
-}
-
-
-/* Currently no tracking performed of input/output/register size or
- * active elements.  Could be used to reduce these operations, as
- * could the matrix type.
- */
-static void emit_matrix_transform_vec4( struct tnl_program *p,
-					struct ureg dest,
-					const struct ureg *mat,
-					struct ureg src)
-{
-   emit_op2(p, OPCODE_DP4, dest, WRITEMASK_X, src, mat[0]);
-   emit_op2(p, OPCODE_DP4, dest, WRITEMASK_Y, src, mat[1]);
-   emit_op2(p, OPCODE_DP4, dest, WRITEMASK_Z, src, mat[2]);
-   emit_op2(p, OPCODE_DP4, dest, WRITEMASK_W, src, mat[3]);
-}
-
-/* This version is much easier to implement if writemasks are not
- * supported natively on the target or (like SSE), the target doesn't
- * have a clean/obvious dotproduct implementation.
- */
-static void emit_transpose_matrix_transform_vec4( struct tnl_program *p,
-						  struct ureg dest,
-						  const struct ureg *mat,
-						  struct ureg src)
-{
-   struct ureg tmp;
-
-   if (dest.file != PROGRAM_TEMPORARY)
-      tmp = get_temp(p);
-   else
-      tmp = dest;
-
-   emit_op2(p, OPCODE_MUL, tmp, 0, swizzle1(src,X), mat[0]);
-   emit_op3(p, OPCODE_MAD, tmp, 0, swizzle1(src,Y), mat[1], tmp);
-   emit_op3(p, OPCODE_MAD, tmp, 0, swizzle1(src,Z), mat[2], tmp);
-   emit_op3(p, OPCODE_MAD, dest, 0, swizzle1(src,W), mat[3], tmp);
-
-   if (dest.file != PROGRAM_TEMPORARY)
-      release_temp(p, tmp);
-}
-
-static void emit_matrix_transform_vec3( struct tnl_program *p,
-					struct ureg dest,
-					const struct ureg *mat,
-					struct ureg src)
-{
-   emit_op2(p, OPCODE_DP3, dest, WRITEMASK_X, src, mat[0]);
-   emit_op2(p, OPCODE_DP3, dest, WRITEMASK_Y, src, mat[1]);
-   emit_op2(p, OPCODE_DP3, dest, WRITEMASK_Z, src, mat[2]);
-}
-
-
-static void emit_normalize_vec3( struct tnl_program *p,
-				 struct ureg dest,
-				 struct ureg src )
-{
-   emit_op2(p, OPCODE_DP3, dest, WRITEMASK_W, src, src);
-   emit_op1(p, OPCODE_RSQ, dest, WRITEMASK_W, swizzle1(dest,W));
-   emit_op2(p, OPCODE_MUL, dest, WRITEMASK_XYZ, src, swizzle1(dest,W));
-}
-
-static void emit_passthrough( struct tnl_program *p, 
-			      GLuint input,
-			      GLuint output )
-{
-   struct ureg out = register_output(p, output);
-   emit_op1(p, OPCODE_MOV, out, 0, register_input(p, input)); 
-}
-
-static struct ureg get_eye_position( struct tnl_program *p )
-{
-   if (is_undef(p->eye_position)) {
-      struct ureg pos = register_input( p, VERT_ATTRIB_POS ); 
-      struct ureg modelview[4];
-
-      p->eye_position = reserve_temp(p);
-
-      if (PREFER_DP4) {
-	 register_matrix_param5( p, STATE_MODELVIEW_MATRIX, 0, 0, 3, 
-				 0, modelview );
-
-	 emit_matrix_transform_vec4(p, p->eye_position, modelview, pos);
-      }
-      else {
-	 register_matrix_param5( p, STATE_MODELVIEW_MATRIX, 0, 0, 3, 
-				 STATE_MATRIX_TRANSPOSE, modelview );
-
-	 emit_transpose_matrix_transform_vec4(p, p->eye_position, modelview, pos);
-      }
-   }
-   
-   return p->eye_position;
-}
-
-
-#if 0
-static struct ureg get_eye_z( struct tnl_program *p )
-{
-   if (!is_undef(p->eye_position)) {
-      return swizzle1(p->eye_position, Z);
-   }
-   else if (!is_undef(p->eye_z)) {
-      struct ureg pos = register_input( p, BRW_ATTRIB_POS ); 
-      struct ureg modelview2;
-
-      p->eye_z = reserve_temp(p);
-
-      register_matrix_param6( p, STATE_MATRIX, STATE_MODELVIEW, 0, 2, 1, 
-			      STATE_MATRIX, &modelview2 );
-
-      emit_matrix_transform_vec4(p, p->eye_position, modelview, pos);
-      emit_op2(p, OPCODE_DP4, p->eye_z, WRITEMASK_Z, pos, modelview2);
-   }
-   
-   return swizzle1(p->eye_z, Z)
-}
-#endif
-
-
-
-static struct ureg get_eye_position_normalized( struct tnl_program *p )
-{
-   if (is_undef(p->eye_position_normalized)) {
-      struct ureg eye = get_eye_position(p);
-      p->eye_position_normalized = reserve_temp(p);
-      emit_normalize_vec3(p, p->eye_position_normalized, eye);
-   }
-   
-   return p->eye_position_normalized;
-}
-
-
-static struct ureg get_eye_normal( struct tnl_program *p )
-{
-   if (is_undef(p->eye_normal)) {
-      struct ureg normal = register_input(p, VERT_ATTRIB_NORMAL );
-      struct ureg mvinv[3];
-
-      register_matrix_param5( p, STATE_MODELVIEW_MATRIX, 0, 0, 2,
-			      STATE_MATRIX_INVTRANS, mvinv );
-
-      p->eye_normal = reserve_temp(p);
-
-      /* Transform to eye space:
-       */
-      emit_matrix_transform_vec3( p, p->eye_normal, mvinv, normal );
-
-      /* Normalize/Rescale:
-       */
-      if (p->state->normalize) {
-	 emit_normalize_vec3( p, p->eye_normal, p->eye_normal );
-      }
-      else if (p->state->rescale_normals) {
-	 struct ureg rescale = register_param2(p, STATE_INTERNAL,
-					       STATE_NORMAL_SCALE);
-
-	 emit_op2( p, OPCODE_MUL, p->eye_normal, 0, p->eye_normal, 
-		   swizzle1(rescale, X));
-      }
-   }
-
-   return p->eye_normal;
-}
-
-
-
-static void build_hpos( struct tnl_program *p )
-{
-   struct ureg pos = register_input( p, VERT_ATTRIB_POS ); 
-   struct ureg hpos = register_output( p, VERT_RESULT_HPOS );
-   struct ureg mvp[4];
-
-   if (PREFER_DP4) {
-      register_matrix_param5( p, STATE_MVP_MATRIX, 0, 0, 3, 
-			      0, mvp );
-      emit_matrix_transform_vec4( p, hpos, mvp, pos );
-   }
-   else {
-      register_matrix_param5( p, STATE_MVP_MATRIX, 0, 0, 3, 
-			      STATE_MATRIX_TRANSPOSE, mvp );
-      emit_transpose_matrix_transform_vec4( p, hpos, mvp, pos );
-   }
-}
-
-
-static GLuint material_attrib( GLuint side, GLuint property )
-{
-   return (property - STATE_AMBIENT) * 2 + side;
-}
-
-/* Get a bitmask of which material values vary on a per-vertex basis.
- */
-static void set_material_flags( struct tnl_program *p )
-{
-   p->color_materials = 0;
-   p->materials = 0;
-
-   if (p->state->light_color_material) {
-      p->materials = 
-	 p->color_materials = p->state->light_color_material_mask;
-   }
-
-   p->materials |= p->state->light_material_mask;
-}
-
-
-static struct ureg get_material( struct tnl_program *p, GLuint side, 
-				 GLuint property )
-{
-   GLuint attrib = material_attrib(side, property);
-
-   if (p->color_materials & (1<<attrib))
-      return register_input(p, VERT_ATTRIB_COLOR0);
-   else if (p->materials & (1<<attrib)) 
-      return register_input( p, attrib + _TNL_ATTRIB_MAT_FRONT_AMBIENT );
-   else
-      return register_param3( p, STATE_MATERIAL, side, property );
-}
-
-#define SCENE_COLOR_BITS(side) ((MAT_BIT_FRONT_EMISSION | \
-				 MAT_BIT_FRONT_AMBIENT | \
-				 MAT_BIT_FRONT_DIFFUSE) << (side))
-
-/* Either return a precalculated constant value or emit code to
- * calculate these values dynamically in the case where material calls
- * are present between begin/end pairs.
- *
- * Probably want to shift this to the program compilation phase - if
- * we always emitted the calculation here, a smart compiler could
- * detect that it was constant (given a certain set of inputs), and
- * lift it out of the main loop.  That way the programs created here
- * would be independent of the vertex_buffer details.
- */
-static struct ureg get_scenecolor( struct tnl_program *p, GLuint side )
-{
-   if (p->materials & SCENE_COLOR_BITS(side)) {
-      struct ureg lm_ambient = register_param1(p, STATE_LIGHTMODEL_AMBIENT);
-      struct ureg material_emission = get_material(p, side, STATE_EMISSION);
-      struct ureg material_ambient = get_material(p, side, STATE_AMBIENT);
-      struct ureg material_diffuse = get_material(p, side, STATE_DIFFUSE);
-      struct ureg tmp = make_temp(p, material_diffuse);
-      emit_op3(p, OPCODE_MAD, tmp,  WRITEMASK_XYZ, lm_ambient, 
-	       material_ambient, material_emission);
-      return tmp;
-   }
-   else
-      return register_param2( p, STATE_LIGHTMODEL_SCENECOLOR, side );
-}
-
-
-static struct ureg get_lightprod( struct tnl_program *p, GLuint light, 
-				  GLuint side, GLuint property )
-{
-   GLuint attrib = material_attrib(side, property);
-   if (p->materials & (1<<attrib)) {
-      struct ureg light_value = 
-	 register_param3(p, STATE_LIGHT, light, property);
-      struct ureg material_value = get_material(p, side, property);
-      struct ureg tmp = get_temp(p);
-      emit_op2(p, OPCODE_MUL, tmp,  0, light_value, material_value);
-      return tmp;
-   }
-   else
-      return register_param4(p, STATE_LIGHTPROD, light, side, property);
-}
-
-static struct ureg calculate_light_attenuation( struct tnl_program *p,
-						GLuint i, 
-						struct ureg VPpli,
-						struct ureg dist )
-{
-   struct ureg attenuation = register_param3(p, STATE_LIGHT, i,
-					     STATE_ATTENUATION);
-   struct ureg att = get_temp(p);
-
-   /* Calculate spot attenuation:
-    */
-   if (!p->state->unit[i].light_spotcutoff_is_180) {
-      struct ureg spot_dir_norm = register_param3(p, STATE_INTERNAL,
-						  STATE_LIGHT_SPOT_DIR_NORMALIZED, i);
-      struct ureg spot = get_temp(p);
-      struct ureg slt = get_temp(p);
-
-      emit_op2(p, OPCODE_DP3, spot, 0, ureg_negate(VPpli), spot_dir_norm);
-      emit_op2(p, OPCODE_SLT, slt, 0, swizzle1(spot_dir_norm,W), spot);
-      emit_op2(p, OPCODE_POW, spot, 0, spot, swizzle1(attenuation, W));
-      emit_op2(p, OPCODE_MUL, att, 0, slt, spot);
-
-      release_temp(p, spot);
-      release_temp(p, slt);
-   }
-
-   /* Calculate distance attenuation:
-    */
-   if (p->state->unit[i].light_attenuated) {
-
-      /* 1/d,d,d,1/d */
-      emit_op1(p, OPCODE_RCP, dist, WRITEMASK_YZ, dist); 
-      /* 1,d,d*d,1/d */
-      emit_op2(p, OPCODE_MUL, dist, WRITEMASK_XZ, dist, swizzle1(dist,Y)); 
-      /* 1/dist-atten */
-      emit_op2(p, OPCODE_DP3, dist, 0, attenuation, dist); 
-
-      if (!p->state->unit[i].light_spotcutoff_is_180) {
-	 /* dist-atten */
-	 emit_op1(p, OPCODE_RCP, dist, 0, dist); 
-	 /* spot-atten * dist-atten */
-	 emit_op2(p, OPCODE_MUL, att, 0, dist, att);	
-      } else {
-	 /* dist-atten */
-	 emit_op1(p, OPCODE_RCP, att, 0, dist); 
-      }
-   }
-
-   return att;
-}
-						
-
-
-
-
-/* Need to add some addtional parameters to allow lighting in object
- * space - STATE_SPOT_DIRECTION and STATE_HALF_VECTOR implicitly assume eye
- * space lighting.
- */
-static void build_lighting( struct tnl_program *p )
-{
-   const GLboolean twoside = p->state->light_twoside;
-   const GLboolean separate = p->state->separate_specular;
-   GLuint nr_lights = 0, count = 0;
-   struct ureg normal = get_eye_normal(p);
-   struct ureg lit = get_temp(p);
-   struct ureg dots = get_temp(p);
-   struct ureg _col0 = undef, _col1 = undef;
-   struct ureg _bfc0 = undef, _bfc1 = undef;
-   GLuint i;
-
-   for (i = 0; i < MAX_LIGHTS; i++) 
-      if (p->state->unit[i].light_enabled)
-	 nr_lights++;
-   
-   set_material_flags(p);
-
-   {
-      struct ureg shininess = get_material(p, 0, STATE_SHININESS);
-      emit_op1(p, OPCODE_MOV, dots,  WRITEMASK_W, swizzle1(shininess,X));
-      release_temp(p, shininess);
-
-      _col0 = make_temp(p, get_scenecolor(p, 0));
-      if (separate)
-	 _col1 = make_temp(p, get_identity_param(p));
-      else
-	 _col1 = _col0;
-
-   }
-
-   if (twoside) {
-      struct ureg shininess = get_material(p, 1, STATE_SHININESS);
-      emit_op1(p, OPCODE_MOV, dots, WRITEMASK_Z, 
-	       ureg_negate(swizzle1(shininess,X)));
-      release_temp(p, shininess);
-
-      _bfc0 = make_temp(p, get_scenecolor(p, 1));
-      if (separate)
-	 _bfc1 = make_temp(p, get_identity_param(p));
-      else
-	 _bfc1 = _bfc0;
-   }
-
-
-   /* If no lights, still need to emit the scenecolor.
-    */
-   /* KW: changed to do this always - v1.17 "Fix lighting alpha result"? 
-    */
-   if (p->state->fragprog_inputs_read & FRAG_BIT_COL0)
-   {
-      struct ureg res0 = register_output( p, VERT_RESULT_COL0 );
-      emit_op1(p, OPCODE_MOV, res0, 0, _col0);
-
-      if (twoside) {
-	 struct ureg res0 = register_output( p, VERT_RESULT_BFC0 );
-	 emit_op1(p, OPCODE_MOV, res0, 0, _bfc0);
-      }
-   }
-
-   if (separate && (p->state->fragprog_inputs_read & FRAG_BIT_COL1)) {
-
-      struct ureg res1 = register_output( p, VERT_RESULT_COL1 );
-      emit_op1(p, OPCODE_MOV, res1, 0, _col1);
-      
-      if (twoside) {
-	 struct ureg res1 = register_output( p, VERT_RESULT_BFC1 );
-	 emit_op1(p, OPCODE_MOV, res1, 0, _bfc1);
-      }
-   }
-      
-   if (nr_lights == 0) {
-      release_temps(p);
-      return;
-   }
-
-
-   for (i = 0; i < MAX_LIGHTS; i++) {
-      if (p->state->unit[i].light_enabled) {
-	 struct ureg half = undef;
-	 struct ureg att = undef, VPpli = undef;
-	  
-	 count++;
-
-	 if (p->state->unit[i].light_eyepos3_is_zero) {
-	    /* Can used precomputed constants in this case.
-	     * Attenuation never applies to infinite lights.
-	     */
-	    VPpli = register_param3(p, STATE_LIGHT, i, 
-				    STATE_LIGHT_POSITION_NORMALIZED);
-            if (p->state->light_local_viewer) {
-                struct ureg eye_hat = get_eye_position_normalized(p);
-                half = get_temp(p);
-                emit_op2(p, OPCODE_SUB, half, 0, VPpli, eye_hat);
-                emit_normalize_vec3(p, half, half);
-            } else {
-                half = register_param3(p, STATE_LIGHT, i, STATE_HALF_VECTOR);
-            }
-	 } 
-	 else {
-	    struct ureg Ppli = register_param3(p, STATE_LIGHT, i, 
-					       STATE_POSITION); 
-	    struct ureg V = get_eye_position(p);
-	    struct ureg dist = get_temp(p);
-	    struct ureg tmpPpli = get_temp(p);
-
-	    VPpli = get_temp(p); 
-	    half = get_temp(p);
-
-	    /* In homogeneous object coordinates
-	     */
-	    emit_op1(p, OPCODE_RCP, dist, 0, swizzle1(Ppli, W));
-	    emit_op2(p, OPCODE_MUL, tmpPpli, 0, Ppli, dist);
- 
-	    /* Calulate VPpli vector
-	     */
-	    emit_op2(p, OPCODE_SUB, VPpli, 0, tmpPpli, V); 
-
-	    /* Normalize VPpli.  The dist value also used in
-	     * attenuation below.
-	     */
-	    emit_op2(p, OPCODE_DP3, dist, 0, VPpli, VPpli);
-	    emit_op1(p, OPCODE_RSQ, dist, 0, dist);
-	    emit_op2(p, OPCODE_MUL, VPpli, 0, VPpli, dist);
-
-
-	    /* Calculate  attenuation:
-	     */ 
-	    if (!p->state->unit[i].light_spotcutoff_is_180 ||
-		p->state->unit[i].light_attenuated) {
-	       att = calculate_light_attenuation(p, i, VPpli, dist);
-	    }
-	 
-      
-	    /* Calculate viewer direction, or use infinite viewer:
-	     */
-	    if (p->state->light_local_viewer) {
-	       struct ureg eye_hat = get_eye_position_normalized(p);
-	       emit_op2(p, OPCODE_SUB, half, 0, VPpli, eye_hat);
-	    }
-	    else {
-	       struct ureg z_dir = swizzle(get_identity_param(p),X,Y,W,Z); 
-	       emit_op2(p, OPCODE_ADD, half, 0, VPpli, z_dir);
-	    }
-
-	    emit_normalize_vec3(p, half, half);
-
-	    release_temp(p, dist);
-	    release_temp(p, tmpPpli);
-	 }
-
-	 /* Calculate dot products:
-	  */
-	 emit_op2(p, OPCODE_DP3, dots, WRITEMASK_X, normal, VPpli);
-	 emit_op2(p, OPCODE_DP3, dots, WRITEMASK_Y, normal, half);
-
-	
-	 /* Front face lighting:
-	  */
-	 {
-	    struct ureg ambient = get_lightprod(p, i, 0, STATE_AMBIENT);
-	    struct ureg diffuse = get_lightprod(p, i, 0, STATE_DIFFUSE);
-	    struct ureg specular = get_lightprod(p, i, 0, STATE_SPECULAR);
-	    struct ureg res0, res1;
-	    GLuint mask0, mask1;
-
-	    emit_op1(p, OPCODE_LIT, lit, 0, dots);
-   
-	    if (!is_undef(att)) 
-	       emit_op2(p, OPCODE_MUL, lit, 0, lit, att);
-
-
-	    mask0 = 0;
-	    mask1 = 0;
-	    res0 = _col0;
-	    res1 = _col1;
-	    
-	    if (count == nr_lights) {
-	       if (separate) {
-		  mask0 = WRITEMASK_XYZ;
-		  mask1 = WRITEMASK_XYZ;
-
-		  if (p->state->fragprog_inputs_read & FRAG_BIT_COL0)
-		     res0 = register_output( p, VERT_RESULT_COL0 );
-
-		  if (p->state->fragprog_inputs_read & FRAG_BIT_COL1)
-		     res1 = register_output( p, VERT_RESULT_COL1 );
-	       }
-	       else {
-		  mask1 = WRITEMASK_XYZ;
-
-		  if (p->state->fragprog_inputs_read & FRAG_BIT_COL0)
-		     res1 = register_output( p, VERT_RESULT_COL0 );
-	       }
-	    } 
-
-	    emit_op3(p, OPCODE_MAD, _col0, 0, swizzle1(lit,X), ambient, _col0);
-	    emit_op3(p, OPCODE_MAD, res0, mask0, swizzle1(lit,Y), diffuse, _col0);
-	    emit_op3(p, OPCODE_MAD, res1, mask1, swizzle1(lit,Z), specular, _col1);
-      
-	    release_temp(p, ambient);
-	    release_temp(p, diffuse);
-	    release_temp(p, specular);
-	 }
-
-	 /* Back face lighting:
-	  */
-	 if (twoside) {
-	    struct ureg ambient = get_lightprod(p, i, 1, STATE_AMBIENT);
-	    struct ureg diffuse = get_lightprod(p, i, 1, STATE_DIFFUSE);
-	    struct ureg specular = get_lightprod(p, i, 1, STATE_SPECULAR);
-	    struct ureg res0, res1;
-	    GLuint mask0, mask1;
-	       
-	    emit_op1(p, OPCODE_LIT, lit, 0, ureg_negate(swizzle(dots,X,Y,W,Z)));
-
-	    if (!is_undef(att)) 
-	       emit_op2(p, OPCODE_MUL, lit, 0, lit, att);
-
-	    mask0 = 0;
-	    mask1 = 0;
-	    res0 = _bfc0;
-	    res1 = _bfc1;
-
-	    if (count == nr_lights) {
-	       if (separate) {
-		  mask0 = WRITEMASK_XYZ;
-		  mask1 = WRITEMASK_XYZ;
-		  if (p->state->fragprog_inputs_read & FRAG_BIT_COL0)
-		     res0 = register_output( p, VERT_RESULT_BFC0 );
-
-		  if (p->state->fragprog_inputs_read & FRAG_BIT_COL1)
-		     res1 = register_output( p, VERT_RESULT_BFC1 );
-	       }
-	       else {
-		  mask1 = WRITEMASK_XYZ;
-
-		  if (p->state->fragprog_inputs_read & FRAG_BIT_COL0)
-		     res1 = register_output( p, VERT_RESULT_BFC0 );
-	       }
-	    }
-
-	    emit_op3(p, OPCODE_MAD, _bfc0, 0, swizzle1(lit,X), ambient, _bfc0);
-	    emit_op3(p, OPCODE_MAD, res0, mask0, swizzle1(lit,Y), diffuse, _bfc0);
-	    emit_op3(p, OPCODE_MAD, res1, mask1, swizzle1(lit,Z), specular, _bfc1);
-
-	    release_temp(p, ambient);
-	    release_temp(p, diffuse);
-	    release_temp(p, specular);
-	 }
-
-	 release_temp(p, half);
-	 release_temp(p, VPpli);
-	 release_temp(p, att);
-      }
-   }
-
-   release_temps( p );
-}
-
-
-static void build_fog( struct tnl_program *p )
-{
-   struct ureg fog = register_output(p, VERT_RESULT_FOGC);
-   struct ureg input;
-   GLuint useabs = p->state->fog_source_is_depth && p->state->fog_option &&
-		   (p->state->fog_option != FOG_EXP2);
-
-   if (p->state->fog_source_is_depth) {
-      input = swizzle1(get_eye_position(p), Z);
-   }
-   else {
-      input = swizzle1(register_input(p, VERT_ATTRIB_FOG), X);
-      if (p->state->fog_option &&
-	  p->state->tnl_do_vertex_fog)
-	  input = swizzle1(register_input(p, VERT_ATTRIB_FOG), X);
-      else
-	  input = register_input(p, VERT_ATTRIB_FOG);
-   }
-
-   if (p->state->fog_option &&
-       p->state->tnl_do_vertex_fog) {
-      struct ureg params = register_param2(p, STATE_INTERNAL,
-					   STATE_FOG_PARAMS_OPTIMIZED);
-      struct ureg tmp = get_temp(p);
-      struct ureg id = get_identity_param(p);
-
-      emit_op1(p, OPCODE_MOV, fog, 0, id);
-
-      if (useabs) {
-	 emit_op1(p, OPCODE_ABS, tmp, 0, input);
-      }
-
-      switch (p->state->fog_option) {
-      case FOG_LINEAR: {
-	 emit_op3(p, OPCODE_MAD, tmp, 0, useabs ? tmp : input,
-			swizzle1(params,X), swizzle1(params,Y));
-	 emit_op2(p, OPCODE_MAX, tmp, 0, tmp, swizzle1(id,X)); /* saturate */
-	 emit_op2(p, OPCODE_MIN, fog, WRITEMASK_X, tmp, swizzle1(id,W));
-	 break;
-      }
-      case FOG_EXP:
-	 emit_op2(p, OPCODE_MUL, tmp, 0, useabs ? tmp : input,
-			swizzle1(params,Z));
-	 emit_op1(p, OPCODE_EX2, fog, WRITEMASK_X, ureg_negate(tmp));
-	 break;
-      case FOG_EXP2:
-	 emit_op2(p, OPCODE_MUL, tmp, 0, input, swizzle1(params,W));
-	 emit_op2(p, OPCODE_MUL, tmp, 0, tmp, tmp);
-	 emit_op1(p, OPCODE_EX2, fog, WRITEMASK_X, ureg_negate(tmp));
-	 break;
-      }
-
-      release_temp(p, tmp);
-   }
-   else {
-      /* results = incoming fog coords (compute fog per-fragment later) 
-       *
-       * KW:  Is it really necessary to do anything in this case?
-       */
-      emit_op1(p, useabs ? OPCODE_ABS : OPCODE_MOV, fog, 0, input);
-   }
-}
- 
-static void build_reflect_texgen( struct tnl_program *p,
-				  struct ureg dest,
-				  GLuint writemask )
-{
-   struct ureg normal = get_eye_normal(p);
-   struct ureg eye_hat = get_eye_position_normalized(p);
-   struct ureg tmp = get_temp(p);
-
-   /* n.u */
-   emit_op2(p, OPCODE_DP3, tmp, 0, normal, eye_hat); 
-   /* 2n.u */
-   emit_op2(p, OPCODE_ADD, tmp, 0, tmp, tmp); 
-   /* (-2n.u)n + u */
-   emit_op3(p, OPCODE_MAD, dest, writemask, ureg_negate(tmp), normal, eye_hat);
-
-   release_temp(p, tmp);
-}
-
-static void build_sphere_texgen( struct tnl_program *p,
-				 struct ureg dest,
-				 GLuint writemask )
-{
-   struct ureg normal = get_eye_normal(p);
-   struct ureg eye_hat = get_eye_position_normalized(p);
-   struct ureg tmp = get_temp(p);
-   struct ureg half = register_scalar_const(p, .5);
-   struct ureg r = get_temp(p);
-   struct ureg inv_m = get_temp(p);
-   struct ureg id = get_identity_param(p);
-
-   /* Could share the above calculations, but it would be
-    * a fairly odd state for someone to set (both sphere and
-    * reflection active for different texture coordinate
-    * components.  Of course - if two texture units enable
-    * reflect and/or sphere, things start to tilt in favour
-    * of seperating this out:
-    */
-
-   /* n.u */
-   emit_op2(p, OPCODE_DP3, tmp, 0, normal, eye_hat); 
-   /* 2n.u */
-   emit_op2(p, OPCODE_ADD, tmp, 0, tmp, tmp); 
-   /* (-2n.u)n + u */
-   emit_op3(p, OPCODE_MAD, r, 0, ureg_negate(tmp), normal, eye_hat); 
-   /* r + 0,0,1 */
-   emit_op2(p, OPCODE_ADD, tmp, 0, r, swizzle(id,X,Y,W,Z)); 
-   /* rx^2 + ry^2 + (rz+1)^2 */
-   emit_op2(p, OPCODE_DP3, tmp, 0, tmp, tmp); 
-   /* 2/m */
-   emit_op1(p, OPCODE_RSQ, tmp, 0, tmp); 
-   /* 1/m */
-   emit_op2(p, OPCODE_MUL, inv_m, 0, tmp, half); 
-   /* r/m + 1/2 */
-   emit_op3(p, OPCODE_MAD, dest, writemask, r, inv_m, half); 
-	       
-   release_temp(p, tmp);
-   release_temp(p, r);
-   release_temp(p, inv_m);
-}
-
-
-static void build_texture_transform( struct tnl_program *p )
-{
-   GLuint i, j;
-
-   for (i = 0; i < MAX_TEXTURE_UNITS; i++) {
-
-      if (!(p->state->fragprog_inputs_read & (FRAG_BIT_TEX0<<i)))
-	 continue;
-							     
-      if (p->state->unit[i].texgen_enabled || 
-	  p->state->unit[i].texmat_enabled) {
-	 
-	 GLuint texmat_enabled = p->state->unit[i].texmat_enabled;
-	 struct ureg out = register_output(p, VERT_RESULT_TEX0 + i);
-	 struct ureg out_texgen = undef;
-
-	 if (p->state->unit[i].texgen_enabled) {
-	    GLuint copy_mask = 0;
-	    GLuint sphere_mask = 0;
-	    GLuint reflect_mask = 0;
-	    GLuint normal_mask = 0;
-	    GLuint modes[4];
-	 
-	    if (texmat_enabled) 
-	       out_texgen = get_temp(p);
-	    else
-	       out_texgen = out;
-
-	    modes[0] = p->state->unit[i].texgen_mode0;
-	    modes[1] = p->state->unit[i].texgen_mode1;
-	    modes[2] = p->state->unit[i].texgen_mode2;
-	    modes[3] = p->state->unit[i].texgen_mode3;
-
-	    for (j = 0; j < 4; j++) {
-	       switch (modes[j]) {
-	       case TXG_OBJ_LINEAR: {
-		  struct ureg obj = register_input(p, VERT_ATTRIB_POS);
-		  struct ureg plane = 
-		     register_param3(p, STATE_TEXGEN, i,
-				     STATE_TEXGEN_OBJECT_S + j);
-
-		  emit_op2(p, OPCODE_DP4, out_texgen, WRITEMASK_X << j, 
-			   obj, plane );
-		  break;
-	       }
-	       case TXG_EYE_LINEAR: {
-		  struct ureg eye = get_eye_position(p);
-		  struct ureg plane = 
-		     register_param3(p, STATE_TEXGEN, i, 
-				     STATE_TEXGEN_EYE_S + j);
-
-		  emit_op2(p, OPCODE_DP4, out_texgen, WRITEMASK_X << j, 
-			   eye, plane );
-		  break;
-	       }
-	       case TXG_SPHERE_MAP: 
-		  sphere_mask |= WRITEMASK_X << j;
-		  break;
-	       case TXG_REFLECTION_MAP:
-		  reflect_mask |= WRITEMASK_X << j;
-		  break;
-	       case TXG_NORMAL_MAP: 
-		  normal_mask |= WRITEMASK_X << j;
-		  break;
-	       case TXG_NONE:
-		  copy_mask |= WRITEMASK_X << j;
-	       }
-
-	    }
-
-	 
-	    if (sphere_mask) {
-	       build_sphere_texgen(p, out_texgen, sphere_mask);
-	    }
-
-	    if (reflect_mask) {
-	       build_reflect_texgen(p, out_texgen, reflect_mask);
-	    }
-
-	    if (normal_mask) {
-	       struct ureg normal = get_eye_normal(p);
-	       emit_op1(p, OPCODE_MOV, out_texgen, normal_mask, normal );
-	    }
-
-	    if (copy_mask) {
-	       struct ureg in = register_input(p, VERT_ATTRIB_TEX0+i);
-	       emit_op1(p, OPCODE_MOV, out_texgen, copy_mask, in );
-	    }
-	 }
-
-	 if (texmat_enabled) {
-	    struct ureg texmat[4];
-	    struct ureg in = (!is_undef(out_texgen) ? 
-			      out_texgen : 
-			      register_input(p, VERT_ATTRIB_TEX0+i));
-	    if (PREFER_DP4) {
-	       register_matrix_param5( p, STATE_TEXTURE_MATRIX, i, 0, 3,
-				       0, texmat );
-	       emit_matrix_transform_vec4( p, out, texmat, in );
-	    }
-	    else {
-	       register_matrix_param5( p, STATE_TEXTURE_MATRIX, i, 0, 3,
-				       STATE_MATRIX_TRANSPOSE, texmat );
-	       emit_transpose_matrix_transform_vec4( p, out, texmat, in );
-	    }
-	 }
-
-	 release_temps(p);
-      } 
-      else {
-	 emit_passthrough(p, VERT_ATTRIB_TEX0+i, VERT_RESULT_TEX0+i);
-      }
-   }
-}
-
-
-/* Seems like it could be tighter:
- */
-static void build_pointsize( struct tnl_program *p )
-{
-   struct ureg eye = get_eye_position(p);
-   struct ureg state_size = register_param1(p, STATE_POINT_SIZE);
-   struct ureg state_attenuation = register_param1(p, STATE_POINT_ATTENUATION);
-   struct ureg out = register_output(p, VERT_RESULT_PSIZ);
-   struct ureg ut = get_temp(p);
-
-   /* 1, Z, Z * Z, 1 */      
-   emit_op1(p, OPCODE_MOV, ut, WRITEMASK_XW, swizzle1(get_identity_param(p), W));
-   emit_op1(p, OPCODE_ABS, ut, WRITEMASK_YZ, swizzle1(eye, Z));
-   emit_op2(p, OPCODE_MUL, ut, WRITEMASK_Z, ut, ut);
-
-
-   /* p1 +  p2 * dist + p3 * dist * dist, 0 */
-   emit_op2(p, OPCODE_DP3, ut, WRITEMASK_X, ut, state_attenuation);
-
-   /* 1 / sqrt(factor) */
-   emit_op1(p, OPCODE_RSQ, ut, WRITEMASK_X, ut ); 
-
-   /* ut = pointSize / factor */
-   emit_op2(p, OPCODE_MUL, ut, WRITEMASK_X, ut, state_size); 
-
-   /* Clamp to min/max - state_size.[yz]
-    */
-   emit_op2(p, OPCODE_MAX, ut, WRITEMASK_X, ut, swizzle1(state_size, Y)); 
-   emit_op2(p, OPCODE_MIN, out, 0, swizzle1(ut, X), swizzle1(state_size, Z)); 
-   
-   release_temp(p, ut);
-}
-
-static void build_tnl_program( struct tnl_program *p )
-{  
-   /* Emit the program, starting with modelviewproject:
-    */
-   build_hpos(p);
-
-   /* Lighting calculations:
-    */
-   if (p->state->fragprog_inputs_read & (FRAG_BIT_COL0|FRAG_BIT_COL1)) {
-      if (p->state->light_global_enabled)
-	 build_lighting(p);
-      else {
-	 if (p->state->fragprog_inputs_read & FRAG_BIT_COL0)
-	    emit_passthrough(p, VERT_ATTRIB_COLOR0, VERT_RESULT_COL0);
-
-	 if (p->state->fragprog_inputs_read & FRAG_BIT_COL1)
-	    emit_passthrough(p, VERT_ATTRIB_COLOR1, VERT_RESULT_COL1);
-      }
-   }
-
-   if ((p->state->fragprog_inputs_read & FRAG_BIT_FOGC) ||
-       p->state->fog_option != FOG_NONE)
-      build_fog(p);
-
-   if (p->state->fragprog_inputs_read & FRAG_BITS_TEX_ANY)
-      build_texture_transform(p);
-
-   if (p->state->point_attenuated)
-      build_pointsize(p);
-
-   /* Finish up:
-    */
-   emit_op1(p, OPCODE_END, undef, 0, undef);
-
-   /* Disassemble:
-    */
-   if (DISASSEM) {
-      _mesa_printf ("\n");
-   }
-}
-
-
-static void build_new_tnl_program( const struct state_key *key,
-				   struct gl_vertex_program *program,
-				   GLuint max_temps)
-{
-   struct tnl_program p;
-
-   _mesa_memset(&p, 0, sizeof(p));
-   p.state = key;
-   p.program = program;
-   p.eye_position = undef;
-   p.eye_position_normalized = undef;
-   p.eye_normal = undef;
-   p.identity = undef;
-   p.temp_in_use = 0;
-   p.nr_instructions = 16;
-   
-   if (max_temps >= sizeof(int) * 8)
-      p.temp_reserved = 0;
-   else
-      p.temp_reserved = ~((1<<max_temps)-1);
-
-   p.program->Base.Instructions = 
-      _mesa_malloc(sizeof(struct prog_instruction) * p.nr_instructions);
-   p.program->Base.String = 0;
-   p.program->Base.NumInstructions =
-   p.program->Base.NumTemporaries =
-   p.program->Base.NumParameters =
-   p.program->Base.NumAttributes = p.program->Base.NumAddressRegs = 0;
-   p.program->Base.Parameters = _mesa_new_parameter_list();
-   p.program->Base.InputsRead = 0;
-   p.program->Base.OutputsWritten = 0;
-
-   build_tnl_program( &p );
-}
-
-static void *search_cache( struct brw_tnl_cache *cache,
-			   GLuint hash,
-			   const void *key,
-			   GLuint keysize)
-{
-   struct brw_tnl_cache_item *c;
-
-   for (c = cache->items[hash % cache->size]; c; c = c->next) {
-      if (c->hash == hash && memcmp(c->key, key, keysize) == 0)
-	 return c->data;
-   }
-
-   return NULL;
-}
-
-static void rehash( struct brw_tnl_cache *cache )
-{
-   struct brw_tnl_cache_item **items;
-   struct brw_tnl_cache_item *c, *next;
-   GLuint size, i;
-
-   size = cache->size * 3;
-   items = (struct brw_tnl_cache_item**) _mesa_malloc(size * sizeof(*items));
-   _mesa_memset(items, 0, size * sizeof(*items));
-
-   for (i = 0; i < cache->size; i++)
-      for (c = cache->items[i]; c; c = next) {
-	 next = c->next;
-	 c->next = items[c->hash % size];
-	 items[c->hash % size] = c;
-      }
-
-   FREE(cache->items);
-   cache->items = items;
-   cache->size = size;
-}
-
-static void cache_item( struct brw_tnl_cache *cache,
-			GLuint hash,
-			const struct state_key *key,
-			void *data )
-{
-   struct brw_tnl_cache_item *c = MALLOC(sizeof(*c));
-   c->hash = hash;
-
-   c->key = malloc(sizeof(*key));
-   memcpy(c->key, key, sizeof(*key));
-
-   c->data = data;
-
-   if (++cache->n_items > cache->size * 1.5)
-      rehash(cache);
-
-   c->next = cache->items[hash % cache->size];
-   cache->items[hash % cache->size] = c;
-}
-
-
-static GLuint hash_key( struct state_key *key )
-{
-   GLuint *ikey = (GLuint *)key;
-   GLuint hash = 0, i;
-
-   /* I'm sure this can be improved on, but speed is important:
-    */
-   for (i = 0; i < sizeof(*key)/sizeof(GLuint); i++)
-      hash += ikey[i];
-
-   return hash;
-}
-
-static void prepare_tnl_program( struct brw_context *brw )
-{
-   GLcontext *ctx = &brw->intel.ctx;
-   struct state_key key;
-   GLuint hash;
-   struct gl_vertex_program *old = brw->tnl_program;
-
-   /* _NEW_PROGRAM */
-   if (brw->attribs.VertexProgram->_Current) 
-      return;
-      
-   /* Grab all the relevent state and put it in a single structure:
-    */
-   make_state_key(ctx, &key);
-   hash = hash_key(&key);
-
-   /* Look for an already-prepared program for this state:
-    */
-   brw->tnl_program = (struct gl_vertex_program *)
-      search_cache( &brw->tnl_program_cache, hash, &key, sizeof(key) );
-   
-   /* OK, we'll have to build a new one:
-    */
-   if (!brw->tnl_program) {
-      brw->tnl_program = (struct gl_vertex_program *)
-	 ctx->Driver.NewProgram(ctx, GL_VERTEX_PROGRAM_ARB, 0); 
-
-      build_new_tnl_program( &key, brw->tnl_program, 
-/* 			     ctx->Const.MaxVertexProgramTemps  */
-			     32
-	 );
-
-      if (ctx->Driver.ProgramStringNotify)
-	 ctx->Driver.ProgramStringNotify( ctx, GL_VERTEX_PROGRAM_ARB, 
-					  &brw->tnl_program->Base );
-
-      cache_item( &brw->tnl_program_cache, 
-		  hash, &key, brw->tnl_program );
-   }
-
-   if (old != brw->tnl_program)
-      brw->state.dirty.brw |= BRW_NEW_TNL_PROGRAM;
-   return;
-}
-
-/* Note: See brw_draw.c - the vertex program must not rely on
- * brw->primitive or brw->reduced_prim.
- */
-const struct brw_tracked_state brw_tnl_vertprog = {
-   .dirty = {
-      .mesa = (_NEW_PROGRAM | 
-	       _NEW_LIGHT | 
-	       _NEW_TRANSFORM | 
-	       _NEW_FOG | 
-	       _NEW_HINT | 
-	       _NEW_POINT | 
-	       _NEW_TEXTURE |
-          _NEW_TEXTURE_MATRIX),
-      .brw = (BRW_NEW_FRAGMENT_PROGRAM | 
-	      BRW_NEW_INPUT_VARYING),
-      .cache = 0
-   },
-   .prepare = prepare_tnl_program
-};
-
-
-
-
 static void prepare_active_vertprog( struct brw_context *brw )
 {
    const struct gl_vertex_program *prev = brw->vertex_program;
 
-   /* NEW_PROGRAM */
-   if (brw->attribs.VertexProgram->_Current) {
-      brw->vertex_program = brw->attribs.VertexProgram->_Current;
-   }
-   else {
-      /* BRW_NEW_TNL_PROGRAM */
-      brw->vertex_program = brw->tnl_program;
-   }
+   brw->vertex_program = brw->attribs.VertexProgram->_Current;
 
    if (brw->vertex_program != prev) 
       brw->state.dirty.brw |= BRW_NEW_VERTEX_PROGRAM;
@@ -1672,37 +52,8 @@ static void prepare_active_vertprog( struct brw_context *brw )
 const struct brw_tracked_state brw_active_vertprog = {
    .dirty = {
       .mesa = _NEW_PROGRAM,
-      .brw = BRW_NEW_TNL_PROGRAM,
+      .brw = 0,
       .cache = 0
    },
    .prepare = prepare_active_vertprog
 };
-
-
-void brw_ProgramCacheInit( GLcontext *ctx )
-{
-   struct brw_context *brw = brw_context(ctx);
-
-   brw->tnl_program_cache.size = 17;
-   brw->tnl_program_cache.n_items = 0;
-   brw->tnl_program_cache.items = (struct brw_tnl_cache_item **)
-      _mesa_calloc(brw->tnl_program_cache.size * 
-		   sizeof(struct brw_tnl_cache_item));
-}
-
-void brw_ProgramCacheDestroy( GLcontext *ctx )
-{
-   struct brw_context *brw = brw_context(ctx);
-   struct brw_tnl_cache_item *c, *next;
-   GLuint i;
-
-   for (i = 0; i < brw->tnl_program_cache.size; i++)
-      for (c = brw->tnl_program_cache.items[i]; c; c = next) {
-	 next = c->next;
-	 FREE(c->key);
-	 FREE(c->data);
-	 FREE(c);
-      }
-
-   FREE(brw->tnl_program_cache.items);
-}
diff --git a/src/mesa/drivers/dri/i965/brw_vtbl.c b/src/mesa/drivers/dri/i965/brw_vtbl.c
index 3780d3dad2..cd074dfed6 100644
--- a/src/mesa/drivers/dri/i965/brw_vtbl.c
+++ b/src/mesa/drivers/dri/i965/brw_vtbl.c
@@ -70,7 +70,6 @@ static void brw_destroy_context( struct intel_context *intel )
    brw_destroy_state(brw);
    brw_draw_destroy( brw );
 
-   brw_ProgramCacheDestroy( ctx );
    brw_FrameBufferTexDestroy( brw );
 
    for (i = 0; i < brw->state.nr_draw_regions; i++)
-- 
cgit v1.2.3


From 2b8d8989fb6f9c36baf166fc715182a1407ebadb Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Sun, 28 Sep 2008 20:31:46 -0700
Subject: Use 3Dnow! x86-64 routines only on processors that support 3Dnow!

Add an x86-64 CPUID function and use it to detect 3Dnow!  If 3Dnow!
is available, use _mesa_3dnow_transform_points4_3d_no_rot,
_mesa_3dnow_transform_points4_perspective,
_mesa_3dnow_transform_points4_2d_no_rot, and _mesa_3dnow_transform_points4_2d.

This fixes long-standing bug #8724.
---
 src/mesa/x86-64/x86-64.c | 34 +++++++++++++++++++++++-----------
 src/mesa/x86-64/xform4.S | 31 +++++++++++++++++++++++--------
 2 files changed, 46 insertions(+), 19 deletions(-)

diff --git a/src/mesa/x86-64/x86-64.c b/src/mesa/x86-64/x86-64.c
index 9ec43c841d..96f8da87f0 100644
--- a/src/mesa/x86-64/x86-64.c
+++ b/src/mesa/x86-64/x86-64.c
@@ -41,7 +41,10 @@
 #include "math/m_debug.h"
 #endif
 
+extern void _mesa_x86_64_cpuid(unsigned int *regs);
+
 DECLARE_XFORM_GROUP( x86_64, 4 )
+DECLARE_XFORM_GROUP( 3dnow, 4 )
 
 #else
 /* just to silence warning below */
@@ -81,6 +84,7 @@ static void message( const char *msg )
 void _mesa_init_all_x86_64_transform_asm(void)
 {
 #ifdef USE_X86_64_ASM
+   unsigned int regs[4];
 
    if ( _mesa_getenv( "MESA_NO_ASM" ) ) {
      return;
@@ -88,24 +92,32 @@ void _mesa_init_all_x86_64_transform_asm(void)
 
    message("Initializing x86-64 optimizations\n");
 
-   ASSIGN_XFORM_GROUP( x86_64, 4 );
 
-   /*
    _mesa_transform_tab[4][MATRIX_GENERAL] =
       _mesa_x86_64_transform_points4_general;
    _mesa_transform_tab[4][MATRIX_IDENTITY] =
       _mesa_x86_64_transform_points4_identity;
    _mesa_transform_tab[4][MATRIX_3D] =
       _mesa_x86_64_transform_points4_3d;
-   _mesa_transform_tab[4][MATRIX_3D_NO_ROT] =
-      _mesa_x86_64_transform_points4_3d_no_rot;
-   _mesa_transform_tab[4][MATRIX_PERSPECTIVE] =
-      _mesa_x86_64_transform_points4_perspective;
-   _mesa_transform_tab[4][MATRIX_2D_NO_ROT] =
-      _mesa_x86_64_transform_points4_2d_no_rot;
-   _mesa_transform_tab[4][MATRIX_2D] =
-      _mesa_x86_64_transform_points4_2d;
-   */
+
+   regs[0] = 0x80000001;
+   regs[1] = 0x00000000;
+   regs[2] = 0x00000000;
+   regs[3] = 0x00000000;
+   _mesa_x86_64_cpuid(regs);
+   if (regs[3] & (1U << 31)) {
+      message("3Dnow! detected\n");
+      _mesa_transform_tab[4][MATRIX_3D_NO_ROT] =
+	  _mesa_3dnow_transform_points4_3d_no_rot;
+      _mesa_transform_tab[4][MATRIX_PERSPECTIVE] =
+	  _mesa_3dnow_transform_points4_perspective;
+      _mesa_transform_tab[4][MATRIX_2D_NO_ROT] =
+	  _mesa_3dnow_transform_points4_2d_no_rot;
+      _mesa_transform_tab[4][MATRIX_2D] =
+	  _mesa_3dnow_transform_points4_2d;
+
+   }
+
    
 #ifdef DEBUG_MATH
    _math_test_all_transform_functions("x86_64");
diff --git a/src/mesa/x86-64/xform4.S b/src/mesa/x86-64/xform4.S
index 3f9c9d56ab..805969127d 100644
--- a/src/mesa/x86-64/xform4.S
+++ b/src/mesa/x86-64/xform4.S
@@ -29,7 +29,22 @@
 .text
 
 .align 16
+.globl _mesa_x86_64_cpuid
+_mesa_x86_64_cpuid:
+	pushq	%rbx
+	movl	(%rdi), %eax
+	movl	8(%rdi), %ecx
+
+	cpuid
+
+	movl	%ebx, 4(%rdi)
+	movl	%eax, (%rdi)
+	movl	%ecx, 8(%rdi)
+	movl	%edx, 12(%rdi)
+	popq	%rbx
+	ret
 
+.align 16
 .globl _mesa_x86_64_transform_points4_general
 _mesa_x86_64_transform_points4_general:
 /*
@@ -204,8 +219,8 @@ p4_identity_done:
 
 	
 .align 16
-.globl _mesa_x86_64_transform_points4_3d_no_rot
-_mesa_x86_64_transform_points4_3d_no_rot:
+.globl _mesa_3dnow_transform_points4_3d_no_rot
+_mesa_3dnow_transform_points4_3d_no_rot:
 
 	movl V4F_COUNT(%rdx), %ecx	/* count */
 	movzx V4F_STRIDE(%rdx), %eax	/* stride */
@@ -268,8 +283,8 @@ p4_3d_no_rot_done:
 
 	
 .align 16
-.globl _mesa_x86_64_transform_points4_perspective
-_mesa_x86_64_transform_points4_perspective:
+.globl _mesa_3dnow_transform_points4_perspective
+_mesa_3dnow_transform_points4_perspective:
 
 	movl V4F_COUNT(%rdx), %ecx	/* count */
 	movzx V4F_STRIDE(%rdx), %eax	/* stride */
@@ -334,8 +349,8 @@ p4_perspective_done:
 	ret
 
 .align 16
-.globl _mesa_x86_64_transform_points4_2d_no_rot
-_mesa_x86_64_transform_points4_2d_no_rot:
+.globl _mesa_3dnow_transform_points4_2d_no_rot
+_mesa_3dnow_transform_points4_2d_no_rot:
 
 	movl V4F_COUNT(%rdx), %ecx	/* count */
 	movzx V4F_STRIDE(%rdx), %eax	/* stride */
@@ -389,8 +404,8 @@ p4_2d_no_rot_done:
 
 	
 .align 16
-.globl _mesa_x86_64_transform_points4_2d
-_mesa_x86_64_transform_points4_2d:
+.globl _mesa_3dnow_transform_points4_2d
+_mesa_3dnow_transform_points4_2d:
 
 	movl V4F_COUNT(%rdx), %ecx	/* count */
 	movzx V4F_STRIDE(%rdx), %eax	/* stride */
-- 
cgit v1.2.3


From e095d5812a0237d08eabae1977730e38ac5751c9 Mon Sep 17 00:00:00 2001
From: Shunichi Fuji <palglowr@gmail.com>
Date: Fri, 26 Sep 2008 18:55:13 +0900
Subject: mesa: drop calloc from _mesa_get_fixed_func_vertex_program

Signed-off-by: Shunichi Fuji <palglowr@gmail.com>
---
 src/mesa/main/ffvertex_prog.c | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/src/mesa/main/ffvertex_prog.c b/src/mesa/main/ffvertex_prog.c
index 787672be9f..308b4ef711 100644
--- a/src/mesa/main/ffvertex_prog.c
+++ b/src/mesa/main/ffvertex_prog.c
@@ -178,12 +178,12 @@ static GLboolean check_active_shininess( GLcontext *ctx,
 
 
-static struct state_key *make_state_key( GLcontext *ctx )
+static void make_state_key( GLcontext *ctx, struct state_key *key )
 {
    const struct gl_fragment_program *fp;
-   struct state_key *key = CALLOC_STRUCT(state_key);
    GLuint i;
 
+   memset(key, 0, sizeof(struct state_key));
    fp = ctx->FragmentProgram._Current;
 
    /* This now relies on texenvprogram.c being active:
@@ -301,8 +301,6 @@ static struct state_key *make_state_key( GLcontext *ctx )
 			      texUnit->GenModeQ );
       }
    }
-   
-   return key;
 }
 
 
@@ -1714,16 +1712,16 @@ struct gl_vertex_program *
 _mesa_get_fixed_func_vertex_program(GLcontext *ctx)
 {
    struct gl_vertex_program *prog;
-   struct state_key *key;
+   struct state_key key;
 
    /* Grab all the relevent state and put it in a single structure:
     */
-   key = make_state_key(ctx);
+   make_state_key(ctx, &key);
 
    /* Look for an already-prepared program for this state:
     */
    prog = (struct gl_vertex_program *)
-      _mesa_search_program_cache(ctx->VertexProgram.Cache, key, sizeof(*key));
+      _mesa_search_program_cache(ctx->VertexProgram.Cache, &key, sizeof(key));
    
    if (!prog) {
       /* OK, we'll have to build a new one */
@@ -1735,7 +1733,7 @@ _mesa_get_fixed_func_vertex_program(GLcontext *ctx)
       if (!prog)
          return NULL;
 
-      create_new_program( key, prog,
+      create_new_program( &key, prog,
                           ctx->Const.VertexProgram.MaxTemps );
 
 #if 0
@@ -1744,10 +1742,8 @@ _mesa_get_fixed_func_vertex_program(GLcontext *ctx)
                                           &prog->Base );
 #endif
       _mesa_program_cache_insert(ctx, ctx->VertexProgram.Cache,
-                                 key, sizeof(*key), &prog->Base);
+                                 &key, sizeof(key), &prog->Base);
    }
 
-   _mesa_free(key);
-
    return prog;
 }
-- 
cgit v1.2.3


From fc13269b820a842b401b7a7a4aed098e59b25b91 Mon Sep 17 00:00:00 2001
From: Jouk Jansen <jouk@hrem.nano.tudelft.nl>
Date: Mon, 29 Sep 2008 08:35:05 -0600
Subject: mesa: asst updates for VMS

---
 src/mesa/drivers/common/descrip.mms |  5 +++--
 src/mesa/glapi/descrip.mms          | 10 ++++++----
 src/mesa/main/descrip.mms           | 32 +++++++++++++++++++++++++++++++-
 src/mesa/shader/descrip.mms         |  6 ++++--
 src/mesa/shader/grammar/descrip.mms |  5 +++--
 src/mesa/tnl/descrip.mms            |  7 ++++---
 6 files changed, 51 insertions(+), 14 deletions(-)

diff --git a/src/mesa/drivers/common/descrip.mms b/src/mesa/drivers/common/descrip.mms
index 99a2ae6c37..d5bbc69dfd 100644
--- a/src/mesa/drivers/common/descrip.mms
+++ b/src/mesa/drivers/common/descrip.mms
@@ -1,6 +1,6 @@
 # Makefile for core library for VMS
 # contributed by Jouk Jansen  joukj@hrem.nano.tudelft.nl
-# Last revision : 3 October 2007
+# Last revision : 29 September 2008
 
 .first
 	define gl [----.include.gl]
@@ -19,7 +19,8 @@ VPATH = RCS
 
 INCDIR = [----.include],[--.main],[--.glapi],[--.shader]
 LIBDIR = [----.lib]
-CFLAGS = /include=($(INCDIR),[])/define=(PTHREADS=1)/name=(as_is,short)/float=ieee/ieee=denorm
+CFLAGS = /include=($(INCDIR),[])/define=(PTHREADS=1)/name=(as_is,short)\
+	/float=ieee/ieee=denorm/warn=disable=(PTRMISMATCH)
 
 SOURCES = driverfuncs.c
 
diff --git a/src/mesa/glapi/descrip.mms b/src/mesa/glapi/descrip.mms
index f17e5329b6..ee96baa6fc 100644
--- a/src/mesa/glapi/descrip.mms
+++ b/src/mesa/glapi/descrip.mms
@@ -1,9 +1,10 @@
 # Makefile for core library for VMS
-# contributed by Jouk Jansen  joukj@hrem.stm.tudelft.nl
-# Last revision : 16 June 2003
+# contributed by Jouk Jansen  joukj@hrem.nano.tudelft.nl
+# Last revision : 29 September 2008
 
 .first
 	define gl [---.include.gl]
+	define main [-.main]
 
 .include [---]mms-config.
 
@@ -15,9 +16,9 @@ INCDIR = [---.include],[-.main]
 LIBDIR = [---.lib]
 CFLAGS = /include=($(INCDIR),[])/define=(PTHREADS=1)/name=(as_is,short)/float=ieee/ieee=denorm
 
-SOURCES = glapi.c glthread.c
+SOURCES = glapi.c glthread.c glapi_getproc.c
 
-OBJECTS =  glapi.obj,glthread.obj
+OBJECTS =  glapi.obj,glthread.obj,glapi_getproc.obj
 
 ##### RULES #####
 
@@ -35,3 +36,4 @@ clean :
 glapi.obj : glapi.c
 
 glthread.obj : glthread.c
+glapi_getproc.obj : glapi_getproc.c
diff --git a/src/mesa/main/descrip.mms b/src/mesa/main/descrip.mms
index 3ef215f47f..e49ec65d42 100644
--- a/src/mesa/main/descrip.mms
+++ b/src/mesa/main/descrip.mms
@@ -1,6 +1,6 @@
 # Makefile for core library for VMS
 # contributed by Jouk Jansen  joukj@hrem.nano.tudelft.nl
-# Last revision : 2 October 2007
+# Last revision : 29 September 2008
 
 .first
 	define gl [---.include.gl]
@@ -21,6 +21,7 @@ CFLAGS = /include=($(INCDIR),[])/define=(PTHREADS=1)/name=(as_is,short)/float=ie
 
 SOURCES =accum.c \
 	api_arrayelt.c \
+	api_exec.c \
 	api_loopback.c \
 	api_noop.c \
 	api_validate.c \
@@ -29,6 +30,7 @@ SOURCES =accum.c \
 	blend.c \
 	bufferobj.c \
 	buffers.c \
+	clear.c \
 	clip.c \
 	colortab.c \
 	context.c \
@@ -46,6 +48,7 @@ SOURCES =accum.c \
 	extensions.c \
 	fbobject.c \
 	feedback.c \
+	ffvertex_prog.c \
 	fog.c \
 	framebuffer.c \
 	get.c \
@@ -60,22 +63,29 @@ SOURCES =accum.c \
 	matrix.c \
 	mipmap.c \
 	mm.c \
+	multisample.c \
 	pixel.c \
+	pixelstore.c \
 	points.c \
 	polygon.c \
 	rastpos.c \
 	rbadaptors.c \
+	readpix.c \
 	renderbuffer.c \
+	scissor.c \
 	shaders.c \
 	state.c \
 	stencil.c \
 	texcompress.c \
 	texcompress_fxt1.c \
 	texcompress_s3tc.c \
+	texenv.c \
 	texenvprogram.c \
 	texformat.c \
+	texgen.c \
 	teximage.c \
 	texobj.c \
+	texparam.c \
 	texrender.c \
 	texstate.c \
 	texstore.c \
@@ -86,6 +96,7 @@ SOURCES =accum.c \
 
 OBJECTS=accum.obj,\
 api_arrayelt.obj,\
+api_exec.obj,\
 api_loopback.obj,\
 api_noop.obj,\
 api_validate.obj,\
@@ -94,6 +105,7 @@ attrib.obj,\
 blend.obj,\
 bufferobj.obj,\
 buffers.obj,\
+clear.obj,\
 clip.obj,\
 colortab.obj,\
 context.obj,\
@@ -111,6 +123,7 @@ execmem.obj,\
 extensions.obj,\
 fbobject.obj,\
 feedback.obj,\
+ffvertex_prog.obj,\
 fog.obj,\
 framebuffer.obj,\
 get.obj,\
@@ -125,21 +138,28 @@ lines.obj,\
 matrix.obj,\
 mipmap.obj,\
 mm.obj,\
+multisample.obj,\
 pixel.obj,\
+pixelstore.obj,\
 points.obj,\
 polygon.obj,\
 rastpos.obj,\
+readpix.obj,\
 renderbuffer.obj,\
+scissor.obj,\
 shaders.obj,\
 state.obj,\
 stencil.obj,\
 texcompress.obj,\
 texcompress_fxt1.obj,\
 texcompress_s3tc.obj,\
+texenv.obj,\
 texenvprogram.obj,\
 texformat.obj,\
+texgen.obj,\
 teximage.obj,\
 texobj.obj,\
+texparam.obj,\
 texrender.obj,\
 texstate.obj,\
 texstore.obj,\
@@ -226,3 +246,13 @@ vtxfmt.obj : vtxfmt.c
 shaders.obj : shaders.c
 queryobj.obj : queryobj.c
 rbadaptors.obj : rbadaptors.c
+clear.obj : clear.c
+multisample.obj : multisample.c
+scissor.obj : scissor.c
+texenv.obj : texenv.c
+texgen.obj : texgen.c
+texparam.obj : texparam.c
+readpix.obj : readpix.c
+ffvertex_prog.obj : ffvertex_prog.c
+api_exec.obj : api_exec.c
+pixelstore.obj : pixelstore.c
diff --git a/src/mesa/shader/descrip.mms b/src/mesa/shader/descrip.mms
index bdac946efe..19bafd4830 100644
--- a/src/mesa/shader/descrip.mms
+++ b/src/mesa/shader/descrip.mms
@@ -1,6 +1,6 @@
 # Makefile for core library for VMS
 # contributed by Jouk Jansen  joukj@hrem.nano.tudelft.nl
-# Last revision : 27 May 2008
+# Last revision : 29 September 2008
 .first
 	define gl [---.include.gl]
 	define math [-.math]
@@ -34,6 +34,7 @@ SOURCES = \
 	prog_instruction.c \
 	prog_parameter.c \
 	prog_print.c \
+	prog_cache.c \
 	prog_statevars.c \
 	shader_api.c prog_uniform.c
 
@@ -52,7 +53,7 @@ OBJECTS = \
 	prog_parameter.obj,\
 	prog_print.obj,\
 	prog_statevars.obj,\
-	shader_api.obj,prog_uniform.obj
+	shader_api.obj,prog_uniform.obj,prog_cache.obj
 
 ##### RULES #####
 
@@ -91,3 +92,4 @@ prog_print.obj : prog_print.c
 prog_statevars.obj : prog_statevars.c
 shader_api.obj : shader_api.c
 prog_uniform.obj : prog_uniform.c
+prog_cache.obj : prog_cache.c
diff --git a/src/mesa/shader/grammar/descrip.mms b/src/mesa/shader/grammar/descrip.mms
index f7fbee96bc..6976b70d6a 100644
--- a/src/mesa/shader/grammar/descrip.mms
+++ b/src/mesa/shader/grammar/descrip.mms
@@ -1,12 +1,13 @@
 # Makefile for core library for VMS
-# contributed by Jouk Jansen  joukj@hrem.stm.tudelft.nl
-# Last revision : 1 June 2005
+# contributed by Jouk Jansen  joukj@hrem.nano.tudelft.nl
+# Last revision : 29 September 2008
 
 .first
 	define gl [----.include.gl]
 	define math [--.math]
 	define swrast [--.swrast]
 	define array_cache [--.array_cache]
+	define main [--.main]
 
 .include [----]mms-config.
 
diff --git a/src/mesa/tnl/descrip.mms b/src/mesa/tnl/descrip.mms
index f77f672dc8..25dd1aecb1 100644
--- a/src/mesa/tnl/descrip.mms
+++ b/src/mesa/tnl/descrip.mms
@@ -1,6 +1,6 @@
 # Makefile for core library for VMS
 # contributed by Jouk Jansen  joukj@hrem.nano.tudelft.nl
-# Last revision : 30 November 2007
+# Last revision : 29 September 2008
 
 .first
 	define gl [---.include.gl]
@@ -27,13 +27,13 @@ SOURCES = t_context.c t_draw.c \
 	t_pipeline.c t_vb_fog.c \
 	t_vb_light.c t_vb_normals.c t_vb_points.c t_vb_program.c \
 	t_vb_render.c t_vb_texgen.c t_vb_texmat.c t_vb_vertex.c \
-	t_vertex.c \
+	t_vertex.c t_rasterpos.c\
 	t_vertex_generic.c t_vp_build.c
 
 OBJECTS = t_context.obj,t_draw.obj,\
 	t_pipeline.obj,t_vb_fog.obj,t_vb_light.obj,t_vb_normals.obj,\
 	t_vb_points.obj,t_vb_program.obj,t_vb_render.obj,t_vb_texgen.obj,\
-	t_vb_texmat.obj,t_vb_vertex.obj,\
+	t_vb_texmat.obj,t_vb_vertex.obj,t_rasterpos.obj,\
 	t_vertex.obj,t_vertex_generic.obj,\
 	t_vp_build.obj
 
@@ -65,3 +65,4 @@ t_vb_vertex.obj : t_vb_vertex.c
 t_vertex.obj : t_vertex.c
 t_vertex_generic.obj : t_vertex_generic.c
 t_vp_build.obj : t_vp_build.c
+t_rasterpos.obj : t_rasterpos.c
-- 
cgit v1.2.3


From d806d451e660bb582c04947ae3bd8b95173e8fd4 Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Mon, 29 Sep 2008 12:18:06 -0700
Subject: GLSL: AttachShader returns INVALID_OPERATION for repeated attach

The GL_ARB_shader_objects spec says that glAttachShaderARB is supposed
to return GL_INVALID_OPERATION if a shader is attached to a program
where it is already attached.  _mesa_attach_shader previously returned
without error in this case.
---
 src/mesa/shader/shader_api.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/mesa/shader/shader_api.c b/src/mesa/shader/shader_api.c
index 3c530d1727..3ab590351a 100644
--- a/src/mesa/shader/shader_api.c
+++ b/src/mesa/shader/shader_api.c
@@ -455,7 +455,13 @@ _mesa_attach_shader(GLcontext *ctx, GLuint program, GLuint shader)
    n = shProg->NumShaders;
    for (i = 0; i < n; i++) {
       if (shProg->Shaders[i] == sh) {
-         /* already attached */
+         /* The shader is already attached to this program.  The
+          * GL_ARB_shader_objects spec says:
+          *
+          *     "The error INVALID_OPERATION is generated by AttachObjectARB
+          *     if <obj> is already attached to <containerObj>."
+          */
+         _mesa_error(ctx, GL_INVALID_OPERATION, "glAttachShader");
          return;
       }
    }
-- 
cgit v1.2.3


From 905d8e0742d200558677dac01a838e95877f7b5e Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Mon, 29 Sep 2008 12:27:00 -0700
Subject: GLSL: Implement _mesa_get_handle

Implementing _mesa_get_handle using
glGetIntegerv(GL_CURRENT_PROGRAM, ...) allows glGetHandleARB to work.
---
 src/mesa/shader/shader_api.c | 25 ++++++++-----------------
 1 file changed, 8 insertions(+), 17 deletions(-)

diff --git a/src/mesa/shader/shader_api.c b/src/mesa/shader/shader_api.c
index 3ab590351a..504d769323 100644
--- a/src/mesa/shader/shader_api.c
+++ b/src/mesa/shader/shader_api.c
@@ -47,7 +47,7 @@
 #include "shader/shader_api.h"
 #include "shader/slang/slang_compile.h"
 #include "shader/slang/slang_link.h"
-
+#include "glapi/dispatch.h"
 
 
 #ifndef GL_PROGRAM_BINARY_LENGTH_OES
@@ -925,24 +925,15 @@ _mesa_get_attached_shaders(GLcontext *ctx, GLuint program, GLsizei maxCount,
 static GLuint
 _mesa_get_handle(GLcontext *ctx, GLenum pname)
 {
-#if 0
-   GET_CURRENT_CONTEXT(ctx);
-
-   switch (pname) {
-   case GL_PROGRAM_OBJECT_ARB:
-      {
-         struct gl2_program_intf **pro = ctx->Shader.CurrentProgram;
-
-         if (pro != NULL)
-            return (**pro)._container._generic.
-               GetName((struct gl2_generic_intf **) (pro));
-      }
-      break;
-   default:
+   GLint handle = 0;
+   
+   if (pname == GL_PROGRAM_OBJECT_ARB) {
+      CALL_GetIntegerv(ctx->Exec, (GL_CURRENT_PROGRAM, &handle));
+   } else {
       _mesa_error(ctx, GL_INVALID_ENUM, "glGetHandleARB");
    }
-#endif
-   return 0;
+
+   return handle;
 }
 
 
-- 
cgit v1.2.3


From 3ab4b2066fd0b1493af38510e26e5a6ba83b5cf4 Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Mon, 29 Sep 2008 12:30:05 -0700
Subject: GLSL: Implement GL_OBJECT_TYPE_ARB query

The GL_OBJECT_TYPE_ARB query is handled directly in
_mesa_GetObjectParameterivARB because it is only supported in the
extension version of the shading language API.  glGetProgramiv and
glGetShaderiv should not accept this enum.
---
 src/mesa/main/shaders.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/mesa/main/shaders.c b/src/mesa/main/shaders.c
index f0db0d2a81..aeb5d4ca2a 100644
--- a/src/mesa/main/shaders.c
+++ b/src/mesa/main/shaders.c
@@ -233,10 +233,18 @@ _mesa_GetObjectParameterivARB(GLhandleARB object, GLenum pname, GLint *params)
    GET_CURRENT_CONTEXT(ctx);
    /* Implement in terms of GetProgramiv, GetShaderiv */
    if (ctx->Driver.IsProgram(ctx, object)) {
-      ctx->Driver.GetProgramiv(ctx, object, pname, params);
+      if (pname == GL_OBJECT_TYPE_ARB) {
+	 *params = GL_PROGRAM_OBJECT_ARB;
+      } else {
+	 ctx->Driver.GetProgramiv(ctx, object, pname, params);
+      }
    }
    else if (ctx->Driver.IsShader(ctx, object)) {
-      ctx->Driver.GetShaderiv(ctx, object, pname, params);
+      if (pname == GL_OBJECT_TYPE_ARB) {
+	 *params = GL_SHADER_OBJECT_ARB;
+      } else {
+	 ctx->Driver.GetShaderiv(ctx, object, pname, params);
+      }
    }
    else {
       _mesa_error(ctx, GL_INVALID_OPERATION, "glGetObjectParameterivARB");
-- 
cgit v1.2.3


From 08b9e29c1d4d28fee13658b0421b4522d9c36b3a Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Mon, 29 Sep 2008 18:50:05 -0700
Subject: intel: Clean-up the extension string madness!

- Sort extensions by ARB, then EXT, then vendor by name
- Remove redundant (only one of GL_{ARB,EXT,NV}_texture_rectangle) or
  duplicate extension strings
---
 src/mesa/drivers/dri/intel/intel_context.c | 134 ++++++++++++++---------------
 1 file changed, 64 insertions(+), 70 deletions(-)

diff --git a/src/mesa/drivers/dri/intel/intel_context.c b/src/mesa/drivers/dri/intel/intel_context.c
index ccd74baa7c..1dd3ee7d0a 100644
--- a/src/mesa/drivers/dri/intel/intel_context.c
+++ b/src/mesa/drivers/dri/intel/intel_context.c
@@ -68,14 +68,15 @@
 int INTEL_DEBUG = (0);
 #endif
 
-#define need_GL_NV_point_sprite
 #define need_GL_ARB_multisample
+#define need_GL_ARB_occlusion_query
 #define need_GL_ARB_point_parameters
+#define need_GL_ARB_shader_objects
 #define need_GL_ARB_texture_compression
 #define need_GL_ARB_vertex_buffer_object
 #define need_GL_ARB_vertex_program
+#define need_GL_ARB_vertex_shader
 #define need_GL_ARB_window_pos
-#define need_GL_ARB_occlusion_query
 #define need_GL_EXT_blend_color
 #define need_GL_EXT_blend_equation_separate
 #define need_GL_EXT_blend_func_separate
@@ -84,14 +85,13 @@ int INTEL_DEBUG = (0);
 #define need_GL_EXT_fog_coord
 #define need_GL_EXT_framebuffer_object
 #define need_GL_EXT_multi_draw_arrays
+#define need_GL_EXT_point_parameters
 #define need_GL_EXT_secondary_color
-#define need_GL_NV_vertex_program
 #define need_GL_ATI_separate_stencil
-#define need_GL_EXT_point_parameters
+#define need_GL_NV_point_sprite
+#define need_GL_NV_vertex_program
 #define need_GL_VERSION_2_0
 #define need_GL_VERSION_2_1
-#define need_GL_ARB_shader_objects
-#define need_GL_ARB_vertex_shader
 
 #include "extension_helper.h"
 
@@ -344,88 +344,82 @@ intel_viewport(GLcontext *ctx, GLint x, GLint y, GLsizei w, GLsizei h)
  * i965_dri.
  */
 static const struct dri_extension card_extensions[] = {
-   {"GL_ARB_multisample", GL_ARB_multisample_functions},
-   {"GL_ARB_multitexture", NULL},
-   {"GL_ARB_point_parameters", GL_ARB_point_parameters_functions},
-   {"GL_NV_point_sprite", GL_NV_point_sprite_functions},
-   {"GL_ARB_texture_border_clamp", NULL},
-   {"GL_ARB_texture_compression", GL_ARB_texture_compression_functions},
-   {"GL_ARB_texture_cube_map", NULL},
-   {"GL_ARB_texture_env_add", NULL},
-   {"GL_ARB_texture_env_combine", NULL},
-   {"GL_ARB_texture_env_crossbar", NULL},
-   {"GL_ARB_texture_env_dot3", NULL},
-   {"GL_ARB_texture_mirrored_repeat", NULL},
-   {"GL_ARB_texture_non_power_of_two",   NULL },
-   {"GL_ARB_texture_rectangle", NULL},
-   {"GL_NV_texture_rectangle", NULL},
-   {"GL_EXT_texture_rectangle", NULL},
-   {"GL_ARB_point_parameters", NULL}, 
-   {"GL_ARB_vertex_buffer_object", GL_ARB_vertex_buffer_object_functions},
-   {"GL_ARB_vertex_program", GL_ARB_vertex_program_functions},
-   {"GL_ARB_window_pos", GL_ARB_window_pos_functions},
-   {"GL_EXT_blend_color", GL_EXT_blend_color_functions},
-   {"GL_EXT_blend_equation_separate",
-    GL_EXT_blend_equation_separate_functions},
-   {"GL_EXT_blend_func_separate", GL_EXT_blend_func_separate_functions},
-   {"GL_EXT_blend_minmax", GL_EXT_blend_minmax_functions},
-   {"GL_EXT_blend_logic_op", NULL},
-   {"GL_EXT_blend_subtract", NULL},
-   {"GL_EXT_cull_vertex", GL_EXT_cull_vertex_functions},
-   {"GL_EXT_fog_coord", GL_EXT_fog_coord_functions},
-   {"GL_EXT_multi_draw_arrays", GL_EXT_multi_draw_arrays_functions},
-   {"GL_ATI_separate_stencil", GL_ATI_separate_stencil_functions},
-#if 1                           /* XXX FBO temporary? */
-   {"GL_EXT_packed_depth_stencil", NULL},
-#endif
-   {"GL_EXT_secondary_color", GL_EXT_secondary_color_functions},
-   {"GL_EXT_stencil_wrap", NULL},
-   {"GL_EXT_texture_edge_clamp", NULL},
-   {"GL_EXT_texture_env_combine", NULL},
-   {"GL_EXT_texture_env_dot3", NULL},
-   {"GL_EXT_texture_filter_anisotropic", NULL},
-   {"GL_EXT_texture_lod_bias", NULL},
-   {"GL_3DFX_texture_compression_FXT1", NULL},
-   {"GL_APPLE_client_storage", NULL},
-   {"GL_MESA_pack_invert", NULL},
-   {"GL_MESA_ycbcr_texture", NULL},
-   {"GL_NV_blend_square", NULL},
-   {"GL_NV_vertex_program", GL_NV_vertex_program_functions},
-   {"GL_NV_vertex_program1_1", NULL},
-   { "GL_SGIS_generate_mipmap", NULL },
-   {NULL, NULL}
+   { "GL_ARB_multisample",                GL_ARB_multisample_functions },
+   { "GL_ARB_multitexture",               NULL },
+   { "GL_ARB_point_parameters",           GL_ARB_point_parameters_functions },
+   { "GL_ARB_texture_border_clamp",       NULL },
+   { "GL_ARB_texture_compression",        GL_ARB_texture_compression_functions },
+   { "GL_ARB_texture_cube_map",           NULL },
+   { "GL_ARB_texture_env_add",            NULL },
+   { "GL_ARB_texture_env_combine",        NULL },
+   { "GL_ARB_texture_env_crossbar",       NULL },
+   { "GL_ARB_texture_env_dot3",           NULL },
+   { "GL_ARB_texture_mirrored_repeat",    NULL },
+   { "GL_ARB_texture_non_power_of_two",   NULL },
+   { "GL_ARB_texture_rectangle",          NULL },
+   { "GL_ARB_vertex_buffer_object",       GL_ARB_vertex_buffer_object_functions },
+   { "GL_ARB_vertex_program",             GL_ARB_vertex_program_functions },
+   { "GL_ARB_window_pos",                 GL_ARB_window_pos_functions },
+   { "GL_EXT_blend_color",                GL_EXT_blend_color_functions },
+   { "GL_EXT_blend_equation_separate",    GL_EXT_blend_equation_separate_functions },
+   { "GL_EXT_blend_func_separate",        GL_EXT_blend_func_separate_functions },
+   { "GL_EXT_blend_minmax",               GL_EXT_blend_minmax_functions },
+   { "GL_EXT_blend_logic_op",             NULL },
+   { "GL_EXT_blend_subtract",             NULL },
+   { "GL_EXT_cull_vertex",                GL_EXT_cull_vertex_functions },
+   { "GL_EXT_fog_coord",                  GL_EXT_fog_coord_functions },
+   { "GL_EXT_multi_draw_arrays",          GL_EXT_multi_draw_arrays_functions },
+   { "GL_EXT_packed_depth_stencil",       NULL },
+   { "GL_EXT_secondary_color",            GL_EXT_secondary_color_functions },
+   { "GL_EXT_stencil_wrap",               NULL },
+   { "GL_EXT_texture_edge_clamp",         NULL },
+   { "GL_EXT_texture_env_combine",        NULL },
+   { "GL_EXT_texture_env_dot3",           NULL },
+   { "GL_EXT_texture_filter_anisotropic", NULL },
+   { "GL_EXT_texture_lod_bias",           NULL },
+   { "GL_3DFX_texture_compression_FXT1",  NULL },
+   { "GL_APPLE_client_storage",           NULL },
+   { "GL_ATI_separate_stencil",           GL_ATI_separate_stencil_functions },
+   { "GL_MESA_pack_invert",               NULL },
+   { "GL_MESA_ycbcr_texture",             NULL },
+   { "GL_NV_blend_square",                NULL },
+   { "GL_NV_point_sprite",                GL_NV_point_sprite_functions },
+   { "GL_NV_vertex_program",              GL_NV_vertex_program_functions },
+   { "GL_NV_vertex_program1_1",           NULL },
+   { "GL_SGIS_generate_mipmap",           NULL },
+   { NULL, NULL }
 };
 
 static const struct dri_extension brw_extensions[] = {
-   { "GL_ARB_shading_language_100",       GL_VERSION_2_0_functions},
-   { "GL_ARB_shading_language_120",       GL_VERSION_2_1_functions},
-   { "GL_ARB_shader_objects",             GL_ARB_shader_objects_functions},
-   { "GL_ARB_vertex_shader",              GL_ARB_vertex_shader_functions},
-   { "GL_ARB_point_sprite", 		  NULL},
-   { "GL_ARB_fragment_shader",            NULL },
-   { "GL_ARB_draw_buffers",               NULL },
    { "GL_ARB_depth_texture",              NULL },
+   { "GL_ARB_draw_buffers",               NULL },
    { "GL_ARB_fragment_program",           NULL },
+   { "GL_ARB_fragment_program_shadow",    NULL },
+   { "GL_ARB_fragment_shader",            NULL },
+   { "GL_ARB_point_sprite", 		  NULL },
+   { "GL_ARB_shader_objects",             GL_ARB_shader_objects_functions },
+   { "GL_ARB_shading_language_100",       GL_VERSION_2_0_functions },
+   { "GL_ARB_shading_language_120",       GL_VERSION_2_1_functions },
    { "GL_ARB_shadow",                     NULL },
+   { "GL_ARB_vertex_shader",              GL_ARB_vertex_shader_functions },
    { "GL_EXT_shadow_funcs",               NULL },
-   { "GL_ARB_fragment_program_shadow",    NULL },
+   { "GL_EXT_texture_sRGB",		  NULL },
    /* ARB extn won't work if not enabled */
    { "GL_SGIX_depth_texture",             NULL },
-   { "GL_EXT_texture_sRGB",		  NULL},
    { NULL,                                NULL }
 };
 
 #ifdef I915_MMIO_READ
 static const struct dri_extension arb_oc_extensions[] = {
-   {"GL_ARB_occlusion_query",            GL_ARB_occlusion_query_functions},
-   {NULL, NULL}
+   { "GL_ARB_occlusion_query",            GL_ARB_occlusion_query_functions },
+   { NULL, NULL }
 };
 #endif
 
 static const struct dri_extension ttm_extensions[] = {
-   {"GL_EXT_framebuffer_object", GL_EXT_framebuffer_object_functions},
-   {"GL_ARB_pixel_buffer_object", NULL},
-   {NULL, NULL}
+   { "GL_ARB_pixel_buffer_object",        NULL },
+   { "GL_EXT_framebuffer_object",         GL_EXT_framebuffer_object_functions },
+   { NULL, NULL }
 };
 
 /**
-- 
cgit v1.2.3


From 8415d06d90a197e16554dab98d160334fd9f9f93 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Wed, 1 Oct 2008 01:13:40 +0900
Subject: util: Fix util_fast_pow/exp2/log2.

- Use a lookup table for log2.

- Compute (float) (1 << ipart) by tweaking the exponent directly to
avoid integer overflow and float conversion.

- Also table negative exponents to avoid float division and branching.

- Implement util_fast_exp as function of util_fast_exp2.
---
 src/gallium/auxiliary/util/u_math.c |  21 +++++--
 src/gallium/auxiliary/util/u_math.h | 112 +++++++++++++++---------------------
 2 files changed, 64 insertions(+), 69 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_math.c b/src/gallium/auxiliary/util/u_math.c
index 0729114d6a..5b3cab4642 100644
--- a/src/gallium/auxiliary/util/u_math.c
+++ b/src/gallium/auxiliary/util/u_math.c
@@ -30,7 +30,7 @@
 #include "util/u_math.h"
 
 
-
+/** 2^x, for x in [-1.0, 1.0[ */
 float pow2_table[POW2_TABLE_SIZE];
 
 
@@ -38,9 +38,21 @@ static void
 init_pow2_table(void)
 {
    int i;
-   for (i = 0; i < POW2_TABLE_SIZE; i++) {
-      pow2_table[i] = (float) pow(2.0, i / POW2_TABLE_SCALE);
-   }
+   for (i = 0; i < POW2_TABLE_SIZE; i++)
+      pow2_table[i] = (float) pow(2.0, (i - POW2_TABLE_OFFSET) / POW2_TABLE_SCALE);
+}
+
+
+/** log2(x), for x in [1.0, 2.0[ */
+float log2_table[LOG2_TABLE_SIZE];
+
+
+static void 
+init_log2_table(void)
+{
+   unsigned i;
+   for (i = 0; i < LOG2_TABLE_SIZE; i++)
+      log2_table[i] = (float) log2(1.0 + i * (1.0 / LOG2_TABLE_SIZE));
 }
 
 
@@ -53,6 +65,7 @@ util_init_math(void)
    static boolean initialized = FALSE;
    if (!initialized) {
       init_pow2_table();
+      init_log2_table();
       initialized = TRUE;
    }
 }
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index 084655e6dd..be7303e550 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -174,8 +174,10 @@ static INLINE float logf( float f )
 
 
-#define POW2_TABLE_SIZE 256
-#define POW2_TABLE_SCALE ((float) (POW2_TABLE_SIZE-1))
+#define POW2_TABLE_SIZE_LOG2 9
+#define POW2_TABLE_SIZE (1 << POW2_TABLE_SIZE_LOG2)
+#define POW2_TABLE_OFFSET (POW2_TABLE_SIZE/2)
+#define POW2_TABLE_SCALE ((float)(POW2_TABLE_SIZE/2))
 extern float pow2_table[POW2_TABLE_SIZE];
 
 
@@ -186,98 +188,78 @@ util_init_math(void);
 
 union fi {
    float f;
-   int i;
-   unsigned ui;
+   int32_t i;
+   uint32_t ui;
 };
 
 
 /**
- * Fast approximation to exp(x).
- * Compute with base 2 exponents:  exp(x) = exp2(log2(e) * x)
- * Note: log2(e) is a constant, k = 1.44269
- * So, exp(x) = exp2(k * x);
+ * Fast version of 2^x
  * Identity: exp2(a + b) = exp2(a) * exp2(b)
- * Let ipart = int(k*x)
- * Let fpart = k*x - ipart;
- * So, exp2(k*x) = exp2(ipart) * exp2(fpart)
+ * Let ipart = int(x)
+ * Let fpart = x - ipart;
+ * So, exp2(x) = exp2(ipart) * exp2(fpart)
  * Compute exp2(ipart) with i << ipart
  * Compute exp2(fpart) with lookup table.
  */
 static INLINE float
-util_fast_exp(float x)
+util_fast_exp2(float x)
 {
-   if (x >= 0.0f) {
-      float k = 1.44269f; /* = log2(e) */
-      float kx = k * x;
-      int ipart = (int) kx;
-      float fpart = kx - (float) ipart;
-      float y = (float) (1 << ipart)
-         * pow2_table[(int) (fpart * POW2_TABLE_SCALE)];
-      return y;
-   }
-   else {
-      /* exp(-x) = 1.0 / exp(x) */
-      float k = -1.44269f;
-      float kx = k * x;
-      int ipart = (int) kx;
-      float fpart = kx - (float) ipart;
-      float y = (float) (1 << ipart)
-         * pow2_table[(int) (fpart * POW2_TABLE_SCALE)];
-      return 1.0f / y;
-   }
+   int32_t ipart;
+   float fpart, mpart;
+   union fi epart;
+   
+   if(x > 129.00000f)
+      return 3.402823466e+38f;
+   
+   if(x < -126.99999f)
+      return 0.0f;
+
+   ipart = (int32_t) x;
+   fpart = x - (float) ipart;
+   
+   /* same as
+    *   epart.f = (float) (1 << ipart)
+    * but faster and without integer overflow for ipart > 31 */
+   epart.i = (ipart + 127 ) << 23;
+   
+   mpart = pow2_table[POW2_TABLE_OFFSET + (int)(fpart * POW2_TABLE_SCALE)];
+   
+   return epart.f * mpart;
 }
 
 
 /**
- * Fast version of 2^x
- * XXX the above function could be implemented in terms of this one.
+ * Fast approximation to exp(x).
  */
 static INLINE float
-util_fast_exp2(float x)
+util_fast_exp(float x)
 {
-   if (x >= 0.0f) {
-      int ipart = (int) x;
-      float fpart = x - (float) ipart;
-      float y = (float) (1 << ipart)
-         * pow2_table[(int) (fpart * POW2_TABLE_SCALE)];
-      return y;
-   }
-   else {
-      /* exp(-x) = 1.0 / exp(x) */
-      int ipart = (int) -x;
-      float fpart = -x - (float) ipart;
-      float y = (float) (1 << ipart)
-         * pow2_table[(int) (fpart * POW2_TABLE_SCALE)];
-      return 1.0f / y;
-   }
+   const float k = 1.44269f; /* = log2(e) */
+   return util_fast_exp2(k * x);
 }
 
 
-/**
- * Based on code from http://www.flipcode.com/archives/Fast_log_Function.shtml
- */
+#define LOG2_TABLE_SIZE_LOG2 8
+#define LOG2_TABLE_SIZE (1 << LOG2_TABLE_SIZE_LOG2)
+extern float log2_table[LOG2_TABLE_SIZE];
+
+
 static INLINE float
-util_fast_log2(float val)
+util_fast_log2(float x)
 {
    union fi num;
-   int log_2;
-   num.f = val;
-   log_2 = ((num.i >> 23) & 255) - 128;
-   num.i &= ~(255 << 23);
-   num.i += 127 << 23;
-   num.f = ((-1.0f/3) * num.f + 2) * num.f - 2.0f/3;
-   return num.f + log_2;
+   float epart, mpart;
+   num.f = x;
+   epart = (float)(((num.i & 0x7f800000) >> 23) - 127);
+   mpart = log2_table[(num.i & 0x007fffff) >> (23 - LOG2_TABLE_SIZE_LOG2)];
+   return epart + mpart;
 }
 
 
 static INLINE float
 util_fast_pow(float x, float y)
 {
-   /* XXX these tests may need adjustment */
-   if (y >= 3.0f && (-0.02f <= x && x <= 0.02f))
-      return 0.0f;
-   if (y >= 50.0f && (-0.9f <= x && x <= 0.9f))
-      return 0.0f;
    return util_fast_exp2(util_fast_log2(x) * y);
 }
 
-- 
cgit v1.2.3


From 4ae161e9409f8b5d73306bbf382c7b27d5038ab3 Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Tue, 30 Sep 2008 20:50:49 +0200
Subject: Gallivm: port to llvm 2.4.

---
 configs/linux-llvm                                 |   1 +
 src/gallium/auxiliary/gallivm/gallivm_builtins.cpp | 252 ++++++++++-----------
 src/gallium/auxiliary/gallivm/instructions.cpp     |  36 +--
 src/gallium/auxiliary/gallivm/instructions.h       |   2 +-
 src/gallium/auxiliary/gallivm/instructionssoa.cpp  |   6 +-
 src/gallium/auxiliary/gallivm/instructionssoa.h    |   2 +-
 6 files changed, 150 insertions(+), 149 deletions(-)

diff --git a/configs/linux-llvm b/configs/linux-llvm
index 3b32db34d8..489cfd0546 100644
--- a/configs/linux-llvm
+++ b/configs/linux-llvm
@@ -31,4 +31,5 @@ else
   LLVM_CXXFLAGS=
 endif
 
+LD = g++
 GL_LIB_DEPS = $(LLVM_LDFLAGS) $(LLVM_LIBS) $(EXTRA_LIB_PATH) -lX11 -lXext -lm -lpthread -lstdc++
diff --git a/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp b/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp
index 0fc5c4ec5c..fcc5c05794 100644
--- a/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp
+++ b/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp
@@ -1,140 +1,140 @@
 static const unsigned char llvm_builtins_data[] = {
-0x42,0x43,0xc0,0xde,0x21,0x0c,0x00,0x00,0x29,0x02,0x00,0x00,0x01,0x10,0x00,0x00,
+0x42,0x43,0xc0,0xde,0x21,0x0c,0x00,0x00,0x27,0x02,0x00,0x00,0x01,0x10,0x00,0x00,
 0x10,0x00,0x00,0x00,0x07,0x81,0x23,0x91,0x41,0xc8,0x04,0x49,0x06,0x10,0x32,0x39,
 0x92,0x01,0x84,0x0c,0x25,0x05,0x08,0x19,0x1e,0x04,0x8b,0x62,0x80,0x14,0x45,0x02,
 0x42,0x92,0x0b,0x42,0xa4,0x10,0x32,0x14,0x38,0x08,0x18,0x49,0x0a,0x32,0x44,0x24,
 0x48,0x0a,0x90,0x21,0x23,0x44,0x72,0x80,0x8c,0x14,0x21,0x86,0x0a,0x8a,0x0a,0x64,
-0x0c,0x1f,0x00,0x00,0x49,0x18,0x00,0x00,0x02,0x00,0x00,0x00,0x0b,0x04,0x00,0x0c,
-0x00,0x00,0x00,0x00,0x51,0x20,0x00,0x00,0x12,0x00,0x00,0x00,0x32,0x22,0x48,0x09,
-0x20,0x65,0x82,0x84,0x00,0x26,0x45,0x48,0x05,0x09,0x26,0x45,0xc6,0x05,0x42,0x52,
-0x26,0x08,0xae,0x19,0x80,0x61,0x04,0x02,0x98,0x23,0x00,0x83,0x29,0x80,0x21,0x00,
-0xb2,0x73,0x04,0x01,0x51,0x8a,0xf4,0x08,0x92,0xa4,0x39,0x47,0x80,0x50,0x2b,0x03,
-0x00,0xa0,0x08,0x21,0x5c,0x46,0x2b,0x44,0x08,0x21,0xd4,0x40,0x14,0x01,0x80,0x11,
-0x80,0x22,0x88,0x00,0x13,0xa2,0x74,0xb0,0x03,0x3c,0xb0,0x83,0x36,0x80,0x87,0x71,
-0x68,0x03,0x76,0x48,0x07,0x77,0xa8,0x07,0x7c,0x68,0x83,0x73,0x70,0x87,0x7a,0xd8,
-0x70,0x0f,0xe5,0xd0,0x06,0xf0,0xa0,0x07,0x73,0x20,0x07,0x7a,0x30,0x07,0x72,0xa0,
-0x07,0x73,0x20,0x07,0x6d,0x90,0x0e,0x71,0xa0,0x07,0x78,0xa0,0x07,0x78,0xd0,0x06,
-0xe9,0x80,0x07,0x7a,0x80,0x07,0x7a,0x80,0x07,0x6d,0x90,0x0e,0x71,0x60,0x07,0x7a,
-0x10,0x07,0x76,0xa0,0x07,0x71,0x60,0x07,0x6d,0x90,0x0e,0x73,0x20,0x07,0x7a,0x30,
-0x07,0x72,0xa0,0x07,0x73,0x20,0x07,0x6d,0x90,0x0e,0x76,0x40,0x07,0x7a,0x30,0x07,
-0x72,0xa0,0x07,0x76,0x40,0x07,0x6d,0x60,0x0e,0x73,0x20,0x07,0x7a,0x30,0x07,0x72,
-0xa0,0x07,0x73,0x20,0x07,0x6d,0x60,0x0e,0x76,0x40,0x07,0x7a,0x30,0x07,0x72,0xa0,
-0x07,0x76,0x40,0x07,0x6d,0x60,0x0f,0x76,0x40,0x07,0x7a,0x60,0x07,0x74,0xa0,0x07,
-0x76,0x40,0x07,0x6d,0x60,0x0f,0x71,0x20,0x07,0x78,0xa0,0x07,0x71,0x20,0x07,0x78,
-0xa0,0x07,0x71,0x20,0x07,0x78,0xd0,0x06,0xe1,0x00,0x07,0x7a,0x00,0x07,0x7a,0x60,
-0x07,0x74,0xd0,0x06,0xe6,0x80,0x07,0x70,0xa0,0x07,0x71,0x20,0x07,0x78,0xa0,0x07,
-0x71,0x20,0x07,0x78,0xa0,0xf3,0x40,0x88,0x04,0x32,0x32,0x02,0x04,0x20,0x76,0x46,
-0xfc,0x6c,0x48,0x92,0x00,0x40,0x00,0x00,0x00,0x00,0x0c,0x49,0x12,0x20,0x00,0x00,
-0x00,0x00,0x80,0x21,0x89,0x02,0x00,0x01,0x00,0x00,0x00,0x30,0x24,0x59,0x00,0x20,
-0x08,0x00,0x00,0x00,0x86,0x24,0x0a,0x00,0x04,0x00,0x00,0x00,0xc0,0x90,0x84,0x01,
-0x02,0x00,0x00,0x00,0x00,0x18,0x92,0x1c,0x40,0x00,0x00,0x00,0x00,0x00,0x43,0x12,
-0x05,0x00,0x02,0x00,0x00,0x00,0x60,0x48,0x72,0x00,0x01,0x00,0x00,0x00,0x00,0x0c,
-0x49,0x14,0x00,0x08,0x00,0x00,0x00,0x80,0x21,0x49,0x01,0x00,0x41,0x00,0x00,0x00,
-0x90,0x05,0x02,0x00,0x10,0x00,0x00,0x00,0x32,0x1e,0x98,0x10,0x19,0x11,0x4c,0x90,
+0x0c,0x1f,0x00,0x00,0x49,0x18,0x00,0x00,0x03,0x00,0x00,0x00,0x0b,0x84,0xff,0xff,
+0xff,0xff,0x1f,0xc0,0x00,0x00,0x00,0x00,0x51,0x20,0x00,0x00,0x12,0x00,0x00,0x00,
+0x32,0x22,0x48,0x09,0x20,0x65,0x82,0x84,0x00,0x26,0x45,0x48,0x05,0x09,0x26,0x45,
+0xc6,0x05,0x42,0x52,0x26,0x08,0xae,0x19,0x80,0x61,0x04,0x02,0x98,0x23,0x00,0x83,
+0x29,0x80,0x21,0x00,0xb2,0x73,0x04,0x01,0x51,0x8a,0xf4,0x08,0x92,0xa4,0x39,0x47,
+0x80,0x50,0x2b,0x03,0x00,0xa0,0x08,0x21,0x5c,0x46,0x2b,0x44,0x08,0x21,0xd4,0x40,
+0x14,0x01,0x80,0x11,0x80,0x22,0x88,0x00,0x13,0x30,0x7c,0xc0,0x03,0x3b,0xf8,0x05,
+0x3b,0xa0,0x83,0x36,0xa8,0x07,0x77,0x58,0x07,0x77,0x78,0x87,0x7b,0x70,0x87,0x36,
+0x60,0x87,0x74,0x70,0x87,0x7a,0xc0,0x87,0x36,0x38,0x07,0x77,0xa8,0x87,0x0d,0xf7,
+0x50,0x0e,0x6d,0x00,0x0f,0x7a,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,0x7a,0x60,
+0x07,0x74,0xd0,0x06,0xe9,0x10,0x07,0x7a,0x80,0x07,0x7a,0x80,0x07,0x6d,0x90,0x0e,
+0x78,0xa0,0x07,0x78,0xa0,0x07,0x78,0xd0,0x06,0xe9,0x10,0x07,0x76,0xa0,0x07,0x71,
+0x60,0x07,0x7a,0x10,0x07,0x76,0xd0,0x06,0xe9,0x30,0x07,0x72,0xa0,0x07,0x73,0x20,
+0x07,0x7a,0x30,0x07,0x72,0xd0,0x06,0xe9,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,
+0x7a,0x60,0x07,0x74,0xd0,0x06,0xe6,0x30,0x07,0x72,0xa0,0x07,0x73,0x20,0x07,0x7a,
+0x30,0x07,0x72,0xd0,0x06,0xe6,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,0x7a,0x60,
+0x07,0x74,0xd0,0x06,0xf6,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,0x7a,0x60,0x07,
+0x74,0xd0,0x06,0xf6,0x10,0x07,0x72,0x80,0x07,0x7a,0x10,0x07,0x72,0x80,0x07,0x7a,
+0x10,0x07,0x72,0x80,0x07,0x6d,0x10,0x0e,0x70,0xa0,0x07,0x70,0xa0,0x07,0x76,0x40,
+0x07,0x6d,0x60,0x0e,0x78,0x00,0x07,0x7a,0x10,0x07,0x72,0x80,0x07,0x7a,0x10,0x07,
+0x72,0x80,0x07,0x3a,0x0f,0x84,0x48,0x20,0x23,0x24,0x40,0x00,0x62,0x67,0x88,0x9f,
+0x19,0x92,0x24,0x00,0x10,0x04,0x00,0x00,0x00,0x43,0x92,0x04,0x08,0x00,0x00,0x00,
+0x00,0x60,0x48,0xa2,0x00,0x40,0x10,0x00,0x00,0x00,0x0c,0x49,0x16,0x00,0x08,0x02,
+0x00,0x00,0x80,0x21,0x89,0x02,0x00,0x41,0x00,0x00,0x00,0x30,0x24,0x61,0x80,0x00,
+0x00,0x00,0x00,0x00,0x86,0x24,0x07,0x10,0x00,0x00,0x00,0x00,0xc0,0x90,0x44,0x01,
+0x80,0x20,0x00,0x00,0x00,0x18,0x92,0x1c,0x40,0x00,0x00,0x00,0x00,0x00,0x43,0x12,
+0x05,0x00,0x82,0x00,0x00,0x00,0x60,0x48,0x52,0x00,0x40,0x10,0x00,0x00,0x00,0x64,
+0x81,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x32,0x1e,0x98,0x10,0x19,0x11,0x4c,0x90,
 0x8c,0x09,0x26,0x47,0xc6,0x04,0x43,0x8a,0x8a,0x59,0x8b,0x43,0x50,0xd2,0x09,0x02,
 0x81,0xd2,0x73,0x50,0xc9,0x0c,0x2a,0x99,0x41,0x25,0x33,0xa8,0x64,0x56,0x28,0x66,
 0x2d,0x0e,0x41,0xcf,0x2a,0x15,0x04,0x4a,0xcf,0x41,0x25,0x33,0xa8,0x64,0x06,0x95,
 0xcc,0xa0,0x92,0x59,0x01,0x00,0x00,0x00,0x53,0x82,0x26,0x0c,0x04,0x00,0x00,0x00,
 0x22,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
 0x04,0xc6,0x08,0x40,0x10,0x04,0xe1,0x70,0x18,0x23,0x00,0x41,0x10,0x84,0xc3,0x60,
-0x04,0x00,0x00,0x00,0x93,0x0c,0xce,0x43,0x4c,0x31,0x3c,0x8e,0x34,0xc9,0x30,0x41,
-0xc2,0x14,0x03,0x34,0x51,0x93,0x0c,0x4d,0x44,0x4c,0x31,0x44,0x8d,0x35,0x56,0x01,
-0x04,0xc3,0x55,0x21,0x16,0x0e,0x04,0x00,0x0f,0x00,0x00,0x00,0x46,0x41,0x08,0xcc,
-0x73,0x9b,0x05,0x21,0x30,0xcf,0x6e,0x18,0x84,0x00,0x2c,0x8b,0x35,0x04,0x80,0x39,
-0x04,0x81,0x5d,0x20,0x80,0x0f,0x0c,0x43,0xe4,0xd3,0x36,0x81,0x04,0x3e,0x30,0x0c,
-0x91,0x4f,0x5b,0x05,0x12,0xf8,0xc0,0x30,0x44,0x7e,0x7d,0x00,0x05,0xd1,0x4c,0x11,
-0x66,0x12,0x83,0xc0,0x3c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x13,0x04,0x43,0x2c,0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
+0x04,0x00,0x00,0x00,0xc3,0x0d,0xce,0x43,0x4c,0x37,0x3c,0x8e,0x34,0xdc,0x30,0x41,
+0xc2,0x74,0x03,0x34,0x51,0xc3,0x0d,0x4d,0x44,0x4c,0x37,0x44,0x8d,0x35,0x56,0x01,
+0x04,0xc3,0x55,0x21,0x16,0x0e,0x04,0x00,0x0f,0x00,0x00,0x00,0xd6,0x10,0x00,0xe6,
+0x10,0x04,0x76,0x81,0x00,0x3e,0x30,0x0c,0x91,0x4f,0x1b,0x05,0x21,0x30,0x8f,0x6d,
+0x13,0x48,0xe0,0x03,0xc3,0x10,0xf9,0xb4,0x55,0x20,0x81,0x0f,0x0c,0x43,0xe4,0xd7,
+0x66,0x41,0x08,0xcc,0xa3,0x1f,0x40,0x41,0x34,0x53,0x84,0x99,0xc4,0x20,0x30,0x8f,
+0x61,0x10,0x02,0xb0,0x2c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
+0x27,0x00,0x00,0x00,0x13,0x04,0x43,0x2c,0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
 0x24,0x8a,0xa0,0x0c,0x46,0x00,0x4a,0x80,0xc2,0x1c,0x84,0x55,0x55,0xd6,0x1c,0x84,
 0x45,0x51,0x16,0x81,0x19,0x80,0x11,0x80,0x31,0x02,0x10,0x04,0x41,0xfc,0x03,0x00,
-0x63,0x08,0x0d,0x34,0xc9,0x70,0x55,0xc2,0x2c,0x43,0x20,0x60,0x73,0x0c,0xd3,0x15,
+0x63,0x08,0x0d,0x34,0xdc,0x70,0x55,0xc2,0x2c,0x43,0x20,0x60,0x73,0x0c,0xd3,0x15,
 0x8d,0x21,0x34,0xd1,0x18,0x42,0xf3,0x8c,0x55,0x00,0x81,0xa0,0x6d,0x73,0x0c,0x19,
-0xe7,0x60,0x87,0x52,0x38,0x10,0x00,0x00,0x13,0x00,0x00,0x00,0x17,0x60,0x20,0xc5,
-0x74,0x10,0x8d,0x65,0x14,0x13,0xf3,0xd4,0xb4,0x6d,0x14,0x13,0xf3,0xd4,0xb8,0x69,
-0x14,0x13,0xf3,0xd4,0xb6,0x75,0x14,0x13,0xf3,0xd4,0xba,0x35,0x0c,0x13,0xf3,0x9c,
-0x80,0xe4,0x36,0x48,0x81,0x10,0xc3,0x4a,0x4c,0x54,0xd4,0x6c,0x8b,0x23,0x28,0x76,
-0x41,0x4c,0xcc,0xa3,0x1b,0x07,0x21,0x00,0xcb,0x72,0x00,0x05,0xd1,0x4c,0x11,0x66,
-0x18,0x83,0xc0,0x3c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
-0x81,0x00,0x00,0x00,0x13,0x04,0x4d,0x2c,0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0x91,0x11,0x00,0x00,0x00,
-0x63,0x08,0x4d,0x64,0x16,0xc1,0x49,0x86,0xab,0x22,0x66,0x19,0x02,0x01,0x1b,0x43,
-0x70,0xa2,0x59,0x82,0x61,0x0c,0xe1,0x89,0x66,0x09,0x86,0x81,0x0a,0x20,0x0b,0x34,
-0x61,0x8e,0x81,0xda,0xa2,0x31,0x84,0x46,0xb2,0x8e,0xe0,0x24,0x83,0x57,0x11,0xb3,
-0x0c,0x44,0xf1,0x8d,0x21,0x38,0xd2,0x2c,0x81,0x31,0x86,0xf0,0x48,0xb3,0x04,0xc6,
-0x40,0x05,0x00,0x06,0x44,0x18,0x14,0x73,0x0c,0x9c,0x18,0x48,0x63,0x08,0xcd,0x64,
-0x64,0x40,0x70,0x92,0xa1,0x0c,0x2a,0x62,0x96,0xe1,0x40,0xcc,0x60,0x0c,0xc1,0x99,
-0x66,0x09,0x92,0x31,0x84,0x67,0x9a,0x25,0x48,0x06,0x2a,0x80,0x33,0x38,0xd0,0x00,
-0x99,0x63,0x18,0x83,0x34,0x98,0xc6,0x10,0x1a,0xc8,0xd6,0x80,0xe0,0x24,0x03,0x1b,
-0x54,0xc4,0x2c,0x83,0xb2,0xb4,0xc1,0x18,0x82,0x03,0xcd,0x12,0x30,0x63,0x08,0x0f,
-0x34,0x4b,0xc0,0x0c,0x54,0x00,0x6e,0xa0,0xbc,0xc1,0x32,0xc7,0xa0,0x06,0x70,0x00,
-0x61,0x1c,0x84,0x03,0x01,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x76,0x52,0x4c,0xcc,
-0x73,0xd3,0x24,0x05,0x64,0xec,0xcd,0x8d,0xcc,0xe5,0x87,0x46,0xc6,0x50,0x8a,0x89,
-0x79,0xee,0xdb,0x54,0x8a,0x89,0x79,0xee,0xdd,0x1a,0x88,0x89,0x79,0x68,0x73,0x20,
-0x26,0xe6,0xa9,0xed,0x81,0x98,0x98,0xc7,0x36,0x0b,0x62,0x62,0x9e,0xdb,0x32,0x88,
-0x89,0x79,0x72,0xd3,0x20,0x26,0xe6,0xd9,0x8d,0x83,0x98,0x98,0xa7,0xb7,0x95,0x62,
-0x62,0x9e,0xbb,0x27,0x2d,0x20,0x63,0x6f,0x6e,0x64,0x2e,0x3a,0x34,0x35,0x56,0x62,
-0x08,0x4e,0x53,0xd9,0xba,0xb5,0x14,0x02,0xf3,0xe0,0xf5,0x25,0x2c,0x82,0xd3,0x0c,
-0xbe,0xe0,0x34,0xd3,0x8d,0x9b,0x88,0x21,0x38,0xcd,0x60,0xd7,0x24,0x01,0x63,0xec,
-0xcd,0x8d,0xcc,0x45,0x87,0x44,0x80,0x8c,0xbd,0xb9,0x91,0xb9,0xfc,0xc4,0xd0,0x90,
-0x02,0x8c,0xb1,0x37,0x37,0x32,0x97,0x1f,0x73,0x29,0x26,0xe6,0xc1,0x71,0x7b,0x29,
-0x26,0xe6,0xc1,0x77,0xfb,0x28,0x04,0xe6,0xa9,0x6f,0x52,0x01,0x32,0xf6,0xe6,0x46,
-0xe6,0xa2,0x13,0x73,0x63,0x18,0x83,0xc0,0x3c,0xb6,0x41,0x08,0x4e,0x33,0x58,0x47,
-0x31,0x31,0x4f,0x5d,0x1f,0xc3,0x22,0x38,0xcd,0xe0,0x0b,0x4e,0x33,0xe1,0xbc,0xa5,
-0x18,0x82,0xd3,0x0c,0x77,0x6e,0x20,0xc5,0xc4,0x3c,0xb5,0x4e,0x3a,0x40,0xc6,0xde,
-0xdc,0xc8,0x5c,0x7e,0x64,0x70,0x2c,0xa4,0x98,0x98,0xa7,0xee,0xed,0x82,0x10,0x9c,
-0xa6,0xba,0x81,0x44,0x70,0x9a,0xc1,0x17,0x9c,0x66,0x32,0x93,0x42,0x60,0x1e,0x7b,
-0xb7,0x98,0x62,0x62,0x9e,0xbc,0x36,0x16,0x43,0x70,0x9a,0x0a,0xa7,0x6d,0xa4,0x98,
-0x98,0xc7,0xbe,0x8d,0xa4,0x98,0x98,0xc7,0xce,0x0d,0xc6,0x10,0x9c,0x66,0xc0,0x7b,
-0x12,0x02,0x32,0xf6,0xe6,0x46,0xe6,0xa2,0x33,0x13,0x73,0x06,0x8b,0xe0,0x34,0x83,
-0x2f,0x38,0xcd,0x64,0xd3,0xe6,0x61,0x08,0x4e,0x53,0xd5,0xf6,0x01,0x14,0x44,0x33,
-0x45,0x18,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x24,0xca,0x60,0x04,
-0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0xb9,0x61,0x0c,0x04,0x10,0x1e,0xe1,0x19,0xc6,
-0x40,0x02,0xe1,0x11,0x1e,0x00,0x00,0x00,0x63,0x08,0xcd,0x63,0x15,0xc1,0x31,0x84,
-0x06,0xb2,0x8b,0xe0,0x18,0x42,0x13,0x59,0x46,0x70,0x0c,0xa1,0x71,0x6c,0x23,0x38,
-0x16,0x02,0x04,0xc7,0x64,0x61,0x1a,0x37,0x16,0x01,0x04,0x48,0x35,0xc7,0x20,0x79,
-0xcf,0x58,0x04,0x10,0x20,0xd5,0x1c,0xc3,0x07,0x06,0xd0,0x58,0x04,0x10,0x20,0xd5,
-0x1c,0x43,0x18,0x88,0x41,0x34,0x16,0x01,0x04,0x48,0x35,0xc7,0x30,0x06,0x64,0xe0,
-0x98,0x47,0xd0,0xc0,0x80,0xa0,0x89,0x01,0x41,0x23,0x03,0x82,0x63,0x21,0x40,0x70,
-0x50,0x66,0x70,0x06,0x68,0x90,0x06,0x58,0x06,0xe1,0x40,0x00,0x25,0x00,0x00,0x00,
-0x56,0x52,0x4c,0xcc,0x73,0xd3,0x56,0x41,0x4c,0xcc,0x53,0xdb,0x05,0x31,0x31,0xcf,
-0x6d,0x19,0xc4,0xc4,0x3c,0xba,0x6d,0x10,0x13,0xf3,0xf4,0xd6,0x41,0x08,0xc0,0xb2,
-0x18,0x46,0x21,0x38,0x4d,0x85,0x9b,0x46,0x21,0x38,0x4d,0xb5,0x9b,0x8a,0x21,0x00,
-0xcb,0x82,0xdf,0x66,0x62,0x08,0x4e,0x53,0xdd,0xb7,0x9d,0x18,0x82,0xd3,0x54,0xb7,
-0x6e,0x28,0x86,0xe0,0x34,0xd5,0xdd,0xdb,0x47,0x31,0x31,0x4f,0x9d,0x9b,0x87,0x21,
-0x00,0xcb,0x52,0xdf,0x06,0x62,0x08,0xc0,0xb2,0xd4,0xbc,0x59,0x10,0x82,0xd3,0x54,
-0x96,0x62,0x08,0x4e,0x53,0xe1,0xb6,0x85,0x14,0x13,0xf3,0xd8,0xb4,0x8d,0x14,0x13,
-0xf3,0xd8,0xb9,0x89,0x18,0x02,0xb0,0x2c,0xf6,0x6d,0x24,0x86,0x00,0x2c,0x8b,0xcd,
-0x1b,0x87,0x21,0x38,0x4d,0x55,0xd3,0xd6,0x30,0x54,0xc0,0x72,0x00,0x05,0xd1,0x4c,
-0x11,0x06,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x19,0x00,0x00,0x00,
-0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x24,0x4a,0x60,0x04,
-0x80,0xc2,0x0c,0x00,0x00,0x00,0x00,0x00,0x63,0x08,0xcd,0x33,0x16,0x01,0x04,0x48,
-0x34,0xc7,0x00,0x49,0xcf,0x58,0x04,0x10,0x28,0xd1,0x1c,0xc3,0x44,0x39,0x58,0x85,
-0x03,0x01,0x00,0x00,0x0a,0x00,0x00,0x00,0x16,0x41,0x4c,0xcc,0x63,0xdb,0x04,0x31,
-0x31,0x4f,0x6e,0x0d,0x43,0x05,0x2c,0x07,0x50,0x10,0xcd,0x14,0x61,0x56,0x41,0x4c,
-0xcc,0xd3,0x1b,0x45,0x21,0x00,0xcb,0xb2,0x9b,0x04,0x21,0x00,0xcb,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x1b,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,
-0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,
-0xc2,0x0c,0x00,0x00,0x63,0x08,0xcd,0x33,0x16,0x01,0x04,0xca,0x34,0xc7,0x20,0x51,
-0xcf,0x1c,0x43,0x45,0x41,0x73,0x0c,0x16,0x15,0xcd,0x31,0x5c,0x94,0x83,0x58,0x38,
-0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x86,0x51,0x4c,0xcc,0x53,0xe7,0x76,0x51,
-0x4c,0xcc,0x53,0xdb,0x36,0x41,0x4c,0xcc,0x63,0x5b,0x05,0x31,0x31,0x8f,0x6e,0x0d,
-0x43,0x05,0x2c,0x66,0x41,0x4c,0xcc,0xd3,0x1f,0x40,0x41,0x34,0x53,0x84,0x19,0x05,
-0x21,0x00,0xcb,0x02,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x13,0x04,0x45,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x24,0xca,0xa0,0x04,
-0x46,0x00,0x8a,0x80,0xc0,0x08,0x00,0x00,0x63,0x08,0x0d,0x34,0xc9,0x30,0x49,0xc4,
-0x2c,0x03,0x11,0x50,0x63,0x08,0xcd,0x33,0xc9,0x50,0x49,0xc4,0x2c,0x03,0x21,0x58,
-0x63,0x08,0x4d,0x34,0xc9,0x70,0x49,0xc4,0x2c,0x03,0x31,0x60,0x63,0x08,0x8d,0x33,
-0xc9,0x90,0x49,0x84,0x69,0x22,0x70,0xc3,0x27,0x1c,0x08,0x00,0x1a,0x00,0x00,0x00,
-0x96,0x51,0x4c,0xcc,0x53,0xdf,0x66,0x41,0x08,0xcc,0x83,0xdb,0x04,0x31,0x31,0x4f,
-0x6d,0x15,0xc4,0xc4,0x3c,0xb7,0x61,0x10,0x02,0xf3,0xf0,0x47,0x20,0xb9,0x0d,0x52,
-0x20,0xc4,0xb0,0x12,0x13,0x15,0x35,0xdb,0xe2,0x08,0x8a,0x5d,0x10,0x13,0xf3,0xec,
-0x37,0x90,0x2c,0x4e,0xf4,0x47,0x87,0x54,0xd7,0x17,0x70,0x2c,0x4e,0xf4,0x47,0x87,
-0x74,0x02,0xc8,0xe2,0x44,0x7f,0x74,0x48,0xb9,0x69,0x14,0x02,0xf3,0xd4,0xb8,0x6d,
-0x18,0x11,0x31,0x55,0xc0,0x62,0x0d,0x43,0x05,0x2c,0x07,0x50,0x10,0xcd,0x14,0x61,
-0x46,0x31,0x08,0xcc,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x71,0x20,0x00,0x00,
-0x12,0x00,0x00,0x00,0x66,0x40,0x54,0x82,0x23,0x59,0xc2,0x20,0x09,0x92,0x1d,0x18,
-0x4f,0x84,0x34,0x53,0x61,0x03,0xc4,0xe3,0x58,0x85,0x05,0x14,0xbe,0x34,0x45,0xb5,
-0x21,0x10,0x82,0x23,0x15,0x46,0x30,0x2c,0xc8,0x64,0x02,0x06,0xf0,0x3c,0x91,0x73,
-0x19,0x00,0xe1,0x4b,0x53,0x64,0x0a,0x84,0x84,0x34,0x85,0x31,0x10,0x0a,0xb2,0x3c,
-0x56,0x30,0x08,0xcc,0x63,0x0b,0x44,0x25,0x21,0x0d,0x00,0x00,0x00,0x00,0x00,0x00};
+0xe7,0x60,0x87,0x52,0x38,0x10,0x00,0x00,0x10,0x00,0x00,0x00,0x27,0x50,0x20,0x05,
+0xd1,0x0c,0x17,0x60,0x20,0xc5,0x74,0x10,0x8d,0x65,0x14,0x13,0xf3,0xd4,0xb4,0x6d,
+0x14,0x13,0xf3,0xd4,0xb8,0x69,0x14,0x13,0xf3,0xd4,0xb6,0x75,0x14,0x13,0xf3,0xd4,
+0xba,0x35,0x0c,0x13,0xf3,0xd8,0x05,0x31,0x31,0x8f,0x6e,0x1c,0x84,0x00,0x2c,0xcb,
+0x01,0x14,0x44,0x33,0x45,0x98,0x61,0x0c,0x02,0xf3,0x00,0x00,0x00,0x00,0x00,0x00,
+0x61,0x20,0x00,0x00,0x81,0x00,0x00,0x00,0x13,0x04,0x4d,0x2c,0x10,0x00,0x00,0x00,
+0x04,0x00,0x00,0x00,0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0x91,
+0x11,0x00,0x00,0x00,0x63,0x08,0x4d,0x64,0x16,0xc1,0xe1,0x86,0xab,0x22,0x66,0x19,
+0x02,0x01,0x1b,0x43,0x70,0xa2,0x59,0x82,0x61,0x0c,0xe1,0x89,0x66,0x09,0x86,0x81,
+0x0a,0x20,0x0b,0x34,0x61,0x8e,0x81,0xda,0xa2,0x31,0x84,0x46,0xb2,0x8e,0xe0,0x70,
+0x83,0x57,0x11,0xb3,0x0c,0x44,0xf1,0x8d,0x21,0x38,0xd2,0x2c,0x81,0x31,0x86,0xf0,
+0x48,0xb3,0x04,0xc6,0x40,0x05,0x00,0x06,0x44,0x18,0x14,0x73,0x0c,0x9c,0x18,0x48,
+0x63,0x08,0xcd,0x64,0x64,0x40,0x70,0xb8,0xa1,0x0c,0x2a,0x62,0x96,0xe1,0x40,0xcc,
+0x60,0x0c,0xc1,0x99,0x66,0x09,0x92,0x31,0x84,0x67,0x9a,0x25,0x48,0x06,0x2a,0x80,
+0x33,0x38,0xd0,0x00,0x99,0x63,0x18,0x83,0x34,0x98,0xc6,0x10,0x1a,0xc8,0xd6,0x80,
+0xe0,0x70,0x03,0x1b,0x54,0xc4,0x2c,0x83,0xb2,0xb4,0xc1,0x18,0x82,0x03,0xcd,0x12,
+0x30,0x63,0x08,0x0f,0x34,0x4b,0xc0,0x0c,0x54,0x00,0x6e,0xa0,0xbc,0xc1,0x32,0xc7,
+0xa0,0x06,0x70,0x00,0x61,0x1c,0x84,0x03,0x01,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
+0x76,0x52,0x4c,0xcc,0x73,0xd3,0x24,0x05,0x64,0xec,0xcd,0x8d,0xcc,0xe5,0x87,0x46,
+0xc6,0x50,0x8a,0x89,0x79,0xee,0xdb,0x54,0x8a,0x89,0x79,0xee,0xdd,0x1a,0x88,0x89,
+0x79,0x68,0x73,0x20,0x26,0xe6,0xa9,0xed,0x81,0x98,0x98,0xc7,0x36,0x0b,0x62,0x62,
+0x9e,0xdb,0x32,0x88,0x89,0x79,0x72,0xd3,0x20,0x26,0xe6,0xd9,0x8d,0x83,0x98,0x98,
+0xa7,0xb7,0x95,0x62,0x62,0x9e,0xbb,0x27,0x2d,0x20,0x63,0x6f,0x6e,0x64,0x2e,0x3a,
+0x34,0x35,0x56,0x62,0x08,0x4e,0x53,0xd9,0xba,0xb5,0x14,0x02,0xf3,0xe0,0xf5,0x25,
+0x2c,0x82,0xd3,0x0c,0xbe,0xe0,0x34,0xd3,0x8d,0x9b,0x88,0x21,0x38,0xcd,0x60,0xd7,
+0x24,0x01,0x63,0xec,0xcd,0x8d,0xcc,0x45,0x87,0x44,0x80,0x8c,0xbd,0xb9,0x91,0xb9,
+0xfc,0xc4,0xd0,0x90,0x02,0x8c,0xb1,0x37,0x37,0x32,0x97,0x1f,0x73,0x29,0x26,0xe6,
+0xc1,0x71,0x7b,0x29,0x26,0xe6,0xc1,0x77,0xfb,0x28,0x04,0xe6,0xa9,0x6f,0x52,0x01,
+0x32,0xf6,0xe6,0x46,0xe6,0xa2,0x13,0x73,0x63,0x18,0x83,0xc0,0x3c,0xb6,0x41,0x08,
+0x4e,0x33,0x58,0x47,0x31,0x31,0x4f,0x5d,0x1f,0xc3,0x22,0x38,0xcd,0xe0,0x0b,0x4e,
+0x33,0xe1,0xbc,0xa5,0x18,0x82,0xd3,0x0c,0x77,0x6e,0x20,0xc5,0xc4,0x3c,0xb5,0x4e,
+0x3a,0x40,0xc6,0xde,0xdc,0xc8,0x5c,0x7e,0x64,0x70,0x2c,0xa4,0x98,0x98,0xa7,0xee,
+0x6f,0x20,0x11,0x9c,0x66,0xf0,0x05,0xa7,0x99,0xec,0x82,0x10,0x9c,0xa6,0x32,0x93,
+0x42,0x60,0x1e,0x7b,0xb7,0x98,0x62,0x62,0x9e,0xbc,0x36,0x16,0x43,0x70,0x9a,0x0a,
+0xa7,0x6d,0xa4,0x98,0x98,0xc7,0xbe,0x8d,0xa4,0x98,0x98,0xc7,0xce,0x0d,0xc6,0x10,
+0x9c,0x66,0xc0,0x7b,0x12,0x02,0x32,0xf6,0xe6,0x46,0xe6,0xa2,0x33,0x13,0x73,0x06,
+0x8b,0xe0,0x34,0x83,0x2f,0x38,0xcd,0x64,0xd3,0x07,0x50,0x10,0xcd,0x14,0x61,0xe6,
+0x61,0x08,0x4e,0x53,0xd5,0x36,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
+0x4a,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
+0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0xb9,0x61,0x0c,0x04,0x10,
+0x1e,0xe1,0x19,0xc6,0x40,0x02,0xe1,0x11,0x1e,0x00,0x00,0x00,0x63,0x08,0xcd,0x63,
+0x15,0xc1,0x31,0x84,0x06,0xb2,0x8b,0xe0,0x18,0x42,0x13,0x59,0x46,0x70,0x0c,0xa1,
+0x71,0x6c,0x23,0x38,0x16,0x02,0x04,0xc7,0x64,0x61,0x1a,0x37,0x16,0x01,0x04,0x48,
+0x35,0xc7,0x20,0x79,0xcf,0x58,0x04,0x10,0x20,0xd5,0x1c,0xc3,0x07,0x06,0xd0,0x58,
+0x04,0x10,0x20,0xd5,0x1c,0x43,0x18,0x88,0x41,0x34,0x16,0x01,0x04,0x48,0x35,0xc7,
+0x30,0x06,0x64,0xe0,0x98,0x47,0xd0,0xc0,0x80,0xa0,0x89,0x01,0x41,0x23,0x03,0x82,
+0x63,0x21,0x40,0x70,0x50,0x66,0x70,0x06,0x68,0x90,0x06,0x58,0x06,0xe1,0x40,0x00,
+0x25,0x00,0x00,0x00,0x56,0x52,0x4c,0xcc,0x73,0xd3,0x56,0x41,0x4c,0xcc,0x53,0xdb,
+0x05,0x31,0x31,0xcf,0x6d,0x19,0xc4,0xc4,0x3c,0xba,0x6d,0x10,0x13,0xf3,0xf4,0xd6,
+0x41,0x08,0xc0,0xb2,0x18,0x46,0x21,0x38,0x4d,0x85,0x9b,0x46,0x21,0x38,0x4d,0xb5,
+0x9b,0x8a,0x21,0x00,0xcb,0x82,0xdf,0x66,0x62,0x08,0x4e,0x53,0xdd,0xb7,0x9d,0x18,
+0x82,0xd3,0x54,0xb7,0x6e,0x28,0x86,0xe0,0x34,0xd5,0xdd,0xdb,0x47,0x31,0x31,0x4f,
+0x9d,0x9b,0x87,0x21,0x00,0xcb,0x52,0xdf,0x06,0x62,0x08,0xc0,0xb2,0xd4,0xbc,0x59,
+0x10,0x82,0xd3,0x54,0x96,0x62,0x08,0x4e,0x53,0xe1,0xb6,0x85,0x14,0x13,0xf3,0xd8,
+0xb4,0x8d,0x14,0x13,0xf3,0xd8,0xb9,0x89,0x18,0x02,0xb0,0x2c,0xf6,0x6d,0x24,0x86,
+0x00,0x2c,0x8b,0xcd,0x1b,0x87,0x21,0x38,0x4d,0x55,0xd3,0xd6,0x30,0x54,0xc0,0x72,
+0x00,0x05,0xd1,0x4c,0x11,0x06,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
+0x19,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
+0x24,0x4a,0x60,0x04,0x80,0xc2,0x0c,0x00,0x00,0x00,0x00,0x00,0x63,0x08,0xcd,0x33,
+0x16,0x01,0x04,0x48,0x34,0xc7,0x00,0x49,0xcf,0x58,0x04,0x10,0x28,0xd1,0x1c,0xc3,
+0x44,0x39,0x58,0x85,0x03,0x01,0x00,0x00,0x0a,0x00,0x00,0x00,0x26,0x41,0x08,0xc0,
+0xb2,0x18,0x45,0x21,0x00,0xcb,0xb2,0x5b,0x04,0x31,0x31,0x8f,0x6d,0x13,0xc4,0xc4,
+0x3c,0xb9,0x35,0x0c,0x15,0xb0,0x58,0x05,0x31,0x31,0x4f,0x7f,0x00,0x05,0xd1,0x4c,
+0x11,0x06,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x1b,0x00,0x00,0x00,
+0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x24,0xca,0x60,0x04,
+0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0x00,0x63,0x08,0xcd,0x33,0x16,0x01,0x04,0xca,
+0x34,0xc7,0x20,0x51,0xcf,0x1c,0x43,0x45,0x41,0x73,0x0c,0x16,0x15,0xcd,0x31,0x5c,
+0x94,0x83,0x58,0x38,0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x76,0x51,0x4c,0xcc,
+0x53,0xdb,0x86,0x51,0x4c,0xcc,0x53,0xe7,0x36,0x41,0x4c,0xcc,0x63,0x5b,0x05,0x31,
+0x31,0x8f,0x6e,0x16,0xc4,0xc4,0x3c,0xbd,0x51,0x10,0x02,0xb0,0x2c,0xd6,0x30,0x54,
+0xc0,0x72,0x00,0x05,0xd1,0x4c,0x11,0x06,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
+0x2c,0x00,0x00,0x00,0x13,0x04,0x45,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
+0x24,0xca,0xa0,0x04,0x46,0x00,0x8a,0x80,0xc0,0x08,0x00,0x00,0x63,0x08,0x0d,0x34,
+0xdc,0x30,0x49,0xc4,0x2c,0x03,0x11,0x50,0x63,0x08,0xcd,0x33,0xdc,0x50,0x49,0xc4,
+0x2c,0x03,0x21,0x58,0x63,0x08,0x4d,0x34,0xdc,0x70,0x49,0xc4,0x2c,0x03,0x31,0x60,
+0x63,0x08,0x8d,0x33,0xdc,0x90,0x49,0x84,0x69,0x22,0x70,0xc3,0x27,0x1c,0x08,0x00,
+0x17,0x00,0x00,0x00,0x96,0x51,0x4c,0xcc,0x53,0xdf,0x66,0x41,0x08,0xcc,0x83,0xdb,
+0x04,0x31,0x31,0x4f,0x6d,0x15,0xc4,0xc4,0x3c,0xb7,0x61,0x10,0x02,0xf3,0xf0,0x76,
+0x41,0x4c,0xcc,0xb3,0x1f,0x81,0x11,0x11,0x13,0x15,0x35,0x37,0x90,0x2c,0x4e,0xf4,
+0x47,0x87,0x54,0xd7,0x17,0x70,0x2c,0x4e,0xf4,0x47,0x87,0x74,0x02,0xc8,0xe2,0x44,
+0x7f,0x74,0x48,0xb9,0x69,0x14,0x02,0xf3,0xd4,0xb8,0x6d,0x18,0x11,0x31,0x55,0xc0,
+0x62,0x0d,0x43,0x05,0x2c,0x07,0x50,0x10,0xcd,0x14,0x61,0x46,0x31,0x08,0xcc,0x03,
+0x00,0x00,0x00,0x00,0x71,0x20,0x00,0x00,0x12,0x00,0x00,0x00,0x66,0x40,0x54,0x82,
+0x23,0x19,0xc3,0xa0,0x20,0x8b,0x1d,0x18,0x4f,0x84,0x34,0x53,0x61,0x03,0xc4,0xe3,
+0x58,0x85,0x05,0x14,0xbe,0x34,0x45,0xb5,0x21,0x10,0x82,0x23,0x15,0x46,0x30,0x2c,
+0xc8,0x64,0x02,0x06,0xf0,0x3c,0x91,0x73,0x19,0x00,0xe1,0x4b,0x53,0x64,0x0a,0x84,
+0x84,0x34,0x85,0x25,0x0c,0x92,0x20,0x59,0xc1,0x20,0x30,0x8f,0x2d,0x10,0x95,0x84,
+0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00};
diff --git a/src/gallium/auxiliary/gallivm/instructions.cpp b/src/gallium/auxiliary/gallivm/instructions.cpp
index 3eaf9aacf6..599975d5ad 100644
--- a/src/gallium/auxiliary/gallivm/instructions.cpp
+++ b/src/gallium/auxiliary/gallivm/instructions.cpp
@@ -923,7 +923,7 @@ llvm::Value * Instructions::callCeil(llvm::Value *val)
       // predeclare the intrinsic
       std::vector<const Type*> ceilArgs;
       ceilArgs.push_back(Type::FloatTy);
-      PAListPtr ceilPal;
+      AttrListPtr ceilPal;
       FunctionType* ceilType = FunctionType::get(
          /*Result=*/Type::FloatTy,
          /*Params=*/ceilArgs,
@@ -933,7 +933,7 @@ llvm::Value * Instructions::callCeil(llvm::Value *val)
          /*Linkage=*/GlobalValue::ExternalLinkage,
          /*Name=*/"ceilf", m_mod);
       m_llvmCeil->setCallingConv(CallingConv::C);
-      m_llvmCeil->setParamAttrs(ceilPal);
+      m_llvmCeil->setAttributes(ceilPal);
    }
    CallInst *call =  m_builder.CreateCall(m_llvmCeil, val,
                                           name("ceilf"));
@@ -948,7 +948,7 @@ llvm::Value *Instructions::callFAbs(llvm::Value *val)
       // predeclare the intrinsic
       std::vector<const Type*> fabsArgs;
       fabsArgs.push_back(Type::FloatTy);
-      PAListPtr fabsPal;
+      AttrListPtr fabsPal;
       FunctionType* fabsType = FunctionType::get(
          /*Result=*/Type::FloatTy,
          /*Params=*/fabsArgs,
@@ -958,7 +958,7 @@ llvm::Value *Instructions::callFAbs(llvm::Value *val)
          /*Linkage=*/GlobalValue::ExternalLinkage,
          /*Name=*/"fabs", m_mod);
       m_llvmFAbs->setCallingConv(CallingConv::C);
-      m_llvmFAbs->setParamAttrs(fabsPal);
+      m_llvmFAbs->setAttributes(fabsPal);
    }
    CallInst *call = m_builder.CreateCall(m_llvmFAbs, val,
                                          name("fabs"));
@@ -973,7 +973,7 @@ llvm::Value * Instructions::callFExp(llvm::Value *val)
       // predeclare the intrinsic
       std::vector<const Type*> fexpArgs;
       fexpArgs.push_back(Type::FloatTy);
-      PAListPtr fexpPal;
+      AttrListPtr fexpPal;
       FunctionType* fexpType = FunctionType::get(
          /*Result=*/Type::FloatTy,
          /*Params=*/fexpArgs,
@@ -983,7 +983,7 @@ llvm::Value * Instructions::callFExp(llvm::Value *val)
          /*Linkage=*/GlobalValue::ExternalLinkage,
          /*Name=*/"expf", m_mod);
       m_llvmFexp->setCallingConv(CallingConv::C);
-      m_llvmFexp->setParamAttrs(fexpPal);
+      m_llvmFexp->setAttributes(fexpPal);
    }
    CallInst *call = m_builder.CreateCall(m_llvmFexp, val,
                                          name("expf"));
@@ -998,7 +998,7 @@ llvm::Value * Instructions::callFLog(llvm::Value *val)
       // predeclare the intrinsic
       std::vector<const Type*> flogArgs;
       flogArgs.push_back(Type::FloatTy);
-      PAListPtr flogPal;
+      AttrListPtr flogPal;
       FunctionType* flogType = FunctionType::get(
          /*Result=*/Type::FloatTy,
          /*Params=*/flogArgs,
@@ -1008,7 +1008,7 @@ llvm::Value * Instructions::callFLog(llvm::Value *val)
          /*Linkage=*/GlobalValue::ExternalLinkage,
          /*Name=*/"logf", m_mod);
       m_llvmFlog->setCallingConv(CallingConv::C);
-      m_llvmFlog->setParamAttrs(flogPal);
+      m_llvmFlog->setAttributes(flogPal);
    }
    CallInst *call = m_builder.CreateCall(m_llvmFlog, val,
                                          name("logf"));
@@ -1023,7 +1023,7 @@ llvm::Value * Instructions::callFloor(llvm::Value *val)
       // predeclare the intrinsic
       std::vector<const Type*> floorArgs;
       floorArgs.push_back(Type::FloatTy);
-      PAListPtr floorPal;
+      AttrListPtr floorPal;
       FunctionType* floorType = FunctionType::get(
          /*Result=*/Type::FloatTy,
          /*Params=*/floorArgs,
@@ -1033,7 +1033,7 @@ llvm::Value * Instructions::callFloor(llvm::Value *val)
          /*Linkage=*/GlobalValue::ExternalLinkage,
          /*Name=*/"floorf", m_mod);
       m_llvmFloor->setCallingConv(CallingConv::C);
-      m_llvmFloor->setParamAttrs(floorPal);
+      m_llvmFloor->setAttributes(floorPal);
    }
    CallInst *call =  m_builder.CreateCall(m_llvmFloor, val,
                                           name("floorf"));
@@ -1048,7 +1048,7 @@ llvm::Value *Instructions::callFSqrt(llvm::Value *val)
       // predeclare the intrinsic
       std::vector<const Type*> fsqrtArgs;
       fsqrtArgs.push_back(Type::FloatTy);
-      PAListPtr fsqrtPal;
+      AttrListPtr fsqrtPal;
       FunctionType* fsqrtType = FunctionType::get(
          /*Result=*/Type::FloatTy,
          /*Params=*/fsqrtArgs,
@@ -1058,7 +1058,7 @@ llvm::Value *Instructions::callFSqrt(llvm::Value *val)
          /*Linkage=*/GlobalValue::ExternalLinkage,
          /*Name=*/"llvm.sqrt.f32", m_mod);
       m_llvmFSqrt->setCallingConv(CallingConv::C);
-      m_llvmFSqrt->setParamAttrs(fsqrtPal);
+      m_llvmFSqrt->setAttributes(fsqrtPal);
    }
    CallInst *call = m_builder.CreateCall(m_llvmFSqrt, val,
                                          name("sqrt"));
@@ -1074,7 +1074,7 @@ llvm::Value * Instructions::callPow(llvm::Value *val1, llvm::Value *val2)
       std::vector<const Type*> powArgs;
       powArgs.push_back(Type::FloatTy);
       powArgs.push_back(Type::FloatTy);
-      PAListPtr powPal;
+      AttrListPtr powPal;
       FunctionType* powType = FunctionType::get(
          /*Result=*/Type::FloatTy,
          /*Params=*/powArgs,
@@ -1084,7 +1084,7 @@ llvm::Value * Instructions::callPow(llvm::Value *val1, llvm::Value *val2)
          /*Linkage=*/GlobalValue::ExternalLinkage,
          /*Name=*/"llvm.pow.f32", m_mod);
       m_llvmPow->setCallingConv(CallingConv::C);
-      m_llvmPow->setParamAttrs(powPal);
+      m_llvmPow->setAttributes(powPal);
    }
    std::vector<Value*> params;
    params.push_back(val1);
@@ -1126,7 +1126,7 @@ llvm::Value * Instructions::constVector(float x, float y, float z, float w)
 llvm::Function * Instructions::declarePrintf()
 {
    std::vector<const Type*> args;
-   PAListPtr params;
+   AttrListPtr params;
    FunctionType* funcTy = FunctionType::get(
       /*Result=*/IntegerType::get(32),
       /*Params=*/args,
@@ -1136,7 +1136,7 @@ llvm::Function * Instructions::declarePrintf()
       /*Linkage=*/GlobalValue::ExternalLinkage,
       /*Name=*/"printf", m_mod);
    func_printf->setCallingConv(CallingConv::C);
-   func_printf->setParamAttrs(params);
+   func_printf->setAttributes(params);
    return func_printf;
 }
 
@@ -1148,7 +1148,7 @@ llvm::Function * Instructions::declareFunc(int label)
    args.push_back(vecPtr);
    args.push_back(vecPtr);
    args.push_back(vecPtr);
-   PAListPtr params;
+   AttrListPtr params;
    FunctionType *funcType = FunctionType::get(
       /*Result=*/Type::VoidTy,
       /*Params=*/args,
@@ -1159,7 +1159,7 @@ llvm::Function * Instructions::declareFunc(int label)
       /*Linkage=*/GlobalValue::ExternalLinkage,
       /*Name=*/name.c_str(), m_mod);
    func->setCallingConv(CallingConv::C);
-   func->setParamAttrs(params);
+   func->setAttributes(params);
    return func;
 }
 
diff --git a/src/gallium/auxiliary/gallivm/instructions.h b/src/gallium/auxiliary/gallivm/instructions.h
index c3b28e9746..e18571251e 100644
--- a/src/gallium/auxiliary/gallivm/instructions.h
+++ b/src/gallium/auxiliary/gallivm/instructions.h
@@ -146,7 +146,7 @@ private:
    llvm::Module             *m_mod;
    llvm::Function           *m_func;
    char                      m_name[32];
-   llvm::IRBuilder           m_builder;
+   llvm::IRBuilder<>         m_builder;
    int                       m_idx;
 
    llvm::VectorType *m_floatVecType;
diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.cpp b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
index 9a3ed9f538..5863f37095 100644
--- a/src/gallium/auxiliary/gallivm/instructionssoa.cpp
+++ b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
@@ -259,7 +259,7 @@ void InstructionsSoa::createBuiltins()
 {
    MemoryBuffer *buffer = MemoryBuffer::getMemBuffer(
       (const char*)&soabuiltins_data[0],
-      (const char*)&soabuiltins_data[Elements(soabuiltins_data)-1]);
+      (const char*)&soabuiltins_data[Elements(soabuiltins_data)]);
    m_builtins = ParseBitcodeFile(buffer);
    std::cout<<"Builtins created at "<<m_builtins<<std::endl;
    assert(m_builtins);
@@ -458,8 +458,8 @@ void InstructionsSoa::injectFunction(llvm::Function *originalFunc, int op)
       func = Function::Create(originalFunc->getFunctionType(), GlobalValue::ExternalLinkage,
                               originalFunc->getName(), currentModule());
       func->setCallingConv(CallingConv::C);
-      const PAListPtr pal;
-      func->setParamAttrs(pal);
+      const AttrListPtr pal;
+      func->setAttributes(pal);
       currentModule()->dump();
    } else {
       DenseMap<const Value*, Value *> val;
diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.h b/src/gallium/auxiliary/gallivm/instructionssoa.h
index 3e20b652dd..20cab3b3bb 100644
--- a/src/gallium/auxiliary/gallivm/instructionssoa.h
+++ b/src/gallium/auxiliary/gallivm/instructionssoa.h
@@ -96,7 +96,7 @@ private:
                                          const std::vector<llvm::Value*> in3);
    void injectFunction(llvm::Function *originalFunc, int op = TGSI_OPCODE_LAST);
 private:
-   llvm::IRBuilder  m_builder;
+   llvm::IRBuilder<>  m_builder;
    StorageSoa *m_storage;
 
    std::map<int, std::string> m_functionsMap;
-- 
cgit v1.2.3


From 5e585719ebab17959d972e2e69c04203ecd3f2f3 Mon Sep 17 00:00:00 2001
From: Jonathan White <jwhite@tungstengraphics.com>
Date: Tue, 30 Sep 2008 14:07:09 -0600
Subject: cell: Moved X86 checks to wrap #include section so that Cell targets
 will compile again.

---
 src/gallium/auxiliary/tgsi/tgsi_sse2.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index 4fdad3a5c7..94b2df2dbb 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -25,6 +25,8 @@
  * 
  **************************************************************************/
 
+#ifdef PIPE_ARCH_X86
+
 #include "pipe/p_debug.h"
 #include "pipe/p_shader_tokens.h"
 #include "util/u_math.h"
@@ -36,8 +38,6 @@
 
 #include "rtasm/rtasm_x86sse.h"
 
-#ifdef PIPE_ARCH_X86
-
 /* for 1/sqrt()
  *
  * This costs about 100fps (close to 10%) in gears:
-- 
cgit v1.2.3


From e3378790bb43d65689349545a14c8651677aff41 Mon Sep 17 00:00:00 2001
From: Jonathan White <jwhite@tungstengraphics.com>
Date: Tue, 30 Sep 2008 15:38:38 -0600
Subject: cell: Fixed usage of MAX_INSTRUCTIONS to use new
 MAX_PROGRAM_INSTRUCTIONS instead of old MAX_NV_XXX definitions in order to
 allow Cell TGSI fragment program generator to work again.

---
 src/mesa/shader/arbprogparse.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/src/mesa/shader/arbprogparse.c b/src/mesa/shader/arbprogparse.c
index f499499eb3..983f61a653 100644
--- a/src/mesa/shader/arbprogparse.c
+++ b/src/mesa/shader/arbprogparse.c
@@ -64,12 +64,6 @@ having three separate program parameter arrays.
 #include "prog_statevars.h"
 #include "prog_instruction.h"
 
-
-/* For ARB programs, use the NV instruction limits */
-#define MAX_INSTRUCTIONS MAX2(MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS, \
-                              MAX_NV_VERTEX_PROGRAM_INSTRUCTIONS)
-
-
 /**
  * This is basically a union of the vertex_program and fragment_program
  * structs that we can use to parse the program into
@@ -3443,7 +3437,7 @@ parse_instructions(GLcontext * ctx, const GLubyte * inst,
       : ctx->Const.VertexProgram.MaxInstructions;
    GLint err = 0;
 
-   ASSERT(MAX_INSTRUCTIONS >= maxInst);
+   ASSERT(MAX_PROGRAM_INSTRUCTIONS >= maxInst);
 
    Program->MajorVersion = (GLuint) * inst++;
    Program->MinorVersion = (GLuint) * inst++;
@@ -3798,7 +3792,7 @@ _mesa_parse_arb_program(GLcontext *ctx, GLenum target,
 
    /* Initialize the arb_program struct */
    program->Base.String = strz;
-   program->Base.Instructions = _mesa_alloc_instructions(MAX_INSTRUCTIONS);
+   program->Base.Instructions = _mesa_alloc_instructions(MAX_PROGRAM_INSTRUCTIONS);
    program->Base.NumInstructions =
    program->Base.NumTemporaries =
    program->Base.NumParameters =
@@ -3843,12 +3837,12 @@ _mesa_parse_arb_program(GLcontext *ctx, GLenum target,
 
    _mesa_free (parsed);
 
-   /* Reallocate the instruction array from size [MAX_INSTRUCTIONS]
+   /* Reallocate the instruction array from size [MAX_PROGRAM_INSTRUCTIONS]
     * to size [ap.Base.NumInstructions].
     */
    program->Base.Instructions
       = _mesa_realloc_instructions(program->Base.Instructions,
-                                   MAX_INSTRUCTIONS,
+                                   MAX_PROGRAM_INSTRUCTIONS,
                                    program->Base.NumInstructions);
 
    return !err;
-- 
cgit v1.2.3


From a6ff215777da2181d7099284f2da28eff78273a9 Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Wed, 1 Oct 2008 00:00:58 +0200
Subject: Gallivm: add slt. glxgears should be running, except it isn't.

---
 src/gallium/auxiliary/gallivm/instructionssoa.cpp |   9 ++
 src/gallium/auxiliary/gallivm/instructionssoa.h   |   2 +
 src/gallium/auxiliary/gallivm/soabuiltins.c       | 155 +++++++++++++---------
 src/gallium/auxiliary/gallivm/tgsitollvm.cpp      |   1 +
 4 files changed, 101 insertions(+), 66 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.cpp b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
index 5863f37095..a658072551 100644
--- a/src/gallium/auxiliary/gallivm/instructionssoa.cpp
+++ b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
@@ -181,6 +181,7 @@ void InstructionsSoa::createFunctionMap()
    m_functionsMap[TGSI_OPCODE_POWER] = "pow";
    m_functionsMap[TGSI_OPCODE_LIT]   = "lit";
    m_functionsMap[TGSI_OPCODE_RSQ]   = "rsq";
+   m_functionsMap[TGSI_OPCODE_SLT]   = "slt";
 }
 
 void InstructionsSoa::createDependencies()
@@ -280,6 +281,14 @@ std::vector<llvm::Value*> InstructionsSoa::dp3(const std::vector<llvm::Value*> i
    return callBuiltin(func, in1, in2);
 }
 
+
+std::vector<llvm::Value*> InstructionsSoa::slt(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   llvm::Function *func = function(TGSI_OPCODE_SLT);
+   return callBuiltin(func, in1, in2);
+}
+
 llvm::Value * InstructionsSoa::allocaTemp()
 {
    VectorType *vector   = VectorType::get(Type::FloatTy, 4);
diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.h b/src/gallium/auxiliary/gallivm/instructionssoa.h
index 20cab3b3bb..3817fdc904 100644
--- a/src/gallium/auxiliary/gallivm/instructionssoa.h
+++ b/src/gallium/auxiliary/gallivm/instructionssoa.h
@@ -69,6 +69,8 @@ public:
    std::vector<llvm::Value*> pow(const std::vector<llvm::Value*> in1,
                                  const std::vector<llvm::Value*> in2);
    std::vector<llvm::Value*> rsq(const std::vector<llvm::Value*> in1);
+   std::vector<llvm::Value*> slt(const std::vector<llvm::Value*> in1,
+                                 const std::vector<llvm::Value*> in2);
    std::vector<llvm::Value*> sub(const std::vector<llvm::Value*> in1,
                                  const std::vector<llvm::Value*> in2);
    void         end();
diff --git a/src/gallium/auxiliary/gallivm/soabuiltins.c b/src/gallium/auxiliary/gallivm/soabuiltins.c
index 78f84510e2..cb85e1734e 100644
--- a/src/gallium/auxiliary/gallivm/soabuiltins.c
+++ b/src/gallium/auxiliary/gallivm/soabuiltins.c
@@ -36,6 +36,8 @@ typedef __attribute__(( ext_vector_type(4) )) float float4;
 
 extern float fabsf(float val);
 
+/* helpers */
+
 float4 absvec(float4 vec)
 {
    float4 res;
@@ -47,6 +49,58 @@ float4 absvec(float4 vec)
    return res;
 }
 
+float4 maxvec(float4 a, float4 b)
+{
+   return (float4){(a.x > b.x) ? a.x : b.x,
+         (a.y > b.y) ? a.y : b.y,
+         (a.z > b.z) ? a.z : b.z,
+         (a.w > b.w) ? a.w : b.w};
+}
+
+float4 minvec(float4 a, float4 b)
+{
+   return (float4){(a.x < b.x) ? a.x : b.x,
+         (a.y < b.y) ? a.y : b.y,
+         (a.z < b.z) ? a.z : b.z,
+         (a.w < b.w) ? a.w : b.w};
+}
+
+extern float powf(float num, float p);
+extern float sqrtf(float x);
+
+float4 powvec(float4 vec, float4 q)
+{
+   float4 p;
+   p.x = powf(vec.x, q.x);
+   p.y = powf(vec.y, q.y);
+   p.z = powf(vec.z, q.z);
+   p.w = powf(vec.w, q.w);
+   return p;
+}
+
+float4 sqrtvec(float4 vec)
+{
+   float4 p;
+   p.x = sqrtf(vec.x);
+   p.y = sqrtf(vec.y);
+   p.z = sqrtf(vec.z);
+   p.w = sqrtf(vec.w);
+   return p;
+}
+
+float4 sltvec(float4 v1, float4 v2)
+{
+   float4 p;
+   p.x = (v1.x < v2.x) ? 1.0 : 0.0;
+   p.y = (v1.y < v2.y) ? 1.0 : 0.0;
+   p.z = (v1.z < v2.z) ? 1.0 : 0.0;
+   p.w = (v1.w < v2.w) ? 1.0 : 0.0;
+   return p;
+}
+
+
+/* instructions */
+
 void abs(float4 *res,
          float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w)
 {
@@ -69,7 +123,6 @@ void dp3(float4 *res,
    res[3] = dot;
 }
 
-
 void dp4(float4 *res,
          float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
          float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
@@ -83,35 +136,25 @@ void dp4(float4 *res,
    res[3] = dot;
 }
 
-extern float powf(float num, float p);
-extern float sqrtf(float x);
-
-float4 powvec(float4 vec, float4 q)
-{
-   float4 p;
-   p.x = powf(vec.x, q.x);
-   p.y = powf(vec.y, q.y);
-   p.z = powf(vec.z, q.z);
-   p.w = powf(vec.w, q.w);
-   return p;
-}
-
-void pow(float4 *res,
-         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
-         float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
+void lit(float4 *res,
+         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w)
 {
-   res[0] = powvec(tmp0x, tmp1x);
-   res[1] = res[0];
-   res[2] = res[0];
-   res[3] = res[0];
-}
+   const float4 zerovec = (float4) {0.0, 0.0, 0.0, 0.0};
+   const float4 min128 = (float4) {-128.f, -128.f, -128.f, -128.f};
+   const float4 plus128 = (float4) {128.f,  128.f,  128.f,  128.f};
 
-float4 minvec(float4 a, float4 b)
-{
-   return (float4){(a.x < b.x) ? a.x : b.x,
-         (a.y < b.y) ? a.y : b.y,
-         (a.z < b.z) ? a.z : b.z,
-         (a.w < b.w) ? a.w : b.w};
+   res[0] = (float4){1.0, 1.0, 1.0, 1.0};
+   if (tmp0x.x > 0) {
+      float4 tmpy = maxvec(tmp0y, zerovec);
+      float4 tmpw = minvec(tmp0w, plus128);
+      tmpw = maxvec(tmpw, min128);
+      res[1] = tmp0x;
+      res[2] = powvec(tmpy, tmpw);
+   } else {
+      res[1] = zerovec;
+      res[2] = zerovec;
+   }
+   res[3] = (float4){1.0, 1.0, 1.0, 1.0};
 }
 
 void min(float4 *res,
@@ -125,14 +168,6 @@ void min(float4 *res,
 }
 
 
-float4 maxvec(float4 a, float4 b)
-{
-   return (float4){(a.x > b.x) ? a.x : b.x,
-         (a.y > b.y) ? a.y : b.y,
-         (a.z > b.z) ? a.z : b.z,
-         (a.w > b.w) ? a.w : b.w};
-}
-
 void max(float4 *res,
          float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
          float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
@@ -143,37 +178,14 @@ void max(float4 *res,
    res[3] = maxvec(tmp0w, tmp1w);
 }
 
-
-void lit(float4 *res,
-         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w)
-{
-   const float4 zerovec = (float4) {0.0, 0.0, 0.0, 0.0};
-   const float4 min128 = (float4) {-128.f, -128.f, -128.f, -128.f};
-   const float4 plus128 = (float4) {128.f,  128.f,  128.f,  128.f};
-
-   res[0] = (float4){1.0, 1.0, 1.0, 1.0};
-   if (tmp0x.x > 0) {
-      float4 tmpy = maxvec(tmp0y, zerovec);
-      float4 tmpw = minvec(tmp0w, plus128);
-      tmpw = maxvec(tmpw, min128);
-      res[1] = tmp0x;
-      res[2] = powvec(tmpy, tmpw);
-   } else {
-      res[1] = zerovec;
-      res[2] = zerovec;
-   }
-   res[3] = (float4){1.0, 1.0, 1.0, 1.0};
-}
-
-
-float4 sqrtvec(float4 vec)
+void pow(float4 *res,
+         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
+         float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
 {
-   float4 p;
-   p.x = sqrtf(vec.x);
-   p.y = sqrtf(vec.y);
-   p.z = sqrtf(vec.z);
-   p.w = sqrtf(vec.w);
-   return p;
+   res[0] = powvec(tmp0x, tmp1x);
+   res[1] = res[0];
+   res[2] = res[0];
+   res[3] = res[0];
 }
 
 void rsq(float4 *res,
@@ -185,3 +197,14 @@ void rsq(float4 *res,
    res[2] = onevec/sqrtvec(absvec(tmp0z));
    res[3] = onevec/sqrtvec(absvec(tmp0w));
 }
+
+void slt(float4 *res,
+         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
+         float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
+{
+   res[0] = sltvec(tmp0x, tmp1x);
+   res[1] = sltvec(tmp0y, tmp1y);
+   res[2] = sltvec(tmp0z, tmp1z);
+   res[3] = sltvec(tmp0w, tmp1w);
+}
+
diff --git a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
index fdfbb76c16..7292c0e366 100644
--- a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
+++ b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
@@ -767,6 +767,7 @@ translate_instructionir(llvm::Module *module,
    }
       break;
    case TGSI_OPCODE_SLT: {
+      out = instr->slt(inputs[0], inputs[1]);
    }
       break;
    case TGSI_OPCODE_SGE: {
-- 
cgit v1.2.3


From cbfce4175bf72788842bb45fa11c7e19caa8e6a8 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Wed, 1 Oct 2008 08:27:20 +0900
Subject: tgsi: Include p_config.h.

---
 src/gallium/auxiliary/tgsi/tgsi_sse2.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index 94b2df2dbb..79f424b692 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -25,6 +25,8 @@
  * 
  **************************************************************************/
 
+#include "pipe/p_config.h"
+
 #ifdef PIPE_ARCH_X86
 
 #include "pipe/p_debug.h"
-- 
cgit v1.2.3


From cb8a3ba433190b7af254349b00d356b31e813a1a Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Wed, 1 Oct 2008 08:28:05 +0900
Subject: util: No-op u_sse.h outside PIPE_ARCH_X86/X86_64.

---
 src/gallium/auxiliary/util/u_sse.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/util/u_sse.h b/src/gallium/auxiliary/util/u_sse.h
index 0c8356cd05..68e56f0816 100644
--- a/src/gallium/auxiliary/util/u_sse.h
+++ b/src/gallium/auxiliary/util/u_sse.h
@@ -37,6 +37,10 @@
 #ifndef U_SSE_H_
 #define U_SSE_H_
 
+#include "pipe/p_config.h"
+
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+
 #include <xmmintrin.h>
 #include <emmintrin.h>
 
@@ -66,7 +70,8 @@ _mm_castps_si128(__m128 a)
    return u.m128i;
 }
 
-#endif
+#endif /* defined(_MSC_VER) && _MSC_VER < 1500 */
 
+#endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */
 
 #endif /* U_SSE_H_ */
-- 
cgit v1.2.3


From 23e325e55a24a94cbbeea1592d07f1a09a844de7 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Wed, 1 Oct 2008 10:25:41 +0900
Subject: Add -msse and -msse2 to the *-x86 configs.

---
 configs/linux-dri-x86 | 3 +--
 configs/linux-x86     | 2 ++
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/configs/linux-dri-x86 b/configs/linux-dri-x86
index 4eedfa52f7..4e7d45d35a 100644
--- a/configs/linux-dri-x86
+++ b/configs/linux-dri-x86
@@ -5,8 +5,7 @@ include $(TOP)/configs/linux-dri
 
 CONFIG_NAME = linux-dri-x86
 
-# Add -m32 to CFLAGS:
-ARCH_FLAGS = -m32
+ARCH_FLAGS = -m32 -mmmx -msse -msse2
 
 ASM_FLAGS = -DUSE_X86_ASM -DUSE_MMX_ASM -DUSE_3DNOW_ASM -DUSE_SSE_ASM
 MESA_ASM_SOURCES = $(X86_SOURCES)
diff --git a/configs/linux-x86 b/configs/linux-x86
index a4cf4e8d62..5970b185ce 100644
--- a/configs/linux-x86
+++ b/configs/linux-x86
@@ -4,6 +4,8 @@ include $(TOP)/configs/linux
 
 CONFIG_NAME = linux-x86
 
+ARCH_FLAGS = -m32 -mmmx -msse -msse2
+
 ASM_FLAGS = -DUSE_X86_ASM -DUSE_MMX_ASM -DUSE_3DNOW_ASM -DUSE_SSE_ASM
 MESA_ASM_SOURCES = $(X86_SOURCES)
 GLAPI_ASM_SOURCES = $(X86_API)
-- 
cgit v1.2.3


From 266c5f5ccb3200c1fa195653d53748410078eac7 Mon Sep 17 00:00:00 2001
From: Michal Krol <michal@tungstengraphics.com>
Date: Wed, 1 Oct 2008 19:36:04 +0200
Subject: mesa: Fix compiler warnings on Windows.

---
 src/mesa/shader/arbprogparse.c     | 2 +-
 src/mesa/shader/slang/slang_link.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/shader/arbprogparse.c b/src/mesa/shader/arbprogparse.c
index 466ae48bef..34350ac4f3 100644
--- a/src/mesa/shader/arbprogparse.c
+++ b/src/mesa/shader/arbprogparse.c
@@ -2609,7 +2609,7 @@ parse_src_reg (GLcontext * ctx, const GLubyte ** inst,
       /* If we're referencing the Program->Parameters[] array, check if the
        * parameter is really a constant/literal.  If so, set File to CONSTANT.
        */
-      assert(*Index < Program->Base.Parameters->NumParameters);
+      assert(*Index < (GLint) Program->Base.Parameters->NumParameters);
       file = Program->Base.Parameters->Parameters[*Index].Type;
       if (file == PROGRAM_CONSTANT)
          *File = PROGRAM_CONSTANT;
diff --git a/src/mesa/shader/slang/slang_link.c b/src/mesa/shader/slang/slang_link.c
index dd7d5be6d8..202080d9d4 100644
--- a/src/mesa/shader/slang/slang_link.c
+++ b/src/mesa/shader/slang/slang_link.c
@@ -408,7 +408,7 @@ _slang_update_inputs_outputs(struct gl_program *prog)
             }
          }
          else if (inst->SrcReg[j].File == PROGRAM_ADDRESS) {
-            maxAddrReg = MAX2(maxAddrReg, inst->SrcReg[j].Index + 1);
+            maxAddrReg = MAX2(maxAddrReg, (GLuint) (inst->SrcReg[j].Index + 1));
          }
       }
       if (inst->DstReg.File == PROGRAM_OUTPUT) {
-- 
cgit v1.2.3


From df6ae3f0a39f95cb1199ac16c98be2bf9a3bc96f Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Wed, 1 Oct 2008 14:14:06 -0700
Subject: i965: Fix overwriting of depth override for SetTexOffset.

Fixes black borders around windows in compiz.  Bug #17233.
---
 src/mesa/drivers/dri/i965/brw_wm_surface_state.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index 47127c0421..3790b50c97 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -274,7 +274,6 @@ brw_update_texture_surface( GLcontext *ctx, GLuint unit )
    key.width = firstImage->Width;
    key.height = firstImage->Height;
    key.cpp = intelObj->mt->cpp;
-   key.depth = firstImage->Depth;
    key.tiling = intelObj->mt->region->tiling;
 
    dri_bo_unreference(brw->wm.surf_bo[unit + MAX_DRAW_BUFFERS]);
-- 
cgit v1.2.3


From 17fdd1ab3b48da8dff742e626c3c59bc89cafeed Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Wed, 1 Oct 2008 16:58:38 -0700
Subject: i965: sampler default color ends up in texture cache, not
 instructions.

See volume 4, SAMPLER_BORDER_COLOR_STATE programming notes.
---
 src/mesa/drivers/dri/i965/brw_wm_sampler_state.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
index e1db31ec08..977b90ad43 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
@@ -304,7 +304,7 @@ static void upload_wm_samplers( struct brw_context *brw )
 	    continue;
 
 	 dri_bo_emit_reloc(brw->wm.sampler_bo,
-			   I915_GEM_DOMAIN_INSTRUCTION, 0,
+			   I915_GEM_DOMAIN_SAMPLER, 0,
 			   0,
 			   i * sizeof(struct brw_sampler_state) +
 			   offsetof(struct brw_sampler_state, ss2),
-- 
cgit v1.2.3


From 4741dbcbbc2514de370a760f4b78a17491014555 Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Wed, 1 Oct 2008 15:51:56 -0700
Subject: Unify ARB_depth_texture and SGIX_depth_texture

The ARB extension is a superset of the older SGIX extension.  Any
hardware that can support the SGIX version can also support the ARB
version.  In Mesa, any driver that supports one also supports the
other.  This unification just simplifies some bits of code.
---
 src/mesa/drivers/dri/i915/i915_context.c    |  2 --
 src/mesa/drivers/dri/intel/intel_context.c  |  2 --
 src/mesa/drivers/dri/intel/intel_tex_copy.c |  2 +-
 src/mesa/drivers/dri/r300/r300_context.c    |  1 -
 src/mesa/main/depthstencil.c                |  4 ++--
 src/mesa/main/extensions.c                  |  3 +--
 src/mesa/main/mtypes.h                      |  1 -
 src/mesa/main/texformat.c                   |  9 ++++-----
 src/mesa/main/teximage.c                    | 18 ++++++++----------
 src/mesa/main/texparam.c                    |  3 +--
 src/mesa/swrast/s_texstore.c                |  6 +++---
 11 files changed, 20 insertions(+), 31 deletions(-)

diff --git a/src/mesa/drivers/dri/i915/i915_context.c b/src/mesa/drivers/dri/i915/i915_context.c
index efcac911aa..e0ddc7fd61 100644
--- a/src/mesa/drivers/dri/i915/i915_context.c
+++ b/src/mesa/drivers/dri/i915/i915_context.c
@@ -56,8 +56,6 @@ static const struct dri_extension i915_extensions[] = {
    {"GL_ARB_shadow", NULL},
    {"GL_ARB_texture_non_power_of_two", NULL},
    {"GL_EXT_shadow_funcs", NULL},
-   /* ARB extn won't work if not enabled */
-   {"GL_SGIX_depth_texture", NULL},
    {NULL, NULL}
 };
 
diff --git a/src/mesa/drivers/dri/intel/intel_context.c b/src/mesa/drivers/dri/intel/intel_context.c
index 1dd3ee7d0a..e53972c46d 100644
--- a/src/mesa/drivers/dri/intel/intel_context.c
+++ b/src/mesa/drivers/dri/intel/intel_context.c
@@ -404,8 +404,6 @@ static const struct dri_extension brw_extensions[] = {
    { "GL_ARB_vertex_shader",              GL_ARB_vertex_shader_functions },
    { "GL_EXT_shadow_funcs",               NULL },
    { "GL_EXT_texture_sRGB",		  NULL },
-   /* ARB extn won't work if not enabled */
-   { "GL_SGIX_depth_texture",             NULL },
    { NULL,                                NULL }
 };
 
diff --git a/src/mesa/drivers/dri/intel/intel_tex_copy.c b/src/mesa/drivers/dri/intel/intel_tex_copy.c
index a7b88b39c0..f4cb4a781c 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_copy.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_copy.c
@@ -60,7 +60,7 @@ get_teximage_source(struct intel_context *intel, GLenum internalFormat)
 
    switch (internalFormat) {
    case GL_DEPTH_COMPONENT:
-   case GL_DEPTH_COMPONENT16_ARB:
+   case GL_DEPTH_COMPONENT16:
       irb = intel_get_renderbuffer(intel->ctx.ReadBuffer, BUFFER_DEPTH);
       if (irb && irb->region && irb->region->cpp == 2)
          return irb->region;
diff --git a/src/mesa/drivers/dri/r300/r300_context.c b/src/mesa/drivers/dri/r300/r300_context.c
index ee4a69dce3..37436275e3 100644
--- a/src/mesa/drivers/dri/r300/r300_context.c
+++ b/src/mesa/drivers/dri/r300/r300_context.c
@@ -138,7 +138,6 @@ const struct dri_extension card_extensions[] = {
   {"GL_NV_blend_square",		NULL},
   {"GL_NV_vertex_program",		GL_NV_vertex_program_functions},
   {"GL_SGIS_generate_mipmap",		NULL},
-  {"GL_SGIX_depth_texture",		NULL},
   {NULL,				NULL}
   /* *INDENT-ON* */
 };
diff --git a/src/mesa/main/depthstencil.c b/src/mesa/main/depthstencil.c
index fb54d6184d..9d208e2997 100644
--- a/src/mesa/main/depthstencil.c
+++ b/src/mesa/main/depthstencil.c
@@ -282,8 +282,8 @@ _mesa_new_z24_renderbuffer_wrapper(GLcontext *ctx,
    z24rb->RefCount = 1;
    z24rb->Width = dsrb->Width;
    z24rb->Height = dsrb->Height;
-   z24rb->InternalFormat = GL_DEPTH_COMPONENT24_ARB;
-   z24rb->_ActualFormat = GL_DEPTH_COMPONENT24_ARB;
+   z24rb->InternalFormat = GL_DEPTH_COMPONENT24;
+   z24rb->_ActualFormat = GL_DEPTH_COMPONENT24;
    z24rb->_BaseFormat = GL_DEPTH_COMPONENT;
    z24rb->DataType = GL_UNSIGNED_INT;
    z24rb->DepthBits = 24;
diff --git a/src/mesa/main/extensions.c b/src/mesa/main/extensions.c
index ddcb44f06b..de75325f15 100644
--- a/src/mesa/main/extensions.c
+++ b/src/mesa/main/extensions.c
@@ -165,7 +165,7 @@ static const struct {
    { OFF, "GL_SGIS_texture_border_clamp",      F(ARB_texture_border_clamp) },
    { ON,  "GL_SGIS_texture_edge_clamp",        F(SGIS_texture_edge_clamp) },
    { ON,  "GL_SGIS_texture_lod",               F(SGIS_texture_lod) },
-   { OFF, "GL_SGIX_depth_texture",             F(SGIX_depth_texture) },
+   { OFF, "GL_SGIX_depth_texture",             F(ARB_depth_texture) },
    { OFF, "GL_SGIX_shadow",                    F(SGIX_shadow) },
    { OFF, "GL_SGIX_shadow_ambient",            F(SGIX_shadow_ambient) },
    { OFF, "GL_SUN_multi_draw_arrays",          F(EXT_multi_draw_arrays) },
@@ -292,7 +292,6 @@ _mesa_enable_sw_extensions(GLcontext *ctx)
    ctx->Extensions.SGI_texture_color_table = GL_TRUE;
    ctx->Extensions.SGIS_generate_mipmap = GL_TRUE;
    ctx->Extensions.SGIS_texture_edge_clamp = GL_TRUE;
-   ctx->Extensions.SGIX_depth_texture = GL_TRUE;
    ctx->Extensions.SGIX_shadow = GL_TRUE;
    ctx->Extensions.SGIX_shadow_ambient = GL_TRUE;
 #if FEATURE_ARB_vertex_program || FEATURE_ARB_fragment_program
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 19cf6f56d1..62bc65cc72 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -2626,7 +2626,6 @@ struct gl_extensions
    GLboolean SGIS_generate_mipmap;
    GLboolean SGIS_texture_edge_clamp;
    GLboolean SGIS_texture_lod;
-   GLboolean SGIX_depth_texture;
    GLboolean SGIX_shadow;
    GLboolean SGIX_shadow_ambient; /* or GL_ARB_shadow_ambient */
    GLboolean TDFX_texture_compression_FXT1;
diff --git a/src/mesa/main/texformat.c b/src/mesa/main/texformat.c
index d4ccdf9fa0..4442ce39a4 100644
--- a/src/mesa/main/texformat.c
+++ b/src/mesa/main/texformat.c
@@ -1420,14 +1420,13 @@ _mesa_choose_tex_format( GLcontext *ctx, GLint internalFormat,
          ; /* fallthrough */
    }
 
-   if (ctx->Extensions.SGIX_depth_texture ||
-       ctx->Extensions.ARB_depth_texture) {
+   if (ctx->Extensions.ARB_depth_texture) {
       switch (internalFormat) {
          case GL_DEPTH_COMPONENT:
-         case GL_DEPTH_COMPONENT24_SGIX:
-         case GL_DEPTH_COMPONENT32_SGIX:
+         case GL_DEPTH_COMPONENT24:
+         case GL_DEPTH_COMPONENT32:
             return &_mesa_texformat_z32;
-         case GL_DEPTH_COMPONENT16_SGIX:
+         case GL_DEPTH_COMPONENT16:
             return &_mesa_texformat_z16;
          default:
             ; /* fallthrough */
diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index b0a2d6ddb3..4a419fde26 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -241,13 +241,12 @@ _mesa_base_tex_format( GLcontext *ctx, GLint internalFormat )
       }
    }
 
-   if (ctx->Extensions.SGIX_depth_texture ||
-       ctx->Extensions.ARB_depth_texture) {
+   if (ctx->Extensions.ARB_depth_texture) {
       switch (internalFormat) {
          case GL_DEPTH_COMPONENT:
-         case GL_DEPTH_COMPONENT16_SGIX:
-         case GL_DEPTH_COMPONENT24_SGIX:
-         case GL_DEPTH_COMPONENT32_SGIX:
+         case GL_DEPTH_COMPONENT16:
+         case GL_DEPTH_COMPONENT24:
+         case GL_DEPTH_COMPONENT32:
             return GL_DEPTH_COMPONENT;
          default:
             ; /* fallthrough */
@@ -526,9 +525,9 @@ static GLboolean
 is_depth_format(GLenum format)
 {
    switch (format) {
-      case GL_DEPTH_COMPONENT16_ARB:
-      case GL_DEPTH_COMPONENT24_ARB:
-      case GL_DEPTH_COMPONENT32_ARB:
+      case GL_DEPTH_COMPONENT16:
+      case GL_DEPTH_COMPONENT24:
+      case GL_DEPTH_COMPONENT32:
       case GL_DEPTH_COMPONENT:
          return GL_TRUE;
       default:
@@ -2297,8 +2296,7 @@ _mesa_GetTexImage( GLenum target, GLint level, GLenum format,
       return;
    }
 
-   if (!ctx->Extensions.SGIX_depth_texture &&
-       !ctx->Extensions.ARB_depth_texture && is_depth_format(format)) {
+   if (!ctx->Extensions.ARB_depth_texture && is_depth_format(format)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glGetTexImage(format)");
       return;
    }
diff --git a/src/mesa/main/texparam.c b/src/mesa/main/texparam.c
index acddb6663b..a9e752a637 100644
--- a/src/mesa/main/texparam.c
+++ b/src/mesa/main/texparam.c
@@ -642,8 +642,7 @@ _mesa_GetTexLevelParameteriv( GLenum target, GLint level,
             *params = 0;
          break;
       case GL_TEXTURE_DEPTH_SIZE_ARB:
-         if (ctx->Extensions.SGIX_depth_texture ||
-             ctx->Extensions.ARB_depth_texture)
+         if (ctx->Extensions.ARB_depth_texture)
             *params = img->TexFormat->DepthBits;
          else
             _mesa_error(ctx, GL_INVALID_ENUM,
diff --git a/src/mesa/swrast/s_texstore.c b/src/mesa/swrast/s_texstore.c
index 15d52aa587..16b00b9fa1 100644
--- a/src/mesa/swrast/s_texstore.c
+++ b/src/mesa/swrast/s_texstore.c
@@ -216,9 +216,9 @@ is_depth_format(GLenum format)
 {
    switch (format) {
       case GL_DEPTH_COMPONENT:
-      case GL_DEPTH_COMPONENT16_SGIX:
-      case GL_DEPTH_COMPONENT24_SGIX:
-      case GL_DEPTH_COMPONENT32_SGIX:
+      case GL_DEPTH_COMPONENT16:
+      case GL_DEPTH_COMPONENT24:
+      case GL_DEPTH_COMPONENT32:
          return GL_TRUE;
       default:
          return GL_FALSE;
-- 
cgit v1.2.3


From dd7e5a498066e4ebdb7ad40773de48e5bc993164 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Wed, 1 Oct 2008 13:34:38 +0100
Subject: draw: add streamlined paths for fetching linear verts

---
 src/gallium/auxiliary/draw/draw_vs_aos.c    |  44 +++++----
 src/gallium/auxiliary/draw/draw_vs_aos.h    |  19 ++--
 src/gallium/auxiliary/draw/draw_vs_aos_io.c | 137 +++++++++++++++++++++-------
 3 files changed, 134 insertions(+), 66 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c
index a556477a76..4c794e0e23 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.c
@@ -92,9 +92,9 @@ struct x86_reg aos_get_x86( struct aos_compilation *cp,
          assert(which_reg == 1);
          offset = Offset(struct aos_machine, constants);
          break;
-      case X86_ATTRIBS:
+      case X86_BUFFERS:
          assert(which_reg == 0);
-         offset = Offset(struct aos_machine, attrib);
+         offset = Offset(struct aos_machine, buffer);
          break;
       default:
          assert(0);
@@ -1939,6 +1939,8 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient,
    save_fpu_state( &cp );
    set_fpu_round_nearest( &cp );
 
+   aos_init_inputs( &cp, linear );
+
    /* Note address for loop jump 
     */
    label = x86_get_label(cp.func);
@@ -2018,13 +2020,7 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient,
 
       /* Incr index
        */   
-      if (linear) {
-         x86_inc(cp.func, cp.idx_EBX);
-      } 
-      else {
-         x86_lea(cp.func, cp.idx_EBX, x86_make_disp(cp.idx_EBX, 4));
-      }
-
+      aos_incr_inputs( &cp, linear );
    }
    /* decr count, loop if not zero
     */
@@ -2065,14 +2061,10 @@ static void vaos_set_buffer( struct draw_vs_varient *varient,
                              unsigned stride )
 {
    struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
-   unsigned i;
 
-   for (i = 0; i < vaos->base.key.nr_inputs; i++) {
-      if (vaos->base.key.element[i].in.buffer == buf) {
-         vaos->attrib[i].input_ptr = ((char *)ptr +
-                                      vaos->base.key.element[i].in.offset);
-         vaos->attrib[i].input_stride = stride;
-      }
+   if (buf < vaos->nr_vb) {
+      vaos->buffer[buf].base_ptr = (char *)ptr;
+      vaos->buffer[buf].stride = stride;
    }
 }
 
@@ -2089,7 +2081,7 @@ static void PIPE_CDECL vaos_run_elts( struct draw_vs_varient *varient,
    machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
    machine->constants = vaos->draw->vs.aligned_constants;
    machine->immediates = vaos->base.vs->immediates;
-   machine->attrib = vaos->attrib;
+   machine->buffer = vaos->buffer;
 
    vaos->gen_run_elts( machine,
                        elts,
@@ -2108,7 +2100,7 @@ static void PIPE_CDECL vaos_run_linear( struct draw_vs_varient *varient,
    machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
    machine->constants = vaos->draw->vs.aligned_constants;
    machine->immediates = vaos->base.vs->immediates;
-   machine->attrib = vaos->attrib;
+   machine->buffer = vaos->buffer;
 
    vaos->gen_run_linear( machine,
                          start,
@@ -2127,7 +2119,7 @@ static void vaos_destroy( struct draw_vs_varient *varient )
 {
    struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
 
-   FREE( vaos->attrib );
+   FREE( vaos->buffer );
 
    x86_release_func( &vaos->func[0] );
    x86_release_func( &vaos->func[1] );
@@ -2140,6 +2132,7 @@ static void vaos_destroy( struct draw_vs_varient *varient )
 static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
                                                  const struct draw_vs_varient_key *key )
 {
+   unsigned i;
    struct draw_vs_varient_aos_sse *vaos = CALLOC_STRUCT(draw_vs_varient_aos_sse);
 
    if (!vaos)
@@ -2154,10 +2147,15 @@ static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
 
    vaos->draw = vs->draw;
 
-   vaos->attrib = MALLOC( key->nr_inputs * sizeof(vaos->attrib[0]) );
-   if (!vaos->attrib)
+   for (i = 0; i < key->nr_inputs; i++) 
+      vaos->nr_vb = MAX2( vaos->nr_vb, key->element[i].in.buffer + 1 );
+
+   vaos->buffer = MALLOC( vaos->nr_vb * sizeof(vaos->buffer[0]) );
+   if (!vaos->buffer)
       goto fail;
 
+   debug_printf("nr_vb: %d\n", vaos->nr_vb);
+
 #if 0
    tgsi_dump(vs->state.tokens, 0);
 #endif
@@ -2179,8 +2177,8 @@ static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
    return &vaos->base;
 
  fail:
-   if (vaos && vaos->attrib)
-      FREE(vaos->attrib);
+   if (vaos && vaos->buffer)
+      FREE(vaos->buffer);
 
    if (vaos)
       x86_release_func( &vaos->func[0] );
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.h b/src/gallium/auxiliary/draw/draw_vs_aos.h
index 7fe6f79db0..306392e5d6 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos.h
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.h
@@ -87,9 +87,10 @@ struct lit_info {
 #define MAX_SHINE_TAB    4
 #define MAX_LIT_INFO     16
 
-struct aos_attrib {
-   const void *input_ptr;
-   unsigned input_stride;
+struct aos_buffer {
+   const void *base_ptr;
+   unsigned stride;
+   void *ptr;                   /* updated per vertex */
 };
 
 
@@ -123,7 +124,7 @@ struct aos_machine {
    const float (*immediates)[4];     /* points to shader data */
    const float (*constants)[4];      /* points to draw data */
 
-   const struct aos_attrib *attrib; /* points to ? */
+   const struct aos_buffer *buffer; /* points to ? */
 };
 
 
@@ -179,8 +180,9 @@ struct x86_reg aos_get_shader_reg( struct aos_compilation *cp,
                                    unsigned file,
                                    unsigned idx );
 
-boolean aos_fetch_inputs( struct aos_compilation *cp,
-                          boolean linear );
+boolean aos_init_inputs( struct aos_compilation *cp, boolean linear );
+boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear );
+boolean aos_incr_inputs( struct aos_compilation *cp, boolean linear );
 
 boolean aos_emit_outputs( struct aos_compilation *cp );
 
@@ -210,7 +212,7 @@ do {                                                                    \
 #define X86_NULL       0
 #define X86_IMMEDIATES 1
 #define X86_CONSTANTS  2
-#define X86_ATTRIBS    3
+#define X86_BUFFERS    3
 
 struct x86_reg aos_get_x86( struct aos_compilation *cp,
                             unsigned which_reg,
@@ -232,7 +234,8 @@ struct draw_vs_varient_aos_sse {
    struct draw_vs_varient base;
    struct draw_context *draw;
 
-   struct aos_attrib *attrib;
+   struct aos_buffer *buffer;
+   unsigned nr_vb;
 
    vaos_run_linear_func gen_run_linear;
    vaos_run_elts_func gen_run_elts;
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
index 26297c74f8..8e08b9285f 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos_io.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
@@ -95,28 +95,6 @@ static void emit_load_R8G8B8A8_UNORM( struct aos_compilation *cp,
 
 
-static void get_src_ptr( struct aos_compilation *cp,
-                         struct x86_reg src,
-                         struct x86_reg elt,
-                         unsigned a )
-{
-   struct x86_reg attrib = x86_make_disp(aos_get_x86( cp, 0, X86_ATTRIBS ), 
-                                         a * sizeof(struct aos_attrib));
-
-   struct x86_reg input_ptr = x86_make_disp(attrib, 
-                                            Offset(struct aos_attrib, input_ptr));
-
-   struct x86_reg input_stride = x86_make_disp(attrib, 
-                                               Offset(struct aos_attrib, input_stride));
-
-   /* Calculate pointer to current attrib:
-    */
-   x86_mov(cp->func, src, input_stride);
-   x86_imul(cp->func, src, elt);
-   x86_add(cp->func, src, input_ptr);
-}
-
-
 /* Extended swizzles?  Maybe later.
  */  
 static void emit_swizzle( struct aos_compilation *cp,
@@ -128,22 +106,44 @@ static void emit_swizzle( struct aos_compilation *cp,
 }
 
 
+
+static boolean get_buffer_ptr( struct aos_compilation *cp,
+                            unsigned buf_idx,
+                            struct x86_reg elt,
+                            struct x86_reg ptr)
+{
+   struct x86_reg buf = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ), 
+                                      buf_idx * sizeof(struct aos_buffer));
+
+   struct x86_reg buf_base_ptr = x86_make_disp(buf, 
+                                               Offset(struct aos_buffer, base_ptr));
+
+   struct x86_reg buf_stride = x86_make_disp(buf, 
+                                             Offset(struct aos_buffer, stride));
+
+   /* Calculate pointer to current attrib:
+    */
+   x86_mov(cp->func, ptr, buf_stride);
+   x86_imul(cp->func, ptr, elt);
+   x86_add(cp->func, ptr, buf_base_ptr);
+
+   return TRUE;
+}
+
+
+
+
 static boolean load_input( struct aos_compilation *cp,
                            unsigned idx,
-                           boolean linear )
+                           struct x86_reg bufptr )
 {
    unsigned format = cp->vaos->base.key.element[idx].in.format;
-   struct x86_reg src = cp->tmp_EAX;
+   unsigned offset = cp->vaos->base.key.element[idx].in.offset;
    struct x86_reg dataXMM = aos_get_xmm_reg(cp);
 
    /* Figure out source pointer address:
     */
-   get_src_ptr(cp, 
-               src, 
-               linear ? cp->idx_EBX : x86_deref(cp->idx_EBX),
-               idx);
-
-   src = x86_deref(src);
+   struct x86_reg src = x86_make_disp(bufptr, offset);
 
    aos_adopt_xmm_reg( cp,
                       dataXMM,
@@ -179,20 +179,87 @@ static boolean load_input( struct aos_compilation *cp,
    return TRUE;
 }
 
-
-boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear )
+static boolean load_inputs( struct aos_compilation *cp,
+                            unsigned buffer,
+                            struct x86_reg ptr )
 {
    unsigned i;
-   
+
    for (i = 0; i < cp->vaos->base.key.nr_inputs; i++) {
-      if (!load_input( cp, i, linear ))
+      if (cp->vaos->base.key.element[i].in.buffer == buffer) {
+
+         if (!load_input( cp, i, ptr ))
+            return FALSE;
+
+         cp->insn_counter++;
+      }
+   }
+   
+   return TRUE;
+}
+
+boolean aos_init_inputs( struct aos_compilation *cp, boolean linear )
+{
+   if (linear && cp->vaos->nr_vb == 1) {
+
+      struct x86_reg elt = cp->idx_EBX;
+      struct x86_reg ptr = cp->tmp_EAX;
+
+      if (!get_buffer_ptr( cp, 0, elt, ptr ))
          return FALSE;
-      cp->insn_counter++;
+
+      /* In the linear, single buffer case, keep the buffer pointer
+       * instead of the index number.
+       */
+      x86_mov( cp->func, elt, ptr );
+   }
+
+   return TRUE;
+}
+
+boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear )
+{
+   if (linear && cp->vaos->nr_vb == 1) {
+      
+      load_inputs( cp, 0, cp->idx_EBX );
+
+   }
+   else {
+      struct x86_reg elt = linear ? cp->idx_EBX : x86_deref(cp->idx_EBX);
+      unsigned j;
+   
+      for (j = 0; j < cp->vaos->nr_vb; j++) {
+         struct x86_reg ptr = cp->tmp_EAX;
+
+         if (!get_buffer_ptr( cp, j, elt, ptr ))
+            return FALSE;
+
+         cp->insn_counter++;
+
+         if (!load_inputs( cp, j, ptr ))
+            return FALSE;
+      }
    }
 
    return TRUE;
 }
 
+boolean aos_incr_inputs( struct aos_compilation *cp, boolean linear )
+{
+   if (linear && cp->vaos->nr_vb == 1) {
+      struct x86_reg stride = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ), 
+                                            (0 * sizeof(struct aos_buffer) + 
+                                             Offset(struct aos_buffer, stride)));
+
+      x86_add(cp->func, cp->idx_EBX, stride);
+   }
+   else if (linear) {
+      x86_inc(cp->func, cp->idx_EBX);
+   } 
+   else {
+      x86_lea(cp->func, cp->idx_EBX, x86_make_disp(cp->idx_EBX, 4));
+   }
+}
 
 
-- 
cgit v1.2.3


From 102daee1b8971cf39235e220b9524bec1e4a7089 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Thu, 2 Oct 2008 12:46:01 +0100
Subject: rtasm: add prefetch instructions

---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c | 26 ++++++++++++++++++++++++++
 src/gallium/auxiliary/rtasm/rtasm_x86sse.h |  5 +++++
 2 files changed, 31 insertions(+)

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index 6d4c081e04..9085f4cc0e 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -629,6 +629,32 @@ void x86_and( struct x86_function *p,
  * SSE instructions
  */
 
+void sse_prefetchnta( struct x86_function *p, struct x86_reg ptr)
+{
+   DUMP_R( ptr );
+   assert(ptr.mod != mod_REG);
+   emit_2ub(p, 0x0f, 0x18);
+   emit_modrm_noreg(p, 0, ptr);
+}
+
+void sse_prefetch0( struct x86_function *p, struct x86_reg ptr)
+{
+   DUMP_R( ptr );
+   assert(ptr.mod != mod_REG);
+   emit_2ub(p, 0x0f, 0x18);
+   emit_modrm_noreg(p, 1, ptr);
+}
+
+void sse_prefetch1( struct x86_function *p, struct x86_reg ptr)
+{
+   DUMP_R( ptr );
+   assert(ptr.mod != mod_REG);
+   emit_2ub(p, 0x0f, 0x18);
+   emit_modrm_noreg(p, 2, ptr);
+}
+
+
+
 
 void sse_movss( struct x86_function *p,
 		struct x86_reg dst,
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index af94577aab..2d7715f965 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -184,6 +184,11 @@ void sse2_pshufd( struct x86_function *p, struct x86_reg dest, struct x86_reg ar
 void sse2_rcpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_rcpss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 
+
+void sse_prefetchnta( struct x86_function *p, struct x86_reg ptr);
+void sse_prefetch0( struct x86_function *p, struct x86_reg ptr);
+void sse_prefetch1( struct x86_function *p, struct x86_reg ptr);
+
 void sse_addps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_addss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_cvtps2pi( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-- 
cgit v1.2.3


From af9cfea9cc80411351f9879d8eeb525bf7b4ca50 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Wed, 1 Oct 2008 18:40:01 +0100
Subject: draw: don't keep refetching constant inputs

---
 .../auxiliary/draw/draw_pt_fetch_shade_emit.c      |  37 +++---
 src/gallium/auxiliary/draw/draw_vs.h               |   4 +-
 src/gallium/auxiliary/draw/draw_vs_aos.c           |  26 ++++-
 src/gallium/auxiliary/draw/draw_vs_aos.h           |   2 +
 src/gallium/auxiliary/draw/draw_vs_aos_io.c        | 127 +++++++++++++++------
 src/gallium/auxiliary/draw/draw_vs_varient.c       |  10 +-
 6 files changed, 144 insertions(+), 62 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
index 73fc70c1bc..a0e08dd10a 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
@@ -79,6 +79,7 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
    unsigned num_vs_inputs = draw->vs.vertex_shader->info.num_inputs;
    const struct vertex_info *vinfo;
    unsigned i;
+   unsigned nr_vbs = 0;
    
 
    if (!draw->render->set_primitive( draw->render, 
@@ -102,7 +103,7 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
 
    fse->key.viewport = !draw->identity_viewport;
    fse->key.clip = !draw->bypass_clipping;
-   fse->key.pad = 0;
+   fse->key.const_vbuffers = 0;
 
    memset(fse->key.element, 0, 
           fse->key.nr_elements * sizeof(fse->key.element[0]));
@@ -116,9 +117,16 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
        */
       fse->key.element[i].in.buffer = src->vertex_buffer_index;
       fse->key.element[i].in.offset = src->src_offset;
+      nr_vbs = MAX2(nr_vbs, src->vertex_buffer_index + 1);
    }
    
+   for (i = 0; i < 5 && i < nr_vbs; i++) {
+      if (draw->pt.vertex_buffer[i].pitch == 0)
+         fse->key.const_vbuffers |= (1<<i);
+   }
 
+   if (0) debug_printf("%s: lookup const_vbuffers: %x\n", __FUNCTION__, fse->key.const_vbuffers);
+   
    {
       unsigned dst_offset = 0;
 
@@ -162,13 +170,7 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
       }
    }
 
-
-   /* Would normally look up a vertex shader and peruse its list of
-    * varients somehow.  We omitted that step and put all the
-    * hardcoded "shaders" into an array.  We're just making the
-    * assumption that this happens to be a matching shader...  ie
-    * you're running isosurf, aren't you?
-    */
+   
    fse->active = draw_vs_lookup_varient( draw->vs.vertex_shader, 
                                          &fse->key );
 
@@ -177,18 +179,17 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
       return ;
    }
 
+   if (0) debug_printf("%s: found const_vbuffers: %x\n", __FUNCTION__, 
+                       fse->active->key.const_vbuffers);
+
    /* Now set buffer pointers:
     */
-   for (i = 0; i < num_vs_inputs; i++) {
-      unsigned buf = draw->pt.vertex_element[i].vertex_buffer_index;
-
-      fse->active->set_input( fse->active, 
-                              i, 
-                              
-                              ((const ubyte *) draw->pt.user.vbuffer[buf] + 
-                               draw->pt.vertex_buffer[buf].buffer_offset),
-                              
-                              draw->pt.vertex_buffer[buf].pitch );
+   for (i = 0; i < draw->pt.nr_vertex_buffers; i++) {
+      fse->active->set_buffer( fse->active, 
+                               i, 
+                               ((const ubyte *) draw->pt.user.vbuffer[i] + 
+                                draw->pt.vertex_buffer[i].buffer_offset),
+                              draw->pt.vertex_buffer[i].pitch );
    }
 
    *max_vertices = (draw->render->max_vertex_buffer_bytes / 
diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h
index 45992d1986..68c24abad3 100644
--- a/src/gallium/auxiliary/draw/draw_vs.h
+++ b/src/gallium/auxiliary/draw/draw_vs.h
@@ -64,7 +64,7 @@ struct draw_vs_varient_key {
    unsigned nr_outputs:8;
    unsigned viewport:1;
    unsigned clip:1;
-   unsigned pad:5;
+   unsigned const_vbuffers:5;
    struct draw_varient_element element[PIPE_MAX_ATTRIBS];
 };
 
@@ -76,7 +76,7 @@ struct draw_vs_varient {
 
    struct draw_vertex_shader *vs;
 
-   void (*set_input)( struct draw_vs_varient *,
+   void (*set_buffer)( struct draw_vs_varient *,
                       unsigned i,
                       const void *ptr,
                       unsigned stride );
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c
index 4c794e0e23..87232865e2 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.c
@@ -196,6 +196,18 @@ static void spill( struct aos_compilation *cp, unsigned idx )
 }
 
 
+void aos_spill_all( struct aos_compilation *cp )
+{
+   unsigned i;
+
+   for (i = 0; i < 8; i++) {
+      if (cp->xmm[i].dirty) 
+         spill(cp, i);
+      aos_release_xmm_reg(cp, i);
+   }
+}
+
+
 static struct x86_reg get_xmm_writable( struct aos_compilation *cp,
                                         struct x86_reg reg )
 {
@@ -1941,6 +1953,9 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient,
 
    aos_init_inputs( &cp, linear );
 
+   cp.x86_reg[0] = 0;
+   cp.x86_reg[1] = 0;
+   
    /* Note address for loop jump 
     */
    label = x86_get_label(cp.func);
@@ -2066,6 +2081,8 @@ static void vaos_set_buffer( struct draw_vs_varient *varient,
       vaos->buffer[buf].base_ptr = (char *)ptr;
       vaos->buffer[buf].stride = stride;
    }
+
+   if (0) debug_printf("%s %d/%d: %p %d\n", __FUNCTION__, buf, vaos->nr_vb, ptr, stride);
 }
 
 
@@ -2078,6 +2095,8 @@ static void PIPE_CDECL vaos_run_elts( struct draw_vs_varient *varient,
    struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
    struct aos_machine *machine = vaos->draw->vs.aos_machine;
 
+   if (0) debug_printf("%s %d\n", __FUNCTION__, count);
+
    machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
    machine->constants = vaos->draw->vs.aligned_constants;
    machine->immediates = vaos->base.vs->immediates;
@@ -2097,6 +2116,9 @@ static void PIPE_CDECL vaos_run_linear( struct draw_vs_varient *varient,
    struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
    struct aos_machine *machine = vaos->draw->vs.aos_machine;
 
+   if (0) debug_printf("%s %d %d const: %x\n", __FUNCTION__, start, count, 
+                       vaos->base.key.const_vbuffers);
+
    machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
    machine->constants = vaos->draw->vs.aligned_constants;
    machine->immediates = vaos->base.vs->immediates;
@@ -2140,7 +2162,7 @@ static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
    
    vaos->base.key = *key;
    vaos->base.vs = vs;
-   vaos->base.set_input = vaos_set_buffer;
+   vaos->base.set_buffer = vaos_set_buffer;
    vaos->base.destroy = vaos_destroy;
    vaos->base.run_linear = vaos_run_linear;
    vaos->base.run_elts = vaos_run_elts;
@@ -2154,7 +2176,7 @@ static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
    if (!vaos->buffer)
       goto fail;
 
-   debug_printf("nr_vb: %d\n", vaos->nr_vb);
+   debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers);
 
 #if 0
    tgsi_dump(vs->state.tokens, 0);
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.h b/src/gallium/auxiliary/draw/draw_vs_aos.h
index 306392e5d6..264387517b 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos.h
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.h
@@ -176,6 +176,8 @@ void aos_adopt_xmm_reg( struct aos_compilation *cp,
                         unsigned idx,
                         unsigned dirty );
 
+void aos_spill_all( struct aos_compilation *cp );
+
 struct x86_reg aos_get_shader_reg( struct aos_compilation *cp, 
                                    unsigned file,
                                    unsigned idx );
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
index 8e08b9285f..b0c51d7fa1 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos_io.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
@@ -108,29 +108,45 @@ static void emit_swizzle( struct aos_compilation *cp,
 
 
 static boolean get_buffer_ptr( struct aos_compilation *cp,
-                            unsigned buf_idx,
-                            struct x86_reg elt,
-                            struct x86_reg ptr)
+                               boolean linear,
+                               unsigned buf_idx,
+                               struct x86_reg elt,
+                               struct x86_reg ptr)
 {
    struct x86_reg buf = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ), 
                                       buf_idx * sizeof(struct aos_buffer));
 
-   struct x86_reg buf_base_ptr = x86_make_disp(buf, 
-                                               Offset(struct aos_buffer, base_ptr));
-
    struct x86_reg buf_stride = x86_make_disp(buf, 
                                              Offset(struct aos_buffer, stride));
+   if (linear) {
+      struct x86_reg buf_ptr = x86_make_disp(buf, 
+                                             Offset(struct aos_buffer, ptr));
 
-   /* Calculate pointer to current attrib:
-    */
-   x86_mov(cp->func, ptr, buf_stride);
-   x86_imul(cp->func, ptr, elt);
-   x86_add(cp->func, ptr, buf_base_ptr);
 
-   return TRUE;
-}
+      /* Calculate pointer to current attrib:
+       */
+      x86_mov(cp->func, ptr, buf_ptr);
+      x86_mov(cp->func, elt, buf_stride);
+      x86_add(cp->func, elt, ptr);
+      sse_prefetchnta(cp->func, x86_deref(elt));
+      x86_mov(cp->func, buf_ptr, elt);
+   }
+   else {
+      struct x86_reg buf_base_ptr = x86_make_disp(buf, 
+                                                  Offset(struct aos_buffer, base_ptr));
+
+
+      /* Calculate pointer to current attrib:
+       */
+      x86_mov(cp->func, ptr, buf_stride);
+      x86_imul(cp->func, ptr, elt);
+      x86_add(cp->func, ptr, buf_base_ptr);
+   }
 
+   cp->insn_counter++;
 
+   return TRUE;
+}
 
 
 static boolean load_input( struct aos_compilation *cp,
@@ -200,18 +216,57 @@ static boolean load_inputs( struct aos_compilation *cp,
 
 boolean aos_init_inputs( struct aos_compilation *cp, boolean linear )
 {
-   if (linear && cp->vaos->nr_vb == 1) {
+   unsigned i;
+   for (i = 0; i < cp->vaos->nr_vb; i++) {
+      struct x86_reg buf = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ), 
+                                         i * sizeof(struct aos_buffer));
 
-      struct x86_reg elt = cp->idx_EBX;
-      struct x86_reg ptr = cp->tmp_EAX;
+      struct x86_reg buf_base_ptr = x86_make_disp(buf, 
+                                                  Offset(struct aos_buffer, base_ptr));
 
-      if (!get_buffer_ptr( cp, 0, elt, ptr ))
-         return FALSE;
+      if (cp->vaos->base.key.const_vbuffers & (1<<i)) {
+         struct x86_reg ptr = cp->tmp_EAX;
 
-      /* In the linear, single buffer case, keep the buffer pointer
-       * instead of the index number.
-       */
-      x86_mov( cp->func, elt, ptr );
+         x86_mov(cp->func, ptr, buf_base_ptr);
+
+         /* Load all inputs for this constant vertex buffer
+          */
+         load_inputs( cp, i, x86_deref(ptr) );
+         
+         /* Then just force them out to aos_machine.input[]
+          */
+         aos_spill_all( cp );
+
+      }
+      else if (linear) {
+
+         struct x86_reg elt = cp->idx_EBX;
+         struct x86_reg ptr = cp->tmp_EAX;
+
+         struct x86_reg buf_stride = x86_make_disp(buf, 
+                                                   Offset(struct aos_buffer, stride));
+
+         struct x86_reg buf_ptr = x86_make_disp(buf, 
+                                                Offset(struct aos_buffer, ptr));
+
+
+         /* Calculate pointer to current attrib:
+          */
+         x86_mov(cp->func, ptr, buf_stride);
+         x86_imul(cp->func, ptr, elt);
+         x86_add(cp->func, ptr, buf_base_ptr);
+
+
+         /* In the linear case, keep the buffer pointer instead of the
+          * index number.
+          */
+         if (cp->vaos->nr_vb == 1) 
+            x86_mov( cp->func, elt, ptr );
+         else
+            x86_mov( cp->func, buf_ptr, ptr );
+
+         cp->insn_counter++;
+      }
    }
 
    return TRUE;
@@ -219,23 +274,22 @@ boolean aos_init_inputs( struct aos_compilation *cp, boolean linear )
 
 boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear )
 {
-   if (linear && cp->vaos->nr_vb == 1) {
-      
-      load_inputs( cp, 0, cp->idx_EBX );
+   unsigned j;
 
-   }
-   else {
-      struct x86_reg elt = linear ? cp->idx_EBX : x86_deref(cp->idx_EBX);
-      unsigned j;
-   
-      for (j = 0; j < cp->vaos->nr_vb; j++) {
+   for (j = 0; j < cp->vaos->nr_vb; j++) {
+      if (cp->vaos->base.key.const_vbuffers & (1<<j)) {
+         /* just retrieve pre-transformed input */
+      }
+      else if (linear && cp->vaos->nr_vb == 1) {
+         load_inputs( cp, 0, cp->idx_EBX );
+      }
+      else {
+         struct x86_reg elt = linear ? cp->idx_EBX : x86_deref(cp->idx_EBX);
          struct x86_reg ptr = cp->tmp_EAX;
 
-         if (!get_buffer_ptr( cp, j, elt, ptr ))
+         if (!get_buffer_ptr( cp, linear, j, elt, ptr ))
             return FALSE;
 
-         cp->insn_counter++;
-
          if (!load_inputs( cp, j, ptr ))
             return FALSE;
       }
@@ -252,13 +306,16 @@ boolean aos_incr_inputs( struct aos_compilation *cp, boolean linear )
                                              Offset(struct aos_buffer, stride)));
 
       x86_add(cp->func, cp->idx_EBX, stride);
+      sse_prefetchnta(cp->func, x86_deref(cp->idx_EBX));
    }
    else if (linear) {
-      x86_inc(cp->func, cp->idx_EBX);
+      /* Nothing to do */
    } 
    else {
       x86_lea(cp->func, cp->idx_EBX, x86_make_disp(cp->idx_EBX, 4));
    }
+
+   return TRUE;
 }
 
 
diff --git a/src/gallium/auxiliary/draw/draw_vs_varient.c b/src/gallium/auxiliary/draw/draw_vs_varient.c
index 4daf05dae7..7ee567d478 100644
--- a/src/gallium/auxiliary/draw/draw_vs_varient.c
+++ b/src/gallium/auxiliary/draw/draw_vs_varient.c
@@ -64,10 +64,10 @@ struct draw_vs_varient_generic {
 
 
-static void vsvg_set_input( struct draw_vs_varient *varient,
-                            unsigned buffer,
-                            const void *ptr,
-                            unsigned stride )
+static void vsvg_set_buffer( struct draw_vs_varient *varient,
+                             unsigned buffer,
+                             const void *ptr,
+                             unsigned stride )
 {
    struct draw_vs_varient_generic *vsvg = (struct draw_vs_varient_generic *)varient;
 
@@ -265,7 +265,7 @@ struct draw_vs_varient *draw_vs_varient_generic( struct draw_vertex_shader *vs,
 
    vsvg->base.key = *key;
    vsvg->base.vs = vs;
-   vsvg->base.set_input     = vsvg_set_input;
+   vsvg->base.set_buffer    = vsvg_set_buffer;
    vsvg->base.run_elts      = vsvg_run_elts;
    vsvg->base.run_linear    = vsvg_run_linear;
    vsvg->base.destroy       = vsvg_destroy;
-- 
cgit v1.2.3


From 918a444913435bdee33214e25811875100f873b0 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Thu, 2 Oct 2008 12:53:11 +0100
Subject: draw: modify prefetching slightly

---
 src/gallium/auxiliary/draw/draw_vs_aos_io.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
index b0c51d7fa1..dd79bc799a 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos_io.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
@@ -54,6 +54,7 @@ static void emit_load_R32G32B32( struct aos_compilation *cp,
 				 struct x86_reg data,
 				 struct x86_reg src_ptr )
 {
+#if 1
    sse_movss(cp->func, data, x86_make_disp(src_ptr, 8));
    /* data = z ? ? ? */
    sse_shufps(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ), SHUF(X,Y,Z,W) );
@@ -62,6 +63,16 @@ static void emit_load_R32G32B32( struct aos_compilation *cp,
    /* data = ? 0 z 1 */
    sse_movlps(cp->func, data, src_ptr);
    /* data = x y z 1 */
+#else
+   sse_movups(cp->func, data, src_ptr);
+   /* data = x y z ? */
+   sse2_pshufd(cp->func, data, data, SHUF(W,X,Y,Z) );
+   /* data = ? x y z */
+   sse_movss(cp->func, data, aos_get_internal_xmm( cp, IMM_ONES ) );
+   /* data = 1 x y z */
+   sse2_pshufd(cp->func, data, data, SHUF(Y,Z,W,X) );
+   /* data = x y z 1 */
+#endif
 }
 
 static void emit_load_R32G32( struct aos_compilation *cp, 
@@ -128,7 +139,7 @@ static boolean get_buffer_ptr( struct aos_compilation *cp,
       x86_mov(cp->func, ptr, buf_ptr);
       x86_mov(cp->func, elt, buf_stride);
       x86_add(cp->func, elt, ptr);
-      sse_prefetchnta(cp->func, x86_deref(elt));
+      if (buf_idx == 0) sse_prefetchnta(cp->func, x86_make_disp(elt, 192));
       x86_mov(cp->func, buf_ptr, elt);
    }
    else {
@@ -306,7 +317,7 @@ boolean aos_incr_inputs( struct aos_compilation *cp, boolean linear )
                                              Offset(struct aos_buffer, stride)));
 
       x86_add(cp->func, cp->idx_EBX, stride);
-      sse_prefetchnta(cp->func, x86_deref(cp->idx_EBX));
+      sse_prefetchnta(cp->func, x86_make_disp(cp->idx_EBX, 192));
    }
    else if (linear) {
       /* Nothing to do */
@@ -327,7 +338,7 @@ static void emit_store_R32G32B32A32( struct aos_compilation *cp,
 				     struct x86_reg dst_ptr,
 				     struct x86_reg dataXMM )
 {
-   sse_movups(cp->func, dst_ptr, dataXMM);
+   sse_movaps(cp->func, dst_ptr, dataXMM);
 }
 
 static void emit_store_R32G32B32( struct aos_compilation *cp, 
@@ -430,7 +441,7 @@ boolean aos_emit_outputs( struct aos_compilation *cp )
 
       if (data.file != file_XMM) {
          struct x86_reg tmp = aos_get_xmm_reg( cp );
-         sse_movups(cp->func, tmp, data);
+         sse_movaps(cp->func, tmp, data);
          data = tmp;
       }
       
-- 
cgit v1.2.3


From 5b60d6d07b79124499a1d75a2830cf2e5949d1ad Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 26 Sep 2008 11:16:44 -0600
Subject: egl: remove space after -L flag

---
 src/egl/drivers/xdri/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/egl/drivers/xdri/Makefile b/src/egl/drivers/xdri/Makefile
index afd551dea5..a721b997e6 100644
--- a/src/egl/drivers/xdri/Makefile
+++ b/src/egl/drivers/xdri/Makefile
@@ -48,7 +48,7 @@ $(TOP)/$(LIB_DIR)/$(DRIVER_NAME): $(OBJECTS)
 	$(TOP)/bin/mklib -o $(DRIVER_NAME) \
 		-noprefix \
 		-major 1 -minor 0 \
-		-L $(TOP)/$(LIB_DIR) \
+		-L$(TOP)/$(LIB_DIR) \
 		-install $(TOP)/$(LIB_DIR) \
 		$(OBJECTS) $(DRM_LIB) $(MISC_LIBS)
 
-- 
cgit v1.2.3


From 2cb213ff233ccd566e716aece45da78daa7d015a Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 26 Sep 2008 11:17:09 -0600
Subject: egl: check for null ptr/name

---
 src/egl/drivers/xdri/egl_xdri.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/egl/drivers/xdri/egl_xdri.c b/src/egl/drivers/xdri/egl_xdri.c
index 83d4b86d98..3b3e312746 100644
--- a/src/egl/drivers/xdri/egl_xdri.c
+++ b/src/egl/drivers/xdri/egl_xdri.c
@@ -654,7 +654,10 @@ xdri_eglInitialize(_EGLDriver *drv, EGLDisplay dpy,
 
    xdri_drv->Base.Initialized = EGL_TRUE;
 
-   snprintf(name, sizeof(name), "X/DRI:%s", xdri_drv->dri_driver_name);
+   if (xdri_drv->dri_driver_name)
+      snprintf(name, sizeof(name), "X/DRI:%s", xdri_drv->dri_driver_name);
+   else
+      snprintf(name, sizeof(name), "X/DRI");
    xdri_drv->Base.Name = name;
 
    /* we're supporting EGL 1.4 */
-- 
cgit v1.2.3


From 5620c20b24dc4f780a2246eb5270c4476b487e0a Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 26 Sep 2008 11:18:06 -0600
Subject: mesa: fix temp register allocation problems.

Complex texcombine modes were running out of registers (>32 registers for 8 tex units).
---
 src/mesa/main/texenvprogram.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/mesa/main/texenvprogram.c b/src/mesa/main/texenvprogram.c
index 2bce93eef1..f6bbbcfaed 100644
--- a/src/mesa/main/texenvprogram.c
+++ b/src/mesa/main/texenvprogram.c
@@ -411,6 +411,14 @@ static struct ureg get_tex_temp( struct texenv_fragment_program *p )
 }
 
 
+/** Mark a temp reg as being no longer allocatable. */
+static void reserve_temp( struct texenv_fragment_program *p, struct ureg r )
+{
+   if (r.file == PROGRAM_TEMPORARY)
+      p->temps_output |= (1 << r.idx);
+}
+
+
 static void release_temps(GLcontext *ctx, struct texenv_fragment_program *p )
 {
    GLuint max_temp = ctx->Const.FragmentProgram.MaxTemps;
@@ -504,10 +512,12 @@ emit_op(struct texenv_fragment_program *p,
 
    emit_dst( &inst->DstReg, dest, mask );
 
+#if 0
    /* Accounting for indirection tracking:
     */
    if (dest.file == PROGRAM_TEMPORARY)
       p->temps_output |= 1 << dest.idx;
+#endif
 
    return inst;
 }
@@ -562,6 +572,10 @@ static struct ureg emit_texld( struct texenv_fragment_program *p,
 
    p->program->Base.NumTexInstructions++;
 
+   /* Accounting for indirection tracking:
+    */
+   reserve_temp(p, dest);
+
    /* Is this a texture indirection?
     */
    if ((coord.file == PROGRAM_TEMPORARY &&
@@ -1079,6 +1093,7 @@ create_new_program(GLcontext *ctx, struct state_key *key,
       for (unit = 0 ; unit < ctx->Const.MaxTextureUnits; unit++)
 	 if (key->enabled_units & (1<<unit)) {
 	    p.src_previous = emit_texenv( &p, unit );
+            reserve_temp(&p, p.src_previous); /* don't re-use this temp reg */
 	    release_temps(ctx, &p);	/* release all temps */
 	 }
    }
-- 
cgit v1.2.3


From 3f477e111a96493ff2863af06a98e8849ffbc6d8 Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Sun, 28 Sep 2008 18:33:23 +0200
Subject: Gallivm: make it compile again, add some opcodes.

---
 src/gallium/auxiliary/draw/draw_vs_llvm.c      |    1 +
 src/gallium/auxiliary/gallivm/gallivm_cpu.cpp  |    1 +
 src/gallium/auxiliary/gallivm/instructions.cpp | 1210 ++++++++++++++----------
 src/gallium/auxiliary/gallivm/instructions.h   |   26 +-
 src/gallium/auxiliary/gallivm/tgsitollvm.cpp   |   60 +-
 5 files changed, 792 insertions(+), 506 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_vs_llvm.c b/src/gallium/auxiliary/draw/draw_vs_llvm.c
index 2ce30b9a02..727977bc3a 100644
--- a/src/gallium/auxiliary/draw/draw_vs_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_vs_llvm.c
@@ -32,6 +32,7 @@
   *   Brian Paul
   */
 
+#include "util/u_memory.h"
 #include "pipe/p_shader_tokens.h"
 #include "draw_private.h"
 #include "draw_context.h"
diff --git a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
index e64bfb1c6c..3a4a41e544 100644
--- a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
+++ b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
@@ -46,6 +46,7 @@
 #include "tgsi/tgsi_dump.h"
 
 #include "util/u_memory.h"
+#include "util/u_math.h"
 
 #include <llvm/Module.h>
 #include <llvm/CallingConv.h>
diff --git a/src/gallium/auxiliary/gallivm/instructions.cpp b/src/gallium/auxiliary/gallivm/instructions.cpp
index a82dc30306..5fdfe09d18 100644
--- a/src/gallium/auxiliary/gallivm/instructions.cpp
+++ b/src/gallium/auxiliary/gallivm/instructions.cpp
@@ -83,6 +83,7 @@ Instructions::Instructions(llvm::Module *mod, llvm::Function *func, llvm::BasicB
    m_llvmPow   = 0;
    m_llvmFloor = 0;
    m_llvmFlog  = 0;
+   m_llvmFexp  = 0;
    m_llvmLit  = 0;
    m_fmtPtr = 0;
 
@@ -92,194 +93,247 @@ Instructions::Instructions(llvm::Module *mod, llvm::Function *func, llvm::BasicB
    m_mod = ParseBitcodeFile(buffer);
 }
 
-llvm::Value * Instructions::add(llvm::Value *in1, llvm::Value *in2)
+llvm::BasicBlock * Instructions::currentBlock() const
 {
-   return m_builder.CreateAdd(in1, in2, name("add"));
+   return m_builder.GetInsertBlock();
 }
 
-llvm::Value * Instructions::madd(llvm::Value *in1, llvm::Value *in2,
-                                 llvm::Value *in3)
+llvm::Value * Instructions::abs(llvm::Value *in)
 {
-   Value *mulRes = mul(in1, in2);
-   return add(mulRes, in3);
+   std::vector<llvm::Value*> vec = extractVector(in);
+   Value *xabs  = callFAbs(vec[0]);
+   Value *yabs  = callFAbs(vec[1]);
+   Value *zabs  = callFAbs(vec[2]);
+   Value *wabs  = callFAbs(vec[3]);
+   return vectorFromVals(xabs, yabs, zabs, wabs);
 }
- 
-llvm::Value * Instructions::mul(llvm::Value *in1, llvm::Value *in2)
+
+llvm::Value * Instructions::add(llvm::Value *in1, llvm::Value *in2)
 {
-   return m_builder.CreateMul(in1, in2, name("mul"));
+   return m_builder.CreateAdd(in1, in2, name("add"));
 }
 
-const char * Instructions::name(const char *prefix)
+llvm::Value * Instructions::arl(llvm::Value *in)
 {
-   ++m_idx;
-   snprintf(m_name, 32, "%s%d", prefix, m_idx);
-   return m_name;
+   return floor(in);
 }
 
-llvm::Value * Instructions::dp3(llvm::Value *in1, llvm::Value *in2)
+void Instructions::beginLoop()
 {
-   Value *mulRes = mul(in1, in2);
-   Value *x = m_builder.CreateExtractElement(mulRes,
-                                                          m_storage->constantInt(0),
-                                                          name("extractx"));
-   Value *y = m_builder.CreateExtractElement(mulRes,
-                                                          m_storage->constantInt(1),
-                                                          name("extracty"));
-   Value *z = m_builder.CreateExtractElement(mulRes,
-                                                          m_storage->constantInt(2),
-                                                          name("extractz"));
-   Value *xy = m_builder.CreateAdd(x, y,name("xy"));
-   Value *dot3 = m_builder.CreateAdd(xy, z, name("dot3"));
-   return vectorFromVals(dot3, dot3, dot3, dot3);
+   BasicBlock *begin = BasicBlock::Create(name("loop"), m_func,0);
+   BasicBlock *end = BasicBlock::Create(name("endloop"), m_func,0);
+
+   m_builder.CreateBr(begin);
+   Loop loop;
+   loop.begin = begin;
+   loop.end   = end;
+   m_builder.SetInsertPoint(begin);
+   m_loopStack.push(loop);
 }
 
-llvm::Value *Instructions::callFSqrt(llvm::Value *val)
+void Instructions::bgnSub(unsigned label)
 {
-   if (!m_llvmFSqrt) {
-      // predeclare the intrinsic
-      std::vector<const Type*> fsqrtArgs;
-      fsqrtArgs.push_back(Type::FloatTy);
-      PAListPtr fsqrtPal;
-      FunctionType* fsqrtType = FunctionType::get(
-         /*Result=*/Type::FloatTy,
-         /*Params=*/fsqrtArgs,
-         /*isVarArg=*/false);
-      m_llvmFSqrt = Function::Create(
-         /*Type=*/fsqrtType,
-         /*Linkage=*/GlobalValue::ExternalLinkage,
-         /*Name=*/"llvm.sqrt.f32", m_mod);
-      m_llvmFSqrt->setCallingConv(CallingConv::C);
-      m_llvmFSqrt->setParamAttrs(fsqrtPal);
-   }
-   CallInst *call = m_builder.CreateCall(m_llvmFSqrt, val,
-                                         name("sqrt"));
-   call->setCallingConv(CallingConv::C);
-   call->setTailCall(false);
-   return call;
+   llvm::Function *func = findFunction(label);
+
+   Function::arg_iterator args = func->arg_begin();
+   Value *ptr_INPUT = args++;
+   ptr_INPUT->setName("INPUT");
+   m_storage->pushArguments(ptr_INPUT);
+
+   llvm::BasicBlock *entry = BasicBlock::Create("entry", func, 0);
+
+   m_func = func;
+   m_builder.SetInsertPoint(entry);
 }
 
-llvm::Value * Instructions::rsq(llvm::Value *in1)
+void Instructions::brk()
 {
-   Value *x = m_builder.CreateExtractElement(in1,
-                                             m_storage->constantInt(0),
-                                             name("extractx"));
-   Value *abs  = callFAbs(x);
-   Value *sqrt = callFSqrt(abs);
+   assert(!m_loopStack.empty());
+   BasicBlock *unr = BasicBlock::Create(name("unreachable"), m_func,0);
+   m_builder.CreateBr(m_loopStack.top().end);
+   m_builder.SetInsertPoint(unr);
+}
 
-   Value *rsqrt = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)),
-                                       sqrt,
-                                       name("rsqrt"));
-   return vectorFromVals(rsqrt, rsqrt, rsqrt, rsqrt);
+void Instructions::cal(int label, llvm::Value *input)
+{
+   std::vector<Value*> params;
+   params.push_back(input);
+   llvm::Function *func = findFunction(label);
+
+   m_builder.CreateCall(func, params.begin(), params.end());
 }
 
-llvm::Value * Instructions::vectorFromVals(llvm::Value *x, llvm::Value *y,
-                                           llvm::Value *z, llvm::Value *w)
+llvm::Value * Instructions::clamp(llvm::Value *in1)
 {
-   Constant *const_vec = Constant::getNullValue(m_floatVecType);
-   Value *res = m_builder.CreateInsertElement(const_vec, x,
-                                              m_storage->constantInt(0),
-                                              name("vecx"));
-   res = m_builder.CreateInsertElement(res, y, m_storage->constantInt(1),
-                               name("vecxy"));
-   res = m_builder.CreateInsertElement(res, z, m_storage->constantInt(2),
-                               name("vecxyz"));
-   if (w)
-      res = m_builder.CreateInsertElement(res, w, m_storage->constantInt(3),
-                                          name("vecxyzw"));
-   return res;
+   return in1; /* FIXME: clamping is not implemented; pass the input through so the function does not fall off its end (UB) */
 }
 
-llvm::Value *Instructions::callFAbs(llvm::Value *val)
+llvm::Value * Instructions::cmp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
 {
-   if (!m_llvmFAbs) {
-      // predeclare the intrinsic
-      std::vector<const Type*> fabsArgs;
-      fabsArgs.push_back(Type::FloatTy);
-      PAListPtr fabsPal;
-      FunctionType* fabsType = FunctionType::get(
-         /*Result=*/Type::FloatTy,
-         /*Params=*/fabsArgs,
-         /*isVarArg=*/false);
-      m_llvmFAbs = Function::Create(
-         /*Type=*/fabsType,
-         /*Linkage=*/GlobalValue::ExternalLinkage,
-         /*Name=*/"fabs", m_mod);
-      m_llvmFAbs->setCallingConv(CallingConv::C);
-      m_llvmFAbs->setParamAttrs(fabsPal);
-   }
-   CallInst *call = m_builder.CreateCall(m_llvmFAbs, val,
-                                         name("fabs"));
-   call->setCallingConv(CallingConv::C);
+   llvm::Function *func = m_mod->getFunction("cmp");
+   assert(func);
+
+   std::vector<Value*> params;
+   params.push_back(in1);
+   params.push_back(in2);
+   params.push_back(in3);
+   CallInst *call = m_builder.CreateCall(func, params.begin(), params.end(), name("cmpres"));
    call->setTailCall(false);
    return call;
 }
 
-llvm::Value * Instructions::lit(llvm::Value *in)
+llvm::Value * Instructions::cnd(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
 {
-   if (!m_llvmLit) {
-      m_llvmLit = m_mod->getFunction("lit");
-   }
-   CallInst *call = m_builder.CreateCall(m_llvmLit, in, name("litres"));
-   call->setCallingConv(CallingConv::C);
-   call->setTailCall(false);
-   return call;
+   std::vector<llvm::Value*> vec1 = extractVector(in1);
+   std::vector<llvm::Value*> vec2 = extractVector(in2);
+   std::vector<llvm::Value*> vec3 = extractVector(in3);
+   Constant *half = ConstantFP::get(APFloat(0.5f));
+
+   Value *xcmp  = m_builder.CreateFCmpOGT(vec1[0], half, name("xcmp"));
+   Value *selx = m_builder.CreateSelect(xcmp, vec2[0], vec3[0],
+                                        name("selx"));
+
+   Value *ycmp  = m_builder.CreateFCmpOGT(vec1[1], half, name("ycmp"));
+   Value *sely = m_builder.CreateSelect(ycmp, vec2[1], vec3[1],
+                                        name("sely"));
+
+   Value *zcmp  = m_builder.CreateFCmpOGT(vec1[2], half, name("zcmp"));
+   Value *selz = m_builder.CreateSelect(zcmp, vec2[2], vec3[2],
+                                        name("selz"));
+
+   Value *wcmp  = m_builder.CreateFCmpOGT(vec1[3], half, name("wcmp"));
+   Value *selw = m_builder.CreateSelect(wcmp, vec2[3], vec3[3],
+                                        name("selw"));
+
+   return vectorFromVals(selx, sely, selz, selw);
 }
 
-llvm::Value * Instructions::sub(llvm::Value *in1, llvm::Value *in2)
+llvm::Value * Instructions::cnd0(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
 {
-   Value *res = m_builder.CreateSub(in1, in2, name("sub"));
-   return res;
+   std::vector<llvm::Value*> vec1 = extractVector(in1);
+   std::vector<llvm::Value*> vec2 = extractVector(in2);
+   std::vector<llvm::Value*> vec3 = extractVector(in3);
+   Constant *zero = Constant::getNullValue(Type::FloatTy);
+
+   Value *xcmp  = m_builder.CreateFCmpOGE(vec1[0], zero, name("xcmp"));
+   Value *selx = m_builder.CreateSelect(xcmp, vec2[0], vec3[0],
+                                        name("selx"));
+
+   Value *ycmp  = m_builder.CreateFCmpOGE(vec1[1], zero, name("ycmp"));
+   Value *sely = m_builder.CreateSelect(ycmp, vec2[1], vec3[1],
+                                        name("sely"));
+
+   Value *zcmp  = m_builder.CreateFCmpOGE(vec1[2], zero, name("zcmp"));
+   Value *selz = m_builder.CreateSelect(zcmp, vec2[2], vec3[2],
+                                        name("selz"));
+
+   Value *wcmp  = m_builder.CreateFCmpOGE(vec1[3], zero, name("wcmp"));
+   Value *selw = m_builder.CreateSelect(wcmp, vec2[3], vec3[3],
+                                        name("selw"));
+
+   return vectorFromVals(selx, sely, selz, selw);
 }
 
-llvm::Value * Instructions::callPow(llvm::Value *val1, llvm::Value *val2)
+llvm::Value * Instructions::cos(llvm::Value *in)
 {
-   if (!m_llvmPow) {
-      // predeclare the intrinsic
-      std::vector<const Type*> powArgs;
-      powArgs.push_back(Type::FloatTy);
-      powArgs.push_back(Type::FloatTy);
-      PAListPtr powPal;
-      FunctionType* powType = FunctionType::get(
-         /*Result=*/Type::FloatTy,
-         /*Params=*/powArgs,
-         /*isVarArg=*/false);
-      m_llvmPow = Function::Create(
-         /*Type=*/powType,
-         /*Linkage=*/GlobalValue::ExternalLinkage,
-         /*Name=*/"llvm.pow.f32", m_mod);
-      m_llvmPow->setCallingConv(CallingConv::C);
-      m_llvmPow->setParamAttrs(powPal);
-   }
-   std::vector<Value*> params;
-   params.push_back(val1);
-   params.push_back(val2);
-   CallInst *call = m_builder.CreateCall(m_llvmPow, params.begin(), params.end(),
-                                         name("pow"));
-   call->setCallingConv(CallingConv::C);
+#if 0
+   llvm::Function *func = m_mod->getFunction("vcos");
+   assert(func);
+
+   CallInst *call = m_builder.CreateCall(func, in, name("cosres"));
    call->setTailCall(false);
    return call;
+#else
+   std::vector<llvm::Value*> elems = extractVector(in);
+   Function *func = m_mod->getFunction("cosf");
+   assert(func);
+   CallInst *cos = m_builder.CreateCall(func, elems[0], name("cosres"));
+   cos->setCallingConv(CallingConv::C);
+   cos->setTailCall(true);
+   return vectorFromVals(cos, cos, cos, cos);
+#endif
 }
 
-llvm::Value * Instructions::pow(llvm::Value *in1, llvm::Value *in2)
+llvm::Value * Instructions::cross(llvm::Value *in1, llvm::Value *in2)
 {
    Value *x1 = m_builder.CreateExtractElement(in1,
                                               m_storage->constantInt(0),
                                               name("x1"));
+   Value *y1 = m_builder.CreateExtractElement(in1,
+                                              m_storage->constantInt(1),
+                                              name("y1"));
+   Value *z1 = m_builder.CreateExtractElement(in1,
+                                              m_storage->constantInt(2),
+                                              name("z1"));
+
    Value *x2 = m_builder.CreateExtractElement(in2,
                                               m_storage->constantInt(0),
                                               name("x2"));
-   llvm::Value *val = callPow(x1, x2);
-   return vectorFromVals(val, val, val, val);
+   Value *y2 = m_builder.CreateExtractElement(in2,
+                                              m_storage->constantInt(1),
+                                              name("y2"));
+   Value *z2 = m_builder.CreateExtractElement(in2,
+                                              m_storage->constantInt(2),
+                                              name("z2"));
+   Value *y1z2 = mul(y1, z2);
+   Value *z1y2 = mul(z1, y2);
+
+   Value *z1x2 = mul(z1, x2);
+   Value *x1z2 = mul(x1, z2);
+
+   Value *x1y2 = mul(x1, y2);
+   Value *y1x2 = mul(y1, x2);
+
+   return vectorFromVals(sub(y1z2, z1y2), sub(z1x2, x1z2), sub(x1y2, y1x2));
 }
 
-llvm::Value * Instructions::rcp(llvm::Value *in1)
+llvm::Value * Instructions::ddx(llvm::Value *in)
 {
-   Value *x1 = m_builder.CreateExtractElement(in1,
-                                              m_storage->constantInt(0),
-                                              name("x1"));
-   Value *res = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)),
-                                     x1, name("rcp"));
-   return vectorFromVals(res, res, res, res);
+   return constVector(0.f, 0.f, 0.f, 0.f); /* FIXME: ddx is not implemented; return zeros so the function does not fall off its end (UB) */
+}
+
+llvm::Value * Instructions::ddy(llvm::Value *in)
+{
+   return constVector(0.f, 0.f, 0.f, 0.f); /* FIXME: ddy is not implemented; return zeros so the function does not fall off its end (UB) */
+}
+
+llvm::Value * Instructions::div(llvm::Value *in1, llvm::Value *in2)
+{
+   return m_builder.CreateFDiv(in1, in2, name("div"));
+}
+
+/* Two-component dot product plus add: in1.x*in2.x + in1.y*in2.y + in3.x,
+ * with the scalar result replicated into all four channels. */
+llvm::Value * Instructions::dot2add(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
+{
+   Value *mulRes = mul(in1, in2);
+   Value *x = m_builder.CreateExtractElement(mulRes, m_storage->constantInt(0),
+                                             name("extractx"));
+   Value *y = m_builder.CreateExtractElement(mulRes, m_storage->constantInt(1),
+                                             name("extracty"));
+   /* The addend is channel 0 (x) of in3; TGSI DP2A adds src2.x, not src2.z. */
+   Value *addend = m_builder.CreateExtractElement(in3, m_storage->constantInt(0),
+                                                  name("extractadd"));
+   Value *xy = m_builder.CreateAdd(x, y,name("xy"));
+   Value *dot2add = m_builder.CreateAdd(xy, addend, name("dot2add"));
+   return vectorFromVals(dot2add, dot2add, dot2add, dot2add);
+}
+
+llvm::Value * Instructions::dp3(llvm::Value *in1, llvm::Value *in2)
+{
+   Value *mulRes = mul(in1, in2);
+   Value *x = m_builder.CreateExtractElement(mulRes,
+                                                          m_storage->constantInt(0),
+                                                          name("extractx"));
+   Value *y = m_builder.CreateExtractElement(mulRes,
+                                                          m_storage->constantInt(1),
+                                                          name("extracty"));
+   Value *z = m_builder.CreateExtractElement(mulRes,
+                                                          m_storage->constantInt(2),
+                                                          name("extractz"));
+   Value *xy = m_builder.CreateAdd(x, y,name("xy"));
+   Value *dot3 = m_builder.CreateAdd(xy, z, name("dot3"));
+   return vectorFromVals(dot3, dot3, dot3, dot3);
 }
 
 llvm::Value * Instructions::dp4(llvm::Value *in1, llvm::Value *in2)
@@ -302,23 +356,70 @@ llvm::Value * Instructions::dph(llvm::Value *in1, llvm::Value *in2)
    return vectorFromVals(dph, dph, dph, dph);
 }
 
-llvm::Value * Instructions::dst(llvm::Value *in1, llvm::Value *in2)
+llvm::Value * Instructions::dst(llvm::Value *in1, llvm::Value *in2)
+{
+   Value *y1 = m_builder.CreateExtractElement(in1,
+                                              m_storage->constantInt(1),
+                                              name("y1"));
+   Value *z = m_builder.CreateExtractElement(in1,
+                                             m_storage->constantInt(2),
+                                             name("z"));
+   Value *y2 = m_builder.CreateExtractElement(in2,
+                                              m_storage->constantInt(1),
+                                              name("y2"));
+   Value *w = m_builder.CreateExtractElement(in2,
+                                             m_storage->constantInt(3),
+                                             name("w"));
+   Value *ry = m_builder.CreateMul(y1, y2, name("tyuy"));
+   return vectorFromVals(ConstantFP::get(APFloat(1.f)),
+                         ry, z, w);
+}
+
+void Instructions::elseop()
+{
+   assert(!m_ifStack.empty());
+   BasicBlock *ifend = BasicBlock::Create(name("ifend"), m_func,0);
+   m_builder.CreateBr(ifend);
+   m_builder.SetInsertPoint(m_ifStack.top());
+   currentBlock()->setName(name("ifelse"));
+   m_ifStack.pop();
+   m_ifStack.push(ifend);
+}
+
+void Instructions::endif()
+{
+   assert(!m_ifStack.empty());
+   m_builder.CreateBr(m_ifStack.top());
+   m_builder.SetInsertPoint(m_ifStack.top());
+   m_ifStack.pop();
+}
+
+void Instructions::endLoop()
+{
+   assert(!m_loopStack.empty());
+   Loop loop = m_loopStack.top();
+   m_builder.CreateBr(loop.begin);
+   loop.end->moveAfter(currentBlock());
+   m_builder.SetInsertPoint(loop.end);
+   m_loopStack.pop();
+}
+
+void Instructions::end()
+{
+   m_builder.CreateRetVoid();
+}
+
+void Instructions::endSub()
+{
+   m_func = 0;
+   m_builder.SetInsertPoint(0);
+}
+
+llvm::Value * Instructions::exp(llvm::Value *in)
 {
-   Value *y1 = m_builder.CreateExtractElement(in1,
-                                              m_storage->constantInt(1),
-                                              name("y1"));
-   Value *z = m_builder.CreateExtractElement(in1,
-                                             m_storage->constantInt(2),
-                                             name("z"));
-   Value *y2 = m_builder.CreateExtractElement(in2,
-                                              m_storage->constantInt(1),
-                                              name("y2"));
-   Value *w = m_builder.CreateExtractElement(in2,
-                                             m_storage->constantInt(3),
-                                             name("w"));
-   Value *ry = m_builder.CreateMul(y1, y2, name("tyuy"));
-   return vectorFromVals(ConstantFP::get(APFloat(1.f)),
-                         ry, z, w);
+   std::vector<llvm::Value*> vec = extractVector(in);
+   return vectorFromVals(callFExp(vec[0]), callFExp(vec[1]),
+                             callFExp(vec[2]), callFExp(vec[3]));
 }
 
 llvm::Value * Instructions::ex2(llvm::Value *in)
@@ -330,31 +431,6 @@ llvm::Value * Instructions::ex2(llvm::Value *in)
    return vectorFromVals(val, val, val, val);
 }
 
-llvm::Value * Instructions::callFloor(llvm::Value *val)
-{
-   if (!m_llvmFloor) {
-      // predeclare the intrinsic
-      std::vector<const Type*> floorArgs;
-      floorArgs.push_back(Type::FloatTy);
-      PAListPtr floorPal;
-      FunctionType* floorType = FunctionType::get(
-         /*Result=*/Type::FloatTy,
-         /*Params=*/floorArgs,
-         /*isVarArg=*/false);
-      m_llvmFloor = Function::Create(
-         /*Type=*/floorType,
-         /*Linkage=*/GlobalValue::ExternalLinkage,
-         /*Name=*/"floorf", m_mod);
-      m_llvmFloor->setCallingConv(CallingConv::C);
-      m_llvmFloor->setParamAttrs(floorPal);
-   }
-   CallInst *call =  m_builder.CreateCall(m_llvmFloor, val,
-                                          name("floorf"));
-   call->setCallingConv(CallingConv::C);
-   call->setTailCall(false);
-   return call;
-}
-
 llvm::Value * Instructions::floor(llvm::Value *in)
 {
    std::vector<llvm::Value*> vec = extractVector(in);
@@ -362,42 +438,52 @@ llvm::Value * Instructions::floor(llvm::Value *in)
                          callFloor(vec[2]), callFloor(vec[3]));
 }
 
-llvm::Value * Instructions::arl(llvm::Value *in)
-{
-   return floor(in);
-}
-
 llvm::Value * Instructions::frc(llvm::Value *in)
 {
    llvm::Value *flr = floor(in);
    return sub(in, flr);
 }
 
-llvm::Value * Instructions::callFLog(llvm::Value *val)
+void Instructions::ifop(llvm::Value *in)
 {
-   if (!m_llvmFlog) {
-      // predeclare the intrinsic
-      std::vector<const Type*> flogArgs;
-      flogArgs.push_back(Type::FloatTy);
-      PAListPtr flogPal;
-      FunctionType* flogType = FunctionType::get(
-         /*Result=*/Type::FloatTy,
-         /*Params=*/flogArgs,
-         /*isVarArg=*/false);
-      m_llvmFlog = Function::Create(
-         /*Type=*/flogType,
-         /*Linkage=*/GlobalValue::ExternalLinkage,
-         /*Name=*/"logf", m_mod);
-      m_llvmFlog->setCallingConv(CallingConv::C);
-      m_llvmFlog->setParamAttrs(flogPal);
-   }
-   CallInst *call = m_builder.CreateCall(m_llvmFlog, val,
-                                         name("logf"));
-   call->setCallingConv(CallingConv::C);
+   BasicBlock *ifthen = BasicBlock::Create(name("ifthen"), m_func,0);
+   BasicBlock *ifend = BasicBlock::Create(name("ifthenend"), m_func,0);
+
+   //BasicBlock *yblock = new BasicBlock(name("yblock"), m_func,0);
+   //BasicBlock *zblock = new BasicBlock(name("zblock"), m_func,0);
+   //BasicBlock *wblock = new BasicBlock(name("wblock"), m_func,0);
+
+   Constant *float0 = Constant::getNullValue(Type::FloatTy);
+
+   Value *x = m_builder.CreateExtractElement(in, m_storage->constantInt(0),
+                                             name("extractx"));
+   Value *xcmp = m_builder.CreateFCmpUNE(x, float0, name("xcmp"));
+   m_builder.CreateCondBr(xcmp, ifthen, ifend);
+   //m_builder.SetInsertPoint(yblock);
+
+   m_builder.SetInsertPoint(ifthen);
+   m_ifStack.push(ifend);
+}
+
+llvm::Value * Instructions::kil(llvm::Value *in)
+{
+   llvm::Function *func = m_mod->getFunction("kil");
+   assert(func);
+
+   CallInst *call = m_builder.CreateCall(func, in, name("kilpres"));
    call->setTailCall(false);
    return call;
 }
 
+llvm::Value * Instructions::lerp(llvm::Value *in1, llvm::Value *in2,
+                                 llvm::Value *in3)
+{
+   llvm::Value *m = mul(in1, in2);
+   llvm::Value *vec1 = constVector(1.f, 1.f, 1.f, 1.f);
+   llvm::Value *s = sub(vec1, in1);
+   return add(m, mul(s, in3));
+}
+
 llvm::Value * Instructions::lg2(llvm::Value *in)
 {
    std::vector<llvm::Value*> vec = extractVector(in);
@@ -407,120 +493,192 @@ llvm::Value * Instructions::lg2(llvm::Value *in)
                              callFLog(vec[2]), callFLog(vec[3])), const_vec);
 }
 
-llvm::Value * Instructions::min(llvm::Value *in1, llvm::Value *in2)
+llvm::Value * Instructions::lit(llvm::Value *in)
+{
+   /* Emit a call to the module's "lit" function; the lookup is cached in m_llvmLit. */
+   if (!m_llvmLit) m_llvmLit = m_mod->getFunction("lit");
+   assert(m_llvmLit); /* getFunction() returns null when the module has no "lit" */
+   CallInst *call = m_builder.CreateCall(m_llvmLit, in, name("litres"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
+}
+
+llvm::Value * Instructions::log(llvm::Value *in)
+{
+   std::vector<llvm::Value*> vec = extractVector(in);
+   return vectorFromVals(callFLog(vec[0]), callFLog(vec[1]),
+                             callFLog(vec[2]), callFLog(vec[3]));
+}
+
+llvm::Value * Instructions::madd(llvm::Value *in1, llvm::Value *in2,
+                                 llvm::Value *in3)
+{
+   Value *mulRes = mul(in1, in2);
+   return add(mulRes, in3);
+}
+
+llvm::Value * Instructions::max(llvm::Value *in1, llvm::Value *in2)
 {
    std::vector<llvm::Value*> vec1 = extractVector(in1);
    std::vector<llvm::Value*> vec2 = extractVector(in2);
 
-   Value *xcmp  = m_builder.CreateFCmpOLT(vec1[0], vec2[0], name("xcmp"));
+   Value *xcmp  = m_builder.CreateFCmpOGT(vec1[0], vec2[0],
+                                          name("xcmp"));
    Value *selx = m_builder.CreateSelect(xcmp, vec1[0], vec2[0],
                                         name("selx"));
 
-   Value *ycmp  = m_builder.CreateFCmpOLT(vec1[1], vec2[1], name("ycmp"));
+   Value *ycmp  = m_builder.CreateFCmpOGT(vec1[1], vec2[1],
+                                          name("ycmp"));
    Value *sely = m_builder.CreateSelect(ycmp, vec1[1], vec2[1],
                                         name("sely"));
 
-   Value *zcmp  = m_builder.CreateFCmpOLT(vec1[2], vec2[2], name("zcmp"));
+   Value *zcmp  = m_builder.CreateFCmpOGT(vec1[2], vec2[2],
+                                          name("zcmp"));
    Value *selz = m_builder.CreateSelect(zcmp, vec1[2], vec2[2],
                                         name("selz"));
 
-   Value *wcmp  = m_builder.CreateFCmpOLT(vec1[3], vec2[3], name("wcmp"));
+   Value *wcmp  = m_builder.CreateFCmpOGT(vec1[3], vec2[3],
+                                          name("wcmp"));
    Value *selw = m_builder.CreateSelect(wcmp, vec1[3], vec2[3],
                                         name("selw"));
 
    return vectorFromVals(selx, sely, selz, selw);
 }
 
-llvm::Value * Instructions::max(llvm::Value *in1, llvm::Value *in2)
+llvm::Value * Instructions::min(llvm::Value *in1, llvm::Value *in2)
 {
    std::vector<llvm::Value*> vec1 = extractVector(in1);
    std::vector<llvm::Value*> vec2 = extractVector(in2);
 
-   Value *xcmp  = m_builder.CreateFCmpOGT(vec1[0], vec2[0],
-                                          name("xcmp"));
+   Value *xcmp  = m_builder.CreateFCmpOLT(vec1[0], vec2[0], name("xcmp"));
    Value *selx = m_builder.CreateSelect(xcmp, vec1[0], vec2[0],
                                         name("selx"));
 
-   Value *ycmp  = m_builder.CreateFCmpOGT(vec1[1], vec2[1],
-                                          name("ycmp"));
+   Value *ycmp  = m_builder.CreateFCmpOLT(vec1[1], vec2[1], name("ycmp"));
    Value *sely = m_builder.CreateSelect(ycmp, vec1[1], vec2[1],
                                         name("sely"));
 
-   Value *zcmp  = m_builder.CreateFCmpOGT(vec1[2], vec2[2],
-                                          name("zcmp"));
+   Value *zcmp  = m_builder.CreateFCmpOLT(vec1[2], vec2[2], name("zcmp"));
    Value *selz = m_builder.CreateSelect(zcmp, vec1[2], vec2[2],
                                         name("selz"));
 
-   Value *wcmp  = m_builder.CreateFCmpOGT(vec1[3], vec2[3],
-                                          name("wcmp"));
+   Value *wcmp  = m_builder.CreateFCmpOLT(vec1[3], vec2[3], name("wcmp"));
    Value *selw = m_builder.CreateSelect(wcmp, vec1[3], vec2[3],
                                         name("selw"));
 
    return vectorFromVals(selx, sely, selz, selw);
 }
 
-void Instructions::printVector(llvm::Value *val)
+llvm::Value * Instructions::mul(llvm::Value *in1, llvm::Value *in2)
 {
-   static const char *frmt = "Vector is [%f, %f, %f, %f]\x0A";
+   return m_builder.CreateMul(in1, in2, name("mul"));
+}
 
-   if (!m_fmtPtr) {
-      Constant *format = ConstantArray::get(frmt, true);
-      ArrayType *arrayTy = ArrayType::get(IntegerType::get(8), strlen(frmt) + 1);
-      GlobalVariable* globalFormat = new GlobalVariable(
-         /*Type=*/arrayTy,
-         /*isConstant=*/true,
-         /*Linkage=*/GlobalValue::InternalLinkage,
-         /*Initializer=*/0, // has initializer, specified below
-         /*Name=*/name(".str"),
-         m_mod);
-      globalFormat->setInitializer(format);
+llvm::Value * Instructions::neg(llvm::Value *in)
+{
+   Value *neg = m_builder.CreateNeg(in, name("neg"));
+   return neg;
+}
 
-      Constant* const_int0 = Constant::getNullValue(IntegerType::get(32));
-      std::vector<Constant*> const_ptr_21_indices;
-      const_ptr_21_indices.push_back(const_int0);
-      const_ptr_21_indices.push_back(const_int0);
-      m_fmtPtr = ConstantExpr::getGetElementPtr(globalFormat,
-                                                &const_ptr_21_indices[0], const_ptr_21_indices.size());
-   }
+llvm::Value * Instructions::pow(llvm::Value *in1, llvm::Value *in2)
+{
+   Value *x1 = m_builder.CreateExtractElement(in1,
+                                              m_storage->constantInt(0),
+                                              name("x1"));
+   Value *x2 = m_builder.CreateExtractElement(in2,
+                                              m_storage->constantInt(0),
+                                              name("x2"));
+   llvm::Value *val = callPow(x1, x2);
+   return vectorFromVals(val, val, val, val);
+}
 
-   Function *func_printf = m_mod->getFunction("printf");
-   if (!func_printf)
-      func_printf = declarePrintf();
-   assert(func_printf);
-   std::vector<llvm::Value*> vec = extractVector(val);
-   Value *dx = m_builder.CreateFPExt(vec[0], Type::DoubleTy, name("dx"));
-   Value *dy = m_builder.CreateFPExt(vec[1], Type::DoubleTy, name("dy"));
-   Value *dz = m_builder.CreateFPExt(vec[2], Type::DoubleTy, name("dz"));
-   Value *dw = m_builder.CreateFPExt(vec[3], Type::DoubleTy, name("dw"));
-   std::vector<Value*> params;
-   params.push_back(m_fmtPtr);
-   params.push_back(dx);
-   params.push_back(dy);
-   params.push_back(dz);
-   params.push_back(dw);
-   CallInst *call = m_builder.CreateCall(func_printf, params.begin(), params.end(),
-                                         name("printf"));
-   call->setCallingConv(CallingConv::C);
-   call->setTailCall(true);
+llvm::Value * Instructions::rcp(llvm::Value *in1)
+{
+   Value *x1 = m_builder.CreateExtractElement(in1,
+                                              m_storage->constantInt(0),
+                                              name("x1"));
+   Value *res = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)),
+                                     x1, name("rcp"));
+   return vectorFromVals(res, res, res, res);
+}
+
+llvm::Value * Instructions::rsq(llvm::Value *in1)
+{
+   Value *x = m_builder.CreateExtractElement(in1,
+                                             m_storage->constantInt(0),
+                                             name("extractx"));
+   Value *abs  = callFAbs(x);
+   Value *sqrt = callFSqrt(abs);
+
+   Value *rsqrt = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)),
+                                       sqrt,
+                                       name("rsqrt"));
+   return vectorFromVals(rsqrt, rsqrt, rsqrt, rsqrt);
+}
+
+llvm::Value * Instructions::scs(llvm::Value *in)
+{
+   llvm::Function *func = m_mod->getFunction("scs");
+   assert(func);
+
+   CallInst *call = m_builder.CreateCall(func, in, name("scsres"));
+   call->setTailCall(false);
+   return call;
+}
+
+llvm::Value * Instructions::seq(llvm::Value *in1, llvm::Value *in2)
+{
+   Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
+   Constant *const0f = Constant::getNullValue(Type::FloatTy);
+
+   std::vector<llvm::Value*> vec1 = extractVector(in1);
+   std::vector<llvm::Value*> vec2 = extractVector(in2);
+
+   Value *xcmp = m_builder.CreateFCmpOEQ(vec1[0], vec2[0], name("xcmp"));
+   Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
+
+   Value *ycmp = m_builder.CreateFCmpOEQ(vec1[1], vec2[1], name("ycmp"));
+   Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
+
+   Value *zcmp = m_builder.CreateFCmpOEQ(vec1[2], vec2[2], name("zcmp"));
+   Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
+
+   Value *wcmp = m_builder.CreateFCmpOEQ(vec1[3], vec2[3], name("wcmp"));
+   Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
+
+   return vectorFromVals(x, y, z, w);
 }
 
-llvm::Function * Instructions::declarePrintf()
+llvm::Value * Instructions::sfl(llvm::Value *in1, llvm::Value *in2)
 {
-   std::vector<const Type*> args;
-   PAListPtr params;
-   FunctionType* funcTy = FunctionType::get(
-      /*Result=*/IntegerType::get(32),
-      /*Params=*/args,
-      /*isVarArg=*/true);
-   Function* func_printf = Function::Create(
-      /*Type=*/funcTy,
-      /*Linkage=*/GlobalValue::ExternalLinkage,
-      /*Name=*/"printf", m_mod);
-   func_printf->setCallingConv(CallingConv::C);
-   func_printf->setParamAttrs(params);
-   return func_printf;
+   Constant *const0f = Constant::getNullValue(Type::FloatTy);
+
+   return vectorFromVals(const0f, const0f, const0f, const0f);
 }
 
+llvm::Value * Instructions::sge(llvm::Value *in1, llvm::Value *in2)
+{
+   Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
+   Constant *const0f = Constant::getNullValue(Type::FloatTy);
+
+   std::vector<llvm::Value*> vec1 = extractVector(in1);
+   std::vector<llvm::Value*> vec2 = extractVector(in2);
+
+   Value *xcmp = m_builder.CreateFCmpOGE(vec1[0], vec2[0], name("xcmp"));
+   Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
+
+   Value *ycmp = m_builder.CreateFCmpOGE(vec1[1], vec2[1], name("ycmp"));
+   Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
+
+   Value *zcmp = m_builder.CreateFCmpOGE(vec1[2], vec2[2], name("zcmp"));
+   Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
+
+   Value *wcmp = m_builder.CreateFCmpOGE(vec1[3], vec2[3], name("wcmp"));
+   Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
+
+   return vectorFromVals(x, y, z, w);
+}
 
 llvm::Value * Instructions::sgt(llvm::Value *in1, llvm::Value *in2)
 {
@@ -543,7 +701,18 @@ llvm::Value * Instructions::sgt(llvm::Value *in1, llvm::Value *in2)
 
    return vectorFromVals(x, y, z, w);
 }
-llvm::Value * Instructions::sge(llvm::Value *in1, llvm::Value *in2)
+
+llvm::Value * Instructions::sin(llvm::Value *in)
+{
+   llvm::Function *func = m_mod->getFunction("vsin");
+   assert(func);
+
+   CallInst *call = m_builder.CreateCall(func, in, name("sinres"));
+   call->setTailCall(false);
+   return call;
+}
+
+llvm::Value * Instructions::sle(llvm::Value *in1, llvm::Value *in2)
 {
    Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
    Constant *const0f = Constant::getNullValue(Type::FloatTy);
@@ -551,22 +720,21 @@ llvm::Value * Instructions::sge(llvm::Value *in1, llvm::Value *in2)
    std::vector<llvm::Value*> vec1 = extractVector(in1);
    std::vector<llvm::Value*> vec2 = extractVector(in2);
 
-   Value *xcmp = m_builder.CreateFCmpOGE(vec1[0], vec2[0], name("xcmp"));
+   Value *xcmp = m_builder.CreateFCmpOLE(vec1[0], vec2[0], name("xcmp"));
    Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
 
-   Value *ycmp = m_builder.CreateFCmpOGE(vec1[1], vec2[1], name("ycmp"));
+   Value *ycmp = m_builder.CreateFCmpOLE(vec1[1], vec2[1], name("ycmp"));
    Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
 
-   Value *zcmp = m_builder.CreateFCmpOGE(vec1[2], vec2[2], name("zcmp"));
+   Value *zcmp = m_builder.CreateFCmpOLE(vec1[2], vec2[2], name("zcmp"));
    Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
 
-   Value *wcmp = m_builder.CreateFCmpOGE(vec1[3], vec2[3], name("wcmp"));
+   Value *wcmp = m_builder.CreateFCmpOLE(vec1[3], vec2[3], name("wcmp"));
    Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
 
    return vectorFromVals(x, y, z, w);
 }
 
-
 llvm::Value * Instructions::slt(llvm::Value *in1, llvm::Value *in2)
 {
    Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
@@ -590,169 +758,331 @@ llvm::Value * Instructions::slt(llvm::Value *in1, llvm::Value *in2)
    return vectorFromVals(x, y, z, w);
 }
 
-llvm::Value * Instructions::cross(llvm::Value *in1, llvm::Value *in2)
+llvm::Value * Instructions::sne(llvm::Value *in1, llvm::Value *in2)
 {
-   Value *x1 = m_builder.CreateExtractElement(in1,
-                                              m_storage->constantInt(0),
-                                              name("x1"));
-   Value *y1 = m_builder.CreateExtractElement(in1,
-                                              m_storage->constantInt(1),
-                                              name("y1"));
-   Value *z1 = m_builder.CreateExtractElement(in1,
-                                              m_storage->constantInt(2),
-                                              name("z1"));
+   Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
+   Constant *const0f = Constant::getNullValue(Type::FloatTy);
 
-   Value *x2 = m_builder.CreateExtractElement(in2,
-                                              m_storage->constantInt(0),
-                                              name("x2"));
-   Value *y2 = m_builder.CreateExtractElement(in2,
-                                              m_storage->constantInt(1),
-                                              name("y2"));
-   Value *z2 = m_builder.CreateExtractElement(in2,
-                                              m_storage->constantInt(2),
-                                              name("z2"));
-   Value *y1z2 = mul(y1, z2);
-   Value *z1y2 = mul(z1, y2);
+   std::vector<llvm::Value*> vec1 = extractVector(in1);
+   std::vector<llvm::Value*> vec2 = extractVector(in2);
 
-   Value *z1x2 = mul(z1, x2);
-   Value *x1z2 = mul(x1, z2);
+   Value *xcmp = m_builder.CreateFCmpONE(vec1[0], vec2[0], name("xcmp"));
+   Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
 
-   Value *x1y2 = mul(x1, y2);
-   Value *y1x2 = mul(y1, x2);
+   Value *ycmp = m_builder.CreateFCmpONE(vec1[1], vec2[1], name("ycmp"));
+   Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
 
-   return vectorFromVals(sub(y1z2, z1y2), sub(z1x2, x1z2), sub(x1y2, y1x2));
+   Value *zcmp = m_builder.CreateFCmpONE(vec1[2], vec2[2], name("zcmp"));
+   Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
+
+   Value *wcmp = m_builder.CreateFCmpONE(vec1[3], vec2[3], name("wcmp"));
+   Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
+
+   return vectorFromVals(x, y, z, w);
 }
 
+llvm::Value * Instructions::str(llvm::Value *in1, llvm::Value *in2)
+{
+   Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
 
-llvm::Value * Instructions::abs(llvm::Value *in)
+   return vectorFromVals(const1f, const1f, const1f, const1f);
+}
+
+llvm::Value * Instructions::sub(llvm::Value *in1, llvm::Value *in2)
+{
+   Value *res = m_builder.CreateSub(in1, in2, name("sub"));
+   return res;
+}
+
+llvm::Value * Instructions::trunc(llvm::Value *in)
 {
    std::vector<llvm::Value*> vec = extractVector(in);
-   Value *xabs  = callFAbs(vec[0]);
-   Value *yabs  = callFAbs(vec[1]);
-   Value *zabs  = callFAbs(vec[2]);
-   Value *wabs  = callFAbs(vec[3]);
-   return vectorFromVals(xabs, yabs, zabs, wabs);
+   Value *icastx = m_builder.CreateFPToSI(vec[0], IntegerType::get(32),
+                                          name("ftoix"));
+   Value *icasty = m_builder.CreateFPToSI(vec[1], IntegerType::get(32),
+                                          name("ftoiy"));
+   Value *icastz = m_builder.CreateFPToSI(vec[2], IntegerType::get(32),
+                                          name("ftoiz"));
+   Value *icastw = m_builder.CreateFPToSI(vec[3], IntegerType::get(32),
+                                          name("ftoiw"));
+   Value *fx = m_builder.CreateSIToFP(icastx, Type::FloatTy,
+                                      name("fx"));
+   Value *fy = m_builder.CreateSIToFP(icasty, Type::FloatTy,
+                                      name("fy"));
+   Value *fz = m_builder.CreateSIToFP(icastz, Type::FloatTy,
+                                      name("fz"));
+   Value *fw = m_builder.CreateSIToFP(icastw, Type::FloatTy,
+                                      name("fw"));
+   return vectorFromVals(fx, fy, fz, fw);
 }
 
-void Instructions::ifop(llvm::Value *in)
+llvm::Value * Instructions::x2d(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
 {
-   BasicBlock *ifthen = BasicBlock::Create(name("ifthen"), m_func,0);
-   BasicBlock *ifend = BasicBlock::Create(name("ifthenend"), m_func,0);
+   std::vector<llvm::Value*> vec1 = extractVector(in1);
+   std::vector<llvm::Value*> vec2 = extractVector(in2);
+   std::vector<llvm::Value*> vec3 = extractVector(in3);
 
-   //BasicBlock *yblock = new BasicBlock(name("yblock"), m_func,0);
-   //BasicBlock *zblock = new BasicBlock(name("zblock"), m_func,0);
-   //BasicBlock *wblock = new BasicBlock(name("wblock"), m_func,0);
+   Value *x2x3 = m_builder.CreateMul( vec2[0], vec3[0], name("x2x3"));
+   Value *y2y3 = m_builder.CreateMul( vec2[1], vec3[1], name("y2y3"));
+   Value *x1px2x3 = m_builder.CreateAdd (vec1[0], x2x3, name("x1 + x2x3"));
+   Value *x1px2x3py2y3 = m_builder.CreateAdd (x1px2x3, y2y3, name("x1 + x2x3 + y2y3"));
 
-   Constant *float0 = Constant::getNullValue(Type::FloatTy);
+   Value *x2z3 = m_builder.CreateMul( vec2[0], vec3[2], name("x2z3"));
+   Value *y2w3 = m_builder.CreateMul( vec2[1], vec3[3], name("y2w3"));
+   Value *y1px2z3 = m_builder.CreateAdd (vec1[1], x2z3, name("y1 + x2z3"));
+   Value *y1px2z3py2w3 = m_builder.CreateAdd (y1px2z3, y2w3, name("y1 + x2z3 + y2w3"));
 
-   Value *x = m_builder.CreateExtractElement(in, m_storage->constantInt(0),
-                                             name("extractx"));
-   Value *xcmp = m_builder.CreateFCmpUNE(x, float0, name("xcmp"));
-   m_builder.CreateCondBr(xcmp, ifthen, ifend);
-   //m_builder.SetInsertPoint(yblock);
+   return vectorFromVals(x1px2x3py2y3, y1px2z3py2w3, x1px2x3py2y3, y1px2z3py2w3);
+}
 
-   m_builder.SetInsertPoint(ifthen);
-   m_ifStack.push(ifend);
+void Instructions::printVector(llvm::Value *val)
+{
+   static const char *frmt = "Vector is [%f, %f, %f, %f]\x0A";
+
+   if (!m_fmtPtr) {
+      Constant *format = ConstantArray::get(frmt, true);
+      ArrayType *arrayTy = ArrayType::get(IntegerType::get(8), strlen(frmt) + 1);
+      GlobalVariable* globalFormat = new GlobalVariable(
+         /*Type=*/arrayTy,
+         /*isConstant=*/true,
+         /*Linkage=*/GlobalValue::InternalLinkage,
+         /*Initializer=*/0, // has initializer, specified below
+         /*Name=*/name(".str"),
+         m_mod);
+      globalFormat->setInitializer(format);
+
+      Constant* const_int0 = Constant::getNullValue(IntegerType::get(32));
+      std::vector<Constant*> const_ptr_21_indices;
+      const_ptr_21_indices.push_back(const_int0);
+      const_ptr_21_indices.push_back(const_int0);
+      m_fmtPtr = ConstantExpr::getGetElementPtr(globalFormat,
+                                                &const_ptr_21_indices[0], const_ptr_21_indices.size());
+   }
+
+   Function *func_printf = m_mod->getFunction("printf");
+   if (!func_printf)
+      func_printf = declarePrintf();
+   assert(func_printf);
+   std::vector<llvm::Value*> vec = extractVector(val);
+   Value *dx = m_builder.CreateFPExt(vec[0], Type::DoubleTy, name("dx"));
+   Value *dy = m_builder.CreateFPExt(vec[1], Type::DoubleTy, name("dy"));
+   Value *dz = m_builder.CreateFPExt(vec[2], Type::DoubleTy, name("dz"));
+   Value *dw = m_builder.CreateFPExt(vec[3], Type::DoubleTy, name("dw"));
+   std::vector<Value*> params;
+   params.push_back(m_fmtPtr);
+   params.push_back(dx);
+   params.push_back(dy);
+   params.push_back(dz);
+   params.push_back(dw);
+   CallInst *call = m_builder.CreateCall(func_printf, params.begin(), params.end(),
+                                         name("printf"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(true);
 }
 
-llvm::BasicBlock * Instructions::currentBlock() const
+const char * Instructions::name(const char *prefix)
 {
-   return m_builder.GetInsertBlock();
+   ++m_idx;
+   snprintf(m_name, 32, "%s%d", prefix, m_idx);
+   return m_name;
 }
 
-void Instructions::elseop()
+llvm::Value *Instructions::callFAbs(llvm::Value *val)
 {
-   assert(!m_ifStack.empty());
-   BasicBlock *ifend = BasicBlock::Create(name("ifend"), m_func,0);
-   m_builder.CreateBr(ifend);
-   m_builder.SetInsertPoint(m_ifStack.top());
-   currentBlock()->setName(name("ifelse"));
-   m_ifStack.pop();
-   m_ifStack.push(ifend);
+   if (!m_llvmFAbs) {
+      // predeclare the intrinsic
+      std::vector<const Type*> fabsArgs;
+      fabsArgs.push_back(Type::FloatTy);
+      PAListPtr fabsPal;
+      FunctionType* fabsType = FunctionType::get(
+         /*Result=*/Type::FloatTy,
+         /*Params=*/fabsArgs,
+         /*isVarArg=*/false);
+      m_llvmFAbs = Function::Create(
+         /*Type=*/fabsType,
+         /*Linkage=*/GlobalValue::ExternalLinkage,
+         /*Name=*/"fabs", m_mod);
+      m_llvmFAbs->setCallingConv(CallingConv::C);
+      m_llvmFAbs->setParamAttrs(fabsPal);
+   }
+   CallInst *call = m_builder.CreateCall(m_llvmFAbs, val,
+                                         name("fabs"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
 }
 
-void Instructions::endif()
+llvm::Value * Instructions::callFExp(llvm::Value *val)
 {
-   assert(!m_ifStack.empty());
-   m_builder.CreateBr(m_ifStack.top());
-   m_builder.SetInsertPoint(m_ifStack.top());
-   m_ifStack.pop();
+   if (!m_llvmFexp) {
+      // predeclare the intrinsic
+      std::vector<const Type*> fexpArgs;
+      fexpArgs.push_back(Type::FloatTy);
+      PAListPtr fexpPal;
+      FunctionType* fexpType = FunctionType::get(
+         /*Result=*/Type::FloatTy,
+         /*Params=*/fexpArgs,
+         /*isVarArg=*/false);
+      m_llvmFexp = Function::Create(
+         /*Type=*/fexpType,
+         /*Linkage=*/GlobalValue::ExternalLinkage,
+         /*Name=*/"expf", m_mod);
+      m_llvmFexp->setCallingConv(CallingConv::C);
+      m_llvmFexp->setParamAttrs(fexpPal);
+   }
+   CallInst *call = m_builder.CreateCall(m_llvmFexp, val,
+                                         name("expf"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
 }
 
-llvm::Value * Instructions::lerp(llvm::Value *in1, llvm::Value *in2,
-                                 llvm::Value *in3)
+llvm::Value * Instructions::callFLog(llvm::Value *val)
 {
-   llvm::Value *m = mul(in1, in2);
-   llvm::Value *vec1 = constVector(1.f, 1.f, 1.f, 1.f);
-   llvm::Value *s = sub(vec1, in1);
-   return add(m, mul(s, in3));
+   if (!m_llvmFlog) {
+      // predeclare the intrinsic
+      std::vector<const Type*> flogArgs;
+      flogArgs.push_back(Type::FloatTy);
+      PAListPtr flogPal;
+      FunctionType* flogType = FunctionType::get(
+         /*Result=*/Type::FloatTy,
+         /*Params=*/flogArgs,
+         /*isVarArg=*/false);
+      m_llvmFlog = Function::Create(
+         /*Type=*/flogType,
+         /*Linkage=*/GlobalValue::ExternalLinkage,
+         /*Name=*/"logf", m_mod);
+      m_llvmFlog->setCallingConv(CallingConv::C);
+      m_llvmFlog->setParamAttrs(flogPal);
+   }
+   CallInst *call = m_builder.CreateCall(m_llvmFlog, val,
+                                         name("logf"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
 }
 
-void Instructions::beginLoop()
+llvm::Value * Instructions::callFloor(llvm::Value *val)
 {
-   BasicBlock *begin = BasicBlock::Create(name("loop"), m_func,0);
-   BasicBlock *end = BasicBlock::Create(name("endloop"), m_func,0);
-
-   m_builder.CreateBr(begin);
-   Loop loop;
-   loop.begin = begin;
-   loop.end   = end;
-   m_builder.SetInsertPoint(begin);
-   m_loopStack.push(loop);
+   if (!m_llvmFloor) {
+      // predeclare the intrinsic
+      std::vector<const Type*> floorArgs;
+      floorArgs.push_back(Type::FloatTy);
+      PAListPtr floorPal;
+      FunctionType* floorType = FunctionType::get(
+         /*Result=*/Type::FloatTy,
+         /*Params=*/floorArgs,
+         /*isVarArg=*/false);
+      m_llvmFloor = Function::Create(
+         /*Type=*/floorType,
+         /*Linkage=*/GlobalValue::ExternalLinkage,
+         /*Name=*/"floorf", m_mod);
+      m_llvmFloor->setCallingConv(CallingConv::C);
+      m_llvmFloor->setParamAttrs(floorPal);
+   }
+   CallInst *call =  m_builder.CreateCall(m_llvmFloor, val,
+                                          name("floorf"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
 }
 
-void Instructions::endLoop()
+llvm::Value *Instructions::callFSqrt(llvm::Value *val)
 {
-   assert(!m_loopStack.empty());
-   Loop loop = m_loopStack.top();
-   m_builder.CreateBr(loop.begin);
-   loop.end->moveAfter(currentBlock());
-   m_builder.SetInsertPoint(loop.end);
-   m_loopStack.pop();
+   if (!m_llvmFSqrt) {
+      // predeclare the intrinsic
+      std::vector<const Type*> fsqrtArgs;
+      fsqrtArgs.push_back(Type::FloatTy);
+      PAListPtr fsqrtPal;
+      FunctionType* fsqrtType = FunctionType::get(
+         /*Result=*/Type::FloatTy,
+         /*Params=*/fsqrtArgs,
+         /*isVarArg=*/false);
+      m_llvmFSqrt = Function::Create(
+         /*Type=*/fsqrtType,
+         /*Linkage=*/GlobalValue::ExternalLinkage,
+         /*Name=*/"llvm.sqrt.f32", m_mod);
+      m_llvmFSqrt->setCallingConv(CallingConv::C);
+      m_llvmFSqrt->setParamAttrs(fsqrtPal);
+   }
+   CallInst *call = m_builder.CreateCall(m_llvmFSqrt, val,
+                                         name("sqrt"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
 }
 
-void Instructions::brk()
+llvm::Value * Instructions::callPow(llvm::Value *val1, llvm::Value *val2)
 {
-   assert(!m_loopStack.empty());
-   BasicBlock *unr = BasicBlock::Create(name("unreachable"), m_func,0);
-   m_builder.CreateBr(m_loopStack.top().end);
-   m_builder.SetInsertPoint(unr);
+   if (!m_llvmPow) {
+      // predeclare the intrinsic
+      std::vector<const Type*> powArgs;
+      powArgs.push_back(Type::FloatTy);
+      powArgs.push_back(Type::FloatTy);
+      PAListPtr powPal;
+      FunctionType* powType = FunctionType::get(
+         /*Result=*/Type::FloatTy,
+         /*Params=*/powArgs,
+         /*isVarArg=*/false);
+      m_llvmPow = Function::Create(
+         /*Type=*/powType,
+         /*Linkage=*/GlobalValue::ExternalLinkage,
+         /*Name=*/"llvm.pow.f32", m_mod);
+      m_llvmPow->setCallingConv(CallingConv::C);
+      m_llvmPow->setParamAttrs(powPal);
+   }
+   std::vector<Value*> params;
+   params.push_back(val1);
+   params.push_back(val2);
+   CallInst *call = m_builder.CreateCall(m_llvmPow, params.begin(), params.end(),
+                                         name("pow"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
 }
 
-llvm::Value * Instructions::trunc(llvm::Value *in)
+llvm::Value * Instructions::vectorFromVals(llvm::Value *x, llvm::Value *y,
+                                           llvm::Value *z, llvm::Value *w)
 {
-   std::vector<llvm::Value*> vec = extractVector(in);
-   Value *icastx = m_builder.CreateFPToSI(vec[0], IntegerType::get(32),
-                                          name("ftoix"));
-   Value *icasty = m_builder.CreateFPToSI(vec[1], IntegerType::get(32),
-                                          name("ftoiy"));
-   Value *icastz = m_builder.CreateFPToSI(vec[2], IntegerType::get(32),
-                                          name("ftoiz"));
-   Value *icastw = m_builder.CreateFPToSI(vec[3], IntegerType::get(32),
-                                          name("ftoiw"));
-   Value *fx = m_builder.CreateSIToFP(icastx, Type::FloatTy,
-                                      name("fx"));
-   Value *fy = m_builder.CreateSIToFP(icasty, Type::FloatTy,
-                                      name("fy"));
-   Value *fz = m_builder.CreateSIToFP(icastz, Type::FloatTy,
-                                      name("fz"));
-   Value *fw = m_builder.CreateSIToFP(icastw, Type::FloatTy,
-                                      name("fw"));
-   return vectorFromVals(fx, fy, fz, fw);
+   Constant *const_vec = Constant::getNullValue(m_floatVecType);
+   Value *res = m_builder.CreateInsertElement(const_vec, x,
+                                              m_storage->constantInt(0),
+                                              name("vecx"));
+   res = m_builder.CreateInsertElement(res, y, m_storage->constantInt(1),
+                               name("vecxy"));
+   res = m_builder.CreateInsertElement(res, z, m_storage->constantInt(2),
+                               name("vecxyz"));
+   if (w)
+      res = m_builder.CreateInsertElement(res, w, m_storage->constantInt(3),
+                                          name("vecxyzw"));
+   return res;
 }
 
-void Instructions::end()
+llvm::Value * Instructions::constVector(float x, float y, float z, float w)
 {
-   m_builder.CreateRetVoid();
+   std::vector<Constant*> vec(4);
+   vec[0] = ConstantFP::get(APFloat(x));
+   vec[1] = ConstantFP::get(APFloat(y));
+   vec[2] = ConstantFP::get(APFloat(z));
+   vec[3] = ConstantFP::get(APFloat(w));
+   return ConstantVector::get(m_floatVecType, vec);
 }
 
-void Instructions::cal(int label, llvm::Value *input)
+llvm::Function * Instructions::declarePrintf()
 {
-   std::vector<Value*> params;
-   params.push_back(input);
-   llvm::Function *func = findFunction(label);
-
-   m_builder.CreateCall(func, params.begin(), params.end());
+   std::vector<const Type*> args;
+   PAListPtr params;
+   FunctionType* funcTy = FunctionType::get(
+      /*Result=*/IntegerType::get(32),
+      /*Params=*/args,
+      /*isVarArg=*/true);
+   Function* func_printf = Function::Create(
+      /*Type=*/funcTy,
+      /*Linkage=*/GlobalValue::ExternalLinkage,
+      /*Name=*/"printf", m_mod);
+   func_printf->setCallingConv(CallingConv::C);
+   func_printf->setParamAttrs(params);
+   return func_printf;
 }
 
 llvm::Function * Instructions::declareFunc(int label)
@@ -778,27 +1108,6 @@ llvm::Function * Instructions::declareFunc(int label)
    return func;
 }
 
-void Instructions::bgnSub(unsigned label)
-{
-   llvm::Function *func = findFunction(label);
-
-   Function::arg_iterator args = func->arg_begin();
-   Value *ptr_INPUT = args++;
-   ptr_INPUT->setName("INPUT");
-   m_storage->pushArguments(ptr_INPUT);
-
-   llvm::BasicBlock *entry = BasicBlock::Create("entry", func, 0);
-
-   m_func = func;
-   m_builder.SetInsertPoint(entry);
-}
-
-void Instructions::endSub()
-{
-   m_func = 0;
-   m_builder.SetInsertPoint(0);
-}
-
 llvm::Function * Instructions::findFunction(int label)
 {
    llvm::Function *func = m_functions[label];
@@ -809,17 +1118,6 @@ llvm::Function * Instructions::findFunction(int label)
    return func;
 }
 
-llvm::Value * Instructions::constVector(float x, float y, float z, float w)
-{
-   std::vector<Constant*> vec(4);
-   vec[0] = ConstantFP::get(APFloat(x));
-   vec[1] = ConstantFP::get(APFloat(y));
-   vec[2] = ConstantFP::get(APFloat(z));
-   vec[3] = ConstantFP::get(APFloat(w));
-   return ConstantVector::get(m_floatVecType, vec);
-}
-
-
 std::vector<llvm::Value*> Instructions::extractVector(llvm::Value *vec)
 {
    std::vector<llvm::Value*> elems(4);
@@ -834,69 +1132,7 @@ std::vector<llvm::Value*> Instructions::extractVector(llvm::Value *vec)
    return elems;
 }
 
-llvm::Value * Instructions::cmp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
-{
-   llvm::Function *func = m_mod->getFunction("cmp");
-   assert(func);
-
-   std::vector<Value*> params;
-   params.push_back(in1);
-   params.push_back(in2);
-   params.push_back(in3);
-   CallInst *call = m_builder.CreateCall(func, params.begin(), params.end(), name("cmpres"));
-   call->setTailCall(false);
-   return call;
-}
-
-llvm::Value * Instructions::cos(llvm::Value *in)
-{
-#if 0
-   llvm::Function *func = m_mod->getFunction("vcos");
-   assert(func);
-
-   CallInst *call = m_builder.CreateCall(func, in, name("cosres"));
-   call->setTailCall(false);
-   return call;
-#else
-   std::vector<llvm::Value*> elems = extractVector(in);
-   Function *func = m_mod->getFunction("cosf");
-   assert(func);
-   CallInst *cos = m_builder.CreateCall(func, elems[0], name("cosres"));
-   cos->setCallingConv(CallingConv::C);
-   cos->setTailCall(true);
-   return vectorFromVals(cos, cos, cos, cos);
-#endif
-}
-
-llvm::Value * Instructions::scs(llvm::Value *in)
-{
-   llvm::Function *func = m_mod->getFunction("scs");
-   assert(func);
-
-   CallInst *call = m_builder.CreateCall(func, in, name("scsres"));
-   call->setTailCall(false);
-   return call;
-}
-
-llvm::Value * Instructions::kil(llvm::Value *in)
-{
-   llvm::Function *func = m_mod->getFunction("kil");
-   assert(func);
-
-   CallInst *call = m_builder.CreateCall(func, in, name("kilpres"));
-   call->setTailCall(false);
-   return call;
-}
-
-llvm::Value * Instructions::sin(llvm::Value *in)
-{
-   llvm::Function *func = m_mod->getFunction("vsin");
-   assert(func);
 
-   CallInst *call = m_builder.CreateCall(func, in, name("sinres"));
-   call->setTailCall(false);
-   return call;
-}
 #endif //MESA_LLVM
 
 
diff --git a/src/gallium/auxiliary/gallivm/instructions.h b/src/gallium/auxiliary/gallivm/instructions.h
index d286ce80c7..8df30f62c8 100644
--- a/src/gallium/auxiliary/gallivm/instructions.h
+++ b/src/gallium/auxiliary/gallivm/instructions.h
@@ -57,15 +57,22 @@ public:
    llvm::BasicBlock *currentBlock() const;
 
    llvm::Value *abs(llvm::Value *in1);
-   llvm::Value *arl(llvm::Value *in1);
    llvm::Value *add(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *arl(llvm::Value *in1);
    void         beginLoop();
    void         bgnSub(unsigned);
    void         brk();
    void         cal(int label, llvm::Value *input);
+   llvm::Value *clamp(llvm::Value *in);
    llvm::Value *cmp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3);
+   llvm::Value *cnd(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3);
+   llvm::Value *cnd0(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3);
    llvm::Value *cos(llvm::Value *in);
    llvm::Value *cross(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *ddx(llvm::Value *in);
+   llvm::Value *ddy(llvm::Value *in);
+   llvm::Value *div(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *dot2add(llvm::Value *in, llvm::Value *in2, llvm::Value *in3);
    llvm::Value *dp3(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *dp4(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *dph(llvm::Value *in1, llvm::Value *in2);
@@ -75,6 +82,7 @@ public:
    void         endLoop();
    void         end();
    void         endSub();
+   llvm::Value *exp(llvm::Value *in);
    llvm::Value *ex2(llvm::Value *in);
    llvm::Value *floor(llvm::Value *in);
    llvm::Value *frc(llvm::Value *in);
@@ -82,32 +90,41 @@ public:
    llvm::Value *kil(llvm::Value *in);
    llvm::Value *lerp(llvm::Value *in1, llvm::Value *in2,
                      llvm::Value *in3);
-   llvm::Value *lit(llvm::Value *in);
    llvm::Value *lg2(llvm::Value *in);
+   llvm::Value *lit(llvm::Value *in);
+   llvm::Value *log(llvm::Value *in);
    llvm::Value *madd(llvm::Value *in1, llvm::Value *in2,
                      llvm::Value *in3);
-   llvm::Value *min(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *max(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *min(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *mul(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *neg(llvm::Value *in);
    llvm::Value *pow(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *rcp(llvm::Value *in);
    llvm::Value *rsq(llvm::Value *in);
    llvm::Value *scs(llvm::Value *in);
+   llvm::Value *seq(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *sfl(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *sge(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *sgt(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *sin(llvm::Value *in);
+   llvm::Value *sle(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *slt(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *sne(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *str(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *sub(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *trunc(llvm::Value *in);
+   llvm::Value *x2d(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3);
 
    void printVector(llvm::Value *val);
 private:
    const char *name(const char *prefix);
 
    llvm::Value *callFAbs(llvm::Value *val);
+   llvm::Value *callFExp(llvm::Value *val);
+   llvm::Value *callFLog(llvm::Value *val);
    llvm::Value *callFloor(llvm::Value *val);
    llvm::Value *callFSqrt(llvm::Value *val);
-   llvm::Value *callFLog(llvm::Value *val);
    llvm::Value *callPow(llvm::Value *val1, llvm::Value *val2);
 
    llvm::Value *vectorFromVals(llvm::Value *x, llvm::Value *y,
@@ -135,6 +152,7 @@ private:
    llvm::Function   *m_llvmPow;
    llvm::Function   *m_llvmFloor;
    llvm::Function   *m_llvmFlog;
+   llvm::Function   *m_llvmFexp;
    llvm::Function   *m_llvmLit;
 
    llvm::Constant   *m_fmtPtr;
diff --git a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
index cc1516a45e..398fbd67bd 100644
--- a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
+++ b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
@@ -286,9 +286,13 @@ translate_instruction(llvm::Module *module,
       out = instr->rsq(inputs[0]);
    }
       break;
-   case TGSI_OPCODE_EXP:
+   case TGSI_OPCODE_EXP: {
+      out = instr->exp(inputs[0]);
+   }
       break;
-   case TGSI_OPCODE_LOG:
+   case TGSI_OPCODE_LOG: {
+      out = instr->log(inputs[0]);
+   }
       break;
    case TGSI_OPCODE_MUL: {
       out = instr->mul(inputs[0], inputs[1]);
@@ -338,21 +342,31 @@ translate_instruction(llvm::Module *module,
       out = instr->lerp(inputs[0], inputs[1], inputs[2]);
    }
       break;
-   case TGSI_OPCODE_CND:
+   case TGSI_OPCODE_CND: {
+      out = instr->cnd(inputs[0], inputs[1], inputs[2]);
+   }
       break;
-   case TGSI_OPCODE_CND0:
+   case TGSI_OPCODE_CND0: {
+      out = instr->cnd0(inputs[0], inputs[1], inputs[2]);
+   }
       break;
-   case TGSI_OPCODE_DOT2ADD:
+   case TGSI_OPCODE_DOT2ADD: {
+      out = instr->dot2add(inputs[0], inputs[1], inputs[2]);
+   }
       break;
    case TGSI_OPCODE_INDEX:
       break;
-   case TGSI_OPCODE_NEGATE:
+   case TGSI_OPCODE_NEGATE: {
+      out = instr->neg(inputs[0]);
+   }
       break;
    case TGSI_OPCODE_FRAC: {
       out = instr->frc(inputs[0]);
    }
       break;
-   case TGSI_OPCODE_CLAMP:
+   case TGSI_OPCODE_CLAMP: {
+      out = instr->clamp(inputs[0]);
+   }
       break;
    case TGSI_OPCODE_FLOOR: {
       out = instr->floor(inputs[0]);
@@ -392,9 +406,13 @@ translate_instruction(llvm::Module *module,
       out = instr->cos(inputs[0]);
    }
       break;
-   case TGSI_OPCODE_DDX:
+   case TGSI_OPCODE_DDX: {
+      out = instr->ddx(inputs[0]);
+   }
       break;
-   case TGSI_OPCODE_DDY:
+   case TGSI_OPCODE_DDY: {
+      out = instr->ddy(inputs[0]);
+   }
       break;
    case TGSI_OPCODE_KILP:
       break;
@@ -408,9 +426,13 @@ translate_instruction(llvm::Module *module,
       break;
    case TGSI_OPCODE_RFL:
       break;
-   case TGSI_OPCODE_SEQ:
+   case TGSI_OPCODE_SEQ: {
+      out = instr->seq(inputs[0], inputs[1]);
+   }
       break;
-   case TGSI_OPCODE_SFL:
+   case TGSI_OPCODE_SFL: {
+      out = instr->sfl(inputs[0], inputs[1]);
+   }
       break;
    case TGSI_OPCODE_SGT: {
       out = instr->sgt(inputs[0], inputs[1]);
@@ -420,11 +442,17 @@ translate_instruction(llvm::Module *module,
       out = instr->sin(inputs[0]);
    }
       break;
-   case TGSI_OPCODE_SLE:
+   case TGSI_OPCODE_SLE: {
+      out = instr->sle(inputs[0], inputs[1]);
+   }
       break;
-   case TGSI_OPCODE_SNE:
+   case TGSI_OPCODE_SNE: {
+      out = instr->sne(inputs[0], inputs[1]);
+   }
       break;
-   case TGSI_OPCODE_STR:
+   case TGSI_OPCODE_STR: {
+      out = instr->str(inputs[0], inputs[1]);
+   }
       break;
    case TGSI_OPCODE_TEX:
       break;
@@ -438,7 +466,9 @@ translate_instruction(llvm::Module *module,
       break;
    case TGSI_OPCODE_UP4UB:
       break;
-   case TGSI_OPCODE_X2D:
+   case TGSI_OPCODE_X2D: {
+      out = instr->x2d(inputs[0], inputs[1], inputs[2]);
+   }
       break;
    case TGSI_OPCODE_ARA:
       break;
-- 
cgit v1.2.3


From 0116ea34e1308a233e406a5d26f09217a69a5ed6 Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Sun, 28 Sep 2008 19:48:26 +0200
Subject: Gallivm: more instructions.

---
 src/gallium/auxiliary/gallivm/instructions.cpp | 61 ++++++++++++++++++++++++--
 src/gallium/auxiliary/gallivm/instructions.h   |  5 +++
 src/gallium/auxiliary/gallivm/tgsitollvm.cpp   | 15 ++++---
 3 files changed, 73 insertions(+), 8 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/instructions.cpp b/src/gallium/auxiliary/gallivm/instructions.cpp
index 5fdfe09d18..3eaf9aacf6 100644
--- a/src/gallium/auxiliary/gallivm/instructions.cpp
+++ b/src/gallium/auxiliary/gallivm/instructions.cpp
@@ -163,9 +163,18 @@ void Instructions::cal(int label, llvm::Value *input)
    m_builder.CreateCall(func, params.begin(), params.end());
 }
 
+llvm::Value * Instructions::ceil(llvm::Value *in)
+{
+   std::vector<llvm::Value*> vec = extractVector(in);
+   return vectorFromVals(callCeil(vec[0]), callCeil(vec[1]),
+                         callCeil(vec[2]), callCeil(vec[3]));
+}
+
 llvm::Value * Instructions::clamp(llvm::Value *in1)
 {
-	// FIXME
+   llvm::Value *zero = constVector(0.0f, 0.0f, 0.0f, 0.0f);
+   llvm::Value *one = constVector(1.0f, 1.0f, 1.0f, 1.0f);
+   return min( max(zero, in1), one);
 }
 
 llvm::Value * Instructions::cmp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
@@ -289,12 +298,14 @@ llvm::Value * Instructions::cross(llvm::Value *in1, llvm::Value *in2)
 
 llvm::Value * Instructions::ddx(llvm::Value *in)
 {
-	// FIXME
+   assert(0); // FIXME: ddx is not implemented yet
+   return 0;  // keep the result defined (null) when assert() is compiled out
 }
 
 llvm::Value * Instructions::ddy(llvm::Value *in)
 {
-	// FIXME
+   assert(0); // FIXME: ddy is not implemented yet
+   return 0;  // keep the result defined (null) when assert() is compiled out
 }
 
 llvm::Value * Instructions::div(llvm::Value *in1, llvm::Value *in2)
@@ -319,6 +330,19 @@ llvm::Value * Instructions::dot2add(llvm::Value *in1, llvm::Value *in2, llvm::Va
    return vectorFromVals(dot2add, dot2add, dot2add, dot2add);
 }
 
+llvm::Value * Instructions::dp2(llvm::Value *in1, llvm::Value *in2)
+{
+   Value *mulRes = mul(in1, in2); // product of the two inputs, computed by mul()
+   Value *x = m_builder.CreateExtractElement(mulRes,
+                                                          m_storage->constantInt(0), // element 0 of the product
+                                                          name("extractx"));
+   Value *y = m_builder.CreateExtractElement(mulRes,
+                                                          m_storage->constantInt(1), // element 1 of the product
+                                                          name("extracty"));
+   Value *xy = m_builder.CreateAdd(x, y,name("xy")); // sum of elements 0 and 1
+   return vectorFromVals(xy, xy, xy, xy); // the same sum is placed in all four slots
+}
+
 llvm::Value * Instructions::dp3(llvm::Value *in1, llvm::Value *in2)
 {
    Value *mulRes = mul(in1, in2);
@@ -581,6 +605,12 @@ llvm::Value * Instructions::neg(llvm::Value *in)
    return neg;
 }
 
+llvm::Value * Instructions::nrm(llvm::Value *in)
+{
+   llvm::Value *v = rsq(dp3(in, in)); // 1/sqrt(in.in): scale by the xyz length; NOTE(review): NRM4 callers would need dp4
+   return mul(v, in); // in * (1/|in|)
+}
+
 llvm::Value * Instructions::pow(llvm::Value *in1, llvm::Value *in2)
 {
    Value *x1 = m_builder.CreateExtractElement(in1,
@@ -887,6 +917,31 @@ const char * Instructions::name(const char *prefix)
    return m_name;
 }
 
+llvm::Value * Instructions::callCeil(llvm::Value *val)
+{
+   if (!m_llvmCeil) { // declare ceilf only on first use; NOTE(review): needs m_llvmCeil zeroed beforehand, no initialization visible in this patch - confirm
+      // predeclare the intrinsic
+      std::vector<const Type*> ceilArgs;
+      ceilArgs.push_back(Type::FloatTy);
+      PAListPtr ceilPal;
+      FunctionType* ceilType = FunctionType::get(
+         /*Result=*/Type::FloatTy,
+         /*Params=*/ceilArgs,
+         /*isVarArg=*/false); // resulting signature: float (float)
+      m_llvmCeil = Function::Create( // cached so later calls reuse the same declaration
+         /*Type=*/ceilType,
+         /*Linkage=*/GlobalValue::ExternalLinkage,
+         /*Name=*/"ceilf", m_mod);
+      m_llvmCeil->setCallingConv(CallingConv::C);
+      m_llvmCeil->setParamAttrs(ceilPal);
+   }
+   CallInst *call =  m_builder.CreateCall(m_llvmCeil, val,
+                                          name("ceilf"));
+   call->setCallingConv(CallingConv::C); // same calling convention as set on the declaration above
+   call->setTailCall(false);
+   return call; // the call instruction itself is the returned value
+}
+
 llvm::Value *Instructions::callFAbs(llvm::Value *val)
 {
    if (!m_llvmFAbs) {
diff --git a/src/gallium/auxiliary/gallivm/instructions.h b/src/gallium/auxiliary/gallivm/instructions.h
index 8df30f62c8..c3b28e9746 100644
--- a/src/gallium/auxiliary/gallivm/instructions.h
+++ b/src/gallium/auxiliary/gallivm/instructions.h
@@ -63,6 +63,7 @@ public:
    void         bgnSub(unsigned);
    void         brk();
    void         cal(int label, llvm::Value *input);
+   llvm::Value *ceil(llvm::Value *in);
    llvm::Value *clamp(llvm::Value *in);
    llvm::Value *cmp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3);
    llvm::Value *cnd(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3);
@@ -73,6 +74,7 @@ public:
    llvm::Value *ddy(llvm::Value *in);
    llvm::Value *div(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *dot2add(llvm::Value *in, llvm::Value *in2, llvm::Value *in3);
+   llvm::Value *dp2(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *dp3(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *dp4(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *dph(llvm::Value *in1, llvm::Value *in2);
@@ -99,6 +101,7 @@ public:
    llvm::Value *min(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *mul(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *neg(llvm::Value *in);
+   llvm::Value *nrm(llvm::Value *in);
    llvm::Value *pow(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *rcp(llvm::Value *in);
    llvm::Value *rsq(llvm::Value *in);
@@ -120,6 +123,7 @@ public:
 private:
    const char *name(const char *prefix);
 
+   llvm::Value *callCeil(llvm::Value *val);
    llvm::Value *callFAbs(llvm::Value *val);
    llvm::Value *callFExp(llvm::Value *val);
    llvm::Value *callFLog(llvm::Value *val);
@@ -147,6 +151,7 @@ private:
 
    llvm::VectorType *m_floatVecType;
 
+   llvm::Function   *m_llvmCeil;
    llvm::Function   *m_llvmFSqrt;
    llvm::Function   *m_llvmFAbs;
    llvm::Function   *m_llvmPow;
diff --git a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
index 398fbd67bd..fdfbb76c16 100644
--- a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
+++ b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
@@ -498,11 +498,18 @@ translate_instruction(llvm::Module *module,
       break;
    case TGSI_OPCODE_TXB:
       break;
-   case TGSI_OPCODE_NRM:
+   case TGSI_OPCODE_NRM4:
+   case TGSI_OPCODE_NRM: {
+      out = instr->nrm(inputs[0]);
+   }
       break;
-   case TGSI_OPCODE_DIV:
+   case TGSI_OPCODE_DIV: {
+      out = instr->div(inputs[0], inputs[1]);
+   }
       break;
-   case TGSI_OPCODE_DP2:
+   case TGSI_OPCODE_DP2: {
+      out = instr->dp2(inputs[0], inputs[1]);
+   }
       break;
    case TGSI_OPCODE_TXL:
       break;
@@ -620,8 +627,6 @@ translate_instruction(llvm::Module *module,
       break;
    case TGSI_OPCODE_M3X2:
       break;
-   case TGSI_OPCODE_NRM4:
-      break;
    case TGSI_OPCODE_CALLNZ:
       break;
    case TGSI_OPCODE_IFC:
-- 
cgit v1.2.3


From 9859edc6060c8f4d23a91dbfabd786975e6447a2 Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Sun, 28 Sep 2008 21:45:48 +0200
Subject: Gallivm: need to link with libstdc++ for llvm.

---
 configs/linux-llvm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/linux-llvm b/configs/linux-llvm
index 44e200e856..3b32db34d8 100644
--- a/configs/linux-llvm
+++ b/configs/linux-llvm
@@ -31,4 +31,4 @@ else
   LLVM_CXXFLAGS=
 endif
 
-GL_LIB_DEPS = $(LLVM_LDFLAGS) $(LLVM_LIBS) $(EXTRA_LIB_PATH) -lX11 -lXext -lm -lpthread
+GL_LIB_DEPS = $(LLVM_LDFLAGS) $(LLVM_LIBS) $(EXTRA_LIB_PATH) -lX11 -lXext -lm -lpthread -lstdc++
-- 
cgit v1.2.3


From fdcaf569d446db830a6eafd9c7f7c1b1030c0a93 Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Sun, 28 Sep 2008 23:18:55 +0200
Subject: Gallivm: fix off-by-one.

---
 src/gallium/auxiliary/gallivm/instructionssoa.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.cpp b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
index efddc04e81..9a3ed9f538 100644
--- a/src/gallium/auxiliary/gallivm/instructionssoa.cpp
+++ b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
@@ -259,7 +259,7 @@ void InstructionsSoa::createBuiltins()
 {
    MemoryBuffer *buffer = MemoryBuffer::getMemBuffer(
       (const char*)&soabuiltins_data[0],
-      (const char*)&soabuiltins_data[Elements(soabuiltins_data)]);
+      (const char*)&soabuiltins_data[Elements(soabuiltins_data)-1]);
    m_builtins = ParseBitcodeFile(buffer);
    std::cout<<"Builtins created at "<<m_builtins<<std::endl;
    assert(m_builtins);
-- 
cgit v1.2.3


From 3f4b67f5d715f53fec618ed0e48615f87ff1cfda Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Tue, 30 Sep 2008 20:50:49 +0200
Subject: Gallivm: port to llvm 2.4.

---
 configs/linux-llvm                                 |   1 +
 src/gallium/auxiliary/gallivm/gallivm_builtins.cpp | 252 ++++++++++-----------
 src/gallium/auxiliary/gallivm/instructions.cpp     |  36 +--
 src/gallium/auxiliary/gallivm/instructions.h       |   2 +-
 src/gallium/auxiliary/gallivm/instructionssoa.cpp  |   6 +-
 src/gallium/auxiliary/gallivm/instructionssoa.h    |   2 +-
 6 files changed, 150 insertions(+), 149 deletions(-)

diff --git a/configs/linux-llvm b/configs/linux-llvm
index 3b32db34d8..489cfd0546 100644
--- a/configs/linux-llvm
+++ b/configs/linux-llvm
@@ -31,4 +31,5 @@ else
   LLVM_CXXFLAGS=
 endif
 
+LD = g++
 GL_LIB_DEPS = $(LLVM_LDFLAGS) $(LLVM_LIBS) $(EXTRA_LIB_PATH) -lX11 -lXext -lm -lpthread -lstdc++
diff --git a/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp b/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp
index 0fc5c4ec5c..fcc5c05794 100644
--- a/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp
+++ b/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp
@@ -1,140 +1,140 @@
 static const unsigned char llvm_builtins_data[] = {
-0x42,0x43,0xc0,0xde,0x21,0x0c,0x00,0x00,0x29,0x02,0x00,0x00,0x01,0x10,0x00,0x00,
+0x42,0x43,0xc0,0xde,0x21,0x0c,0x00,0x00,0x27,0x02,0x00,0x00,0x01,0x10,0x00,0x00,
 0x10,0x00,0x00,0x00,0x07,0x81,0x23,0x91,0x41,0xc8,0x04,0x49,0x06,0x10,0x32,0x39,
 0x92,0x01,0x84,0x0c,0x25,0x05,0x08,0x19,0x1e,0x04,0x8b,0x62,0x80,0x14,0x45,0x02,
 0x42,0x92,0x0b,0x42,0xa4,0x10,0x32,0x14,0x38,0x08,0x18,0x49,0x0a,0x32,0x44,0x24,
 0x48,0x0a,0x90,0x21,0x23,0x44,0x72,0x80,0x8c,0x14,0x21,0x86,0x0a,0x8a,0x0a,0x64,
-0x0c,0x1f,0x00,0x00,0x49,0x18,0x00,0x00,0x02,0x00,0x00,0x00,0x0b,0x04,0x00,0x0c,
-0x00,0x00,0x00,0x00,0x51,0x20,0x00,0x00,0x12,0x00,0x00,0x00,0x32,0x22,0x48,0x09,
-0x20,0x65,0x82,0x84,0x00,0x26,0x45,0x48,0x05,0x09,0x26,0x45,0xc6,0x05,0x42,0x52,
-0x26,0x08,0xae,0x19,0x80,0x61,0x04,0x02,0x98,0x23,0x00,0x83,0x29,0x80,0x21,0x00,
-0xb2,0x73,0x04,0x01,0x51,0x8a,0xf4,0x08,0x92,0xa4,0x39,0x47,0x80,0x50,0x2b,0x03,
-0x00,0xa0,0x08,0x21,0x5c,0x46,0x2b,0x44,0x08,0x21,0xd4,0x40,0x14,0x01,0x80,0x11,
-0x80,0x22,0x88,0x00,0x13,0xa2,0x74,0xb0,0x03,0x3c,0xb0,0x83,0x36,0x80,0x87,0x71,
-0x68,0x03,0x76,0x48,0x07,0x77,0xa8,0x07,0x7c,0x68,0x83,0x73,0x70,0x87,0x7a,0xd8,
-0x70,0x0f,0xe5,0xd0,0x06,0xf0,0xa0,0x07,0x73,0x20,0x07,0x7a,0x30,0x07,0x72,0xa0,
-0x07,0x73,0x20,0x07,0x6d,0x90,0x0e,0x71,0xa0,0x07,0x78,0xa0,0x07,0x78,0xd0,0x06,
-0xe9,0x80,0x07,0x7a,0x80,0x07,0x7a,0x80,0x07,0x6d,0x90,0x0e,0x71,0x60,0x07,0x7a,
-0x10,0x07,0x76,0xa0,0x07,0x71,0x60,0x07,0x6d,0x90,0x0e,0x73,0x20,0x07,0x7a,0x30,
-0x07,0x72,0xa0,0x07,0x73,0x20,0x07,0x6d,0x90,0x0e,0x76,0x40,0x07,0x7a,0x30,0x07,
-0x72,0xa0,0x07,0x76,0x40,0x07,0x6d,0x60,0x0e,0x73,0x20,0x07,0x7a,0x30,0x07,0x72,
-0xa0,0x07,0x73,0x20,0x07,0x6d,0x60,0x0e,0x76,0x40,0x07,0x7a,0x30,0x07,0x72,0xa0,
-0x07,0x76,0x40,0x07,0x6d,0x60,0x0f,0x76,0x40,0x07,0x7a,0x60,0x07,0x74,0xa0,0x07,
-0x76,0x40,0x07,0x6d,0x60,0x0f,0x71,0x20,0x07,0x78,0xa0,0x07,0x71,0x20,0x07,0x78,
-0xa0,0x07,0x71,0x20,0x07,0x78,0xd0,0x06,0xe1,0x00,0x07,0x7a,0x00,0x07,0x7a,0x60,
-0x07,0x74,0xd0,0x06,0xe6,0x80,0x07,0x70,0xa0,0x07,0x71,0x20,0x07,0x78,0xa0,0x07,
-0x71,0x20,0x07,0x78,0xa0,0xf3,0x40,0x88,0x04,0x32,0x32,0x02,0x04,0x20,0x76,0x46,
-0xfc,0x6c,0x48,0x92,0x00,0x40,0x00,0x00,0x00,0x00,0x0c,0x49,0x12,0x20,0x00,0x00,
-0x00,0x00,0x80,0x21,0x89,0x02,0x00,0x01,0x00,0x00,0x00,0x30,0x24,0x59,0x00,0x20,
-0x08,0x00,0x00,0x00,0x86,0x24,0x0a,0x00,0x04,0x00,0x00,0x00,0xc0,0x90,0x84,0x01,
-0x02,0x00,0x00,0x00,0x00,0x18,0x92,0x1c,0x40,0x00,0x00,0x00,0x00,0x00,0x43,0x12,
-0x05,0x00,0x02,0x00,0x00,0x00,0x60,0x48,0x72,0x00,0x01,0x00,0x00,0x00,0x00,0x0c,
-0x49,0x14,0x00,0x08,0x00,0x00,0x00,0x80,0x21,0x49,0x01,0x00,0x41,0x00,0x00,0x00,
-0x90,0x05,0x02,0x00,0x10,0x00,0x00,0x00,0x32,0x1e,0x98,0x10,0x19,0x11,0x4c,0x90,
+0x0c,0x1f,0x00,0x00,0x49,0x18,0x00,0x00,0x03,0x00,0x00,0x00,0x0b,0x84,0xff,0xff,
+0xff,0xff,0x1f,0xc0,0x00,0x00,0x00,0x00,0x51,0x20,0x00,0x00,0x12,0x00,0x00,0x00,
+0x32,0x22,0x48,0x09,0x20,0x65,0x82,0x84,0x00,0x26,0x45,0x48,0x05,0x09,0x26,0x45,
+0xc6,0x05,0x42,0x52,0x26,0x08,0xae,0x19,0x80,0x61,0x04,0x02,0x98,0x23,0x00,0x83,
+0x29,0x80,0x21,0x00,0xb2,0x73,0x04,0x01,0x51,0x8a,0xf4,0x08,0x92,0xa4,0x39,0x47,
+0x80,0x50,0x2b,0x03,0x00,0xa0,0x08,0x21,0x5c,0x46,0x2b,0x44,0x08,0x21,0xd4,0x40,
+0x14,0x01,0x80,0x11,0x80,0x22,0x88,0x00,0x13,0x30,0x7c,0xc0,0x03,0x3b,0xf8,0x05,
+0x3b,0xa0,0x83,0x36,0xa8,0x07,0x77,0x58,0x07,0x77,0x78,0x87,0x7b,0x70,0x87,0x36,
+0x60,0x87,0x74,0x70,0x87,0x7a,0xc0,0x87,0x36,0x38,0x07,0x77,0xa8,0x87,0x0d,0xf7,
+0x50,0x0e,0x6d,0x00,0x0f,0x7a,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,0x7a,0x60,
+0x07,0x74,0xd0,0x06,0xe9,0x10,0x07,0x7a,0x80,0x07,0x7a,0x80,0x07,0x6d,0x90,0x0e,
+0x78,0xa0,0x07,0x78,0xa0,0x07,0x78,0xd0,0x06,0xe9,0x10,0x07,0x76,0xa0,0x07,0x71,
+0x60,0x07,0x7a,0x10,0x07,0x76,0xd0,0x06,0xe9,0x30,0x07,0x72,0xa0,0x07,0x73,0x20,
+0x07,0x7a,0x30,0x07,0x72,0xd0,0x06,0xe9,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,
+0x7a,0x60,0x07,0x74,0xd0,0x06,0xe6,0x30,0x07,0x72,0xa0,0x07,0x73,0x20,0x07,0x7a,
+0x30,0x07,0x72,0xd0,0x06,0xe6,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,0x7a,0x60,
+0x07,0x74,0xd0,0x06,0xf6,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,0x7a,0x60,0x07,
+0x74,0xd0,0x06,0xf6,0x10,0x07,0x72,0x80,0x07,0x7a,0x10,0x07,0x72,0x80,0x07,0x7a,
+0x10,0x07,0x72,0x80,0x07,0x6d,0x10,0x0e,0x70,0xa0,0x07,0x70,0xa0,0x07,0x76,0x40,
+0x07,0x6d,0x60,0x0e,0x78,0x00,0x07,0x7a,0x10,0x07,0x72,0x80,0x07,0x7a,0x10,0x07,
+0x72,0x80,0x07,0x3a,0x0f,0x84,0x48,0x20,0x23,0x24,0x40,0x00,0x62,0x67,0x88,0x9f,
+0x19,0x92,0x24,0x00,0x10,0x04,0x00,0x00,0x00,0x43,0x92,0x04,0x08,0x00,0x00,0x00,
+0x00,0x60,0x48,0xa2,0x00,0x40,0x10,0x00,0x00,0x00,0x0c,0x49,0x16,0x00,0x08,0x02,
+0x00,0x00,0x80,0x21,0x89,0x02,0x00,0x41,0x00,0x00,0x00,0x30,0x24,0x61,0x80,0x00,
+0x00,0x00,0x00,0x00,0x86,0x24,0x07,0x10,0x00,0x00,0x00,0x00,0xc0,0x90,0x44,0x01,
+0x80,0x20,0x00,0x00,0x00,0x18,0x92,0x1c,0x40,0x00,0x00,0x00,0x00,0x00,0x43,0x12,
+0x05,0x00,0x82,0x00,0x00,0x00,0x60,0x48,0x52,0x00,0x40,0x10,0x00,0x00,0x00,0x64,
+0x81,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x32,0x1e,0x98,0x10,0x19,0x11,0x4c,0x90,
 0x8c,0x09,0x26,0x47,0xc6,0x04,0x43,0x8a,0x8a,0x59,0x8b,0x43,0x50,0xd2,0x09,0x02,
 0x81,0xd2,0x73,0x50,0xc9,0x0c,0x2a,0x99,0x41,0x25,0x33,0xa8,0x64,0x56,0x28,0x66,
 0x2d,0x0e,0x41,0xcf,0x2a,0x15,0x04,0x4a,0xcf,0x41,0x25,0x33,0xa8,0x64,0x06,0x95,
 0xcc,0xa0,0x92,0x59,0x01,0x00,0x00,0x00,0x53,0x82,0x26,0x0c,0x04,0x00,0x00,0x00,
 0x22,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
 0x04,0xc6,0x08,0x40,0x10,0x04,0xe1,0x70,0x18,0x23,0x00,0x41,0x10,0x84,0xc3,0x60,
-0x04,0x00,0x00,0x00,0x93,0x0c,0xce,0x43,0x4c,0x31,0x3c,0x8e,0x34,0xc9,0x30,0x41,
-0xc2,0x14,0x03,0x34,0x51,0x93,0x0c,0x4d,0x44,0x4c,0x31,0x44,0x8d,0x35,0x56,0x01,
-0x04,0xc3,0x55,0x21,0x16,0x0e,0x04,0x00,0x0f,0x00,0x00,0x00,0x46,0x41,0x08,0xcc,
-0x73,0x9b,0x05,0x21,0x30,0xcf,0x6e,0x18,0x84,0x00,0x2c,0x8b,0x35,0x04,0x80,0x39,
-0x04,0x81,0x5d,0x20,0x80,0x0f,0x0c,0x43,0xe4,0xd3,0x36,0x81,0x04,0x3e,0x30,0x0c,
-0x91,0x4f,0x5b,0x05,0x12,0xf8,0xc0,0x30,0x44,0x7e,0x7d,0x00,0x05,0xd1,0x4c,0x11,
-0x66,0x12,0x83,0xc0,0x3c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x13,0x04,0x43,0x2c,0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
+0x04,0x00,0x00,0x00,0xc3,0x0d,0xce,0x43,0x4c,0x37,0x3c,0x8e,0x34,0xdc,0x30,0x41,
+0xc2,0x74,0x03,0x34,0x51,0xc3,0x0d,0x4d,0x44,0x4c,0x37,0x44,0x8d,0x35,0x56,0x01,
+0x04,0xc3,0x55,0x21,0x16,0x0e,0x04,0x00,0x0f,0x00,0x00,0x00,0xd6,0x10,0x00,0xe6,
+0x10,0x04,0x76,0x81,0x00,0x3e,0x30,0x0c,0x91,0x4f,0x1b,0x05,0x21,0x30,0x8f,0x6d,
+0x13,0x48,0xe0,0x03,0xc3,0x10,0xf9,0xb4,0x55,0x20,0x81,0x0f,0x0c,0x43,0xe4,0xd7,
+0x66,0x41,0x08,0xcc,0xa3,0x1f,0x40,0x41,0x34,0x53,0x84,0x99,0xc4,0x20,0x30,0x8f,
+0x61,0x10,0x02,0xb0,0x2c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
+0x27,0x00,0x00,0x00,0x13,0x04,0x43,0x2c,0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
 0x24,0x8a,0xa0,0x0c,0x46,0x00,0x4a,0x80,0xc2,0x1c,0x84,0x55,0x55,0xd6,0x1c,0x84,
 0x45,0x51,0x16,0x81,0x19,0x80,0x11,0x80,0x31,0x02,0x10,0x04,0x41,0xfc,0x03,0x00,
-0x63,0x08,0x0d,0x34,0xc9,0x70,0x55,0xc2,0x2c,0x43,0x20,0x60,0x73,0x0c,0xd3,0x15,
+0x63,0x08,0x0d,0x34,0xdc,0x70,0x55,0xc2,0x2c,0x43,0x20,0x60,0x73,0x0c,0xd3,0x15,
 0x8d,0x21,0x34,0xd1,0x18,0x42,0xf3,0x8c,0x55,0x00,0x81,0xa0,0x6d,0x73,0x0c,0x19,
-0xe7,0x60,0x87,0x52,0x38,0x10,0x00,0x00,0x13,0x00,0x00,0x00,0x17,0x60,0x20,0xc5,
-0x74,0x10,0x8d,0x65,0x14,0x13,0xf3,0xd4,0xb4,0x6d,0x14,0x13,0xf3,0xd4,0xb8,0x69,
-0x14,0x13,0xf3,0xd4,0xb6,0x75,0x14,0x13,0xf3,0xd4,0xba,0x35,0x0c,0x13,0xf3,0x9c,
-0x80,0xe4,0x36,0x48,0x81,0x10,0xc3,0x4a,0x4c,0x54,0xd4,0x6c,0x8b,0x23,0x28,0x76,
-0x41,0x4c,0xcc,0xa3,0x1b,0x07,0x21,0x00,0xcb,0x72,0x00,0x05,0xd1,0x4c,0x11,0x66,
-0x18,0x83,0xc0,0x3c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
-0x81,0x00,0x00,0x00,0x13,0x04,0x4d,0x2c,0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0x91,0x11,0x00,0x00,0x00,
-0x63,0x08,0x4d,0x64,0x16,0xc1,0x49,0x86,0xab,0x22,0x66,0x19,0x02,0x01,0x1b,0x43,
-0x70,0xa2,0x59,0x82,0x61,0x0c,0xe1,0x89,0x66,0x09,0x86,0x81,0x0a,0x20,0x0b,0x34,
-0x61,0x8e,0x81,0xda,0xa2,0x31,0x84,0x46,0xb2,0x8e,0xe0,0x24,0x83,0x57,0x11,0xb3,
-0x0c,0x44,0xf1,0x8d,0x21,0x38,0xd2,0x2c,0x81,0x31,0x86,0xf0,0x48,0xb3,0x04,0xc6,
-0x40,0x05,0x00,0x06,0x44,0x18,0x14,0x73,0x0c,0x9c,0x18,0x48,0x63,0x08,0xcd,0x64,
-0x64,0x40,0x70,0x92,0xa1,0x0c,0x2a,0x62,0x96,0xe1,0x40,0xcc,0x60,0x0c,0xc1,0x99,
-0x66,0x09,0x92,0x31,0x84,0x67,0x9a,0x25,0x48,0x06,0x2a,0x80,0x33,0x38,0xd0,0x00,
-0x99,0x63,0x18,0x83,0x34,0x98,0xc6,0x10,0x1a,0xc8,0xd6,0x80,0xe0,0x24,0x03,0x1b,
-0x54,0xc4,0x2c,0x83,0xb2,0xb4,0xc1,0x18,0x82,0x03,0xcd,0x12,0x30,0x63,0x08,0x0f,
-0x34,0x4b,0xc0,0x0c,0x54,0x00,0x6e,0xa0,0xbc,0xc1,0x32,0xc7,0xa0,0x06,0x70,0x00,
-0x61,0x1c,0x84,0x03,0x01,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x76,0x52,0x4c,0xcc,
-0x73,0xd3,0x24,0x05,0x64,0xec,0xcd,0x8d,0xcc,0xe5,0x87,0x46,0xc6,0x50,0x8a,0x89,
-0x79,0xee,0xdb,0x54,0x8a,0x89,0x79,0xee,0xdd,0x1a,0x88,0x89,0x79,0x68,0x73,0x20,
-0x26,0xe6,0xa9,0xed,0x81,0x98,0x98,0xc7,0x36,0x0b,0x62,0x62,0x9e,0xdb,0x32,0x88,
-0x89,0x79,0x72,0xd3,0x20,0x26,0xe6,0xd9,0x8d,0x83,0x98,0x98,0xa7,0xb7,0x95,0x62,
-0x62,0x9e,0xbb,0x27,0x2d,0x20,0x63,0x6f,0x6e,0x64,0x2e,0x3a,0x34,0x35,0x56,0x62,
-0x08,0x4e,0x53,0xd9,0xba,0xb5,0x14,0x02,0xf3,0xe0,0xf5,0x25,0x2c,0x82,0xd3,0x0c,
-0xbe,0xe0,0x34,0xd3,0x8d,0x9b,0x88,0x21,0x38,0xcd,0x60,0xd7,0x24,0x01,0x63,0xec,
-0xcd,0x8d,0xcc,0x45,0x87,0x44,0x80,0x8c,0xbd,0xb9,0x91,0xb9,0xfc,0xc4,0xd0,0x90,
-0x02,0x8c,0xb1,0x37,0x37,0x32,0x97,0x1f,0x73,0x29,0x26,0xe6,0xc1,0x71,0x7b,0x29,
-0x26,0xe6,0xc1,0x77,0xfb,0x28,0x04,0xe6,0xa9,0x6f,0x52,0x01,0x32,0xf6,0xe6,0x46,
-0xe6,0xa2,0x13,0x73,0x63,0x18,0x83,0xc0,0x3c,0xb6,0x41,0x08,0x4e,0x33,0x58,0x47,
-0x31,0x31,0x4f,0x5d,0x1f,0xc3,0x22,0x38,0xcd,0xe0,0x0b,0x4e,0x33,0xe1,0xbc,0xa5,
-0x18,0x82,0xd3,0x0c,0x77,0x6e,0x20,0xc5,0xc4,0x3c,0xb5,0x4e,0x3a,0x40,0xc6,0xde,
-0xdc,0xc8,0x5c,0x7e,0x64,0x70,0x2c,0xa4,0x98,0x98,0xa7,0xee,0xed,0x82,0x10,0x9c,
-0xa6,0xba,0x81,0x44,0x70,0x9a,0xc1,0x17,0x9c,0x66,0x32,0x93,0x42,0x60,0x1e,0x7b,
-0xb7,0x98,0x62,0x62,0x9e,0xbc,0x36,0x16,0x43,0x70,0x9a,0x0a,0xa7,0x6d,0xa4,0x98,
-0x98,0xc7,0xbe,0x8d,0xa4,0x98,0x98,0xc7,0xce,0x0d,0xc6,0x10,0x9c,0x66,0xc0,0x7b,
-0x12,0x02,0x32,0xf6,0xe6,0x46,0xe6,0xa2,0x33,0x13,0x73,0x06,0x8b,0xe0,0x34,0x83,
-0x2f,0x38,0xcd,0x64,0xd3,0xe6,0x61,0x08,0x4e,0x53,0xd5,0xf6,0x01,0x14,0x44,0x33,
-0x45,0x18,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x24,0xca,0x60,0x04,
-0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0xb9,0x61,0x0c,0x04,0x10,0x1e,0xe1,0x19,0xc6,
-0x40,0x02,0xe1,0x11,0x1e,0x00,0x00,0x00,0x63,0x08,0xcd,0x63,0x15,0xc1,0x31,0x84,
-0x06,0xb2,0x8b,0xe0,0x18,0x42,0x13,0x59,0x46,0x70,0x0c,0xa1,0x71,0x6c,0x23,0x38,
-0x16,0x02,0x04,0xc7,0x64,0x61,0x1a,0x37,0x16,0x01,0x04,0x48,0x35,0xc7,0x20,0x79,
-0xcf,0x58,0x04,0x10,0x20,0xd5,0x1c,0xc3,0x07,0x06,0xd0,0x58,0x04,0x10,0x20,0xd5,
-0x1c,0x43,0x18,0x88,0x41,0x34,0x16,0x01,0x04,0x48,0x35,0xc7,0x30,0x06,0x64,0xe0,
-0x98,0x47,0xd0,0xc0,0x80,0xa0,0x89,0x01,0x41,0x23,0x03,0x82,0x63,0x21,0x40,0x70,
-0x50,0x66,0x70,0x06,0x68,0x90,0x06,0x58,0x06,0xe1,0x40,0x00,0x25,0x00,0x00,0x00,
-0x56,0x52,0x4c,0xcc,0x73,0xd3,0x56,0x41,0x4c,0xcc,0x53,0xdb,0x05,0x31,0x31,0xcf,
-0x6d,0x19,0xc4,0xc4,0x3c,0xba,0x6d,0x10,0x13,0xf3,0xf4,0xd6,0x41,0x08,0xc0,0xb2,
-0x18,0x46,0x21,0x38,0x4d,0x85,0x9b,0x46,0x21,0x38,0x4d,0xb5,0x9b,0x8a,0x21,0x00,
-0xcb,0x82,0xdf,0x66,0x62,0x08,0x4e,0x53,0xdd,0xb7,0x9d,0x18,0x82,0xd3,0x54,0xb7,
-0x6e,0x28,0x86,0xe0,0x34,0xd5,0xdd,0xdb,0x47,0x31,0x31,0x4f,0x9d,0x9b,0x87,0x21,
-0x00,0xcb,0x52,0xdf,0x06,0x62,0x08,0xc0,0xb2,0xd4,0xbc,0x59,0x10,0x82,0xd3,0x54,
-0x96,0x62,0x08,0x4e,0x53,0xe1,0xb6,0x85,0x14,0x13,0xf3,0xd8,0xb4,0x8d,0x14,0x13,
-0xf3,0xd8,0xb9,0x89,0x18,0x02,0xb0,0x2c,0xf6,0x6d,0x24,0x86,0x00,0x2c,0x8b,0xcd,
-0x1b,0x87,0x21,0x38,0x4d,0x55,0xd3,0xd6,0x30,0x54,0xc0,0x72,0x00,0x05,0xd1,0x4c,
-0x11,0x06,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x19,0x00,0x00,0x00,
-0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x24,0x4a,0x60,0x04,
-0x80,0xc2,0x0c,0x00,0x00,0x00,0x00,0x00,0x63,0x08,0xcd,0x33,0x16,0x01,0x04,0x48,
-0x34,0xc7,0x00,0x49,0xcf,0x58,0x04,0x10,0x28,0xd1,0x1c,0xc3,0x44,0x39,0x58,0x85,
-0x03,0x01,0x00,0x00,0x0a,0x00,0x00,0x00,0x16,0x41,0x4c,0xcc,0x63,0xdb,0x04,0x31,
-0x31,0x4f,0x6e,0x0d,0x43,0x05,0x2c,0x07,0x50,0x10,0xcd,0x14,0x61,0x56,0x41,0x4c,
-0xcc,0xd3,0x1b,0x45,0x21,0x00,0xcb,0xb2,0x9b,0x04,0x21,0x00,0xcb,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x1b,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,
-0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,
-0xc2,0x0c,0x00,0x00,0x63,0x08,0xcd,0x33,0x16,0x01,0x04,0xca,0x34,0xc7,0x20,0x51,
-0xcf,0x1c,0x43,0x45,0x41,0x73,0x0c,0x16,0x15,0xcd,0x31,0x5c,0x94,0x83,0x58,0x38,
-0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x86,0x51,0x4c,0xcc,0x53,0xe7,0x76,0x51,
-0x4c,0xcc,0x53,0xdb,0x36,0x41,0x4c,0xcc,0x63,0x5b,0x05,0x31,0x31,0x8f,0x6e,0x0d,
-0x43,0x05,0x2c,0x66,0x41,0x4c,0xcc,0xd3,0x1f,0x40,0x41,0x34,0x53,0x84,0x19,0x05,
-0x21,0x00,0xcb,0x02,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x13,0x04,0x45,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x24,0xca,0xa0,0x04,
-0x46,0x00,0x8a,0x80,0xc0,0x08,0x00,0x00,0x63,0x08,0x0d,0x34,0xc9,0x30,0x49,0xc4,
-0x2c,0x03,0x11,0x50,0x63,0x08,0xcd,0x33,0xc9,0x50,0x49,0xc4,0x2c,0x03,0x21,0x58,
-0x63,0x08,0x4d,0x34,0xc9,0x70,0x49,0xc4,0x2c,0x03,0x31,0x60,0x63,0x08,0x8d,0x33,
-0xc9,0x90,0x49,0x84,0x69,0x22,0x70,0xc3,0x27,0x1c,0x08,0x00,0x1a,0x00,0x00,0x00,
-0x96,0x51,0x4c,0xcc,0x53,0xdf,0x66,0x41,0x08,0xcc,0x83,0xdb,0x04,0x31,0x31,0x4f,
-0x6d,0x15,0xc4,0xc4,0x3c,0xb7,0x61,0x10,0x02,0xf3,0xf0,0x47,0x20,0xb9,0x0d,0x52,
-0x20,0xc4,0xb0,0x12,0x13,0x15,0x35,0xdb,0xe2,0x08,0x8a,0x5d,0x10,0x13,0xf3,0xec,
-0x37,0x90,0x2c,0x4e,0xf4,0x47,0x87,0x54,0xd7,0x17,0x70,0x2c,0x4e,0xf4,0x47,0x87,
-0x74,0x02,0xc8,0xe2,0x44,0x7f,0x74,0x48,0xb9,0x69,0x14,0x02,0xf3,0xd4,0xb8,0x6d,
-0x18,0x11,0x31,0x55,0xc0,0x62,0x0d,0x43,0x05,0x2c,0x07,0x50,0x10,0xcd,0x14,0x61,
-0x46,0x31,0x08,0xcc,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x71,0x20,0x00,0x00,
-0x12,0x00,0x00,0x00,0x66,0x40,0x54,0x82,0x23,0x59,0xc2,0x20,0x09,0x92,0x1d,0x18,
-0x4f,0x84,0x34,0x53,0x61,0x03,0xc4,0xe3,0x58,0x85,0x05,0x14,0xbe,0x34,0x45,0xb5,
-0x21,0x10,0x82,0x23,0x15,0x46,0x30,0x2c,0xc8,0x64,0x02,0x06,0xf0,0x3c,0x91,0x73,
-0x19,0x00,0xe1,0x4b,0x53,0x64,0x0a,0x84,0x84,0x34,0x85,0x31,0x10,0x0a,0xb2,0x3c,
-0x56,0x30,0x08,0xcc,0x63,0x0b,0x44,0x25,0x21,0x0d,0x00,0x00,0x00,0x00,0x00,0x00};
+0xe7,0x60,0x87,0x52,0x38,0x10,0x00,0x00,0x10,0x00,0x00,0x00,0x27,0x50,0x20,0x05,
+0xd1,0x0c,0x17,0x60,0x20,0xc5,0x74,0x10,0x8d,0x65,0x14,0x13,0xf3,0xd4,0xb4,0x6d,
+0x14,0x13,0xf3,0xd4,0xb8,0x69,0x14,0x13,0xf3,0xd4,0xb6,0x75,0x14,0x13,0xf3,0xd4,
+0xba,0x35,0x0c,0x13,0xf3,0xd8,0x05,0x31,0x31,0x8f,0x6e,0x1c,0x84,0x00,0x2c,0xcb,
+0x01,0x14,0x44,0x33,0x45,0x98,0x61,0x0c,0x02,0xf3,0x00,0x00,0x00,0x00,0x00,0x00,
+0x61,0x20,0x00,0x00,0x81,0x00,0x00,0x00,0x13,0x04,0x4d,0x2c,0x10,0x00,0x00,0x00,
+0x04,0x00,0x00,0x00,0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0x91,
+0x11,0x00,0x00,0x00,0x63,0x08,0x4d,0x64,0x16,0xc1,0xe1,0x86,0xab,0x22,0x66,0x19,
+0x02,0x01,0x1b,0x43,0x70,0xa2,0x59,0x82,0x61,0x0c,0xe1,0x89,0x66,0x09,0x86,0x81,
+0x0a,0x20,0x0b,0x34,0x61,0x8e,0x81,0xda,0xa2,0x31,0x84,0x46,0xb2,0x8e,0xe0,0x70,
+0x83,0x57,0x11,0xb3,0x0c,0x44,0xf1,0x8d,0x21,0x38,0xd2,0x2c,0x81,0x31,0x86,0xf0,
+0x48,0xb3,0x04,0xc6,0x40,0x05,0x00,0x06,0x44,0x18,0x14,0x73,0x0c,0x9c,0x18,0x48,
+0x63,0x08,0xcd,0x64,0x64,0x40,0x70,0xb8,0xa1,0x0c,0x2a,0x62,0x96,0xe1,0x40,0xcc,
+0x60,0x0c,0xc1,0x99,0x66,0x09,0x92,0x31,0x84,0x67,0x9a,0x25,0x48,0x06,0x2a,0x80,
+0x33,0x38,0xd0,0x00,0x99,0x63,0x18,0x83,0x34,0x98,0xc6,0x10,0x1a,0xc8,0xd6,0x80,
+0xe0,0x70,0x03,0x1b,0x54,0xc4,0x2c,0x83,0xb2,0xb4,0xc1,0x18,0x82,0x03,0xcd,0x12,
+0x30,0x63,0x08,0x0f,0x34,0x4b,0xc0,0x0c,0x54,0x00,0x6e,0xa0,0xbc,0xc1,0x32,0xc7,
+0xa0,0x06,0x70,0x00,0x61,0x1c,0x84,0x03,0x01,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
+0x76,0x52,0x4c,0xcc,0x73,0xd3,0x24,0x05,0x64,0xec,0xcd,0x8d,0xcc,0xe5,0x87,0x46,
+0xc6,0x50,0x8a,0x89,0x79,0xee,0xdb,0x54,0x8a,0x89,0x79,0xee,0xdd,0x1a,0x88,0x89,
+0x79,0x68,0x73,0x20,0x26,0xe6,0xa9,0xed,0x81,0x98,0x98,0xc7,0x36,0x0b,0x62,0x62,
+0x9e,0xdb,0x32,0x88,0x89,0x79,0x72,0xd3,0x20,0x26,0xe6,0xd9,0x8d,0x83,0x98,0x98,
+0xa7,0xb7,0x95,0x62,0x62,0x9e,0xbb,0x27,0x2d,0x20,0x63,0x6f,0x6e,0x64,0x2e,0x3a,
+0x34,0x35,0x56,0x62,0x08,0x4e,0x53,0xd9,0xba,0xb5,0x14,0x02,0xf3,0xe0,0xf5,0x25,
+0x2c,0x82,0xd3,0x0c,0xbe,0xe0,0x34,0xd3,0x8d,0x9b,0x88,0x21,0x38,0xcd,0x60,0xd7,
+0x24,0x01,0x63,0xec,0xcd,0x8d,0xcc,0x45,0x87,0x44,0x80,0x8c,0xbd,0xb9,0x91,0xb9,
+0xfc,0xc4,0xd0,0x90,0x02,0x8c,0xb1,0x37,0x37,0x32,0x97,0x1f,0x73,0x29,0x26,0xe6,
+0xc1,0x71,0x7b,0x29,0x26,0xe6,0xc1,0x77,0xfb,0x28,0x04,0xe6,0xa9,0x6f,0x52,0x01,
+0x32,0xf6,0xe6,0x46,0xe6,0xa2,0x13,0x73,0x63,0x18,0x83,0xc0,0x3c,0xb6,0x41,0x08,
+0x4e,0x33,0x58,0x47,0x31,0x31,0x4f,0x5d,0x1f,0xc3,0x22,0x38,0xcd,0xe0,0x0b,0x4e,
+0x33,0xe1,0xbc,0xa5,0x18,0x82,0xd3,0x0c,0x77,0x6e,0x20,0xc5,0xc4,0x3c,0xb5,0x4e,
+0x3a,0x40,0xc6,0xde,0xdc,0xc8,0x5c,0x7e,0x64,0x70,0x2c,0xa4,0x98,0x98,0xa7,0xee,
+0x6f,0x20,0x11,0x9c,0x66,0xf0,0x05,0xa7,0x99,0xec,0x82,0x10,0x9c,0xa6,0x32,0x93,
+0x42,0x60,0x1e,0x7b,0xb7,0x98,0x62,0x62,0x9e,0xbc,0x36,0x16,0x43,0x70,0x9a,0x0a,
+0xa7,0x6d,0xa4,0x98,0x98,0xc7,0xbe,0x8d,0xa4,0x98,0x98,0xc7,0xce,0x0d,0xc6,0x10,
+0x9c,0x66,0xc0,0x7b,0x12,0x02,0x32,0xf6,0xe6,0x46,0xe6,0xa2,0x33,0x13,0x73,0x06,
+0x8b,0xe0,0x34,0x83,0x2f,0x38,0xcd,0x64,0xd3,0x07,0x50,0x10,0xcd,0x14,0x61,0xe6,
+0x61,0x08,0x4e,0x53,0xd5,0x36,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
+0x4a,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
+0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0xb9,0x61,0x0c,0x04,0x10,
+0x1e,0xe1,0x19,0xc6,0x40,0x02,0xe1,0x11,0x1e,0x00,0x00,0x00,0x63,0x08,0xcd,0x63,
+0x15,0xc1,0x31,0x84,0x06,0xb2,0x8b,0xe0,0x18,0x42,0x13,0x59,0x46,0x70,0x0c,0xa1,
+0x71,0x6c,0x23,0x38,0x16,0x02,0x04,0xc7,0x64,0x61,0x1a,0x37,0x16,0x01,0x04,0x48,
+0x35,0xc7,0x20,0x79,0xcf,0x58,0x04,0x10,0x20,0xd5,0x1c,0xc3,0x07,0x06,0xd0,0x58,
+0x04,0x10,0x20,0xd5,0x1c,0x43,0x18,0x88,0x41,0x34,0x16,0x01,0x04,0x48,0x35,0xc7,
+0x30,0x06,0x64,0xe0,0x98,0x47,0xd0,0xc0,0x80,0xa0,0x89,0x01,0x41,0x23,0x03,0x82,
+0x63,0x21,0x40,0x70,0x50,0x66,0x70,0x06,0x68,0x90,0x06,0x58,0x06,0xe1,0x40,0x00,
+0x25,0x00,0x00,0x00,0x56,0x52,0x4c,0xcc,0x73,0xd3,0x56,0x41,0x4c,0xcc,0x53,0xdb,
+0x05,0x31,0x31,0xcf,0x6d,0x19,0xc4,0xc4,0x3c,0xba,0x6d,0x10,0x13,0xf3,0xf4,0xd6,
+0x41,0x08,0xc0,0xb2,0x18,0x46,0x21,0x38,0x4d,0x85,0x9b,0x46,0x21,0x38,0x4d,0xb5,
+0x9b,0x8a,0x21,0x00,0xcb,0x82,0xdf,0x66,0x62,0x08,0x4e,0x53,0xdd,0xb7,0x9d,0x18,
+0x82,0xd3,0x54,0xb7,0x6e,0x28,0x86,0xe0,0x34,0xd5,0xdd,0xdb,0x47,0x31,0x31,0x4f,
+0x9d,0x9b,0x87,0x21,0x00,0xcb,0x52,0xdf,0x06,0x62,0x08,0xc0,0xb2,0xd4,0xbc,0x59,
+0x10,0x82,0xd3,0x54,0x96,0x62,0x08,0x4e,0x53,0xe1,0xb6,0x85,0x14,0x13,0xf3,0xd8,
+0xb4,0x8d,0x14,0x13,0xf3,0xd8,0xb9,0x89,0x18,0x02,0xb0,0x2c,0xf6,0x6d,0x24,0x86,
+0x00,0x2c,0x8b,0xcd,0x1b,0x87,0x21,0x38,0x4d,0x55,0xd3,0xd6,0x30,0x54,0xc0,0x72,
+0x00,0x05,0xd1,0x4c,0x11,0x06,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
+0x19,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
+0x24,0x4a,0x60,0x04,0x80,0xc2,0x0c,0x00,0x00,0x00,0x00,0x00,0x63,0x08,0xcd,0x33,
+0x16,0x01,0x04,0x48,0x34,0xc7,0x00,0x49,0xcf,0x58,0x04,0x10,0x28,0xd1,0x1c,0xc3,
+0x44,0x39,0x58,0x85,0x03,0x01,0x00,0x00,0x0a,0x00,0x00,0x00,0x26,0x41,0x08,0xc0,
+0xb2,0x18,0x45,0x21,0x00,0xcb,0xb2,0x5b,0x04,0x31,0x31,0x8f,0x6d,0x13,0xc4,0xc4,
+0x3c,0xb9,0x35,0x0c,0x15,0xb0,0x58,0x05,0x31,0x31,0x4f,0x7f,0x00,0x05,0xd1,0x4c,
+0x11,0x06,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x1b,0x00,0x00,0x00,
+0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x24,0xca,0x60,0x04,
+0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0x00,0x63,0x08,0xcd,0x33,0x16,0x01,0x04,0xca,
+0x34,0xc7,0x20,0x51,0xcf,0x1c,0x43,0x45,0x41,0x73,0x0c,0x16,0x15,0xcd,0x31,0x5c,
+0x94,0x83,0x58,0x38,0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x76,0x51,0x4c,0xcc,
+0x53,0xdb,0x86,0x51,0x4c,0xcc,0x53,0xe7,0x36,0x41,0x4c,0xcc,0x63,0x5b,0x05,0x31,
+0x31,0x8f,0x6e,0x16,0xc4,0xc4,0x3c,0xbd,0x51,0x10,0x02,0xb0,0x2c,0xd6,0x30,0x54,
+0xc0,0x72,0x00,0x05,0xd1,0x4c,0x11,0x06,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
+0x2c,0x00,0x00,0x00,0x13,0x04,0x45,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
+0x24,0xca,0xa0,0x04,0x46,0x00,0x8a,0x80,0xc0,0x08,0x00,0x00,0x63,0x08,0x0d,0x34,
+0xdc,0x30,0x49,0xc4,0x2c,0x03,0x11,0x50,0x63,0x08,0xcd,0x33,0xdc,0x50,0x49,0xc4,
+0x2c,0x03,0x21,0x58,0x63,0x08,0x4d,0x34,0xdc,0x70,0x49,0xc4,0x2c,0x03,0x31,0x60,
+0x63,0x08,0x8d,0x33,0xdc,0x90,0x49,0x84,0x69,0x22,0x70,0xc3,0x27,0x1c,0x08,0x00,
+0x17,0x00,0x00,0x00,0x96,0x51,0x4c,0xcc,0x53,0xdf,0x66,0x41,0x08,0xcc,0x83,0xdb,
+0x04,0x31,0x31,0x4f,0x6d,0x15,0xc4,0xc4,0x3c,0xb7,0x61,0x10,0x02,0xf3,0xf0,0x76,
+0x41,0x4c,0xcc,0xb3,0x1f,0x81,0x11,0x11,0x13,0x15,0x35,0x37,0x90,0x2c,0x4e,0xf4,
+0x47,0x87,0x54,0xd7,0x17,0x70,0x2c,0x4e,0xf4,0x47,0x87,0x74,0x02,0xc8,0xe2,0x44,
+0x7f,0x74,0x48,0xb9,0x69,0x14,0x02,0xf3,0xd4,0xb8,0x6d,0x18,0x11,0x31,0x55,0xc0,
+0x62,0x0d,0x43,0x05,0x2c,0x07,0x50,0x10,0xcd,0x14,0x61,0x46,0x31,0x08,0xcc,0x03,
+0x00,0x00,0x00,0x00,0x71,0x20,0x00,0x00,0x12,0x00,0x00,0x00,0x66,0x40,0x54,0x82,
+0x23,0x19,0xc3,0xa0,0x20,0x8b,0x1d,0x18,0x4f,0x84,0x34,0x53,0x61,0x03,0xc4,0xe3,
+0x58,0x85,0x05,0x14,0xbe,0x34,0x45,0xb5,0x21,0x10,0x82,0x23,0x15,0x46,0x30,0x2c,
+0xc8,0x64,0x02,0x06,0xf0,0x3c,0x91,0x73,0x19,0x00,0xe1,0x4b,0x53,0x64,0x0a,0x84,
+0x84,0x34,0x85,0x25,0x0c,0x92,0x20,0x59,0xc1,0x20,0x30,0x8f,0x2d,0x10,0x95,0x84,
+0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00};
diff --git a/src/gallium/auxiliary/gallivm/instructions.cpp b/src/gallium/auxiliary/gallivm/instructions.cpp
index 3eaf9aacf6..599975d5ad 100644
--- a/src/gallium/auxiliary/gallivm/instructions.cpp
+++ b/src/gallium/auxiliary/gallivm/instructions.cpp
@@ -923,7 +923,7 @@ llvm::Value * Instructions::callCeil(llvm::Value *val)
       // predeclare the intrinsic
       std::vector<const Type*> ceilArgs;
       ceilArgs.push_back(Type::FloatTy);
-      PAListPtr ceilPal;
+      AttrListPtr ceilPal;
       FunctionType* ceilType = FunctionType::get(
          /*Result=*/Type::FloatTy,
          /*Params=*/ceilArgs,
@@ -933,7 +933,7 @@ llvm::Value * Instructions::callCeil(llvm::Value *val)
          /*Linkage=*/GlobalValue::ExternalLinkage,
          /*Name=*/"ceilf", m_mod);
       m_llvmCeil->setCallingConv(CallingConv::C);
-      m_llvmCeil->setParamAttrs(ceilPal);
+      m_llvmCeil->setAttributes(ceilPal);
    }
    CallInst *call =  m_builder.CreateCall(m_llvmCeil, val,
                                           name("ceilf"));
@@ -948,7 +948,7 @@ llvm::Value *Instructions::callFAbs(llvm::Value *val)
       // predeclare the intrinsic
       std::vector<const Type*> fabsArgs;
       fabsArgs.push_back(Type::FloatTy);
-      PAListPtr fabsPal;
+      AttrListPtr fabsPal;
       FunctionType* fabsType = FunctionType::get(
          /*Result=*/Type::FloatTy,
          /*Params=*/fabsArgs,
@@ -958,7 +958,7 @@ llvm::Value *Instructions::callFAbs(llvm::Value *val)
          /*Linkage=*/GlobalValue::ExternalLinkage,
          /*Name=*/"fabs", m_mod);
       m_llvmFAbs->setCallingConv(CallingConv::C);
-      m_llvmFAbs->setParamAttrs(fabsPal);
+      m_llvmFAbs->setAttributes(fabsPal);
    }
    CallInst *call = m_builder.CreateCall(m_llvmFAbs, val,
                                          name("fabs"));
@@ -973,7 +973,7 @@ llvm::Value * Instructions::callFExp(llvm::Value *val)
       // predeclare the intrinsic
       std::vector<const Type*> fexpArgs;
       fexpArgs.push_back(Type::FloatTy);
-      PAListPtr fexpPal;
+      AttrListPtr fexpPal;
       FunctionType* fexpType = FunctionType::get(
          /*Result=*/Type::FloatTy,
          /*Params=*/fexpArgs,
@@ -983,7 +983,7 @@ llvm::Value * Instructions::callFExp(llvm::Value *val)
          /*Linkage=*/GlobalValue::ExternalLinkage,
          /*Name=*/"expf", m_mod);
       m_llvmFexp->setCallingConv(CallingConv::C);
-      m_llvmFexp->setParamAttrs(fexpPal);
+      m_llvmFexp->setAttributes(fexpPal);
    }
    CallInst *call = m_builder.CreateCall(m_llvmFexp, val,
                                          name("expf"));
@@ -998,7 +998,7 @@ llvm::Value * Instructions::callFLog(llvm::Value *val)
       // predeclare the intrinsic
       std::vector<const Type*> flogArgs;
       flogArgs.push_back(Type::FloatTy);
-      PAListPtr flogPal;
+      AttrListPtr flogPal;
       FunctionType* flogType = FunctionType::get(
          /*Result=*/Type::FloatTy,
          /*Params=*/flogArgs,
@@ -1008,7 +1008,7 @@ llvm::Value * Instructions::callFLog(llvm::Value *val)
          /*Linkage=*/GlobalValue::ExternalLinkage,
          /*Name=*/"logf", m_mod);
       m_llvmFlog->setCallingConv(CallingConv::C);
-      m_llvmFlog->setParamAttrs(flogPal);
+      m_llvmFlog->setAttributes(flogPal);
    }
    CallInst *call = m_builder.CreateCall(m_llvmFlog, val,
                                          name("logf"));
@@ -1023,7 +1023,7 @@ llvm::Value * Instructions::callFloor(llvm::Value *val)
       // predeclare the intrinsic
       std::vector<const Type*> floorArgs;
       floorArgs.push_back(Type::FloatTy);
-      PAListPtr floorPal;
+      AttrListPtr floorPal;
       FunctionType* floorType = FunctionType::get(
          /*Result=*/Type::FloatTy,
          /*Params=*/floorArgs,
@@ -1033,7 +1033,7 @@ llvm::Value * Instructions::callFloor(llvm::Value *val)
          /*Linkage=*/GlobalValue::ExternalLinkage,
          /*Name=*/"floorf", m_mod);
       m_llvmFloor->setCallingConv(CallingConv::C);
-      m_llvmFloor->setParamAttrs(floorPal);
+      m_llvmFloor->setAttributes(floorPal);
    }
    CallInst *call =  m_builder.CreateCall(m_llvmFloor, val,
                                           name("floorf"));
@@ -1048,7 +1048,7 @@ llvm::Value *Instructions::callFSqrt(llvm::Value *val)
       // predeclare the intrinsic
       std::vector<const Type*> fsqrtArgs;
       fsqrtArgs.push_back(Type::FloatTy);
-      PAListPtr fsqrtPal;
+      AttrListPtr fsqrtPal;
       FunctionType* fsqrtType = FunctionType::get(
          /*Result=*/Type::FloatTy,
          /*Params=*/fsqrtArgs,
@@ -1058,7 +1058,7 @@ llvm::Value *Instructions::callFSqrt(llvm::Value *val)
          /*Linkage=*/GlobalValue::ExternalLinkage,
          /*Name=*/"llvm.sqrt.f32", m_mod);
       m_llvmFSqrt->setCallingConv(CallingConv::C);
-      m_llvmFSqrt->setParamAttrs(fsqrtPal);
+      m_llvmFSqrt->setAttributes(fsqrtPal);
    }
    CallInst *call = m_builder.CreateCall(m_llvmFSqrt, val,
                                          name("sqrt"));
@@ -1074,7 +1074,7 @@ llvm::Value * Instructions::callPow(llvm::Value *val1, llvm::Value *val2)
       std::vector<const Type*> powArgs;
       powArgs.push_back(Type::FloatTy);
       powArgs.push_back(Type::FloatTy);
-      PAListPtr powPal;
+      AttrListPtr powPal;
       FunctionType* powType = FunctionType::get(
          /*Result=*/Type::FloatTy,
          /*Params=*/powArgs,
@@ -1084,7 +1084,7 @@ llvm::Value * Instructions::callPow(llvm::Value *val1, llvm::Value *val2)
          /*Linkage=*/GlobalValue::ExternalLinkage,
          /*Name=*/"llvm.pow.f32", m_mod);
       m_llvmPow->setCallingConv(CallingConv::C);
-      m_llvmPow->setParamAttrs(powPal);
+      m_llvmPow->setAttributes(powPal);
    }
    std::vector<Value*> params;
    params.push_back(val1);
@@ -1126,7 +1126,7 @@ llvm::Value * Instructions::constVector(float x, float y, float z, float w)
 llvm::Function * Instructions::declarePrintf()
 {
    std::vector<const Type*> args;
-   PAListPtr params;
+   AttrListPtr params;
    FunctionType* funcTy = FunctionType::get(
       /*Result=*/IntegerType::get(32),
       /*Params=*/args,
@@ -1136,7 +1136,7 @@ llvm::Function * Instructions::declarePrintf()
       /*Linkage=*/GlobalValue::ExternalLinkage,
       /*Name=*/"printf", m_mod);
    func_printf->setCallingConv(CallingConv::C);
-   func_printf->setParamAttrs(params);
+   func_printf->setAttributes(params);
    return func_printf;
 }
 
@@ -1148,7 +1148,7 @@ llvm::Function * Instructions::declareFunc(int label)
    args.push_back(vecPtr);
    args.push_back(vecPtr);
    args.push_back(vecPtr);
-   PAListPtr params;
+   AttrListPtr params;
    FunctionType *funcType = FunctionType::get(
       /*Result=*/Type::VoidTy,
       /*Params=*/args,
@@ -1159,7 +1159,7 @@ llvm::Function * Instructions::declareFunc(int label)
       /*Linkage=*/GlobalValue::ExternalLinkage,
       /*Name=*/name.c_str(), m_mod);
    func->setCallingConv(CallingConv::C);
-   func->setParamAttrs(params);
+   func->setAttributes(params);
    return func;
 }
 
diff --git a/src/gallium/auxiliary/gallivm/instructions.h b/src/gallium/auxiliary/gallivm/instructions.h
index c3b28e9746..e18571251e 100644
--- a/src/gallium/auxiliary/gallivm/instructions.h
+++ b/src/gallium/auxiliary/gallivm/instructions.h
@@ -146,7 +146,7 @@ private:
    llvm::Module             *m_mod;
    llvm::Function           *m_func;
    char                      m_name[32];
-   llvm::IRBuilder           m_builder;
+   llvm::IRBuilder<>         m_builder;
    int                       m_idx;
 
    llvm::VectorType *m_floatVecType;
diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.cpp b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
index 9a3ed9f538..5863f37095 100644
--- a/src/gallium/auxiliary/gallivm/instructionssoa.cpp
+++ b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
@@ -259,7 +259,7 @@ void InstructionsSoa::createBuiltins()
 {
    MemoryBuffer *buffer = MemoryBuffer::getMemBuffer(
       (const char*)&soabuiltins_data[0],
-      (const char*)&soabuiltins_data[Elements(soabuiltins_data)-1]);
+      (const char*)&soabuiltins_data[Elements(soabuiltins_data)]);
    m_builtins = ParseBitcodeFile(buffer);
    std::cout<<"Builtins created at "<<m_builtins<<std::endl;
    assert(m_builtins);
@@ -458,8 +458,8 @@ void InstructionsSoa::injectFunction(llvm::Function *originalFunc, int op)
       func = Function::Create(originalFunc->getFunctionType(), GlobalValue::ExternalLinkage,
                               originalFunc->getName(), currentModule());
       func->setCallingConv(CallingConv::C);
-      const PAListPtr pal;
-      func->setParamAttrs(pal);
+      const AttrListPtr pal;
+      func->setAttributes(pal);
       currentModule()->dump();
    } else {
       DenseMap<const Value*, Value *> val;
diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.h b/src/gallium/auxiliary/gallivm/instructionssoa.h
index 3e20b652dd..20cab3b3bb 100644
--- a/src/gallium/auxiliary/gallivm/instructionssoa.h
+++ b/src/gallium/auxiliary/gallivm/instructionssoa.h
@@ -96,7 +96,7 @@ private:
                                          const std::vector<llvm::Value*> in3);
    void injectFunction(llvm::Function *originalFunc, int op = TGSI_OPCODE_LAST);
 private:
-   llvm::IRBuilder  m_builder;
+   llvm::IRBuilder<>  m_builder;
    StorageSoa *m_storage;
 
    std::map<int, std::string> m_functionsMap;
-- 
cgit v1.2.3


From 8bdb4d2b2fdb12d0ba5249c289d349e35d893d00 Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Wed, 1 Oct 2008 00:00:58 +0200
Subject: Gallivm: add slt. glxgears should be running, except it isn't.

---
 src/gallium/auxiliary/gallivm/instructionssoa.cpp |   9 ++
 src/gallium/auxiliary/gallivm/instructionssoa.h   |   2 +
 src/gallium/auxiliary/gallivm/soabuiltins.c       | 155 +++++++++++++---------
 src/gallium/auxiliary/gallivm/tgsitollvm.cpp      |   1 +
 4 files changed, 101 insertions(+), 66 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.cpp b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
index 5863f37095..a658072551 100644
--- a/src/gallium/auxiliary/gallivm/instructionssoa.cpp
+++ b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
@@ -181,6 +181,7 @@ void InstructionsSoa::createFunctionMap()
    m_functionsMap[TGSI_OPCODE_POWER] = "pow";
    m_functionsMap[TGSI_OPCODE_LIT]   = "lit";
    m_functionsMap[TGSI_OPCODE_RSQ]   = "rsq";
+   m_functionsMap[TGSI_OPCODE_SLT]   = "slt";
 }
 
 void InstructionsSoa::createDependencies()
@@ -280,6 +281,14 @@ std::vector<llvm::Value*> InstructionsSoa::dp3(const std::vector<llvm::Value*> i
    return callBuiltin(func, in1, in2);
 }
 
+
+std::vector<llvm::Value*> InstructionsSoa::slt(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   llvm::Function *func = function(TGSI_OPCODE_SLT);
+   return callBuiltin(func, in1, in2);
+}
+
 llvm::Value * InstructionsSoa::allocaTemp()
 {
    VectorType *vector   = VectorType::get(Type::FloatTy, 4);
diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.h b/src/gallium/auxiliary/gallivm/instructionssoa.h
index 20cab3b3bb..3817fdc904 100644
--- a/src/gallium/auxiliary/gallivm/instructionssoa.h
+++ b/src/gallium/auxiliary/gallivm/instructionssoa.h
@@ -69,6 +69,8 @@ public:
    std::vector<llvm::Value*> pow(const std::vector<llvm::Value*> in1,
                                  const std::vector<llvm::Value*> in2);
    std::vector<llvm::Value*> rsq(const std::vector<llvm::Value*> in1);
+   std::vector<llvm::Value*> slt(const std::vector<llvm::Value*> in1,
+                                 const std::vector<llvm::Value*> in2);
    std::vector<llvm::Value*> sub(const std::vector<llvm::Value*> in1,
                                  const std::vector<llvm::Value*> in2);
    void         end();
diff --git a/src/gallium/auxiliary/gallivm/soabuiltins.c b/src/gallium/auxiliary/gallivm/soabuiltins.c
index 78f84510e2..cb85e1734e 100644
--- a/src/gallium/auxiliary/gallivm/soabuiltins.c
+++ b/src/gallium/auxiliary/gallivm/soabuiltins.c
@@ -36,6 +36,8 @@ typedef __attribute__(( ext_vector_type(4) )) float float4;
 
 extern float fabsf(float val);
 
+/* helpers */
+
 float4 absvec(float4 vec)
 {
    float4 res;
@@ -47,6 +49,58 @@ float4 absvec(float4 vec)
    return res;
 }
 
+float4 maxvec(float4 a, float4 b)
+{
+   return (float4){(a.x > b.x) ? a.x : b.x,
+         (a.y > b.y) ? a.y : b.y,
+         (a.z > b.z) ? a.z : b.z,
+         (a.w > b.w) ? a.w : b.w};
+}
+
+float4 minvec(float4 a, float4 b)
+{
+   return (float4){(a.x < b.x) ? a.x : b.x,
+         (a.y < b.y) ? a.y : b.y,
+         (a.z < b.z) ? a.z : b.z,
+         (a.w < b.w) ? a.w : b.w};
+}
+
+extern float powf(float num, float p);
+extern float sqrtf(float x);
+
+float4 powvec(float4 vec, float4 q)
+{
+   float4 p;
+   p.x = powf(vec.x, q.x);
+   p.y = powf(vec.y, q.y);
+   p.z = powf(vec.z, q.z);
+   p.w = powf(vec.w, q.w);
+   return p;
+}
+
+float4 sqrtvec(float4 vec)
+{
+   float4 p;
+   p.x = sqrtf(vec.x);
+   p.y = sqrtf(vec.y);
+   p.z = sqrtf(vec.z);
+   p.w = sqrtf(vec.w);
+   return p;
+}
+
+float4 sltvec(float4 v1, float4 v2)
+{
+   float4 p;                        /* per-component result: 1.0 where v1 < v2, else 0.0 */
+   p.x = (v1.x < v2.x) ? 1.0 : 0.0;
+   p.y = (v1.y < v2.y) ? 1.0 : 0.0;
+   p.z = (v1.z < v2.z) ? 1.0 : 0.0;
+   p.w = (v1.w < v2.w) ? 1.0 : 0.0;
+   return p;
+}
+
+
+/* instructions */
+
 void abs(float4 *res,
          float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w)
 {
@@ -69,7 +123,6 @@ void dp3(float4 *res,
    res[3] = dot;
 }
 
-
 void dp4(float4 *res,
          float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
          float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
@@ -83,35 +136,25 @@ void dp4(float4 *res,
    res[3] = dot;
 }
 
-extern float powf(float num, float p);
-extern float sqrtf(float x);
-
-float4 powvec(float4 vec, float4 q)
-{
-   float4 p;
-   p.x = powf(vec.x, q.x);
-   p.y = powf(vec.y, q.y);
-   p.z = powf(vec.z, q.z);
-   p.w = powf(vec.w, q.w);
-   return p;
-}
-
-void pow(float4 *res,
-         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
-         float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
+void lit(float4 *res,
+         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w)
 {
-   res[0] = powvec(tmp0x, tmp1x);
-   res[1] = res[0];
-   res[2] = res[0];
-   res[3] = res[0];
-}
+   const float4 zerovec = (float4) {0.0, 0.0, 0.0, 0.0};
+   const float4 min128 = (float4) {-128.f, -128.f, -128.f, -128.f};
+   const float4 plus128 = (float4) {128.f,  128.f,  128.f,  128.f};
 
-float4 minvec(float4 a, float4 b)
-{
-   return (float4){(a.x < b.x) ? a.x : b.x,
-         (a.y < b.y) ? a.y : b.y,
-         (a.z < b.z) ? a.z : b.z,
-         (a.w < b.w) ? a.w : b.w};
+   res[0] = (float4){1.0, 1.0, 1.0, 1.0};
+   if (tmp0x.x > 0) {
+      float4 tmpy = maxvec(tmp0y, zerovec);
+      float4 tmpw = minvec(tmp0w, plus128);
+      tmpw = maxvec(tmpw, min128);
+      res[1] = tmp0x;
+      res[2] = powvec(tmpy, tmpw);
+   } else {
+      res[1] = zerovec;
+      res[2] = zerovec;
+   }
+   res[3] = (float4){1.0, 1.0, 1.0, 1.0};
 }
 
 void min(float4 *res,
@@ -125,14 +168,6 @@ void min(float4 *res,
 }
 
 
-float4 maxvec(float4 a, float4 b)
-{
-   return (float4){(a.x > b.x) ? a.x : b.x,
-         (a.y > b.y) ? a.y : b.y,
-         (a.z > b.z) ? a.z : b.z,
-         (a.w > b.w) ? a.w : b.w};
-}
-
 void max(float4 *res,
          float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
          float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
@@ -143,37 +178,14 @@ void max(float4 *res,
    res[3] = maxvec(tmp0w, tmp1w);
 }
 
-
-void lit(float4 *res,
-         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w)
-{
-   const float4 zerovec = (float4) {0.0, 0.0, 0.0, 0.0};
-   const float4 min128 = (float4) {-128.f, -128.f, -128.f, -128.f};
-   const float4 plus128 = (float4) {128.f,  128.f,  128.f,  128.f};
-
-   res[0] = (float4){1.0, 1.0, 1.0, 1.0};
-   if (tmp0x.x > 0) {
-      float4 tmpy = maxvec(tmp0y, zerovec);
-      float4 tmpw = minvec(tmp0w, plus128);
-      tmpw = maxvec(tmpw, min128);
-      res[1] = tmp0x;
-      res[2] = powvec(tmpy, tmpw);
-   } else {
-      res[1] = zerovec;
-      res[2] = zerovec;
-   }
-   res[3] = (float4){1.0, 1.0, 1.0, 1.0};
-}
-
-
-float4 sqrtvec(float4 vec)
+void pow(float4 *res,
+         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
+         float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
 {
-   float4 p;
-   p.x = sqrtf(vec.x);
-   p.y = sqrtf(vec.y);
-   p.z = sqrtf(vec.z);
-   p.w = sqrtf(vec.w);
-   return p;
+   res[0] = powvec(tmp0x, tmp1x);
+   res[1] = res[0];
+   res[2] = res[0];
+   res[3] = res[0];
 }
 
 void rsq(float4 *res,
@@ -185,3 +197,14 @@ void rsq(float4 *res,
    res[2] = onevec/sqrtvec(absvec(tmp0z));
    res[3] = onevec/sqrtvec(absvec(tmp0w));
 }
+
+void slt(float4 *res,
+         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
+         float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
+{
+   res[0] = sltvec(tmp0x, tmp1x);
+   res[1] = sltvec(tmp0y, tmp1y);
+   res[2] = sltvec(tmp0z, tmp1z);
+   res[3] = sltvec(tmp0w, tmp1w);
+}
+
diff --git a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
index fdfbb76c16..7292c0e366 100644
--- a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
+++ b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
@@ -767,6 +767,7 @@ translate_instructionir(llvm::Module *module,
    }
       break;
    case TGSI_OPCODE_SLT: {
+      out = instr->slt(inputs[0], inputs[1]);
    }
       break;
    case TGSI_OPCODE_SGE: {
-- 
cgit v1.2.3


From a77976d2ee578d0483c64f2aa41719bbae9c1c97 Mon Sep 17 00:00:00 2001
From: Michal Krol <michal@tungstengraphics.com>
Date: Wed, 1 Oct 2008 19:36:04 +0200
Subject: mesa: Fix compiler warnings on Windows.

---
 src/mesa/shader/arbprogparse.c     | 2 +-
 src/mesa/shader/slang/slang_link.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/shader/arbprogparse.c b/src/mesa/shader/arbprogparse.c
index 983f61a653..448251b223 100644
--- a/src/mesa/shader/arbprogparse.c
+++ b/src/mesa/shader/arbprogparse.c
@@ -2603,7 +2603,7 @@ parse_src_reg (GLcontext * ctx, const GLubyte ** inst,
       /* If we're referencing the Program->Parameters[] array, check if the
        * parameter is really a constant/literal.  If so, set File to CONSTANT.
        */
-      assert(*Index < Program->Base.Parameters->NumParameters);
+      assert(*Index < (GLint) Program->Base.Parameters->NumParameters);
       file = Program->Base.Parameters->Parameters[*Index].Type;
       if (file == PROGRAM_CONSTANT)
          *File = PROGRAM_CONSTANT;
diff --git a/src/mesa/shader/slang/slang_link.c b/src/mesa/shader/slang/slang_link.c
index d884be2a75..00e8953768 100644
--- a/src/mesa/shader/slang/slang_link.c
+++ b/src/mesa/shader/slang/slang_link.c
@@ -408,7 +408,7 @@ _slang_update_inputs_outputs(struct gl_program *prog)
             }
          }
          else if (inst->SrcReg[j].File == PROGRAM_ADDRESS) {
-            maxAddrReg = MAX2(maxAddrReg, inst->SrcReg[j].Index + 1);
+            maxAddrReg = MAX2(maxAddrReg, (GLuint) (inst->SrcReg[j].Index + 1));
          }
       }
       if (inst->DstReg.File == PROGRAM_OUTPUT) {
-- 
cgit v1.2.3


From a15699c3f54edb5d5b42960e7568e587b752e407 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Wed, 1 Oct 2008 13:34:38 +0100
Subject: draw: add streamlined paths for fetching linear verts

---
 src/gallium/auxiliary/draw/draw_vs_aos.c    |  44 +++++----
 src/gallium/auxiliary/draw/draw_vs_aos.h    |  19 ++--
 src/gallium/auxiliary/draw/draw_vs_aos_io.c | 137 +++++++++++++++++++++-------
 3 files changed, 134 insertions(+), 66 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c
index a556477a76..4c794e0e23 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.c
@@ -92,9 +92,9 @@ struct x86_reg aos_get_x86( struct aos_compilation *cp,
          assert(which_reg == 1);
          offset = Offset(struct aos_machine, constants);
          break;
-      case X86_ATTRIBS:
+      case X86_BUFFERS:
          assert(which_reg == 0);
-         offset = Offset(struct aos_machine, attrib);
+         offset = Offset(struct aos_machine, buffer);
          break;
       default:
          assert(0);
@@ -1939,6 +1939,8 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient,
    save_fpu_state( &cp );
    set_fpu_round_nearest( &cp );
 
+   aos_init_inputs( &cp, linear );
+
    /* Note address for loop jump 
     */
    label = x86_get_label(cp.func);
@@ -2018,13 +2020,7 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient,
 
       /* Incr index
        */   
-      if (linear) {
-         x86_inc(cp.func, cp.idx_EBX);
-      } 
-      else {
-         x86_lea(cp.func, cp.idx_EBX, x86_make_disp(cp.idx_EBX, 4));
-      }
-
+      aos_incr_inputs( &cp, linear );
    }
    /* decr count, loop if not zero
     */
@@ -2065,14 +2061,10 @@ static void vaos_set_buffer( struct draw_vs_varient *varient,
                              unsigned stride )
 {
    struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
-   unsigned i;
 
-   for (i = 0; i < vaos->base.key.nr_inputs; i++) {
-      if (vaos->base.key.element[i].in.buffer == buf) {
-         vaos->attrib[i].input_ptr = ((char *)ptr +
-                                      vaos->base.key.element[i].in.offset);
-         vaos->attrib[i].input_stride = stride;
-      }
+   if (buf < vaos->nr_vb) {
+      vaos->buffer[buf].base_ptr = (char *)ptr;
+      vaos->buffer[buf].stride = stride;
    }
 }
 
@@ -2089,7 +2081,7 @@ static void PIPE_CDECL vaos_run_elts( struct draw_vs_varient *varient,
    machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
    machine->constants = vaos->draw->vs.aligned_constants;
    machine->immediates = vaos->base.vs->immediates;
-   machine->attrib = vaos->attrib;
+   machine->buffer = vaos->buffer;
 
    vaos->gen_run_elts( machine,
                        elts,
@@ -2108,7 +2100,7 @@ static void PIPE_CDECL vaos_run_linear( struct draw_vs_varient *varient,
    machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
    machine->constants = vaos->draw->vs.aligned_constants;
    machine->immediates = vaos->base.vs->immediates;
-   machine->attrib = vaos->attrib;
+   machine->buffer = vaos->buffer;
 
    vaos->gen_run_linear( machine,
                          start,
@@ -2127,7 +2119,7 @@ static void vaos_destroy( struct draw_vs_varient *varient )
 {
    struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
 
-   FREE( vaos->attrib );
+   FREE( vaos->buffer );
 
    x86_release_func( &vaos->func[0] );
    x86_release_func( &vaos->func[1] );
@@ -2140,6 +2132,7 @@ static void vaos_destroy( struct draw_vs_varient *varient )
 static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
                                                  const struct draw_vs_varient_key *key )
 {
+   unsigned i;
    struct draw_vs_varient_aos_sse *vaos = CALLOC_STRUCT(draw_vs_varient_aos_sse);
 
    if (!vaos)
@@ -2154,10 +2147,15 @@ static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
 
    vaos->draw = vs->draw;
 
-   vaos->attrib = MALLOC( key->nr_inputs * sizeof(vaos->attrib[0]) );
-   if (!vaos->attrib)
+   for (i = 0; i < key->nr_inputs; i++) 
+      vaos->nr_vb = MAX2( vaos->nr_vb, key->element[i].in.buffer + 1 );
+
+   vaos->buffer = MALLOC( vaos->nr_vb * sizeof(vaos->buffer[0]) );
+   if (!vaos->buffer)
       goto fail;
 
+   debug_printf("nr_vb: %d\n", vaos->nr_vb);
+
 #if 0
    tgsi_dump(vs->state.tokens, 0);
 #endif
@@ -2179,8 +2177,8 @@ static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
    return &vaos->base;
 
  fail:
-   if (vaos && vaos->attrib)
-      FREE(vaos->attrib);
+   if (vaos && vaos->buffer)
+      FREE(vaos->buffer);
 
    if (vaos)
       x86_release_func( &vaos->func[0] );
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.h b/src/gallium/auxiliary/draw/draw_vs_aos.h
index 7fe6f79db0..306392e5d6 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos.h
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.h
@@ -87,9 +87,10 @@ struct lit_info {
 #define MAX_SHINE_TAB    4
 #define MAX_LIT_INFO     16
 
-struct aos_attrib {
-   const void *input_ptr;
-   unsigned input_stride;
+struct aos_buffer {
+   const void *base_ptr;        /* start of the buffer's data, set by vaos_set_buffer() */
+   unsigned stride;             /* byte step between elements, set by vaos_set_buffer() */
+   void *ptr;                   /* updated per vertex */
 };
 
 
@@ -123,7 +124,7 @@ struct aos_machine {
    const float (*immediates)[4];     /* points to shader data */
    const float (*constants)[4];      /* points to draw data */
 
-   const struct aos_attrib *attrib; /* points to ? */
+   const struct aos_buffer *buffer; /* points to ? */
 };
 
 
@@ -179,8 +180,9 @@ struct x86_reg aos_get_shader_reg( struct aos_compilation *cp,
                                    unsigned file,
                                    unsigned idx );
 
-boolean aos_fetch_inputs( struct aos_compilation *cp,
-                          boolean linear );
+boolean aos_init_inputs( struct aos_compilation *cp, boolean linear );
+boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear );
+boolean aos_incr_inputs( struct aos_compilation *cp, boolean linear );
 
 boolean aos_emit_outputs( struct aos_compilation *cp );
 
@@ -210,7 +212,7 @@ do {                                                                    \
 #define X86_NULL       0
 #define X86_IMMEDIATES 1
 #define X86_CONSTANTS  2
-#define X86_ATTRIBS    3
+#define X86_BUFFERS    3
 
 struct x86_reg aos_get_x86( struct aos_compilation *cp,
                             unsigned which_reg,
@@ -232,7 +234,8 @@ struct draw_vs_varient_aos_sse {
    struct draw_vs_varient base;
    struct draw_context *draw;
 
-   struct aos_attrib *attrib;
+   struct aos_buffer *buffer;
+   unsigned nr_vb;
 
    vaos_run_linear_func gen_run_linear;
    vaos_run_elts_func gen_run_elts;
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
index 26297c74f8..8e08b9285f 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos_io.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
@@ -95,28 +95,6 @@ static void emit_load_R8G8B8A8_UNORM( struct aos_compilation *cp,
 
 
-static void get_src_ptr( struct aos_compilation *cp,
-                         struct x86_reg src,
-                         struct x86_reg elt,
-                         unsigned a )
-{
-   struct x86_reg attrib = x86_make_disp(aos_get_x86( cp, 0, X86_ATTRIBS ), 
-                                         a * sizeof(struct aos_attrib));
-
-   struct x86_reg input_ptr = x86_make_disp(attrib, 
-                                            Offset(struct aos_attrib, input_ptr));
-
-   struct x86_reg input_stride = x86_make_disp(attrib, 
-                                               Offset(struct aos_attrib, input_stride));
-
-   /* Calculate pointer to current attrib:
-    */
-   x86_mov(cp->func, src, input_stride);
-   x86_imul(cp->func, src, elt);
-   x86_add(cp->func, src, input_ptr);
-}
-
-
 /* Extended swizzles?  Maybe later.
  */  
 static void emit_swizzle( struct aos_compilation *cp,
@@ -128,22 +106,44 @@ static void emit_swizzle( struct aos_compilation *cp,
 }
 
 
+
+static boolean get_buffer_ptr( struct aos_compilation *cp,
+                            unsigned buf_idx,       /* index into the aos_buffer array */
+                            struct x86_reg elt,     /* operand holding the element index */
+                            struct x86_reg ptr)     /* register that receives the address */
+{
+   struct x86_reg buf = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ), 
+                                      buf_idx * sizeof(struct aos_buffer));
+
+   struct x86_reg buf_base_ptr = x86_make_disp(buf, 
+                                               Offset(struct aos_buffer, base_ptr));
+
+   struct x86_reg buf_stride = x86_make_disp(buf, 
+                                             Offset(struct aos_buffer, stride));
+
+   /* Emit code computing ptr = elt * stride + base_ptr for this buffer:
+    */
+   x86_mov(cp->func, ptr, buf_stride);
+   x86_imul(cp->func, ptr, elt);
+   x86_add(cp->func, ptr, buf_base_ptr);
+
+   return TRUE;
+}
+
+
+
+
 static boolean load_input( struct aos_compilation *cp,
                            unsigned idx,
-                           boolean linear )
+                           struct x86_reg bufptr )
 {
    unsigned format = cp->vaos->base.key.element[idx].in.format;
-   struct x86_reg src = cp->tmp_EAX;
+   unsigned offset = cp->vaos->base.key.element[idx].in.offset;
    struct x86_reg dataXMM = aos_get_xmm_reg(cp);
 
    /* Figure out source pointer address:
     */
-   get_src_ptr(cp, 
-               src, 
-               linear ? cp->idx_EBX : x86_deref(cp->idx_EBX),
-               idx);
-
-   src = x86_deref(src);
+   struct x86_reg src = x86_make_disp(bufptr, offset);
 
    aos_adopt_xmm_reg( cp,
                       dataXMM,
@@ -179,20 +179,87 @@ static boolean load_input( struct aos_compilation *cp,
    return TRUE;
 }
 
-
-boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear )
+static boolean load_inputs( struct aos_compilation *cp,
+                            unsigned buffer,
+                            struct x86_reg ptr )
 {
    unsigned i;
-   
+
    for (i = 0; i < cp->vaos->base.key.nr_inputs; i++) {
-      if (!load_input( cp, i, linear ))
+      if (cp->vaos->base.key.element[i].in.buffer == buffer) {
+
+         if (!load_input( cp, i, ptr ))
+            return FALSE;
+
+         cp->insn_counter++;
+      }
+   }
+   
+   return TRUE;
+}
+
+boolean aos_init_inputs( struct aos_compilation *cp, boolean linear )
+{
+   if (linear && cp->vaos->nr_vb == 1) {
+
+      struct x86_reg elt = cp->idx_EBX;
+      struct x86_reg ptr = cp->tmp_EAX;
+
+      if (!get_buffer_ptr( cp, 0, elt, ptr ))
          return FALSE;
-      cp->insn_counter++;
+
+      /* In the linear, single buffer case, keep the buffer pointer
+       * instead of the index number.
+       */
+      x86_mov( cp->func, elt, ptr );
+   }
+
+   return TRUE;
+}
+
+boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear )
+{
+   if (linear && cp->vaos->nr_vb == 1) {
+      /* Linear, single buffer: idx_EBX holds the buffer pointer itself (see
+       * aos_init_inputs).  Propagate load failures like the path below. */
+      if (!load_inputs( cp, 0, cp->idx_EBX ))
+         return FALSE;
+   }
+   else {
+      struct x86_reg elt = linear ? cp->idx_EBX : x86_deref(cp->idx_EBX);
+      unsigned j;
+   
+      for (j = 0; j < cp->vaos->nr_vb; j++) {
+         struct x86_reg ptr = cp->tmp_EAX;
+
+         if (!get_buffer_ptr( cp, j, elt, ptr ))
+            return FALSE;
+         cp->insn_counter++;
+
+         if (!load_inputs( cp, j, ptr ))
+            return FALSE;
+      }
    }
 
    return TRUE;
 }
 
+boolean aos_incr_inputs( struct aos_compilation *cp, boolean linear )
+{
+   if (linear && cp->vaos->nr_vb == 1) {
+      /* idx_EBX holds a buffer pointer here: advance it by buffer 0's stride. */
+      struct x86_reg stride = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ),
+                                            Offset(struct aos_buffer, stride));
+      x86_add(cp->func, cp->idx_EBX, stride);
+   }
+   else if (linear) {
+      x86_inc(cp->func, cp->idx_EBX);
+   } 
+   else {
+      x86_lea(cp->func, cp->idx_EBX, x86_make_disp(cp->idx_EBX, 4));
+   }
+   return TRUE;   /* declared boolean: report success like the other aos_*_inputs */
+}
 
 
-- 
cgit v1.2.3


From 66d4beb874606baab95fb6539de895eb373b0ccb Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Thu, 2 Oct 2008 12:46:01 +0100
Subject: rtasm: add prefetch instructions

---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c | 26 ++++++++++++++++++++++++++
 src/gallium/auxiliary/rtasm/rtasm_x86sse.h |  5 +++++
 2 files changed, 31 insertions(+)

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index 3bba9dcc07..a5abbcde49 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -675,6 +675,32 @@ void x86_and( struct x86_function *p,
  * SSE instructions
  */
 
+void sse_prefetchnta( struct x86_function *p, struct x86_reg ptr)
+{
+   DUMP_R( ptr );
+   assert(ptr.mod != mod_REG);    /* operand must be a memory reference, not a register */
+   emit_2ub(p, 0x0f, 0x18);       /* two-byte opcode shared by all the prefetch emitters */
+   emit_modrm_noreg(p, 0, ptr);   /* ModRM reg field /0 selects PREFETCHNTA */
+}
+
+void sse_prefetch0( struct x86_function *p, struct x86_reg ptr)
+{
+   DUMP_R( ptr );
+   assert(ptr.mod != mod_REG);    /* operand must be a memory reference, not a register */
+   emit_2ub(p, 0x0f, 0x18);       /* two-byte opcode shared by all the prefetch emitters */
+   emit_modrm_noreg(p, 1, ptr);   /* ModRM reg field /1 selects PREFETCHT0 */
+}
+
+void sse_prefetch1( struct x86_function *p, struct x86_reg ptr)
+{
+   DUMP_R( ptr );
+   assert(ptr.mod != mod_REG);    /* operand must be a memory reference, not a register */
+   emit_2ub(p, 0x0f, 0x18);       /* two-byte opcode shared by all the prefetch emitters */
+   emit_modrm_noreg(p, 2, ptr);   /* ModRM reg field /2 selects PREFETCHT1 */
+}
+
+
+
 
 void sse_movss( struct x86_function *p,
 		struct x86_reg dst,
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index 510aa1b0de..86091e7f6b 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -185,6 +185,11 @@ void sse2_pshufd( struct x86_function *p, struct x86_reg dest, struct x86_reg ar
 void sse2_rcpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_rcpss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 
+
+void sse_prefetchnta( struct x86_function *p, struct x86_reg ptr);
+void sse_prefetch0( struct x86_function *p, struct x86_reg ptr);
+void sse_prefetch1( struct x86_function *p, struct x86_reg ptr);
+
 void sse_addps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_addss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_cvtps2pi( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-- 
cgit v1.2.3


From 21f98ad30aaeab5085d12278830f485e61b47cc1 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Wed, 1 Oct 2008 18:40:01 +0100
Subject: draw: don't keep refetching constant inputs

---
 .../auxiliary/draw/draw_pt_fetch_shade_emit.c      |  37 +++---
 src/gallium/auxiliary/draw/draw_vs.h               |   4 +-
 src/gallium/auxiliary/draw/draw_vs_aos.c           |  26 ++++-
 src/gallium/auxiliary/draw/draw_vs_aos.h           |   2 +
 src/gallium/auxiliary/draw/draw_vs_aos_io.c        | 127 +++++++++++++++------
 src/gallium/auxiliary/draw/draw_vs_varient.c       |  10 +-
 6 files changed, 144 insertions(+), 62 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
index 73fc70c1bc..a0e08dd10a 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
@@ -79,6 +79,7 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
    unsigned num_vs_inputs = draw->vs.vertex_shader->info.num_inputs;
    const struct vertex_info *vinfo;
    unsigned i;
+   unsigned nr_vbs = 0;
    
 
    if (!draw->render->set_primitive( draw->render, 
@@ -102,7 +103,7 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
 
    fse->key.viewport = !draw->identity_viewport;
    fse->key.clip = !draw->bypass_clipping;
-   fse->key.pad = 0;
+   fse->key.const_vbuffers = 0;
 
    memset(fse->key.element, 0, 
           fse->key.nr_elements * sizeof(fse->key.element[0]));
@@ -116,9 +117,16 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
        */
       fse->key.element[i].in.buffer = src->vertex_buffer_index;
       fse->key.element[i].in.offset = src->src_offset;
+      nr_vbs = MAX2(nr_vbs, src->vertex_buffer_index + 1);
    }
    
+   for (i = 0; i < 5 && i < nr_vbs; i++) {
+      if (draw->pt.vertex_buffer[i].pitch == 0)
+         fse->key.const_vbuffers |= (1<<i);
+   }
 
+   if (0) debug_printf("%s: lookup const_vbuffers: %x\n", __FUNCTION__, fse->key.const_vbuffers);
+   
    {
       unsigned dst_offset = 0;
 
@@ -162,13 +170,7 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
       }
    }
 
-
-   /* Would normally look up a vertex shader and peruse its list of
-    * varients somehow.  We omitted that step and put all the
-    * hardcoded "shaders" into an array.  We're just making the
-    * assumption that this happens to be a matching shader...  ie
-    * you're running isosurf, aren't you?
-    */
+   
    fse->active = draw_vs_lookup_varient( draw->vs.vertex_shader, 
                                          &fse->key );
 
@@ -177,18 +179,17 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
       return ;
    }
 
+   if (0) debug_printf("%s: found const_vbuffers: %x\n", __FUNCTION__, 
+                       fse->active->key.const_vbuffers);
+
    /* Now set buffer pointers:
     */
-   for (i = 0; i < num_vs_inputs; i++) {
-      unsigned buf = draw->pt.vertex_element[i].vertex_buffer_index;
-
-      fse->active->set_input( fse->active, 
-                              i, 
-                              
-                              ((const ubyte *) draw->pt.user.vbuffer[buf] + 
-                               draw->pt.vertex_buffer[buf].buffer_offset),
-                              
-                              draw->pt.vertex_buffer[buf].pitch );
+   for (i = 0; i < draw->pt.nr_vertex_buffers; i++) {
+      fse->active->set_buffer( fse->active, 
+                               i, 
+                               ((const ubyte *) draw->pt.user.vbuffer[i] + 
+                                draw->pt.vertex_buffer[i].buffer_offset),
+                              draw->pt.vertex_buffer[i].pitch );
    }
 
    *max_vertices = (draw->render->max_vertex_buffer_bytes / 
diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h
index 45992d1986..68c24abad3 100644
--- a/src/gallium/auxiliary/draw/draw_vs.h
+++ b/src/gallium/auxiliary/draw/draw_vs.h
@@ -64,7 +64,7 @@ struct draw_vs_varient_key {
    unsigned nr_outputs:8;
    unsigned viewport:1;
    unsigned clip:1;
-   unsigned pad:5;
+   unsigned const_vbuffers:5;
    struct draw_varient_element element[PIPE_MAX_ATTRIBS];
 };
 
@@ -76,7 +76,7 @@ struct draw_vs_varient {
 
    struct draw_vertex_shader *vs;
 
-   void (*set_input)( struct draw_vs_varient *,
+   void (*set_buffer)( struct draw_vs_varient *,
                       unsigned i,
                       const void *ptr,
                       unsigned stride );
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c
index 4c794e0e23..87232865e2 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.c
@@ -196,6 +196,18 @@ static void spill( struct aos_compilation *cp, unsigned idx )
 }
 
 
+void aos_spill_all( struct aos_compilation *cp )
+{
+   unsigned i;
+
+   for (i = 0; i < 8; i++) {
+      if (cp->xmm[i].dirty) 
+         spill(cp, i);
+      aos_release_xmm_reg(cp, i);
+   }
+}
+
+
 static struct x86_reg get_xmm_writable( struct aos_compilation *cp,
                                         struct x86_reg reg )
 {
@@ -1941,6 +1953,9 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient,
 
    aos_init_inputs( &cp, linear );
 
+   cp.x86_reg[0] = 0;
+   cp.x86_reg[1] = 0;
+   
    /* Note address for loop jump 
     */
    label = x86_get_label(cp.func);
@@ -2066,6 +2081,8 @@ static void vaos_set_buffer( struct draw_vs_varient *varient,
       vaos->buffer[buf].base_ptr = (char *)ptr;
       vaos->buffer[buf].stride = stride;
    }
+
+   if (0) debug_printf("%s %d/%d: %p %d\n", __FUNCTION__, buf, vaos->nr_vb, ptr, stride);
 }
 
 
@@ -2078,6 +2095,8 @@ static void PIPE_CDECL vaos_run_elts( struct draw_vs_varient *varient,
    struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
    struct aos_machine *machine = vaos->draw->vs.aos_machine;
 
+   if (0) debug_printf("%s %d\n", __FUNCTION__, count);
+
    machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
    machine->constants = vaos->draw->vs.aligned_constants;
    machine->immediates = vaos->base.vs->immediates;
@@ -2097,6 +2116,9 @@ static void PIPE_CDECL vaos_run_linear( struct draw_vs_varient *varient,
    struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
    struct aos_machine *machine = vaos->draw->vs.aos_machine;
 
+   if (0) debug_printf("%s %d %d const: %x\n", __FUNCTION__, start, count, 
+                       vaos->base.key.const_vbuffers);
+
    machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
    machine->constants = vaos->draw->vs.aligned_constants;
    machine->immediates = vaos->base.vs->immediates;
@@ -2140,7 +2162,7 @@ static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
    
    vaos->base.key = *key;
    vaos->base.vs = vs;
-   vaos->base.set_input = vaos_set_buffer;
+   vaos->base.set_buffer = vaos_set_buffer;
    vaos->base.destroy = vaos_destroy;
    vaos->base.run_linear = vaos_run_linear;
    vaos->base.run_elts = vaos_run_elts;
@@ -2154,7 +2176,7 @@ static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
    if (!vaos->buffer)
       goto fail;
 
-   debug_printf("nr_vb: %d\n", vaos->nr_vb);
+   debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers);
 
 #if 0
    tgsi_dump(vs->state.tokens, 0);
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.h b/src/gallium/auxiliary/draw/draw_vs_aos.h
index 306392e5d6..264387517b 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos.h
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.h
@@ -176,6 +176,8 @@ void aos_adopt_xmm_reg( struct aos_compilation *cp,
                         unsigned idx,
                         unsigned dirty );
 
+void aos_spill_all( struct aos_compilation *cp );
+
 struct x86_reg aos_get_shader_reg( struct aos_compilation *cp, 
                                    unsigned file,
                                    unsigned idx );
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
index 8e08b9285f..b0c51d7fa1 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos_io.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
@@ -108,29 +108,45 @@ static void emit_swizzle( struct aos_compilation *cp,
 
 
 static boolean get_buffer_ptr( struct aos_compilation *cp,
-                            unsigned buf_idx,
-                            struct x86_reg elt,
-                            struct x86_reg ptr)
+                               boolean linear,
+                               unsigned buf_idx,
+                               struct x86_reg elt,
+                               struct x86_reg ptr)
 {
    struct x86_reg buf = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ), 
                                       buf_idx * sizeof(struct aos_buffer));
 
-   struct x86_reg buf_base_ptr = x86_make_disp(buf, 
-                                               Offset(struct aos_buffer, base_ptr));
-
    struct x86_reg buf_stride = x86_make_disp(buf, 
                                              Offset(struct aos_buffer, stride));
+   if (linear) {
+      struct x86_reg buf_ptr = x86_make_disp(buf, 
+                                             Offset(struct aos_buffer, ptr));
 
-   /* Calculate pointer to current attrib:
-    */
-   x86_mov(cp->func, ptr, buf_stride);
-   x86_imul(cp->func, ptr, elt);
-   x86_add(cp->func, ptr, buf_base_ptr);
 
-   return TRUE;
-}
+      /* Calculate pointer to current attrib:
+       */
+      x86_mov(cp->func, ptr, buf_ptr);
+      x86_mov(cp->func, elt, buf_stride);
+      x86_add(cp->func, elt, ptr);
+      sse_prefetchnta(cp->func, x86_deref(elt));
+      x86_mov(cp->func, buf_ptr, elt);
+   }
+   else {
+      struct x86_reg buf_base_ptr = x86_make_disp(buf, 
+                                                  Offset(struct aos_buffer, base_ptr));
+
+
+      /* Calculate pointer to current attrib:
+       */
+      x86_mov(cp->func, ptr, buf_stride);
+      x86_imul(cp->func, ptr, elt);
+      x86_add(cp->func, ptr, buf_base_ptr);
+   }
 
+   cp->insn_counter++;
 
+   return TRUE;
+}
 
 
 static boolean load_input( struct aos_compilation *cp,
@@ -200,18 +216,57 @@ static boolean load_inputs( struct aos_compilation *cp,
 
 boolean aos_init_inputs( struct aos_compilation *cp, boolean linear )
 {
-   if (linear && cp->vaos->nr_vb == 1) {
+   unsigned i;
+   for (i = 0; i < cp->vaos->nr_vb; i++) {
+      struct x86_reg buf = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ), 
+                                         i * sizeof(struct aos_buffer));
 
-      struct x86_reg elt = cp->idx_EBX;
-      struct x86_reg ptr = cp->tmp_EAX;
+      struct x86_reg buf_base_ptr = x86_make_disp(buf, 
+                                                  Offset(struct aos_buffer, base_ptr));
 
-      if (!get_buffer_ptr( cp, 0, elt, ptr ))
-         return FALSE;
+      if (cp->vaos->base.key.const_vbuffers & (1<<i)) {
+         struct x86_reg ptr = cp->tmp_EAX;
 
-      /* In the linear, single buffer case, keep the buffer pointer
-       * instead of the index number.
-       */
-      x86_mov( cp->func, elt, ptr );
+         x86_mov(cp->func, ptr, buf_base_ptr);
+
+         /* Load all inputs for this constant vertex buffer
+          */
+         load_inputs( cp, i, x86_deref(ptr) );
+         
+         /* Then just force them out to aos_machine.input[]
+          */
+         aos_spill_all( cp );
+
+      }
+      else if (linear) {
+
+         struct x86_reg elt = cp->idx_EBX;
+         struct x86_reg ptr = cp->tmp_EAX;
+
+         struct x86_reg buf_stride = x86_make_disp(buf, 
+                                                   Offset(struct aos_buffer, stride));
+
+         struct x86_reg buf_ptr = x86_make_disp(buf, 
+                                                Offset(struct aos_buffer, ptr));
+
+
+         /* Calculate pointer to current attrib:
+          */
+         x86_mov(cp->func, ptr, buf_stride);
+         x86_imul(cp->func, ptr, elt);
+         x86_add(cp->func, ptr, buf_base_ptr);
+
+
+         /* In the linear case, keep the buffer pointer instead of the
+          * index number.
+          */
+         if (cp->vaos->nr_vb == 1) 
+            x86_mov( cp->func, elt, ptr );
+         else
+            x86_mov( cp->func, buf_ptr, ptr );
+
+         cp->insn_counter++;
+      }
    }
 
    return TRUE;
@@ -219,23 +274,22 @@ boolean aos_init_inputs( struct aos_compilation *cp, boolean linear )
 
 boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear )
 {
-   if (linear && cp->vaos->nr_vb == 1) {
-      
-      load_inputs( cp, 0, cp->idx_EBX );
+   unsigned j;
 
-   }
-   else {
-      struct x86_reg elt = linear ? cp->idx_EBX : x86_deref(cp->idx_EBX);
-      unsigned j;
-   
-      for (j = 0; j < cp->vaos->nr_vb; j++) {
+   for (j = 0; j < cp->vaos->nr_vb; j++) {
+      if (cp->vaos->base.key.const_vbuffers & (1<<j)) {
+         /* just retrieve pre-transformed input */
+      }
+      else if (linear && cp->vaos->nr_vb == 1) {
+         load_inputs( cp, 0, cp->idx_EBX );
+      }
+      else {
+         struct x86_reg elt = linear ? cp->idx_EBX : x86_deref(cp->idx_EBX);
          struct x86_reg ptr = cp->tmp_EAX;
 
-         if (!get_buffer_ptr( cp, j, elt, ptr ))
+         if (!get_buffer_ptr( cp, linear, j, elt, ptr ))
             return FALSE;
 
-         cp->insn_counter++;
-
          if (!load_inputs( cp, j, ptr ))
             return FALSE;
       }
@@ -252,13 +306,16 @@ boolean aos_incr_inputs( struct aos_compilation *cp, boolean linear )
                                              Offset(struct aos_buffer, stride)));
 
       x86_add(cp->func, cp->idx_EBX, stride);
+      sse_prefetchnta(cp->func, x86_deref(cp->idx_EBX));
    }
    else if (linear) {
-      x86_inc(cp->func, cp->idx_EBX);
+      /* Nothing to do */
    } 
    else {
       x86_lea(cp->func, cp->idx_EBX, x86_make_disp(cp->idx_EBX, 4));
    }
+
+   return TRUE;
 }
 
 
diff --git a/src/gallium/auxiliary/draw/draw_vs_varient.c b/src/gallium/auxiliary/draw/draw_vs_varient.c
index 4daf05dae7..7ee567d478 100644
--- a/src/gallium/auxiliary/draw/draw_vs_varient.c
+++ b/src/gallium/auxiliary/draw/draw_vs_varient.c
@@ -64,10 +64,10 @@ struct draw_vs_varient_generic {
 
 
-static void vsvg_set_input( struct draw_vs_varient *varient,
-                            unsigned buffer,
-                            const void *ptr,
-                            unsigned stride )
+static void vsvg_set_buffer( struct draw_vs_varient *varient,
+                             unsigned buffer,
+                             const void *ptr,
+                             unsigned stride )
 {
    struct draw_vs_varient_generic *vsvg = (struct draw_vs_varient_generic *)varient;
 
@@ -265,7 +265,7 @@ struct draw_vs_varient *draw_vs_varient_generic( struct draw_vertex_shader *vs,
 
    vsvg->base.key = *key;
    vsvg->base.vs = vs;
-   vsvg->base.set_input     = vsvg_set_input;
+   vsvg->base.set_buffer    = vsvg_set_buffer;
    vsvg->base.run_elts      = vsvg_run_elts;
    vsvg->base.run_linear    = vsvg_run_linear;
    vsvg->base.destroy       = vsvg_destroy;
-- 
cgit v1.2.3


From 22eb067c8863cbd9078f136706effd5df3375dbb Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Thu, 2 Oct 2008 12:53:11 +0100
Subject: draw: modify prefetching slightly

---
 src/gallium/auxiliary/draw/draw_vs_aos_io.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
index b0c51d7fa1..dd79bc799a 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos_io.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
@@ -54,6 +54,7 @@ static void emit_load_R32G32B32( struct aos_compilation *cp,
 				 struct x86_reg data,
 				 struct x86_reg src_ptr )
 {
+#if 1
    sse_movss(cp->func, data, x86_make_disp(src_ptr, 8));
    /* data = z ? ? ? */
    sse_shufps(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ), SHUF(X,Y,Z,W) );
@@ -62,6 +63,16 @@ static void emit_load_R32G32B32( struct aos_compilation *cp,
    /* data = ? 0 z 1 */
    sse_movlps(cp->func, data, src_ptr);
    /* data = x y z 1 */
+#else
+   sse_movups(cp->func, data, src_ptr);
+   /* data = x y z ? */
+   sse2_pshufd(cp->func, data, data, SHUF(W,X,Y,Z) );
+   /* data = ? x y z */
+   sse_movss(cp->func, data, aos_get_internal_xmm( cp, IMM_ONES ) );
+   /* data = 1 x y z */
+   sse2_pshufd(cp->func, data, data, SHUF(Y,Z,W,X) );
+   /* data = x y z 1 */
+#endif
 }
 
 static void emit_load_R32G32( struct aos_compilation *cp, 
@@ -128,7 +139,7 @@ static boolean get_buffer_ptr( struct aos_compilation *cp,
       x86_mov(cp->func, ptr, buf_ptr);
       x86_mov(cp->func, elt, buf_stride);
       x86_add(cp->func, elt, ptr);
-      sse_prefetchnta(cp->func, x86_deref(elt));
+      if (buf_idx == 0) sse_prefetchnta(cp->func, x86_make_disp(elt, 192));
       x86_mov(cp->func, buf_ptr, elt);
    }
    else {
@@ -306,7 +317,7 @@ boolean aos_incr_inputs( struct aos_compilation *cp, boolean linear )
                                              Offset(struct aos_buffer, stride)));
 
       x86_add(cp->func, cp->idx_EBX, stride);
-      sse_prefetchnta(cp->func, x86_deref(cp->idx_EBX));
+      sse_prefetchnta(cp->func, x86_make_disp(cp->idx_EBX, 192));
    }
    else if (linear) {
       /* Nothing to do */
@@ -327,7 +338,7 @@ static void emit_store_R32G32B32A32( struct aos_compilation *cp,
 				     struct x86_reg dst_ptr,
 				     struct x86_reg dataXMM )
 {
-   sse_movups(cp->func, dst_ptr, dataXMM);
+   sse_movaps(cp->func, dst_ptr, dataXMM);
 }
 
 static void emit_store_R32G32B32( struct aos_compilation *cp, 
@@ -430,7 +441,7 @@ boolean aos_emit_outputs( struct aos_compilation *cp )
 
       if (data.file != file_XMM) {
          struct x86_reg tmp = aos_get_xmm_reg( cp );
-         sse_movups(cp->func, tmp, data);
+         sse_movaps(cp->func, tmp, data);
          data = tmp;
       }
       
-- 
cgit v1.2.3


From 6965532e14717f71a6f4353fb683c5070c6b7d7a Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Fri, 3 Oct 2008 13:50:34 +0100
Subject: rtasm: add sse_movntps

---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c | 12 ++++++++++++
 src/gallium/auxiliary/rtasm/rtasm_x86sse.h |  2 ++
 2 files changed, 14 insertions(+)

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index 9085f4cc0e..cc5871f873 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -653,6 +653,18 @@ void sse_prefetch1( struct x86_function *p, struct x86_reg ptr)
    emit_modrm_noreg(p, 2, ptr);
 }
 
+void sse_movntps( struct x86_function *p, 
+                  struct x86_reg dst,
+                  struct x86_reg src)
+{
+   DUMP_RR( dst, src );          /* was (dst, reg): 'reg' is not declared in this function */
+
+   assert(dst.mod != mod_REG);   /* destination operand must not be a plain register */
+   assert(src.mod == mod_REG);   /* source operand must be a register */
+   emit_2ub(p, 0x0f, 0x2b);      /* opcode bytes 0F 2B: MOVNTPS m128, xmm */
+   emit_modrm(p, src, dst);
+}
+
 
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index 2d7715f965..af79f07dd3 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -189,6 +189,8 @@ void sse_prefetchnta( struct x86_function *p, struct x86_reg ptr);
 void sse_prefetch0( struct x86_function *p, struct x86_reg ptr);
 void sse_prefetch1( struct x86_function *p, struct x86_reg ptr);
 
+void sse_movntps( struct x86_function *p, struct x86_reg dst, struct x86_reg src);
+
 void sse_addps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_addss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_cvtps2pi( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-- 
cgit v1.2.3


From 6280e335706f95ed0ebb089d8f72aeede9b5a1ad Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Fri, 3 Oct 2008 13:51:56 +0100
Subject: mesa: shrink texenvprogram state key struct

---
 src/mesa/main/texenvprogram.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/mesa/main/texenvprogram.c b/src/mesa/main/texenvprogram.c
index 97aa87e58c..ac49373604 100644
--- a/src/mesa/main/texenvprogram.c
+++ b/src/mesa/main/texenvprogram.c
@@ -44,15 +44,17 @@
 #define DISASSEM (MESA_VERBOSE & VERBOSE_DISASSEM)
 
 struct mode_opt {
-   GLuint Source:4;
-   GLuint Operand:3;
+   GLubyte Source:4;
+   GLubyte Operand:3;
 };
 
 struct state_key {
-   GLbitfield enabled_units;
+   GLuint nr_enabled_units:8;
+   GLuint enabled_units:8;
    GLuint separate_specular:1;
    GLuint fog_enabled:1;
    GLuint fog_mode:2;
+   GLuint inputs_available:12;
 
    struct {
       GLuint enabled:1;
@@ -62,10 +64,10 @@ struct state_key {
 
       GLuint NumArgsRGB:2;
       GLuint ModeRGB:4;
-      struct mode_opt OptRGB[3];
-
       GLuint NumArgsA:2;
       GLuint ModeA:4;
+
+      struct mode_opt OptRGB[3];
       struct mode_opt OptA[3];
    } unit[8];
 };
-- 
cgit v1.2.3


From 0e008d37979e4e5ede25056221583e02c08a5df7 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Fri, 3 Oct 2008 13:53:07 +0100
Subject: mesa: add missing state dependencies for various tracked constants

---
 src/mesa/shader/prog_statevars.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/src/mesa/shader/prog_statevars.c b/src/mesa/shader/prog_statevars.c
index 8acf37c3c0..47c46f63ec 100644
--- a/src/mesa/shader/prog_statevars.c
+++ b/src/mesa/shader/prog_statevars.c
@@ -488,6 +488,10 @@ _mesa_fetch_state(GLcontext *ctx, const gl_state_index state[],
       case STATE_PCM_BIAS:
          COPY_4V(value, ctx->Pixel.PostColorMatrixBias);
          break;
+
+         /* XXX: make sure new tokens added here are also handled in the 
+          * _mesa_program_state_flags() switch, below.
+          */
       default:
          /* unknown state indexes are silently ignored
           *  should be handled by the driver.
@@ -561,10 +565,26 @@ _mesa_program_state_flags(const gl_state_index state[STATE_LENGTH])
 
    case STATE_INTERNAL:
       switch (state[1]) {
+
+      case STATE_NORMAL_SCALE:
+         return _NEW_MODELVIEW;
+
       case STATE_TEXRECT_SCALE:
 	 return _NEW_TEXTURE;
       case STATE_FOG_PARAMS_OPTIMIZED:
 	 return _NEW_FOG;
+      case STATE_LIGHT_SPOT_DIR_NORMALIZED:
+      case STATE_LIGHT_POSITION:
+      case STATE_LIGHT_POSITION_NORMALIZED:
+      case STATE_LIGHT_HALF_VECTOR:
+         return _NEW_LIGHT;
+
+      case STATE_PT_SCALE:
+      case STATE_PT_BIAS:
+      case STATE_PCM_SCALE:
+      case STATE_PCM_BIAS:
+         return _NEW_PIXEL;
+
       default:
          /* unknown state indexes are silently ignored and
          *  no flag set, since it is handled by the driver.
-- 
cgit v1.2.3


From fa1b533012030cd67148b5bf1e018fd5e30c96f8 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Fri, 3 Oct 2008 13:55:40 +0100
Subject: mesa: add new internal state for tracking current vertex attribs

---
 src/mesa/main/mtypes.h           |  1 +
 src/mesa/main/state.c            |  4 ++++
 src/mesa/shader/prog_statevars.c |  8 ++++++++
 src/mesa/shader/prog_statevars.h |  1 +
 src/mesa/vbo/vbo_exec_api.c      | 44 ++++++++++++++++++++++++----------------
 src/mesa/vbo/vbo_save_draw.c     | 24 ++++++++++++++--------
 6 files changed, 56 insertions(+), 26 deletions(-)

diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index a5e1cf6a27..bc099dabeb 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -2744,6 +2744,7 @@ struct gl_matrix_stack
 #define _NEW_MULTISAMPLE        0x2000000  /**< __GLcontextRec::Multisample */
 #define _NEW_TRACK_MATRIX       0x4000000  /**< __GLcontextRec::VertexProgram */
 #define _NEW_PROGRAM            0x8000000  /**< __GLcontextRec::VertexProgram */
+#define _NEW_CURRENT_ATTRIB     0x10000000  /**< __GLcontextRec::Current */
 #define _NEW_ALL ~0
 /*@}*/
 
diff --git a/src/mesa/main/state.c b/src/mesa/main/state.c
index d355f78a0e..eb8dc2a339 100644
--- a/src/mesa/main/state.c
+++ b/src/mesa/main/state.c
@@ -407,6 +407,9 @@ _mesa_update_state_locked( GLcontext *ctx )
    GLbitfield new_state = ctx->NewState;
    GLbitfield prog_flags = _NEW_PROGRAM;
 
+   if (new_state == _NEW_CURRENT_ATTRIB) 
+      goto out;
+
    if (MESA_VERBOSE & VERBOSE_STATE)
       _mesa_print_state("_mesa_update_state", new_state);
 
@@ -484,6 +487,7 @@ _mesa_update_state_locked( GLcontext *ctx )
     * Set ctx->NewState to zero to avoid recursion if
     * Driver.UpdateState() has to call FLUSH_VERTICES().  (fixed?)
     */
+ out:
    new_state = ctx->NewState;
    ctx->NewState = 0;
    ctx->Driver.UpdateState(ctx, new_state);
diff --git a/src/mesa/shader/prog_statevars.c b/src/mesa/shader/prog_statevars.c
index 47c46f63ec..9cc33fa2c1 100644
--- a/src/mesa/shader/prog_statevars.c
+++ b/src/mesa/shader/prog_statevars.c
@@ -395,6 +395,12 @@ _mesa_fetch_state(GLcontext *ctx, const gl_state_index state[],
 
    case STATE_INTERNAL:
       switch (state[1]) {
+      case STATE_CURRENT_ATTRIB: {
+         const GLuint idx = (GLuint) state[2];
+         COPY_4V(value, ctx->Current.Attrib[idx]);
+         return;
+      }						  
+
       case STATE_NORMAL_SCALE:
          ASSIGN_4V(value, 
                    ctx->_ModelViewInvScale, 
@@ -565,6 +571,8 @@ _mesa_program_state_flags(const gl_state_index state[STATE_LENGTH])
 
    case STATE_INTERNAL:
       switch (state[1]) {
+      case STATE_CURRENT_ATTRIB:
+         return _NEW_CURRENT_ATTRIB;
 
       case STATE_NORMAL_SCALE:
          return _NEW_MODELVIEW;
diff --git a/src/mesa/shader/prog_statevars.h b/src/mesa/shader/prog_statevars.h
index 7b490e3d63..1f728c64e8 100644
--- a/src/mesa/shader/prog_statevars.h
+++ b/src/mesa/shader/prog_statevars.h
@@ -104,6 +104,7 @@ typedef enum gl_state_index_ {
    STATE_LOCAL,
 
    STATE_INTERNAL,		/* Mesa additions */
+   STATE_CURRENT_ATTRIB,        /* ctx->Current vertex attrib value */
    STATE_NORMAL_SCALE,
    STATE_TEXRECT_SCALE,
    STATE_FOG_PARAMS_OPTIMIZED,  /* for faster fog calc */
diff --git a/src/mesa/vbo/vbo_exec_api.c b/src/mesa/vbo/vbo_exec_api.c
index d70b4bb1a1..23f4f8331e 100644
--- a/src/mesa/vbo/vbo_exec_api.c
+++ b/src/mesa/vbo/vbo_exec_api.c
@@ -143,29 +143,37 @@ static void vbo_exec_copy_to_current( struct vbo_exec_context *exec )
 
    for (i = VBO_ATTRIB_POS+1 ; i < VBO_ATTRIB_MAX ; i++) {
       if (exec->vtx.attrsz[i]) {
-	 GLfloat *current = (GLfloat *)vbo->currval[i].Ptr;
-
          /* Note: the exec->vtx.current[i] pointers point into the
           * ctx->Current.Attrib and ctx->Light.Material.Attrib arrays.
           */
-	 COPY_CLEAN_4V(current, 
-		       exec->vtx.attrsz[i], 
-		       exec->vtx.attrptr[i]);
+	 GLfloat *current = (GLfloat *)vbo->currval[i].Ptr;
+         GLfloat tmp[4];
+
+         COPY_CLEAN_4V(tmp, 
+                       exec->vtx.attrsz[i], 
+                       exec->vtx.attrptr[i]);
+         
+         if (memcmp(current, tmp, sizeof(tmp)) != 0)
+         { 
+            memcpy(current, tmp, sizeof(tmp));
 
 	 
-	 /* Given that we explicitly state size here, there is no need
-	  * for the COPY_CLEAN above, could just copy 16 bytes and be
-	  * done.  The only problem is when Mesa accesses ctx->Current
-	  * directly.
-	  */
-	 vbo->currval[i].Size = exec->vtx.attrsz[i];
-
-	 /* This triggers rather too much recalculation of Mesa state
-	  * that doesn't get used (eg light positions).
-	  */
-	 if (i >= VBO_ATTRIB_MAT_FRONT_AMBIENT &&
-	     i <= VBO_ATTRIB_MAT_BACK_INDEXES)
-	    ctx->NewState |= _NEW_LIGHT;
+            /* Given that we explicitly state size here, there is no need
+             * for the COPY_CLEAN above, could just copy 16 bytes and be
+             * done.  The only problem is when Mesa accesses ctx->Current
+             * directly.
+             */
+            vbo->currval[i].Size = exec->vtx.attrsz[i];
+
+            /* This triggers rather too much recalculation of Mesa state
+             * that doesn't get used (eg light positions).
+             */
+            if (i >= VBO_ATTRIB_MAT_FRONT_AMBIENT &&
+                i <= VBO_ATTRIB_MAT_BACK_INDEXES)
+               ctx->NewState |= _NEW_LIGHT;
+            
+            ctx->NewState |= _NEW_CURRENT_ATTRIB;
+         }
       }
    }
 
diff --git a/src/mesa/vbo/vbo_save_draw.c b/src/mesa/vbo/vbo_save_draw.c
index ed82f09958..4c97acddb9 100644
--- a/src/mesa/vbo/vbo_save_draw.c
+++ b/src/mesa/vbo/vbo_save_draw.c
@@ -64,18 +64,26 @@ static void _playback_copy_to_current( GLcontext *ctx,
    for (i = VBO_ATTRIB_POS+1 ; i < VBO_ATTRIB_MAX ; i++) {
       if (node->attrsz[i]) {
 	 GLfloat *current = (GLfloat *)vbo->currval[i].Ptr;
+         GLfloat tmp[4];
 
-	 COPY_CLEAN_4V(current, 
-		       node->attrsz[i], 
-		       data);
+         COPY_CLEAN_4V(tmp, 
+                       node->attrsz[i], 
+                       data);
+         
+         if (memcmp(current, tmp, 4 * sizeof(GLfloat)) != 0)
+         {
+            memcpy(current, tmp, 4 * sizeof(GLfloat));
 
-	 vbo->currval[i].Size = node->attrsz[i];
+            vbo->currval[i].Size = node->attrsz[i];
 
-	 data += node->attrsz[i];
+            if (i >= VBO_ATTRIB_FIRST_MATERIAL &&
+                i <= VBO_ATTRIB_LAST_MATERIAL)
+               ctx->NewState |= _NEW_LIGHT;
+
+            ctx->NewState |= _NEW_CURRENT_ATTRIB;
+         }
 
-	 if (i >= VBO_ATTRIB_FIRST_MATERIAL &&
-	     i <= VBO_ATTRIB_LAST_MATERIAL)
-	    ctx->NewState |= _NEW_LIGHT;
+	 data += node->attrsz[i];
       }
    }
 
-- 
cgit v1.2.3


From d63a36ef3a4dd9cef1273fac5949e587c42813b5 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Fri, 3 Oct 2008 16:46:48 +0100
Subject: Mesa: short-circuit case when looking up the same program twice in
 cache

---
 src/mesa/shader/prog_cache.c | 29 +++++++++++++++++++++--------
 src/mesa/shader/prog_cache.h |  2 +-
 2 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/src/mesa/shader/prog_cache.c b/src/mesa/shader/prog_cache.c
index 36a25377c5..9437e59613 100644
--- a/src/mesa/shader/prog_cache.c
+++ b/src/mesa/shader/prog_cache.c
@@ -44,6 +44,7 @@ struct cache_item
 struct gl_program_cache
 {
    struct cache_item **items;
+   struct cache_item *last;
    GLuint size, n_items;
 };
 
@@ -83,6 +84,8 @@ rehash(struct gl_program_cache *cache)
    struct cache_item *c, *next;
    GLuint size, i;
 
+   cache->last = NULL;
+
    size = cache->size * 3;
    items = (struct cache_item**) _mesa_malloc(size * sizeof(*items));
    _mesa_memset(items, 0, size * sizeof(*items));
@@ -105,6 +108,8 @@ clear_cache(GLcontext *ctx, struct gl_program_cache *cache)
 {
    struct cache_item *c, *next;
    GLuint i;
+   
+   cache->last = NULL;
 
    for (i = 0; i < cache->size; i++) {
       for (c = cache->items[i]; c; c = next) {
@@ -149,18 +154,26 @@ _mesa_delete_program_cache(GLcontext *ctx, struct gl_program_cache *cache)
 
 
 struct gl_program *
-_mesa_search_program_cache(const struct gl_program_cache *cache,
+_mesa_search_program_cache(struct gl_program_cache *cache,
                            const void *key, GLuint keysize)
 {
-   const GLuint hash = hash_key(key, keysize);
-   struct cache_item *c;
-
-   for (c = cache->items[hash % cache->size]; c; c = c->next) {
-      if (c->hash == hash && memcmp(c->key, key, keysize) == 0)
-	 return c->program;
+   if (cache->last && 
+       memcmp(cache->last->key, key, keysize) == 0) {
+      return cache->last->program;
    }
+   else {
+      const GLuint hash = hash_key(key, keysize);
+      struct cache_item *c;
+
+      for (c = cache->items[hash % cache->size]; c; c = c->next) {
+         if (c->hash == hash && memcmp(c->key, key, keysize) == 0) {
+            cache->last = c;
+            return c->program;
+         }
+      }
 
-   return NULL;
+      return NULL;
+   }
 }
 
 
diff --git a/src/mesa/shader/prog_cache.h b/src/mesa/shader/prog_cache.h
index a8c91fba01..4e1ccac03f 100644
--- a/src/mesa/shader/prog_cache.h
+++ b/src/mesa/shader/prog_cache.h
@@ -42,7 +42,7 @@ _mesa_delete_program_cache(GLcontext *ctx, struct gl_program_cache *pc);
 
 
 extern struct gl_program *
-_mesa_search_program_cache(const struct gl_program_cache *cache,
+_mesa_search_program_cache(struct gl_program_cache *cache,
                            const void *key, GLuint keysize);
 
 extern void
-- 
cgit v1.2.3


From 1680ef869625dc1fe9cf481b180382a34e0738e7 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Fri, 3 Oct 2008 17:30:59 +0100
Subject: mesa: avoid generating constant vertex attributes in fixedfunc
 programs

Keep track of enabled/active vertex attributes.
Keep track of potential vertex program outputs.

When generating fragment program, replace references to fragment attributes
which are effectively non-varying and non-computed passthrough attributes with
references to the new CURRENT_ATTRIB tracked state value.

Only downside is slight ugliness in VBO code where we need to validate state
twice in succession.
---
 src/mesa/main/mtypes.h        |  2 +
 src/mesa/main/state.c         | 38 ++++++++++++++++-
 src/mesa/main/state.h         |  3 ++
 src/mesa/main/texenvprogram.c | 94 ++++++++++++++++++++++++++++++++++++++++---
 src/mesa/vbo/vbo_exec_array.c | 41 +++++++++++++++----
 src/mesa/vbo/vbo_exec_draw.c  |  8 ++++
 src/mesa/vbo/vbo_save_draw.c  |  4 ++
 7 files changed, 177 insertions(+), 13 deletions(-)

diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index bc099dabeb..ca1e369a35 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -3073,6 +3073,8 @@ struct __GLcontextRec
    GLenum RenderMode;        /**< either GL_RENDER, GL_SELECT, GL_FEEDBACK */
    GLbitfield NewState;      /**< bitwise-or of _NEW_* flags */
 
+   GLuint varying_vp_inputs;
+
    /** \name Derived state */
    /*@{*/
    GLbitfield _TriangleCaps;      /**< bitwise-or of DD_* flags */
diff --git a/src/mesa/main/state.c b/src/mesa/main/state.c
index eb8dc2a339..e0eb5f81e2 100644
--- a/src/mesa/main/state.c
+++ b/src/mesa/main/state.c
@@ -465,7 +465,8 @@ _mesa_update_state_locked( GLcontext *ctx )
       _mesa_update_tnl_spaces( ctx, new_state );
 
    if (ctx->FragmentProgram._MaintainTexEnvProgram) {
-      prog_flags |= (_NEW_TEXTURE | _NEW_FOG | _DD_NEW_SEPARATE_SPECULAR);
+      prog_flags |= (_NEW_ARRAY | _NEW_TEXTURE_MATRIX | _NEW_LIGHT |
+                     _NEW_TEXTURE | _NEW_FOG | _DD_NEW_SEPARATE_SPECULAR);
    }
    if (ctx->VertexProgram._MaintainTnlProgram) {
       prog_flags |= (_NEW_ARRAY | _NEW_TEXTURE | _NEW_TEXTURE_MATRIX |
@@ -504,3 +505,38 @@ _mesa_update_state( GLcontext *ctx )
    _mesa_update_state_locked(ctx);
    _mesa_unlock_context_textures(ctx);
 }
+
+
+
+
+/* Want to figure out which fragment program inputs are actually
+ * constant/current values from ctx->Current.  These should be
+ * referenced as a tracked state variable rather than a fragment
+ * program input, to save the overhead of putting a constant value in
+ * every submitted vertex, transferring it to hardware, interpolating
+ * it across the triangle, etc...
+ *
+ * When there is a VP bound, just use vp->outputs.  But when we're
+ * generating vp from fixed function state, basically want to
+ * calculate:
+ *
+ * vp_out_2_fp_in( vp_in_2_vp_out( varying_inputs ) | 
+ *                 potential_vp_outputs )
+ *
+ * Where potential_vp_outputs is calculated by looking at enabled
+ * texgen, etc.
+ * 
+ * The generated fragment program should then only declare inputs that
+ * may vary or otherwise differ from the ctx->Current values.
+ * Otherwise, the fp should track them as state values instead.
+ */
+void
+_mesa_set_varying_vp_inputs( GLcontext *ctx,
+                             unsigned varying_inputs )
+{
+   if (ctx->varying_vp_inputs != varying_inputs) {
+      ctx->varying_vp_inputs = varying_inputs;
+      ctx->NewState |= _NEW_ARRAY;
+      //_mesa_printf("%s %x\n", __FUNCTION__, varying_inputs);
+   }
+}
diff --git a/src/mesa/main/state.h b/src/mesa/main/state.h
index bb7cb8f32a..dc08043a76 100644
--- a/src/mesa/main/state.h
+++ b/src/mesa/main/state.h
@@ -37,5 +37,8 @@ _mesa_update_state( GLcontext *ctx );
 extern void
 _mesa_update_state_locked( GLcontext *ctx );
 
+void
+_mesa_set_varying_vp_inputs( GLcontext *ctx,
+                             unsigned varying_inputs );
 
 #endif
diff --git a/src/mesa/main/texenvprogram.c b/src/mesa/main/texenvprogram.c
index ac49373604..7cd82f98b0 100644
--- a/src/mesa/main/texenvprogram.c
+++ b/src/mesa/main/texenvprogram.c
@@ -189,6 +189,63 @@ static GLuint translate_tex_src_bit( GLbitfield bit )
    }
 }
 
+#define VERT_BIT_TEX_ANY    (0xff << VERT_ATTRIB_TEX0)
+#define VERT_RESULT_TEX_ANY (0xff << VERT_RESULT_TEX0)
+
+/* Identify all possible varying inputs.  The fragment program will
+ * never reference non-varying inputs, but will track them via state
+ * constants instead.
+ *
+ * This function figures out all the inputs that the fragment program
+ * has access to.  The bitmask is later reduced to just those which
+ * are actually referenced.
+ */
+static GLuint get_fp_input_mask( GLcontext *ctx )
+{
+   GLuint fp_inputs = 0;
+
+   if (1) {
+      GLuint varying_inputs = ctx->varying_vp_inputs;
+
+      /* First look at what values may be computed by the generated
+       * vertex program:
+       */
+      if (ctx->Light.Enabled) {
+         fp_inputs |= FRAG_BIT_COL0;
+
+         if (ctx->_TriangleCaps & DD_SEPARATE_SPECULAR)
+            fp_inputs |= FRAG_BIT_COL1;
+      }
+
+      fp_inputs |= (ctx->Texture._TexGenEnabled |
+                    ctx->Texture._TexMatEnabled) << FRAG_ATTRIB_TEX0;
+
+      /* Then look at what might be varying as a result of enabled
+       * arrays, etc:
+       */
+      if (varying_inputs & VERT_BIT_COLOR0) fp_inputs |= FRAG_BIT_COL0;
+      if (varying_inputs & VERT_BIT_COLOR1) fp_inputs |= FRAG_BIT_COL1;
+
+      fp_inputs |= (((varying_inputs & VERT_BIT_TEX_ANY) >> VERT_ATTRIB_TEX0) 
+                    << FRAG_ATTRIB_TEX0);
+
+   }
+   else {
+      /* calculate from vp->outputs */
+      GLuint vp_outputs = 0;
+
+      if (vp_outputs & (1 << VERT_RESULT_COL0)) fp_inputs |= FRAG_BIT_COL0;
+      if (vp_outputs & (1 << VERT_RESULT_COL1)) fp_inputs |= FRAG_BIT_COL1;
+
+      fp_inputs |= (((vp_outputs & VERT_RESULT_TEX_ANY) 
+                   << VERT_RESULT_TEX0) 
+                  >> FRAG_ATTRIB_TEX0);
+   }
+   
+   return fp_inputs;
+}
+
+
 /**
  * Examine current texture environment state and generate a unique
  * key to identify it.
@@ -196,17 +253,21 @@ static GLuint translate_tex_src_bit( GLbitfield bit )
 static void make_state_key( GLcontext *ctx,  struct state_key *key )
 {
    GLuint i, j;
-	
+   GLuint inputs_referenced = FRAG_BIT_COL0;
+   GLuint inputs_available = get_fp_input_mask( ctx );
+
    memset(key, 0, sizeof(*key));
 
    for (i=0;i<MAX_TEXTURE_UNITS;i++) {
       const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[i];
 		
-      if (!texUnit->_ReallyEnabled)
+      if (!texUnit->_ReallyEnabled) 
          continue;
 
       key->unit[i].enabled = 1;
       key->enabled_units |= (1<<i);
+      key->nr_enabled_units = i+1;
+      inputs_referenced |= FRAG_BIT_TEX(i);
 
       key->unit[i].source_index = 
 	 translate_tex_src_bit(texUnit->_ReallyEnabled);		
@@ -234,13 +295,18 @@ static void make_state_key( GLcontext *ctx,  struct state_key *key )
       }
    }
 	
-   if (ctx->_TriangleCaps & DD_SEPARATE_SPECULAR)
+   if (ctx->_TriangleCaps & DD_SEPARATE_SPECULAR) {
       key->separate_specular = 1;
+      inputs_referenced |= FRAG_BIT_COL1;
+   }
 
    if (ctx->Fog.Enabled) {
       key->fog_enabled = 1;
       key->fog_mode = translate_fog_mode(ctx->Fog.Mode);
+      inputs_referenced |= FRAG_BIT_FOGC; /* maybe */
    }
+
+   key->inputs_available = (inputs_available & inputs_referenced);
 }
 
 /* Use uregs to represent registers internally, translate to Mesa's
@@ -446,11 +512,29 @@ static struct ureg register_param5( struct texenv_fragment_program *p,
 #define register_param3(p,s0,s1,s2)    register_param5(p,s0,s1,s2,0,0)
 #define register_param4(p,s0,s1,s2,s3) register_param5(p,s0,s1,s2,s3,0)
 
+static GLuint frag_to_vert_attrib( GLuint attrib )
+{
+   switch (attrib) {
+   case FRAG_ATTRIB_COL0: return VERT_ATTRIB_COLOR0;
+   case FRAG_ATTRIB_COL1: return VERT_ATTRIB_COLOR1;
+   default:
+      assert(attrib >= FRAG_ATTRIB_TEX0);
+      assert(attrib <= FRAG_ATTRIB_TEX7);
+      return attrib - FRAG_ATTRIB_TEX0 + VERT_ATTRIB_TEX0;
+   }
+}
+
 
 static struct ureg register_input( struct texenv_fragment_program *p, GLuint input )
 {
-   p->program->Base.InputsRead |= (1 << input);
-   return make_ureg(PROGRAM_INPUT, input);
+   if (p->state->inputs_available & (1<<input)) {
+      p->program->Base.InputsRead |= (1 << input);
+      return make_ureg(PROGRAM_INPUT, input);
+   }
+   else {
+      GLuint idx = frag_to_vert_attrib( input );
+      return register_param3( p, STATE_INTERNAL, STATE_CURRENT_ATTRIB, idx );
+   }
 }
 
 
diff --git a/src/mesa/vbo/vbo_exec_array.c b/src/mesa/vbo/vbo_exec_array.c
index 0f9d8da356..3d74f9f431 100644
--- a/src/mesa/vbo/vbo_exec_array.c
+++ b/src/mesa/vbo/vbo_exec_array.c
@@ -127,6 +127,7 @@ static void recalculate_input_bindings( GLcontext *ctx )
    struct vbo_context *vbo = vbo_context(ctx);
    struct vbo_exec_context *exec = &vbo->exec;
    const struct gl_client_array **inputs = &exec->array.inputs[0];
+   GLuint const_inputs = 0;
    GLuint i;
 
    exec->array.program_mode = get_program_mode(ctx);
@@ -141,19 +142,24 @@ static void recalculate_input_bindings( GLcontext *ctx )
       for (i = 0; i <= VERT_ATTRIB_TEX7; i++) {
 	 if (exec->array.legacy_array[i]->Enabled)
 	    inputs[i] = exec->array.legacy_array[i];
-	 else
+	 else {
 	    inputs[i] = &vbo->legacy_currval[i];
+            const_inputs |= 1 << i;
+         }
       }
 
       for (i = 0; i < MAT_ATTRIB_MAX; i++) {
 	 inputs[VERT_ATTRIB_GENERIC0 + i] = &vbo->mat_currval[i];
+         const_inputs |= 1 << (VERT_ATTRIB_GENERIC0 + i);
       }
 
       /* Could use just about anything, just to fill in the empty
        * slots:
        */
-      for (i = MAT_ATTRIB_MAX; i < VERT_ATTRIB_MAX - VERT_ATTRIB_GENERIC0; i++)
+      for (i = MAT_ATTRIB_MAX; i < VERT_ATTRIB_MAX - VERT_ATTRIB_GENERIC0; i++) {
 	 inputs[VERT_ATTRIB_GENERIC0 + i] = &vbo->generic_currval[i];
+         const_inputs |= 1 << (VERT_ATTRIB_GENERIC0 + i);
+      }
 
       break;
    case VP_NV:
@@ -166,15 +172,19 @@ static void recalculate_input_bindings( GLcontext *ctx )
 	    inputs[i] = exec->array.generic_array[i];
 	 else if (exec->array.legacy_array[i]->Enabled)
 	    inputs[i] = exec->array.legacy_array[i];
-	 else
+	 else {
 	    inputs[i] = &vbo->legacy_currval[i];
+            const_inputs |= 1 << i;
+         }
       }
 
       /* Could use just about anything, just to fill in the empty
        * slots:
        */
-      for (i = VERT_ATTRIB_GENERIC0; i < VERT_ATTRIB_MAX; i++)
+      for (i = VERT_ATTRIB_GENERIC0; i < VERT_ATTRIB_MAX; i++) {
 	 inputs[i] = &vbo->generic_currval[i - VERT_ATTRIB_GENERIC0];
+         const_inputs |= 1 << i;
+      }
 
       break;
    case VP_ARB:
@@ -189,25 +199,34 @@ static void recalculate_input_bindings( GLcontext *ctx )
 	 inputs[0] = exec->array.generic_array[0];
       else if (exec->array.legacy_array[0]->Enabled)
 	 inputs[0] = exec->array.legacy_array[0];
-      else
+      else {
 	 inputs[0] = &vbo->legacy_currval[0];
+         const_inputs |= 1 << 0;
+      }
 
 
       for (i = 1; i <= VERT_ATTRIB_TEX7; i++) {
 	 if (exec->array.legacy_array[i]->Enabled)
 	    inputs[i] = exec->array.legacy_array[i];
-	 else
+	 else {
 	    inputs[i] = &vbo->legacy_currval[i];
+            const_inputs |= 1 << i;
+         }
       }
 
       for (i = 0; i < 16; i++) {
 	 if (exec->array.generic_array[i]->Enabled)
 	    inputs[VERT_ATTRIB_GENERIC0 + i] = exec->array.generic_array[i];
-	 else
+	 else {
 	    inputs[VERT_ATTRIB_GENERIC0 + i] = &vbo->generic_currval[i];
+            const_inputs |= 1 << (VERT_ATTRIB_GENERIC0 + i);
+         }
+
       }
       break;
    }
+
+   _mesa_set_varying_vp_inputs( ctx, ~const_inputs );
 }
 
 static void bind_arrays( GLcontext *ctx )
@@ -257,6 +276,11 @@ vbo_exec_DrawArrays(GLenum mode, GLint start, GLsizei count)
 
    bind_arrays( ctx );
 
+   /* Again...
+    */
+   if (ctx->NewState)
+      _mesa_update_state( ctx );
+
    prim[0].begin = 1;
    prim[0].end = 1;
    prim[0].weak = 0;
@@ -297,6 +321,9 @@ vbo_exec_DrawRangeElements(GLenum mode,
 
    bind_arrays( ctx );
 
+   if (ctx->NewState)
+      _mesa_update_state( ctx );
+
    ib.count = count;
    ib.type = type; 
    ib.obj = ctx->Array.ElementArrayBufferObj;
diff --git a/src/mesa/vbo/vbo_exec_draw.c b/src/mesa/vbo/vbo_exec_draw.c
index f497e9a5a5..ad60c9b05f 100644
--- a/src/mesa/vbo/vbo_exec_draw.c
+++ b/src/mesa/vbo/vbo_exec_draw.c
@@ -150,6 +150,7 @@ static void vbo_exec_bind_arrays( GLcontext *ctx )
    GLubyte *data = exec->vtx.buffer_map;
    const GLuint *map;
    GLuint attr;
+   GLuint varying_inputs = 0;
 
    /* Install the default (ie Current) attributes first, then overlay
     * all active ones.
@@ -211,8 +212,11 @@ static void vbo_exec_bind_arrays( GLcontext *ctx )
 	 arrays[attr]._MaxElement = count; /* ??? */
 
 	 data += exec->vtx.attrsz[src] * sizeof(GLfloat);
+         varying_inputs |= 1<<attr;
       }
    }
+
+   _mesa_set_varying_vp_inputs( ctx, varying_inputs );
 }
 
 
@@ -242,6 +246,10 @@ void vbo_exec_vtx_flush( struct vbo_exec_context *exec )
 	  */
 	 vbo_exec_bind_arrays( ctx );
 
+         if (ctx->NewState)
+            _mesa_update_state( ctx );
+
+
 	 ctx->Driver.UnmapBuffer(ctx, target, exec->vtx.bufferobj);
 	 exec->vtx.buffer_map = NULL;
 
diff --git a/src/mesa/vbo/vbo_save_draw.c b/src/mesa/vbo/vbo_save_draw.c
index 4c97acddb9..b015bf2786 100644
--- a/src/mesa/vbo/vbo_save_draw.c
+++ b/src/mesa/vbo/vbo_save_draw.c
@@ -118,6 +118,7 @@ static void vbo_bind_vertex_list( GLcontext *ctx,
    GLuint data = node->buffer_offset;
    const GLuint *map;
    GLuint attr;
+   GLuint varying_inputs = 0;
 
    /* Install the default (ie Current) attributes first, then overlay
     * all active ones.
@@ -167,8 +168,11 @@ static void vbo_bind_vertex_list( GLcontext *ctx,
 	 assert(arrays[attr].BufferObj->Name);
 
 	 data += node->attrsz[src] * sizeof(GLfloat);
+         varying_inputs |= 1<<attr;
       }
    }
+
+   _mesa_set_varying_vp_inputs( ctx, varying_inputs );
 }
 
 static void vbo_save_loopback_vertex_list( GLcontext *ctx,
-- 
cgit v1.2.3


From db9ba91971a1f279b040b30bf8fd5d13a70f0a03 Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Fri, 3 Oct 2008 12:16:04 -0700
Subject: intel: Don't advertise unsupported extensions on pre-965 hardware

Move GL_ARB_texture_non_power_of_two and GL_ATI_separate_stencil
from the generic extension list to the 965-specific list.  Neither
extension is supported on i830-class hardware, and
GL_ATI_separate_stencil is not supported on i915-class hardware.
GL_ARB_texture_non_power_of_two is supported on i915-class hardware and
is already in the i915-specific list.
---
 src/mesa/drivers/dri/intel/intel_context.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/intel/intel_context.c b/src/mesa/drivers/dri/intel/intel_context.c
index e53972c46d..9dc32e487f 100644
--- a/src/mesa/drivers/dri/intel/intel_context.c
+++ b/src/mesa/drivers/dri/intel/intel_context.c
@@ -355,7 +355,6 @@ static const struct dri_extension card_extensions[] = {
    { "GL_ARB_texture_env_crossbar",       NULL },
    { "GL_ARB_texture_env_dot3",           NULL },
    { "GL_ARB_texture_mirrored_repeat",    NULL },
-   { "GL_ARB_texture_non_power_of_two",   NULL },
    { "GL_ARB_texture_rectangle",          NULL },
    { "GL_ARB_vertex_buffer_object",       GL_ARB_vertex_buffer_object_functions },
    { "GL_ARB_vertex_program",             GL_ARB_vertex_program_functions },
@@ -379,7 +378,6 @@ static const struct dri_extension card_extensions[] = {
    { "GL_EXT_texture_lod_bias",           NULL },
    { "GL_3DFX_texture_compression_FXT1",  NULL },
    { "GL_APPLE_client_storage",           NULL },
-   { "GL_ATI_separate_stencil",           GL_ATI_separate_stencil_functions },
    { "GL_MESA_pack_invert",               NULL },
    { "GL_MESA_ycbcr_texture",             NULL },
    { "GL_NV_blend_square",                NULL },
@@ -401,9 +399,11 @@ static const struct dri_extension brw_extensions[] = {
    { "GL_ARB_shading_language_100",       GL_VERSION_2_0_functions },
    { "GL_ARB_shading_language_120",       GL_VERSION_2_1_functions },
    { "GL_ARB_shadow",                     NULL },
+   { "GL_ARB_texture_non_power_of_two",   NULL },
    { "GL_ARB_vertex_shader",              GL_ARB_vertex_shader_functions },
    { "GL_EXT_shadow_funcs",               NULL },
    { "GL_EXT_texture_sRGB",		  NULL },
+   { "GL_ATI_separate_stencil",           GL_ATI_separate_stencil_functions },
    { NULL,                                NULL }
 };
 
-- 
cgit v1.2.3


From afaa53040bd01ca86762e7d7b1a5a65810767921 Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Fri, 3 Oct 2008 18:00:43 -0600
Subject: CELL: changes to generate SPU code for stenciling

This set of code changes is for stencil code generation
support.  Both one-sided and two-sided stenciling are supported.
In addition to the raw code generation changes, these changes had
to be made elsewhere in the system:

- Added new "register set" feature to the SPE assembly generation.
  A "register set" is a way to allocate multiple registers and free
  them all at the same time, delegating register allocation management
  to the spe_function unit.  It's quite useful in complex register
  allocation schemes (like stenciling).

- Added and improved SPE macro calculations.
  These are operations between registers and unsigned integer
  immediates.  In many cases, the calculation can be performed
  with a single instruction; the macros will generate the
  single instruction if possible, or generate a register load
  and register-to-register operation if not.  These macro
  functions are: spe_load_uint() (which has new ways to
  load a value in a single instruction), spe_and_uint(),
  spe_xor_uint(), spe_compare_equal_uint(), and spe_compare_greater_uint().

- Added facing to fragment generation.  While rendering, the rasterizer
  needs to be able to determine front- and back-facing fragments, in order
  to correctly apply two-sided stencil.  That requires these changes:
  - Added front_winding field to the cell_command_render block, so that
    the state tracker could communicate to the rasterizer what it
    considered to be the front-facing direction.
  - Added fragment facing as an input to the fragment function.
  - Calculated facing is passed during emit_quad().
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c        | 246 +++++-
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h        |  41 +-
 src/gallium/drivers/cell/common.h                  |   1 +
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c   | 881 ++++++++++++++++++---
 src/gallium/drivers/cell/ppu/cell_render.c         |   1 +
 src/gallium/drivers/cell/ppu/cell_vbuf.c           |   1 +
 src/gallium/drivers/cell/spu/spu_main.h            |   3 +-
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c |  19 +-
 src/gallium/drivers/cell/spu/spu_per_fragment_op.h |   3 +-
 src/gallium/drivers/cell/spu/spu_render.c          |   4 +-
 src/gallium/drivers/cell/spu/spu_tri.c             |  35 +-
 src/gallium/drivers/cell/spu/spu_tri.h             |   2 +-
 12 files changed, 1091 insertions(+), 146 deletions(-)

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 491141f190..8a87e9abb1 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -359,14 +359,21 @@ void _name (struct spe_function *p, int imm) \
  */
 void spe_init_func(struct spe_function *p, unsigned code_size)
 {
+    register unsigned int i;
+
     p->store = align_malloc(code_size, 16);
     p->num_inst = 0;
     p->max_inst = code_size / SPE_INST_SIZE;
 
+    p->set_count = 0;
+    memset(p->regs, 0, SPE_NUM_REGS * sizeof(p->regs[0]));
+
     /* Conservatively treat R0 - R2 and R80 - R127 as non-volatile.
      */
-    p->regs[0] = ~7;
-    p->regs[1] = (1U << (80 - 64)) - 1;
+    p->regs[0] = p->regs[1] = p->regs[2] = 1;
+    for (i = 80; i <= 127; i++) {
+      p->regs[i] = 1;
+    }
 
     p->print = false;
     p->indent = 0;
@@ -398,12 +405,8 @@ int spe_allocate_available_register(struct spe_function *p)
 {
    unsigned i;
    for (i = 0; i < SPE_NUM_REGS; i++) {
-      const uint64_t mask = (1ULL << (i % 64));
-      const unsigned idx = i / 64;
-
-      assert(idx < 2);
-      if ((p->regs[idx] & mask) != 0) {
-         p->regs[idx] &= ~mask;
+      if (p->regs[i] == 0) {
+         p->regs[i] = 1;
          return i;
       }
    }
@@ -417,31 +420,68 @@ int spe_allocate_available_register(struct spe_function *p)
  */
 int spe_allocate_register(struct spe_function *p, int reg)
 {
-   const unsigned idx = reg / 64;
-   const unsigned bit = reg % 64;
-
    assert(reg < SPE_NUM_REGS);
-   assert((p->regs[idx] & (1ULL << bit)) != 0);
-
-   p->regs[idx] &= ~(1ULL << bit);
+   assert(p->regs[reg] == 0);
+   p->regs[reg] = 1;
    return reg;
 }
 
 
 /**
- * Mark the given SPE register as "unallocated".
+ * Mark the given SPE register as "unallocated".  Note that this should
+ * only be used on registers allocated in the current register set; an
+ * assertion will fail if an attempt is made to deallocate a register
+ * allocated in an earlier register set.
  */
 void spe_release_register(struct spe_function *p, int reg)
 {
-   const unsigned idx = reg / 64;
-   const unsigned bit = reg % 64;
+   assert(reg < SPE_NUM_REGS);
+   assert(p->regs[reg] == 1);
 
-   assert(idx < 2);
+   p->regs[reg] = 0;
+}
 
-   assert(reg < SPE_NUM_REGS);
-   assert((p->regs[idx] & (1ULL << bit)) == 0);
+/**
+ * Start a new set of registers.  This can be called if
+ * it will be difficult later to determine exactly what
+ * registers were actually allocated during a code generation
+ * sequence, and you really just want to deallocate all of them.
+ */
+void spe_allocate_register_set(struct spe_function *p)
+{
+   register unsigned int i;
+
+   /* Keep track of the set count.  If it ever wraps around to 0, 
+    * we're in trouble.
+    */
+   p->set_count++;
+   assert(p->set_count > 0);
+
+   /* Increment the allocation count of all registers currently
+    * allocated.  Then any registers that are allocated in this set
+    * will be the only ones with a count of 1; they'll all be released
+    * when the register set is released.
+    */
+   for (i = 0; i < SPE_NUM_REGS; i++) {
+      if (p->regs[i] > 0) p->regs[i]++;
+   }
+}
+
+void spe_release_register_set(struct spe_function *p)
+{
+   unsigned int i;
+
+   /* If the set count drops below zero, we're in trouble. */
+   assert(p->set_count > 0);
+   p->set_count--;
 
-   p->regs[idx] |= (1ULL << bit);
+   /* Drop the allocation level of all registers.  Any allocated
+    * during this register set will drop to 0 and then become
+    * available.
+    */
+   for (i = 0; i < SPE_NUM_REGS; i++) {
+      if (p->regs[i] > 0) p->regs[i]--;
+   }
 }
 
 
@@ -603,8 +643,10 @@ void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
 {
    /* If the whole value is in the lower 18 bits, use ila, which
     * doesn't sign-extend.  Otherwise, if the two halfwords of
-    * the constant are identical, use ilh.  Otherwise, we have
-    * to use ilhu followed by iohl.
+    * the constant are identical, use ilh.  Otherwise, if every byte of
+    * the desired value is 0x00 or 0xff, we can use Form Select Mask for
+    * Bytes Immediate (fsmbi) to load the value in a single instruction.
+    * Otherwise, in the general case, we have to use ilhu followed by iohl.
     */
    if ((ui & 0xfffc0000) == ui) {
       spe_ila(p, rT, ui);
@@ -612,13 +654,171 @@ void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
    else if ((ui >> 16) == (ui & 0xffff)) {
       spe_ilh(p, rT, ui & 0xffff);
    }
+   else if (
+      ((ui & 0x000000ff) == 0 || (ui & 0x000000ff) == 0x000000ff) &&
+      ((ui & 0x0000ff00) == 0 || (ui & 0x0000ff00) == 0x0000ff00) &&
+      ((ui & 0x00ff0000) == 0 || (ui & 0x00ff0000) == 0x00ff0000) &&
+      ((ui & 0xff000000) == 0 || (ui & 0xff000000) == 0xff000000)
+   ) {
+      unsigned int mask = 0;
+      /* fsmbi duplicates each bit in the given mask eight times,
+       * using a 16-bit value to initialize a 16-byte quadword.
+       * Each 4-bit nybble of the mask corresponds to a full word
+       * of the result; look at the value and figure out the mask
+       * (replicated for each word in the quadword), and then
+       * form the "select mask" to get the value.
+       */
+      if ((ui & 0x000000ff) == 0x000000ff) mask |= 0x1111;
+      if ((ui & 0x0000ff00) == 0x0000ff00) mask |= 0x2222;
+      if ((ui & 0x00ff0000) == 0x00ff0000) mask |= 0x4444;
+      if ((ui & 0xff000000) == 0xff000000) mask |= 0x8888;
+      spe_fsmbi(p, rT, mask);
+   }
    else {
+      /* The general case: this usually uses two instructions, but
+       * may use only one if the low-order 16 bits of each word are 0.
+       */
       spe_ilhu(p, rT, ui >> 16);
       if (ui & 0xffff)
          spe_iohl(p, rT, ui & 0xffff);
    }
 }
 
+/* This function is constructed identically to spe_xor_uint() below.
+ * Changes to one should be made in the other.
+ */
+void spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If we can, emit a single instruction, either And Byte Immediate
+    * (which uses the same constant across each byte), And Halfword Immediate
+    * (which sign-extends a 10-bit immediate to 16 bits and uses that
+    * across each halfword), or And Word Immediate (which sign-extends
+    * a 10-bit immediate to 32 bits).
+    *
+    * Otherwise, we'll need to use a temporary register.
+    */
+   register unsigned int tmp;
+
+   /* If the upper 23 bits are all 0s or all 1s, sign extension
+    * will work and we can use And Word Immediate
+    */
+   tmp = ui & 0xfffffe00;
+   if (tmp == 0xfffffe00 || tmp  == 0) {
+      spe_andi(p, rT, rA, ui & 0x000003ff);
+      return;
+   }
+   
+   /* If the ui field is symmetric along halfword boundaries and
+    * the upper 7 bits of each halfword are all 0s or 1s, we
+    * can use And Halfword Immediate
+    */
+   tmp = ui & 0xfe00fe00;
+   if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) {
+      spe_andhi(p, rT, rA, ui & 0x000003ff);
+      return;
+   }
+
+   /* If the ui field is symmetric in each byte, then we can use
+    * the And Byte Immediate instruction.
+    */
+   tmp = ui & 0x000000ff;
+   if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) {
+      spe_andbi(p, rT, rA, tmp);
+      return;
+   }
+
+   /* Otherwise, we'll have to use a temporary register. */
+   unsigned int tmp_reg = spe_allocate_available_register(p);
+   spe_load_uint(p, tmp_reg, ui);
+   spe_and(p, rT, rA, tmp_reg);
+   spe_release_register(p, tmp_reg);
+}
+
+/* This function is constructed identically to spe_and_uint() above.
+ * Changes to one should be made in the other.
+ */
+void spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If we can, emit a single instruction, either Exclusive Or Byte 
+    * Immediate (which uses the same constant across each byte), Exclusive 
+    * Or Halfword Immediate (which sign-extends a 10-bit immediate to 
+    * 16 bits and uses that across each halfword), or Exclusive Or Word 
+    * Immediate (which sign-extends a 10-bit immediate to 32 bits).
+    *
+    * Otherwise, we'll need to use a temporary register.
+    */
+   register unsigned int tmp;
+
+   /* If the upper 23 bits are all 0s or all 1s, sign extension
+    * will work and we can use Exclusive Or Word Immediate
+    */
+   tmp = ui & 0xfffffe00;
+   if (tmp == 0xfffffe00 || tmp  == 0) {
+      spe_xori(p, rT, rA, ui & 0x000003ff);
+      return;
+   }
+   
+   /* If the ui field is symmetric along halfword boundaries and
+    * the upper 7 bits of each halfword are all 0s or 1s, we
+    * can use Exclusive Or Halfword Immediate
+    */
+   tmp = ui & 0xfe00fe00;
+   if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) {
+      spe_xorhi(p, rT, rA, ui & 0x000003ff);
+      return;
+   }
+
+   /* If the ui field is symmetric in each byte, then we can use
+    * the Exclusive Or Byte Immediate instruction.
+    */
+   tmp = ui & 0x000000ff;
+   if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) {
+      spe_xorbi(p, rT, rA, tmp);
+      return;
+   }
+
+   /* Otherwise, we'll have to use a temporary register. */
+   unsigned int tmp_reg = spe_allocate_available_register(p);
+   spe_load_uint(p, tmp_reg, ui);
+   spe_xor(p, rT, rA, tmp_reg);
+   spe_release_register(p, tmp_reg);
+}
+
+void
+spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If the comparison value is 9 bits or less, it fits inside a
+    * Compare Equal Word Immediate instruction.
+    */
+   if ((ui & 0x000001ff) == ui) {
+      spe_ceqi(p, rT, rA, ui);
+   }
+   /* Otherwise, we're going to have to load a word first. */
+   else {
+      unsigned int tmp_reg = spe_allocate_available_register(p);
+      spe_load_uint(p, tmp_reg, ui);
+      spe_ceq(p, rT, rA, tmp_reg);
+      spe_release_register(p, tmp_reg);
+   }
+}
+
+void
+spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If the comparison value is 10 bits or less, it fits inside a
+    * Compare Logical Greater Than Word Immediate instruction.
+    */
+   if ((ui & 0x000003ff) == ui) {
+      spe_clgti(p, rT, rA, ui);
+   }
+   /* Otherwise, we're going to have to load a word first. */
+   else {
+      unsigned int tmp_reg = spe_allocate_available_register(p);
+      spe_load_uint(p, tmp_reg, ui);
+      spe_clgt(p, rT, rA, tmp_reg);
+      spe_release_register(p, tmp_reg);
+   }
+}
 
 void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 61c7edeb60..cd2e245409 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -53,17 +53,26 @@ struct spe_function
    uint num_inst;
    uint max_inst;
 
-    /**
-     * Mask of used / unused registers
-     *
-     * Each set bit corresponds to an available register.  Each cleared bit
-     * corresponds to an allocated register.
+   /**
+    * The "set count" reflects the number of nested register sets
+    * currently in effect.  In the unlikely case that the count overflows,
+    * register allocation will start to be confused, which is critical
+    * enough that we check for it.
+    */
+   unsigned char set_count;
+
+   /** 
+    * Flags for used and unused registers.  Each byte corresponds to a
+    * register; a 0 in that byte means that the register is available.
+    * A value of 1 means that the register was allocated in the current
+    * register set.  Any other value N means that the register was allocated
+    * N-1 register sets ago.
      *
      * \sa
      * spe_allocate_register, spe_allocate_available_register,
-     * spe_release_register
+     * spe_allocate_register_set, spe_release_register_set, spe_release_register, 
      */
-    uint64_t regs[SPE_NUM_REGS / 64];
+    unsigned char regs[SPE_NUM_REGS];
 
     boolean print; /**< print/dump instructions as they're emitted? */
     int indent;    /**< number of spaces to indent */
@@ -77,6 +86,8 @@ extern unsigned spe_code_size(const struct spe_function *p);
 extern int spe_allocate_available_register(struct spe_function *p);
 extern int spe_allocate_register(struct spe_function *p, int reg);
 extern void spe_release_register(struct spe_function *p, int reg);
+extern void spe_allocate_register_set(struct spe_function *p);
+extern void spe_release_register_set(struct spe_function *p);
 
 extern void spe_print_code(struct spe_function *p, boolean enable);
 extern void spe_indent(struct spe_function *p, int spaces);
@@ -307,6 +318,22 @@ spe_load_int(struct spe_function *p, unsigned rT, int i);
 extern void
 spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui);
 
+/** And immediate value into rT. */
+extern void
+spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Xor immediate value into rT. */
+extern void
+spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Compare equal with immediate value. */
+extern void
+spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Compare greater with immediate value. */
+extern void
+spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
 /** Replicate word 0 of rA across rT. */
 extern void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA);
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 99329fd8e2..c223bc1744 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -227,6 +227,7 @@ struct cell_command_render
    float xmin, ymin, xmax, ymax;  /* XXX another dummy field */
    uint min_index;
    boolean inline_verts;
+   uint front_winding; /* the rasterizer needs to be able to determine facing to apply front/back-facing stencil */
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 653afc235d..f920ae13b4 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -54,10 +54,12 @@
  * \param ifragZ_reg  register containing integer fragment Z values (in)
  * \param ifbZ_reg    register containing integer frame buffer Z values (in/out)
  * \param zmask_reg   register containing result of Z test/comparison (out)
+ *
+ * Returns true if the Z-buffer needs to be updated.
  */
-static void
-gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
-               struct spe_function *f,
+static boolean
+gen_depth_test(struct spe_function *f,
+               const struct pipe_depth_stencil_alpha_state *dsa,
                int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg)
 {
    /* NOTE: we use clgt below, not cgt, because we want to compare _unsigned_
@@ -132,7 +134,10 @@ gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
        * framebufferZ = (ztest_passed ? fragmentZ : framebufferZ;
        */
       spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg);
+      return true;
    }
+
+   return false;
 }
 
 
@@ -238,22 +243,35 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
  * it and have to allocate and load it again unnecessarily.
  */
 static inline void
-setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value)
+setup_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int *r)
 {
    if (*is_already_set) return;
    *r = spe_allocate_available_register(f);
-   spe_load_float(f, *r, value);
    *is_already_set = true;
 }
 
 static inline void
-release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+release_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
 {
     if (!*is_already_set) return;
     spe_release_register(f, r);
     *is_already_set = false;
 }
 
+static inline void
+setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value)
+{
+   if (*is_already_set) return;
+   setup_optional_register(f, is_already_set, r);
+   spe_load_float(f, *r, value);
+}
+
+static inline void
+release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+{
+    release_optional_register(f, is_already_set, r);
+}
+
 /**
  * Generate SPE code to implement the given blend mode for a quad of pixels.
  * \param f          SPE function to append instruction onto.
@@ -1117,6 +1134,633 @@ gen_colormask(struct spe_function *f,
     spe_release_register(f, colormask_reg);
 }
 
+/* This function is annoyingly similar to gen_depth_test(), above, except
+ * that instead of comparing two varying values (i.e. fragment and buffer),
+ * we're comparing a varying value with a static value.  As such, we have
+ * access to the Compare Immediate instructions where we don't in 
+ * gen_depth_test(), which is what makes us very different.
+ *
+ * The return value in the stencil_pass_reg is a bitmask of valid
+ * fragments that also passed the stencil test.  The bitmask of valid
+ * fragments that failed would be found in (mask_reg & ~stencil_pass_reg).
+ */
+static void
+gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state,
+                 unsigned int mask_reg, unsigned int fbS_reg,
+                 unsigned int stencil_pass_reg)
+{
+   /* Generate code that puts the set of passing fragments into the stencil_pass_reg
+    * register, taking into account whether each fragment was active to begin with.
+    */
+   switch (state->func) {
+   case PIPE_FUNC_EQUAL:
+      /* stencil_pass = mask & (s == reference) */
+      spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      /* (the caller derives stencil_fail = mask & ~stencil_pass) */
+      break;
+
+   case PIPE_FUNC_NOTEQUAL:
+      /* stencil_pass = mask & ~(s == reference) */
+      spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      break;
+
+   case PIPE_FUNC_LESS:
+      /* stencil_pass = mask & (reference < s) = mask & (s > reference) */
+      spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      break;
+
+   case PIPE_FUNC_GREATER: {
+      /* stencil_pass = mask & (reference > s) */
+      /* There's no convenient Compare Less Than Immediate instruction, so
+       * we'll have to do this one the harder way, by loading a register and
+       * comparing directly.  Compare Logical Greater Than Word (clgt)
+       * treats its operands as unsigned - no sign extension.
+       */
+      unsigned int tmp_reg = spe_allocate_available_register(f);
+      spe_load_uint(f, tmp_reg, state->ref_value);
+      spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+      spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      spe_release_register(f, tmp_reg);
+      break;
+   }
+
+   case PIPE_FUNC_GEQUAL:
+      /* stencil_pass = mask & (reference >= s) = mask & ~(s > reference) */
+      spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      break;
+
+   case PIPE_FUNC_LEQUAL: {
+      /* stencil_pass = mask & (reference <= s) = mask & ~(reference > s) */
+      /* As above, we have to do this by loading a register */
+      unsigned int tmp_reg = spe_allocate_available_register(f);
+      spe_load_uint(f, tmp_reg, state->ref_value);
+      spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+      spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      spe_release_register(f, tmp_reg);
+      break;
+   }
+
+   case PIPE_FUNC_NEVER:
+      /* stencil_pass = mask & 0 = 0 */
+      spe_load_uint(f, stencil_pass_reg, 0);
+      /* No fragment ever passes, so the zero must not be overwritten by the mask. */
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      /* stencil_pass = mask & 1 = mask */
+      spe_move(f, stencil_pass_reg, mask_reg);
+      break;
+   }
+
+   /* The fragments that passed the stencil test are now in stencil_pass_reg.
+    * The fragments that failed would be (mask_reg & ~stencil_pass_reg).
+    */
+}
+
+/* This function generates code that calculates a set of new stencil values
+ * given the earlier values and the operation to apply.  It does not
+ * apply any tests.  It is intended to be called up to 3 times
+ * (for the stencil fail operation, for the stencil pass-z fail operation,
+ * and for the stencil pass-z pass operation) to collect up to three
+ * possible sets of values, and for the caller to combine them based
+ * on the result of the tests.
+ *
+ * stencil_max_value should be (2^n - 1) where n is the number of bits
+ * in the stencil buffer - in other words, it should be usable as a mask.
+ */
+static void
+gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
+                   unsigned int stencil_ref_value, unsigned int stencil_max_value,
+                   unsigned int fbS_reg, unsigned int newS_reg)
+{
+   /* The code below assumes that newS_reg and fbS_reg are not the same
+    * register; if they can be, the calculations below will have to use
+    * an additional temporary register.  For now, mark the assumption
+    * with an assertion that will fail if they are the same.
+    */
+   ASSERT(fbS_reg != newS_reg);
+
+   /* The code also assumes that the stencil_max_value is of the form
+    * 2^n-1 and can therefore be used as a mask for the valid bits in
+    * addition to a maximum.  Make sure this is the case as well.
+    * The clever math below exploits the fact that incrementing a
+    * binary number serves to flip all the bits of a number starting at
+    * the LSB and continuing to (and including) the first zero bit
+    * found.  That means that a number and its increment will always
+    * have at least one bit in common (the high order bit, if nothing
+    * else) *unless* the number is zero, *or* the number is of a form
+    * consisting of some number of 1s in the low-order bits followed
+    * by nothing but 0s in the high-order bits.  The latter case
+    * implies it's of the form 2^n-1.
+    */
+   ASSERT(stencil_max_value > 0 && ((stencil_max_value + 1) & stencil_max_value) == 0);
+
+   switch(stencil_op) {
+   case PIPE_STENCIL_OP_KEEP:
+      /* newS = S */
+      spe_move(f, newS_reg, fbS_reg);
+      break;
+
+   case PIPE_STENCIL_OP_ZERO:
+      /* newS = 0 */
+      spe_zero(f, newS_reg);
+      break;
+
+   case PIPE_STENCIL_OP_REPLACE:
+      /* newS = stencil reference value */
+      spe_load_uint(f, newS_reg, stencil_ref_value);
+      break;
+
+   case PIPE_STENCIL_OP_INCR: {
+      /* newS = (s == max ? max : s + 1) */
+      unsigned int equals_reg = spe_allocate_available_register(f);
+
+      spe_compare_equal_uint(f, equals_reg, fbS_reg, stencil_max_value);
+      /* Add Word Immediate computes rT = rA + 10-bit signed immediate */
+      spe_ai(f, newS_reg, fbS_reg, 1);
+      /* selb takes rB where the mask is 1: keep s where s == max, else use s + 1 */
+      spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
+
+      spe_release_register(f, equals_reg);
+      break;
+   }
+   case PIPE_STENCIL_OP_DECR: {
+      /* newS = (s == 0 ? 0 : s - 1) */
+      unsigned int equals_reg = spe_allocate_available_register(f);
+
+      spe_compare_equal_uint(f, equals_reg, fbS_reg, 0);
+      /* Add Word Immediate with a (-1) value works */
+      spe_ai(f, newS_reg, fbS_reg, -1);
+      /* selb takes rB where the mask is 1: keep s where s == 0, else use s - 1 */
+      spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
+
+      spe_release_register(f, equals_reg);
+      break;
+   }
+   case PIPE_STENCIL_OP_INCR_WRAP:
+      /* newS = (s == max ? 0 : s + 1), but since max is 2^n-1, we can
+       * do a normal add and mask off the correct bits
+       */
+      spe_ai(f, newS_reg, fbS_reg, 1);
+      spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
+      break;
+
+   case PIPE_STENCIL_OP_DECR_WRAP:
+      /* newS = (s == 0 ? max : s - 1), but we'll pull the same mask trick as above */
+      spe_ai(f, newS_reg, fbS_reg, -1);
+      spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
+      break;
+
+   case PIPE_STENCIL_OP_INVERT:
+      /* newS = ~s.  We take advantage of the mask/max value to invert only
+       * the valid bits for the field so we don't have to do an extra "and".
+       */
+      spe_xor_uint(f, newS_reg, fbS_reg, stencil_max_value);
+      break;
+
+   default:
+      ASSERT(0);
+   }
+}
+
+
+/* This function generates code to get all the necessary possible
+ * stencil values.  For each of the output registers (fail_reg,
+ * zfail_reg, and zpass_reg), it either allocates a new register
+ * and calculates a new set of values based on the stencil operation,
+ * or it reuses a register allocation and calculation done for an
+ * earlier (matching) operation, or it reuses the fbS_reg register
+ * (if the stencil operation is KEEP, which doesn't change the 
+ * stencil buffer).
+ *
+ * Since this function allocates a variable number of registers,
+ * to avoid incurring complex logic to free them, they should
+ * be allocated after a spe_allocate_register_set() call
+ * and released by the corresponding spe_release_register_set() call.
+ */
+static void
+gen_get_stencil_values(struct spe_function *f, const struct pipe_depth_stencil_alpha_state *dsa,
+                       unsigned int fbS_reg,
+                       unsigned int *fail_reg, unsigned int *zfail_reg,
+                       unsigned int *zpass_reg, unsigned int *back_fail_reg,
+                       unsigned int *back_zfail_reg, unsigned int *back_zpass_reg)
+{
+   unsigned zfail_op, back_zfail_op;
+
+   /* Stenciling had better be enabled here */
+   ASSERT(dsa->stencil[0].enabled);
+
+   /* If the depth test is not enabled, it is treated as though it always
+    * passes.  In particular, that means that the "zfail_op" (and the backfacing
+    * counterpart, if active) are not considered - a failing stencil test will
+    * trigger the "fail_op", and a passing stencil test will trigger the
+    * "zpass_op".
+    *
+    * By overriding the operations in this case to be PIPE_STENCIL_OP_KEEP,
+    * we keep them from being calculated.
+    */
+   if (dsa->depth.enabled) {
+      zfail_op = dsa->stencil[0].zfail_op;
+      back_zfail_op = dsa->stencil[1].zfail_op;
+   }
+   else {
+      zfail_op = PIPE_STENCIL_OP_KEEP;
+      back_zfail_op = PIPE_STENCIL_OP_KEEP;
+   }
+
+   /* One-sided or front-facing stencil */
+   if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP) {
+      *fail_reg = fbS_reg;
+   }
+   else {
+      *fail_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, dsa->stencil[0].fail_op, dsa->stencil[0].ref_value,
+         0xff, fbS_reg, *fail_reg);
+   }
+
+   if (zfail_op == PIPE_STENCIL_OP_KEEP) {
+      *zfail_reg = fbS_reg;
+   }
+   else if (zfail_op == dsa->stencil[0].fail_op) {
+      *zfail_reg = *fail_reg;
+   }
+   else {
+      *zfail_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, zfail_op, dsa->stencil[0].ref_value,
+         0xff, fbS_reg, *zfail_reg);
+   }
+
+   if (dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP) {
+      *zpass_reg = fbS_reg;
+   }
+   else if (dsa->stencil[0].zpass_op == dsa->stencil[0].fail_op) {
+      *zpass_reg = *fail_reg;
+   }
+   else if (dsa->stencil[0].zpass_op == zfail_op) {
+      *zpass_reg = *zfail_reg;
+   }
+   else {
+      *zpass_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, dsa->stencil[0].zpass_op, dsa->stencil[0].ref_value,
+         0xff, fbS_reg, *zpass_reg);
+   }
+
+   /* If two-sided stencil is enabled, we have more work to do. */
+   if (!dsa->stencil[1].enabled) {
+      /* This just flags that the registers need not be deallocated later */
+      *back_fail_reg = fbS_reg;
+      *back_zfail_reg = fbS_reg;
+      *back_zpass_reg = fbS_reg;
+   }
+   else {
+      /* Same calculations as above, but for the back stencil */
+      if (dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP) {
+         *back_fail_reg = fbS_reg;
+      }
+      else if (dsa->stencil[1].fail_op == dsa->stencil[0].fail_op) {
+         *back_fail_reg = *fail_reg;
+      }
+      else if (dsa->stencil[1].fail_op == zfail_op) {
+         *back_fail_reg = *zfail_reg;
+      }
+      else if (dsa->stencil[1].fail_op == dsa->stencil[0].zpass_op) {
+         *back_fail_reg = *zpass_reg;
+      }
+      else {
+         *back_fail_reg = spe_allocate_available_register(f);
+         gen_stencil_values(f, dsa->stencil[1].fail_op, dsa->stencil[1].ref_value,
+            0xff, fbS_reg, *back_fail_reg);
+      }
+
+      if (back_zfail_op == PIPE_STENCIL_OP_KEEP) {
+         *back_zfail_reg = fbS_reg;
+      }
+      else if (back_zfail_op == dsa->stencil[0].fail_op) {
+         *back_zfail_reg = *fail_reg;
+      }
+      else if (back_zfail_op == zfail_op) {
+         *back_zfail_reg = *zfail_reg;
+      }
+      else if (back_zfail_op == dsa->stencil[0].zpass_op) {
+         *back_zfail_reg = *zpass_reg;
+      }
+      else if (back_zfail_op == dsa->stencil[1].fail_op) {
+         *back_zfail_reg = *back_fail_reg;
+      }
+      else {
+         *back_zfail_reg = spe_allocate_available_register(f);
+         gen_stencil_values(f, back_zfail_op, dsa->stencil[1].ref_value,
+            0xff, fbS_reg, *back_zfail_reg);
+      }
+
+      if (dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) {
+         *back_zpass_reg = fbS_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == dsa->stencil[0].fail_op) {
+         *back_zpass_reg = *fail_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == zfail_op) {
+         *back_zpass_reg = *zfail_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == dsa->stencil[0].zpass_op) {
+         *back_zpass_reg = *zpass_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == dsa->stencil[1].fail_op) {
+         *back_zpass_reg = *back_fail_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == back_zfail_op) {
+         *back_zpass_reg = *back_zfail_reg;
+      }
+      else {
+         *back_zpass_reg = spe_allocate_available_register(f);
+         gen_stencil_values(f, dsa->stencil[1].zpass_op, dsa->stencil[1].ref_value,
+            0xff, fbS_reg, *back_zpass_reg);
+      }
+   } /* End of calculations for back-facing stencil */
+}
+
+static boolean
+gen_stencil_depth_test(struct spe_function *f,
+                       const struct pipe_depth_stencil_alpha_state *dsa,
+                       const int facing_reg,
+                       const int mask_reg, const int fragZ_reg,
+                       const int fbZ_reg, const int fbS_reg)
+{
+   /* True if we've generated code that could require writeback to the
+    * depth and/or stencil buffers
+    */
+   boolean modified_buffers = false;
+
+   boolean need_to_calculate_stencil_values;
+   boolean need_to_writemask_stencil_values;
+
+   /* Registers.  We may or may not actually allocate these, depending
+    * on whether the state values indicate that we need them.
+    */
+   unsigned int stencil_pass_reg, stencil_fail_reg;
+   unsigned int stencil_fail_values, stencil_pass_depth_fail_values, stencil_pass_depth_pass_values;
+   unsigned int stencil_writemask_reg;
+   unsigned int zmask_reg;
+   unsigned int newS_reg;
+
+   /* Stenciling is quite complex: up to six different configurable stencil
+    * operations/calculations can be required (three each for front-facing
+    * and back-facing fragments).  Many of those operations will likely
+    * be identical, so there's good reason to try to avoid calculating
+    * the same values more than once (which unfortunately makes the code less
+    * straightforward).
+    *
+    * To make register management easier, we start a new
+    * register set; we can release all the registers in the set at
+    * once, and avoid having to keep track of exactly which registers
+    * we allocate.  We can still allocate and free registers as
+    * desired (if we know we no longer need a register), but we don't
+    * have to spend the complexity to track the more difficult variant
+    * register usage scenarios.
+    */
+   spe_allocate_register_set(f);
+
+   /* Calculate the writemask.  If the writemask is trivial (either
+    * all 0s, meaning that we don't need to calculate any stencil values
+    * because they're not going to change the stencil anyway, or all 1s,
+    * meaning that we have to calculate the stencil values but do not
+    * need to mask them), we can avoid generating code.  Don't forget
+    * that we need to consider backfacing stencil, if enabled.
+    */
+   if (dsa->stencil[0].write_mask == 0x0 && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0x00)) {
+      /* Trivial: don't need to calculate stencil values, and don't need to
+       * write them back to the framebuffer.
+       */
+      need_to_calculate_stencil_values = false;
+      need_to_writemask_stencil_values = false;
+   }
+   else if (dsa->stencil[0].write_mask == 0xff && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0xff)) {
+      /* Still trivial, but a little less so.  We need to write the stencil
+       * values, but we don't need to mask them.
+       */
+      need_to_calculate_stencil_values = true;
+      need_to_writemask_stencil_values = false;
+   }
+   else {
+      /* The general case: calculate, mask, and write */
+      need_to_calculate_stencil_values = true;
+      need_to_writemask_stencil_values = true;
+
+      /* While we're here, generate code that calculates what the
+       * writemask should be.  If backface stenciling is enabled,
+       * and the backface writemask is not the same as the frontface
+       * writemask, we'll have to generate code that merges the
+       * two masks into a single effective mask based on fragment facing.
+       */
+      stencil_writemask_reg = spe_allocate_available_register(f);
+      spe_load_uint(f, stencil_writemask_reg, dsa->stencil[0].write_mask);
+      if (dsa->stencil[1].enabled && dsa->stencil[0].write_mask != dsa->stencil[1].write_mask) {
+         unsigned int back_write_mask_reg = spe_allocate_available_register(f);
+         spe_load_uint(f, back_write_mask_reg, dsa->stencil[1].write_mask);
+         spe_selb(f, stencil_writemask_reg, stencil_writemask_reg, back_write_mask_reg, facing_reg);
+         spe_release_register(f, back_write_mask_reg);
+      }
+   }
+
+   /* At least one-sided stenciling must be on.  Generate code that
+    * runs the stencil test on the basic/front-facing stencil, leaving
+    * the mask of passing stencil bits in stencil_pass_reg.  This mask will
+    * be used both to mask the set of active pixels, and also to
+    * determine how the stencil buffer changes.
+    *
+    * This test will *not* change the value in mask_reg (because we don't
+    * yet know whether to apply the two-sided stencil or one-sided stencil).
+    */
+   stencil_pass_reg = spe_allocate_available_register(f);
+   gen_stencil_test(f, &dsa->stencil[0], mask_reg, fbS_reg, stencil_pass_reg);
+
+   /* If two-sided stenciling is on, generate code to run the stencil
+    * test on the backfacing stencil as well, and combine the two results
+    * into the one correct result based on facing.
+    */
+   if (dsa->stencil[1].enabled) {
+      unsigned int temp_reg = spe_allocate_available_register(f);
+      gen_stencil_test(f, &dsa->stencil[1], mask_reg, fbS_reg, temp_reg);
+      spe_selb(f, stencil_pass_reg, stencil_pass_reg, temp_reg, facing_reg);
+      spe_release_register(f, temp_reg);
+   }
+
+   /* Generate code that, given the mask of valid fragments and the
+    * mask of valid fragments that passed the stencil test, computes
+    * the mask of valid fragments that failed the stencil test.  We
+    * have to do this before we run a depth test (because the
+    * depth test should not be performed on fragments that failed the
+    * stencil test, and because the depth test will update the
+    * mask of valid fragments based on the results of the depth test).
+    */
+   stencil_fail_reg = spe_allocate_available_register(f);
+   spe_andc(f, stencil_fail_reg, mask_reg, stencil_pass_reg);
+   /* Now remove the stenciled-out pixels from the valid fragment mask,
+    * so we can later use the valid fragment mask in the depth test.
+    */
+   spe_and(f, mask_reg, mask_reg, stencil_pass_reg);
+
+   /* We may not need to calculate stencil values, if the writemask is off */
+   if (need_to_calculate_stencil_values) {
+      unsigned int back_stencil_fail_values, back_stencil_pass_depth_fail_values, back_stencil_pass_depth_pass_values;
+      unsigned int front_stencil_fail_values, front_stencil_pass_depth_fail_values, front_stencil_pass_depth_pass_values;
+
+      /* Generate code that calculates exactly which stencil values we need,
+       * without calculating the same value twice (say, if two different
+       * stencil ops have the same value).  This code will work for one-sided
+       * and two-sided stenciling (so that we take into account that operations
+       * may match between front and back stencils), and will also take into
+       * account whether the depth test is enabled (if the depth test is off,
+       * we don't need any of the zfail results, because the depth test always
+       * is considered to pass if it is disabled).  Any register value that
+       * does not need to be calculated will come back with the same value
+       * that's in fbS_reg.
+       *
+       * This function will allocate a variable number of registers that
+       * will be released as part of the register set.
+       */
+      gen_get_stencil_values(f, dsa, fbS_reg,
+         &front_stencil_fail_values, &front_stencil_pass_depth_fail_values,
+         &front_stencil_pass_depth_pass_values, &back_stencil_fail_values,
+         &back_stencil_pass_depth_fail_values, &back_stencil_pass_depth_pass_values);
+
+      /* Tricky, tricky, tricky - the things we do to create optimal
+       * code...
+       *
+       * The various stencil values registers may overlap with each other
+       * and with fbS_reg arbitrarily (as any particular operation is
+       * only calculated once and stored in one register, no matter
+       * how many times it is used).  So we can't change the values
+       * within those registers directly - if we change a value in a
+       * register that's being referenced by two different calculations,
+       * we've just unwittingly changed the second value as well...
+       *
+       * Avoid this by allocating new registers to hold the results
+       * (there may be 2, if the depth test is off, or 3, if it is on).
+       * These will be released as part of the register set.
+       */
+      if (!dsa->stencil[1].enabled) {
+         /* The easy case: if two-sided stenciling is *not* enabled, we
+          * just use the front-sided values.
+          */
+         stencil_fail_values = front_stencil_fail_values;
+         stencil_pass_depth_fail_values = front_stencil_pass_depth_fail_values;
+         stencil_pass_depth_pass_values = front_stencil_pass_depth_pass_values;
+      }
+      else { /* two-sided stencil enabled */
+         /* Allocate new registers for the needed merged values */
+         stencil_fail_values = spe_allocate_available_register(f);
+         spe_selb(f, stencil_fail_values, front_stencil_fail_values, back_stencil_fail_values, facing_reg);
+         if (dsa->depth.enabled) {
+            stencil_pass_depth_fail_values = spe_allocate_available_register(f);
+            spe_selb(f, stencil_pass_depth_fail_values, front_stencil_pass_depth_fail_values, back_stencil_pass_depth_fail_values, facing_reg);
+         }
+         else {
+            stencil_pass_depth_fail_values = fbS_reg;
+         }
+         stencil_pass_depth_pass_values = spe_allocate_available_register(f);
+         spe_selb(f, stencil_pass_depth_pass_values, front_stencil_pass_depth_pass_values, back_stencil_pass_depth_pass_values, facing_reg);
+      }
+   }
+
+   /* We now have all the stencil values we need.  We also need
+    * the results of the depth test to figure out which
+    * stencil values will become the new stencil values.  (Even if
+    * we aren't actually calculating stencil values, we need to apply
+    * the depth test if it's enabled.)
+    *
+    * The code generated by gen_depth_test() returns the results of the
+    * test in the given register, but also alters the mask_reg based
+    * on the results of the test.
+    */
+   if (dsa->depth.enabled) {
+      zmask_reg = spe_allocate_available_register(f);
+      modified_buffers |= gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
+   }
+
+   if (need_to_calculate_stencil_values) {
+      /* Either way fbS_reg ends up holding new stencil values, so the
+       * buffer needs writeback.  If we need to writemask the values, a
+       * new register holds them until the final merge; if not, we can
+       * just keep using the current register.
+       */
+      modified_buffers = true;
+      if (need_to_writemask_stencil_values) {
+         newS_reg = spe_allocate_available_register(f);
+         spe_move(f, newS_reg, fbS_reg);
+      }
+      else {
+         newS_reg = fbS_reg;
+      }
+
+      /* Merge in the selected stencil fail values */
+      if (stencil_fail_values != fbS_reg) {
+         spe_selb(f, newS_reg, newS_reg, stencil_fail_values, stencil_fail_reg);
+      }
+
+      /* Same for the stencil pass/depth fail values.  If this calculation
+       * is not needed (say, if depth test is off), then the
+       * stencil_pass_depth_fail_values register will be equal to fbS_reg
+       * and we'll skip the calculation.
+       */
+      if (stencil_pass_depth_fail_values != fbS_reg) {
+         /* We don't actually have a stencil pass/depth fail mask yet.
+          * Calculate it here from the stencil passing mask and the
+          * depth passing mask.  Note that zmask_reg *must* have been
+          * set above if we're here.
+          */
+         unsigned int stencil_pass_depth_fail_mask = spe_allocate_available_register(f);
+         spe_andc(f, stencil_pass_depth_fail_mask, stencil_pass_reg, zmask_reg);
+
+         spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_fail_values, stencil_pass_depth_fail_mask);
+
+         spe_release_register(f, stencil_pass_depth_fail_mask);
+      }
+
+      /* Same for the stencil pass/depth pass mask; zmask_reg is only set when the depth test is on */
+      if (stencil_pass_depth_pass_values != fbS_reg) {
+         unsigned int stencil_pass_depth_pass_mask = spe_allocate_available_register(f);
+         if (dsa->depth.enabled) spe_and(f, stencil_pass_depth_pass_mask, stencil_pass_reg, zmask_reg);
+         else spe_move(f, stencil_pass_depth_pass_mask, stencil_pass_reg);
+         spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_depth_pass_mask);
+         spe_release_register(f, stencil_pass_depth_pass_mask);
+      }
+
+      /* Almost done.  If we need to writemask, do it now, leaving the
+       * results in the fbS_reg register passed in.  If we don't need
+       * to writemask, then the results are *already* in the fbS_reg,
+       * so there's nothing more to do.
+       */
+
+      if (need_to_writemask_stencil_values) {
+         /* The Select Bytes command makes a fine writemask.  Where
+          * the mask is 0, the first (original) values are retained,
+          * effectively masking out changes.  Where the mask is 1, the
+          * second (new) values are retained, incorporating changes.
+          */
+         spe_selb(f, fbS_reg, fbS_reg, newS_reg, stencil_writemask_reg);
+      }
+   } /* done calculating stencil values */
+
+   /* The stencil and/or depth values have been applied, and the
+    * mask_reg, fbS_reg, and fbZ_reg values have been updated.
+    * We're all done, except that we've allocated a fair number
+    * of registers that we didn't bother tracking.  Release all
+    * those registers as part of the register set, and go home.
+    */
+   spe_release_register_set(f);
+
+   /* Return true if we could have modified the stencil and/or
+    * depth buffers.
+    */
+   return modified_buffers;
+}
+
+
 /**
  * Generate SPE code to implement the fragment operations (alpha test,
  * depth test, stencil test, blending, colormask, and final
@@ -1156,6 +1800,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    const int fragB_reg = 10;  /* vector float */
    const int fragA_reg = 11;  /* vector float */
    const int mask_reg = 12;   /* vector uint */
+   const int facing_reg = 13; /* uint */
 
    /* offset of quad from start of tile
     * XXX assuming 4-byte pixels for color AND Z/stencil!!!!
@@ -1183,6 +1828,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    spe_allocate_register(f, fragB_reg);
    spe_allocate_register(f, fragA_reg);
    spe_allocate_register(f, mask_reg);
+   spe_allocate_register(f, facing_reg);
 
    quad_offset_reg = spe_allocate_available_register(f);
    fbRGBA_reg = spe_allocate_available_register(f);
@@ -1195,6 +1841,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
 
       ASSERT(TILE_SIZE == 32);
 
+      spe_comment(f, 0, "Computing tile location in memory");
       spe_rotmi(f, y2_reg, y_reg, -1);  /* y2 = y / 2 */
       spe_rotmi(f, x2_reg, x_reg, -1);  /* x2 = x / 2 */
       spe_shli(f, y2_reg, y2_reg, 4);   /* y2 *= 16 */
@@ -1205,124 +1852,164 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       spe_release_register(f, y2_reg);
    }
 
-
    if (dsa->alpha.enabled) {
       gen_alpha_test(dsa, f, mask_reg, fragA_reg);
    }
 
+   /* If we need the stencil buffers (because one- or two-sided stencil is
+    * enabled) or the depth buffer (because the depth test is enabled),
+    * go grab them.  Note that if either one- or two-sided stencil is
+    * enabled, dsa->stencil[0].enabled will be true.
+    */
    if (dsa->depth.enabled || dsa->stencil[0].enabled) {
       const enum pipe_format zs_format = cell->framebuffer.zsbuf->format;
       boolean write_depth_stencil;
 
-      int fbZ_reg = spe_allocate_available_register(f); /* Z values */
-      int fbS_reg = spe_allocate_available_register(f); /* Stencil values */
+      /* Z/stencil registers are allocated on demand; both start unset at 0 */
+      boolean fbS_reg_set = false, fbZ_reg_set = false;
+      unsigned int fbS_reg = 0, fbZ_reg = 0;
+
+      spe_comment(f, 0, "Loading Z/stencil tile");
 
       /* fetch quad of depth/stencil values from tile at (x,y) */
       /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
+      /* XXX Not sure this is allowed if we've only got a 16-bit Z buffer... */
       spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
 
-      if (dsa->depth.enabled) {
-         /* Extract Z bits from fbZS_reg into fbZ_reg */
-         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
-            int mask_reg = spe_allocate_available_register(f);
-            spe_fsmbi(f, mask_reg, 0x7777);  /* mask[0,1,2,3] = 0x00ffffff */
-            spe_and(f, fbZ_reg, fbZS_reg, mask_reg);  /* fbZ = fbZS & mask */
-            spe_release_register(f, mask_reg);
-            /* OK, fbZ_reg has four 24-bit Z values now */
-         }
-         else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
-                  zs_format == PIPE_FORMAT_Z24X8_UNORM) {
-            spe_rotmi(f, fbZ_reg, fbZS_reg, -8);  /* fbZ = fbZS >> 8 */
-            /* OK, fbZ_reg has four 24-bit Z values now */
-         }
-         else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
-            spe_move(f, fbZ_reg, fbZS_reg);
-            /* OK, fbZ_reg has four 32-bit Z values now */
-         }
-         else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
-            spe_move(f, fbZ_reg, fbZS_reg);
-            /* OK, fbZ_reg has four 16-bit Z values now */
-         }
-         else {
-            ASSERT(0);  /* invalid format */
-         }
-
-         /* Convert fragZ values from float[4] to 16, 24 or 32-bit uint[4] */
-         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_X8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_Z24S8_UNORM ||
-             zs_format == PIPE_FORMAT_Z24X8_UNORM) {
-            /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
-            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
-            /* fragZ = fragZ >> 8 */
-            spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
-         }
-         else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
-            /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
-            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
-         }
-         else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
-            /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
-            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
-            /* fragZ = fragZ >> 16 */
-            spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
-         }
-      }
-      else {
-         /* no Z test, but set Z to zero so we don't OR-in garbage below */
-         spe_load_uint(f, fbZ_reg, 0); /* XXX set to zero for now */
+      /* From the Z/stencil buffer format, pull out the bits we need for
+       * Z and/or stencil.  We'll also convert the incoming fragment Z
+       * value in fragZ_reg from a floating point value in [0.0..1.0] to
+       * an unsigned integer value with the appropriate resolution.
+       */
+      switch(zs_format) {
+
+         case PIPE_FORMAT_S8Z24_UNORM: /* fall through */
+         case PIPE_FORMAT_X8Z24_UNORM:
+            if (dsa->depth.enabled) {
+               /* We need the Z part at least */
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* four 24-bit Z values in the low-order bits */
+               spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff);
+
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 24-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+               spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+            }
+            if (dsa->stencil[0].enabled) {
+               setup_optional_register(f, &fbS_reg_set, &fbS_reg);
+               /* four 8-bit stencil values in the high-order bits */
+               spe_rotmi(f, fbS_reg, fbZS_reg, -24);
+            }
+            break;
+
+         case PIPE_FORMAT_Z24S8_UNORM: /* fall through */
+         case PIPE_FORMAT_Z24X8_UNORM:
+            if (dsa->depth.enabled) {
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* shift right by 8 to move the upper 24 Z bits into fbZ_reg */
+               spe_rotmi(f, fbZ_reg, fbZS_reg, -8);
+
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 24-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+               spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+            }
+            if (dsa->stencil[0].enabled) {
+               setup_optional_register(f, &fbS_reg_set, &fbS_reg);
+               /* 8-bit stencil in the low-order bits - mask them out */
+               spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff);
+            }
+            break;
+
+         case PIPE_FORMAT_Z32_UNORM:
+            if (dsa->depth.enabled) {
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* Copy over 4 32-bit values */
+               spe_move(f, fbZ_reg, fbZS_reg);
+
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 32-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+            }
+            /* No stencil, so can't do anything there */
+            break;
+
+         case PIPE_FORMAT_Z16_UNORM:
+            if (dsa->depth.enabled) {
+               /* XXX Not sure this is correct, but it was here before, so we're
+                * going with it for now
+                */
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* Copy over 4 32-bit values */
+               spe_move(f, fbZ_reg, fbZS_reg);
+
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 16-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+               spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
+            }
+            /* No stencil */
+            break;
+
+         default:
+            ASSERT(0); /* invalid format */
       }
 
-
+      /* If stencil is enabled, use the stencil-specific code
+       * generator to generate both the stencil and depth (if needed)
+       * tests.  Otherwise, if only depth is enabled, generate
+       * a quick depth test.  The test generators themselves will
+       * report back whether the depth/stencil buffer has to be
+       * written back.
+       */
       if (dsa->stencil[0].enabled) {
-         /* Extract Stencil bit sfrom fbZS_reg into fbS_reg */
-         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
-            /* XXX extract with a shift */
-            ASSERT(0);
-         }
-         else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
-                  zs_format == PIPE_FORMAT_Z24X8_UNORM) {
-            /* XXX extract with a mask */
-            ASSERT(0);
-         }
-      }
-      else {
-         /* no stencil test, but set to zero so we don't OR-in garbage below */
-         spe_load_uint(f, fbS_reg, 0); /* XXX set to zero for now */
-      }
+         /* This will perform the stencil and depth tests, and update
+          * the mask_reg, fbZ_reg, and fbS_reg as required by the
+          * tests.
+          */
+         ASSERT(fbS_reg_set);
+         ASSERT(fbZ_reg_set);
+         spe_comment(f, 0, "Perform stencil test");
 
-      if (dsa->stencil[0].enabled) {
-         /* XXX this may involve depth testing too */
-         // gen_stencil_test(dsa, f, ... );
-         ASSERT(0);
+         write_depth_stencil = gen_stencil_depth_test(f, dsa, facing_reg, mask_reg, fragZ_reg, fbZ_reg, fbS_reg);
       }
       else if (dsa->depth.enabled) {
          int zmask_reg = spe_allocate_available_register(f);
-         gen_depth_test(dsa, f, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
+         spe_comment(f, 0, "Perform depth test");
+         write_depth_stencil = gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
          spe_release_register(f, zmask_reg);
       }
-
-      /* do we need to write Z and/or Stencil back into framebuffer? */
-      write_depth_stencil = (dsa->depth.writemask |
-                             dsa->stencil[0].write_mask |
-                             dsa->stencil[1].write_mask);
+      else {
+         write_depth_stencil = false;
+      }
 
       if (write_depth_stencil) {
          /* Merge latest Z and Stencil values into fbZS_reg.
           * fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
           * fbS_reg has four 8-bit Z values in bits [7..0].
           */
+         spe_comment(f, 0, "Storing depth/stencil values");
          if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
              zs_format == PIPE_FORMAT_X8Z24_UNORM) {
-            spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
-            spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+            if (fbS_reg_set) {
+               spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
+               spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+            }
+            else {
+               spe_move(f, fbZS_reg, fbZ_reg);
+            }
          }
          else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
                   zs_format == PIPE_FORMAT_Z24X8_UNORM) {
             spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
-            spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+            if (fbS_reg_set)
+               spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+            else spe_move(f, fbZS_reg, fbZ_reg); /* no stencil: fbZS = fbZ */
          }
          else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
             spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
@@ -1341,11 +2028,10 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
          spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
       }
 
-      spe_release_register(f, fbZ_reg);
-      spe_release_register(f, fbS_reg);
+      release_optional_register(f, &fbZ_reg_set, fbZ_reg);
+      release_optional_register(f, &fbS_reg_set, fbS_reg);
    }
 
-
    /* Get framebuffer quad/colors.  We'll need these for blending,
     * color masking, and to obey the quad/pixel mask.
     * Load: fbRGBA_reg = memory[color_tile + quad_offset]
@@ -1354,8 +2040,8 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
     */
    spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg);
 
-
    if (blend->blend_enable) {
+      spe_comment(f, 0, "Perform blending");
       gen_blend(blend, blend_color, f, color_format,
                 fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg);
    }
@@ -1369,19 +2055,21 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       int rgba_reg = spe_allocate_available_register(f);
 
       /* Pack four float colors as four 32-bit int colors */
+      spe_comment(f, 0, "Convert fragment colors to framebuffer colors");
       gen_pack_colors(f, color_format,
                       fragR_reg, fragG_reg, fragB_reg, fragA_reg,
                       rgba_reg);
 
       if (blend->logicop_enable) {
+         spe_comment(f, 0, "Compute logic op");
          gen_logicop(blend, f, rgba_reg, fbRGBA_reg);
       }
 
       if (blend->colormask != PIPE_MASK_RGBA) {
+         spe_comment(f, 0, "Compute color mask");
          gen_colormask(f, blend->colormask, color_format, rgba_reg, fbRGBA_reg);
       }
 
-
       /* Mix fragment colors with framebuffer colors using the quad/pixel mask:
        * if (mask[i])
        *    rgba[i] = rgba[i];
@@ -1393,6 +2081,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       /* Store updated quad in tile:
        * memory[color_tile + quad_offset] = rgba_reg;
        */
+      spe_comment(f, 0, "Store framebuffer colors");
       spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg);
 
       spe_release_register(f, rgba_reg);
diff --git a/src/gallium/drivers/cell/ppu/cell_render.c b/src/gallium/drivers/cell/ppu/cell_render.c
index dd25ae880e..79cb8df82f 100644
--- a/src/gallium/drivers/cell/ppu/cell_render.c
+++ b/src/gallium/drivers/cell/ppu/cell_render.c
@@ -152,6 +152,7 @@ cell_flush_prim_buffer(struct cell_context *cell)
       struct cell_command_render *render = &cell_global.command[i].render;
       render->prim_type = PIPE_PRIM_TRIANGLES;
       render->num_verts = cell->prim_buffer.num_verts;
+      render->front_winding = cell->rasterizer->front_winding;
       render->vertex_size = cell->vertex_info->size * 4;
       render->xmin = cell->prim_buffer.xmin;
       render->ymin = cell->prim_buffer.ymin;
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c
index aa63435b93..578ddf62dc 100644
--- a/src/gallium/drivers/cell/ppu/cell_vbuf.c
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c
@@ -214,6 +214,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
 
       render->opcode = CELL_CMD_RENDER;
       render->prim_type = cvbr->prim;
+      render->front_winding = cell->rasterizer->front_winding;
 
       render->num_indexes = nr_indices;
       render->min_index = min_index;
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 29a305232e..1cd577c23c 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -73,7 +73,8 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y,
                                       vector float fragGreen,
                                       vector float fragBlue,
                                       vector float fragAlpha,
-                                      vector unsigned int mask);
+                                      vector unsigned int mask,
+                                      uint facing);
 
 /** Function for running fragment program */
 typedef void (*spu_fragment_program_func)(vector float *inputs,
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index f107764fb2..d252fa6dc1 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -57,7 +57,8 @@ spu_fallback_fragment_ops(uint x, uint y,
                           vector float fragG,
                           vector float fragB,
                           vector float fragA,
-                          vector unsigned int mask)
+                          vector unsigned int mask,
+                          uint facing)
 {
    vector float frag_aos[4];
    unsigned int fbc0, fbc1, fbc2, fbc3 ; /* framebuffer/tile colors */
@@ -433,23 +434,23 @@ spu_fallback_fragment_ops(uint x, uint y,
       /* Form bitmask depending on color buffer format and colormask bits */
       switch (spu.fb.color_format) {
       case PIPE_FORMAT_A8R8G8B8_UNORM:
-         if (spu.blend.colormask & (1<<0))
+         if (spu.blend.colormask & PIPE_MASK_R)
             cmask |= 0x00ff0000; /* red */
-         if (spu.blend.colormask & (1<<1))
+         if (spu.blend.colormask & PIPE_MASK_G)
             cmask |= 0x0000ff00; /* green */
-         if (spu.blend.colormask & (1<<2))
+         if (spu.blend.colormask & PIPE_MASK_B)
             cmask |= 0x000000ff; /* blue */
-         if (spu.blend.colormask & (1<<3))
+         if (spu.blend.colormask & PIPE_MASK_A)
             cmask |= 0xff000000; /* alpha */
          break;
       case PIPE_FORMAT_B8G8R8A8_UNORM:
-         if (spu.blend.colormask & (1<<0))
+         if (spu.blend.colormask & PIPE_MASK_R)
             cmask |= 0x0000ff00; /* red */
-         if (spu.blend.colormask & (1<<1))
+         if (spu.blend.colormask & PIPE_MASK_G)
             cmask |= 0x00ff0000; /* green */
-         if (spu.blend.colormask & (1<<2))
+         if (spu.blend.colormask & PIPE_MASK_B)
             cmask |= 0xff000000; /* blue */
-         if (spu.blend.colormask & (1<<3))
+         if (spu.blend.colormask & PIPE_MASK_A)
             cmask |= 0x000000ff; /* alpha */
          break;
       default:
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
index f817abf046..a61689c83a 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
@@ -38,7 +38,8 @@ spu_fallback_fragment_ops(uint x, uint y,
                           vector float fragGreen,
                           vector float fragBlue,
                           vector float fragAlpha,
-                          vector unsigned int mask);
+                          vector unsigned int mask,
+                          uint facing);
 
 
 #endif /* SPU_PER_FRAGMENT_OP */
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 305dc98881..82dbeb26b7 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -279,7 +279,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
          v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
          v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
 
-         drawn += tri_draw(v0, v1, v2, tx, ty);
+         drawn += tri_draw(v0, v1, v2, tx, ty, render->front_winding);
       }
 
       //printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3);
@@ -297,5 +297,3 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       printf("SPU %u: RENDER done\n",
              spu.init.id);
 }
-
-
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 0a8fb56a62..6039cd80b2 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -118,6 +118,8 @@ struct setup_stage {
 
    float oneoverarea;
 
+   uint facing;
+
    uint tx, ty;
 
    int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy;
@@ -274,7 +276,7 @@ eval_z(float x, float y)
  * overall.
  */
 static INLINE void
-emit_quad( int x, int y, mask_t mask )
+emit_quad( int x, int y, mask_t mask)
 {
    /* If any bits in mask are set... */
    if (spu_extract(spu_orx(mask), 0)) {
@@ -344,7 +346,8 @@ emit_quad( int x, int y, mask_t mask )
                              fragZ,
                              soa_frag[0], soa_frag[1],
                              soa_frag[2], soa_frag[3],
-                             mask);
+                             mask,
+                             setup.facing);
          }
 
       }
@@ -379,7 +382,8 @@ emit_quad( int x, int y, mask_t mask )
                           outputs[0*4+1],
                           outputs[0*4+2],
                           outputs[0*4+3],
-                          mask);
+                          mask,
+                          setup.facing);
       }
    }
 }
@@ -483,7 +487,7 @@ static void flush_spans( void )
     */
    for (x = block(minleft); x <= block(maxright); x += 2) {
 #if 1
-      emit_quad( x, setup.span.y, calculate_mask( x ) );
+      emit_quad( x, setup.span.y, calculate_mask( x ));
 #endif
    }
 
@@ -902,13 +906,28 @@ static void subtriangle( struct edge *eleft,
    eright->sy += lines;
 }
 
+/**
+ * Return the z component of the cross product of the edge vectors
+ * (v0 - v2) and (v1 - v2), using elements [0] and [1] of each vertex.
+ * tri_draw() uses the sign of the result to decide the triangle's facing.
+ */
+static float
+determinant(const float *v0, const float *v1, const float *v2)
+{
+   const float *origin = v2;
+   const float e[2] = { v0[0] - origin[0], v0[1] - origin[1] };
+   const float f[2] = { v1[0] - origin[0], v1[1] - origin[1] };
+
+   return e[0] * f[1] - e[1] * f[0];
+}
+
 
 /**
  * Draw triangle into tile at (tx, ty) (tile coords)
  * The tile data should have already been fetched.
  */
 boolean
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
+tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding)
 {
    setup.tx = tx;
    setup.ty = ty;
@@ -919,6 +938,12 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
    setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
    setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
 
+   /* Before we sort vertices, determine the facing of the triangle,
+    * which will be needed for front/back-face stencil application
+    */
+   float det = determinant(v0, v1, v2);
+   setup.facing = (det > 0.0) ^ (front_winding == PIPE_WINDING_CW);
+
    if (!setup_sort_vertices((struct vertex_header *) v0,
                             (struct vertex_header *) v1,
                             (struct vertex_header *) v2)) {
diff --git a/src/gallium/drivers/cell/spu/spu_tri.h b/src/gallium/drivers/cell/spu/spu_tri.h
index aa694dd7c9..abc3d35160 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.h
+++ b/src/gallium/drivers/cell/spu/spu_tri.h
@@ -31,7 +31,7 @@
 
 
 extern boolean
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty);
+tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding);
 
 
 #endif /* SPU_TRI_H */
-- 
cgit v1.2.3


From 0370d6b359016790c6b879c2a4b6661adac20dea Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Sat, 4 Oct 2008 12:41:56 +0100
Subject: mesa: handle vertex program enabled case also in texenvprogram.c

---
 src/mesa/main/texenvprogram.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/mesa/main/texenvprogram.c b/src/mesa/main/texenvprogram.c
index 7cd82f98b0..ea2ee160e4 100644
--- a/src/mesa/main/texenvprogram.c
+++ b/src/mesa/main/texenvprogram.c
@@ -204,7 +204,7 @@ static GLuint get_fp_input_mask( GLcontext *ctx )
 {
    GLuint fp_inputs = 0;
 
-   if (1) {
+   if (!ctx->VertexProgram._Enabled) {
       GLuint varying_inputs = ctx->varying_vp_inputs;
 
       /* First look at what values may be computed by the generated
@@ -232,14 +232,13 @@ static GLuint get_fp_input_mask( GLcontext *ctx )
    }
    else {
       /* calculate from vp->outputs */
-      GLuint vp_outputs = 0;
+      GLuint vp_outputs = ctx->VertexProgram._Current->Base.OutputsWritten;
 
       if (vp_outputs & (1 << VERT_RESULT_COL0)) fp_inputs |= FRAG_BIT_COL0;
       if (vp_outputs & (1 << VERT_RESULT_COL1)) fp_inputs |= FRAG_BIT_COL1;
 
-      fp_inputs |= (((vp_outputs & VERT_RESULT_TEX_ANY) 
-                   << VERT_RESULT_TEX0) 
-                  >> FRAG_ATTRIB_TEX0);
+      fp_inputs |= (((vp_outputs & VERT_RESULT_TEX_ANY) >> VERT_RESULT_TEX0) 
+                    << FRAG_ATTRIB_TEX0);
    }
    
    return fp_inputs;
-- 
cgit v1.2.3


From 91d0020eecb78ef2984fd0afafc5d555c0e957d8 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Sat, 4 Oct 2008 18:20:35 -0700
Subject: i915: Refine the texture indirect lookup accounting.

Without this, we would reject programs which sampled multiple times from
registers defined in the same phase (block of instructions with the same
texture indirection count), as each sample would count as a new phase
beginning.  Instead, keep track of which phases registers were written in,
and only bump phase when we're reading from one generated in this phase.

On the other hand, we failed to count oC or oD texture samples as being new
phases.

Bug #17865.
---
 src/mesa/drivers/dri/i915/i915_context.h |  3 +++
 src/mesa/drivers/dri/i915/i915_program.c | 25 ++++++++++++++++++++++---
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i915/i915_context.h b/src/mesa/drivers/dri/i915/i915_context.h
index c6958dd8d4..a2376e50e1 100644
--- a/src/mesa/drivers/dri/i915/i915_context.h
+++ b/src/mesa/drivers/dri/i915/i915_context.h
@@ -125,6 +125,9 @@ struct i915_fragment_program
    GLboolean on_hardware;
    GLboolean error;             /* If program is malformed for any reason. */
 
+   /** Record of which phases R registers were last written in. */
+   GLuint register_phases[16];
+   GLuint indirections;
    GLuint nr_tex_indirect;
    GLuint nr_tex_insn;
    GLuint nr_alu_insn;
diff --git a/src/mesa/drivers/dri/i915/i915_program.c b/src/mesa/drivers/dri/i915/i915_program.c
index 49193297a8..350da5e169 100644
--- a/src/mesa/drivers/dri/i915/i915_program.c
+++ b/src/mesa/drivers/dri/i915/i915_program.c
@@ -190,6 +190,9 @@ i915_emit_arith(struct i915_fragment_program * p,
    *(p->csr++) = (A1_SRC0(src0) | A1_SRC1(src1));
    *(p->csr++) = (A2_SRC1(src1) | A2_SRC2(src2));
 
+   if (GET_UREG_TYPE(dest) == REG_TYPE_R)
+      p->register_phases[GET_UREG_NR(dest)] = p->nr_tex_indirect;
+
    p->nr_alu_insn++;
    return dest;
 }
@@ -237,10 +240,22 @@ GLuint i915_emit_texld( struct i915_fragment_program *p,
    else {
       assert(GET_UREG_TYPE(dest) != REG_TYPE_CONST);
       assert(dest = UREG(GET_UREG_TYPE(dest), GET_UREG_NR(dest)));
+      /* Can't use unsaved temps for coords, as the phase boundary would result
+       * in the contents becoming undefined.
+       */
+      assert(GET_UREG_TYPE(coord) != REG_TYPE_U);
+
+      /* Output register being oC or oD defines a phase boundary */
+      if (GET_UREG_TYPE(dest) == REG_TYPE_OC ||
+	  GET_UREG_TYPE(dest) == REG_TYPE_OD)
+	 p->nr_tex_indirect++;
 
-      if (GET_UREG_TYPE(coord) != REG_TYPE_T) {
+      /* Reading from an r# register whose contents depend on output of the
+       * current phase defines a phase boundary.
+       */
+      if (GET_UREG_TYPE(coord) == REG_TYPE_R &&
+	  p->register_phases[GET_UREG_NR(coord)] == p->nr_tex_indirect)
 	 p->nr_tex_indirect++;
-      }
 
       *(p->csr++) = (op | 
 		     T0_DEST( dest ) |
@@ -249,6 +264,9 @@ GLuint i915_emit_texld( struct i915_fragment_program *p,
       *(p->csr++) = T1_ADDRESS_REG( coord );
       *(p->csr++) = T2_MBZ;
 
+      if (GET_UREG_TYPE(dest) == REG_TYPE_R)
+	 p->register_phases[GET_UREG_NR(dest)] = p->nr_tex_indirect;
+
       p->nr_tex_insn++;
       return dest;
    }
@@ -413,7 +431,8 @@ i915_init_program(struct i915_context *i915, struct i915_fragment_program *p)
    p->on_hardware = 0;
    p->error = 0;
 
-   p->nr_tex_indirect = 1;      /* correct? */
+   memset(&p->register_phases, 0, sizeof(p->register_phases));
+   p->nr_tex_indirect = 1;
    p->nr_tex_insn = 0;
    p->nr_alu_insn = 0;
    p->nr_decl_insn = 0;
-- 
cgit v1.2.3


From 53d4706c6c0922160f310834daaec5718ff1c511 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Wed, 10 Sep 2008 11:39:43 +0100
Subject: make draw's vertex_info struct smaller/quicker to compare with
 memcmp()

---
 src/gallium/auxiliary/draw/draw_pipe_vbuf.c        |  4 +-
 src/gallium/auxiliary/draw/draw_pt_emit.c          |  4 +-
 src/gallium/auxiliary/draw/draw_pt_fetch_emit.c    |  4 +-
 .../auxiliary/draw/draw_pt_fetch_shade_emit.c      |  6 +--
 src/gallium/auxiliary/draw/draw_vertex.c           |  6 +--
 src/gallium/auxiliary/draw/draw_vertex.h           | 44 ++++++++++++++++++----
 src/gallium/drivers/i915simple/i915_prim_emit.c    |  4 +-
 .../drivers/i915simple/i915_state_derived.c        |  4 +-
 src/gallium/drivers/softpipe/sp_setup.c            | 12 +++---
 9 files changed, 59 insertions(+), 29 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
index c0cf4269db..9825e116c3 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
@@ -231,9 +231,9 @@ vbuf_set_prim( struct vbuf_stage *vbuf, uint prim )
       unsigned emit_sz = 0;
       unsigned src_buffer = 0;
       unsigned output_format;
-      unsigned src_offset = (vbuf->vinfo->src_index[i] * 4 * sizeof(float) );
+      unsigned src_offset = (vbuf->vinfo->attrib[i].src_index * 4 * sizeof(float) );
 
-      switch (vbuf->vinfo->emit[i]) {
+      switch (vbuf->vinfo->attrib[i].emit) {
       case EMIT_4F:
 	 output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
 	 emit_sz = 4 * sizeof(float);
diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c b/src/gallium/auxiliary/draw/draw_pt_emit.c
index d4eca80588..d520b05869 100644
--- a/src/gallium/auxiliary/draw/draw_pt_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_emit.c
@@ -84,11 +84,11 @@ void draw_pt_emit_prepare( struct pt_emit *emit,
       unsigned emit_sz = 0;
       unsigned src_buffer = 0;
       unsigned output_format;
-      unsigned src_offset = (vinfo->src_index[i] * 4 * sizeof(float) );
+      unsigned src_offset = (vinfo->attrib[i].src_index * 4 * sizeof(float) );
 
 
-      switch (vinfo->emit[i]) {
+      switch (vinfo->attrib[i].emit) {
       case EMIT_4F:
 	 output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
 	 emit_sz = 4 * sizeof(float);
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
index 5a4db6cfe5..3966ad48ba 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
@@ -121,7 +121,7 @@ static void fetch_emit_prepare( struct draw_pt_middle_end *middle,
    memset(&key, 0, sizeof(key));
 
    for (i = 0; i < vinfo->num_attribs; i++) {
-      const struct pipe_vertex_element *src = &draw->pt.vertex_element[vinfo->src_index[i]];
+      const struct pipe_vertex_element *src = &draw->pt.vertex_element[vinfo->attrib[i].src_index];
 
       unsigned emit_sz = 0;
       unsigned input_format = src->src_format;
@@ -129,7 +129,7 @@ static void fetch_emit_prepare( struct draw_pt_middle_end *middle,
       unsigned input_offset = src->src_offset;
       unsigned output_format;
 
-      switch (vinfo->emit[i]) {
+      switch (vinfo->attrib[i].emit) {
       case EMIT_4F:
 	 output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
 	 emit_sz = 4 * sizeof(float);
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
index a0e08dd10a..f7e6a1a8ee 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
@@ -133,7 +133,7 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
       for (i = 0; i < vinfo->num_attribs; i++) {
          unsigned emit_sz = 0;
 
-         switch (vinfo->emit[i]) {
+         switch (vinfo->attrib[i].emit) {
          case EMIT_4F:
             emit_sz = 4 * sizeof(float);
             break;
@@ -161,8 +161,8 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
           * numbers, not to positions in the hw vertex description --
           * that's handled by the output_offset field.
           */
-         fse->key.element[i].out.format = vinfo->emit[i];
-         fse->key.element[i].out.vs_output = vinfo->src_index[i];
+         fse->key.element[i].out.format = vinfo->attrib[i].emit;
+         fse->key.element[i].out.vs_output = vinfo->attrib[i].src_index;
          fse->key.element[i].out.offset = dst_offset;
       
          dst_offset += emit_sz;
diff --git a/src/gallium/auxiliary/draw/draw_vertex.c b/src/gallium/auxiliary/draw/draw_vertex.c
index 1446f785c5..3214213e44 100644
--- a/src/gallium/auxiliary/draw/draw_vertex.c
+++ b/src/gallium/auxiliary/draw/draw_vertex.c
@@ -49,7 +49,7 @@ draw_compute_vertex_size(struct vertex_info *vinfo)
 
    vinfo->size = 0;
    for (i = 0; i < vinfo->num_attribs; i++) {
-      switch (vinfo->emit[i]) {
+      switch (vinfo->attrib[i].emit) {
       case EMIT_OMIT:
          break;
       case EMIT_4UB:
@@ -81,8 +81,8 @@ draw_dump_emitted_vertex(const struct vertex_info *vinfo, const uint8_t *data)
    unsigned i, j;
 
    for (i = 0; i < vinfo->num_attribs; i++) {
-      j = vinfo->src_index[i];
-      switch (vinfo->emit[i]) {
+      j = vinfo->attrib[i].src_index;
+      switch (vinfo->attrib[i].emit) {
       case EMIT_OMIT:
          debug_printf("EMIT_OMIT:");
          break;
diff --git a/src/gallium/auxiliary/draw/draw_vertex.h b/src/gallium/auxiliary/draw/draw_vertex.h
index 16c65c4317..dca6158128 100644
--- a/src/gallium/auxiliary/draw/draw_vertex.h
+++ b/src/gallium/auxiliary/draw/draw_vertex.h
@@ -75,12 +75,41 @@ struct vertex_info
 {
    uint num_attribs;
    uint hwfmt[4];      /**< hardware format info for this format */
-   enum interp_mode interp_mode[PIPE_MAX_SHADER_INPUTS];
-   enum attrib_emit emit[PIPE_MAX_SHADER_INPUTS];   /**< EMIT_x */
-   uint src_index[PIPE_MAX_SHADER_INPUTS]; /**< map to post-xform attribs */
    uint size;          /**< total vertex size in dwords */
+   
+   /* Keep this small and at the end of the struct to allow quick
+    * memcmp() comparisons.
+    */
+   struct {
+      ubyte interp_mode:4;      /**< INTERP_x */
+      ubyte emit:4;             /**< EMIT_x */
+      ubyte src_index;          /**< map to post-xform attribs */
+   } attrib[PIPE_MAX_SHADER_INPUTS];
 };
 
+static inline int
+draw_vinfo_size( const struct vertex_info *a )
+{
+   return ((const char *)&a->attrib[a->num_attribs] -
+           (const char *)a);
+}
+
+static inline int
+draw_vinfo_compare( const struct vertex_info *a,
+                    const struct vertex_info *b )
+{
+   unsigned sizea = draw_vinfo_size( a );
+   return memcmp( a, b, sizea );
+}
+
+static inline void
+draw_vinfo_copy( struct vertex_info *dst,
+                 const struct vertex_info *src )
+{
+   unsigned size = draw_vinfo_size( src );
+   memcpy( dst, src, size );
+}
+
 
 
 /**
@@ -91,14 +120,15 @@ struct vertex_info
  */
 static INLINE uint
 draw_emit_vertex_attr(struct vertex_info *vinfo,
-                      enum attrib_emit emit, enum interp_mode interp,
+                      enum attrib_emit emit, 
+                      enum interp_mode interp, /* only used by softpipe??? */
                       uint src_index)
 {
    const uint n = vinfo->num_attribs;
    assert(n < PIPE_MAX_SHADER_INPUTS);
-   vinfo->emit[n] = emit;
-   vinfo->interp_mode[n] = interp;
-   vinfo->src_index[n] = src_index;
+   vinfo->attrib[n].emit = emit;
+   vinfo->attrib[n].interp_mode = interp;
+   vinfo->attrib[n].src_index = src_index;
    vinfo->num_attribs++;
    return n;
 }
diff --git a/src/gallium/drivers/i915simple/i915_prim_emit.c b/src/gallium/drivers/i915simple/i915_prim_emit.c
index d194c2fb15..8f1f58b2dd 100644
--- a/src/gallium/drivers/i915simple/i915_prim_emit.c
+++ b/src/gallium/drivers/i915simple/i915_prim_emit.c
@@ -77,9 +77,9 @@ emit_hw_vertex( struct i915_context *i915,
    assert(!i915->dirty);
 
    for (i = 0; i < vinfo->num_attribs; i++) {
-      const uint j = vinfo->src_index[i];
+      const uint j = vinfo->attrib[i].src_index;
       const float *attrib = vertex->data[j];
-      switch (vinfo->emit[i]) {
+      switch (vinfo->attrib[i].emit) {
       case EMIT_1F:
          OUT_BATCH( fui(attrib[0]) );
          count++;
diff --git a/src/gallium/drivers/i915simple/i915_state_derived.c b/src/gallium/drivers/i915simple/i915_state_derived.c
index 488615067c..178d4e8781 100644
--- a/src/gallium/drivers/i915simple/i915_state_derived.c
+++ b/src/gallium/drivers/i915simple/i915_state_derived.c
@@ -88,12 +88,12 @@ static void calculate_vertex_layout( struct i915_context *i915 )
    if (needW) {
       draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_LINEAR, src);
       vinfo.hwfmt[0] |= S4_VFMT_XYZW;
-      vinfo.emit[0] = EMIT_4F;
+      vinfo.attrib[0].emit = EMIT_4F;
    }
    else {
       draw_emit_vertex_attr(&vinfo, EMIT_3F, INTERP_LINEAR, src);
       vinfo.hwfmt[0] |= S4_VFMT_XYZ;
-      vinfo.emit[0] = EMIT_3F;
+      vinfo.attrib[0].emit = EMIT_3F;
    }
 
    /* hardware point size */
diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c
index bc8263c33e..13d8017393 100644
--- a/src/gallium/drivers/softpipe/sp_setup.c
+++ b/src/gallium/drivers/softpipe/sp_setup.c
@@ -773,10 +773,10 @@ static void setup_tri_coefficients( struct setup_context *setup )
    /* setup interpolation for all the remaining attributes:
     */
    for (fragSlot = 0; fragSlot < spfs->info.num_inputs; fragSlot++) {
-      const uint vertSlot = vinfo->src_index[fragSlot];
+      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
       uint j;
 
-      switch (vinfo->interp_mode[fragSlot]) {
+      switch (vinfo->attrib[fragSlot].interp_mode) {
       case INTERP_CONSTANT:
          for (j = 0; j < NUM_CHANNELS; j++)
             const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
@@ -1084,10 +1084,10 @@ setup_line_coefficients(struct setup_context *setup,
    /* setup interpolation for all the remaining attributes:
     */
    for (fragSlot = 0; fragSlot < spfs->info.num_inputs; fragSlot++) {
-      const uint vertSlot = vinfo->src_index[fragSlot];
+      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
       uint j;
 
-      switch (vinfo->interp_mode[fragSlot]) {
+      switch (vinfo->attrib[fragSlot].interp_mode) {
       case INTERP_CONSTANT:
          for (j = 0; j < NUM_CHANNELS; j++)
             const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
@@ -1331,10 +1331,10 @@ setup_point( struct setup_context *setup,
    const_coeff(setup, &setup->posCoef, 0, 3);
 
    for (fragSlot = 0; fragSlot < spfs->info.num_inputs; fragSlot++) {
-      const uint vertSlot = vinfo->src_index[fragSlot];
+      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
       uint j;
 
-      switch (vinfo->interp_mode[fragSlot]) {
+      switch (vinfo->attrib[fragSlot].interp_mode) {
       case INTERP_CONSTANT:
          /* fall-through */
       case INTERP_LINEAR:
-- 
cgit v1.2.3


From 7053f8c902e904495dffbbf6ea55f414cec780e7 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Mon, 6 Oct 2008 11:54:22 +0100
Subject: rtasm: fix debug build

---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index cc5871f873..dd26d4d9ed 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -657,7 +657,7 @@ void sse_movntps( struct x86_function *p,
                   struct x86_reg dst,
                   struct x86_reg src)
 {
-   DUMP_RR( dst, reg );
+   DUMP_RR( dst, src );
 
    assert(dst.mod != mod_REG);
    assert(src.mod == mod_REG);
-- 
cgit v1.2.3


From 9b827018133868e84ddc0998a5b5387584c7478c Mon Sep 17 00:00:00 2001
From: Michal Krol <michal@tungstengraphics.com>
Date: Mon, 6 Oct 2008 13:23:56 +0200
Subject: draw: Fix compiler errors on Windows.

---
 src/gallium/auxiliary/draw/draw_vertex.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_vertex.h b/src/gallium/auxiliary/draw/draw_vertex.h
index dca6158128..a943607d7e 100644
--- a/src/gallium/auxiliary/draw/draw_vertex.h
+++ b/src/gallium/auxiliary/draw/draw_vertex.h
@@ -87,14 +87,14 @@ struct vertex_info
    } attrib[PIPE_MAX_SHADER_INPUTS];
 };
 
-static inline int
+static INLINE int
 draw_vinfo_size( const struct vertex_info *a )
 {
    return ((const char *)&a->attrib[a->num_attribs] -
            (const char *)a);
 }
 
-static inline int
+static INLINE int
 draw_vinfo_compare( const struct vertex_info *a,
                     const struct vertex_info *b )
 {
@@ -102,7 +102,7 @@ draw_vinfo_compare( const struct vertex_info *a,
    return memcmp( a, b, sizea );
 }
 
-static inline void
+static INLINE void
 draw_vinfo_copy( struct vertex_info *dst,
                  const struct vertex_info *src )
 {
-- 
cgit v1.2.3


From 382911bdbce5545117d5a70ce7e43b71e8396e32 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 6 Oct 2008 09:26:45 -0600
Subject: mesa: add missing GLcontext param to _mesa_delete_query().

Fixes vtk crash and others.
---
 src/mesa/main/queryobj.c | 2 +-
 src/mesa/main/queryobj.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/main/queryobj.c b/src/mesa/main/queryobj.c
index a1e32e70ba..2d06030030 100644
--- a/src/mesa/main/queryobj.c
+++ b/src/mesa/main/queryobj.c
@@ -95,7 +95,7 @@ _mesa_wait_query(GLcontext *ctx, struct gl_query_object *q)
  * XXX maybe add Delete() method to gl_query_object class and call that instead
  */
 void
-_mesa_delete_query(struct gl_query_object *q)
+_mesa_delete_query(GLcontext *ctx, struct gl_query_object *q)
 {
    _mesa_free(q);
 }
diff --git a/src/mesa/main/queryobj.h b/src/mesa/main/queryobj.h
index c05a1f3da8..9a9774641b 100644
--- a/src/mesa/main/queryobj.h
+++ b/src/mesa/main/queryobj.h
@@ -37,7 +37,7 @@ extern void
 _mesa_free_query_data(GLcontext *ctx);
 
 extern void
-_mesa_delete_query(struct gl_query_object *q);
+_mesa_delete_query(GLcontext *ctx, struct gl_query_object *q);
 
 extern void
 _mesa_begin_query(GLcontext *ctx, struct gl_query_object *q);
-- 
cgit v1.2.3


From f362788eae3d300e4003e8996dc79fc1947a0f60 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 6 Oct 2008 09:27:31 -0600
Subject: mesa: add missing GLcontext param to _mesa_delete_query().

Fixes vtk crash and others.
---
 src/mesa/main/queryobj.c | 2 +-
 src/mesa/main/queryobj.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/main/queryobj.c b/src/mesa/main/queryobj.c
index a1e32e70ba..2d06030030 100644
--- a/src/mesa/main/queryobj.c
+++ b/src/mesa/main/queryobj.c
@@ -95,7 +95,7 @@ _mesa_wait_query(GLcontext *ctx, struct gl_query_object *q)
  * XXX maybe add Delete() method to gl_query_object class and call that instead
  */
 void
-_mesa_delete_query(struct gl_query_object *q)
+_mesa_delete_query(GLcontext *ctx, struct gl_query_object *q)
 {
    _mesa_free(q);
 }
diff --git a/src/mesa/main/queryobj.h b/src/mesa/main/queryobj.h
index c05a1f3da8..9a9774641b 100644
--- a/src/mesa/main/queryobj.h
+++ b/src/mesa/main/queryobj.h
@@ -37,7 +37,7 @@ extern void
 _mesa_free_query_data(GLcontext *ctx);
 
 extern void
-_mesa_delete_query(struct gl_query_object *q);
+_mesa_delete_query(GLcontext *ctx, struct gl_query_object *q);
 
 extern void
 _mesa_begin_query(GLcontext *ctx, struct gl_query_object *q);
-- 
cgit v1.2.3


From b0ca50bd38b69593ac683a5e7635fbafa5e06676 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 6 Oct 2008 09:32:33 -0600
Subject: mesa: updated _mesa_delete_query() comments

---
 src/mesa/main/queryobj.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/mesa/main/queryobj.c b/src/mesa/main/queryobj.c
index 2d06030030..6f084b6a4f 100644
--- a/src/mesa/main/queryobj.c
+++ b/src/mesa/main/queryobj.c
@@ -90,9 +90,8 @@ _mesa_wait_query(GLcontext *ctx, struct gl_query_object *q)
 
 
 /**
- * Delete an occlusion query object.
+ * Delete a query object.  Called via ctx->Driver.DeleteQuery().
  * Not removed from hash table here.
- * XXX maybe add Delete() method to gl_query_object class and call that instead
  */
 void
 _mesa_delete_query(GLcontext *ctx, struct gl_query_object *q)
-- 
cgit v1.2.3


From 145d49838fe268c8524a369d59cb9f771657ad59 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 6 Oct 2008 10:58:16 -0600
Subject: mesa: fix static library construction

If the .a is made of other .a files, extract the objects from the latter.
---
 bin/mklib | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/bin/mklib b/bin/mklib
index a25d5bfaeb..d7b740f8e3 100755
--- a/bin/mklib
+++ b/bin/mklib
@@ -260,9 +260,31 @@ case $ARCH in
                 OPTS=${ALTOPTS}
             fi
             rm -f ${LIBNAME}
+
+	    # expand any .a objects into constituent .o files.
+	    NEWOBJECTS=""
+	    DELETIA=""
+	    for OBJ in ${OBJECTS} ; do
+		if [ `expr match $OBJ '.*\.a'` -gt 0 ] ; then
+		    # extract the .o files from this .a archive
+		    FILES=`ar t $OBJ`
+		    ar x $OBJ
+		    NEWOBJECTS="$NEWOBJECTS $FILES"
+		    # keep track of temporary .o files and delete them below
+		    DELETIA="$DELETIA $FILES"
+		else
+		    # ordinary .o file
+		    NEWOBJECTS="$NEWOBJECTS $OBJ"
+		fi
+	    done
+
             # make lib
-            ${LINK} ${OPTS} ${LIBNAME} ${OBJECTS}
+            ${LINK} ${OPTS} ${LIBNAME} ${NEWOBJECTS}
             ranlib ${LIBNAME}
+
+	    # remove temporary extracted .o files
+	    rm -f ${DELETIA}
+
             # finish up
             FINAL_LIBS=${LIBNAME}
         else
-- 
cgit v1.2.3


From 6e34fc0d374263ca40855ba4dcc237d961a79e34 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 6 Oct 2008 11:34:01 -0600
Subject: mesa: adjust texcoords for swrast sprite points.

Fixes glean pointSprite test w/ software rendering
---
 src/mesa/swrast/s_points.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/swrast/s_points.c b/src/mesa/swrast/s_points.c
index 846c485f15..61ff4d0b84 100644
--- a/src/mesa/swrast/s_points.c
+++ b/src/mesa/swrast/s_points.c
@@ -129,13 +129,13 @@ sprite_point(GLcontext *ctx, const SWvertex *vert)
       s = 0.0;
       dsdx = 1.0 / size;
       if (ctx->Point.SpriteOrigin == GL_LOWER_LEFT) {
-         t0 = 0.0;
          dtdy = 1.0 / size;
+         t0 = 0.5 * dtdy;
       }
       else {
          /* GL_UPPER_LEFT */
-         t0 = 1.0;
          dtdy = -1.0 / size;
+         t0 = 1.0 + 0.5 * dtdy;
       }
 
       ATTRIB_LOOP_BEGIN
-- 
cgit v1.2.3


From 8e8208d6db8b764568539784a6473d545dec2265 Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Mon, 6 Oct 2008 19:48:57 +0200
Subject: Mesa: fix the case where there are no vertex attributes.

---
 src/mesa/state_tracker/st_draw.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index bdf8648ef7..f9016923dc 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -525,6 +525,8 @@ st_draw_vbo(GLcontext *ctx,
                                 vbuffer, velements);
       num_vbuffers = 1;
       num_velements = vp->num_inputs;
+      if (num_velements == 0)
+	      num_vbuffers = 0;
    }
    else {
       /*printf("Draw non-interleaved\n");*/
-- 
cgit v1.2.3


From f8baad2d255c77c47dddeddcaf719e163e9556fb Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 6 Oct 2008 12:29:29 -0600
Subject: mesa: set FRAG_BIT_FOGC bit in InputsRead if FogOption!=GL_NONE

---
 src/mesa/shader/arbprogparse.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/mesa/shader/arbprogparse.c b/src/mesa/shader/arbprogparse.c
index 8ce5348c09..39988b5fca 100644
--- a/src/mesa/shader/arbprogparse.c
+++ b/src/mesa/shader/arbprogparse.c
@@ -3863,6 +3863,9 @@ _mesa_parse_arb_fragment_program(GLcontext* ctx, GLenum target,
    program->FogOption          = ap.FogOption;
    program->UsesKill          = ap.UsesKill;
 
+   if (program->FogOption)
+      program->Base.InputsRead |= FRAG_BIT_FOGC;
+      
    if (program->Base.Instructions)
       _mesa_free(program->Base.Instructions);
    program->Base.Instructions = ap.Base.Instructions;
-- 
cgit v1.2.3


From 4f4147eadd983bd4052c5a8e80a1750a813a25fc Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 6 Oct 2008 17:10:22 -0600
Subject: mesa: fix convolve/convolution mix-ups

---
 src/mesa/main/api_exec.c  | 4 ++--
 src/mesa/main/mfeatures.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/mesa/main/api_exec.c b/src/mesa/main/api_exec.c
index 0c3c9c4de4..bae3bf11cb 100644
--- a/src/mesa/main/api_exec.c
+++ b/src/mesa/main/api_exec.c
@@ -58,7 +58,7 @@
 #include "colortab.h"
 #endif
 #include "context.h"
-#if FEATURE_convolution
+#if FEATURE_convolve
 #include "convolve.h"
 #endif
 #include "depth.h"
@@ -402,7 +402,7 @@ _mesa_init_exec_table(struct _glapi_table *exec)
    SET_GetColorTableParameteriv(exec, _mesa_GetColorTableParameteriv);
 #endif
 
-#if FEATURE_convolution
+#if FEATURE_convolve
    SET_ConvolutionFilter1D(exec, _mesa_ConvolutionFilter1D);
    SET_ConvolutionFilter2D(exec, _mesa_ConvolutionFilter2D);
    SET_ConvolutionParameterf(exec, _mesa_ConvolutionParameterf);
diff --git a/src/mesa/main/mfeatures.h b/src/mesa/main/mfeatures.h
index ed78f57edf..3819da3d68 100644
--- a/src/mesa/main/mfeatures.h
+++ b/src/mesa/main/mfeatures.h
@@ -39,7 +39,7 @@
 #define FEATURE_accum  _HAVE_FULL_GL
 #define FEATURE_attrib_stack  _HAVE_FULL_GL
 #define FEATURE_colortable  _HAVE_FULL_GL
-#define FEATURE_convolution  _HAVE_FULL_GL
+#define FEATURE_convolve  _HAVE_FULL_GL
 #define FEATURE_dispatch  _HAVE_FULL_GL
 #define FEATURE_dlist  _HAVE_FULL_GL
 #define FEATURE_draw_read_buffer  _HAVE_FULL_GL
-- 
cgit v1.2.3


From d055b2c001a0fb233f98c10d124b43dd2448059e Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 6 Oct 2008 17:10:45 -0600
Subject: mesa: fix convolve/convolution mix-ups

---
 src/mesa/main/api_exec.c               | 4 ++--
 src/mesa/main/mfeatures.h              | 2 +-
 src/mesa/state_tracker/st_cb_texture.c | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/mesa/main/api_exec.c b/src/mesa/main/api_exec.c
index 0c3c9c4de4..bae3bf11cb 100644
--- a/src/mesa/main/api_exec.c
+++ b/src/mesa/main/api_exec.c
@@ -58,7 +58,7 @@
 #include "colortab.h"
 #endif
 #include "context.h"
-#if FEATURE_convolution
+#if FEATURE_convolve
 #include "convolve.h"
 #endif
 #include "depth.h"
@@ -402,7 +402,7 @@ _mesa_init_exec_table(struct _glapi_table *exec)
    SET_GetColorTableParameteriv(exec, _mesa_GetColorTableParameteriv);
 #endif
 
-#if FEATURE_convolution
+#if FEATURE_convolve
    SET_ConvolutionFilter1D(exec, _mesa_ConvolutionFilter1D);
    SET_ConvolutionFilter2D(exec, _mesa_ConvolutionFilter2D);
    SET_ConvolutionParameterf(exec, _mesa_ConvolutionParameterf);
diff --git a/src/mesa/main/mfeatures.h b/src/mesa/main/mfeatures.h
index b08c017ec8..487493f88e 100644
--- a/src/mesa/main/mfeatures.h
+++ b/src/mesa/main/mfeatures.h
@@ -39,7 +39,7 @@
 #define FEATURE_accum  _HAVE_FULL_GL
 #define FEATURE_attrib_stack  _HAVE_FULL_GL
 #define FEATURE_colortable  _HAVE_FULL_GL
-#define FEATURE_convolution  _HAVE_FULL_GL
+#define FEATURE_convolve  _HAVE_FULL_GL
 #define FEATURE_dispatch  _HAVE_FULL_GL
 #define FEATURE_dlist  _HAVE_FULL_GL
 #define FEATURE_draw_read_buffer  _HAVE_FULL_GL
diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 958f88bf2c..a018cdee64 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 #include "main/imports.h"
-#if FEATURE_convolution
+#if FEATURE_convolve
 #include "main/convolve.h"
 #endif
 #include "main/enums.h"
@@ -409,7 +409,7 @@ st_TexImage(GLcontext * ctx,
    stImage->face = _mesa_tex_target_to_face(target);
    stImage->level = level;
 
-#if FEATURE_convolution
+#if FEATURE_convolve
    if (ctx->_ImageTransferState & IMAGE_CONVOLUTION_BIT) {
       _mesa_adjust_image_for_convolution(ctx, dims, &postConvWidth,
                                          &postConvHeight);
-- 
cgit v1.2.3


From f7ee3c979261b4a2b77365b47c7147f69fbfd606 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 6 Oct 2008 18:31:56 -0600
Subject: gallium: replace assertion with conditional/recovery code

The assertion failed when we ran out of exec memory.
Found with conform texcombine test.
---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index dd26d4d9ed..ad9d8f8ced 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -370,7 +370,11 @@ void x86_jcc( struct x86_function *p,
    DUMP_I(cc);
    
    if (offset < 0) {
-      assert(p->csr - p->store > -offset);
+      /*assert(p->csr - p->store > -offset);*/
+      if (p->csr - p->store <= -offset) {
+         /* probably out of memory (using the error_overflow buffer) */
+         return;
+      }
    }
 
    if (offset <= 127 && offset >= -128) {
-- 
cgit v1.2.3


From 4d7394f89292131323fc8e39efa511a2eeb8cc60 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Tue, 7 Oct 2008 14:25:09 +0900
Subject: gallium: Introduce PIPE_ARCH_SSE define for SSE support.

Besides meaning x86 and x86-64 architecture, it also depends on SSE2
support enabled on gcc.

This fixes the linux-debug build.
---
 src/gallium/auxiliary/draw/draw_vs_sse.c | 2 +-
 src/gallium/auxiliary/tgsi/tgsi_sse2.c   | 2 +-
 src/gallium/auxiliary/util/u_sse.h       | 2 +-
 src/gallium/drivers/softpipe/sp_fs_sse.c | 2 +-
 src/gallium/include/pipe/p_config.h      | 8 ++++++++
 5 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c
index 0efabd9de8..b11ae31662 100644
--- a/src/gallium/auxiliary/draw/draw_vs_sse.c
+++ b/src/gallium/auxiliary/draw/draw_vs_sse.c
@@ -37,7 +37,7 @@
 
 #include "draw_vs.h"
 
-#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
 
 #include "pipe/p_shader_tokens.h"
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index 79f424b692..f79170b9d6 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -27,7 +27,7 @@
 
 #include "pipe/p_config.h"
 
-#ifdef PIPE_ARCH_X86
+#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
 
 #include "pipe/p_debug.h"
 #include "pipe/p_shader_tokens.h"
diff --git a/src/gallium/auxiliary/util/u_sse.h b/src/gallium/auxiliary/util/u_sse.h
index 68e56f0816..e2a8491e62 100644
--- a/src/gallium/auxiliary/util/u_sse.h
+++ b/src/gallium/auxiliary/util/u_sse.h
@@ -39,7 +39,7 @@
 
 #include "pipe/p_config.h"
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+#if defined(PIPE_ARCH_SSE)
 
 #include <xmmintrin.h>
 #include <emmintrin.h>
diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c
index 496ed43df2..0111469405 100644
--- a/src/gallium/drivers/softpipe/sp_fs_sse.c
+++ b/src/gallium/drivers/softpipe/sp_fs_sse.c
@@ -40,7 +40,7 @@
 #include "tgsi/tgsi_sse2.h"
 
 
-#ifdef PIPE_ARCH_X86
+#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
 
 #include "rtasm/rtasm_x86sse.h"
 
diff --git a/src/gallium/include/pipe/p_config.h b/src/gallium/include/pipe/p_config.h
index af3746c026..ef05547819 100644
--- a/src/gallium/include/pipe/p_config.h
+++ b/src/gallium/include/pipe/p_config.h
@@ -85,6 +85,14 @@
 #define PIPE_ARCH_X86_64
 #endif
 
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+#if defined(PIPE_CC_GCC) && !defined(__SSE2__)
+/* #warning SSE2 support requires -msse -msse2 compiler options */
+#else
+#define PIPE_ARCH_SSE
+#endif
+#endif
+
 #if 0 /* FIXME */
 #define PIPE_ARCH_PPC
 #endif
-- 
cgit v1.2.3


From 23e62c94ee17f21e99cc2e11fb1f960c0eb56f16 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 7 Oct 2008 08:30:29 -0600
Subject: mesa: remove old assertion

---
 src/mesa/shader/arbprogparse.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/mesa/shader/arbprogparse.c b/src/mesa/shader/arbprogparse.c
index 34350ac4f3..4d89d057c7 100644
--- a/src/mesa/shader/arbprogparse.c
+++ b/src/mesa/shader/arbprogparse.c
@@ -3443,8 +3443,6 @@ parse_instructions(GLcontext * ctx, const GLubyte * inst,
       : ctx->Const.VertexProgram.MaxInstructions;
    GLint err = 0;
 
-   ASSERT(MAX_INSTRUCTIONS >= maxInst);
-
    Program->MajorVersion = (GLuint) * inst++;
    Program->MinorVersion = (GLuint) * inst++;
 
-- 
cgit v1.2.3


From c48da7d78b4e7bdbe056b3c9668756d49019be06 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Mon, 6 Oct 2008 12:22:55 +0100
Subject: draw: add switch for drivers to force vertex data passthrough

---
 src/gallium/auxiliary/draw/draw_context.c |  8 +++++++
 src/gallium/auxiliary/draw/draw_context.h |  3 +++
 src/gallium/auxiliary/draw/draw_private.h |  3 +++
 src/gallium/auxiliary/draw/draw_pt.c      | 38 +++++++++++++++----------------
 4 files changed, 33 insertions(+), 19 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index 36751c2621..41a4cba1dd 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -272,6 +272,14 @@ draw_enable_point_sprites(struct draw_context *draw, boolean enable)
 }
 
 
+void
+draw_set_force_passthrough( struct draw_context *draw, boolean enable )
+{
+   draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
+   draw->force_passthrough = enable;
+}
+
+
 /**
  * Ask the draw module for the location/slot of the given vertex attribute in
  * a post-transformed vertex.
diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h
index 0ab3681b64..3eeb453531 100644
--- a/src/gallium/auxiliary/draw/draw_context.h
+++ b/src/gallium/auxiliary/draw/draw_context.h
@@ -160,6 +160,9 @@ void draw_set_render( struct draw_context *draw,
 void draw_set_driver_clipping( struct draw_context *draw,
                                boolean bypass_clipping );
 
+void draw_set_force_passthrough( struct draw_context *draw, 
+                                 boolean enable );
+
 /*******************************************************************************
  * Draw pipeline 
  */
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index 626a2e3e30..37c4c87f87 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -163,12 +163,15 @@ struct draw_context
 
    struct {
       boolean bypass_clipping;
+      boolean bypass_vs;
    } driver;
 
    boolean flushing;         /**< debugging/sanity */
    boolean suspend_flushing; /**< internally set */
    boolean bypass_clipping;  /**< set if either api or driver bypass_clipping true */
 
+   boolean force_passthrough; /**< never clip or shade */
+
    /* pipe state that we need: */
    const struct pipe_rasterizer_state *rasterizer;
    struct pipe_viewport_state viewport;
diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c
index 669c11c993..87ec6ae20c 100644
--- a/src/gallium/auxiliary/draw/draw_pt.c
+++ b/src/gallium/auxiliary/draw/draw_pt.c
@@ -69,26 +69,26 @@ draw_pt_arrays(struct draw_context *draw,
          return TRUE;
    }
 
-
-   if (!draw->render) {
-      opt |= PT_PIPELINE;
-   }
-
-   if (draw_need_pipeline(draw,
-                          draw->rasterizer,
-                          prim)) {
-      opt |= PT_PIPELINE;
-   }
-
-   if (!draw->bypass_clipping && !draw->pt.test_fse) {
-      opt |= PT_CLIPTEST;
+   if (!draw->force_passthrough) {
+      if (!draw->render) {
+         opt |= PT_PIPELINE;
+      }
+      
+      if (draw_need_pipeline(draw,
+                             draw->rasterizer,
+                             prim)) {
+         opt |= PT_PIPELINE;
+      }
+
+      if (!draw->bypass_clipping && !draw->pt.test_fse) {
+         opt |= PT_CLIPTEST;
+      }
+      
+      if (!draw->rasterizer->bypass_vs) {
+         opt |= PT_SHADE;
+      }
    }
-
-   if (!draw->rasterizer->bypass_vs) {
-      opt |= PT_SHADE;
-   }
-
-
+      
    if (opt == 0) 
       middle = draw->pt.middle.fetch_emit;
    else if (opt == PT_SHADE && !draw->pt.no_fse)
-- 
cgit v1.2.3


From 6ff1cf5b82488dc5a07513b0806c23e70f7a665e Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Tue, 7 Oct 2008 12:31:31 +0100
Subject: mesa: protect against segfault in get_fp_input_mask()

---
 src/mesa/main/texenvprogram.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/mesa/main/texenvprogram.c b/src/mesa/main/texenvprogram.c
index ea2ee160e4..7049467c22 100644
--- a/src/mesa/main/texenvprogram.c
+++ b/src/mesa/main/texenvprogram.c
@@ -204,7 +204,10 @@ static GLuint get_fp_input_mask( GLcontext *ctx )
 {
    GLuint fp_inputs = 0;
 
-   if (!ctx->VertexProgram._Enabled) {
+   if (!ctx->VertexProgram._Enabled ||
+       !ctx->VertexProgram._Current) {
+
+      /* Fixed function logic */
       GLuint varying_inputs = ctx->varying_vp_inputs;
 
       /* First look at what values may be computed by the generated
-- 
cgit v1.2.3


From a381c9e8b32af6e98879940eba5f11680d4b89b6 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Tue, 7 Oct 2008 13:09:05 +0100
Subject: trivial: exercise vertprog slightly

---
 progs/trivial/Makefile      |   1 +
 progs/trivial/vp-tri-swap.c | 103 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 104 insertions(+)
 create mode 100644 progs/trivial/vp-tri-swap.c

diff --git a/progs/trivial/Makefile b/progs/trivial/Makefile
index c868ab6e6f..65937f2f3f 100644
--- a/progs/trivial/Makefile
+++ b/progs/trivial/Makefile
@@ -124,6 +124,7 @@ SOURCES = \
 	vp-clip.c \
 	vp-line-clip.c \
 	vp-tri.c \
+	vp-tri-swap.c \
 	vp-unfilled.c 
 
 PROGS = $(SOURCES:%.c=%)
diff --git a/progs/trivial/vp-tri-swap.c b/progs/trivial/vp-tri-swap.c
new file mode 100644
index 0000000000..e9ca1a0378
--- /dev/null
+++ b/progs/trivial/vp-tri-swap.c
@@ -0,0 +1,103 @@
+/* Test an ARB vertex program that swaps the position and color attributes */
+
+#include <assert.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#define GL_GLEXT_PROTOTYPES
+#include <GL/glut.h>
+
+static void Init( void )
+{
+   GLint errno;
+   GLuint prognum;
+   
+   static const char *prog1 =
+      "!!ARBvp1.0\n"
+      "MOV  result.position, vertex.color;\n"
+      "MOV  result.color, vertex.position;\n"
+      "END\n";
+
+
+   glGenProgramsARB(1, &prognum);
+
+   glBindProgramARB(GL_VERTEX_PROGRAM_ARB, prognum);
+   glProgramStringARB(GL_VERTEX_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
+		      strlen(prog1), (const GLubyte *) prog1);
+
+   assert(glIsProgramARB(prognum));
+   errno = glGetError();
+   printf("glGetError = %d\n", errno);
+   if (errno != GL_NO_ERROR)
+   {
+      GLint errorpos;
+
+      glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &errorpos);
+      printf("errorpos: %d\n", errorpos);
+      printf("%s\n", (char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB));
+   }
+}
+
+static void Display( void )
+{
+   glClearColor(0.3, 0.3, 0.3, 1);
+   glClear( GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT );
+
+   glEnable(GL_VERTEX_PROGRAM_NV);
+
+   glBegin(GL_TRIANGLES);
+   glColor3f(0,0,.7); 
+   glVertex3f( 0.9, -0.9, -0.0);
+   glColor3f(.8,0,0); 
+   glVertex3f( 0.9,  0.9, -0.0);
+   glColor3f(0,.9,0); 
+   glVertex3f(-0.9,  0.0, -0.0);
+   glEnd();
+
+
+   glFlush(); 
+}
+
+
+static void Reshape( int width, int height )
+{
+   glViewport( 0, 0, width, height );
+   glMatrixMode( GL_PROJECTION );
+   glLoadIdentity();
+   glOrtho(-1.0, 1.0, -1.0, 1.0, -0.5, 1000.0);
+   glMatrixMode( GL_MODELVIEW );
+   glLoadIdentity();
+   /*glTranslatef( 0.0, 0.0, -15.0 );*/
+}
+
+
+static void Key( unsigned char key, int x, int y )
+{
+   (void) x;
+   (void) y;
+   switch (key) {
+      case 27:
+         exit(0);
+         break;
+   }
+   glutPostRedisplay();
+}
+
+
+
+
+int main( int argc, char *argv[] )
+{
+   glutInit( &argc, argv );
+   glutInitWindowPosition( 0, 0 );
+   glutInitWindowSize( 250, 250 );
+   glutInitDisplayMode( GLUT_DEPTH | GLUT_RGB | GLUT_SINGLE );
+   glutCreateWindow(argv[0]);
+   glutReshapeFunc( Reshape );
+   glutKeyboardFunc( Key );
+   glutDisplayFunc( Display );
+   Init();
+   glutMainLoop();
+   return 0;
+}
-- 
cgit v1.2.3


From 4070dba28a486bc0d14df028a085601ae7299f46 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Tue, 7 Oct 2008 16:33:17 +0100
Subject: mesa: update state after binding vertex list in dlist path

---
 src/mesa/vbo/vbo_save_draw.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/mesa/vbo/vbo_save_draw.c b/src/mesa/vbo/vbo_save_draw.c
index b015bf2786..68f3a965a5 100644
--- a/src/mesa/vbo/vbo_save_draw.c
+++ b/src/mesa/vbo/vbo_save_draw.c
@@ -241,6 +241,11 @@ void vbo_save_playback_vertex_list( GLcontext *ctx, void *data )
 
       vbo_bind_vertex_list( ctx, node );
 
+      /* Update derived state again if anything is flagged dirty
+       * after binding the vertex list above. */
+      if (ctx->NewState)
+	 _mesa_update_state( ctx );
+
       vbo_context(ctx)->draw_prims( ctx, 
 				    save->inputs, 
 				    node->prim, 
-- 
cgit v1.2.3


From 23cc303994eb630c56b1224dfdac51dcea41ed03 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Tue, 7 Oct 2008 16:44:24 +0100
Subject: draw: don't assume output buffer pointer is aligned

---
 src/gallium/auxiliary/draw/draw_vs_aos_io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
index dd79bc799a..39f75b50b7 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos_io.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
@@ -338,7 +338,7 @@ static void emit_store_R32G32B32A32( struct aos_compilation *cp,
 				     struct x86_reg dst_ptr,
 				     struct x86_reg dataXMM )
 {
-   sse_movaps(cp->func, dst_ptr, dataXMM);
+   sse_movups(cp->func, dst_ptr, dataXMM);
 }
 
 static void emit_store_R32G32B32( struct aos_compilation *cp, 
-- 
cgit v1.2.3


From 239617fbe22d4dd7b2794510a6665f09602b5adf Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 7 Oct 2008 11:22:47 -0600
Subject: mesa: replace GLuint with GLbitfield to be clearer about usage

Also, fix up some comments to be doxygen style.
---
 src/mesa/main/mtypes.h        |  2 +-
 src/mesa/main/state.c         |  2 +-
 src/mesa/main/state.h         |  2 +-
 src/mesa/main/texenvprogram.c | 30 ++++++++++++++++--------------
 src/mesa/vbo/vbo_exec_array.c |  2 +-
 src/mesa/vbo/vbo_exec_draw.c  |  2 +-
 src/mesa/vbo/vbo_save_draw.c  |  2 +-
 7 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index ca1e369a35..dff474d6d0 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -3073,7 +3073,7 @@ struct __GLcontextRec
    GLenum RenderMode;        /**< either GL_RENDER, GL_SELECT, GL_FEEDBACK */
    GLbitfield NewState;      /**< bitwise-or of _NEW_* flags */
 
-   GLuint varying_vp_inputs;
+   GLbitfield varying_vp_inputs;  /**< mask of VERT_BIT_* flags */
 
    /** \name Derived state */
    /*@{*/
diff --git a/src/mesa/main/state.c b/src/mesa/main/state.c
index e0eb5f81e2..b124d48269 100644
--- a/src/mesa/main/state.c
+++ b/src/mesa/main/state.c
@@ -532,7 +532,7 @@ _mesa_update_state( GLcontext *ctx )
  */
 void
 _mesa_set_varying_vp_inputs( GLcontext *ctx,
-                             unsigned varying_inputs )
+                             GLbitfield varying_inputs )
 {
    if (ctx->varying_vp_inputs != varying_inputs) {
       ctx->varying_vp_inputs = varying_inputs;
diff --git a/src/mesa/main/state.h b/src/mesa/main/state.h
index dc08043a76..79f2f6beb0 100644
--- a/src/mesa/main/state.h
+++ b/src/mesa/main/state.h
@@ -39,6 +39,6 @@ _mesa_update_state_locked( GLcontext *ctx );
 
 void
 _mesa_set_varying_vp_inputs( GLcontext *ctx,
-                             unsigned varying_inputs );
+                             GLbitfield varying_inputs );
 
 #endif
diff --git a/src/mesa/main/texenvprogram.c b/src/mesa/main/texenvprogram.c
index 7049467c22..638d6be5ad 100644
--- a/src/mesa/main/texenvprogram.c
+++ b/src/mesa/main/texenvprogram.c
@@ -192,7 +192,8 @@ static GLuint translate_tex_src_bit( GLbitfield bit )
 #define VERT_BIT_TEX_ANY    (0xff << VERT_ATTRIB_TEX0)
 #define VERT_RESULT_TEX_ANY (0xff << VERT_RESULT_TEX0)
 
-/* Identify all possible varying inputs.  The fragment program will
+/**
+ * Identify all possible varying inputs.  The fragment program will
  * never reference non-varying inputs, but will track them via state
  * constants instead.
  *
@@ -200,15 +201,15 @@ static GLuint translate_tex_src_bit( GLbitfield bit )
  * has access to.  The bitmask is later reduced to just those which
  * are actually referenced.
  */
-static GLuint get_fp_input_mask( GLcontext *ctx )
+static GLbitfield get_fp_input_mask( GLcontext *ctx )
 {
-   GLuint fp_inputs = 0;
+   GLbitfield fp_inputs = 0x0;
 
    if (!ctx->VertexProgram._Enabled ||
        !ctx->VertexProgram._Current) {
 
       /* Fixed function logic */
-      GLuint varying_inputs = ctx->varying_vp_inputs;
+      GLbitfield varying_inputs = ctx->varying_vp_inputs;
 
       /* First look at what values may be computed by the generated
        * vertex program:
@@ -235,7 +236,7 @@ static GLuint get_fp_input_mask( GLcontext *ctx )
    }
    else {
       /* calculate from vp->outputs */
-      GLuint vp_outputs = ctx->VertexProgram._Current->Base.OutputsWritten;
+      GLbitfield vp_outputs = ctx->VertexProgram._Current->Base.OutputsWritten;
 
       if (vp_outputs & (1 << VERT_RESULT_COL0)) fp_inputs |= FRAG_BIT_COL0;
       if (vp_outputs & (1 << VERT_RESULT_COL1)) fp_inputs |= FRAG_BIT_COL1;
@@ -255,8 +256,8 @@ static GLuint get_fp_input_mask( GLcontext *ctx )
 static void make_state_key( GLcontext *ctx,  struct state_key *key )
 {
    GLuint i, j;
-   GLuint inputs_referenced = FRAG_BIT_COL0;
-   GLuint inputs_available = get_fp_input_mask( ctx );
+   GLbitfield inputs_referenced = FRAG_BIT_COL0;
+   GLbitfield inputs_available = get_fp_input_mask( ctx );
 
    memset(key, 0, sizeof(*key));
 
@@ -311,7 +312,8 @@ static void make_state_key( GLcontext *ctx,  struct state_key *key )
    key->inputs_available = (inputs_available & inputs_referenced);
 }
 
-/* Use uregs to represent registers internally, translate to Mesa's
+/**
+ * Use uregs to represent registers internally, translate to Mesa's
  * expected formats on emit.  
  *
  * NOTE: These are passed by value extensively in this file rather
@@ -344,16 +346,16 @@ static const struct ureg undef = {
 };
 
 
-/* State used to build the fragment program:
+/** State used to build the fragment program:
  */
 struct texenv_fragment_program {
    struct gl_fragment_program *program;
    GLcontext *ctx;
    struct state_key *state;
 
-   GLbitfield alu_temps;	/* Track texture indirections, see spec. */
-   GLbitfield temps_output;	/* Track texture indirections, see spec. */
-   GLbitfield temp_in_use;	/* Tracks temporary regs which are in use. */
+   GLbitfield alu_temps;	/**< Track texture indirections, see spec. */
+   GLbitfield temps_output;	/**< Track texture indirections, see spec. */
+   GLbitfield temp_in_use;	/**< Tracks temporary regs which are in use. */
    GLboolean error;
 
    struct ureg src_texture[MAX_TEXTURE_UNITS];   
@@ -361,11 +363,11 @@ struct texenv_fragment_program {
     * else undef.
     */
 
-   struct ureg src_previous;	/* Reg containing color from previous 
+   struct ureg src_previous;	/**< Reg containing color from previous 
 				 * stage.  May need to be decl'd.
 				 */
 
-   GLuint last_tex_stage;	/* Number of last enabled texture unit */
+   GLuint last_tex_stage;	/**< Number of last enabled texture unit */
 
    struct ureg half;
    struct ureg one;
diff --git a/src/mesa/vbo/vbo_exec_array.c b/src/mesa/vbo/vbo_exec_array.c
index 3d74f9f431..8871e10cf6 100644
--- a/src/mesa/vbo/vbo_exec_array.c
+++ b/src/mesa/vbo/vbo_exec_array.c
@@ -127,7 +127,7 @@ static void recalculate_input_bindings( GLcontext *ctx )
    struct vbo_context *vbo = vbo_context(ctx);
    struct vbo_exec_context *exec = &vbo->exec;
    const struct gl_client_array **inputs = &exec->array.inputs[0];
-   GLuint const_inputs = 0;
+   GLbitfield const_inputs = 0x0;
    GLuint i;
 
    exec->array.program_mode = get_program_mode(ctx);
diff --git a/src/mesa/vbo/vbo_exec_draw.c b/src/mesa/vbo/vbo_exec_draw.c
index ad60c9b05f..ae43857c8a 100644
--- a/src/mesa/vbo/vbo_exec_draw.c
+++ b/src/mesa/vbo/vbo_exec_draw.c
@@ -150,7 +150,7 @@ static void vbo_exec_bind_arrays( GLcontext *ctx )
    GLubyte *data = exec->vtx.buffer_map;
    const GLuint *map;
    GLuint attr;
-   GLuint varying_inputs = 0;
+   GLbitfield varying_inputs = 0x0;
 
    /* Install the default (ie Current) attributes first, then overlay
     * all active ones.
diff --git a/src/mesa/vbo/vbo_save_draw.c b/src/mesa/vbo/vbo_save_draw.c
index 68f3a965a5..0488c5d718 100644
--- a/src/mesa/vbo/vbo_save_draw.c
+++ b/src/mesa/vbo/vbo_save_draw.c
@@ -118,7 +118,7 @@ static void vbo_bind_vertex_list( GLcontext *ctx,
    GLuint data = node->buffer_offset;
    const GLuint *map;
    GLuint attr;
-   GLuint varying_inputs = 0;
+   GLbitfield varying_inputs = 0x0;
 
    /* Install the default (ie Current) attributes first, then overlay
     * all active ones.
-- 
cgit v1.2.3


From 1ec78df1e76a58f23cadce7b22d34849af83bf84 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Tue, 7 Oct 2008 19:12:26 +0100
Subject: trivial: add more vp tests

---
 progs/trivial/Makefile     |   2 +
 progs/trivial/vp-tri-cb.c  | 107 +++++++++++++++++++++++++++++++++++++++++++++
 progs/trivial/vp-tri-imm.c | 101 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 210 insertions(+)
 create mode 100644 progs/trivial/vp-tri-cb.c
 create mode 100644 progs/trivial/vp-tri-imm.c

diff --git a/progs/trivial/Makefile b/progs/trivial/Makefile
index 65937f2f3f..d282171826 100644
--- a/progs/trivial/Makefile
+++ b/progs/trivial/Makefile
@@ -125,6 +125,8 @@ SOURCES = \
 	vp-line-clip.c \
 	vp-tri.c \
 	vp-tri-swap.c \
+	vp-tri-imm.c \
+	vp-tri-cb.c \
 	vp-unfilled.c 
 
 PROGS = $(SOURCES:%.c=%)
diff --git a/progs/trivial/vp-tri-cb.c b/progs/trivial/vp-tri-cb.c
new file mode 100644
index 0000000000..f9d0d7f559
--- /dev/null
+++ b/progs/trivial/vp-tri-cb.c
@@ -0,0 +1,107 @@
+/* Test an ARB vertex program that takes its output color from state.material.diffuse */
+
+#include <assert.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#define GL_GLEXT_PROTOTYPES
+#include <GL/glut.h>
+
+static void Init( void )
+{
+   GLint errno;
+   GLuint prognum;
+   
+   static const char *prog1 =
+      "!!ARBvp1.0\n"
+      "PARAM Diffuse = state.material.diffuse; \n"
+      "MOV  result.color, Diffuse;\n"
+      "MOV  result.position, vertex.position;\n"
+      "END\n";
+
+   const float Diffuse[4] = { 0.0, 1.0, 0.0, 1.0 };
+   glMaterialfv(GL_FRONT_AND_BACK, GL_DIFFUSE, Diffuse);
+
+
+   glGenProgramsARB(1, &prognum);
+
+   glBindProgramARB(GL_VERTEX_PROGRAM_ARB, prognum);
+   glProgramStringARB(GL_VERTEX_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
+		      strlen(prog1), (const GLubyte *) prog1);
+
+   assert(glIsProgramARB(prognum));
+   errno = glGetError();
+   printf("glGetError = %d\n", errno);
+   if (errno != GL_NO_ERROR)
+   {
+      GLint errorpos;
+
+      glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &errorpos);
+      printf("errorpos: %d\n", errorpos);
+      printf("%s\n", (char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB));
+   }
+}
+
+static void Display( void )
+{
+   glClearColor(0.3, 0.3, 0.3, 1);
+   glClear( GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT );
+
+   glEnable(GL_VERTEX_PROGRAM_NV);
+
+   glBegin(GL_TRIANGLES);
+   glColor3f(0,0,.7); 
+   glVertex3f( 0.9, -0.9, -0.0);
+   glColor3f(.8,0,0); 
+   glVertex3f( 0.9,  0.9, -0.0);
+   glColor3f(0,.9,0); 
+   glVertex3f(-0.9,  0.0, -0.0);
+   glEnd();
+
+
+   glFlush(); 
+}
+
+
+static void Reshape( int width, int height )
+{
+   glViewport( 0, 0, width, height );
+   glMatrixMode( GL_PROJECTION );
+   glLoadIdentity();
+   glOrtho(-1.0, 1.0, -1.0, 1.0, -0.5, 1000.0);
+   glMatrixMode( GL_MODELVIEW );
+   glLoadIdentity();
+   /*glTranslatef( 0.0, 0.0, -15.0 );*/
+}
+
+
+static void Key( unsigned char key, int x, int y )
+{
+   (void) x;
+   (void) y;
+   switch (key) {
+      case 27:
+         exit(0);
+         break;
+   }
+   glutPostRedisplay();
+}
+
+
+
+
+int main( int argc, char *argv[] )
+{
+   glutInit( &argc, argv );
+   glutInitWindowPosition( 0, 0 );
+   glutInitWindowSize( 250, 250 );
+   glutInitDisplayMode( GLUT_DEPTH | GLUT_RGB | GLUT_SINGLE );
+   glutCreateWindow(argv[0]);
+   glutReshapeFunc( Reshape );
+   glutKeyboardFunc( Key );
+   glutDisplayFunc( Display );
+   Init();
+   glutMainLoop();
+   return 0;
+}
diff --git a/progs/trivial/vp-tri-imm.c b/progs/trivial/vp-tri-imm.c
new file mode 100644
index 0000000000..c774573ba8
--- /dev/null
+++ b/progs/trivial/vp-tri-imm.c
@@ -0,0 +1,101 @@
+/* Test an ARB vertex program that adds an inline constant ({.5}.x) to the vertex color */
+
+#include <assert.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#define GL_GLEXT_PROTOTYPES
+#include <GL/glut.h>
+
+static void Init( void )
+{
+   GLint errno;
+   GLuint prognum;
+   
+   static const char *prog1 =
+      "!!ARBvp1.0\n"
+      "ADD  result.color, vertex.color, {.5}.x;\n"  
+      "MOV  result.position, vertex.position;\n"
+      "END\n";
+
+
+   glGenProgramsARB(1, &prognum);
+
+   glBindProgramARB(GL_VERTEX_PROGRAM_ARB, prognum);
+   glProgramStringARB(GL_VERTEX_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
+		      strlen(prog1), (const GLubyte *) prog1);
+
+   assert(glIsProgramARB(prognum));
+   errno = glGetError();
+   printf("glGetError = %d\n", errno);
+   if (errno != GL_NO_ERROR)
+   {
+      GLint errorpos;
+
+      glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &errorpos);
+      printf("errorpos: %d\n", errorpos);
+      printf("%s\n", (char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB));
+   }
+}
+
+static void Display( void )
+{
+   glClearColor(0.3, 0.3, 0.3, 1);
+   glClear( GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT );
+
+   glEnable(GL_VERTEX_PROGRAM_NV);
+
+   glBegin(GL_TRIANGLES);
+   glColor3f(0,0,0); 
+   glVertex3f( 0.9, -0.9, -0.0);
+   glVertex3f( 0.9,  0.9, -0.0);
+   glVertex3f(-0.9,  0.0, -0.0);
+   glEnd();
+
+
+   glFlush(); 
+}
+
+
+static void Reshape( int width, int height )
+{
+   glViewport( 0, 0, width, height );
+   glMatrixMode( GL_PROJECTION );
+   glLoadIdentity();
+   glOrtho(-1.0, 1.0, -1.0, 1.0, -0.5, 1000.0);
+   glMatrixMode( GL_MODELVIEW );
+   glLoadIdentity();
+   /*glTranslatef( 0.0, 0.0, -15.0 );*/
+}
+
+
+static void Key( unsigned char key, int x, int y )
+{
+   (void) x;
+   (void) y;
+   switch (key) {
+      case 27:
+         exit(0);
+         break;
+   }
+   glutPostRedisplay();
+}
+
+
+
+
+int main( int argc, char *argv[] )
+{
+   glutInit( &argc, argv );
+   glutInitWindowPosition( 0, 0 );
+   glutInitWindowSize( 250, 250 );
+   glutInitDisplayMode( GLUT_DEPTH | GLUT_RGB | GLUT_SINGLE );
+   glutCreateWindow(argv[0]);
+   glutReshapeFunc( Reshape );
+   glutKeyboardFunc( Key );
+   glutDisplayFunc( Display );
+   Init();
+   glutMainLoop();
+   return 0;
+}
-- 
cgit v1.2.3


From 94ba48bd85ec5c62e1a303d8bb3fc25c8e153247 Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Tue, 7 Oct 2008 21:11:01 +0200
Subject: Gallivm: fix the constant layout, this gets a bunch of progs/
 working. Notably, gears doesn't.

---
 src/gallium/auxiliary/gallivm/gallivm_cpu.cpp     |  5 +-
 src/gallium/auxiliary/gallivm/instructionssoa.cpp |  5 ++
 src/gallium/auxiliary/gallivm/instructionssoa.h   |  1 +
 src/gallium/auxiliary/gallivm/storagesoa.cpp      | 71 +++++++++++++++++++----
 src/gallium/auxiliary/gallivm/storagesoa.h        |  7 ++-
 src/gallium/auxiliary/gallivm/tgsitollvm.cpp      | 10 ++--
 6 files changed, 78 insertions(+), 21 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
index 3a4a41e544..3a2f2878a3 100644
--- a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
+++ b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
@@ -158,8 +158,8 @@ void gallivm_cpu_jit_compile(struct gallivm_cpu_engine *cpu, struct gallivm_prog
    llvm::ExistingModuleProvider *mp = new llvm::ExistingModuleProvider(mod);
    llvm::ExecutionEngine *ee = cpu->engine;
    assert(ee);
-   /*FIXME : remove */
-   ee->DisableLazyCompilation();
+   /*FIXME : why was this disabled ? we need it for pow/sqrt/... */
+   ee->DisableLazyCompilation(false);
    ee->addModuleProvider(mp);
 
    llvm::Function *func = func_for_shader(prog);
@@ -202,7 +202,6 @@ int gallivm_cpu_vs_exec(struct gallivm_prog *prog,
    unsigned int i, j;
    unsigned slot;
    vertex_shader_runner runner = reinterpret_cast<vertex_shader_runner>(prog->function);
-
    assert(runner);
 
    for (i = 0; i < count; i += MAX_TGSI_VERTICES) {
diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.cpp b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
index a658072551..1143ee0b0b 100644
--- a/src/gallium/auxiliary/gallivm/instructionssoa.cpp
+++ b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
@@ -171,6 +171,11 @@ std::vector<llvm::Value*> InstructionsSoa::extractVector(llvm::Value *vector)
    return res;
 }
 
+llvm::IRBuilder<>* InstructionsSoa::getIRBuilder()
+{
+   return &m_builder;
+}
+
 void InstructionsSoa::createFunctionMap()
 {
    m_functionsMap[TGSI_OPCODE_ABS]   = "abs";
diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.h b/src/gallium/auxiliary/gallivm/instructionssoa.h
index 3817fdc904..d6831e0a6b 100644
--- a/src/gallium/auxiliary/gallivm/instructionssoa.h
+++ b/src/gallium/auxiliary/gallivm/instructionssoa.h
@@ -76,6 +76,7 @@ public:
    void         end();
 
    std::vector<llvm::Value*> extractVector(llvm::Value *vector);
+   llvm::IRBuilder<>*  getIRBuilder();
 private:
    const char * name(const char *prefix) const;
    llvm::Value *vectorFromVals(llvm::Value *x, llvm::Value *y,
diff --git a/src/gallium/auxiliary/gallivm/storagesoa.cpp b/src/gallium/auxiliary/gallivm/storagesoa.cpp
index 78d754371f..646b9d7ca0 100644
--- a/src/gallium/auxiliary/gallivm/storagesoa.cpp
+++ b/src/gallium/auxiliary/gallivm/storagesoa.cpp
@@ -91,19 +91,29 @@ void StorageSoa::declareImmediates()
    for (unsigned int i = 0; i < m_immediatesToFlush.size(); ++i) {
       std::vector<float> vec = m_immediatesToFlush[i];
       std::vector<float> vals(4);
+      float val;
       std::vector<Constant*> channelArray;
 
-      vals[0] = vec[0]; vals[1] = vec[0]; vals[2] = vec[0]; vals[3] = vec[0];
-      llvm::Constant *xChannel = createConstGlobalVector(vals);
+      val = vec[0];
+      llvm::Constant *xChannel = createConstGlobalFloat(val);
+      val = vec[1];
+      llvm::Constant *yChannel = createConstGlobalFloat(val);
+      val = vec[2];
+      llvm::Constant *zChannel = createConstGlobalFloat(val);
+      val = vec[3];
+      llvm::Constant *wChannel = createConstGlobalFloat(val);
 
-      vals[0] = vec[1]; vals[1] = vec[1]; vals[2] = vec[1]; vals[3] = vec[1];
+//      vals[0] = vec[0]; vals[1] = vec[1]; vals[2] = vec[2]; vals[3] = vec[3];
+//      llvm::Constant *xChannel = createConstGlobalVector(vec[0]);
+
+/*      vals[0] = vec[1]; vals[1] = vec[1]; vals[2] = vec[1]; vals[3] = vec[1];
       llvm::Constant *yChannel = createConstGlobalVector(vals);
 
       vals[0] = vec[2]; vals[1] = vec[2]; vals[2] = vec[2]; vals[3] = vec[2];
       llvm::Constant *zChannel = createConstGlobalVector(vals);
 
       vals[0] = vec[3]; vals[1] = vec[3]; vals[2] = vec[3]; vals[3] = vec[3];
-      llvm::Constant *wChannel = createConstGlobalVector(vals);
+      llvm::Constant *wChannel = createConstGlobalVector(vals);*/
       channelArray.push_back(xChannel);
       channelArray.push_back(yChannel);
       channelArray.push_back(zChannel);
@@ -144,22 +154,54 @@ std::vector<llvm::Value*> StorageSoa::inputElement(llvm::Value *idx)
    return res;
 }
 
-std::vector<llvm::Value*> StorageSoa::constElement(llvm::Value *idx)
+llvm::Value* StorageSoa::unpackConstElement(llvm::IRBuilder<>* m_builder, llvm::Value* vector, int cc)
+{
+   std::vector<llvm::Value*> x(4);
+   x[0] = m_builder->CreateExtractElement(vector,
+                                           constantInt(cc),
+                                           name("x"));
+
+   VectorType  *vectorType = VectorType::get(Type::FloatTy, 4);
+   Constant *constVector = Constant::getNullValue(vectorType);
+   Value *res = m_builder->CreateInsertElement(constVector, x[0],
+                                              constantInt(0),
+                                              name("vecx"));
+   res = m_builder->CreateInsertElement(res, x[0], constantInt(1),
+                               name("vecxx"));
+   res = m_builder->CreateInsertElement(res, x[0], constantInt(2),
+                               name("vecxxx"));
+   res = m_builder->CreateInsertElement(res, x[0], constantInt(3),
+                               name("vecxxxx"));
+   return res;
+}
+
+std::vector<llvm::Value*> StorageSoa::constElement(llvm::IRBuilder<>* m_builder, llvm::Value *idx)
 {
    std::vector<llvm::Value*> res(4);
+   std::vector<llvm::Value*> res2(4);
    llvm::Value *xChannel, *yChannel, *zChannel, *wChannel;
 
    xChannel = elementPointer(m_consts, idx, 0);
-   yChannel = elementPointer(m_consts, idx, 1);
+/*   yChannel = elementPointer(m_consts, idx, 1);
    zChannel = elementPointer(m_consts, idx, 2);
-   wChannel = elementPointer(m_consts, idx, 3);
+   wChannel = elementPointer(m_consts, idx, 3);*/
 
    res[0] = alignedArrayLoad(xChannel);
+/* res[1] = alignedArrayLoad(xChannel);
+   res[2] = alignedArrayLoad(xChannel);
+   res[3] = alignedArrayLoad(xChannel);*/
+
+
+   res2[0]=unpackConstElement(m_builder, res[0],0);
+   res2[1]=unpackConstElement(m_builder, res[0],1);
+   res2[2]=unpackConstElement(m_builder, res[0],2);
+   res2[3]=unpackConstElement(m_builder, res[0],3);
+/*res[0] = alignedArrayLoad(xChannel);
    res[1] = alignedArrayLoad(yChannel);
    res[2] = alignedArrayLoad(zChannel);
-   res[3] = alignedArrayLoad(wChannel);
+   res[3] = alignedArrayLoad(wChannel);*/
 
-   return res;
+   return res2;
 }
 
 std::vector<llvm::Value*> StorageSoa::outputElement(llvm::Value *idx)
@@ -260,6 +302,12 @@ llvm::Module * StorageSoa::currentModule() const
     return m_block->getParent()->getParent();
 }
 
+llvm::Constant * StorageSoa::createConstGlobalFloat(const float val)
+{
+   Constant*c = ConstantFP::get(APFloat(val));
+   return c;
+}
+
 llvm::Constant * StorageSoa::createConstGlobalVector(const std::vector<float> &vec)
 {
    VectorType *vectorType = VectorType::get(Type::FloatTy, 4);
@@ -278,7 +326,7 @@ llvm::Constant * StorageSoa::createConstGlobalVector(const std::vector<float> &v
 }
 
 std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, int swizzle,
-                                           llvm::Value *indIdx)
+                                           llvm::IRBuilder<>* m_builder,llvm::Value *indIdx)
 {
    std::vector<llvm::Value*> val(4);
 
@@ -302,7 +350,8 @@ std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, in
       val = tempElement(realIndex);
       break;
    case TGSI_FILE_CONSTANT:
-      val = constElement(realIndex);
+      val = constElement(m_builder, realIndex);
+      printf("constant COUCOU index %d\n",realIndex);
       break;
    case TGSI_FILE_IMMEDIATE:
       val = immediateElement(realIndex);
diff --git a/src/gallium/auxiliary/gallivm/storagesoa.h b/src/gallium/auxiliary/gallivm/storagesoa.h
index ae2fc7c6ae..f21ca6ec43 100644
--- a/src/gallium/auxiliary/gallivm/storagesoa.h
+++ b/src/gallium/auxiliary/gallivm/storagesoa.h
@@ -29,6 +29,7 @@
 #define STORAGESOA_H
 
 #include <pipe/p_shader_tokens.h>
+#include <llvm/Support/IRBuilder.h>
 
 #include <vector>
 #include <list>
@@ -56,7 +57,7 @@ public:
 
 
    std::vector<llvm::Value*> load(enum tgsi_file_type type, int idx, int swizzle, 
-                                  llvm::Value *indIdx =0);
+                                  llvm::IRBuilder<>* m_builder, llvm::Value *indIdx =0);
    void store(enum tgsi_file_type type, int idx, const std::vector<llvm::Value*> &val,
               int mask);
 
@@ -76,10 +77,12 @@ private:
    const char *name(const char *prefix) const;
    llvm::Value  *alignedArrayLoad(llvm::Value *val);
    llvm::Module *currentModule() const;
+   llvm::Constant  *createConstGlobalFloat(const float val);
    llvm::Constant  *createConstGlobalVector(const std::vector<float> &vec);
 
    std::vector<llvm::Value*> inputElement(llvm::Value *indIdx);
-   std::vector<llvm::Value*> constElement(llvm::Value *indIdx);
+   llvm::Value* unpackConstElement(llvm::IRBuilder<>* m_builder, llvm::Value *indIdx, int cc);
+   std::vector<llvm::Value*> constElement(llvm::IRBuilder<>* m_builder, llvm::Value *indIdx);
    std::vector<llvm::Value*> outputElement(llvm::Value *indIdx);
    std::vector<llvm::Value*> tempElement(llvm::Value *indIdx);
    std::vector<llvm::Value*> immediateElement(llvm::Value *indIdx);
diff --git a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
index 7292c0e366..1191a6cae9 100644
--- a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
+++ b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
@@ -52,7 +52,7 @@ static inline FunctionType *vertexShaderFunctionType()
    // pass are castable to the following:
    // [4 x <4 x float>] inputs,
    // [4 x <4 x float>] output,
-   // [4 x [4 x float]] consts,
+   // [1 x [4 x float]] consts,
    // [4 x <4 x float>] temps
 
    std::vector<const Type*> funcArgs;
@@ -61,7 +61,7 @@ static inline FunctionType *vertexShaderFunctionType()
    PointerType *vectorArrayPtr = PointerType::get(vectorArray, 0);
 
    ArrayType   *floatArray     = ArrayType::get(Type::FloatTy, 4);
-   ArrayType   *constsArray    = ArrayType::get(floatArray, 4);
+   ArrayType   *constsArray    = ArrayType::get(floatArray, 1);
    PointerType *constsArrayPtr = PointerType::get(constsArray, 0);
 
    funcArgs.push_back(vectorArrayPtr);//inputs
@@ -246,6 +246,7 @@ translate_instruction(llvm::Module *module,
          val = storage->constElement(src->SrcRegister.Index, indIdx);
       } else if (src->SrcRegister.File == TGSI_FILE_INPUT) {
          val = storage->inputElement(src->SrcRegister.Index, indIdx);
+      // FIXME we should not be generating elements for temporaries, this creates useless memory writes
       } else if (src->SrcRegister.File == TGSI_FILE_TEMPORARY) {
          val = storage->tempElement(src->SrcRegister.Index);
       } else if (src->SrcRegister.File == TGSI_FILE_OUTPUT) {
@@ -676,6 +677,7 @@ translate_instruction(llvm::Module *module,
 
       if (dst->DstRegister.File == TGSI_FILE_OUTPUT) {
          storage->setOutputElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
+      // FIXME we should not be generating elements for temporaries, this creates useless memory writes
       } else if (dst->DstRegister.File == TGSI_FILE_TEMPORARY) {
          storage->setTempElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
       } else if (dst->DstRegister.File == TGSI_FILE_ADDRESS) {
@@ -707,9 +709,8 @@ translate_instructionir(llvm::Module *module,
       if (src->SrcRegister.Indirect) {
          indIdx = storage->addrElement(src->SrcRegisterInd.Index);
       }
-
       val = storage->load((enum tgsi_file_type)src->SrcRegister.File,
-                          src->SrcRegister.Index, swizzle, indIdx);
+                          src->SrcRegister.Index, swizzle, instr->getIRBuilder(), indIdx);
 
       inputs[i] = val;
    }
@@ -1025,7 +1026,6 @@ translate_instructionir(llvm::Module *module,
    /* store results  */
    for (int i = 0; i < inst->Instruction.NumDstRegs; ++i) {
       struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i];
-
       storage->store((enum tgsi_file_type)dst->DstRegister.File,
                      dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
    }
-- 
cgit v1.2.3


From 85e578bbc7032b356b436b282534c765ef35f064 Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Tue, 7 Oct 2008 21:13:49 +0200
Subject: Gallivm: don't say hello, it's rude.

---
 src/gallium/auxiliary/gallivm/storagesoa.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/gallium/auxiliary/gallivm/storagesoa.cpp b/src/gallium/auxiliary/gallivm/storagesoa.cpp
index 646b9d7ca0..d4ecf97c36 100644
--- a/src/gallium/auxiliary/gallivm/storagesoa.cpp
+++ b/src/gallium/auxiliary/gallivm/storagesoa.cpp
@@ -351,7 +351,6 @@ std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, in
       break;
    case TGSI_FILE_CONSTANT:
       val = constElement(m_builder, realIndex);
-      printf("constant COUCOU index %d\n",realIndex);
       break;
    case TGSI_FILE_IMMEDIATE:
       val = immediateElement(realIndex);
-- 
cgit v1.2.3


From 4ccbee24391823cc559bbb341f62fa375af864f7 Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Tue, 7 Oct 2008 21:21:20 +0200
Subject: Progs: add a trivial glsl test, useful for gallium driver
 bringup/debug.

---
 progs/glsl/identity.c | 282 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 282 insertions(+)
 create mode 100644 progs/glsl/identity.c

diff --git a/progs/glsl/identity.c b/progs/glsl/identity.c
new file mode 100644
index 0000000000..a2a1991529
--- /dev/null
+++ b/progs/glsl/identity.c
@@ -0,0 +1,282 @@
+/**
+ * Test very basic glsl functionality (identity vertex and fragment shaders).
+ * Brian Paul
+ * 2 May 2007
+ *
+ * NOTE: resize the window to observe how the partial derivatives of
+ * the texcoords change.
+ */
+
+
+#include <assert.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <GL/gl.h>
+#include <GL/glut.h>
+#include <GL/glext.h>
+#include "extfuncs.h"
+
+
+static char *FragProgFile = NULL;
+static char *VertProgFile = NULL;
+static GLuint fragShader;
+static GLuint vertShader;
+static GLuint program;
+static GLint win = 0;
+static GLboolean anim = GL_TRUE;
+static GLfloat xRot = 0.0f, yRot = 0.0f;
+static int w,h;
+
+static void
+Redisplay(void)
+{
+   glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
+
+   glBegin(GL_TRIANGLES);
+   glColor3f(.8,0,0); 
+   glVertex3f(-0.9, -0.9, 0.0);
+   glColor3f(0,.9,0); 
+   glVertex3f( 0.9, -0.9, 0.0);
+   glColor3f(0,0,.7); 
+   glVertex3f( 0.0,  0.9, 0.0);
+   glEnd();
+
+   glutSwapBuffers();
+}
+
+
+static void
+Idle(void)
+{
+   yRot = glutGet(GLUT_ELAPSED_TIME) * 0.1;
+   glutPostRedisplay();
+}
+
+
+static void
+Reshape(int width, int height)
+{
+   glViewport(0, 0, width, height);
+   glMatrixMode(GL_PROJECTION);
+   glLoadIdentity();
+   glMatrixMode(GL_MODELVIEW);
+   glLoadIdentity();
+   w = width;
+   h = height;
+}
+
+
+static void
+CleanUp(void)
+{
+   glDeleteShader_func(fragShader);
+   glDeleteShader_func(vertShader);
+   glDeleteProgram_func(program);
+   glutDestroyWindow(win);
+}
+
+
+static void
+Key(unsigned char key, int x, int y)
+{
+  (void) x;
+  (void) y;
+
+   switch(key) {
+   case ' ':
+   case 'a':
+      anim = !anim;
+      if (anim)
+         glutIdleFunc(Idle);
+      else
+         glutIdleFunc(NULL);
+      break;
+   case 27:
+      CleanUp();
+      exit(0);
+      break;
+   }
+   glutPostRedisplay();
+}
+
+
+static void
+SpecialKey(int key, int x, int y)
+{
+   const GLfloat step = 3.0f;
+
+  (void) x;
+  (void) y;
+
+   switch(key) {
+   case GLUT_KEY_UP:
+      xRot -= step;
+      break;
+   case GLUT_KEY_DOWN:
+      xRot += step;
+      break;
+   case GLUT_KEY_LEFT:
+      yRot -= step;
+      break;
+   case GLUT_KEY_RIGHT:
+      yRot += step;
+      break;
+   }
+   glutPostRedisplay();
+}
+
+
+
+
+static void
+LoadAndCompileShader(GLuint shader, const char *text)
+{
+   GLint stat;
+
+   glShaderSource_func(shader, 1, (const GLchar **) &text, NULL);
+
+   glCompileShader_func(shader);
+
+   glGetShaderiv_func(shader, GL_COMPILE_STATUS, &stat);
+   if (!stat) {
+      GLchar log[1000];
+      GLsizei len;
+      glGetShaderInfoLog_func(shader, 1000, &len, log);
+      fprintf(stderr, "fslight: problem compiling shader:\n%s\n", log);
+      exit(1);
+   }
+}
+
+
+/**
+ * Read at most max-1 bytes of shader source from a file and compile it.
+ */
+static void
+ReadShader(GLuint shader, const char *filename)
+{
+   const int max = 100*1000;
+   int n;
+   char *buffer = (char*) malloc(max);
+   FILE *f = fopen(filename, "r");
+   if (!f) {
+      fprintf(stderr, "fslight: Unable to open shader file %s\n", filename);
+      exit(1);
+   }
+
+   n = fread(buffer, 1, max - 1, f); /* leave room for the terminator */
+   printf("fslight: read %d bytes from shader file %s\n", n, filename);
+   if (n > 0) {
+      buffer[n] = 0; /* n <= max-1, so this stays inside the buffer */
+      LoadAndCompileShader(shader, buffer);
+   }
+
+   fclose(f);
+   free(buffer);
+}
+
+
+static void
+CheckLink(GLuint prog)
+{
+   GLint stat;
+   glGetProgramiv_func(prog, GL_LINK_STATUS, &stat);
+   if (!stat) {
+      GLchar log[1000];
+      GLsizei len;
+      glGetProgramInfoLog_func(prog, 1000, &len, log);
+      fprintf(stderr, "Linker error:\n%s\n", log);
+   }
+}
+
+
+static void
+Init(void)
+{
+   static const char *fragShaderText =
+      "void main() {\n"
+      "   gl_FragColor = vec4(1.0,0.0,0.0,1.0);\n"
+      "}\n";
+   static const char *vertShaderText =
+      "void main() {\n"
+      "   gl_Position = gl_Vertex;\n"
+      "}\n";
+   const char *version;
+
+   version = (const char *) glGetString(GL_VERSION);
+   if (version[0] != '2' || version[1] != '.') {
+      printf("This program requires OpenGL 2.x, found %s\n", version);
+      exit(1);
+   }
+
+   GetExtensionFuncs();
+
+   fragShader = glCreateShader_func(GL_FRAGMENT_SHADER);
+   if (FragProgFile)
+      ReadShader(fragShader, FragProgFile);
+   else
+      LoadAndCompileShader(fragShader, fragShaderText);
+
+   vertShader = glCreateShader_func(GL_VERTEX_SHADER);
+   if (VertProgFile)
+      ReadShader(vertShader, VertProgFile);
+   else
+      LoadAndCompileShader(vertShader, vertShaderText);
+
+   program = glCreateProgram_func();
+   glAttachShader_func(program, fragShader);
+   glAttachShader_func(program, vertShader);
+   glLinkProgram_func(program);
+   CheckLink(program);
+   glUseProgram_func(program);
+
+   /*assert(glGetError() == 0);*/
+
+   glClearColor(0.3f, 0.3f, 0.3f, 0.0f);
+   glEnable(GL_DEPTH_TEST);
+
+   printf("GL_RENDERER = %s\n",(const char *) glGetString(GL_RENDERER));
+
+   assert(glIsProgram_func(program));
+   assert(glIsShader_func(fragShader));
+   assert(glIsShader_func(vertShader));
+
+   glColor3f(1, 0, 0);
+}
+
+
+static void
+ParseOptions(int argc, char *argv[])
+{
+   int i;
+   for (i = 1; i < argc; i++) {
+      if (strcmp(argv[i], "-fs") == 0) {
+         FragProgFile = argv[i+1];
+      }
+      else if (strcmp(argv[i], "-vs") == 0) {
+         VertProgFile = argv[i+1];
+      }
+   }
+}
+
+
+int
+main(int argc, char *argv[])
+{
+   glutInit(&argc, argv);
+   glutInitWindowPosition( 0, 0);
+   glutInitWindowSize(200, 200);
+   glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE | GLUT_DEPTH);
+   win = glutCreateWindow(argv[0]);
+   glutReshapeFunc(Reshape);
+   glutKeyboardFunc(Key);
+   glutSpecialFunc(SpecialKey);
+   glutDisplayFunc(Redisplay);
+   if (anim)
+      glutIdleFunc(Idle);
+   ParseOptions(argc, argv);
+   Init();
+   glutMainLoop();
+   return 0;
+}
-- 
cgit v1.2.3


From 4f1dafaa82985bf0f04a16ba2ba2d1e8ccf83724 Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Tue, 7 Oct 2008 21:28:38 +0200
Subject: Mesa: fix the case where there are no vertex attributes.

This is a backport of 8e8208d6db8b764568539784a6473d545dec2265 to gallium-0.1
---
 src/mesa/state_tracker/st_draw.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index bdf8648ef7..61949a9388 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -525,6 +525,8 @@ st_draw_vbo(GLcontext *ctx,
                                 vbuffer, velements);
       num_vbuffers = 1;
       num_velements = vp->num_inputs;
+      if (num_velements == 0)
+         num_vbuffers = 0;
    }
    else {
       /*printf("Draw non-interleaved\n");*/
-- 
cgit v1.2.3


From f192ad5ebca138a21fd372fa268ba2b0f4f8b147 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 7 Oct 2008 14:33:16 -0600
Subject: gallium: added general-purpose key->data map/lookup container

---
 src/gallium/auxiliary/util/Makefile   |   1 +
 src/gallium/auxiliary/util/SConscript |   5 +-
 src/gallium/auxiliary/util/u_keymap.c | 309 ++++++++++++++++++++++++++++++++++
 src/gallium/auxiliary/util/u_keymap.h |  68 ++++++++
 4 files changed, 381 insertions(+), 2 deletions(-)
 create mode 100644 src/gallium/auxiliary/util/u_keymap.c
 create mode 100644 src/gallium/auxiliary/util/u_keymap.h

diff --git a/src/gallium/auxiliary/util/Makefile b/src/gallium/auxiliary/util/Makefile
index d3951e4e7d..b3d1045a8f 100644
--- a/src/gallium/auxiliary/util/Makefile
+++ b/src/gallium/auxiliary/util/Makefile
@@ -10,6 +10,7 @@ C_SOURCES = \
 	u_gen_mipmap.c \
 	u_handle_table.c \
 	u_hash_table.c \
+	u_keymap.c \
 	u_math.c \
 	u_mm.c \
 	u_rect.c \
diff --git a/src/gallium/auxiliary/util/SConscript b/src/gallium/auxiliary/util/SConscript
index e65c17b1cc..8a04955a16 100644
--- a/src/gallium/auxiliary/util/SConscript
+++ b/src/gallium/auxiliary/util/SConscript
@@ -11,13 +11,14 @@ util = env.ConvenienceLibrary(
 		'u_gen_mipmap.c',
 		'u_handle_table.c',
 		'u_hash_table.c',
+		'u_keymap.c',
 		'u_math.c',
 		'u_mm.c',
 		'u_rect.c',
 		'u_simple_shaders.c',
 		'u_snprintf.c',
-        'u_stream_stdc.c',
-        'u_stream_wd.c',
+		'u_stream_stdc.c',
+		'u_stream_wd.c',
 		'u_tile.c',
 		'u_time.c',
 	])
diff --git a/src/gallium/auxiliary/util/u_keymap.c b/src/gallium/auxiliary/util/u_keymap.c
new file mode 100644
index 0000000000..01b17ddb1b
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_keymap.c
@@ -0,0 +1,309 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * Key lookup/associative container.
+ *
+ * Like Jose's u_hash_table, based on CSO cache code for now.
+ *
+ * Author: Brian Paul
+ */
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_debug.h"
+#include "pipe/p_error.h"
+
+#include "cso_cache/cso_hash.h"
+
+#include "util/u_memory.h"
+#include "util/u_keymap.h"
+
+
+struct keymap
+{
+   struct cso_hash *cso;   /* hash table of keymap_item pointers */
+   unsigned key_size;      /* size of every key, in bytes */
+   unsigned max_entries; /* XXX not obeyed yet */
+   unsigned num_entries;   /* entries currently stored */
+   keymap_delete_func delete_func; /* called when an entry's data is dropped */
+};
+
+
+struct keymap_item
+{
+   void *key, *value;
+};
+
+
+/**
+ * This is the default delete callback, used when the client doesn't
+ * provide one.  It frees the data pointer; map, key and user are unused.
+ */
+static void
+default_delete_func(const struct keymap *map,
+                    const void *key, void *data, void *user)
+{
+   FREE((void*) data);
+}
+
+
+static INLINE struct keymap_item *
+hash_table_item(struct cso_hash_iter iter)
+{
+   return (struct keymap_item *) cso_hash_iter_data(iter);
+}
+
+
+/**
+ * Return a hash of a key: XOR of each whole uint multiplied by (index + 1).
+ */
+static unsigned
+hash(const void *key, unsigned keySize)
+{
+   unsigned i, hash;
+
+   keySize /= 4; /* bytes -> whole uints; leftover trailing bytes aren't hashed */
+
+   hash = 0;
+   for (i = 0; i < keySize; i++) {
+      hash ^= (i + 1) * ((const unsigned *) key)[i];
+   }
+
+   /*hash = hash ^ (hash >> 11) ^ (hash >> 22);*/
+
+   return hash;
+}
+
+
+/**
+ * Create a new map.
+ * \param keySize  size of the keys in bytes
+ * \param maxEntries  max number of entries to allow (~0 = infinity)
+ * \param deleteFunc  optional callback to call when entries
+ *                    are deleted/replaced
+ */
+struct keymap *
+util_new_keymap(unsigned keySize, unsigned maxEntries,
+                 keymap_delete_func deleteFunc)
+{
+   struct keymap *map = MALLOC_STRUCT(keymap);
+   if (!map)
+      return NULL;
+   
+   map->cso = cso_hash_create();
+   if (!map->cso) {
+      FREE(map);
+      return NULL;
+   }
+   
+   map->max_entries = maxEntries;
+   map->num_entries = 0;
+   map->key_size = keySize;
+   map->delete_func = deleteFunc ? deleteFunc : default_delete_func;
+
+   return map;
+}
+
+
+/**
+ * Delete/free a keymap and all entries.  The deleteFunc that was given at
+ * create time will be called for each entry.
+ * \param user  user-provided pointer passed through to the delete callback
+ */
+void
+util_delete_keymap(struct keymap *map, void *user)
+{
+   util_keymap_remove_all(map, user);
+   cso_hash_delete(map->cso);
+   FREE(map);
+}
+
+
+static INLINE struct cso_hash_iter
+hash_table_find_iter(const struct keymap *map, const void *key,
+                     unsigned key_hash)
+{
+   struct cso_hash_iter iter;
+   struct keymap_item *item;
+   
+   iter = cso_hash_find(map->cso, key_hash);
+   while (!cso_hash_iter_is_null(iter)) {
+      item = (struct keymap_item *) cso_hash_iter_data(iter);
+      if (!memcmp(item->key, key, map->key_size))
+         break;
+      iter = cso_hash_iter_next(iter);
+   }
+   
+   return iter;
+}
+
+
+static INLINE struct keymap_item *
+hash_table_find_item(const struct keymap *map, const void *key,
+                     unsigned key_hash)
+{
+   struct cso_hash_iter iter = hash_table_find_iter(map, key, key_hash);
+   if (cso_hash_iter_is_null(iter)) {
+      return NULL;
+   }
+   else {
+      return hash_table_item(iter);
+   }
+}
+
+
+/**
+ * Insert a new key + data pointer into the table.
+ * Note: we create a copy of the key, but not the data!
+ * If the key is already present in the table, replace the existing
+ * entry's data (calling the delete callback on the previous data).
+ * XXX max_entries is not enforced here yet; nothing is ever evicted.
+ * \return TRUE on success, FALSE if allocation or the hash insert failed.
+ */
+boolean
+util_keymap_insert(struct keymap *map, const void *key,
+                   const void *data, void *user)
+{
+   unsigned key_hash;
+   struct keymap_item *item;
+   struct cso_hash_iter iter;
+
+   assert(map);
+
+   key_hash = hash(key, map->key_size);
+
+   item = hash_table_find_item(map, key, key_hash);
+   if (item) {
+      /* call delete callback for old entry/item */
+      map->delete_func(map, item->key, item->value, user);
+      item->value = (void *) data;
+      return TRUE;
+   }
+
+   item = MALLOC_STRUCT(keymap_item);
+   if (!item)
+      return FALSE;
+
+   item->key = mem_dup(key, map->key_size);
+   item->value = (void *) data;
+
+   iter = cso_hash_insert(map->cso, key_hash, item);
+   if (cso_hash_iter_is_null(iter)) {
+      FREE(item->key); /* the key copy is owned by the item; don't leak it */
+      FREE(item);
+      return FALSE;
+   }
+   map->num_entries++;
+
+   return TRUE;
+}
+
+
+/**
+ * Look up a key in the map and return the associated data pointer.
+ */
+const void *
+util_keymap_lookup(const struct keymap *map, const void *key)
+{
+   unsigned key_hash;
+   struct keymap_item *item;
+
+   assert(map);
+
+   key_hash = hash(key, map->key_size);
+
+   item = hash_table_find_item(map, key, key_hash);
+   if (!item)
+      return NULL;
+   
+   return item->value;
+}
+
+
+/**
+ * Remove an entry from the map.
+ * The delete callback will be called if the given key/entry is found.
+ * \param user  passed to the delete callback as the last param.
+ */
+void
+util_keymap_remove(struct keymap *map, const void *key, void *user)
+{
+   unsigned key_hash;
+   struct cso_hash_iter iter;
+   struct keymap_item *item;
+
+   assert(map);
+
+   key_hash = hash(key, map->key_size);
+
+   iter = hash_table_find_iter(map, key, key_hash);
+   if (cso_hash_iter_is_null(iter))
+      return;
+   
+   item = hash_table_item(iter);
+   assert(item);
+   map->delete_func(map, item->key, item->value, user);
+   FREE(item->key);
+   FREE(item);
+   
+   map->num_entries--;
+
+   cso_hash_erase(map->cso, iter);
+}
+
+
+/**
+ * Remove all entries from the map, calling the delete callback for each.
+ * \param user  passed to the delete callback as the last param.
+ */
+void
+util_keymap_remove_all(struct keymap *map, void *user)
+{
+   struct cso_hash_iter iter;
+   struct keymap_item *item;
+
+   assert(map);
+   iter = cso_hash_first_node(map->cso);
+   while (!cso_hash_iter_is_null(iter)) {
+      item = (struct keymap_item *)
+         cso_hash_take(map->cso, cso_hash_iter_key(iter));
+      map->delete_func(map, item->key, item->value, user);
+      FREE(item->key);
+      FREE(item);
+      iter = cso_hash_first_node(map->cso);
+   }
+   map->num_entries = 0; /* keep the count in step with the emptied hash */
+}
+
+
+extern void
+util_keymap_info(const struct keymap *map)
+{
+   debug_printf("Keymap %p: %u of max %u entries\n",
+                (void *) map, map->num_entries, map->max_entries);
+}
diff --git a/src/gallium/auxiliary/util/u_keymap.h b/src/gallium/auxiliary/util/u_keymap.h
new file mode 100644
index 0000000000..8d60a76fc3
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_keymap.h
@@ -0,0 +1,68 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef U_KEYMAP_H
+#define U_KEYMAP_H
+
+#include "pipe/p_compiler.h"
+
+
+/** opaque keymap type */
+struct keymap;
+
+
+/** Delete/callback function type */
+typedef void (*keymap_delete_func)(const struct keymap *map,
+                                   const void *key, void *data,
+                                   void *user);
+
+
+extern struct keymap *
+util_new_keymap(unsigned keySize, unsigned maxEntries,
+                keymap_delete_func deleteFunc);
+
+extern void
+util_delete_keymap(struct keymap *map, void *user);
+
+extern boolean
+util_keymap_insert(struct keymap *map, const void *key,
+                   const void *data, void *user);
+
+extern const void *
+util_keymap_lookup(const struct keymap *map, const void *key);
+
+extern void
+util_keymap_remove(struct keymap *map, const void *key, void *user);
+
+extern void
+util_keymap_remove_all(struct keymap *map, void *user);
+
+extern void
+util_keymap_info(const struct keymap *map);
+
+
+#endif /* U_KEYMAP_H */
-- 
cgit v1.2.3


From 44799c3b7e0e4260b93e68a5da5a03c9279ac26a Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 7 Oct 2008 14:34:08 -0600
Subject: cell: use new keymap to save/re-use fragment ops code

---
 src/gallium/drivers/cell/ppu/cell_context.c    |  6 ++
 src/gallium/drivers/cell/ppu/cell_context.h    | 17 ++++++
 src/gallium/drivers/cell/ppu/cell_state_emit.c | 80 ++++++++++++++++++++++++++
 3 files changed, 103 insertions(+)

diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index 62e213ea35..30ce6f9762 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -62,6 +62,8 @@ cell_destroy_context( struct pipe_context *pipe )
 {
    struct cell_context *cell = cell_context(pipe);
 
+   util_delete_keymap(cell->fragment_ops_cache, NULL);
+
    cell_spu_exit(cell);
 
    align_free(cell);
@@ -131,6 +133,10 @@ cell_create_context(struct pipe_screen *screen,
 
    cell->draw = cell_draw_create(cell);
 
+   /* Create cache of fragment ops generated code */
+   cell->fragment_ops_cache =
+      util_new_keymap(sizeof(struct cell_fragment_ops_key), ~0, NULL);
+
    cell_init_vbuf(cell);
 
    draw_set_rasterize_stage(cell->draw, cell->vbuf);
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 3dc15c9233..80a9b3d7e1 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -38,6 +38,7 @@
 #include "cell/common.h"
 #include "rtasm/rtasm_ppc_spe.h"
 #include "tgsi/tgsi_scan.h"
+#include "util/u_keymap.h"
 
 
 struct cell_vbuf_render;
@@ -66,6 +67,19 @@ struct cell_fragment_shader_state
 };
 
 
+/**
+ * Key for mapping per-fragment state to cached SPU machine code.
+ *  keymap(cell_fragment_ops_key) => cell_command_fragment_ops
+ */
+struct cell_fragment_ops_key
+{
+   struct pipe_blend_state blend;
+   struct pipe_depth_stencil_alpha_state dsa;
+   enum pipe_format color_format;
+   enum pipe_format zs_format;
+};
+
+
 /**
  * Per-context state, subclass of pipe_context.
  */
@@ -107,6 +121,9 @@ struct cell_context
 
    uint dirty;
 
+   /** Cache of code generated for per-fragment ops */
+   struct keymap *fragment_ops_cache;
+
    /** The primitive drawing context */
    struct draw_context *draw;
    struct draw_stage *render_stage;
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index f35893537b..b00c41f47d 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -36,6 +36,78 @@
 #include "draw/draw_private.h"
 
 
+/**
+ * Find/create a cell_command_fragment_ops object corresponding to the
+ * current blend/stencil/z/colormask/etc. state.
+ */
+static struct cell_command_fragment_ops *
+lookup_fragment_ops(struct cell_context *cell)
+{
+   struct cell_fragment_ops_key key;
+   struct cell_command_fragment_ops *ops;
+
+   /*
+    * Build key
+    */
+   key.blend = *cell->blend;
+   key.dsa = *cell->depth_stencil;
+
+   if (cell->framebuffer.cbufs[0])
+      key.color_format = cell->framebuffer.cbufs[0]->format;
+   else
+      key.color_format = PIPE_FORMAT_NONE;
+
+   if (cell->framebuffer.zsbuf)
+      key.zs_format = cell->framebuffer.zsbuf->format;
+   else
+      key.zs_format = PIPE_FORMAT_NONE;
+
+   /*
+    * Look up key in cache.
+    */
+   ops = (struct cell_command_fragment_ops *)
+      util_keymap_lookup(cell->fragment_ops_cache, &key);
+
+   /*
+    * If not found, create/save new fragment ops command.
+    */
+   if (!ops) {
+      struct spe_function spe_code;
+
+      if (0)
+         debug_printf("**** Create New Fragment Ops\n");
+
+      /* Prepare the buffer that will hold the generated code. */
+      spe_init_func(&spe_code, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+
+      /* generate new code */
+      cell_gen_fragment_function(cell, &spe_code);
+
+      /* alloc new fragment ops command */
+      ops = CALLOC_STRUCT(cell_command_fragment_ops);
+
+      /* populate the new cell_command_fragment_ops object */
+      ops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
+      memcpy(ops->code, spe_code.store, spe_code_size(&spe_code));
+      ops->dsa = *cell->depth_stencil;
+      ops->blend = *cell->blend;
+
+      /* insert cell_command_fragment_ops object into keymap/cache */
+      util_keymap_insert(cell->fragment_ops_cache, &key, ops, NULL);
+
+      /* release rtasm buffer */
+      spe_release_func(&spe_code);
+   }
+   else {
+      if (0)
+         debug_printf("**** Re-use Fragment Ops\n");
+   }
+
+   return ops;
+}
+
+
+
 static void
 emit_state_cmd(struct cell_context *cell, uint cmd,
                const void *state, uint state_size)
@@ -92,6 +164,7 @@ cell_emit_state(struct cell_context *cell)
    if (cell->dirty & (CELL_NEW_FRAMEBUFFER |
                       CELL_NEW_DEPTH_STENCIL |
                       CELL_NEW_BLEND)) {
+#if 0
       /* XXX we don't want to always do codegen here.  We should have
        * a hash/lookup table to cache previous results...
        */
@@ -114,6 +187,13 @@ cell_emit_state(struct cell_context *cell)
 
       /* free codegen buffer */
       spe_release_func(&spe_code);
+#else
+      struct cell_command_fragment_ops *fops, *fops_cmd;
+      fops_cmd = cell_batch_alloc(cell, sizeof(*fops_cmd));
+      fops = lookup_fragment_ops(cell);
+      memcpy(fops_cmd, fops, sizeof(*fops));
+#endif
+
    }
 
    if (cell->dirty & CELL_NEW_SAMPLER) {
-- 
cgit v1.2.3


From be3c070b6a86255feb752b7574daff8cb6091b96 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 7 Oct 2008 14:50:06 -0600
Subject: cell: memset() key to zero

---
 src/gallium/drivers/cell/ppu/cell_state_emit.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index b00c41f47d..69c1e4d342 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -49,6 +49,7 @@ lookup_fragment_ops(struct cell_context *cell)
    /*
     * Build key
     */
+   memset(&key, 0, sizeof(key));
    key.blend = *cell->blend;
    key.dsa = *cell->depth_stencil;
 
-- 
cgit v1.2.3


From 6f29c2ff2dc4b3aefe282133376caed68b65a3d0 Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Tue, 7 Oct 2008 23:42:36 +0200
Subject: Progs: hook the glsl identity example into the makefile.

---
 progs/glsl/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/progs/glsl/Makefile b/progs/glsl/Makefile
index 9999d6c08a..eacd6dfe09 100644
--- a/progs/glsl/Makefile
+++ b/progs/glsl/Makefile
@@ -15,6 +15,7 @@ PROGS = \
 	bump \
 	convolutions \
 	deriv \
+	identity \
 	mandelbrot \
 	multitex \
 	noise \
-- 
cgit v1.2.3


From a0809c527105496f0dac234bee72d67abd2d2b17 Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Tue, 7 Oct 2008 23:43:21 +0200
Subject: Gallivm: reorder the functions alphabetically so I can work on it.

---
 src/gallium/auxiliary/gallivm/instructionssoa.cpp | 240 +++++++++++-----------
 1 file changed, 119 insertions(+), 121 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.cpp b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
index 1143ee0b0b..d5600fd22d 100644
--- a/src/gallium/auxiliary/gallivm/instructionssoa.cpp
+++ b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
@@ -90,68 +90,11 @@ llvm::Value * InstructionsSoa::vectorFromVals(llvm::Value *x, llvm::Value *y,
    return res;
 }
 
-std::vector<llvm::Value*> InstructionsSoa::arl(const std::vector<llvm::Value*> in)
-{
-   std::vector<llvm::Value*> res(4);
-
-   //Extract x's
-   llvm::Value *x1 = m_builder.CreateExtractElement(in[0],
-                                                    m_storage->constantInt(0),
-                                                    name("extractX"));
-   //cast it to an unsigned int
-   x1 = m_builder.CreateFPToUI(x1, IntegerType::get(32), name("x1IntCast"));
-
-   res[0] = x1;//vectorFromVals(x1, x2, x3, x4);
-   //only x is valid. the others shouldn't be necessary
-   /*
-   res[1] = Constant::getNullValue(m_floatVecType);
-   res[2] = Constant::getNullValue(m_floatVecType);
-   res[3] = Constant::getNullValue(m_floatVecType);
-   */
-
-   return res;
-}
-
-
-std::vector<llvm::Value*> InstructionsSoa::add(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   std::vector<llvm::Value*> res(4);
-
-   res[0] = m_builder.CreateAdd(in1[0], in2[0], name("addx"));
-   res[1] = m_builder.CreateAdd(in1[1], in2[1], name("addy"));
-   res[2] = m_builder.CreateAdd(in1[2], in2[2], name("addz"));
-   res[3] = m_builder.CreateAdd(in1[3], in2[3], name("addw"));
-
-   return res;
-}
-
-std::vector<llvm::Value*> InstructionsSoa::mul(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   std::vector<llvm::Value*> res(4);
-
-   res[0] = m_builder.CreateMul(in1[0], in2[0], name("mulx"));
-   res[1] = m_builder.CreateMul(in1[1], in2[1], name("muly"));
-   res[2] = m_builder.CreateMul(in1[2], in2[2], name("mulz"));
-   res[3] = m_builder.CreateMul(in1[3], in2[3], name("mulw"));
-
-   return res;
-}
-
 void InstructionsSoa::end()
 {
    m_builder.CreateRetVoid();
 }
 
-std::vector<llvm::Value*> InstructionsSoa::madd(const std::vector<llvm::Value*> in1,
-                                                const std::vector<llvm::Value*> in2,
-                                                const std::vector<llvm::Value*> in3)
-{
-   std::vector<llvm::Value*> res = mul(in1, in2);
-   return add(res, in3);
-}
-
 std::vector<llvm::Value*> InstructionsSoa::extractVector(llvm::Value *vector)
 {
    std::vector<llvm::Value*> res(4);
@@ -279,6 +222,41 @@ std::vector<llvm::Value*> InstructionsSoa::abs(const std::vector<llvm::Value*> i
    return callBuiltin(func, in1);
 }
 
+std::vector<llvm::Value*> InstructionsSoa::add(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   std::vector<llvm::Value*> res(4);
+
+   res[0] = m_builder.CreateAdd(in1[0], in2[0], name("addx"));
+   res[1] = m_builder.CreateAdd(in1[1], in2[1], name("addy"));
+   res[2] = m_builder.CreateAdd(in1[2], in2[2], name("addz"));
+   res[3] = m_builder.CreateAdd(in1[3], in2[3], name("addw"));
+
+   return res;
+}
+
+std::vector<llvm::Value*> InstructionsSoa::arl(const std::vector<llvm::Value*> in)
+{
+   std::vector<llvm::Value*> res(4);
+
+   //Extract x's
+   llvm::Value *x1 = m_builder.CreateExtractElement(in[0],
+                                                    m_storage->constantInt(0),
+                                                    name("extractX"));
+   //cast it to an unsigned int
+   x1 = m_builder.CreateFPToUI(x1, IntegerType::get(32), name("x1IntCast"));
+
+   res[0] = x1;//vectorFromVals(x1, x2, x3, x4);
+   //only x is valid. the others shouldn't be necessary
+   /*
+   res[1] = Constant::getNullValue(m_floatVecType);
+   res[2] = Constant::getNullValue(m_floatVecType);
+   res[3] = Constant::getNullValue(m_floatVecType);
+   */
+
+   return res;
+}
+
 std::vector<llvm::Value*> InstructionsSoa::dp3(const std::vector<llvm::Value*> in1,
                                                const std::vector<llvm::Value*> in2)
 {
@@ -286,6 +264,59 @@ std::vector<llvm::Value*> InstructionsSoa::dp3(const std::vector<llvm::Value*> i
    return callBuiltin(func, in1, in2);
 }
 
+std::vector<llvm::Value*> InstructionsSoa::lit(const std::vector<llvm::Value*> in)
+{
+   llvm::Function *func = function(TGSI_OPCODE_LIT);
+   return callBuiltin(func, in);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::madd(const std::vector<llvm::Value*> in1,
+                                                const std::vector<llvm::Value*> in2,
+                                                const std::vector<llvm::Value*> in3)
+{
+   std::vector<llvm::Value*> res = mul(in1, in2);
+   return add(res, in3);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::max(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   llvm::Function *func = function(TGSI_OPCODE_MAX);
+   return callBuiltin(func, in1, in2);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::min(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   llvm::Function *func = function(TGSI_OPCODE_MIN);
+   return callBuiltin(func, in1, in2);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::mul(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   std::vector<llvm::Value*> res(4);
+
+   res[0] = m_builder.CreateMul(in1[0], in2[0], name("mulx"));
+   res[1] = m_builder.CreateMul(in1[1], in2[1], name("muly"));
+   res[2] = m_builder.CreateMul(in1[2], in2[2], name("mulz"));
+   res[3] = m_builder.CreateMul(in1[3], in2[3], name("mulw"));
+
+   return res;
+}
+
+std::vector<llvm::Value*> InstructionsSoa::pow(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   llvm::Function *func = function(TGSI_OPCODE_POWER);
+   return callBuiltin(func, in1, in2);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::rsq(const std::vector<llvm::Value*> in)
+{
+   llvm::Function *func = function(TGSI_OPCODE_RSQ);
+   return callBuiltin(func, in);
+}
 
 std::vector<llvm::Value*> InstructionsSoa::slt(const std::vector<llvm::Value*> in1,
                                                const std::vector<llvm::Value*> in2)
@@ -294,6 +325,37 @@ std::vector<llvm::Value*> InstructionsSoa::slt(const std::vector<llvm::Value*> i
    return callBuiltin(func, in1, in2);
 }
 
+std::vector<llvm::Value*> InstructionsSoa::sub(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   std::vector<llvm::Value*> res(4);
+
+   res[0] = m_builder.CreateSub(in1[0], in2[0], name("subx"));
+   res[1] = m_builder.CreateSub(in1[1], in2[1], name("suby"));
+   res[2] = m_builder.CreateSub(in1[2], in2[2], name("subz"));
+   res[3] = m_builder.CreateSub(in1[3], in2[3], name("subw"));
+
+   return res;
+}
+
+void checkFunction(Function *func)
+{
+   for (Function::const_iterator BI = func->begin(), BE = func->end();
+        BI != BE; ++BI) {
+      const BasicBlock &BB = *BI;
+      for (BasicBlock::const_iterator II = BB.begin(), IE = BB.end();
+           II != IE; ++II) {
+         const Instruction &I = *II;
+         std::cout<< "Instr = "<<I;
+         for (unsigned op = 0, E = I.getNumOperands(); op != E; ++op) {
+            const Value *Op = I.getOperand(op);
+            std::cout<< "\top = "<<Op<<"("<<op<<")"<<std::endl;
+            //I->setOperand(op, V);
+  }
+      }
+   }
+}
+
 llvm::Value * InstructionsSoa::allocaTemp()
 {
    VectorType *vector   = VectorType::get(Type::FloatTy, 4);
@@ -413,46 +475,6 @@ std::vector<Value*> InstructionsSoa::callBuiltin(llvm::Function *func, const std
    return allocaToResult(allocaPtr);
 }
 
-std::vector<llvm::Value*> InstructionsSoa::pow(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   llvm::Function *func = function(TGSI_OPCODE_POWER);
-   return callBuiltin(func, in1, in2);
-}
-
-std::vector<llvm::Value*> InstructionsSoa::min(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   llvm::Function *func = function(TGSI_OPCODE_MIN);
-   return callBuiltin(func, in1, in2);
-}
-
-
-std::vector<llvm::Value*> InstructionsSoa::max(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   llvm::Function *func = function(TGSI_OPCODE_MAX);
-   return callBuiltin(func, in1, in2);
-}
-
-void checkFunction(Function *func)
-{
-   for (Function::const_iterator BI = func->begin(), BE = func->end();
-        BI != BE; ++BI) {
-      const BasicBlock &BB = *BI;
-      for (BasicBlock::const_iterator II = BB.begin(), IE = BB.end();
-           II != IE; ++II) {
-         const Instruction &I = *II;
-         std::cout<< "Instr = "<<I;
-         for (unsigned op = 0, E = I.getNumOperands(); op != E; ++op) {
-            const Value *Op = I.getOperand(op);
-            std::cout<< "\top = "<<Op<<"("<<op<<")"<<std::endl;
-            //I->setOperand(op, V);
-  }
-      }
-   }
-}
-
 void InstructionsSoa::injectFunction(llvm::Function *originalFunc, int op)
 {
    assert(originalFunc);
@@ -497,28 +519,4 @@ void InstructionsSoa::injectFunction(llvm::Function *originalFunc, int op)
    }
 }
 
-std::vector<llvm::Value*> InstructionsSoa::sub(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   std::vector<llvm::Value*> res(4);
-
-   res[0] = m_builder.CreateSub(in1[0], in2[0], name("subx"));
-   res[1] = m_builder.CreateSub(in1[1], in2[1], name("suby"));
-   res[2] = m_builder.CreateSub(in1[2], in2[2], name("subz"));
-   res[3] = m_builder.CreateSub(in1[3], in2[3], name("subw"));
-
-   return res;
-}
-
-std::vector<llvm::Value*> InstructionsSoa::lit(const std::vector<llvm::Value*> in)
-{
-   llvm::Function *func = function(TGSI_OPCODE_LIT);
-   return callBuiltin(func, in);
-}
-
-std::vector<llvm::Value*> InstructionsSoa::rsq(const std::vector<llvm::Value*> in)
-{
-   llvm::Function *func = function(TGSI_OPCODE_RSQ);
-   return callBuiltin(func, in);
-}
 
-- 
cgit v1.2.3


From e561058641ca39a676b219a056f889ad99240029 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 7 Oct 2008 14:58:05 -0600
Subject: cell: remove old code

---
 src/gallium/drivers/cell/ppu/cell_state_emit.c | 26 --------------------------
 1 file changed, 26 deletions(-)

diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 69c1e4d342..a36fd3a601 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -165,36 +165,10 @@ cell_emit_state(struct cell_context *cell)
    if (cell->dirty & (CELL_NEW_FRAMEBUFFER |
                       CELL_NEW_DEPTH_STENCIL |
                       CELL_NEW_BLEND)) {
-#if 0
-      /* XXX we don't want to always do codegen here.  We should have
-       * a hash/lookup table to cache previous results...
-       */
-      struct cell_command_fragment_ops *fops
-            = cell_batch_alloc(cell, sizeof(*fops));
-      struct spe_function spe_code;
-
-      /* Prepare the buffer that will hold the generated code. */
-      spe_init_func(&spe_code, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
-
-      /* generate new code */
-      cell_gen_fragment_function(cell, &spe_code);
-
-      /* put the new code into the batch buffer */
-      fops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
-      memcpy(&fops->code, spe_code.store,
-             SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
-      fops->dsa = *cell->depth_stencil;
-      fops->blend = *cell->blend;
-
-      /* free codegen buffer */
-      spe_release_func(&spe_code);
-#else
       struct cell_command_fragment_ops *fops, *fops_cmd;
       fops_cmd = cell_batch_alloc(cell, sizeof(*fops_cmd));
       fops = lookup_fragment_ops(cell);
       memcpy(fops_cmd, fops, sizeof(*fops));
-#endif
-
    }
 
    if (cell->dirty & CELL_NEW_SAMPLER) {
-- 
cgit v1.2.3


From 3008657ceaec3f91386c767c51647729afe16b34 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 7 Oct 2008 15:13:48 -0600
Subject: cell: fix formatting

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 8d2d4f2a0f..5d16fc13fe 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -191,19 +191,16 @@ get_src_reg(struct codegen *gen,
       break;
    case TGSI_FILE_INPUT:
       {
-         if(swizzle == TGSI_EXTSWIZZLE_ONE)
-         {
+         if (swizzle == TGSI_EXTSWIZZLE_ONE) {
             /* Load const one float and early out */
             reg = get_const_one_reg(gen);
          }
-         else if(swizzle == TGSI_EXTSWIZZLE_ZERO)
-         {
+         else if (swizzle == TGSI_EXTSWIZZLE_ZERO) {
             /* Load const zero float and early out */
             reg = get_itemp(gen);
             spe_xor(gen->f, reg, reg, reg);
          }
-         else
-         {
+         else {
             /* offset is measured in quadwords, not bytes */
             int offset = src->SrcRegister.Index * 4 + swizzle;
             reg = get_itemp(gen);
-- 
cgit v1.2.3


From ce416566bc71d2463785a834ffbe14fb5e9eae03 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 7 Oct 2008 16:11:20 -0600
Subject: cell: fix incorrect extended swizzle term code in get_src_reg()

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 50 ++++++++++++++++--------------
 1 file changed, 26 insertions(+), 24 deletions(-)

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 5d16fc13fe..131a2356fe 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -185,22 +185,24 @@ get_src_reg(struct codegen *gen,
    assert(swizzle >= TGSI_SWIZZLE_X);
    assert(swizzle <= TGSI_EXTSWIZZLE_ONE);
 
-   switch (src->SrcRegister.File) {
-   case TGSI_FILE_TEMPORARY:
-      reg = gen->temp_regs[src->SrcRegister.Index][swizzle];
-      break;
-   case TGSI_FILE_INPUT:
-      {
-         if (swizzle == TGSI_EXTSWIZZLE_ONE) {
-            /* Load const one float and early out */
-            reg = get_const_one_reg(gen);
-         }
-         else if (swizzle == TGSI_EXTSWIZZLE_ZERO) {
-            /* Load const zero float and early out */
-            reg = get_itemp(gen);
-            spe_xor(gen->f, reg, reg, reg);
-         }
-         else {
+   if (swizzle == TGSI_EXTSWIZZLE_ONE) {
+      /* Load const one float and early out */
+      reg = get_const_one_reg(gen);
+   }
+   else if (swizzle == TGSI_EXTSWIZZLE_ZERO) {
+      /* Load const zero float and early out */
+      reg = get_itemp(gen);
+      spe_xor(gen->f, reg, reg, reg);
+   }
+   else {
+      assert(swizzle < 4);
+
+      switch (src->SrcRegister.File) {
+      case TGSI_FILE_TEMPORARY:
+         reg = gen->temp_regs[src->SrcRegister.Index][swizzle];
+         break;
+      case TGSI_FILE_INPUT:
+         {
             /* offset is measured in quadwords, not bytes */
             int offset = src->SrcRegister.Index * 4 + swizzle;
             reg = get_itemp(gen);
@@ -208,15 +210,15 @@ get_src_reg(struct codegen *gen,
             /* Load:  reg = memory[(machine_reg) + offset] */
             spe_lqd(gen->f, reg, gen->inputs_reg, offset);
          }
+         break;
+      case TGSI_FILE_IMMEDIATE:
+         reg = gen->imm_regs[src->SrcRegister.Index][swizzle];
+         break;
+      case TGSI_FILE_CONSTANT:
+         /* xxx fall-through for now / fix */
+      default:
+         assert(0);
       }
-      break;
-   case TGSI_FILE_IMMEDIATE:
-      reg = gen->imm_regs[src->SrcRegister.Index][swizzle];
-      break;
-   case TGSI_FILE_CONSTANT:
-      /* xxx fall-through for now / fix */
-   default:
-      assert(0);
    }
 
    /*
-- 
cgit v1.2.3


From 800c350d71132bbb5126bd89310df540332978f4 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 7 Oct 2008 16:14:27 -0600
Subject: cell: add support for fragment shader constant buffers

---
 src/gallium/drivers/cell/common.h                |  1 +
 src/gallium/drivers/cell/ppu/cell_gen_fp.c       | 10 +++++++++-
 src/gallium/drivers/cell/ppu/cell_state.h        |  5 +++--
 src/gallium/drivers/cell/ppu/cell_state_emit.c   | 19 +++++++++++++++++++
 src/gallium/drivers/cell/ppu/cell_state_shader.c |  5 ++++-
 src/gallium/drivers/cell/spu/spu_command.c       | 22 ++++++++++++++++++++++
 src/gallium/drivers/cell/spu/spu_main.h          |  8 +++++---
 7 files changed, 63 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index c223bc1744..d261c1a640 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -94,6 +94,7 @@
 #define CELL_CMD_STATE_BIND_VS       18
 #define CELL_CMD_STATE_FRAGMENT_PROGRAM 19
 #define CELL_CMD_STATE_ATTRIB_FETCH  20
+#define CELL_CMD_STATE_FS_CONSTANTS  21
 #define CELL_CMD_VS_EXECUTE          22
 #define CELL_CMD_FLUSH_BUFFER_RANGE  23
 
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 131a2356fe..3065869d04 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -215,7 +215,15 @@ get_src_reg(struct codegen *gen,
          reg = gen->imm_regs[src->SrcRegister.Index][swizzle];
          break;
       case TGSI_FILE_CONSTANT:
-         /* xxx fall-through for now / fix */
+         {
+            /* offset is measured in quadwords, not bytes */
+            int offset = src->SrcRegister.Index * 4 + swizzle;
+            reg = get_itemp(gen);
+            reg_is_itemp = TRUE;
+            /* Load:  reg = memory[(machine_reg) + offset] */
+            spe_lqd(gen->f, reg, gen->constants_reg, offset);
+         }
+         break;
       default:
          assert(0);
       }
diff --git a/src/gallium/drivers/cell/ppu/cell_state.h b/src/gallium/drivers/cell/ppu/cell_state.h
index a7771a55a3..b193170f9c 100644
--- a/src/gallium/drivers/cell/ppu/cell_state.h
+++ b/src/gallium/drivers/cell/ppu/cell_state.h
@@ -44,8 +44,9 @@
 #define CELL_NEW_TEXTURE       0x800
 #define CELL_NEW_VERTEX        0x1000
 #define CELL_NEW_VS            0x2000
-#define CELL_NEW_CONSTANTS     0x4000
-#define CELL_NEW_VERTEX_INFO   0x8000
+#define CELL_NEW_VS_CONSTANTS  0x4000
+#define CELL_NEW_FS_CONSTANTS  0x8000
+#define CELL_NEW_VERTEX_INFO   0x10000
 
 
 extern void
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index a36fd3a601..cbfa393cfb 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -25,6 +25,7 @@
  * 
  **************************************************************************/
 
+#include "pipe/p_inlines.h"
 #include "util/u_memory.h"
 #include "cell_context.h"
 #include "cell_gen_fragment.h"
@@ -162,6 +163,24 @@ cell_emit_state(struct cell_context *cell)
       }
    }
 
+   if (cell->dirty & (CELL_NEW_FS_CONSTANTS)) {
+      const uint shader = PIPE_SHADER_FRAGMENT;
+      const uint num_const = cell->constants[shader].size / sizeof(float);
+      uint i, j;
+      float *buf = cell_batch_alloc(cell, 16 + num_const * sizeof(float));
+      uint64_t *ibuf = (uint64_t *) buf;
+      const float *constants = pipe_buffer_map(cell->pipe.screen,
+                                               cell->constants[shader].buffer,
+                                               PIPE_BUFFER_USAGE_CPU_READ);
+      ibuf[0] = CELL_CMD_STATE_FS_CONSTANTS;
+      ibuf[1] = num_const;
+      j = 4;
+      for (i = 0; i < num_const; i++) {
+         buf[j++] = constants[i];
+      }
+      pipe_buffer_unmap(cell->pipe.screen, cell->constants[shader].buffer);
+   }
+
    if (cell->dirty & (CELL_NEW_FRAMEBUFFER |
                       CELL_NEW_DEPTH_STENCIL |
                       CELL_NEW_BLEND)) {
diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c
index 3a0d066da2..54a17eaf2b 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c
@@ -197,7 +197,10 @@ cell_set_constant_buffer(struct pipe_context *pipe,
                         buf->buffer);
    cell->constants[shader].size = buf->size;
 
-   cell->dirty |= CELL_NEW_CONSTANTS;
+   if (shader == PIPE_SHADER_VERTEX)
+      cell->dirty |= CELL_NEW_VS_CONSTANTS;
+   else if (shader == PIPE_SHADER_FRAGMENT)
+      cell->dirty |= CELL_NEW_FS_CONSTANTS;
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index ec9da5d887..91a4c137e7 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -231,6 +231,25 @@ cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
 }
 
 
+static uint
+cmd_state_fs_constants(const uint64_t *buffer, uint pos)
+{
+   const uint num_const = buffer[pos + 1];
+   const float *constants = (const float *) &buffer[pos + 2];
+   uint i;
+
+   DEBUG_PRINTF("CMD_STATE_FS_CONSTANTS (%u)\n", num_const);
+
+   /* Expand each float to float[4] for SOA execution (clamped to array size) */
+   for (i = 0; i < num_const && i < 4 * CELL_MAX_CONSTANTS; i++) {
+      spu.constants[i] = spu_splats(constants[i]);
+   }
+
+   /* return new buffer pos (in 8-byte words), rounding up for odd counts */
+   return pos + 2 + (num_const + 1) / 2;
+}
+
+
 static void
 cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 {
@@ -456,6 +475,9 @@ cmd_batch(uint opcode)
             pos += sizeof(*fp) / 8;
          }
          break;
+      case CELL_CMD_STATE_FS_CONSTANTS:
+         pos = cmd_state_fs_constants(buffer, pos);
+         break;
       case CELL_CMD_STATE_SAMPLER:
          {
             struct cell_command_sampler *sampler
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 1cd577c23c..82c9c69a3a 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -41,6 +41,9 @@
 #define MAX_HEIGHT 1024
 
 
+#define CELL_MAX_CONSTANTS 32  /**< number of float[4] constants */
+
+
 /**
  * A tile is basically a TILE_SIZE x TILE_SIZE block of 4-byte pixels.
  * The data may be addressed through several different types.
@@ -157,9 +160,8 @@ struct spu_global
    /** Current texture sampler function */
    spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
 
-   /** Fragment program constants (XXX preliminary/used) */
-#define MAX_CONSTANTS 32
-   vector float constants[MAX_CONSTANTS];
+   /** Fragment program constants */
+   vector float constants[4 * CELL_MAX_CONSTANTS];
 
 } ALIGN16_ATTRIB;
 
-- 
cgit v1.2.3


From b99c39ea7bf7ff3d6c0fe8599ce25a6b6bf154fd Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 7 Oct 2008 16:24:43 -0600
Subject: mesa: use the shaderutil.c helper functions

---
 progs/glsl/Makefile   |  7 ++++
 progs/glsl/identity.c | 94 ++++++---------------------------------------------
 2 files changed, 17 insertions(+), 84 deletions(-)

diff --git a/progs/glsl/Makefile b/progs/glsl/Makefile
index eacd6dfe09..04c1d25ed7 100644
--- a/progs/glsl/Makefile
+++ b/progs/glsl/Makefile
@@ -107,6 +107,13 @@ deriv: deriv.o shaderutil.o
 	$(APP_CC) -I$(INCDIR) $(CFLAGS) $(LDFLAGS) deriv.o shaderutil.o $(LIBS) -o $@
 
 
+identity.o: identity.c extfuncs.h shaderutil.h
+	$(APP_CC) -c -I$(INCDIR) $(CFLAGS) identity.c
+
+identity: identity.o shaderutil.o
+	$(APP_CC) -I$(INCDIR) $(CFLAGS) $(LDFLAGS) identity.o shaderutil.o $(LIBS) -o $@
+
+
 mandelbrot.o: mandelbrot.c extfuncs.h shaderutil.h
 	$(APP_CC) -c -I$(INCDIR) $(CFLAGS) mandelbrot.c
 
diff --git a/progs/glsl/identity.c b/progs/glsl/identity.c
index a2a1991529..dce140fc64 100644
--- a/progs/glsl/identity.c
+++ b/progs/glsl/identity.c
@@ -1,10 +1,6 @@
 /**
  * Test very basic glsl functionality (identity vertex and fragment shaders).
- * Brian Paul
- * 2 May 2007
- *
- * NOTE: resize the window to observe how the partial derivatives of
- * the texcoords change.
+ * Brian Paul & Stephane Marchesin
  */
 
 
@@ -17,6 +13,7 @@
 #include <GL/glut.h>
 #include <GL/glext.h>
 #include "extfuncs.h"
+#include "shaderutil.h"
 
 
 static char *FragProgFile = NULL;
@@ -29,6 +26,7 @@ static GLboolean anim = GL_TRUE;
 static GLfloat xRot = 0.0f, yRot = 0.0f;
 static int w,h;
 
+
 static void
 Redisplay(void)
 {
@@ -128,69 +126,6 @@ SpecialKey(int key, int x, int y)
 }
 
 
-
-
-static void
-LoadAndCompileShader(GLuint shader, const char *text)
-{
-   GLint stat;
-
-   glShaderSource_func(shader, 1, (const GLchar **) &text, NULL);
-
-   glCompileShader_func(shader);
-
-   glGetShaderiv_func(shader, GL_COMPILE_STATUS, &stat);
-   if (!stat) {
-      GLchar log[1000];
-      GLsizei len;
-      glGetShaderInfoLog_func(shader, 1000, &len, log);
-      fprintf(stderr, "fslight: problem compiling shader:\n%s\n", log);
-      exit(1);
-   }
-}
-
-
-/**
- * Read a shader from a file.
- */
-static void
-ReadShader(GLuint shader, const char *filename)
-{
-   const int max = 100*1000;
-   int n;
-   char *buffer = (char*) malloc(max);
-   FILE *f = fopen(filename, "r");
-   if (!f) {
-      fprintf(stderr, "fslight: Unable to open shader file %s\n", filename);
-      exit(1);
-   }
-
-   n = fread(buffer, 1, max, f);
-   printf("fslight: read %d bytes from shader file %s\n", n, filename);
-   if (n > 0) {
-      buffer[n] = 0;
-      LoadAndCompileShader(shader, buffer);
-   }
-
-   fclose(f);
-   free(buffer);
-}
-
-
-static void
-CheckLink(GLuint prog)
-{
-   GLint stat;
-   glGetProgramiv_func(prog, GL_LINK_STATUS, &stat);
-   if (!stat) {
-      GLchar log[1000];
-      GLsizei len;
-      glGetProgramInfoLog_func(prog, 1000, &len, log);
-      fprintf(stderr, "Linker error:\n%s\n", log);
-   }
-}
-
-
 static void
 Init(void)
 {
@@ -202,33 +137,24 @@ Init(void)
       "void main() {\n"
       "   gl_Position = gl_Vertex;\n"
       "}\n";
-   const char *version;
 
-   version = (const char *) glGetString(GL_VERSION);
-   if (version[0] != '2' || version[1] != '.') {
-      printf("This program requires OpenGL 2.x, found %s\n", version);
+   if (!ShadersSupported())
       exit(1);
-   }
 
    GetExtensionFuncs();
 
-   fragShader = glCreateShader_func(GL_FRAGMENT_SHADER);
    if (FragProgFile)
-      ReadShader(fragShader, FragProgFile);
+      fragShader = CompileShaderFile(GL_FRAGMENT_SHADER, FragProgFile);
    else
-      LoadAndCompileShader(fragShader, fragShaderText);
+      fragShader = CompileShaderText(GL_FRAGMENT_SHADER, fragShaderText);
 
-   vertShader = glCreateShader_func(GL_VERTEX_SHADER);
    if (VertProgFile)
-      ReadShader(vertShader, VertProgFile);
+      vertShader = CompileShaderFile(GL_VERTEX_SHADER, VertProgFile);
    else
-      LoadAndCompileShader(vertShader, vertShaderText);
+      vertShader = CompileShaderText(GL_VERTEX_SHADER, vertShaderText);
+
+   program = LinkShaders(vertShader, fragShader);
 
-   program = glCreateProgram_func();
-   glAttachShader_func(program, fragShader);
-   glAttachShader_func(program, vertShader);
-   glLinkProgram_func(program);
-   CheckLink(program);
    glUseProgram_func(program);
 
    /*assert(glGetError() == 0);*/
-- 
cgit v1.2.3


From 7cb723a3fd4d90ad6efa5f440e5e39f8aaa9f79c Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 7 Oct 2008 16:52:47 -0600
Subject: mesa: pass 'mask', not NULL to renderbuffer->Put functions

Fixes bug 17800.
---
 src/mesa/swrast/s_depth.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/mesa/swrast/s_depth.c b/src/mesa/swrast/s_depth.c
index 293eb8628e..26e23f02d5 100644
--- a/src/mesa/swrast/s_depth.c
+++ b/src/mesa/swrast/s_depth.c
@@ -1,8 +1,8 @@
 /*
  * Mesa 3-D graphics library
- * Version:  6.5.1
+ * Version:  7.2.1
  *
- * Copyright (C) 1999-2006  Brian Paul   All Rights Reserved.
+ * Copyright (C) 1999-2008  Brian Paul   All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -534,15 +534,15 @@ depth_test_span( GLcontext *ctx, SWspan *span)
       if (rb->DataType == GL_UNSIGNED_SHORT) {
          GLushort zbuffer[MAX_WIDTH];
          rb->GetRow(ctx, rb, count, x, y, zbuffer);
-         passed = depth_test_span16(ctx, count, zbuffer, zValues, mask );
-         rb->PutRow(ctx, rb, count, x, y, zbuffer, NULL);
+         passed = depth_test_span16(ctx, count, zbuffer, zValues, mask);
+         rb->PutRow(ctx, rb, count, x, y, zbuffer, mask);
       }
       else {
          GLuint zbuffer[MAX_WIDTH];
          ASSERT(rb->DataType == GL_UNSIGNED_INT);
          rb->GetRow(ctx, rb, count, x, y, zbuffer);
-         passed = depth_test_span32(ctx, count, zbuffer, zValues, mask );
-         rb->PutRow(ctx, rb, count, x, y, zbuffer, NULL);
+         passed = depth_test_span32(ctx, count, zbuffer, zValues, mask);
+         rb->PutRow(ctx, rb, count, x, y, zbuffer, mask);
       }
    }
 
@@ -1080,15 +1080,15 @@ depth_test_pixels( GLcontext *ctx, SWspan *span )
       if (rb->DataType == GL_UNSIGNED_SHORT) {
          GLushort zbuffer[MAX_WIDTH];
          _swrast_get_values(ctx, rb, count, x, y, zbuffer, sizeof(GLushort));
-         depth_test_span16(ctx, count, zbuffer, z, mask );
-         rb->PutValues(ctx, rb, count, x, y, zbuffer, NULL);
+         depth_test_span16(ctx, count, zbuffer, z, mask);
+         rb->PutValues(ctx, rb, count, x, y, zbuffer, mask);
       }
       else {
          GLuint zbuffer[MAX_WIDTH];
          ASSERT(rb->DataType == GL_UNSIGNED_INT);
          _swrast_get_values(ctx, rb, count, x, y, zbuffer, sizeof(GLuint));
-         depth_test_span32(ctx, count, zbuffer, z, mask );
-         rb->PutValues(ctx, rb, count, x, y, zbuffer, NULL);
+         depth_test_span32(ctx, count, zbuffer, z, mask);
+         rb->PutValues(ctx, rb, count, x, y, zbuffer, mask);
       }
    }
 
-- 
cgit v1.2.3


From 94d3a30df759bb7c2724fdcee9e89a350d3a4d8b Mon Sep 17 00:00:00 2001
From: "Xiang, Haihao" <haihao.xiang@intel.com>
Date: Wed, 8 Oct 2008 09:30:12 +0800
Subject: i965: Fix a potential assertion failure.

---
 src/mesa/drivers/dri/i965/brw_draw_upload.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index 303eaac5cf..cc3d9396ac 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -365,8 +365,10 @@ static void brw_prepare_vertices(struct brw_context *brw)
 	 if (i == 0) {
 	    /* Position array not properly enabled:
 	     */
-	    if (input->glarray->StrideB == 0)
-	      return;
+            if (input->glarray->StrideB == 0) {
+               intel->Fallback = 1;
+               return;
+            }
 
 	    interleave = input->glarray->StrideB;
 	    ptr = input->glarray->Ptr;
-- 
cgit v1.2.3


From fc19536aa989ad61e95c281883d32860d767f8ef Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 3 Oct 2008 16:20:00 -0700
Subject: intel: Push flushing for cliprects changes down into the cliprects
 changes.

This lets us short-circuit when we're leaving the same cliprects in place,
which becomes quite common with metaops clears, and may be useful for some of
our FBO paths.
---
 src/mesa/drivers/dri/intel/intel_buffers.c | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/src/mesa/drivers/dri/intel/intel_buffers.c b/src/mesa/drivers/dri/intel/intel_buffers.c
index defa5b173d..f5eaf765f3 100644
--- a/src/mesa/drivers/dri/intel/intel_buffers.c
+++ b/src/mesa/drivers/dri/intel/intel_buffers.c
@@ -135,6 +135,14 @@ intel_readbuf_region(struct intel_context *intel)
 static void
 intelSetRenderbufferClipRects(struct intel_context *intel)
 {
+   /* If the batch contents require looping over cliprects, flush them before
+    * we go changing which cliprects get referenced when that happens.
+    */
+   if (intel->batch->cliprect_mode == LOOP_CLIPRECTS &&
+       (intel->fboRect.x2 != intel->ctx.DrawBuffer->Width ||
+	intel->fboRect.y2 != intel->ctx.DrawBuffer->Height))
+      intel_batchbuffer_flush(intel->batch);
+
    assert(intel->ctx.DrawBuffer->Width > 0);
    assert(intel->ctx.DrawBuffer->Height > 0);
    intel->fboRect.x1 = 0;
@@ -160,6 +168,12 @@ intelSetFrontClipRects(struct intel_context *intel)
    if (!dPriv)
       return;
 
+   /* If the batch contents require looping over cliprects, flush them before
+    * we go changing which cliprects get referenced when that happens.
+    */
+   if (intel->batch->cliprect_mode == LOOP_CLIPRECTS &&
+       intel->pClipRects != dPriv->pClipRects)
+      intel_batchbuffer_flush(intel->batch);
    intel->numClipRects = dPriv->numClipRects;
    intel->pClipRects = dPriv->pClipRects;
    intel->drawX = dPriv->x;
@@ -183,6 +197,10 @@ intelSetBackClipRects(struct intel_context *intel)
 
    if (intel_fb->pf_active || dPriv->numBackClipRects == 0) {
       /* use the front clip rects */
+      if (intel->batch->cliprect_mode == LOOP_CLIPRECTS &&
+	  intel->pClipRects != dPriv->pClipRects)
+	 intel_batchbuffer_flush(intel->batch);
+
       intel->numClipRects = dPriv->numClipRects;
       intel->pClipRects = dPriv->pClipRects;
       intel->drawX = dPriv->x;
@@ -190,6 +208,10 @@ intelSetBackClipRects(struct intel_context *intel)
    }
    else {
       /* use the back clip rects */
+      if (intel->batch->cliprect_mode == LOOP_CLIPRECTS &&
+	  intel->pClipRects != dPriv->pBackClipRects)
+	 intel_batchbuffer_flush(intel->batch);
+
       intel->numClipRects = dPriv->numBackClipRects;
       intel->pClipRects = dPriv->pBackClipRects;
       intel->drawX = dPriv->backX;
@@ -900,12 +922,6 @@ intel_draw_buffer(GLcontext * ctx, struct gl_framebuffer *fb)
    if (fb->Name)
       intel_validate_paired_depth_stencil(ctx, fb);
 
-   /* If the batch contents require looping over cliprects, flush them before
-    * we go changing which cliprects get referenced when that happens.
-    */
-   if (intel->batch->cliprect_mode == LOOP_CLIPRECTS)
-      intel_batchbuffer_flush(intel->batch);
-
    /*
     * How many color buffers are we drawing into?
     */
-- 
cgit v1.2.3


From c157cfc6376f7469ab272b18868183e5ff9ac754 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Mon, 6 Oct 2008 17:34:51 -0700
Subject: i965: Add ARB_occlusion_query support.

---
 src/mesa/drivers/dri/i965/Makefile          |   1 +
 src/mesa/drivers/dri/i965/brw_context.c     |   4 +
 src/mesa/drivers/dri/i965/brw_context.h     |  30 +++-
 src/mesa/drivers/dri/i965/brw_draw.c        |   2 +-
 src/mesa/drivers/dri/i965/brw_draw_upload.c |   3 +
 src/mesa/drivers/dri/i965/brw_queryobj.c    | 263 ++++++++++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_vtbl.c        |  10 ++
 src/mesa/drivers/dri/intel/intel_context.c  |  51 +-----
 src/mesa/drivers/dri/intel/intel_reg.h      |  19 ++
 9 files changed, 331 insertions(+), 52 deletions(-)
 create mode 100644 src/mesa/drivers/dri/i965/brw_queryobj.c

diff --git a/src/mesa/drivers/dri/i965/Makefile b/src/mesa/drivers/dri/i965/Makefile
index c2d555cd0c..7bc16a02c8 100644
--- a/src/mesa/drivers/dri/i965/Makefile
+++ b/src/mesa/drivers/dri/i965/Makefile
@@ -51,6 +51,7 @@ DRIVER_SOURCES = \
 	brw_metaops.c \
 	brw_misc_state.c \
 	brw_program.c \
+	brw_queryobj.c \
 	brw_sf.c \
 	brw_sf_emit.c \
 	brw_sf_state.c \
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 5f60477176..474158b484 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -33,6 +33,7 @@
 #include "main/imports.h"
 #include "main/api_noop.h"
 #include "main/vtxfmt.h"
+#include "main/simple_list.h"
 #include "shader/shader_api.h"
 
 #include "brw_context.h"
@@ -68,6 +69,7 @@ static void brwInitDriverFunctions( struct dd_function_table *functions )
 
    brwInitFragProgFuncs( functions );
    brwInitProgFuncs( functions );
+   brw_init_queryobj_functions(functions);
 }
 
 
@@ -150,6 +152,8 @@ GLboolean brwCreateContext( const __GLcontextModes *mesaVis,
    ctx->VertexProgram._MaintainTnlProgram = GL_TRUE;
    ctx->FragmentProgram._MaintainTexEnvProgram = GL_TRUE;
 
+   make_empty_list(&brw->query.active_head);
+
    brw_draw_init( brw );
 
    return GL_TRUE;
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 8bddc9da40..1c6a0dede0 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -410,7 +410,22 @@ struct brw_tnl_cache {
    GLuint size, n_items;
 };
 
+struct brw_query_object {
+   struct gl_query_object Base;
 
+   /** Doubly linked list of active query objects in the context. */
+   struct brw_query_object *prev, *next;
+
+   /** Last query BO associated with this query. */
+   dri_bo *bo;
+   /** First index in bo with query data for this object. */
+   int first_index;
+   /** Last index in bo with query data for this object. */
+   int last_index;
+
+   /** Total count of pixels from previous BOs */
+   unsigned int count;
+};
 
 struct brw_context 
 {
@@ -626,7 +641,12 @@ struct brw_context
       dri_bo *vp_bo;
    } cc;
 
-   
+   struct {
+      struct brw_query_object active_head;
+      dri_bo *bo;
+      int index;
+      GLboolean active;
+   } query;
    /* Used to give every program string a unique id
     */
    GLuint program_id;
@@ -651,7 +671,13 @@ GLboolean brwCreateContext( const __GLcontextModes *mesaVis,
 			    __DRIcontextPrivate *driContextPriv,
 			    void *sharedContextPrivate);
 
-
+/*======================================================================
+ * brw_queryobj.c
+ */
+void brw_init_queryobj_functions(struct dd_function_table *functions);
+void brw_prepare_query_begin(struct brw_context *brw);
+void brw_emit_query_begin(struct brw_context *brw);
+void brw_emit_query_end(struct brw_context *brw);
 
 /*======================================================================
  * brw_state.c
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index 39ce8eb4b6..6c71b4abcf 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -382,7 +382,6 @@ void brw_draw_prims( GLcontext *ctx,
       return;
    }
 
-
    /* Make a first attempt at drawing:
     */
    retval = brw_try_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
@@ -395,6 +394,7 @@ void brw_draw_prims( GLcontext *ctx,
        _swsetup_Wakeup(ctx);
       _tnl_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
    }
+
 }
 
 void brw_draw_init( struct brw_context *brw )
diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index cc3d9396ac..7b88b5eaa1 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -415,6 +415,8 @@ static void brw_prepare_vertices(struct brw_context *brw)
           copy_array_to_vbo_array(brw, upload[i], upload[i]->element_size);
       }
    }
+
+   brw_prepare_query_begin(brw);
 }
 
 static void brw_emit_vertices(struct brw_context *brw)
@@ -435,6 +437,7 @@ static void brw_emit_vertices(struct brw_context *brw)
       enabled[nr_enabled++] = input;
    }
 
+   brw_emit_query_begin(brw);
 
    /* Now emit VB and VEP state packets.
     *
diff --git a/src/mesa/drivers/dri/i965/brw_queryobj.c b/src/mesa/drivers/dri/i965/brw_queryobj.c
new file mode 100644
index 0000000000..a1a1353dee
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_queryobj.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright © 2008 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric@anholt.net>
+ *
+ */
+
+/** @file support for ARB_occlusion_query
+ *
+ * ARB_occlusion_query is implemented by using the PIPE_CONTROL command to
+ * stall execution on the completion of previous depth tests, and write the
+ * current PS_DEPTH_COUNT to a buffer object.
+ *
+ * We use before and after counts when drawing during a query so that
+ * we don't pick up other clients' query data in ours.  To reduce overhead,
+ * a single BO is used to record the query data for all active queries at
+ * once.  This also gives us a simple bound on how much batchbuffer space is
+ * required for handling queries, so that we can be sure that we won't
+ * have to emit a batchbuffer without getting the ending PS_DEPTH_COUNT.
+ */
+#include "main/simple_list.h"
+#include "main/imports.h"
+
+#include "brw_context.h"
+#include "intel_batchbuffer.h"
+#include "intel_reg.h"
+
+/** Waits on the query object's BO and totals the results for this query */
+static void
+brw_queryobj_get_results(struct brw_query_object *query)
+{
+   int i;
+   uint64_t *results;
+
+   if (query->bo == NULL)
+      return;
+
+   /* Map and count the pixels from the current query BO */
+   dri_bo_map(query->bo, GL_FALSE);
+   results = query->bo->virtual;
+   for (i = query->first_index; i <= query->last_index; i++) {
+      query->Base.Result += results[i * 2 + 1] - results[i * 2];
+   }
+   dri_bo_unmap(query->bo);
+
+   dri_bo_unreference(query->bo);
+   query->bo = NULL;
+}
+
+/** Allocate a driver query object; returns NULL if the allocation fails. */
+static struct gl_query_object *
+brw_new_query_object(GLcontext *ctx, GLuint id)
+{
+   struct brw_query_object *query = _mesa_calloc(sizeof(*query));
+
+   if (query == NULL)
+      return NULL;
+   query->Base.Id = id;
+   query->Base.Result = 0;
+   query->Base.Active = GL_FALSE;
+   query->Base.Ready = GL_TRUE;
+   return &query->Base;
+}
+
+static void
+brw_delete_query(GLcontext *ctx, struct gl_query_object *q)
+{
+   struct brw_query_object *query = (struct brw_query_object *)q;
+
+   dri_bo_unreference(query->bo);
+   _mesa_free(query);
+}
+
+static void
+brw_begin_query(GLcontext *ctx, struct gl_query_object *q)
+{
+   struct brw_context *brw = brw_context(ctx);
+   struct intel_context *intel = intel_context(ctx);
+   struct brw_query_object *query = (struct brw_query_object *)q;
+
+   /* Reset our driver's tracking of query state. */
+   dri_bo_unreference(query->bo);
+   query->bo = NULL;
+   query->first_index = -1;
+   query->last_index = -1;
+
+   insert_at_head(&brw->query.active_head, query);
+   intel->stats_wm++;
+}
+
+/**
+ * End the ARB_occlusion_query query on a query object.
+ */
+static void
+brw_end_query(GLcontext *ctx, struct gl_query_object *q)
+{
+   struct brw_context *brw = brw_context(ctx);
+   struct intel_context *intel = intel_context(ctx);
+   struct brw_query_object *query = (struct brw_query_object *)q;
+
+   /* Flush the batchbuffer in case it has writes to our query BO.
+    * Have later queries write to a new query BO so that further rendering
+    * doesn't delay the collection of our results.
+    */
+   if (query->bo) {
+      brw_emit_query_end(brw);
+      intel_batchbuffer_flush(intel->batch);
+
+      dri_bo_unreference(brw->query.bo);
+      brw->query.bo = NULL;
+   }
+
+   remove_from_list(query);
+
+   intel->stats_wm--;
+}
+
+static void brw_wait_query(GLcontext *ctx, struct gl_query_object *q)
+{
+   struct brw_query_object *query = (struct brw_query_object *)q;
+
+   brw_queryobj_get_results(query);
+   query->Base.Ready = GL_TRUE;
+}
+
+static void brw_check_query(GLcontext *ctx, struct gl_query_object *q)
+{
+   /* XXX: Need to expose dri_bo_is_idle from bufmgr. */
+#if 0
+   struct brw_query_object *query = (struct brw_query_object *)q;
+
+   if (dri_bo_is_idle(query->bo)) {
+      brw_queryobj_get_results(query);
+      query->Base.Ready = GL_TRUE;
+   }
+#else
+   brw_wait_query(ctx, q);
+#endif
+}
+
+/**
+ * Called to set up the query BO and account for its aperture space.  The
+ * aperture array is filled in only after the BO has been (re)allocated.
+ */
+void
+brw_prepare_query_begin(struct brw_context *brw)
+{
+   struct intel_context *intel = &brw->intel;
+   dri_bo *aper_array[2];
+
+   /* Skip if we're not doing any queries. */
+   if (is_empty_list(&brw->query.active_head))
+      return;
+
+   /* Get a new query BO if we're going to need it. */
+   if (brw->query.bo == NULL ||
+       brw->query.index * 2 + 1 >= 4096 / sizeof(uint64_t)) {
+      dri_bo_unreference(brw->query.bo);
+      brw->query.bo = dri_bo_alloc(intel->bufmgr, "query", 4096, 1);
+      brw->query.index = 0;
+   }
+
+   aper_array[0] = intel->batch->buf;
+   aper_array[1] = brw->query.bo;
+   if (dri_bufmgr_check_aperture_space(aper_array, ARRAY_SIZE(aper_array)))
+      intel_batchbuffer_flush(intel->batch);
+}
+
+/** Called just before primitive drawing to get a beginning PS_DEPTH_COUNT. */
+void
+brw_emit_query_begin(struct brw_context *brw)
+{
+   struct intel_context *intel = &brw->intel;
+   struct brw_query_object *query;
+
+   /* Skip if we're not doing any queries, or we've emitted the start. */
+   if (brw->query.active || is_empty_list(&brw->query.active_head))
+      return;
+
+   BEGIN_BATCH(4, IGNORE_CLIPRECTS);
+   OUT_BATCH(_3DSTATE_PIPE_CONTROL |
+	     PIPE_CONTROL_DEPTH_STALL |
+	     PIPE_CONTROL_WRITE_DEPTH_COUNT);
+   /* This object could be mapped cacheable, but we don't have an exposed
+    * mechanism to support that.  Since it's going uncached, tell GEM that
+    * we're writing to it.  The usual clflush should be all that's required
+    * to pick up the results.
+    */
+   OUT_RELOC(brw->query.bo,
+	     I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+	     PIPE_CONTROL_GLOBAL_GTT_WRITE |
+	     ((brw->query.index * 2) * sizeof(uint64_t)));
+   OUT_BATCH(0);
+   OUT_BATCH(0);
+   ADVANCE_BATCH();
+
+   foreach(query, &brw->query.active_head) {
+      if (query->bo != brw->query.bo) {
+	 if (query->bo != NULL)
+	    brw_queryobj_get_results(query);
+	 dri_bo_reference(brw->query.bo);
+	 query->bo = brw->query.bo;
+	 query->first_index = brw->query.index;
+      }
+      query->last_index = brw->query.index;
+   }
+   brw->query.active = GL_TRUE;
+}
+
+/** Called at batchbuffer flush to get an ending PS_DEPTH_COUNT */
+void
+brw_emit_query_end(struct brw_context *brw)
+{
+   struct intel_context *intel = &brw->intel;
+
+   if (!brw->query.active)
+      return;
+
+   BEGIN_BATCH(4, IGNORE_CLIPRECTS);
+   OUT_BATCH(_3DSTATE_PIPE_CONTROL |
+	     PIPE_CONTROL_DEPTH_STALL |
+	     PIPE_CONTROL_WRITE_DEPTH_COUNT);
+   OUT_RELOC(brw->query.bo,
+	     I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+	     PIPE_CONTROL_GLOBAL_GTT_WRITE |
+	     ((brw->query.index * 2 + 1) * sizeof(uint64_t)));
+   OUT_BATCH(0);
+   OUT_BATCH(0);
+   ADVANCE_BATCH();
+
+   brw->query.active = GL_FALSE;
+   brw->query.index++;
+}
+
+void brw_init_queryobj_functions(struct dd_function_table *functions)
+{
+   functions->NewQueryObject = brw_new_query_object;
+   functions->DeleteQuery = brw_delete_query;
+   functions->BeginQuery = brw_begin_query;
+   functions->EndQuery = brw_end_query;
+   functions->CheckQuery = brw_check_query;
+   functions->WaitQuery = brw_wait_query;
+}
diff --git a/src/mesa/drivers/dri/i965/brw_vtbl.c b/src/mesa/drivers/dri/i965/brw_vtbl.c
index cd074dfed6..a64e437860 100644
--- a/src/mesa/drivers/dri/i965/brw_vtbl.c
+++ b/src/mesa/drivers/dri/i965/brw_vtbl.c
@@ -120,6 +120,15 @@ static void brw_set_draw_region( struct intel_context *intel,
    brw->state.nr_draw_regions = num_regions;
 }
 
+/* called from intel_batchbuffer_flush and children before sending a
+ * batchbuffer off.
+ */
+static void brw_finish_batch(struct intel_context *intel)
+{
+   struct brw_context *brw = brw_context(&intel->ctx);
+
+   brw_emit_query_end(brw);
+}
 
 /* called from intelFlushBatchLocked
  */
@@ -218,6 +227,7 @@ void brwInitVtbl( struct brw_context *brw )
    brw->intel.vtbl.note_fence = brw_note_fence; 
    brw->intel.vtbl.note_unlock = brw_note_unlock; 
    brw->intel.vtbl.new_batch = brw_new_batch;
+   brw->intel.vtbl.finish_batch = brw_finish_batch;
    brw->intel.vtbl.destroy = brw_destroy_context;
    brw->intel.vtbl.set_draw_region = brw_set_draw_region;
    brw->intel.vtbl.flush_cmd = brw_flush_cmd;
diff --git a/src/mesa/drivers/dri/intel/intel_context.c b/src/mesa/drivers/dri/intel/intel_context.c
index 9dc32e487f..e6c0d3175e 100644
--- a/src/mesa/drivers/dri/intel/intel_context.c
+++ b/src/mesa/drivers/dri/intel/intel_context.c
@@ -394,6 +394,7 @@ static const struct dri_extension brw_extensions[] = {
    { "GL_ARB_fragment_program",           NULL },
    { "GL_ARB_fragment_program_shadow",    NULL },
    { "GL_ARB_fragment_shader",            NULL },
+   { "GL_ARB_occlusion_query",            GL_ARB_occlusion_query_functions },
    { "GL_ARB_point_sprite", 		  NULL },
    { "GL_ARB_shader_objects",             GL_ARB_shader_objects_functions },
    { "GL_ARB_shading_language_100",       GL_VERSION_2_0_functions },
@@ -407,12 +408,9 @@ static const struct dri_extension brw_extensions[] = {
    { NULL,                                NULL }
 };
 
-#ifdef I915_MMIO_READ
-static const struct dri_extension arb_oc_extensions[] = {
-   { "GL_ARB_occlusion_query",            GL_ARB_occlusion_query_functions },
+static const struct dri_extension arb_oq_extensions[] = {
    { NULL, NULL }
 };
-#endif
 
 static const struct dri_extension ttm_extensions[] = {
    { "GL_ARB_pixel_buffer_object",        NULL },
@@ -437,13 +435,6 @@ void intelInitExtensions(GLcontext *ctx, GLboolean enable_imaging)
    if (intel == NULL || intel->ttm)
       driInitExtensions(ctx, ttm_extensions, GL_FALSE);
 
-#ifdef I915_MMIO_READ
-   if (intel == NULL || 
-       (IS_965(intel->intelScreen->deviceID) && 
-	intel->intelScreen->drmMinor >= 8))
-      driInitExtensions(ctx, arb_oc_extensions, GL_FALSE);
-#endif
-
    if (intel == NULL || IS_965(intel->intelScreen->deviceID))
       driInitExtensions(ctx, brw_extensions, GL_FALSE);
 }
@@ -540,39 +531,6 @@ intelFinish(GLcontext * ctx)
    }
 }
 
-#ifdef I915_MMIO_READ
-static void
-intelBeginQuery(GLcontext *ctx, struct gl_query_object *q)
-{
-	struct intel_context *intel = intel_context( ctx );
-	struct drm_i915_mmio io = {
-		.read_write = I915_MMIO_READ,
-		.reg = MMIO_REGS_PS_DEPTH_COUNT,
-		.data = &q->Result 
-	};
-	intel->stats_wm++;
-	intelFinish(&intel->ctx);
-	drmCommandWrite(intel->driFd, DRM_I915_MMIO, &io, sizeof(io));
-}
-
-static void
-intelEndQuery(GLcontext *ctx, struct gl_query_object *q)
-{
-	struct intel_context *intel = intel_context( ctx );
-	GLuint64EXT tmp;	
-	struct drm_i915_mmio io = {
-		.read_write = I915_MMIO_READ,
-		.reg = MMIO_REGS_PS_DEPTH_COUNT,
-		.data = &tmp
-	};
-	intelFinish(&intel->ctx);
-	drmCommandWrite(intel->driFd, DRM_I915_MMIO, &io, sizeof(io));
-	q->Result = tmp - q->Result;
-	q->Ready = GL_TRUE;
-	intel->stats_wm--;
-}
-#endif
-
 void
 intelInitDriverFunctions(struct dd_function_table *functions)
 {
@@ -589,11 +547,6 @@ intelInitDriverFunctions(struct dd_function_table *functions)
    functions->CopyConvolutionFilter1D = _swrast_CopyConvolutionFilter1D;
    functions->CopyConvolutionFilter2D = _swrast_CopyConvolutionFilter2D;
 
-#ifdef I915_MMIO_READ
-   functions->BeginQuery = intelBeginQuery;
-   functions->EndQuery = intelEndQuery;
-#endif
-
    intelInitTextureFuncs(functions);
    intelInitStateFuncs(functions);
    intelInitBufferFuncs(functions);
diff --git a/src/mesa/drivers/dri/intel/intel_reg.h b/src/mesa/drivers/dri/intel/intel_reg.h
index 96af7e1a03..c21f408093 100644
--- a/src/mesa/drivers/dri/intel/intel_reg.h
+++ b/src/mesa/drivers/dri/intel/intel_reg.h
@@ -44,6 +44,25 @@
 #define _3DSTATE_LOAD_STATE_IMMEDIATE_1   (CMD_3D | (0x1d<<24) | (0x04<<16))
 #define I1_LOAD_S(n)                      (1<<(4+n))
 
+/** @{
+ *
+ * PIPE_CONTROL operation, a combination MI_FLUSH and register write with
+ * additional flushing control.
+ */
+#define _3DSTATE_PIPE_CONTROL		(CMD_3D | (3 << 27) | (2 << 24) | 2)
+#define PIPE_CONTROL_NO_WRITE		(0 << 14)
+#define PIPE_CONTROL_WRITE_IMMEDIATE	(1 << 14)
+#define PIPE_CONTROL_WRITE_DEPTH_COUNT	(2 << 14)
+#define PIPE_CONTROL_WRITE_TIMESTAMP	(3 << 14)
+#define PIPE_CONTROL_DEPTH_STALL	(1 << 13)
+#define PIPE_CONTROL_WRITE_FLUSH	(1 << 12)
+#define PIPE_CONTROL_INSTRUCTION_FLUSH	(1 << 11)
+#define PIPE_CONTROL_INTERRUPT_ENABLE	(1 << 8)
+#define PIPE_CONTROL_PPGTT_WRITE	(0 << 2)
+#define PIPE_CONTROL_GLOBAL_GTT_WRITE	(1 << 2)
+
+/** @} */
+
 /** @{
  * 915 definitions
  */
-- 
cgit v1.2.3


From 902727b7e3eb3c2c9bcddf1d55d3c95c73377cf3 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 7 Oct 2008 18:47:31 -0700
Subject: mesa: Pass the context to query object delete cb to avoid null
 dereference.

---
 src/mesa/main/queryobj.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/main/queryobj.c b/src/mesa/main/queryobj.c
index 6f084b6a4f..554e0b0d18 100644
--- a/src/mesa/main/queryobj.c
+++ b/src/mesa/main/queryobj.c
@@ -545,6 +545,6 @@ delete_queryobj_cb(GLuint id, void *data, void *userData)
 void
 _mesa_free_query_data(GLcontext *ctx)
 {
-   _mesa_HashDeleteAll(ctx->Query.QueryObjects, delete_queryobj_cb, NULL);
+   _mesa_HashDeleteAll(ctx->Query.QueryObjects, delete_queryobj_cb, ctx);
    _mesa_DeleteHashTable(ctx->Query.QueryObjects);
 }
-- 
cgit v1.2.3


From 5462d447aa5bce9e558594eabeddd624cd39b1de Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 8 Oct 2008 09:28:10 -0600
Subject: mesa: fix vertex format/attribute bug

If the tnl output attributes match the swsetup input attributes we still
need to check if the desired vertex color type (float vs. chan) has changed
so that we use the right emit functions.

Fixes a conformance failure found with logicop test at pathlevel 3.
---
 src/mesa/swrast_setup/ss_context.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/mesa/swrast_setup/ss_context.c b/src/mesa/swrast_setup/ss_context.c
index f4d90c514b..61172f9979 100644
--- a/src/mesa/swrast_setup/ss_context.c
+++ b/src/mesa/swrast_setup/ss_context.c
@@ -112,22 +112,25 @@ setup_vertex_format(GLcontext *ctx)
 {
    TNLcontext *tnl = TNL_CONTEXT(ctx);
    SScontext *swsetup = SWSETUP_CONTEXT(ctx);
+   GLboolean intColors = !ctx->FragmentProgram._Current
+                      && !ctx->ATIFragmentShader._Enabled
+                      && ctx->RenderMode == GL_RENDER
+                      && CHAN_TYPE == GL_UNSIGNED_BYTE;
 
-   if (!RENDERINPUTS_EQUAL(tnl->render_inputs_bitset,
+   if (intColors != swsetup->intColors ||
+       !RENDERINPUTS_EQUAL(tnl->render_inputs_bitset,
                            swsetup->last_index_bitset)) {
       DECLARE_RENDERINPUTS(index_bitset);
       struct tnl_attr_map map[_TNL_ATTRIB_MAX];
       int i, e = 0;
 
+      swsetup->intColors = intColors;
+
       RENDERINPUTS_COPY( index_bitset, tnl->render_inputs_bitset );
 
       EMIT_ATTR( _TNL_ATTRIB_POS, EMIT_4F_VIEWPORT, attrib[FRAG_ATTRIB_WPOS] );
 
       if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_COLOR0 )) {
-         swsetup->intColors = !ctx->FragmentProgram._Current
-                           && !ctx->ATIFragmentShader._Enabled
-                           && ctx->RenderMode == GL_RENDER
-                           && CHAN_TYPE == GL_UNSIGNED_BYTE;
          if (swsetup->intColors)
             EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4CHAN_4F_RGBA, color );
          else
-- 
cgit v1.2.3


From a71b1af5ad7859d00f88b554ed3514561c245e0a Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 8 Oct 2008 09:33:27 -0600
Subject: mesa: vertex emit debug code (disabled)

---
 src/mesa/tnl/t_vertex_generic.c | 80 +++++++++++++++++++++++++++++------------
 1 file changed, 58 insertions(+), 22 deletions(-)

diff --git a/src/mesa/tnl/t_vertex_generic.c b/src/mesa/tnl/t_vertex_generic.c
index db70ad4dad..f763522f91 100644
--- a/src/mesa/tnl/t_vertex_generic.c
+++ b/src/mesa/tnl/t_vertex_generic.c
@@ -34,6 +34,12 @@
 #include "t_vertex.h"
 
 
+#if 0
+#define DEBUG_INSERT printf("%s\n", __FUNCTION__)
+#else
+#define DEBUG_INSERT
+#endif
+
 
 /*
  * These functions take the NDC coordinates pointed to by 'in', apply the
@@ -45,7 +51,7 @@ static INLINE void insert_4f_viewport_4( const struct tnl_clipspace_attr *a, GLu
 {
    GLfloat *out = (GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = vp[0] * in[0] + vp[12];
    out[1] = vp[5] * in[1] + vp[13];
    out[2] = vp[10] * in[2] + vp[14];
@@ -57,7 +63,7 @@ static INLINE void insert_4f_viewport_3( const struct tnl_clipspace_attr *a, GLu
 {
    GLfloat *out = (GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = vp[0] * in[0] + vp[12];
    out[1] = vp[5] * in[1] + vp[13];
    out[2] = vp[10] * in[2] + vp[14];
@@ -69,7 +75,7 @@ static INLINE void insert_4f_viewport_2( const struct tnl_clipspace_attr *a, GLu
 {
    GLfloat *out = (GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = vp[0] * in[0] + vp[12];
    out[1] = vp[5] * in[1] + vp[13];
    out[2] = vp[14];
@@ -81,7 +87,7 @@ static INLINE void insert_4f_viewport_1( const struct tnl_clipspace_attr *a, GLu
 {
    GLfloat *out = (GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = vp[0] * in[0] + vp[12];
    out[1] = vp[13];
    out[2] = vp[14];
@@ -93,7 +99,7 @@ static INLINE void insert_3f_viewport_3( const struct tnl_clipspace_attr *a, GLu
 {
    GLfloat *out = (GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = vp[0] * in[0] + vp[12];
    out[1] = vp[5] * in[1] + vp[13];
    out[2] = vp[10] * in[2] + vp[14];
@@ -104,7 +110,7 @@ static INLINE void insert_3f_viewport_2( const struct tnl_clipspace_attr *a, GLu
 {
    GLfloat *out = (GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = vp[0] * in[0] + vp[12];
    out[1] = vp[5] * in[1] + vp[13];
    out[2] = vp[10] * in[2] + vp[14];
@@ -115,7 +121,7 @@ static INLINE void insert_3f_viewport_1( const struct tnl_clipspace_attr *a, GLu
 {
    GLfloat *out = (GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = vp[0] * in[0] + vp[12];
    out[1] = vp[13];
    out[2] = vp[14];
@@ -126,7 +132,7 @@ static INLINE void insert_2f_viewport_2( const struct tnl_clipspace_attr *a, GLu
 {
    GLfloat *out = (GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = vp[0] * in[0] + vp[12];
    out[1] = vp[5] * in[1] + vp[13];
 }
@@ -136,7 +142,7 @@ static INLINE void insert_2f_viewport_1( const struct tnl_clipspace_attr *a, GLu
 {
    GLfloat *out = (GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = vp[0] * in[0] + vp[12];
    out[1] = vp[13];
 }
@@ -150,7 +156,7 @@ static INLINE void insert_4f_4( const struct tnl_clipspace_attr *a, GLubyte *v,
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-   
+   DEBUG_INSERT;
    out[0] = in[0];
    out[1] = in[1];
    out[2] = in[2];
@@ -161,7 +167,7 @@ static INLINE void insert_4f_3( const struct tnl_clipspace_attr *a, GLubyte *v,
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-   
+   DEBUG_INSERT;
    out[0] = in[0];
    out[1] = in[1];
    out[2] = in[2];
@@ -172,7 +178,7 @@ static INLINE void insert_4f_2( const struct tnl_clipspace_attr *a, GLubyte *v,
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-   
+   DEBUG_INSERT;
    out[0] = in[0];
    out[1] = in[1];
    out[2] = 0;
@@ -183,7 +189,7 @@ static INLINE void insert_4f_1( const struct tnl_clipspace_attr *a, GLubyte *v,
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-   
+   DEBUG_INSERT;
    out[0] = in[0];
    out[1] = 0;
    out[2] = 0;
@@ -194,7 +200,7 @@ static INLINE void insert_3f_xyw_4( const struct tnl_clipspace_attr *a, GLubyte
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-   
+   DEBUG_INSERT;
    out[0] = in[0];
    out[1] = in[1];
    out[2] = in[3];
@@ -203,6 +209,7 @@ static INLINE void insert_3f_xyw_4( const struct tnl_clipspace_attr *a, GLubyte
 static INLINE void insert_3f_xyw_err( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
 {
    (void) a; (void) v; (void) in;
+   DEBUG_INSERT;
    _mesa_exit(1);
 }
 
@@ -210,7 +217,7 @@ static INLINE void insert_3f_3( const struct tnl_clipspace_attr *a, GLubyte *v,
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-   
+   DEBUG_INSERT;
    out[0] = in[0];
    out[1] = in[1];
    out[2] = in[2];
@@ -220,7 +227,7 @@ static INLINE void insert_3f_2( const struct tnl_clipspace_attr *a, GLubyte *v,
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-   
+   DEBUG_INSERT;
    out[0] = in[0];
    out[1] = in[1];
    out[2] = 0;
@@ -230,7 +237,7 @@ static INLINE void insert_3f_1( const struct tnl_clipspace_attr *a, GLubyte *v,
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-   
+   DEBUG_INSERT;
    out[0] = in[0];
    out[1] = 0;
    out[2] = 0;
@@ -241,7 +248,7 @@ static INLINE void insert_2f_2( const struct tnl_clipspace_attr *a, GLubyte *v,
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-   
+   DEBUG_INSERT;
    out[0] = in[0];
    out[1] = in[1];
 }
@@ -250,7 +257,7 @@ static INLINE void insert_2f_1( const struct tnl_clipspace_attr *a, GLubyte *v,
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-   
+   DEBUG_INSERT;
    out[0] = in[0];
    out[1] = 0;
 }
@@ -259,12 +266,13 @@ static INLINE void insert_1f_1( const struct tnl_clipspace_attr *a, GLubyte *v,
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-
+   DEBUG_INSERT;
    out[0] = in[0];
 }
 
 static INLINE void insert_null( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a; (void) v; (void) in;
 }
 
@@ -272,6 +280,7 @@ static INLINE void insert_4chan_4f_rgba_4( const struct tnl_clipspace_attr *a, G
 				  const GLfloat *in )
 {
    GLchan *c = (GLchan *)v;
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_CHAN(c[0], in[0]); 
    UNCLAMPED_FLOAT_TO_CHAN(c[1], in[1]); 
@@ -283,6 +292,7 @@ static INLINE void insert_4chan_4f_rgba_3( const struct tnl_clipspace_attr *a, G
 				  const GLfloat *in )
 {
    GLchan *c = (GLchan *)v;
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_CHAN(c[0], in[0]); 
    UNCLAMPED_FLOAT_TO_CHAN(c[1], in[1]); 
@@ -294,6 +304,7 @@ static INLINE void insert_4chan_4f_rgba_2( const struct tnl_clipspace_attr *a, G
 				  const GLfloat *in )
 {
    GLchan *c = (GLchan *)v;
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_CHAN(c[0], in[0]); 
    UNCLAMPED_FLOAT_TO_CHAN(c[1], in[1]); 
@@ -305,6 +316,7 @@ static INLINE void insert_4chan_4f_rgba_1( const struct tnl_clipspace_attr *a, G
 				  const GLfloat *in )
 {
    GLchan *c = (GLchan *)v;
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_CHAN(c[0], in[0]); 
    c[1] = 0;
@@ -315,6 +327,7 @@ static INLINE void insert_4chan_4f_rgba_1( const struct tnl_clipspace_attr *a, G
 static INLINE void insert_4ub_4f_rgba_4( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
@@ -325,6 +338,7 @@ static INLINE void insert_4ub_4f_rgba_4( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_rgba_3( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
@@ -335,6 +349,7 @@ static INLINE void insert_4ub_4f_rgba_3( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_rgba_2( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
@@ -345,6 +360,7 @@ static INLINE void insert_4ub_4f_rgba_2( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_rgba_1( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
    v[1] = 0;
@@ -355,6 +371,7 @@ static INLINE void insert_4ub_4f_rgba_1( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_bgra_4( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
@@ -365,6 +382,7 @@ static INLINE void insert_4ub_4f_bgra_4( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_bgra_3( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
@@ -375,6 +393,7 @@ static INLINE void insert_4ub_4f_bgra_3( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_bgra_2( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
@@ -385,6 +404,7 @@ static INLINE void insert_4ub_4f_bgra_2( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_bgra_1( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
    v[1] = 0;
@@ -395,6 +415,7 @@ static INLINE void insert_4ub_4f_bgra_1( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_argb_4( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
@@ -405,6 +426,7 @@ static INLINE void insert_4ub_4f_argb_4( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_argb_3( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
@@ -415,6 +437,7 @@ static INLINE void insert_4ub_4f_argb_3( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_argb_2( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
@@ -425,6 +448,7 @@ static INLINE void insert_4ub_4f_argb_2( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_argb_1( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
    v[2] = 0x00;
@@ -435,6 +459,7 @@ static INLINE void insert_4ub_4f_argb_1( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_abgr_4( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
@@ -445,6 +470,7 @@ static INLINE void insert_4ub_4f_abgr_4( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_abgr_3( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
@@ -455,6 +481,7 @@ static INLINE void insert_4ub_4f_abgr_3( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_abgr_2( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
@@ -465,6 +492,7 @@ static INLINE void insert_4ub_4f_abgr_2( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_abgr_1( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
    v[2] = 0x00;
@@ -475,6 +503,7 @@ static INLINE void insert_4ub_4f_abgr_1( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_3ub_3f_rgb_3( const struct tnl_clipspace_attr *a, GLubyte *v, 
 			       const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
@@ -484,6 +513,7 @@ static INLINE void insert_3ub_3f_rgb_3( const struct tnl_clipspace_attr *a, GLub
 static INLINE void insert_3ub_3f_rgb_2( const struct tnl_clipspace_attr *a, GLubyte *v, 
 			       const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
@@ -493,6 +523,7 @@ static INLINE void insert_3ub_3f_rgb_2( const struct tnl_clipspace_attr *a, GLub
 static INLINE void insert_3ub_3f_rgb_1( const struct tnl_clipspace_attr *a, GLubyte *v, 
 			       const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
    v[1] = 0;
@@ -502,6 +533,7 @@ static INLINE void insert_3ub_3f_rgb_1( const struct tnl_clipspace_attr *a, GLub
 static INLINE void insert_3ub_3f_bgr_3( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				 const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
@@ -511,6 +543,7 @@ static INLINE void insert_3ub_3f_bgr_3( const struct tnl_clipspace_attr *a, GLub
 static INLINE void insert_3ub_3f_bgr_2( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				 const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
@@ -520,6 +553,7 @@ static INLINE void insert_3ub_3f_bgr_2( const struct tnl_clipspace_attr *a, GLub
 static INLINE void insert_3ub_3f_bgr_1( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				 const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
    v[1] = 0;
@@ -530,6 +564,7 @@ static INLINE void insert_3ub_3f_bgr_1( const struct tnl_clipspace_attr *a, GLub
 static INLINE void insert_1ub_1f_1( const struct tnl_clipspace_attr *a, GLubyte *v, 
 			   const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
 }
@@ -551,6 +586,7 @@ static void extract_4f_viewport( const struct tnl_clipspace_attr *a, GLfloat *ou
    /* Although included for completeness, the position coordinate is
     * usually handled differently during clipping.
     */
+   DEBUG_INSERT;
    out[0] = (in[0] - vp[12]) / vp[0];
    out[1] = (in[1] - vp[13]) / vp[5];
    out[2] = (in[2] - vp[14]) / vp[10];
@@ -562,7 +598,7 @@ static void extract_3f_viewport( const struct tnl_clipspace_attr *a, GLfloat *ou
 {
    const GLfloat *in = (const GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = (in[0] - vp[12]) / vp[0];
    out[1] = (in[1] - vp[13]) / vp[5];
    out[2] = (in[2] - vp[14]) / vp[10];
@@ -575,7 +611,7 @@ static void extract_2f_viewport( const struct tnl_clipspace_attr *a, GLfloat *ou
 {
    const GLfloat *in = (const GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = (in[0] - vp[12]) / vp[0];
    out[1] = (in[1] - vp[13]) / vp[5];
    out[2] = 0;
-- 
cgit v1.2.3


From 5c4bd76cb65245467d4ba04e893157055d738b2d Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 8 Oct 2008 14:02:24 -0600
Subject: mesa: in _mesa_combine_programs() take new STATE_CURRENT_ATTRIB color
 into account

Commit 1680ef869625dc1fe9cf481b180382a34e0738e7 changed the texenv program
to get color from a state register instead of a constant-valued vertex
attribute.  This broke program concatenation (so glDraw/CopyPixels broke).
Now check if the second program gets color from a constant register and
handle that case appropriately.
---
 src/mesa/shader/program.c | 41 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 36 insertions(+), 5 deletions(-)

diff --git a/src/mesa/shader/program.c b/src/mesa/shader/program.c
index b03dd24d11..723c46ee8c 100644
--- a/src/mesa/shader/program.c
+++ b/src/mesa/shader/program.c
@@ -594,17 +594,47 @@ _mesa_combine_programs(GLcontext *ctx,
 
    if (newProg->Target == GL_FRAGMENT_PROGRAM_ARB) {
       struct gl_fragment_program *fprogA, *fprogB, *newFprog;
+      GLbitfield progB_inputsRead = progB->InputsRead;
+      GLint progB_colorFile, progB_colorIndex;
+
       fprogA = (struct gl_fragment_program *) progA;
       fprogB = (struct gl_fragment_program *) progB;
       newFprog = (struct gl_fragment_program *) newProg;
 
       newFprog->UsesKill = fprogA->UsesKill || fprogB->UsesKill;
 
+      /* We'll do a search and replace for instances
+       * of progB_colorFile/progB_colorIndex below...
+       */
+      progB_colorFile = PROGRAM_INPUT;
+      progB_colorIndex = FRAG_ATTRIB_COL0;
+
+      /*
+       * The fragment program may get color from a state var rather than
+       * a fragment input (vertex output) if it's constant.
+       * See the texenvprogram.c code.
+       * So, search the program's parameter list now to see if the program
+       * gets color from a state var instead of a conventional fragment
+       * input register.
+       */
+      for (i = 0; i < progB->Parameters->NumParameters; i++) {
+         struct gl_program_parameter *p = &progB->Parameters->Parameters[i];
+         if (p->Type == PROGRAM_STATE_VAR &&
+             p->StateIndexes[0] == STATE_INTERNAL &&
+             p->StateIndexes[1] == STATE_CURRENT_ATTRIB &&
+             p->StateIndexes[2] == VERT_ATTRIB_COLOR0) {
+            progB_inputsRead |= FRAG_BIT_COL0;
+            progB_colorFile = PROGRAM_STATE_VAR;
+            progB_colorIndex = i;
+            break;
+         }
+      }
+
       /* Connect color outputs of fprogA to color inputs of fprogB, via a
        * new temporary register.
        */
       if ((progA->OutputsWritten & (1 << FRAG_RESULT_COLR)) &&
-          (progB->InputsRead & (1 << FRAG_ATTRIB_COL0))) {
+          (progB_inputsRead & FRAG_BIT_COL0)) {
          GLint tempReg = _mesa_find_free_register(newProg, PROGRAM_TEMPORARY);
          if (tempReg < 0) {
             _mesa_problem(ctx, "No free temp regs found in "
@@ -615,13 +645,14 @@ _mesa_combine_programs(GLcontext *ctx,
          replace_registers(newInst, lenA,
                            PROGRAM_OUTPUT, FRAG_RESULT_COLR,
                            PROGRAM_TEMPORARY, tempReg);
-         /* replace reads from input.color[0] with tempReg */
+         /* replace reads from the input color with tempReg */
          replace_registers(newInst + lenA, lenB,
-                           PROGRAM_INPUT, FRAG_ATTRIB_COL0,
-                           PROGRAM_TEMPORARY, tempReg);
+                           progB_colorFile, progB_colorIndex, /* search for */
+                           PROGRAM_TEMPORARY, tempReg  /* replace with */ );
       }
 
-      inputsB = progB->InputsRead;
+      /* compute combined program's InputsRead */
+      inputsB = progB_inputsRead;
       if (progA->OutputsWritten & (1 << FRAG_RESULT_COLR)) {
          inputsB &= ~(1 << FRAG_ATTRIB_COL0);
       }
-- 
cgit v1.2.3


From 73d00b9e93a9e8a5fecb0de224552741e389fc11 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 8 Oct 2008 16:33:04 -0600
Subject: gallium: better instruction printing for SPE code

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 46 ++++++++++++++++++++++-------
 1 file changed, 36 insertions(+), 10 deletions(-)

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 8a87e9abb1..a6dd7ef311 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -164,6 +164,24 @@ rem_prefix(const char *longname)
 }
 
 
+static const char *
+reg_name(int reg)
+{
+   switch (reg) {
+   case SPE_REG_SP:
+      return "$sp";
+   case SPE_REG_RA:
+      return "$lr";
+   default:
+      {
+         static char buf[10];
+         sprintf(buf, "$%d", reg);
+         return buf;
+      }
+   }
+}
+
+
 static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
 		    unsigned rA, unsigned rB, const char *name)
 {
@@ -176,7 +194,8 @@ static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
     assert(p->num_inst <= p->max_inst);
     if (p->print) {
        indent(p);
-       printf("%s\t$%d, $%d, $%d\n", rem_prefix(name), rT, rA, rB);
+       printf("%s\t%s, %s, %s\n",
+              rem_prefix(name), reg_name(rT), reg_name(rA), reg_name(rB));
     }
 }
 
@@ -194,7 +213,8 @@ static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
     assert(p->num_inst <= p->max_inst);
     if (p->print) {
        indent(p);
-       printf("%s\t$%d, $%d, $%d, $%d\n", rem_prefix(name), rT, rA, rB, rC);
+       printf("%s\t%s, %s, %s, %s\n", rem_prefix(name), reg_name(rT),
+              reg_name(rA), reg_name(rB), reg_name(rC));
     }
 }
 
@@ -211,7 +231,8 @@ static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
     assert(p->num_inst <= p->max_inst);
     if (p->print) {
        indent(p);
-       printf("%s\t$%d, $%d, 0x%x\n", rem_prefix(name), rT, rA, imm);
+       printf("%s\t%s, %s, 0x%x\n",
+              rem_prefix(name), reg_name(rT), reg_name(rA), imm);
     }
 }
 
@@ -229,7 +250,8 @@ static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
     assert(p->num_inst <= p->max_inst);
     if (p->print) {
        indent(p);
-       printf("%s\t$%d, $%d, 0x%x\n", rem_prefix(name), rT, rA, imm);
+       printf("%s\t%s, %s, 0x%x\n",
+              rem_prefix(name), reg_name(rT), reg_name(rA), imm);
     }
 }
 
@@ -248,10 +270,14 @@ static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
     if (p->print) {
        indent(p);
        if (strcmp(name, "spe_lqd") == 0 ||
-           strcmp(name, "spe_stqd") == 0)
-          printf("%s\t$%d, 0x%x($%d)\n", rem_prefix(name), rT, imm, rA);
-       else
-          printf("%s\t$%d, $%d, 0x%x\n", rem_prefix(name), rT, rA, imm);
+           strcmp(name, "spe_stqd") == 0) {
+          printf("%s\t%s, %d(%s)\n",
+                 rem_prefix(name), reg_name(rT), imm, reg_name(rA));
+       }
+       else {
+          printf("%s\t%s, %s, 0x%x\n",
+                 rem_prefix(name), reg_name(rT), reg_name(rA), imm);
+       }
     }
 }
 
@@ -267,7 +293,7 @@ static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
     assert(p->num_inst <= p->max_inst);
     if (p->print) {
        indent(p);
-       printf("%s\t$%d, 0x%x\n", rem_prefix(name), rT, imm);
+       printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm);
     }
 }
 
@@ -283,7 +309,7 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
     assert(p->num_inst <= p->max_inst);
     if (p->print) {
        indent(p);
-       printf("%s\t$%d, 0x%x\n", rem_prefix(name), rT, imm);
+       printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm);
     }
 }
 
-- 
cgit v1.2.3


From 5c57cbec32136c25f104872179d979098be9a1a7 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 8 Oct 2008 16:35:40 -0600
Subject: gallium: asst. clean-ups

Don't use register qualifier.  Doxygen-ize comments.  Remove 'extern'.
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index a6dd7ef311..c442b1f6aa 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -385,7 +385,7 @@ void _name (struct spe_function *p, int imm) \
  */
 void spe_init_func(struct spe_function *p, unsigned code_size)
 {
-    register unsigned int i;
+    unsigned int i;
 
     p->store = align_malloc(code_size, 16);
     p->num_inst = 0;
@@ -475,7 +475,7 @@ void spe_release_register(struct spe_function *p, int reg)
  */
 void spe_allocate_register_set(struct spe_function *p)
 {
-   register unsigned int i;
+   unsigned int i;
 
    /* Keep track of the set count.  If it ever wraps around to 0, 
     * we're in trouble.
@@ -489,7 +489,8 @@ void spe_allocate_register_set(struct spe_function *p)
     * when the register set is released.
     */
    for (i = 0; i < SPE_NUM_REGS; i++) {
-      if (p->regs[i] > 0) p->regs[i]++;
+      if (p->regs[i] > 0)
+         p->regs[i]++;
    }
 }
 
@@ -506,7 +507,8 @@ void spe_release_register_set(struct spe_function *p)
     * available.
     */
    for (i = 0; i < SPE_NUM_REGS; i++) {
-      if (p->regs[i] > 0) p->regs[i]--;
+      if (p->regs[i] > 0)
+         p->regs[i]--;
    }
 }
 
@@ -525,7 +527,7 @@ spe_indent(struct spe_function *p, int spaces)
 }
 
 
-extern void
+void
 spe_comment(struct spe_function *p, int rel_indent, const char *s)
 {
    if (p->print) {
@@ -710,10 +712,12 @@ void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
    }
 }
 
-/* This function is constructed identically to spe_sor_uint() below.
+/**
+ * This function is constructed identically to spe_sor_uint() below.
  * Changes to one should be made in the other.
  */
-void spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+void
+spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
 {
    /* If we can, emit a single instruction, either And Byte Immediate
     * (which uses the same constant across each byte), And Halfword Immediate
@@ -723,7 +727,7 @@ void spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int
     *
     * Otherwise, we'll need to use a temporary register.
     */
-   register unsigned int tmp;
+   unsigned int tmp;
 
    /* If the upper 23 bits are all 0s or all 1s, sign extension
     * will work and we can use And Word Immediate
@@ -760,10 +764,12 @@ void spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int
    spe_release_register(p, tmp_reg);
 }
 
-/* This function is constructed identically to spe_and_uint() above.
+/**
+ * This function is constructed identically to spe_and_uint() above.
  * Changes to one should be made in the other.
  */
-void spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+void
+spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
 {
    /* If we can, emit a single instruction, either Exclusive Or Byte 
     * Immediate (which uses the same constant across each byte), Exclusive 
@@ -773,7 +779,7 @@ void spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int
     *
     * Otherwise, we'll need to use a temporary register.
     */
-   register unsigned int tmp;
+   unsigned int tmp;
 
    /* If the upper 23 bits are all 0s or all 1s, sign extension
     * will work and we can use Exclusive Or Word Immediate
-- 
cgit v1.2.3


From feb5a26bb1e39099abd1caf4a405776ea0124315 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 8 Oct 2008 20:33:24 -0600
Subject: cell: increase SPU_MAX_FRAGMENT_PROGRAM_INSTS

---
 src/gallium/drivers/cell/common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index d261c1a640..5dc756023f 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -128,7 +128,7 @@ struct cell_command_fragment_ops
 
 
 /** Max instructions for fragment programs */
-#define SPU_MAX_FRAGMENT_PROGRAM_INSTS 128
+#define SPU_MAX_FRAGMENT_PROGRAM_INSTS 512
 
 /**
  * Command to send a fragment program to SPUs.
-- 
cgit v1.2.3


From a4e477433f485a39b5de448d0a9cb6f4bf9bb90f Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 8 Oct 2008 20:34:35 -0600
Subject: cell: implement more built-in shader functions, link spu code with
 -lm

---
 configs/linux-cell                       |  2 +-
 src/gallium/drivers/cell/spu/spu_funcs.c | 65 +++++++++++++++++++++-----------
 2 files changed, 45 insertions(+), 22 deletions(-)

diff --git a/configs/linux-cell b/configs/linux-cell
index 86651b83d7..8d74ee469d 100644
--- a/configs/linux-cell
+++ b/configs/linux-cell
@@ -53,7 +53,7 @@ SPU_CFLAGS = $(OPT_FLAGS) -W -Wall -Winline -Wmissing-prototypes -Wno-main \
 	-DSPU_MAIN_PARAM_LONG_LONG \
 	-include spu_intrinsics.h
 
-SPU_LFLAGS = -L$(SDK)/spu/lib -Wl,-N -lmisc
+SPU_LFLAGS = -L$(SDK)/spu/lib -Wl,-N -lmisc -lm
 
 SPU_AR = ppu-ar
 SPU_AR_FLAGS = -qcs
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index b57ad3f3b8..1adf9de0e8 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -35,41 +35,61 @@
 
 #include <string.h>
 #include <libmisc.h>
-#include <cos8_v.h>
-#include <sin8_v.h>
+#include <math.h>
+#include <cos14_v.h>
+#include <sin14_v.h>
 
 #include "cell/common.h"
 #include "spu_main.h"
 #include "spu_funcs.h"
 
 
-#define M_PI 3.1415926
-
-
 static vector float
 spu_cos(vector float x)
 {
-#if 0
-   static const float scale = 1.0 / (2.0 * M_PI);
-   x = x * spu_splats(scale); /* normalize */
-   return _cos8_v(x);
-#else
-   /* just pass-through to avoid trashing caller's stack */
-   return x;
-#endif
+   return _cos14_v(x);
 }
 
 static vector float
 spu_sin(vector float x)
 {
-#if 0
-   static const float scale = 1.0 / (2.0 * M_PI);
-   x = x * spu_splats(scale); /* normalize */
-   return _sin8_v(x);   /* 8-bit accuracy enough?? */
-#else
-   /* just pass-through to avoid trashing caller's stack */
-   return x;
-#endif
+   return _sin14_v(x);
+}
+
+static vector float
+spu_pow(vector float x, vector float y)
+{
+   float z0 = powf(spu_extract(x,0), spu_extract(y,0));
+   float z1 = powf(spu_extract(x,1), spu_extract(y,1));
+   float z2 = powf(spu_extract(x,2), spu_extract(y,2));
+   float z3 = powf(spu_extract(x,3), spu_extract(y,3));
+   return (vector float) {z0, z1, z2, z3};
+}
+
+static vector float
+spu_exp2(vector float x)
+{
+   float z0 = powf(2.0f, spu_extract(x,0));
+   float z1 = powf(2.0f, spu_extract(x,1));
+   float z2 = powf(2.0f, spu_extract(x,2));
+   float z3 = powf(2.0f, spu_extract(x,3));
+   return (vector float) {z0, z1, z2, z3};
+}
+
+static vector float
+spu_log2(vector float x)
+{
+   /*
+    * log_base_2(x) = log(x) / log(2)
+    * 1.442695 = 1/log(2).
+    */
+   static const vector float k = {1.442695F, 1.442695F, 1.442695F, 1.442695F};
+   float z0 = logf(spu_extract(x,0));
+   float z1 = logf(spu_extract(x,1));
+   float z2 = logf(spu_extract(x,2));
+   float z3 = logf(spu_extract(x,3));
+   vector float v = (vector float) {z0, z1, z2, z3};
+   return spu_mul(v, k);
 }
 
 
@@ -101,6 +121,9 @@ return_function_info(void)
    funcs.num = 0;
    add_func(&funcs, "spu_cos", &spu_cos);
    add_func(&funcs, "spu_sin", &spu_sin);
+   add_func(&funcs, "spu_pow", &spu_pow);
+   add_func(&funcs, "spu_exp2", &spu_exp2);
+   add_func(&funcs, "spu_log2", &spu_log2);
 
    /* Send the function info back to the PPU / main memory */
    mfc_put((void *) &funcs,  /* src in local store */
-- 
cgit v1.2.3


From d48a92e88040470f93e2186f8eb23e4797a09860 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 8 Oct 2008 20:44:32 -0600
Subject: cell: implement function calls from shader code.  fslight demo runs
 now.

Used for SIN, COS, EXP2, LOG2, POW instructions.  TEX next.

Fixed some bugs in MIN, MAX, DP3, DP4, DPH instructions.

In rtasm code:
  Special-case spe_lqd(), spe_stqd() functions so they take byte offsets but
  low-order 4 bits are shifted out.  This makes things consistent with SPU
  assembly language conventions.
  Added spe_get_registers_used() function.
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c      |  76 ++++++++++--
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h      |  11 +-
 src/gallium/drivers/cell/ppu/cell_gen_fp.c       | 141 +++++++++++++++--------
 src/gallium/drivers/cell/ppu/cell_vertex_fetch.c |  30 ++---
 4 files changed, 182 insertions(+), 76 deletions(-)

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index c442b1f6aa..9274bc5e3c 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -174,9 +174,12 @@ reg_name(int reg)
       return "$lr";
    default:
       {
-         static char buf[10];
-         sprintf(buf, "$%d", reg);
-         return buf;
+         /* cycle through four buffers to handle multiple calls per printf */
+         static char buf[4][10];
+         static int b = 0;
+         b = (b + 1) % 4;
+         sprintf(buf[b], "$%d", reg);
+         return buf[b];
       }
    }
 }
@@ -269,15 +272,8 @@ static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
     assert(p->num_inst <= p->max_inst);
     if (p->print) {
        indent(p);
-       if (strcmp(name, "spe_lqd") == 0 ||
-           strcmp(name, "spe_stqd") == 0) {
-          printf("%s\t%s, %d(%s)\n",
-                 rem_prefix(name), reg_name(rT), imm, reg_name(rA));
-       }
-       else {
-          printf("%s\t%s, %s, 0x%x\n",
-                 rem_prefix(name), reg_name(rT), reg_name(rA), imm);
-       }
+       printf("%s\t%s, %s, 0x%x\n",
+              rem_prefix(name), reg_name(rT), reg_name(rA), imm);
     }
 }
 
@@ -379,6 +375,7 @@ void _name (struct spe_function *p, int imm) \
 #include "rtasm_ppc_spe.h"
 
 
+
 /**
  * Initialize an spe_function.
  * \param code_size  size of instruction buffer to allocate, in bytes.
@@ -513,6 +510,20 @@ void spe_release_register_set(struct spe_function *p)
 }
 
 
+unsigned
+spe_get_registers_used(const struct spe_function *p, ubyte used[])
+{
+   unsigned i, num = 0;
+   /* only count registers in the range available to callers */
+   for (i = 2; i < 80; i++) {
+      if (p->regs[i]) {
+         used[num++] = i;
+      }
+   }
+   return num;
+}
+
+
 void
 spe_print_code(struct spe_function *p, boolean enable)
 {
@@ -539,6 +550,46 @@ spe_comment(struct spe_function *p, int rel_indent, const char *s)
 }
 
 
+/**
+ * Load quad word.
+ * NOTE: imm is in bytes and the least significant 4 bits must be zero!
+ */
+void spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
+{
+   const boolean pSave = p->print;
+
+   p->print = FALSE;
+   assert(offset % 4 == 0);
+   emit_RI10(p, 0x034, rT, rA, offset >> 4, "spe_lqd");
+   p->print = pSave;
+
+   if (p->print) {
+      indent(p);
+      printf("lqd\t%s, %d(%s)\n", reg_name(rT), offset, reg_name(rA));
+   }
+}
+
+
+/**
+ * Store quad word.
+ * NOTE: imm is in bytes and the least significant 4 bits must be zero!
+ */
+void spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
+{
+   const boolean pSave = p->print;
+
+   p->print = FALSE;
+   assert(offset % 4 == 0);
+   emit_RI10(p, 0x024, rT, rA, offset >> 4, "spe_stqd");
+   p->print = pSave;
+
+   if (p->print) {
+      indent(p);
+      printf("stqd\t%s, %d(%s)\n", reg_name(rT), offset, reg_name(rA));
+   }
+}
+
+
 /**
  * For branch instructions:
  * \param d  if 1, disable interupts if branch is taken
@@ -764,6 +815,7 @@ spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
    spe_release_register(p, tmp_reg);
 }
 
+
 /**
  * This function is constructed identically to spe_and_uint() above.
  * Changes to one should be made in the other.
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index cd2e245409..47dadb343c 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -89,6 +89,9 @@ extern void spe_release_register(struct spe_function *p, int reg);
 extern void spe_allocate_register_set(struct spe_function *p);
 extern void spe_release_register_set(struct spe_function *p);
 
+extern unsigned
+spe_get_registers_used(const struct spe_function *p, ubyte used[]);
+
 extern void spe_print_code(struct spe_function *p, boolean enable);
 extern void spe_indent(struct spe_function *p, int spaces);
 extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
@@ -128,11 +131,9 @@ extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
 
 /* Memory load / store instructions
  */
-EMIT_RI10(spe_lqd,  0x034);
 EMIT_RR  (spe_lqx,  0x1c4);
 EMIT_RI16(spe_lqa,  0x061);
 EMIT_RI16(spe_lqr,  0x067);
-EMIT_RI10(spe_stqd, 0x024);
 EMIT_RR  (spe_stqx, 0x144);
 EMIT_RI16(spe_stqa, 0x041);
 EMIT_RI16(spe_stqr, 0x047);
@@ -290,6 +291,12 @@ EMIT_RI16(spe_brz,       0x040);
 EMIT_RI16(spe_brhnz,     0x046);
 EMIT_RI16(spe_brhz,      0x044);
 
+extern void
+spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset);
+
+extern void
+spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset);
+
 extern void spe_bi(struct spe_function *p, unsigned rA, int d, int e);
 extern void spe_iret(struct spe_function *p, unsigned rA, int d, int e);
 extern void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA,
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 3065869d04..640ebcadbb 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -84,6 +84,8 @@ struct codegen
    /** Index of execution mask register */
    int exec_mask_reg;
 
+   int frame_size;  /**< Stack frame size, in words */
+
    struct spe_function *f;
    boolean error;
 };
@@ -208,7 +210,7 @@ get_src_reg(struct codegen *gen,
             reg = get_itemp(gen);
             reg_is_itemp = TRUE;
             /* Load:  reg = memory[(machine_reg) + offset] */
-            spe_lqd(gen->f, reg, gen->inputs_reg, offset);
+            spe_lqd(gen->f, reg, gen->inputs_reg, offset * 16);
          }
          break;
       case TGSI_FILE_IMMEDIATE:
@@ -221,7 +223,7 @@ get_src_reg(struct codegen *gen,
             reg = get_itemp(gen);
             reg_is_itemp = TRUE;
             /* Load:  reg = memory[(machine_reg) + offset] */
-            spe_lqd(gen->f, reg, gen->constants_reg, offset);
+            spe_lqd(gen->f, reg, gen->constants_reg, offset * 16);
          }
          break;
       default:
@@ -325,6 +327,7 @@ store_dest_reg(struct codegen *gen,
       }
       else {
          /* we're not inside a condition or loop: do nothing special */
+
       }
       break;
    case TGSI_FILE_OUTPUT:
@@ -337,17 +340,17 @@ store_dest_reg(struct codegen *gen,
             /* First read the current value from memory:
              * Load:  curval = memory[(machine_reg) + offset]
              */
-            spe_lqd(gen->f, curval_reg, gen->outputs_reg, offset);
+            spe_lqd(gen->f, curval_reg, gen->outputs_reg, offset * 16);
             /* Mix curval with newvalue according to exec mask:
              * d[i] = mask_reg[i] ? value_reg : d_reg
              */
             spe_selb(gen->f, curval_reg, curval_reg, value_reg, exec_reg);
             /* Store: memory[(machine_reg) + offset] = curval */
-            spe_stqd(gen->f, curval_reg, gen->outputs_reg, offset);
+            spe_stqd(gen->f, curval_reg, gen->outputs_reg, offset * 16);
          }
          else {
             /* Store: memory[(machine_reg) + offset] = reg */
-            spe_stqd(gen->f, value_reg, gen->outputs_reg, offset);
+            spe_stqd(gen->f, value_reg, gen->outputs_reg, offset * 16);
          }
       }
       break;
@@ -357,6 +360,41 @@ store_dest_reg(struct codegen *gen,
 }
 
 
+
+static void
+emit_prologue(struct codegen *gen)
+{
+   gen->frame_size = 256+128; /* XXX temporary */
+
+   spe_comment(gen->f, -4, "Function prologue:");
+
+   /* save $lr on stack     # stqd $lr,16($sp) */
+   spe_stqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
+
+   /* save stack pointer    # stqd $sp,-frameSize($sp) */
+   spe_stqd(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size);
+
+   /* adjust stack pointer  # ai $sp,$sp,-frameSize */
+   spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size);
+}
+
+
+static void
+emit_epilogue(struct codegen *gen)
+{
+   spe_comment(gen->f, -4, "Function epilogue:");
+
+   /* restore stack pointer    # ai $sp,$sp,frameSize */
+   spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, gen->frame_size);
+
+   /* restore $lr              # lqd $lr,16($sp) */
+   spe_lqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
+
+   /* return from function call */
+   spe_bi(gen->f, SPE_REG_RA, 0, 0);
+}
+
+
 static boolean
 emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
@@ -588,6 +626,7 @@ emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst)
    int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
    int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
    int tmp_reg = get_itemp(gen);
+
    /* t = x0 * x1 */
    spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
 
@@ -603,7 +642,9 @@ emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst)
 
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         spe_move(gen->f, d_reg, tmp_reg);
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
       }
    }
 
@@ -623,6 +664,7 @@ emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst)
    int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
    int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
    int tmp_reg = get_itemp(gen);
+
    /* t = x0 * x1 */
    spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
 
@@ -643,6 +685,8 @@ emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst)
 
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         spe_move(gen->f, d_reg, tmp_reg);
          store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]);
       }
    }
@@ -683,6 +727,8 @@ emit_DPH(struct codegen *gen, const struct tgsi_full_instruction *inst)
 
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         spe_move(gen->f, d_reg, tmp_reg);
          store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]);
       }
    }
@@ -1112,9 +1158,6 @@ emit_function_call(struct codegen *gen,
    uint addr;
    int ch;
 
-   /* XXX temporary value */
-   const int frameSize = 64; /* stack frame (activation record) size */
-
    assert(num_args <= 3);
 
    /* lookup function address */
@@ -1136,48 +1179,45 @@ emit_function_call(struct codegen *gen,
 
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
-         int s_regs[3];
-         uint a;
+         int s_regs[3], d_reg;
+         ubyte usedRegs[SPE_NUM_REGS];
+         uint a, i, numUsed;
+
          for (a = 0; a < num_args; a++) {
             s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]);
          }
+         d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
 
-         /* Basically:
-          * save registers on stack
-          * move parameters to registers 3, 4, 5...
-          * call function
-          * save return value (reg 3)
-          * restore registers from stack
-          */
+         numUsed = spe_get_registers_used(gen->f, usedRegs);
+         assert(numUsed < gen->frame_size / 16 - 32);
 
-         /* XXX hack: load first function param */
-         spe_move(gen->f, 3, s_regs[0]);
-
-         /* save $lr on stack     # stqd $lr,16($sp) */
-         spe_stqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
-         /* save stack pointer    # stqd $sp,-frameSize($sp) */
-         spe_stqd(gen->f, SPE_REG_SP, SPE_REG_SP, -frameSize);
-
-         /* XXX save registers to stack here */
+         /* save registers to stack */
+         for (i = 0; i < numUsed; i++) {
+            uint reg = usedRegs[i];
+            int offset = 2 + i;
+            spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+         }
 
-         /* adjust stack pointer  # ai $sp,$sp,-frameSize */
-         spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, -frameSize);
+         /* setup function arguments */
+         for (a = 0; a < num_args; a++) {
+            spe_move(gen->f, 3 + a, s_regs[a]);
+         }
 
          /* branch to function, save return addr */
          spe_brasl(gen->f, SPE_REG_RA, addr);
 
-         /* restore stack pointer # ai $sp,$sp,frameSize */
-         spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, frameSize);
-
-         /* XXX restore registers from stack here */
-
-         /* restore $lr           # lqd $lr,16($sp) */
-         spe_lqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
-
-         /* XXX hack: save function's return value */
+         /* save function's return value */
          spe_move(gen->f, d_reg, 3);
 
+         /* restore registers from stack */
+         for (i = 0; i < numUsed; i++) {
+            uint reg = usedRegs[i];
+            if (reg != d_reg) {
+               int offset = 2 + i;
+               spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+            }
+         }
+
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
       }
@@ -1202,10 +1242,11 @@ emit_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst)
          int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
          int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
          int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         int tmp_reg = get_itemp(gen);
 
          /* d = (s1 > s2) ? s1 : s2 */
-         spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
-         spe_selb(gen->f, d_reg, s2_reg, s1_reg, d_reg);
+         spe_fcgt(gen->f, tmp_reg, s1_reg, s2_reg);
+         spe_selb(gen->f, d_reg, s2_reg, s1_reg, tmp_reg);
 
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
@@ -1230,10 +1271,11 @@ emit_MIN(struct codegen *gen, const struct tgsi_full_instruction *inst)
          int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
          int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
          int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         int tmp_reg = get_itemp(gen);
 
          /* d = (s2 > s1) ? s1 : s2 */
-         spe_fcgt(gen->f, d_reg, s2_reg, s1_reg);
-         spe_selb(gen->f, d_reg, s2_reg, s1_reg, d_reg);
+         spe_fcgt(gen->f, tmp_reg, s2_reg, s1_reg);
+         spe_selb(gen->f, d_reg, s2_reg, s1_reg, tmp_reg);
 
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
@@ -1346,8 +1388,7 @@ static boolean
 emit_END(struct codegen *gen)
 {
    spe_comment(gen->f, -4, "END:");
-   /* return from function call */
-   spe_bi(gen->f, SPE_REG_RA, 0, 0);
+   emit_epilogue(gen);
    return true;
 }
 
@@ -1420,6 +1461,10 @@ emit_instruction(struct codegen *gen,
       return emit_function_call(gen, inst, "spu_sin", 1);
    case TGSI_OPCODE_POW:
       return emit_function_call(gen, inst, "spu_pow", 2);
+   case TGSI_OPCODE_EXPBASE2:
+      return emit_function_call(gen, inst, "spu_exp2", 1);
+   case TGSI_OPCODE_LOGBASE2:
+      return emit_function_call(gen, inst, "spu_log2", 1);
 
    case TGSI_OPCODE_IF:
       return emit_IF(gen, inst);
@@ -1532,6 +1577,7 @@ emit_declaration(struct cell_context *cell,
 }
 
 
+
 /**
  * Translate TGSI shader code to SPE instructions.  This is done when
  * the state tracker gives us a new shader (via pipe->create_fs_state()).
@@ -1571,12 +1617,14 @@ cell_gen_fragment_program(struct cell_context *cell,
 
    tgsi_parse_init(&parse, tokens);
 
+   emit_prologue(&gen);
+
    while (!tgsi_parse_end_of_tokens(&parse) && !gen.error) {
       tgsi_parse_token(&parse);
 
       switch (parse.FullToken.Token.Type) {
       case TGSI_TOKEN_TYPE_IMMEDIATE:
-         if (!emit_immediate(&gen,  &parse.FullToken.FullImmediate))
+         if (!emit_immediate(&gen, &parse.FullToken.FullImmediate))
             gen.error = true;
          break;
 
@@ -1595,7 +1643,6 @@ cell_gen_fragment_program(struct cell_context *cell,
       }
    }
 
-
    if (gen.error) {
       /* terminate the SPE code */
       return emit_END(&gen);
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
index 566df7f59e..18969005b0 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
@@ -73,8 +73,8 @@ emit_matrix_transpose(struct spe_function *p,
    int col3;
 
 
-   spe_lqd(p, shuf_hi, shuf_ptr, 3);
-   spe_lqd(p, shuf_lo, shuf_ptr, 4);
+   spe_lqd(p, shuf_hi, shuf_ptr, 3*16);
+   spe_lqd(p, shuf_lo, shuf_ptr, 4*16);
    spe_shufb(p, t1, row0, row2, shuf_hi);
    spe_shufb(p, t2, row0, row2, shuf_lo);
 
@@ -122,13 +122,13 @@ emit_matrix_transpose(struct spe_function *p,
     */
    switch (count) {
    case 4:
-      spe_stqd(p, col3, dest_ptr, 3);
+      spe_stqd(p, col3, dest_ptr, 3 * 16);
    case 3:
-      spe_stqd(p, col2, dest_ptr, 2);
+      spe_stqd(p, col2, dest_ptr, 2 * 16);
    case 2:
-      spe_stqd(p, col1, dest_ptr, 1);
+      spe_stqd(p, col1, dest_ptr, 1 * 16);
    case 1:
-      spe_stqd(p, col0, dest_ptr, 0);
+      spe_stqd(p, col0, dest_ptr, 0 * 16);
    }
 
 
@@ -166,17 +166,17 @@ emit_fetch(struct spe_function *p,
    float scale_signed = 0.0;
    float scale_unsigned = 0.0;
 
-   spe_lqd(p, v0, in_ptr, 0 + offset[0]);
-   spe_lqd(p, v1, in_ptr, 1 + offset[0]);
-   spe_lqd(p, v2, in_ptr, 2 + offset[0]);
-   spe_lqd(p, v3, in_ptr, 3 + offset[0]);
+   spe_lqd(p, v0, in_ptr, (0 + offset[0]) * 16);
+   spe_lqd(p, v1, in_ptr, (1 + offset[0]) * 16);
+   spe_lqd(p, v2, in_ptr, (2 + offset[0]) * 16);
+   spe_lqd(p, v3, in_ptr, (3 + offset[0]) * 16);
    offset[0] += 4;
    
    switch (bytes) {
    case 1:
       scale_signed = 1.0f / 127.0f;
       scale_unsigned = 1.0f / 255.0f;
-      spe_lqd(p, tmp, shuf_ptr, 1);
+      spe_lqd(p, tmp, shuf_ptr, 1 * 16);
       spe_shufb(p, v0, v0, v0, tmp);
       spe_shufb(p, v1, v1, v1, tmp);
       spe_shufb(p, v2, v2, v2, tmp);
@@ -185,7 +185,7 @@ emit_fetch(struct spe_function *p,
    case 2:
       scale_signed = 1.0f / 32767.0f;
       scale_unsigned = 1.0f / 65535.0f;
-      spe_lqd(p, tmp, shuf_ptr, 2);
+      spe_lqd(p, tmp, shuf_ptr, 2 * 16);
       spe_shufb(p, v0, v0, v0, tmp);
       spe_shufb(p, v1, v1, v1, tmp);
       spe_shufb(p, v2, v2, v2, tmp);
@@ -241,11 +241,11 @@ emit_fetch(struct spe_function *p,
 
    switch (count) {
    case 1:
-      spe_stqd(p, float_zero, out_ptr, 1);
+      spe_stqd(p, float_zero, out_ptr, 1 * 16);
    case 2:
-      spe_stqd(p, float_zero, out_ptr, 2);
+      spe_stqd(p, float_zero, out_ptr, 2 * 16);
    case 3:
-      spe_stqd(p, float_one, out_ptr, 3);
+      spe_stqd(p, float_one, out_ptr, 3 * 16);
    }
 
    if (float_zero != -1) {
-- 
cgit v1.2.3


From 9aec1288eeae8e87adc9a99f377be536892941b2 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Wed, 8 Oct 2008 23:34:38 -0700
Subject: i915: Accelerate depth textures with border color.

The fallback was introduced to fix bug #16697, but made the test it was
fixing run excessively long.
---
 src/mesa/drivers/dri/i915/i915_texstate.c       | 19 +++++++++++++++----
 src/mesa/drivers/dri/intel/intel_tex_validate.c |  5 +----
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/src/mesa/drivers/dri/i915/i915_texstate.c b/src/mesa/drivers/dri/i915/i915_texstate.c
index ae42b102db..d1b0dcdf31 100644
--- a/src/mesa/drivers/dri/i915/i915_texstate.c
+++ b/src/mesa/drivers/dri/i915/i915_texstate.c
@@ -307,10 +307,21 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
    }
 
 
-   state[I915_TEXREG_SS4] = INTEL_PACKCOLOR8888(tObj->_BorderChan[0],
-                                                tObj->_BorderChan[1],
-                                                tObj->_BorderChan[2],
-                                                tObj->_BorderChan[3]);
+   if (firstImage->_BaseFormat == GL_DEPTH_COMPONENT) {
+      /* GL specs that border color for depth textures is taken from the
+       * R channel, while the hardware uses A.  Spam R into all the channels
+       * for safety.
+       */
+      state[I915_TEXREG_SS4] = INTEL_PACKCOLOR8888(tObj->_BorderChan[0],
+						   tObj->_BorderChan[0],
+						   tObj->_BorderChan[0],
+						   tObj->_BorderChan[0]);
+   } else {
+      state[I915_TEXREG_SS4] = INTEL_PACKCOLOR8888(tObj->_BorderChan[0],
+						   tObj->_BorderChan[1],
+						   tObj->_BorderChan[2],
+						   tObj->_BorderChan[3]);
+   }
 
 
    I915_ACTIVESTATE(i915, I915_UPLOAD_TEX(unit), GL_TRUE);
diff --git a/src/mesa/drivers/dri/intel/intel_tex_validate.c b/src/mesa/drivers/dri/intel/intel_tex_validate.c
index 3dae738ac2..820683d42e 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_validate.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_validate.c
@@ -141,10 +141,7 @@ intel_finalize_mipmap_tree(struct intel_context *intel, GLuint unit)
 
    /* Fallback case:
     */
-   if (firstImage->base.Border ||
-       ((firstImage->base._BaseFormat == GL_DEPTH_COMPONENT) &&
-        ((tObj->WrapS == GL_CLAMP_TO_BORDER) ||
-         (tObj->WrapT == GL_CLAMP_TO_BORDER)))) {
+   if (firstImage->base.Border) {
       if (intelObj->mt) {
          intel_miptree_release(intel, &intelObj->mt);
       }
-- 
cgit v1.2.3


From 91221483a633d6230a4f8d2500ed180428754215 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Thu, 9 Oct 2008 10:23:47 -0700
Subject: i965: Actually hook up the accelerated DrawPixels support.

---
 src/mesa/drivers/dri/i965/Makefile            | 3 ++-
 src/mesa/drivers/dri/intel/intel_pixel.c      | 2 +-
 src/mesa/drivers/dri/intel/intel_pixel_draw.c | 1 -
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/Makefile b/src/mesa/drivers/dri/i965/Makefile
index 7bc16a02c8..005460f354 100644
--- a/src/mesa/drivers/dri/i965/Makefile
+++ b/src/mesa/drivers/dri/i965/Makefile
@@ -18,8 +18,9 @@ DRIVER_SOURCES = \
 	intel_screen.c \
 	intel_span.c \
 	intel_pixel.c \
-	intel_pixel_copy.c \
 	intel_pixel_bitmap.c \
+	intel_pixel_copy.c \
+	intel_pixel_draw.c \
 	intel_state.c \
 	intel_tex.c \
 	intel_tex_copy.c \
diff --git a/src/mesa/drivers/dri/intel/intel_pixel.c b/src/mesa/drivers/dri/intel/intel_pixel.c
index f39fac13cf..b267ffd890 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel.c
@@ -181,9 +181,9 @@ intelInitPixelFuncs(struct dd_function_table *functions)
    if (!getenv("INTEL_NO_BLIT")) {
       functions->Bitmap = intelBitmap;
       functions->CopyPixels = intelCopyPixels;
+      functions->DrawPixels = intelDrawPixels;
 #ifdef I915
       functions->ReadPixels = intelReadPixels;
-      functions->DrawPixels = intelDrawPixels;
 #endif
    }
 }
diff --git a/src/mesa/drivers/dri/intel/intel_pixel_draw.c b/src/mesa/drivers/dri/intel/intel_pixel_draw.c
index be213e7b96..b60dad7460 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel_draw.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel_draw.c
@@ -51,7 +51,6 @@
 #include "intel_regions.h"
 #include "intel_pixel.h"
 #include "intel_buffer_objects.h"
-#include "intel_tris.h"
 
 
 static GLboolean
-- 
cgit v1.2.3


From 7216679c1998b49ff5b08e6b43f8d5779415bf54 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Thu, 9 Oct 2008 11:45:58 -0700
Subject: i965: Accelerate depth textures with border color.

The fallback was introduced to fix bug #16697, but made the test it was
fixing run excessively long.
---
 src/mesa/drivers/dri/i965/brw_fallback.c         |  5 +----
 src/mesa/drivers/dri/i965/brw_wm_sampler_state.c | 21 +++++++++++++++++++--
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fallback.c b/src/mesa/drivers/dri/i965/brw_fallback.c
index 2f6b7febbd..4ea660a51a 100644
--- a/src/mesa/drivers/dri/i965/brw_fallback.c
+++ b/src/mesa/drivers/dri/i965/brw_fallback.c
@@ -74,10 +74,7 @@ static GLboolean do_check_fallback(struct brw_context *brw)
       if (texUnit->_ReallyEnabled) {
 	 struct intel_texture_object *intelObj = intel_texture_object(texUnit->_Current);
 	 struct gl_texture_image *texImage = intelObj->base.Image[0][intelObj->firstLevel];
-	 if (texImage->Border ||
-         ((texImage->_BaseFormat == GL_DEPTH_COMPONENT) &&
-          ((texImage->TexObject->WrapS == GL_CLAMP_TO_BORDER) || 
-           (texImage->TexObject->WrapT == GL_CLAMP_TO_BORDER)))) {
+	 if (texImage->Border) {
 	    DBG("FALLBACK: texture border\n");
 	    return GL_TRUE;
 	 }
diff --git a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
index 977b90ad43..f12ef47a7d 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
@@ -229,6 +229,9 @@ brw_wm_sampler_populate_key(struct brw_context *brw,
 	 struct wm_sampler_entry *entry = &key->sampler[unit];
 	 struct gl_texture_unit *texUnit = &brw->attribs.Texture->Unit[unit];
 	 struct gl_texture_object *texObj = texUnit->_Current;
+	 struct intel_texture_object *intelObj = intel_texture_object(texObj);
+	 struct gl_texture_image *firstImage =
+	    texObj->Image[0][intelObj->firstLevel];
 
 	 entry->wrap_r = texObj->WrapR;
 	 entry->wrap_s = texObj->WrapS;
@@ -244,8 +247,22 @@ brw_wm_sampler_populate_key(struct brw_context *brw,
     entry->comparefunc = texObj->CompareFunc;
 
 	 dri_bo_unreference(brw->wm.sdc_bo[unit]);
-	 brw->wm.sdc_bo[unit] = upload_default_color(brw, texObj->BorderColor);
-
+	 if (firstImage->_BaseFormat == GL_DEPTH_COMPONENT) {
+	    float bordercolor[4] = {
+	       texObj->BorderColor[0],
+	       texObj->BorderColor[0],
+	       texObj->BorderColor[0],
+	       texObj->BorderColor[0]
+	    };
+	    /* GL specs that border color for depth textures is taken from the
+	     * R channel, while the hardware uses A.  Spam R into all the
+	     * channels for safety.
+	     */
+	    brw->wm.sdc_bo[unit] = upload_default_color(brw, bordercolor);
+	 } else {
+	    brw->wm.sdc_bo[unit] = upload_default_color(brw,
+							texObj->BorderColor);
+	 }
 	 key->sampler_count = unit + 1;
       }
    }
-- 
cgit v1.2.3


From db9de99925ee7d16ef2e99d41510e7231aa25366 Mon Sep 17 00:00:00 2001
From: Stephane Marchesin <marchesin@icps.u-strasbg.fr>
Date: Thu, 9 Oct 2008 23:32:01 +0200
Subject: Gallivm: cleanup soa storage.

---
 src/gallium/auxiliary/gallivm/soabuiltins.c  |  1 -
 src/gallium/auxiliary/gallivm/storagesoa.cpp | 45 ++++++++--------------------
 2 files changed, 12 insertions(+), 34 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/soabuiltins.c b/src/gallium/auxiliary/gallivm/soabuiltins.c
index cb85e1734e..b20f3c4963 100644
--- a/src/gallium/auxiliary/gallivm/soabuiltins.c
+++ b/src/gallium/auxiliary/gallivm/soabuiltins.c
@@ -167,7 +167,6 @@ void min(float4 *res,
    res[3] = minvec(tmp0w, tmp1w);
 }
 
-
 void max(float4 *res,
          float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
          float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
diff --git a/src/gallium/auxiliary/gallivm/storagesoa.cpp b/src/gallium/auxiliary/gallivm/storagesoa.cpp
index d4ecf97c36..4fc075cf6d 100644
--- a/src/gallium/auxiliary/gallivm/storagesoa.cpp
+++ b/src/gallium/auxiliary/gallivm/storagesoa.cpp
@@ -91,29 +91,19 @@ void StorageSoa::declareImmediates()
    for (unsigned int i = 0; i < m_immediatesToFlush.size(); ++i) {
       std::vector<float> vec = m_immediatesToFlush[i];
       std::vector<float> vals(4);
-      float val;
       std::vector<Constant*> channelArray;
 
-      val = vec[0];
-      llvm::Constant *xChannel = createConstGlobalFloat(val);
-      val = vec[1];
-      llvm::Constant *yChannel = createConstGlobalFloat(val);
-      val = vec[2];
-      llvm::Constant *zChannel = createConstGlobalFloat(val);
-      val = vec[3];
-      llvm::Constant *wChannel = createConstGlobalFloat(val);
+      vals[0] = vec[0]; vals[1] = vec[1]; vals[2] = vec[2]; vals[3] = vec[3];
+      llvm::Constant *xChannel = createConstGlobalVector(vals);
 
-//      vals[0] = vec[0]; vals[1] = vec[1]; vals[2] = vec[2]; vals[3] = vec[3];
-//      llvm::Constant *xChannel = createConstGlobalVector(vec[0]);
-
-/*      vals[0] = vec[1]; vals[1] = vec[1]; vals[2] = vec[1]; vals[3] = vec[1];
+      vals[0] = vec[1]; vals[1] = vec[1]; vals[2] = vec[1]; vals[3] = vec[1];
       llvm::Constant *yChannel = createConstGlobalVector(vals);
 
       vals[0] = vec[2]; vals[1] = vec[2]; vals[2] = vec[2]; vals[3] = vec[2];
       llvm::Constant *zChannel = createConstGlobalVector(vals);
 
       vals[0] = vec[3]; vals[1] = vec[3]; vals[2] = vec[3]; vals[3] = vec[3];
-      llvm::Constant *wChannel = createConstGlobalVector(vals);*/
+      llvm::Constant *wChannel = createConstGlobalVector(vals);
       channelArray.push_back(xChannel);
       channelArray.push_back(yChannel);
       channelArray.push_back(zChannel);
@@ -177,29 +167,18 @@ llvm::Value* StorageSoa::unpackConstElement(llvm::IRBuilder<>* m_builder, llvm::
 
 std::vector<llvm::Value*> StorageSoa::constElement(llvm::IRBuilder<>* m_builder, llvm::Value *idx)
 {
-   std::vector<llvm::Value*> res(4);
+   llvm::Value* res;
    std::vector<llvm::Value*> res2(4);
    llvm::Value *xChannel, *yChannel, *zChannel, *wChannel;
 
    xChannel = elementPointer(m_consts, idx, 0);
-/*   yChannel = elementPointer(m_consts, idx, 1);
-   zChannel = elementPointer(m_consts, idx, 2);
-   wChannel = elementPointer(m_consts, idx, 3);*/
-
-   res[0] = alignedArrayLoad(xChannel);
-/* res[1] = alignedArrayLoad(xChannel);
-   res[2] = alignedArrayLoad(xChannel);
-   res[3] = alignedArrayLoad(xChannel);*/
-
-
-   res2[0]=unpackConstElement(m_builder, res[0],0);
-   res2[1]=unpackConstElement(m_builder, res[0],1);
-   res2[2]=unpackConstElement(m_builder, res[0],2);
-   res2[3]=unpackConstElement(m_builder, res[0],3);
-/*res[0] = alignedArrayLoad(xChannel);
-   res[1] = alignedArrayLoad(yChannel);
-   res[2] = alignedArrayLoad(zChannel);
-   res[3] = alignedArrayLoad(wChannel);*/
+
+   res = alignedArrayLoad(xChannel);
+
+   res2[0]=unpackConstElement(m_builder, res,0);
+   res2[1]=unpackConstElement(m_builder, res,1);
+   res2[2]=unpackConstElement(m_builder, res,2);
+   res2[3]=unpackConstElement(m_builder, res,3);
 
    return res2;
 }
-- 
cgit v1.2.3


From f7556fdd40ed2719beaba271eee4a7551e212ad1 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 9 Oct 2008 16:39:59 -0600
Subject: mesa: rasterizer state depends on ST_NEW_VERTEX_PROGRAM

Check for per-vertex point size must be done when vertex program changes.
---
 src/mesa/state_tracker/st_atom_rasterizer.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/mesa/state_tracker/st_atom_rasterizer.c b/src/mesa/state_tracker/st_atom_rasterizer.c
index fc47896c24..5eef4ebe92 100644
--- a/src/mesa/state_tracker/st_atom_rasterizer.c
+++ b/src/mesa/state_tracker/st_atom_rasterizer.c
@@ -215,6 +215,9 @@ static void update_raster_state( struct st_context *st )
          raster->sprite_coord_mode[i] = PIPE_SPRITE_COORD_NONE;
       }
    }
+
+   /* ST_NEW_VERTEX_PROGRAM
+    */
    if (vertProg) {
       if (vertProg->Base.Id == 0) {
          if (vertProg->Base.OutputsWritten & (1 << VERT_RESULT_PSIZ)) {
@@ -277,7 +280,7 @@ const struct st_tracked_state st_update_rasterizer = {
        _NEW_POLYGON |
        _NEW_PROGRAM |
        _NEW_SCISSOR),      /* mesa state dependencies*/
-      0,                   /* state tracker dependencies */
+      ST_NEW_VERTEX_PROGRAM,  /* state tracker dependencies */
    },
    update_raster_state     /* update function */
 };
-- 
cgit v1.2.3


From ca5224945ae11d3c2e80fd39b7e08464d019bbdd Mon Sep 17 00:00:00 2001
From: Alan Hourihane <alanh@tungstengraphics.com>
Date: Fri, 10 Oct 2008 01:31:34 +0100
Subject: gallium: silence warning

---
 src/gallium/auxiliary/draw/draw_private.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index 626a2e3e30..40a72c9dcf 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -193,7 +193,7 @@ struct draw_context
 
       const float (*aligned_constants)[4];
 
-      float (*aligned_constant_storage)[4];
+      const float (*aligned_constant_storage)[4];
       unsigned const_storage_size;
 
 
-- 
cgit v1.2.3


From 2a3fa97be3d10a6d4e36c6d232afb884efd69d55 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 9 Oct 2008 08:24:03 -0600
Subject: cell: more accurate comments

---
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index f920ae13b4..de170d1036 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -1841,7 +1841,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
 
       ASSERT(TILE_SIZE == 32);
 
-      spe_comment(f, 0, "Computing tile location in memory");
+      spe_comment(f, 0, "Compute quad offset within tile");
       spe_rotmi(f, y2_reg, y_reg, -1);  /* y2 = y / 2 */
       spe_rotmi(f, x2_reg, x_reg, -1);  /* x2 = x / 2 */
       spe_shli(f, y2_reg, y2_reg, 4);   /* y2 *= 16 */
@@ -1869,7 +1869,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       boolean fbS_reg_set = false, fbZ_reg_set = false;
       unsigned int fbS_reg, fbZ_reg = 0;
 
-      spe_comment(f, 0, "Loading Z/stencil tile");
+      spe_comment(f, 0, "Fetch quad's Z/stencil values from tile");
 
       /* fetch quad of depth/stencil values from tile at (x,y) */
       /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
@@ -1993,7 +1993,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
           * fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
           * fbS_reg has four 8-bit Z values in bits [7..0].
           */
-         spe_comment(f, 0, "Storing depth/stencil values");
+         spe_comment(f, 0, "Store quad's depth/stencil values in tile");
          if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
              zs_format == PIPE_FORMAT_X8Z24_UNORM) {
             if (fbS_reg_set) {
@@ -2038,6 +2038,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
     * Note: if mask={~0,~0,~0,~0} and we're not blending or colormasking
     * we could skip this load.
     */
+   spe_comment(f, 0, "Fetch quad colors from tile");
    spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg);
 
    if (blend->blend_enable) {
@@ -2055,7 +2056,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       int rgba_reg = spe_allocate_available_register(f);
 
       /* Pack four float colors as four 32-bit int colors */
-      spe_comment(f, 0, "Convert fragment colors to framebuffer colors");
+      spe_comment(f, 0, "Convert float quad colors to packed int framebuffer colors");
       gen_pack_colors(f, color_format,
                       fragR_reg, fragG_reg, fragB_reg, fragA_reg,
                       rgba_reg);
@@ -2081,7 +2082,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       /* Store updated quad in tile:
        * memory[color_tile + quad_offset] = rgba_reg;
        */
-      spe_comment(f, 0, "Store framebuffer colors");
+      spe_comment(f, 0, "Store quad colors into color tile");
       spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg);
 
       spe_release_register(f, rgba_reg);
-- 
cgit v1.2.3


From b9689791ddd1030f7cd25af21701f56d89e0f3b0 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 9 Oct 2008 08:52:31 -0600
Subject: cell: massage the emit functions to get better instruction scheduling

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 130 ++++++++++++++++-------------
 1 file changed, 74 insertions(+), 56 deletions(-)

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 640ebcadbb..e6d994205c 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -398,15 +398,19 @@ emit_epilogue(struct codegen *gen)
 static boolean
 emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
-   int ch;
+   int ch, src_reg[4], dst_reg[4];
    spe_comment(gen->f, -4, "MOV:");
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         int src_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
-         int dst_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         src_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         dst_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
          /* XXX we don't always need to actually emit a mov instruction here */
-         spe_move(gen->f, dst_reg, src_reg);
-         store_dest_reg(gen, dst_reg, ch, &inst->FullDstRegisters[0]);
+         spe_move(gen->f, dst_reg[ch], src_reg[ch]);
+         store_dest_reg(gen, dst_reg[ch], ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
       }
    }
@@ -421,22 +425,25 @@ emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
 static boolean
 emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
-   int ch;
+   int ch, s1_reg[4], s2_reg[4], d_reg[4];
+
    spe_comment(gen->f, -4, "ADD:");
-   /* Loop over Red/Green/Blue/Alpha channels */
+   /* Loop over Red/Green/Blue/Alpha channels, fetch src operands */
    for (ch = 0; ch < 4; ch++) {
       /* If the dest R, G, B or A writemask is enabled... */
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         /* get indexes of the two src, one dest SPE registers */
-         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
-         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
-         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
-
+         s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+   /* Loop over Red/Green/Blue/Alpha channels, do the add, store results */
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
          /* Emit actual SPE instruction: d = s1 + s2 */
-         spe_fa(gen->f, d_reg, s1_reg, s2_reg);
-
+         spe_fa(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
          /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */
-         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
          /* Free any intermediate temps we allocated */
          free_itemps(gen);
       }
@@ -450,23 +457,20 @@ emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst)
 static boolean
 emit_SUB(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
-   int ch;
+   int ch, s1_reg[4], s2_reg[4], d_reg[4];
    spe_comment(gen->f, -4, "SUB:");
-   /* Loop over Red/Green/Blue/Alpha channels */
    for (ch = 0; ch < 4; ch++) {
-      /* If the dest R, G, B or A writemask is enabled... */
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         /* get indexes of the two src, one dest SPE registers */
-         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
-         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
-         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
-
-         /* Emit actual SPE instruction: d = s1 - s2 */
-         spe_fs(gen->f, d_reg, s1_reg, s2_reg);
-
-         /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */
-         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
-         /* Free any intermediate temps we allocated */
+         s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         /* Emit actual SPE instruction: d = s1 - s2 */
+         spe_fs(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
+         store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
       }
    }
@@ -479,17 +483,21 @@ emit_SUB(struct codegen *gen, const struct tgsi_full_instruction *inst)
 static boolean
 emit_MAD(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
-   int ch;
+   int ch, s1_reg[4], s2_reg[4], s3_reg[4], d_reg[4];
    spe_comment(gen->f, -4, "MAD:");
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
-         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
-         int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
-         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         s3_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
+         d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
          /* d = s1 * s2 + s3 */
-         spe_fma(gen->f, d_reg, s1_reg, s2_reg, s3_reg);
-         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         spe_fma(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch], s3_reg[ch]);
+         store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
       }
    }
@@ -527,16 +535,20 @@ emit_LERP(struct codegen *gen, const struct tgsi_full_instruction *inst)
 static boolean
 emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
-   int ch;
+   int ch, s1_reg[4], s2_reg[4], d_reg[4];
    spe_comment(gen->f, -4, "MUL:");
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
-         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
-         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
          /* d = s1 * s2 */
-         spe_fm(gen->f, d_reg, s1_reg, s2_reg);
-         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         spe_fm(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
+         store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
       }
    }
@@ -621,29 +633,35 @@ static boolean
 emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
    int ch;
+   int s1x_reg, s1y_reg, s1z_reg;
+   int s2x_reg, s2y_reg, s2z_reg;
+   int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen);
+
    spe_comment(gen->f, -4, "DP3:");
 
-   int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
-   int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
-   int tmp_reg = get_itemp(gen);
+   s1x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+   s2x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+   s1y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+   s2y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+   s1z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+   s2z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
 
-   /* t = x0 * x1 */
-   spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
+   /* t0 = x0 * x1 */
+   spe_fm(gen->f, t0_reg, s1x_reg, s2x_reg);
 
-   s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
-   s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
-   /* t = y0 * y1 + t */
-   spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+   /* t1 = y0 * y1 */
+   spe_fm(gen->f, t1_reg, s1y_reg, s2y_reg);
 
-   s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
-   s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
-   /* t = z0 * z1 + t */
-   spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+   /* t0 = z0 * z1 + t0 */
+   spe_fma(gen->f, t0_reg, s1z_reg, s2z_reg, t0_reg);
+
+   /* t0 = t0 + t1 */
+   spe_fa(gen->f, t0_reg, t0_reg, t1_reg);
 
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
          int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
-         spe_move(gen->f, d_reg, tmp_reg);
+         spe_move(gen->f, d_reg, t0_reg);
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
       }
    }
-- 
cgit v1.2.3


From c201e357eb95f9b18b8d9b8a534ae2594a176904 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 9 Oct 2008 10:56:25 -0600
Subject: cell: better immediate value allocation, better comments

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 39 +++++++++++++++---------------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index e6d994205c..5647bb23e6 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -1526,16 +1526,23 @@ emit_immediate(struct codegen *gen, const struct tgsi_full_immediate *immed)
 
    for (ch = 0; ch < 4; ch++) {
       float val = immed->u.ImmediateFloat32[ch].Float;
-      int reg = spe_allocate_available_register(gen->f);
 
-      if (reg < 0)
-         return false;
+      if (ch > 0 && val == immed->u.ImmediateFloat32[ch - 1].Float) {
+         /* re-use previous register */
+         gen->imm_regs[gen->num_imm][ch] = gen->imm_regs[gen->num_imm][ch - 1];
+      }
+      else {
+         int reg = spe_allocate_available_register(gen->f);
 
-      /* update immediate map */
-      gen->imm_regs[gen->num_imm][ch] = reg;
+         if (reg < 0)
+            return false;
 
-      /* emit initializer instruction */
-      spe_load_float(gen->f, reg, val);
+         /* update immediate map */
+         gen->imm_regs[gen->num_imm][ch] = reg;
+
+         /* emit initializer instruction */
+         spe_load_float(gen->f, reg, val);
+      }
    }
 
    gen->num_imm++;
@@ -1558,12 +1565,6 @@ emit_declaration(struct cell_context *cell,
 
    switch (decl->Declaration.File) {
    case TGSI_FILE_TEMPORARY:
-      if (cell->debug_flags & CELL_DEBUG_ASM) {
-         printf("Declare temp reg %d .. %d\n",
-                decl->DeclarationRange.First,
-                decl->DeclarationRange.Last);
-      }
-
       for (i = decl->DeclarationRange.First;
            i <= decl->DeclarationRange.Last;
            i++) {
@@ -1578,12 +1579,12 @@ emit_declaration(struct cell_context *cell,
           * to SPU memory.  someday...
           */
 
-         if (cell->debug_flags & CELL_DEBUG_ASM) {
-            printf("  SPE regs: %d %d %d %d\n",
-                   gen->temp_regs[i][0],
-                   gen->temp_regs[i][1],
-                   gen->temp_regs[i][2],
-                   gen->temp_regs[i][3]);
+         {
+            char buf[100];
+            sprintf(buf, "TGSI temp[%d] maps to SPU regs [$%d $%d $%d $%d]", i,
+                    gen->temp_regs[i][0], gen->temp_regs[i][1],
+                    gen->temp_regs[i][2], gen->temp_regs[i][3]);
+            spe_comment(gen->f, -4, buf);
          }
       }
       break;
-- 
cgit v1.2.3


From f6e806a2b8c3e54ac694810616e79924dfd84826 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 9 Oct 2008 19:45:03 -0600
Subject: mesa: simple multiple textures test

---
 progs/demos/Makefile   |   1 +
 progs/demos/textures.c | 304 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 305 insertions(+)
 create mode 100644 progs/demos/textures.c

diff --git a/progs/demos/Makefile b/progs/demos/Makefile
index 123d1e59e9..467dbc5cdf 100644
--- a/progs/demos/Makefile
+++ b/progs/demos/Makefile
@@ -66,6 +66,7 @@ PROGS = \
 	texdown \
 	texenv \
 	texobj \
+	textures \
 	trispd \
 	tunnel \
 	tunnel2 \
diff --git a/progs/demos/textures.c b/progs/demos/textures.c
new file mode 100644
index 0000000000..9f11604635
--- /dev/null
+++ b/progs/demos/textures.c
@@ -0,0 +1,304 @@
+/*
+ * Simple test of multiple textures
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <GL/glut.h>
+#include "readtex.h"
+
+#define MAX_TEXTURES 8
+
+
+static int Win;
+static GLfloat Xrot = 0, Yrot = 0, Zrot = 0;
+static GLboolean Anim = GL_TRUE;
+static GLboolean Blend = GL_FALSE;
+static GLboolean MipMap = 1+GL_FALSE;
+
+static GLuint NumTextures;
+static GLuint Textures[MAX_TEXTURES];
+static float TexRot[MAX_TEXTURES][3];
+static float TexPos[MAX_TEXTURES][3];
+static float TexAspect[MAX_TEXTURES];
+
+static const char *DefaultFiles[] = {
+   "../images/arch.rgb",
+   "../images/reflect.rgb",
+   "../images/tree2.rgba",
+   "../images/tile.rgb"
+};
+
+
+static void
+Idle(void)
+{
+   Xrot = glutGet(GLUT_ELAPSED_TIME) * 0.02;
+   Yrot = glutGet(GLUT_ELAPSED_TIME) * 0.04;
+   //Zrot += 2.0;
+   glutPostRedisplay();
+}
+
+
+static void
+DrawTextures(void)
+{
+   GLuint i;
+
+   for (i = 0; i < NumTextures; i++) {
+      GLfloat ar = TexAspect[i];
+
+      glPushMatrix();
+      glTranslatef(TexPos[i][0], TexPos[i][1], TexPos[i][2]);
+      glRotatef(TexRot[i][0], 1, 0, 0);
+      glRotatef(TexRot[i][1], 0, 1, 0);
+      glRotatef(TexRot[i][2], 0, 0, 1);
+
+      glBindTexture(GL_TEXTURE_2D, Textures[i]);
+      glBegin(GL_POLYGON);
+      glTexCoord2f( 0.0, 0.0 );   glVertex2f( -ar, -1.0 );
+      glTexCoord2f( 1.0, 0.0 );   glVertex2f(  ar, -1.0 );
+      glTexCoord2f( 1.0, 1.0 );   glVertex2f(  ar,  1.0 );
+      glTexCoord2f( 0.0, 1.0 );   glVertex2f( -ar,  1.0 );
+      glEnd();
+
+      glPopMatrix();
+   }
+}
+
+static void
+Draw(void)
+{
+   glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
+
+   if (Blend) {
+      glEnable(GL_BLEND);
+      glDisable(GL_DEPTH_TEST);
+   }
+   else {
+      glDisable(GL_BLEND);
+      glEnable(GL_DEPTH_TEST);
+   }
+
+   glPushMatrix();
+   glRotatef(Xrot, 1, 0, 0);
+   glRotatef(Yrot, 0, 1, 0);
+   glRotatef(Zrot, 0, 0, 1);
+
+   DrawTextures();
+
+   glPopMatrix();
+
+   glutSwapBuffers();
+}
+
+
+static void
+Reshape(int width, int height)
+{
+   glViewport(0, 0, width, height);
+   glMatrixMode(GL_PROJECTION);
+   glLoadIdentity();
+   glFrustum(-1.0, 1.0, -1.0, 1.0, 5.0, 50.0);
+   glMatrixMode(GL_MODELVIEW);
+   glLoadIdentity();
+   glTranslatef(0.0, 0.0, -10.0);
+}
+
+
+static GLfloat
+RandFloat(float min, float max)
+{
+   float x = (float) (rand() % 1000) * 0.001;
+   x = x * (max - min) + min;
+   return x;
+}
+
+
+static void
+Randomize(void)
+{
+   GLfloat k = 1.0;
+   GLuint i;
+
+   srand(glutGet(GLUT_ELAPSED_TIME));
+
+   for (i = 0; i < NumTextures; i++) {
+      TexRot[i][0] = RandFloat(0.0, 360);
+      TexRot[i][1] = RandFloat(0.0, 360);
+      TexRot[i][2] = RandFloat(0.0, 360);
+      TexPos[i][0] = RandFloat(-k, k);
+      TexPos[i][1] = RandFloat(-k, k);
+      TexPos[i][2] = RandFloat(-k, k);
+   }
+}
+
+
+static void
+SetTexFilters(void)
+{
+   GLuint i;
+   for (i = 0; i < NumTextures; i++) {
+      glBindTexture(GL_TEXTURE_2D, Textures[i]);
+      if (MipMap) {
+         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER,
+                         GL_LINEAR_MIPMAP_LINEAR);
+         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+      }
+      else {
+         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+      }
+   }
+}
+
+/* GLUT keyboard callback: toggle animation/blending/mipmapping, randomize, rotate about Z, ESC quits. */
+static void
+Key(unsigned char key, int x, int y)
+{
+   const GLfloat step = 3.0;   /* Zrot change per key press; Zrot is fed to glRotatef() */
+   (void) x;
+   (void) y;
+   switch (key) {
+   case 'a':
+      Anim = !Anim;
+      if (Anim)
+         glutIdleFunc(Idle);
+      else
+         glutIdleFunc(NULL);
+      break;
+   case 'b':
+      Blend = !Blend;
+      break;
+   case 'm':
+      MipMap = !MipMap;
+      SetTexFilters();
+      break;
+   case 'r':
+      Randomize();
+      break;   /* must not fall through into 'z' and also change Zrot */
+   case 'z':
+      Zrot -= step;
+      break;
+   case 'Z':
+      Zrot += step;
+      break;
+   case 27:   /* ESC */
+      glutDestroyWindow(Win);
+      exit(0);   /* does not return, so no break is needed */
+   }
+
+   printf("Blend=%s MipMap=%s\n",
+          Blend ? "Y" : "n",
+          MipMap ? "Y" : "n");
+
+   glutPostRedisplay();
+}
+
+
+static void
+SpecialKey(int key, int x, int y)
+{
+   const GLfloat step = 3.0;
+   (void) x;
+   (void) y;
+   switch (key) {
+      case GLUT_KEY_UP:
+         Xrot -= step;
+         break;
+      case GLUT_KEY_DOWN:
+         Xrot += step;
+         break;
+      case GLUT_KEY_LEFT:
+         Yrot -= step;
+         break;
+      case GLUT_KEY_RIGHT:
+         Yrot += step;
+         break;
+   }
+   glutPostRedisplay();
+}
+
+/* Load up to MAX_TEXTURES image files as 2D textures and record each image's aspect ratio. */
+static void
+LoadTextures(GLuint n, const char *files[])
+{
+   GLuint i;
+
+   NumTextures = n < MAX_TEXTURES ? n : MAX_TEXTURES;
+   /* Use the clamped count from here on: Textures[] and TexAspect[] hold only MAX_TEXTURES entries. */
+   glGenTextures(NumTextures, Textures);
+
+   for (i = 0; i < NumTextures; i++) {
+      GLint w, h;
+      glBindTexture(GL_TEXTURE_2D, Textures[i]);
+      glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+      glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+      if (!LoadRGBMipmaps2(files[i], GL_TEXTURE_2D, GL_RGB, &w, &h)) {
+         printf("Error: couldn't load %s\n", files[i]);
+         exit(1);
+      }
+      TexAspect[i] = (float) w / (float) h;
+      printf("Loaded %s\n", files[i]);
+   }
+}
+
+
+static void
+Init(int argc, const char *argv[])
+{
+   if (argc == 1)
+      LoadTextures(4, DefaultFiles);
+   else
+      LoadTextures(argc - 1, argv + 1);
+
+   Randomize();
+
+   glEnable(GL_TEXTURE_2D);
+
+   glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
+   glColor4f(1, 1, 1, 0.5);
+
+#if 0
+   /* setup lighting, etc */
+   glEnable(GL_LIGHTING);
+   glEnable(GL_LIGHT0);
+#endif
+}
+
+
+static void
+Usage(void)
+{
+   printf("Usage:\n");
+   printf("  textures [file.rgb] ...\n");
+   printf("Keys:\n");
+   printf("  a - toggle animation\n");
+   printf("  b - toggle blending\n");
+   printf("  m - toggle mipmapping\n");
+   printf("  r - randomize\n");
+   printf("  ESC - exit\n");
+}
+
+
+int
+main(int argc, char *argv[])
+{
+   glutInit(&argc, argv);
+   glutInitWindowPosition(0, 0);
+   glutInitWindowSize(400, 400);
+   glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE | GLUT_DEPTH);
+   Win = glutCreateWindow(argv[0]);
+   glutReshapeFunc(Reshape);
+   glutKeyboardFunc(Key);
+   glutSpecialFunc(SpecialKey);
+   glutDisplayFunc(Draw);
+   if (Anim)
+      glutIdleFunc(Idle);
+   Init(argc, (const char **) argv);
+   Usage();
+   glutMainLoop();
+   return 0;
+}
-- 
cgit v1.2.3


From 583098e3cb602fd9810a7c65718155fd9b0b3fda Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 9 Oct 2008 19:48:53 -0600
Subject: cell: implement basic TXP instruction in fragment shaders

Lots of restrictions for now (one 2D texture, no mipmaps, etc.)
but basic texture demos work.
TEX, TXD, TXP do the same thing for the time being.
---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 109 ++++++++++++++++++++++++-----
 src/gallium/drivers/cell/spu/spu_funcs.c   |  51 ++++++++++++--
 src/gallium/drivers/cell/spu/spu_tri.c     |   2 +-
 3 files changed, 138 insertions(+), 24 deletions(-)

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 5647bb23e6..c8125a8a05 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -226,6 +226,11 @@ get_src_reg(struct codegen *gen,
             spe_lqd(gen->f, reg, gen->constants_reg, offset * 16);
          }
          break;
+      case TGSI_FILE_SAMPLER:
+         {
+            reg = 3; /* XXX total hack */
+         }
+         break;
       default:
          assert(0);
       }
@@ -1162,6 +1167,21 @@ print_functions(struct cell_context *cell)
 #endif
 
 
+static uint
+lookup_function(struct cell_context *cell, const char *funcname)
+{
+   const struct cell_spu_function_info *funcs = &cell->spu_functions;
+   uint i, addr = 0;
+   for (i = 0; i < funcs->num; i++) {
+      if (strcmp(funcs->names[i], funcname) == 0) {
+         addr = funcs->addrs[i];
+      }
+   }
+   assert(addr && "spu function not found");
+   return addr / 4;  /* discard 2 least significant bits */
+}
+
+
 /**
  * Emit code to call a SPU function.
  * Used to implement instructions like SIN/COS/POW/TEX/etc.
@@ -1171,27 +1191,12 @@ emit_function_call(struct codegen *gen,
                    const struct tgsi_full_instruction *inst,
                    char *funcname, uint num_args)
 {
-   const struct cell_spu_function_info *funcs = &gen->cell->spu_functions;
+   const uint addr = lookup_function(gen->cell, funcname);
    char comment[100];
-   uint addr;
    int ch;
 
    assert(num_args <= 3);
 
-   /* lookup function address */
-   {
-      uint i;
-      addr = 0;
-      for (i = 0; i < funcs->num; i++) {
-         if (strcmp(funcs->names[i], funcname) == 0) {
-            addr = funcs->addrs[i];
-         }
-      }
-      assert(addr && "spu function not found");
-   }
-
-   addr /= 4; /* discard 2 least significant bits */
-
    snprintf(comment, sizeof(comment), "CALL %s:", funcname);
    spe_comment(gen->f, -4, comment);
 
@@ -1245,6 +1250,72 @@ emit_function_call(struct codegen *gen,
 }
 
 
+static boolean
+emit_TXP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const uint addr = lookup_function(gen->cell, "spu_txp");
+   int ch;
+   int coord_regs[4], d_regs[4];
+
+   spe_comment(gen->f, -4, "CALL txp:");
+
+   /* get src/dst reg info */
+   for (ch = 0; ch < 4; ch++) {
+      coord_regs[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+      d_regs[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+   }
+
+   {
+      ubyte usedRegs[SPE_NUM_REGS];
+      uint i, numUsed;
+
+      numUsed = spe_get_registers_used(gen->f, usedRegs);
+      assert(numUsed < gen->frame_size / 16 - 32);
+
+      /* save registers to stack */
+      for (i = 0; i < numUsed; i++) {
+         uint reg = usedRegs[i];
+         int offset = 2 + i;
+         spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+      }
+
+      /* setup function arguments */
+      for (i = 0; i < 4; i++) {
+         spe_move(gen->f, 3 + i, coord_regs[i]);
+      }
+
+      /* branch to function, save return addr */
+      spe_brasl(gen->f, SPE_REG_RA, addr);
+
+      /* save function's return values (four pixel's colors) */
+      for (i = 0; i < 4; i++) {
+         spe_move(gen->f, d_regs[i], 3 + i);
+      }
+
+      /* restore registers from stack */
+      for (i = 0; i < numUsed; i++) {
+         uint reg = usedRegs[i];
+         if (reg != d_regs[0] &&
+             reg != d_regs[1] &&
+             reg != d_regs[2] &&
+             reg != d_regs[3]) {
+            int offset = 2 + i;
+            spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+         }
+      }
+   }
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         store_dest_reg(gen, d_regs[ch], ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return TRUE;
+}
+
+
 /**
  * Emit max.  See emit_SGT for comments.
  */
@@ -1483,6 +1554,12 @@ emit_instruction(struct codegen *gen,
       return emit_function_call(gen, inst, "spu_exp2", 1);
    case TGSI_OPCODE_LOGBASE2:
       return emit_function_call(gen, inst, "spu_log2", 1);
+   case TGSI_OPCODE_TEX:
+      /* fall-through for now */
+   case TGSI_OPCODE_TXD:
+      /* fall-through for now */
+   case TGSI_OPCODE_TXP:
+      return emit_TXP(gen, inst);
 
    case TGSI_OPCODE_IF:
       return emit_IF(gen, inst);
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index 1adf9de0e8..c7bcb3de9d 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -38,12 +38,20 @@
 #include <math.h>
 #include <cos14_v.h>
 #include <sin14_v.h>
+#include <transpose_matrix4x4.h>
 
 #include "cell/common.h"
 #include "spu_main.h"
 #include "spu_funcs.h"
 
 
+/** For "return"-ing four vectors */
+struct vec_4x4
+{
+   vector float v[4];
+};
+
+
 static vector float
 spu_cos(vector float x)
 {
@@ -92,16 +100,44 @@ spu_log2(vector float x)
    return spu_mul(v, k);
 }
 
+static struct vec_4x4
+spu_txp(vector float s, vector float t, vector float r, vector float q)
+{
+   const uint unit = 0;
+   struct vec_4x4 colors;
+   vector float coords[4];
+
+   coords[0] = s;
+   coords[1] = t;
+   coords[2] = r;
+   coords[3] = q;
+   _transpose_matrix4x4(coords, coords);
+
+   /* get four texture samples */
+   colors.v[0] = spu.sample_texture[unit](unit, coords[0]);
+   colors.v[1] = spu.sample_texture[unit](unit, coords[1]);
+   colors.v[2] = spu.sample_texture[unit](unit, coords[2]);
+   colors.v[3] = spu.sample_texture[unit](unit, coords[3]);
+
+   _transpose_matrix4x4(colors.v, colors.v);
+   return colors;
+}
+
 
+/**
+ * Add named function to list of "exported" functions that will be
+ * made available to the PPU-hosted code generator.
+ */
 static void
-add_func(struct cell_spu_function_info *spu_functions,
-             const char *name, void *addr)
+export_func(struct cell_spu_function_info *spu_functions,
+            const char *name, void *addr)
 {
    uint n = spu_functions->num;
    ASSERT(strlen(name) < 16);
    strcpy(spu_functions->names[n], name);
    spu_functions->addrs[n] = (uint) addr;
    spu_functions->num++;
+   ASSERT(spu_functions->num <= 16);
 }
 
 
@@ -119,11 +155,12 @@ return_function_info(void)
    ASSERT(sizeof(funcs) == 256); /* must be multiple of 16 bytes */
 
    funcs.num = 0;
-   add_func(&funcs, "spu_cos", &spu_cos);
-   add_func(&funcs, "spu_sin", &spu_sin);
-   add_func(&funcs, "spu_pow", &spu_pow);
-   add_func(&funcs, "spu_exp2", &spu_exp2);
-   add_func(&funcs, "spu_log2", &spu_log2);
+   export_func(&funcs, "spu_cos", &spu_cos);
+   export_func(&funcs, "spu_sin", &spu_sin);
+   export_func(&funcs, "spu_pow", &spu_pow);
+   export_func(&funcs, "spu_exp2", &spu_exp2);
+   export_func(&funcs, "spu_log2", &spu_log2);
+   export_func(&funcs, "spu_txp", &spu_txp);
 
    /* Send the function info back to the PPU / main memory */
    mfc_put((void *) &funcs,  /* src in local store */
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 6039cd80b2..87991c3136 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -286,7 +286,7 @@ emit_quad( int x, int y, mask_t mask)
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
       spu.cur_ztile_status = TILE_STATUS_DIRTY;
 
-      if (spu.texture[0].start) {
+      if (0/*spu.texture[0].start*/) {
          /*
           * Temporary texture mapping path
           * This will go away when fragment programs support TEX inst.
-- 
cgit v1.2.3


From 02aea66b1ad7703f9c46e939eaa2d7aa91073c39 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 9 Oct 2008 19:50:57 -0600
Subject: mesa: toggle colormask values with r/g/b keys in tri-mask-tri.c

Plus misc clean-up.
---
 progs/trivial/tri-mask-tri.c | 123 ++++++++++++++++++++++---------------------
 1 file changed, 63 insertions(+), 60 deletions(-)

diff --git a/progs/trivial/tri-mask-tri.c b/progs/trivial/tri-mask-tri.c
index 38ecd20a73..8333f7ed8a 100644
--- a/progs/trivial/tri-mask-tri.c
+++ b/progs/trivial/tri-mask-tri.c
@@ -28,48 +28,53 @@
 #include <GL/glut.h>
 
 
-#define CI_OFFSET_1 16
-#define CI_OFFSET_2 32
-
 GLint Width = 250, Height = 250;
-
 GLenum doubleBuffer;
+GLint Win;
+GLboolean Rmask = GL_TRUE, Gmask = GL_FALSE, Bmask = GL_TRUE;
+
 
 static void Init(void)
 {
    fprintf(stderr, "GL_RENDERER   = %s\n", (char *) glGetString(GL_RENDERER));
    fprintf(stderr, "GL_VERSION    = %s\n", (char *) glGetString(GL_VERSION));
    fprintf(stderr, "GL_VENDOR     = %s\n", (char *) glGetString(GL_VENDOR));
-
-    glClearColor(0.0, 0.0, 1.0, 0.0);
+   glClearColor(0.0, 0.0, 1.0, 0.0);
 }
 
 static void Reshape(int width, int height)
 {
-
-    glViewport(0, 0, (GLint)width, (GLint)height);
-
-    glMatrixMode(GL_PROJECTION);
-    glLoadIdentity();
-    glOrtho(-1.0, 1.0, -1.0, 1.0, -0.5, 1000.0);
-    glMatrixMode(GL_MODELVIEW);
+   glViewport(0, 0, (GLint)width, (GLint)height);
+   glMatrixMode(GL_PROJECTION);
+   glLoadIdentity();
+   glOrtho(-1.0, 1.0, -1.0, 1.0, -0.5, 1000.0);
+   glMatrixMode(GL_MODELVIEW);
 }
 
 static void Key(unsigned char key, int x, int y)
 {
-
-    switch (key) {
-      case 27:
-	exit(1);
-      default:
-	return;
-    }
-
-    glutPostRedisplay();
+   switch (key) {
+   case 'r':
+      Rmask = !Rmask;
+      break;
+   case 'g':
+      Gmask = !Gmask;
+      break;
+   case 'b':
+      Bmask = !Bmask;
+      break;
+   case 27:
+      glutDestroyWindow(Win);
+      exit(1);
+   default:
+      return;
+   }
+   glutPostRedisplay();
 }
 
 static void Draw(void)
 {
+   printf("ColorMask = %d, %d, %d\n", Rmask, Gmask, Bmask);
    glColorMask(1,1,1,1);
 
    glClear(GL_COLOR_BUFFER_BIT); 
@@ -82,7 +87,7 @@ static void Draw(void)
    glVertex3f(-0.9,  0.0, -30.0);
    glEnd();
 
-   glColorMask(1,0,1,0);
+   glColorMask(Rmask, Gmask, Bmask, 0);
 
    /* left triangle: white&mask: purple   middle region: white */
    glBegin(GL_TRIANGLES);
@@ -103,48 +108,46 @@ static void Draw(void)
 
 static GLenum Args(int argc, char **argv)
 {
-    GLint i;
-
-    doubleBuffer = GL_FALSE;
-
-    for (i = 1; i < argc; i++) {
-        if (strcmp(argv[i], "-sb") == 0) {
-	    doubleBuffer = GL_FALSE;
-	} else if (strcmp(argv[i], "-db") == 0) {
-	    doubleBuffer = GL_TRUE;
-	} else {
-	    fprintf(stderr, "%s (Bad option).\n", argv[i]);
-	    return GL_FALSE;
-	}
-    }
-    return GL_TRUE;
+   GLint i;
+
+   doubleBuffer = GL_FALSE;
+
+   for (i = 1; i < argc; i++) {
+      if (strcmp(argv[i], "-sb") == 0) {
+         doubleBuffer = GL_FALSE;
+      }
+      else if (strcmp(argv[i], "-db") == 0) {
+         doubleBuffer = GL_TRUE;
+      }
+      else {
+         fprintf(stderr, "%s (Bad option).\n", argv[i]);
+         return GL_FALSE;
+      }
+   }
+   return GL_TRUE;
 }
 
+
 int main(int argc, char **argv)
 {
-    GLenum type;
-
-    glutInit(&argc, argv);
+   GLenum type;
 
-    if (Args(argc, argv) == GL_FALSE) {
-	exit(1);
-    }
+   glutInit(&argc, argv);
 
-    glutInitWindowPosition(100, 0); glutInitWindowSize(Width, Height);
-
-    type = GLUT_RGB;
-    type |= (doubleBuffer) ? GLUT_DOUBLE : GLUT_SINGLE;
-    glutInitDisplayMode(type);
-
-    if (glutCreateWindow("First Tri") == GL_FALSE) {
-	exit(1);
-    }
-
-    Init();
+   if (Args(argc, argv) == GL_FALSE) {
+      exit(1);
+   }
 
-    glutReshapeFunc(Reshape);
-    glutKeyboardFunc(Key);
-    glutDisplayFunc(Draw);
-    glutMainLoop();
-	return 0;
+   type = GLUT_RGB;
+   type |= (doubleBuffer) ? GLUT_DOUBLE : GLUT_SINGLE;
+
+   glutInitWindowPosition(100, 0); glutInitWindowSize(Width, Height);
+   glutInitDisplayMode(type);
+   Win = glutCreateWindow("First Tri");
+   Init();
+   glutReshapeFunc(Reshape);
+   glutKeyboardFunc(Key);
+   glutDisplayFunc(Draw);
+   glutMainLoop();
+   return 0;
 }
-- 
cgit v1.2.3


From 7ac1fc77661faf0897507fef0437fe69d0ba53ac Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 9 Oct 2008 19:54:46 -0600
Subject: cell: fix incorrect bitmask in spe_load_uint()

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 9274bc5e3c..cc35f0ba5b 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -727,7 +727,7 @@ void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
     * Bytes Immediate (fsmbi) to load the value in a single instruction.
     * Otherwise, in the general case, we have to use ilhu followed by iohl.
     */
-   if ((ui & 0xfffc0000) == ui) {
+   if ((ui & 0x3ffff) == ui) {
       spe_ila(p, rT, ui);
    }
    else if ((ui >> 16) == (ui & 0xffff)) {
-- 
cgit v1.2.3


From 086a56134f334505ca9cd6f57194280c1ebf44dc Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 10 Oct 2008 08:44:29 -0600
Subject: cell: updates in response to draw's struct vertex_info changes

---
 src/gallium/drivers/cell/spu/spu_tri.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 87991c3136..a62d4f0f2f 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -215,7 +215,7 @@ clip_emit_quad(struct setup_stage *setup)
 static INLINE void
 eval_coeff(uint slot, float x, float y, vector float result[4])
 {
-   switch (spu.vertex_info.interp_mode[slot]) {
+   switch (spu.vertex_info.attrib[slot].interp_mode) {
    case INTERP_CONSTANT:
       result[QUAD_TOP_LEFT] =
       result[QUAD_TOP_RIGHT] =
@@ -776,7 +776,7 @@ static void setup_tri_coefficients(void)
    uint i;
 
    for (i = 0; i < spu.vertex_info.num_attribs; i++) {
-      switch (spu.vertex_info.interp_mode[i]) {
+      switch (spu.vertex_info.attrib[i].interp_mode) {
       case INTERP_NONE:
          break;
       case INTERP_POS:
-- 
cgit v1.2.3


From 5e9cb42aa662022c63563b4bc7f9e1d99f6d81ee Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 10 Oct 2008 09:43:15 -0700
Subject: i965: Add missing intel_pixel_draw.c symlink to fix build.

---
 src/mesa/drivers/dri/i965/intel_pixel_draw.c | 1 +
 1 file changed, 1 insertion(+)
 create mode 120000 src/mesa/drivers/dri/i965/intel_pixel_draw.c

diff --git a/src/mesa/drivers/dri/i965/intel_pixel_draw.c b/src/mesa/drivers/dri/i965/intel_pixel_draw.c
new file mode 120000
index 0000000000..8431a24edf
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/intel_pixel_draw.c
@@ -0,0 +1 @@
+../intel/intel_pixel_draw.c
\ No newline at end of file
-- 
cgit v1.2.3


From dc7d213c54b046ec03ddb1fcfb0d9d9e905ffedc Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 10 Oct 2008 11:52:55 -0600
Subject: cell: fix bug in emit_FRC() when src register == dst register.

With this fix, the glsl/brick demo runs.
---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index c8125a8a05..dad74bbad8 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -1119,7 +1119,7 @@ emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
    int ch;
 
-   spe_comment(gen->f, -4, "FLR:");
+   spe_comment(gen->f, -4, "FRC:");
 
    int zero_reg = get_itemp(gen);
    spe_xor(gen->f, zero_reg, zero_reg, zero_reg);
@@ -1131,18 +1131,18 @@ emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst)
          int tmp_reg = get_itemp(gen);
 
          /* If negative, subtract 1.0 */
-         spe_fcgt(gen->f, d_reg, zero_reg, s1_reg);
-         spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), d_reg);
-         spe_fs(gen->f, d_reg, s1_reg, tmp_reg);
+         spe_fcgt(gen->f, tmp_reg, zero_reg, s1_reg);
+         spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), tmp_reg);
+         spe_fs(gen->f, tmp_reg, s1_reg, tmp_reg);
 
          /* Convert float to int */
-         spe_cflts(gen->f, d_reg, d_reg, 0);
+         spe_cflts(gen->f, tmp_reg, tmp_reg, 0);
 
          /* Convert int to float */
-         spe_csflt(gen->f, d_reg, d_reg, 0);
+         spe_csflt(gen->f, tmp_reg, tmp_reg, 0);
 
          /* d = s1 - FLR(s1) */
-         spe_fs(gen->f, d_reg, s1_reg, d_reg);
+         spe_fs(gen->f, d_reg, s1_reg, tmp_reg);
 
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
-- 
cgit v1.2.3


From e43af05311acd979f43a75f8ba4d9152b453408e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 10 Oct 2008 11:56:03 -0600
Subject: cell: fix bug in emit_FLR() when src reg == dst reg

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index dad74bbad8..7a4e8d20ba 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -1092,15 +1092,15 @@ emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst)
          int tmp_reg = get_itemp(gen);
 
          /* If negative, subtract 1.0 */
-         spe_fcgt(gen->f, d_reg, zero_reg, s1_reg);
-         spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), d_reg);
-         spe_fs(gen->f, d_reg, s1_reg, tmp_reg);
+         spe_fcgt(gen->f, tmp_reg, zero_reg, s1_reg);
+         spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), tmp_reg);
+         spe_fs(gen->f, tmp_reg, s1_reg, tmp_reg);
 
          /* Convert float to int */
-         spe_cflts(gen->f, d_reg, d_reg, 0);
+         spe_cflts(gen->f, tmp_reg, tmp_reg, 0);
 
          /* Convert int to float */
-         spe_csflt(gen->f, d_reg, d_reg, 0);
+         spe_csflt(gen->f, d_reg, tmp_reg, 0);
 
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
@@ -1111,8 +1111,7 @@ emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst)
 }
 
 /**
- * Emit frac.  
- * Input - FLR(Input)
+ * Compute frac = Input - FLR(Input)
  */
 static boolean
 emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst)
-- 
cgit v1.2.3


From a45d293fd9a1432404a7e26f97cb20b2a0c43654 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 10 Oct 2008 12:04:19 -0600
Subject: cell: fix fm/fs copy & paste bug from a few commits ago

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 7a4e8d20ba..ab71336754 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -474,7 +474,7 @@ emit_SUB(struct codegen *gen, const struct tgsi_full_instruction *inst)
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
          /* d = s1 - s2 */
-         spe_fm(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
+         spe_fs(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
          store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
       }
-- 
cgit v1.2.3


From b3a68b24bd601a4fcffb701bbd73864ed92a05e1 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 10 Oct 2008 12:04:49 -0600
Subject: replace 1.0/sqrt() with inversesqrt()

---
 progs/glsl/CH11-bumpmap.frag | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/progs/glsl/CH11-bumpmap.frag b/progs/glsl/CH11-bumpmap.frag
index 063576f5a3..e12c5d374c 100644
--- a/progs/glsl/CH11-bumpmap.frag
+++ b/progs/glsl/CH11-bumpmap.frag
@@ -24,7 +24,7 @@ void main()
 
     float d, f;
     d = p.x * p.x + p.y * p.y;
-    f = 1.0 / sqrt(d + 1.0);
+    f = inversesqrt(d + 1.0);
 
     if (d >= BumpSize)
         { p = vec2(0.0); f = 1.0; }
-- 
cgit v1.2.3


From a13f61d34d40475a6f12fb8696b6e7d58aaa78b7 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 10 Oct 2008 12:24:39 -0600
Subject: cell: fix LERP when dst reg is a src reg

Also, bump up frame size and fix some assertions.
---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index ab71336754..db54c7e57b 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -369,7 +369,7 @@ store_dest_reg(struct codegen *gen,
 static void
 emit_prologue(struct codegen *gen)
 {
-   gen->frame_size = 256+128; /* XXX temporary */
+   gen->frame_size = 1024; /* XXX temporary */
 
    spe_comment(gen->f, -4, "Function prologue:");
 
@@ -517,6 +517,7 @@ static boolean
 emit_LERP(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
    int ch;
+   int tmp_reg = get_itemp(gen);
    spe_comment(gen->f, -4, "LERP:");
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
@@ -524,9 +525,10 @@ emit_LERP(struct codegen *gen, const struct tgsi_full_instruction *inst)
          int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
          int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
          int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+
          /* d = s3 + s1(s2 - s3) */
-         spe_fs(gen->f, d_reg, s2_reg, s3_reg);
-         spe_fma(gen->f, d_reg, d_reg, s1_reg, s3_reg);
+         spe_fs(gen->f, tmp_reg, s2_reg, s3_reg);
+         spe_fma(gen->f, d_reg, tmp_reg, s1_reg, s3_reg);
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
       }
@@ -1211,7 +1213,7 @@ emit_function_call(struct codegen *gen,
          d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
 
          numUsed = spe_get_registers_used(gen->f, usedRegs);
-         assert(numUsed < gen->frame_size / 16 - 32);
+         assert(numUsed < gen->frame_size / 16 - 2);
 
          /* save registers to stack */
          for (i = 0; i < numUsed; i++) {
@@ -1269,7 +1271,7 @@ emit_TXP(struct codegen *gen, const struct tgsi_full_instruction *inst)
       uint i, numUsed;
 
       numUsed = spe_get_registers_used(gen->f, usedRegs);
-      assert(numUsed < gen->frame_size / 16 - 32);
+      assert(numUsed < gen->frame_size / 16 - 2);
 
       /* save registers to stack */
       for (i = 0; i < numUsed; i++) {
-- 
cgit v1.2.3


From 02931db3117cd064175a07412b860e8051d9ed58 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 10 Oct 2008 12:38:27 -0600
Subject: cell: call cell_flush_int() at end of cell_create_context()

Ensures that SPUs are initialized/ready before proceeding.
This fixes a spurious assertion failure when the SPU-side shader function info
hasn't been returned to the PPU before shader codegen.
---
 src/gallium/drivers/cell/ppu/cell_context.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index 30ce6f9762..35cd6874a2 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -162,5 +162,8 @@ cell_create_context(struct pipe_screen *screen,
 
    cell_init_batch_buffers(cell);
 
+   /* make sure SPU initializations are done before proceeding */
+   cell_flush_int(cell, CELL_FLUSH_WAIT);
+
    return &cell->pipe;
 }
-- 
cgit v1.2.3


From e7002694418cd0decb1cd0d9121f634480e5f0d6 Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Fri, 10 Oct 2008 11:47:43 -0700
Subject: intel: GLSL 1.20 is broken in Mesa, so disable it in the i965 driver

---
 src/mesa/drivers/dri/intel/intel_context.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/mesa/drivers/dri/intel/intel_context.c b/src/mesa/drivers/dri/intel/intel_context.c
index e6c0d3175e..2b3a9b9d37 100644
--- a/src/mesa/drivers/dri/intel/intel_context.c
+++ b/src/mesa/drivers/dri/intel/intel_context.c
@@ -398,7 +398,11 @@ static const struct dri_extension brw_extensions[] = {
    { "GL_ARB_point_sprite", 		  NULL },
    { "GL_ARB_shader_objects",             GL_ARB_shader_objects_functions },
    { "GL_ARB_shading_language_100",       GL_VERSION_2_0_functions },
+#if 0
+   /* Support for GLSL 1.20 is currently broken in core Mesa.
+    */
    { "GL_ARB_shading_language_120",       GL_VERSION_2_1_functions },
+#endif
    { "GL_ARB_shadow",                     NULL },
    { "GL_ARB_texture_non_power_of_two",   NULL },
    { "GL_ARB_vertex_shader",              GL_ARB_vertex_shader_functions },
-- 
cgit v1.2.3


From adeed0f90fdd46ea139d5c4b3b75d5dc79b2a0c7 Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Fri, 10 Oct 2008 14:13:13 -0600
Subject: CELL: fixing stencil bugs

These are the defects found and fixed so far.  Several more have
been observed; I'm working on them.

- Fixed an error in spe_load_uint() that caused incorrect values to be
  loaded if the given unsigned value had the low 18 bits as 0,
  and that caused inefficient code to be emitted if the given value
  had the high 14 bits as 0.

- Fixed a problem in stencil code generation where optional registers
  weren't tracked correctly.

- Fixed a problem that the stencil function NEVER was acting as ALWAYS.

- Fixed several problems that could occur if stenciling were enabled but
  depth was disabled.

- Fixed a problem with two-sided stencil writemask handling that could
  cause a stencil writemask to not be applied.

- Fixed several state permutations that were incorrectly flagged as
  not requiring stencil values to be calculated.
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c      |  4 +-
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 88 +++++++++++++++++++-----
 2 files changed, 72 insertions(+), 20 deletions(-)

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index cc35f0ba5b..9bf3b9bf0c 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -727,7 +727,7 @@ void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
     * Bytes Immediate (fsmbi) to load the value in a single instruction.
     * Otherwise, in the general case, we have to use ilhu followed by iohl.
     */
-   if ((ui & 0x3ffff) == ui) {
+   if ((ui & 0x0003ffff) == ui) {
       spe_ila(p, rT, ui);
    }
    else if ((ui >> 16) == (ui & 0xffff)) {
@@ -764,7 +764,7 @@ void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
 }
 
 /**
- * This function is constructed identically to spe_sor_uint() below.
+ * This function is constructed identically to spe_xor_uint() below.
  * Changes to one should be made in the other.
  */
 void
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index de170d1036..4e1e53ecdc 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -247,6 +247,7 @@ setup_optional_register(struct spe_function *f, boolean *is_already_set, unsigne
 {
    if (*is_already_set) return;
    *r = spe_allocate_available_register(f);
+   *is_already_set = true;
 }
 
 static inline void
@@ -1157,7 +1158,6 @@ gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state,
       /* stencil_pass = mask & (s == reference) */
       spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
       spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
-      /* stencil_fail = mask & ~stencil_pass */
       break;
 
    case PIPE_FUNC_NOTEQUAL:
@@ -1207,7 +1207,6 @@ gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state,
    case PIPE_FUNC_NEVER:
       /* stencil_pass = mask & 0 = 0 */
       spe_load_uint(f, stencil_pass_reg, 0);
-      spe_move(f, stencil_pass_reg, mask_reg);  /* zmask = mask */
       break;
 
    case PIPE_FUNC_ALWAYS:
@@ -1483,6 +1482,10 @@ gen_get_stencil_values(struct spe_function *f, const struct pipe_depth_stencil_a
    } /* End of calculations for back-facing stencil */
 }
 
+/* Note that fbZ_reg may *not* be set on entry, if in fact
+ * the depth test is not enabled.  This function must not use
+ * the register if depth is not enabled.
+ */
 static boolean
 gen_stencil_depth_test(struct spe_function *f, 
                        const struct pipe_depth_stencil_alpha_state *dsa, 
@@ -1522,6 +1525,7 @@ gen_stencil_depth_test(struct spe_function *f,
     * have to spend the complexity to track the more difficult variant
     * register usage scenarios.
     */
+   spe_comment(f, 0, "Allocating stencil register set");
    spe_allocate_register_set(f);
 
    /* Calculate the writemask.  If the writemask is trivial (either
@@ -1538,7 +1542,7 @@ gen_stencil_depth_test(struct spe_function *f,
       need_to_calculate_stencil_values = false;
       need_to_writemask_stencil_values = false;
    }
-   else if (dsa->stencil[0].write_mask == 0xff && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0x00)) {
+   else if (dsa->stencil[0].write_mask == 0xff && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0xff)) {
       /* Still trivial, but a little less so.  We need to write the stencil
        * values, but we don't need to mask them.
        */
@@ -1556,10 +1560,12 @@ gen_stencil_depth_test(struct spe_function *f,
        * writemask, we'll have to generate code that merges the
        * two masks into a single effective mask based on fragment facing.
        */
+      spe_comment(f, 0, "Computing stencil writemask");
       stencil_writemask_reg = spe_allocate_available_register(f);
       spe_load_uint(f, stencil_writemask_reg, dsa->stencil[0].write_mask);
       if (dsa->stencil[1].enabled && dsa->stencil[0].write_mask != dsa->stencil[1].write_mask) {
          unsigned int back_write_mask_reg = spe_allocate_available_register(f);
+         spe_comment(f, 0, "Resolving two-sided stencil writemask");
          spe_load_uint(f, back_write_mask_reg, dsa->stencil[1].write_mask);
          spe_selb(f, stencil_writemask_reg, stencil_writemask_reg, back_write_mask_reg, facing_reg);
          spe_release_register(f, back_write_mask_reg);
@@ -1575,6 +1581,7 @@ gen_stencil_depth_test(struct spe_function *f,
     * This test will *not* change the value in mask_reg (because we don't
     * yet know whether to apply the two-sided stencil or one-sided stencil).
     */
+   spe_comment(f, 0, "Running basic stencil test");
    stencil_pass_reg = spe_allocate_available_register(f);
    gen_stencil_test(f, &dsa->stencil[0], mask_reg, fbS_reg, stencil_pass_reg);
 
@@ -1584,6 +1591,7 @@ gen_stencil_depth_test(struct spe_function *f,
     */
    if (dsa->stencil[1].enabled) {
       unsigned int temp_reg = spe_allocate_available_register(f);
+      spe_comment(f, 0, "Running backface stencil test");
       gen_stencil_test(f, &dsa->stencil[1], mask_reg, fbS_reg, temp_reg);
       spe_selb(f, stencil_pass_reg, stencil_pass_reg, temp_reg, facing_reg);
       spe_release_register(f, temp_reg);
@@ -1597,6 +1605,7 @@ gen_stencil_depth_test(struct spe_function *f,
     * stencil test, and because the depth test will update the 
     * mask of valid fragments based on the results of the depth test).
     */
+   spe_comment(f, 0, "Computing stencil fail mask and updating fragment mask");
    stencil_fail_reg = spe_allocate_available_register(f);
    spe_andc(f, stencil_fail_reg, mask_reg, stencil_pass_reg);
    /* Now remove the stenciled-out pixels from the valid fragment mask,
@@ -1623,6 +1632,7 @@ gen_stencil_depth_test(struct spe_function *f,
        * This function will allocate a variant number of registers that
        * will be released as part of the register set.
        */
+      spe_comment(f, 0, "Computing stencil values");
       gen_get_stencil_values(f, dsa, fbS_reg, 
          &front_stencil_fail_values, &front_stencil_pass_depth_fail_values, 
          &front_stencil_pass_depth_pass_values, &back_stencil_fail_values, 
@@ -1652,6 +1662,7 @@ gen_stencil_depth_test(struct spe_function *f,
          stencil_pass_depth_pass_values = front_stencil_pass_depth_pass_values;
       }
       else { /* two-sided stencil enabled */
+         spe_comment(f, 0, "Resolving backface stencil values");
          /* Allocate new registers for the needed merged values */
          stencil_fail_values = spe_allocate_available_register(f);
          spe_selb(f, stencil_fail_values, front_stencil_fail_values, back_stencil_fail_values, facing_reg);
@@ -1678,11 +1689,13 @@ gen_stencil_depth_test(struct spe_function *f,
     * on the results of the test.
     */
    if (dsa->depth.enabled) {
+      spe_comment(f, 0, "Running stencil depth test");
       zmask_reg = spe_allocate_available_register(f);
       modified_buffers |= gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
    }
 
    if (need_to_calculate_stencil_values) {
+
       /* If we need to writemask the stencil values before going into
        * the stencil buffer, we'll have to use a new register to
        * hold the new values.  If not, we can just keep using the
@@ -1690,8 +1703,8 @@ gen_stencil_depth_test(struct spe_function *f,
        */
       if (need_to_writemask_stencil_values) {
          newS_reg = spe_allocate_available_register(f);
+         spe_comment(f, 0, "Saving current stencil values for writemasking");
          spe_move(f, newS_reg, fbS_reg);
-         modified_buffers = true;
       }
       else {
          newS_reg = fbS_reg;
@@ -1699,7 +1712,9 @@ gen_stencil_depth_test(struct spe_function *f,
 
       /* Merge in the selected stencil fail values */
       if (stencil_fail_values != fbS_reg) {
+         spe_comment(f, 0, "Loading stencil fail values");
          spe_selb(f, newS_reg, newS_reg, stencil_fail_values, stencil_fail_reg);
+         modified_buffers = true;
       }
 
       /* Same for the stencil pass/depth fail values.  If this calculation
@@ -1714,20 +1729,36 @@ gen_stencil_depth_test(struct spe_function *f,
           * set above if we're here.
           */
          unsigned int stencil_pass_depth_fail_mask = spe_allocate_available_register(f);
+         spe_comment(f, 0, "Loading stencil pass/depth fail values");
          spe_andc(f, stencil_pass_depth_fail_mask, stencil_pass_reg, zmask_reg);
 
          spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_fail_values, stencil_pass_depth_fail_mask);
 
          spe_release_register(f, stencil_pass_depth_fail_mask);
+         modified_buffers = true;
       }
 
-      /* Same for the stencil pass/depth pass mask */
+      /* Same for the stencil pass/depth pass mask.  Note that we
+       * *can* get here with zmask_reg being unset (if the depth
+       * test is off but the stencil test is on).  In this case,
+       * we assume the depth test passes, and don't need to mask
+       * the stencil pass mask with the Z mask.
+       */
       if (stencil_pass_depth_pass_values != fbS_reg) {
-         unsigned int stencil_pass_depth_pass_mask = spe_allocate_available_register(f);
-         spe_and(f, stencil_pass_depth_pass_mask, stencil_pass_reg, zmask_reg);
-
-         spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_depth_pass_mask);
-         spe_release_register(f, stencil_pass_depth_pass_mask);
+         if (dsa->depth.enabled) {
+            unsigned int stencil_pass_depth_pass_mask = spe_allocate_available_register(f);
+            /* We'll need a separate register */
+            spe_comment(f, 0, "Loading stencil pass/depth pass values");
+            spe_and(f, stencil_pass_depth_pass_mask, stencil_pass_reg, zmask_reg);
+            spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_depth_pass_mask);
+            spe_release_register(f, stencil_pass_depth_pass_mask);
+         }
+         else {
+            /* We can use the same stencil-pass register */
+            spe_comment(f, 0, "Loading stencil pass values");
+            spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_reg);
+         }
+         modified_buffers = true;
       }
 
       /* Almost done.  If we need to writemask, do it now, leaving the
@@ -1736,14 +1767,16 @@ gen_stencil_depth_test(struct spe_function *f,
        * so there's nothing more to do.
        */
 
-      if (need_to_writemask_stencil_values) {
+      if (need_to_writemask_stencil_values && modified_buffers) {
          /* The Select Bytes command makes a fine writemask.  Where
           * the mask is 0, the first (original) values are retained,
           * effectively masking out changes.  Where the mask is 1, the
           * second (new) values are retained, incorporating changes.
           */
+         spe_comment(f, 0, "Writemasking new stencil values");
          spe_selb(f, fbS_reg, fbS_reg, newS_reg, stencil_writemask_reg);
       }
+
    } /* done calculating stencil values */
 
    /* The stencil and/or depth values have been applied, and the
@@ -1752,6 +1785,7 @@ gen_stencil_depth_test(struct spe_function *f,
     * of registers that we didn't bother tracking.  Release all
     * those registers as part of the register set, and go home.
     */
+   spe_comment(f, 0, "Releasing stencil register set");
    spe_release_register_set(f);
 
    /* Return true if we could have modified the stencil and/or
@@ -1869,7 +1903,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       boolean fbS_reg_set = false, fbZ_reg_set = false;
       unsigned int fbS_reg, fbZ_reg = 0;
 
-      spe_comment(f, 0, "Fetch quad's Z/stencil values from tile");
+      spe_comment(f, 0, "Fetching Z/stencil quad from tile");
 
       /* fetch quad of depth/stencil values from tile at (x,y) */
       /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
@@ -1973,13 +2007,18 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
           * tests.
           */
          ASSERT(fbS_reg_set);
-         ASSERT(fbZ_reg_set);
          spe_comment(f, 0, "Perform stencil test");
 
+         /* Note that fbZ_reg may not be set on entry, if stenciling
+          * is enabled but there's no Z-buffer.  The 
+          * gen_stencil_depth_test() function must ignore the
+          * fbZ_reg register if depth is not enabled.
+          */
          write_depth_stencil = gen_stencil_depth_test(f, dsa, facing_reg, mask_reg, fragZ_reg, fbZ_reg, fbS_reg);
       }
       else if (dsa->depth.enabled) {
          int zmask_reg = spe_allocate_available_register(f);
+         ASSERT(fbZ_reg_set);
          spe_comment(f, 0, "Perform depth test");
          write_depth_stencil = gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
          spe_release_register(f, zmask_reg);
@@ -1996,26 +2035,39 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
          spe_comment(f, 0, "Store quad's depth/stencil values in tile");
          if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
              zs_format == PIPE_FORMAT_X8Z24_UNORM) {
-            if (fbS_reg_set) {
+            if (fbS_reg_set && fbZ_reg_set) {
                spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
                spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
             }
+            else if (fbS_reg_set) {
+               spe_shli(f, fbZS_reg, fbS_reg, 24); /* fbZS = fbS << 24 */
+            }
             else {
                spe_move(f, fbZS_reg, fbZ_reg);
             }
          }
          else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
                   zs_format == PIPE_FORMAT_Z24X8_UNORM) {
-            spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
-            if (fbS_reg_set) {
+            if (fbS_reg_set && fbZ_reg_set) {
+               spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
                spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
             }
+            else if (fbS_reg_set) {
+               spe_move(f, fbZS_reg, fbS_reg);
+            }
+            else {
+               spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
+            }
          }
          else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
-            spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
+            if (fbZ_reg_set) {
+               spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
+            }
          }
          else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
-            spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
+            if (fbZ_reg_set) {
+               spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
+            }
          }
          else if (zs_format == PIPE_FORMAT_S8_UNORM) {
             ASSERT(0);   /* XXX to do */
-- 
cgit v1.2.3


From 53ae243869a9e1ff0f2b1c559ec51adff867b970 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 10 Oct 2008 14:34:43 -0600
Subject: cell: fix function prologue/epilogue code for large stack frames

The ai instruction is limited to a 10-bit signed immediate value.
---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 44 +++++++++++++++++++++++++-----
 1 file changed, 37 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index db54c7e57b..3d0e7976df 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -369,18 +369,36 @@ store_dest_reg(struct codegen *gen,
 static void
 emit_prologue(struct codegen *gen)
 {
-   gen->frame_size = 1024; /* XXX temporary */
+   gen->frame_size = 1024; /* XXX temporary, should be dynamic */
 
    spe_comment(gen->f, -4, "Function prologue:");
 
    /* save $lr on stack     # stqd $lr,16($sp) */
    spe_stqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
 
-   /* save stack pointer    # stqd $sp,-frameSize($sp) */
-   spe_stqd(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size);
+   if (gen->frame_size >= 512) {
+      /* offset is too large for ai instruction */
+      int offset_reg = spe_allocate_available_register(gen->f);
+      int sp_reg = spe_allocate_available_register(gen->f);
+      /* offset = -framesize */
+      spe_load_int(gen->f, offset_reg, -gen->frame_size);
+      /* sp = $sp */
+      spe_move(gen->f, sp_reg, SPE_REG_SP);
+      /* $sp = $sp + offset_reg */
+      spe_a(gen->f, SPE_REG_SP, SPE_REG_SP, offset_reg);
+      /* save $sp in stack frame */
+      spe_stqd(gen->f, sp_reg, SPE_REG_SP, 0);
+      /* clean up */
+      spe_release_register(gen->f, offset_reg);
+      spe_release_register(gen->f, sp_reg);
+   }
+   else {
+      /* save stack pointer    # stqd $sp,-frameSize($sp) */
+      spe_stqd(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size);
 
-   /* adjust stack pointer  # ai $sp,$sp,-frameSize */
-   spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size);
+      /* adjust stack pointer  # ai $sp,$sp,-frameSize */
+      spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size);
+   }
 }
 
 
@@ -389,8 +407,20 @@ emit_epilogue(struct codegen *gen)
 {
    spe_comment(gen->f, -4, "Function epilogue:");
 
-   /* restore stack pointer    # ai $sp,$sp,frameSize */
-   spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, gen->frame_size);
+   if (gen->frame_size >= 512) {
+      /* offset is too large for ai instruction */
+      int offset_reg = spe_allocate_available_register(gen->f);
+      /* offset = framesize */
+      spe_load_int(gen->f, offset_reg, gen->frame_size);
+      /* $sp = $sp + offset */
+      spe_a(gen->f, SPE_REG_SP, SPE_REG_SP, offset_reg);
+      /* clean up */
+      spe_release_register(gen->f, offset_reg);
+   }
+   else {
+      /* restore stack pointer    # ai $sp,$sp,frameSize */
+      spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, gen->frame_size);
+   }
 
    /* restore $lr              # lqd $lr,16($sp) */
    spe_lqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
-- 
cgit v1.2.3


From 78c67a726fff052abeb03417283504a5dd521665 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 10 Oct 2008 14:35:56 -0600
Subject: cell: fix assertions in spe_lqd(), spe_stqd()

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 9bf3b9bf0c..5b0f6bdd48 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -559,7 +559,7 @@ void spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
    const boolean pSave = p->print;
 
    p->print = FALSE;
-   assert(offset % 4 == 0);
+   assert(offset % 16 == 0);
    emit_RI10(p, 0x034, rT, rA, offset >> 4, "spe_lqd");
    p->print = pSave;
 
@@ -579,7 +579,7 @@ void spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
    const boolean pSave = p->print;
 
    p->print = FALSE;
-   assert(offset % 4 == 0);
+   assert(offset % 16 == 0);
    emit_RI10(p, 0x024, rT, rA, offset >> 4, "spe_stqd");
    p->print = pSave;
 
-- 
cgit v1.2.3


From f42ef6f39d213b4c6315ba95791c16ca2b1a4b21 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 10 Oct 2008 14:44:52 -0600
Subject: cell: additional 'offset' checking in spe_lqd(), spe_stqd()

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 5b0f6bdd48..d0bacd08a6 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -552,14 +552,19 @@ spe_comment(struct spe_function *p, int rel_indent, const char *s)
 
 /**
  * Load quad word.
- * NOTE: imm is in bytes and the least significant 4 bits must be zero!
+ * NOTE: offset is in bytes and the least significant 4 bits must be zero!
  */
 void spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
 {
    const boolean pSave = p->print;
 
-   p->print = FALSE;
+   /* offset must be a multiple of 16 */
    assert(offset % 16 == 0);
+   /* offset must fit in 10-bit signed int field, after shifting */
+   assert((offset >> 4) <= 511);
+   assert((offset >> 4) >= -512);
+
+   p->print = FALSE;
    emit_RI10(p, 0x034, rT, rA, offset >> 4, "spe_lqd");
    p->print = pSave;
 
@@ -572,14 +577,19 @@ void spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
 
 /**
  * Store quad word.
- * NOTE: imm is in bytes and the least significant 4 bits must be zero!
+ * NOTE: offset is in bytes and the least significant 4 bits must be zero!
  */
 void spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
 {
    const boolean pSave = p->print;
 
-   p->print = FALSE;
+   /* offset must be a multiple of 16 */
    assert(offset % 16 == 0);
+   /* offset must fit in 10-bit signed int field, after shifting */
+   assert((offset >> 4) <= 511);
+   assert((offset >> 4) >= -512);
+
+   p->print = FALSE;
    emit_RI10(p, 0x024, rT, rA, offset >> 4, "spe_stqd");
    p->print = pSave;
 
-- 
cgit v1.2.3


From d3403b5482ee1c0faa0f42b8782ee3093a2f7b5e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 10 Oct 2008 14:57:57 -0600
Subject: cell: add emit_RI10s() which does range checking on the 10-bit signed
 immediate field

This type of checking should be expanded to cover more instructions...
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 16 ++++++++++++++++
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h | 24 ++++++++++++++----------
 2 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index d0bacd08a6..dea1aed032 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -278,6 +278,16 @@ static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
 }
 
 
+/** As above, but do range checking on signed immediate value */
+static void emit_RI10s(struct spe_function *p, unsigned op, unsigned rT,
+                       unsigned rA, int imm, const char *name)
+{
+    assert(imm <= 511);
+    assert(imm >= -512);
+    emit_RI10(p, op, rT, rA, imm, name);
+}
+
+
 static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
 		      int imm, const char *name)
 {
@@ -354,6 +364,12 @@ void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
    emit_RI10(p, _op, rT, rA, imm, __FUNCTION__);             \
 }
 
+#define EMIT_RI10s(_name, _op) \
+void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
+{ \
+   emit_RI10s(p, _op, rT, rA, imm, __FUNCTION__);             \
+}
+
 #define EMIT_RI16(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, int imm) \
 { \
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 47dadb343c..d6a3c02f20 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -119,6 +119,9 @@ extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
 #define EMIT_RI10(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
 			   int imm)
+#define EMIT_RI10s(_name, _op) \
+    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
+			   int imm)
 #define EMIT_RI16(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT, int imm)
 #define EMIT_RI18(_name, _op) \
@@ -163,7 +166,7 @@ EMIT_RI16(spe_fsmbi, 0x065);
 EMIT_RR  (spe_ah,      0x0c8);
 EMIT_RI10(spe_ahi,     0x01d);
 EMIT_RR  (spe_a,       0x0c0);
-EMIT_RI10(spe_ai,      0x01c);
+EMIT_RI10s(spe_ai,      0x01c);
 EMIT_RR  (spe_sfh,     0x048);
 EMIT_RI10(spe_sfhi,    0x00d);
 EMIT_RR  (spe_sf,      0x040);
@@ -201,19 +204,19 @@ EMIT_R   (spe_xshw,    0x2ae);
 EMIT_R   (spe_xswd,    0x2a6);
 EMIT_RR  (spe_and,     0x0c1);
 EMIT_RR  (spe_andc,    0x2c1);
-EMIT_RI10(spe_andbi,   0x016);
-EMIT_RI10(spe_andhi,   0x015);
-EMIT_RI10(spe_andi,    0x014);
+EMIT_RI10s(spe_andbi,   0x016);
+EMIT_RI10s(spe_andhi,   0x015);
+EMIT_RI10s(spe_andi,    0x014);
 EMIT_RR  (spe_or,      0x041);
 EMIT_RR  (spe_orc,     0x2c9);
-EMIT_RI10(spe_orbi,    0x006);
-EMIT_RI10(spe_orhi,    0x005);
-EMIT_RI10(spe_ori,     0x004);
+EMIT_RI10s(spe_orbi,    0x006);
+EMIT_RI10s(spe_orhi,    0x005);
+EMIT_RI10s(spe_ori,     0x004);
 EMIT_R   (spe_orx,     0x1f0);
 EMIT_RR  (spe_xor,     0x241);
-EMIT_RI10(spe_xorbi,   0x026);
-EMIT_RI10(spe_xorhi,   0x025);
-EMIT_RI10(spe_xori,    0x024);
+EMIT_RI10s(spe_xorbi,   0x026);
+EMIT_RI10s(spe_xorhi,   0x025);
+EMIT_RI10s(spe_xori,    0x024);
 EMIT_RR  (spe_nand,    0x0c9);
 EMIT_RR  (spe_nor,     0x049);
 EMIT_RR  (spe_eqv,     0x249);
@@ -422,6 +425,7 @@ EMIT_R   (spe_wrch,       0x10d);
 #undef EMIT_RI7
 #undef EMIT_RI8
 #undef EMIT_RI10
+#undef EMIT_RI10s
 #undef EMIT_RI16
 #undef EMIT_RI18
 #undef EMIT_I16
-- 
cgit v1.2.3


From f63594bfef883fa9e15ab7f3f69affe4901353aa Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 10 Oct 2008 16:21:52 -0600
Subject: mesa: remove unneeded includes

---
 src/mesa/main/api_loopback.c | 1 -
 src/mesa/main/dlist.c        | 1 -
 2 files changed, 2 deletions(-)

diff --git a/src/mesa/main/api_loopback.c b/src/mesa/main/api_loopback.c
index 924d7134a2..0e3f5ff957 100644
--- a/src/mesa/main/api_loopback.c
+++ b/src/mesa/main/api_loopback.c
@@ -31,7 +31,6 @@
 
 #include "glheader.h"
 #include "macros.h"
-#include "colormac.h"
 #include "api_loopback.h"
 #include "mtypes.h"
 #include "glapi/glapi.h"
diff --git a/src/mesa/main/dlist.c b/src/mesa/main/dlist.c
index f7660930a9..c7db435506 100644
--- a/src/mesa/main/dlist.c
+++ b/src/mesa/main/dlist.c
@@ -41,7 +41,6 @@
 #endif
 #include "arrayobj.h"
 #include "clip.h"
-#include "colormac.h"
 #include "colortab.h"
 #include "context.h"
 #include "convolve.h"
-- 
cgit v1.2.3


From 3210a6d6c7e3d5660d31f6c736e0d0f7e34bcf6d Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 10 Oct 2008 16:23:31 -0600
Subject: mesa: rename macro params to emphasize that there's no particular
 color ordering

---
 src/mesa/main/colormac.h | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/mesa/main/colormac.h b/src/mesa/main/colormac.h
index a19521fc85..a34bd2ed38 100644
--- a/src/mesa/main/colormac.h
+++ b/src/mesa/main/colormac.h
@@ -1,8 +1,8 @@
 /*
  * Mesa 3-D graphics library
- * Version:  6.1
+ * Version:  7.3
  *
- * Copyright (C) 1999-2004  Brian Paul   All Rights Reserved.
+ * Copyright (C) 1999-2008  Brian Paul   All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -180,20 +180,20 @@ do {						\
  */
 /*@{*/
 
-#define PACK_COLOR_8888( R, G, B, A )					\
-   (((R) << 24) | ((G) << 16) | ((B) << 8) | (A))
+#define PACK_COLOR_8888( X, Y, Z, W ) \
+   (((X) << 24) | ((Y) << 16) | ((Z) << 8) | (W))
 
-#define PACK_COLOR_8888_REV( R, G, B, A )				\
-   (((A) << 24) | ((B) << 16) | ((G) << 8) | (R))
+#define PACK_COLOR_8888_REV( X, Y, Z, W ) \
+   (((W) << 24) | ((Z) << 16) | ((Y) << 8) | (X))
 
-#define PACK_COLOR_888( R, G, B )					\
-   (((R) << 16) | ((G) << 8) | (B))
+#define PACK_COLOR_888( X, Y, Z ) \
+   (((X) << 16) | ((Y) << 8) | (Z))
 
-#define PACK_COLOR_565( R, G, B )					\
-   ((((R) & 0xf8) << 8) | (((G) & 0xfc) << 3) | (((B) & 0xf8) >> 3))
+#define PACK_COLOR_565( X, Y, Z )                                  \
+   ((((X) & 0xf8) << 8) | (((Y) & 0xfc) << 3) | (((Z) & 0xf8) >> 3))
 
-#define PACK_COLOR_565_REV( R, G, B )					\
-   (((R) & 0xf8) | ((G) & 0xe0) >> 5 | (((G) & 0x1c) << 11) | (((B) & 0xf8) << 5))
+#define PACK_COLOR_565_REV( X, Y, Z ) \
+   (((X) & 0xf8) | ((Y) & 0xe0) >> 5 | (((Y) & 0x1c) << 11) | (((Z) & 0xf8) << 5))
 
 #define PACK_COLOR_1555( A, B, G, R )					\
    ((((B) & 0xf8) << 7) | (((G) & 0xf8) << 2) | (((R) & 0xf8) >> 3) |	\
-- 
cgit v1.2.3


From 85a3bf6dabc8b2d545dab078516fdfee9c4cd792 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 10 Oct 2008 16:26:15 -0600
Subject: mesa: fix error codes in _mesa_GetObjectParameterivARB(), bug 17861

---
 src/mesa/main/shaders.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/mesa/main/shaders.c b/src/mesa/main/shaders.c
index aeb5d4ca2a..e5c54bb10d 100644
--- a/src/mesa/main/shaders.c
+++ b/src/mesa/main/shaders.c
@@ -247,7 +247,18 @@ _mesa_GetObjectParameterivARB(GLhandleARB object, GLenum pname, GLint *params)
       }
    }
    else {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "glGetObjectParameterivARB");
+      /* error code depends on pname */
+      GLenum err;
+      switch (pname) {
+      case GL_OBJECT_TYPE_ARB:
+      case GL_OBJECT_DELETE_STATUS_ARB:
+      case GL_OBJECT_INFO_LOG_LENGTH_ARB:
+         err = GL_INVALID_OPERATION;
+         break;
+      default:
+         err = GL_INVALID_VALUE;
+      }
+      _mesa_error(ctx, err, "glGetObjectParameterivARB");
    }
 }
 
-- 
cgit v1.2.3


From f863ae1a040c358728d8608531ae3eb695f3af9e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 10 Oct 2008 16:29:54 -0600
Subject: mesa: remove unneeded includes

---
 src/mesa/main/blend.c     | 1 -
 src/mesa/main/getstring.c | 1 -
 2 files changed, 2 deletions(-)

diff --git a/src/mesa/main/blend.c b/src/mesa/main/blend.c
index 4d4a897141..39cf6153e2 100644
--- a/src/mesa/main/blend.c
+++ b/src/mesa/main/blend.c
@@ -31,7 +31,6 @@
 
 #include "glheader.h"
 #include "blend.h"
-#include "colormac.h"
 #include "context.h"
 #include "enums.h"
 #include "macros.h"
diff --git a/src/mesa/main/getstring.c b/src/mesa/main/getstring.c
index 1a82ccce59..94bf5de1e8 100644
--- a/src/mesa/main/getstring.c
+++ b/src/mesa/main/getstring.c
@@ -25,7 +25,6 @@
 
 
 #include "glheader.h"
-#include "colormac.h"
 #include "context.h"
 #include "get.h"
 #include "version.h"
-- 
cgit v1.2.3


From bf9d9a9d01b7697f4a30305cb9574430cba351fa Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 10 Oct 2008 16:30:18 -0600
Subject: mesa: include needed header

---
 src/mesa/swrast/s_texcombine.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mesa/swrast/s_texcombine.c b/src/mesa/swrast/s_texcombine.c
index 4e3d329075..632d650007 100644
--- a/src/mesa/swrast/s_texcombine.c
+++ b/src/mesa/swrast/s_texcombine.c
@@ -26,6 +26,7 @@
 #include "main/glheader.h"
 #include "main/context.h"
 #include "main/colormac.h"
+#include "main/image.h"
 #include "main/imports.h"
 #include "main/macros.h"
 #include "main/pixel.h"
-- 
cgit v1.2.3


From 24748268a3ac7bedc2c9ae5bf76c4c741d539f80 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 10 Oct 2008 16:30:43 -0600
Subject: mesa: fix assorted issues in _mesa_texstore_argb8888()

If we shift bytes into the texel word (or use the PACK_COLOR_8888 macro),
we don't have to worry about big vs. little endian.  See comments about
texel formats in texformat.h.
Remove an unneeded/incorrect else-if clause that produced wrong results
on big-endian systems.
---
 src/mesa/main/texstore.c | 50 ++++++++++--------------------------------------
 1 file changed, 10 insertions(+), 40 deletions(-)

diff --git a/src/mesa/main/texstore.c b/src/mesa/main/texstore.c
index 56f5b2ebaa..75b14c2bf0 100644
--- a/src/mesa/main/texstore.c
+++ b/src/mesa/main/texstore.c
@@ -1,6 +1,6 @@
 /*
  * Mesa 3-D graphics library
- * Version:  7.1
+ * Version:  7.3
  *
  * Copyright (C) 1999-2008  Brian Paul   All Rights Reserved.
  *
@@ -1536,10 +1536,10 @@ _mesa_texstore_argb8888(TEXSTORE_PARAMS)
          for (row = 0; row < srcHeight; row++) {
             GLuint *d4 = (GLuint *) dstRow;
             for (col = 0; col < srcWidth; col++) {
-               d4[col] = ((0xff                    << 24) |
-                          (srcRow[col * 3 + RCOMP] << 16) |
-                          (srcRow[col * 3 + GCOMP] <<  8) |
-                          (srcRow[col * 3 + BCOMP] <<  0));
+               d4[col] = PACK_COLOR_8888(0xff,
+                                         srcRow[col * 3 + RCOMP],
+                                         srcRow[col * 3 + GCOMP],
+                                         srcRow[col * 3 + BCOMP]);
             }
             dstRow += dstRowStride;
             srcRow += srcRowStride;
@@ -1551,8 +1551,7 @@ _mesa_texstore_argb8888(TEXSTORE_PARAMS)
 	    dstFormat == &_mesa_texformat_argb8888 &&
             srcFormat == GL_RGBA &&
 	    baseInternalFormat == GL_RGBA &&
-            srcType == GL_UNSIGNED_BYTE &&
-            littleEndian) {
+            srcType == GL_UNSIGNED_BYTE) {
       /* same as above case, but src data has alpha too */
       GLint img, row, col;
       /* For some reason, streaming copies to write-combined regions
@@ -1573,39 +1572,10 @@ _mesa_texstore_argb8888(TEXSTORE_PARAMS)
          for (row = 0; row < srcHeight; row++) {
             GLuint *d4 = (GLuint *) dstRow;
             for (col = 0; col < srcWidth; col++) {
-               d4[col] = ((srcRow[col * 4 + ACOMP] << 24) |
-                          (srcRow[col * 4 + RCOMP] << 16) |
-                          (srcRow[col * 4 + GCOMP] <<  8) |
-                          (srcRow[col * 4 + BCOMP] <<  0));
-            }
-            dstRow += dstRowStride;
-            srcRow += srcRowStride;
-         }
-      }
-   }
-   else if (!ctx->_ImageTransferState &&
-            !srcPacking->SwapBytes &&
-	    dstFormat == &_mesa_texformat_argb8888 &&
-            srcFormat == GL_RGBA &&
-	    baseInternalFormat == GL_RGBA &&
-            srcType == GL_UNSIGNED_BYTE) {
-
-      GLint img, row, col;
-      for (img = 0; img < srcDepth; img++) {
-         const GLint srcRowStride = _mesa_image_row_stride(srcPacking,
-                                                 srcWidth, srcFormat, srcType);
-         GLubyte *srcRow = (GLubyte *) _mesa_image_address(dims, srcPacking,
-                  srcAddr, srcWidth, srcHeight, srcFormat, srcType, img, 0, 0);
-         GLubyte *dstRow = (GLubyte *) dstAddr
-            + dstImageOffsets[dstZoffset + img] * dstFormat->TexelBytes
-            + dstYoffset * dstRowStride
-            + dstXoffset * dstFormat->TexelBytes;
-         for (row = 0; row < srcHeight; row++) {
-            for (col = 0; col < srcWidth; col++) {
-               dstRow[col * 4 + 0] = srcRow[col * 4 + BCOMP];
-               dstRow[col * 4 + 1] = srcRow[col * 4 + GCOMP];
-               dstRow[col * 4 + 2] = srcRow[col * 4 + RCOMP];
-               dstRow[col * 4 + 3] = srcRow[col * 4 + ACOMP];
+               d4[col] = PACK_COLOR_8888(srcRow[col * 4 + ACOMP],
+                                         srcRow[col * 4 + RCOMP],
+                                         srcRow[col * 4 + GCOMP],
+                                         srcRow[col * 4 + BCOMP]);
             }
             dstRow += dstRowStride;
             srcRow += srcRowStride;
-- 
cgit v1.2.3


From c011a9ca8c83f27eb2c9198f9b4a95ccbf9accf9 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Fri, 10 Oct 2008 09:43:15 -0700
Subject: i965: Add missing intel_pixel_draw.c symlink to fix build.

---
 src/mesa/drivers/dri/i965/intel_pixel_draw.c | 1 +
 1 file changed, 1 insertion(+)
 create mode 120000 src/mesa/drivers/dri/i965/intel_pixel_draw.c

diff --git a/src/mesa/drivers/dri/i965/intel_pixel_draw.c b/src/mesa/drivers/dri/i965/intel_pixel_draw.c
new file mode 120000
index 0000000000..8431a24edf
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/intel_pixel_draw.c
@@ -0,0 +1 @@
+../intel/intel_pixel_draw.c
\ No newline at end of file
-- 
cgit v1.2.3


From 33ff407874d6fed998b1f01dab68a2b4f1df988a Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Fri, 10 Oct 2008 11:47:43 -0700
Subject: intel: GLSL 1.20 is broken in Mesa, so disable it in the i965 driver

---
 src/mesa/drivers/dri/intel/intel_context.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/mesa/drivers/dri/intel/intel_context.c b/src/mesa/drivers/dri/intel/intel_context.c
index e6c0d3175e..2b3a9b9d37 100644
--- a/src/mesa/drivers/dri/intel/intel_context.c
+++ b/src/mesa/drivers/dri/intel/intel_context.c
@@ -398,7 +398,11 @@ static const struct dri_extension brw_extensions[] = {
    { "GL_ARB_point_sprite", 		  NULL },
    { "GL_ARB_shader_objects",             GL_ARB_shader_objects_functions },
    { "GL_ARB_shading_language_100",       GL_VERSION_2_0_functions },
+#if 0
+   /* Support for GLSL 1.20 is currently broken in core Mesa.
+    */
    { "GL_ARB_shading_language_120",       GL_VERSION_2_1_functions },
+#endif
    { "GL_ARB_shadow",                     NULL },
    { "GL_ARB_texture_non_power_of_two",   NULL },
    { "GL_ARB_vertex_shader",              GL_ARB_vertex_shader_functions },
-- 
cgit v1.2.3


From 01e312a73b68dc5ddffca0d1b1472fc5dcb6f59e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 10 Oct 2008 16:36:40 -0600
Subject: cell: pass texture unit (sampler number) to txp() function

The glsl/multitex demo runs now.
---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 4 ++++
 src/gallium/drivers/cell/spu/spu_funcs.c   | 5 +++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 3d0e7976df..ef84059d8f 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -1285,9 +1285,12 @@ static boolean
 emit_TXP(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
    const uint addr = lookup_function(gen->cell, "spu_txp");
+   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
    int ch;
    int coord_regs[4], d_regs[4];
 
+   assert(inst->FullSrcRegisters[1].SrcRegister.File == TGSI_FILE_SAMPLER);
+
    spe_comment(gen->f, -4, "CALL txp:");
 
    /* get src/dst reg info */
@@ -1314,6 +1317,7 @@ emit_TXP(struct codegen *gen, const struct tgsi_full_instruction *inst)
       for (i = 0; i < 4; i++) {
          spe_move(gen->f, 3 + i, coord_regs[i]);
       }
+      spe_load_uint(gen->f, 7, unit); /* sampler unit */
 
       /* branch to function, save return addr */
       spe_brasl(gen->f, SPE_REG_RA, addr);
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index c7bcb3de9d..7dd7fcd253 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -101,9 +101,10 @@ spu_log2(vector float x)
 }
 
 static struct vec_4x4
-spu_txp(vector float s, vector float t, vector float r, vector float q)
+spu_txp(vector float s, vector float t, vector float r, vector float q,
+        unsigned unit)
 {
-   const uint unit = 0;
+   //const uint unit = 0;
    struct vec_4x4 colors;
    vector float coords[4];
 
-- 
cgit v1.2.3


From ecac7996d4c5a1e492ce97c5f5cac885941fc711 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 10 Oct 2008 17:48:16 -0600
Subject: cell: more instruction scheduling optimizations (MIN/MAX/LERP/etc)

Also, optimize register->memory stores.
---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 201 ++++++++++++++++++++---------
 1 file changed, 140 insertions(+), 61 deletions(-)

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index ef84059d8f..68093d9e83 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -77,7 +77,7 @@ struct codegen
 
    /** Per-instruction temps / intermediate temps */
    int num_itemps;
-   int itemps[10];
+   int itemps[12];
 
    /** Current IF/ELSE/ENDIF nesting level */
    int if_nesting;
@@ -167,6 +167,37 @@ get_exec_mask_reg(struct codegen *gen)
 }
 
 
+static boolean
+is_register_src(struct codegen *gen, int channel,
+                const struct tgsi_full_src_register *src)
+{
+   int swizzle = tgsi_util_get_full_src_register_extswizzle(src, channel);
+   int sign_op = tgsi_util_get_full_src_register_sign_mode(src, channel);
+
+   if (swizzle > TGSI_SWIZZLE_W || sign_op != TGSI_UTIL_SIGN_KEEP) {
+      return FALSE;
+   }
+   if (src->SrcRegister.File == TGSI_FILE_TEMPORARY ||
+       src->SrcRegister.File == TGSI_FILE_IMMEDIATE) {
+      return TRUE;
+   }
+   return FALSE;
+}
+
+  
+static boolean
+is_memory_dst(struct codegen *gen, int channel,
+              const struct tgsi_full_dst_register *dst)
+{
+   if (dst->DstRegister.File == TGSI_FILE_OUTPUT) {
+      return TRUE;
+   }
+   else {
+      return FALSE;
+   }
+}
+
+  
 /**
  * Return the index of the SPU temporary containing the named TGSI
  * source register.  If the TGSI register is a TGSI_FILE_TEMPORARY we
@@ -226,11 +257,6 @@ get_src_reg(struct codegen *gen,
             spe_lqd(gen->f, reg, gen->constants_reg, offset * 16);
          }
          break;
-      case TGSI_FILE_SAMPLER:
-         {
-            reg = 3; /* XXX total hack */
-         }
-         break;
       default:
          assert(0);
       }
@@ -257,7 +283,7 @@ get_src_reg(struct codegen *gen,
       }
 
       /* mask with bit 31 set, the rest cleared */
-      spe_load_int(gen->f, bit31mask_reg, (1 << 31));
+      spe_load_uint(gen->f, bit31mask_reg, (1 << 31));
 
       if (sign_op == TGSI_UTIL_SIGN_CLEAR) {
          spe_andc(gen->f, result_reg, reg, bit31mask_reg);
@@ -434,6 +460,7 @@ static boolean
 emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
    int ch, src_reg[4], dst_reg[4];
+
    spe_comment(gen->f, -4, "MOV:");
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
@@ -441,11 +468,18 @@ emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
          dst_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
       }
    }
+
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         /* XXX we don't always need to actually emit a mov instruction here */
-         spe_move(gen->f, dst_reg[ch], src_reg[ch]);
-         store_dest_reg(gen, dst_reg[ch], ch, &inst->FullDstRegisters[0]);
+         if (is_register_src(gen, ch, &inst->FullSrcRegisters[0]) &&
+             is_memory_dst(gen, ch, &inst->FullDstRegisters[0])) {
+            /* special-case: register to memory store */
+            store_dest_reg(gen, src_reg[ch], ch, &inst->FullDstRegisters[0]);
+         }
+         else {
+            spe_move(gen->f, dst_reg[ch], src_reg[ch]);
+            store_dest_reg(gen, dst_reg[ch], ch, &inst->FullDstRegisters[0]);
+         }
          free_itemps(gen);
       }
    }
@@ -546,23 +580,37 @@ emit_MAD(struct codegen *gen, const struct tgsi_full_instruction *inst)
 static boolean
 emit_LERP(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
-   int ch;
-   int tmp_reg = get_itemp(gen);
+   int ch, s1_reg[4], s2_reg[4], s3_reg[4], d_reg[4], tmp_reg[4];
    spe_comment(gen->f, -4, "LERP:");
+   /* setup/get src/dst/temp regs */
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
-         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
-         int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
-         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         s3_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
+         d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         tmp_reg[ch] = get_itemp(gen);
+      }
+   }
 
-         /* d = s3 + s1(s2 - s3) */
-         spe_fs(gen->f, tmp_reg, s2_reg, s3_reg);
-         spe_fma(gen->f, d_reg, tmp_reg, s1_reg, s3_reg);
-         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
-         free_itemps(gen);
+   /* d = s3 + s1(s2 - s3) */
+   /* do all subtracts, then all fma, then all stores to better pipeline */
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         spe_fs(gen->f, tmp_reg[ch], s2_reg[ch], s3_reg[ch]);
+      }
+   }
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         spe_fma(gen->f, d_reg[ch], tmp_reg[ch], s1_reg[ch], s3_reg[ch]);
+      }
+   }
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
       }
    }
+   free_itemps(gen);
    return true;
 }
 
@@ -651,7 +699,7 @@ emit_ABS(struct codegen *gen, const struct tgsi_full_instruction *inst)
          const int bit31mask_reg = get_itemp(gen);
 
          /* mask with bit 31 set, the rest cleared */  
-         spe_load_int(gen->f, bit31mask_reg, (1 << 31));
+         spe_load_uint(gen->f, bit31mask_reg, (1 << 31));
 
          /* d = sign bit cleared in s1 */
          spe_andc(gen->f, d_reg, s1_reg, bit31mask_reg);
@@ -714,35 +762,41 @@ static boolean
 emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
    int ch;
+   int s0x_reg, s0y_reg, s0z_reg, s0w_reg;
+   int s1x_reg, s1y_reg, s1z_reg, s1w_reg;
+   int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen);
+
    spe_comment(gen->f, -4, "DP4:");
 
-   int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
-   int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
-   int tmp_reg = get_itemp(gen);
+   s0x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+   s1x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+   s0y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+   s1y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+   s0z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+   s1z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+   s0w_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[0]);
+   s1w_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]);
 
-   /* t = x0 * x1 */
-   spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
+   /* t0 = x0 * x1 */
+   spe_fm(gen->f, t0_reg, s0x_reg, s1x_reg);
 
-   s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
-   s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
-   /* t = y0 * y1 + t */
-   spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+   /* t1 = y0 * y1 */
+   spe_fm(gen->f, t1_reg, s0y_reg, s1y_reg);
 
-   s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
-   s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
-   /* t = z0 * z1 + t */
-   spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+   /* t0 = z0 * z1 + t0 */
+   spe_fma(gen->f, t0_reg, s0z_reg, s1z_reg, t0_reg);
 
-   s1_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[0]);
-   s2_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]);
-   /* t = w0 * w1 + t */
-   spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+   /* t1 = w0 * w1 + t1 */
+   spe_fma(gen->f, t1_reg, s0w_reg, s1w_reg, t1_reg);
+
+   /* t0 = t0 + t1 */
+   spe_fa(gen->f, t0_reg, t0_reg, t1_reg);
 
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
          int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
-         spe_move(gen->f, d_reg, tmp_reg);
-         store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]);
+         spe_move(gen->f, d_reg, t0_reg);
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
       }
    }
 
@@ -756,6 +810,7 @@ emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst)
 static boolean
 emit_DPH(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
+   /* XXX rewrite this function to look more like DP3/DP4 */
    int ch;
    spe_comment(gen->f, -4, "DPH:");
 
@@ -1357,26 +1412,38 @@ emit_TXP(struct codegen *gen, const struct tgsi_full_instruction *inst)
 static boolean
 emit_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
-   int ch;
+   int ch, s0_reg[4], s1_reg[4], d_reg[4], tmp_reg[4];
 
    spe_comment(gen->f, -4, "MAX:");
 
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
-         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
-         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
-         int tmp_reg = get_itemp(gen);
+         s0_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         tmp_reg[ch] = get_itemp(gen);         
+      }
+   }
 
-         /* d = (s1 > s2) ? s1 : s2 */
-         spe_fcgt(gen->f, tmp_reg, s1_reg, s2_reg);
-         spe_selb(gen->f, d_reg, s2_reg, s1_reg, tmp_reg);
+   /* d = (s0 > s1) ? s0 : s1 */
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         spe_fcgt(gen->f, tmp_reg[ch], s0_reg[ch], s1_reg[ch]);
+      }
+   }
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         spe_selb(gen->f, d_reg[ch], s1_reg[ch], s0_reg[ch], tmp_reg[ch]);
+      }
+   }
 
-         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
-         free_itemps(gen);
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
       }
    }
 
+   free_itemps(gen);
    return true;
 }
 
@@ -1386,26 +1453,38 @@ emit_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst)
 static boolean
 emit_MIN(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
-   int ch;
+   int ch, s0_reg[4], s1_reg[4], d_reg[4], tmp_reg[4];
 
    spe_comment(gen->f, -4, "MIN:");
 
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
-         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
-         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
-         int tmp_reg = get_itemp(gen);
+         s0_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         tmp_reg[ch] = get_itemp(gen);         
+      }
+   }
 
-         /* d = (s2 > s1) ? s1 : s2 */
-         spe_fcgt(gen->f, tmp_reg, s2_reg, s1_reg);
-         spe_selb(gen->f, d_reg, s2_reg, s1_reg, tmp_reg);
+   /* d = (s1 > s0) ? s0 : s1 */
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         spe_fcgt(gen->f, tmp_reg[ch], s1_reg[ch], s0_reg[ch]);
+      }
+   }
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         spe_selb(gen->f, d_reg[ch], s1_reg[ch], s0_reg[ch], tmp_reg[ch]);
+      }
+   }
 
-         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
-         free_itemps(gen);
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
       }
    }
 
+   free_itemps(gen);
    return true;
 }
 
-- 
cgit v1.2.3


From 5bc8ebb12be99ac769a0f2ad1f77a16ebb2bf41f Mon Sep 17 00:00:00 2001
From: Alan Hourihane <alanh@tungstengraphics.com>
Date: Mon, 13 Oct 2008 12:30:40 +0100
Subject: mesa: when emitting an address load instruction for indexed
 elements, ensure we write to a single register.

---
 src/mesa/shader/slang/slang_emit.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/mesa/shader/slang/slang_emit.c b/src/mesa/shader/slang/slang_emit.c
index 9e8daa1051..f3c3fa6c5b 100644
--- a/src/mesa/shader/slang/slang_emit.c
+++ b/src/mesa/shader/slang/slang_emit.c
@@ -1579,13 +1579,17 @@ emit_array_element(slang_emit_info *emitInfo, slang_ir_node *n)
    else {
       /* Variable array index */
       struct prog_instruction *inst;
+      slang_ir_storage dstStore = *n->Store;
 
       /* do codegen for array index expression */
       emit(emitInfo, n->Children[1]);
 
       inst = new_instruction(emitInfo, OPCODE_ARL);
 
-      storage_to_dst_reg(&inst->DstReg, n->Store, n->Writemask);
+      if (dstStore.Size > 4)
+         dstStore.Size = 4; /* only emit one instruction */
+
+      storage_to_dst_reg(&inst->DstReg, &dstStore, n->Writemask);
       storage_to_src_reg(&inst->SrcReg[0], n->Children[1]->Store);
 
       inst->DstReg.File = PROGRAM_ADDRESS;
-- 
cgit v1.2.3


From 734685549ca7dbee78845fdef1d65aceaa729845 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 13 Oct 2008 10:54:11 -0600
Subject: cell: added spu_unpack_A8R8G8B8_transpose4()

Plus, clearer shuffle masks in other funcs.
---
 src/gallium/drivers/cell/spu/spu_colorpack.h | 49 ++++++++++++++++++++++++----
 1 file changed, 42 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/cell/spu/spu_colorpack.h b/src/gallium/drivers/cell/spu/spu_colorpack.h
index fd8dc6ded3..d7ce005524 100644
--- a/src/gallium/drivers/cell/spu/spu_colorpack.h
+++ b/src/gallium/drivers/cell/spu/spu_colorpack.h
@@ -31,6 +31,7 @@
 #define SPU_COLORPACK_H
 
 
+#include <transpose_matrix4x4.h>
 #include <spu_intrinsics.h>
 
 
@@ -84,10 +85,10 @@ spu_unpack_B8G8R8A8(uint color)
    vector unsigned int color_u4 = spu_splats(color);
    color_u4 = spu_shuffle(color_u4, color_u4,
                           ((vector unsigned char) {
-                             10, 10, 10, 10,
-                             5, 5, 5, 5,
+                             2, 2, 2, 2,
+                             1, 1, 1, 1,
                              0, 0, 0, 0,
-                             15, 15, 15, 15}) );
+                             3, 3, 3, 3}) );
    return spu_convtf(color_u4, 32);
 }
 
@@ -98,13 +99,47 @@ spu_unpack_A8R8G8B8(uint color)
    vector unsigned int color_u4 = spu_splats(color);
    color_u4 = spu_shuffle(color_u4, color_u4,
                           ((vector unsigned char) {
-                             5, 5, 5, 5,
-                             10, 10, 10, 10,
-                             15, 15, 15, 15,
+                             1, 1, 1, 1,
+                             2, 2, 2, 2,
+                             3, 3, 3, 3,
                              0, 0, 0, 0}) );
-
    return spu_convtf(color_u4, 32);
 }
 
 
+/**
+ * \param color_in - array of 32-bit packed ARGB colors
+ * \param color_out - returns float colors in RRRR, GGGG, BBBB, AAAA order
+ */
+static INLINE void
+spu_unpack_A8R8G8B8_transpose4(const vector unsigned int color_in[4],
+                               vector float color_out[4])
+{
+   vector unsigned int c0;
+
+   c0 = spu_shuffle(color_in[0], color_in[0],
+                    ((vector unsigned char) {
+                       1, 1, 1, 1,  2, 2, 2, 2,  3, 3, 3, 3,  0, 0, 0, 0}) );
+   color_out[0] = spu_convtf(c0, 32);
+
+   c0 = spu_shuffle(color_in[1], color_in[1],
+                    ((vector unsigned char) {
+                       1, 1, 1, 1,  2, 2, 2, 2,  3, 3, 3, 3,  0, 0, 0, 0}) );
+   color_out[1] = spu_convtf(c0, 32);
+
+   c0 = spu_shuffle(color_in[2], color_in[2],
+                    ((vector unsigned char) {
+                       1, 1, 1, 1,  2, 2, 2, 2,  3, 3, 3, 3,  0, 0, 0, 0}) );
+   color_out[2] = spu_convtf(c0, 32);
+
+   c0 = spu_shuffle(color_in[3], color_in[3],
+                    ((vector unsigned char) {
+                       1, 1, 1, 1,  2, 2, 2, 2,  3, 3, 3, 3,  0, 0, 0, 0}) );
+   color_out[3] = spu_convtf(c0, 32);
+
+   _transpose_matrix4x4(color_out, color_out);
+}
+
+
+
 #endif /* SPU_COLORPACK_H */
-- 
cgit v1.2.3


From 3b07c28dee74c7aa3be5efac8084d610675af291 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 13 Oct 2008 10:55:08 -0600
Subject: cell: do texture sampling/filtering for four pixels at a time.

---
 src/gallium/drivers/cell/spu/spu_command.c |  11 ++-
 src/gallium/drivers/cell/spu/spu_funcs.c   |   4 +
 src/gallium/drivers/cell/spu/spu_main.h    |  19 ++++-
 src/gallium/drivers/cell/spu/spu_texture.c | 125 ++++++++++++++++++++++++++++-
 src/gallium/drivers/cell/spu/spu_texture.h |  12 +++
 5 files changed, 161 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 91a4c137e7..c59be7defd 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -301,10 +301,14 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
    DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit);
 
    spu.sampler[sampler->unit] = sampler->state;
-   if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR)
+   if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) {
       spu.sample_texture[sampler->unit] = sample_texture_bilinear;
-   else
+      spu.sample_texture4[sampler->unit] = sample_texture4_bilinear;
+   }
+   else {
       spu.sample_texture[sampler->unit] = sample_texture_nearest;
+      spu.sample_texture4[sampler->unit] = sample_texture4_nearest;
+   }
 }
 
 
@@ -323,6 +327,9 @@ cmd_state_texture(const struct cell_command_texture *texture)
    spu.texture[unit].width = width;
    spu.texture[unit].height = height;
 
+   spu.texture[unit].width4 = spu_splats((float) width);
+   spu.texture[unit].height4 = spu_splats((float) height);
+
    spu.texture[unit].tiles_per_row = width / TILE_SIZE;
 
    spu.texture[unit].tex_size = (vector float) { width, height, 0.0, 0.0};
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index 7dd7fcd253..13c234ea2e 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -106,6 +106,7 @@ spu_txp(vector float s, vector float t, vector float r, vector float q,
 {
    //const uint unit = 0;
    struct vec_4x4 colors;
+#if 0
    vector float coords[4];
 
    coords[0] = s;
@@ -121,6 +122,9 @@ spu_txp(vector float s, vector float t, vector float r, vector float q,
    colors.v[3] = spu.sample_texture[unit](unit, coords[3]);
 
    _transpose_matrix4x4(colors.v, colors.v);
+#else
+   spu.sample_texture4[unit](s, t, r, q, unit, colors.v);
+#endif
    return colors;
 }
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 82c9c69a3a..5d14be51c2 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -67,6 +67,14 @@ typedef union {
 typedef vector float (*spu_sample_texture_func)(uint unit,
                                                 vector float texcoord);
 
+typedef void (*spu_sample_texture4_func)(vector float s,
+                                         vector float t,
+                                         vector float r,
+                                         vector float q,
+                                         uint unit,
+                                         vector float colors[4]);
+
+
 /** Function for performing per-fragment ops */
 typedef void (*spu_fragment_ops_func)(uint x, uint y,
                                       tile_t *colorTile,
@@ -107,10 +115,12 @@ struct spu_texture
    void *start;
    ushort width, height;
    ushort tiles_per_row;
-   vector float tex_size;
-   vector unsigned int tex_size_mask; /**< == int(size - 1) */
-   vector unsigned int tex_size_x_mask; /**< == int(size - 1) */
-   vector unsigned int tex_size_y_mask; /**< == int(size - 1) */
+   vector float tex_size; /**< == {width, height, 0, 0} */
+   vector float width4;   /**< == {width, width, width, width} */
+   vector float height4;  /**< == {height, height, height, height} */
+   vector unsigned int tex_size_mask; /**< == {width-1, height-1, 0, 0 } */
+   vector unsigned int tex_size_x_mask; /**< splat(width-1) */
+   vector unsigned int tex_size_y_mask; /**< splat(height-1) */
 } ALIGN16_ATTRIB;
 
 
@@ -159,6 +169,7 @@ struct spu_global
 
    /** Current texture sampler function */
    spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
+   spu_sample_texture4_func sample_texture4[CELL_MAX_SAMPLERS];
 
    /** Fragment program constants */
    vector float constants[4 * CELL_MAX_CONSTANTS];
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 117b8a36f8..12e6ed1ba1 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -26,6 +26,8 @@
  **************************************************************************/
 
 
+#include <transpose_matrix4x4.h>
+
 #include "pipe/p_compiler.h"
 #include "spu_main.h"
 #include "spu_texture.h"
@@ -91,10 +93,10 @@ static void
 get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
 {
    const unsigned texture_ea = (uintptr_t) spu.texture[unit].start;
-   vec_uint4 tile_x = spu_rlmask(x, -5);
-   vec_uint4 tile_y = spu_rlmask(y, -5);
-   const qword offset_x = si_andi((qword) x, 0x1f);
-   const qword offset_y = si_andi((qword) y, 0x1f);
+   vec_uint4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
+   vec_uint4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
+   const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */
+   const qword offset_y = si_andi((qword) y, 0x1f); /* offset_y = y & 0x1f */
 
    const qword tiles_per_row = (qword) spu_splats(spu.texture[unit].tiles_per_row);
    const qword tile_size = (qword) spu_splats((unsigned) sizeof(tile_t));
@@ -132,6 +134,31 @@ sample_texture_nearest(uint unit, vector float texcoord)
 }
 
 
+/**
+ * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
+ */
+void
+sample_texture4_nearest(vector float s, vector float t,
+                        vector float r, vector float q,
+                        uint unit, vector float colors[4])
+{
+   vector float ss = spu_mul(s, spu.texture[unit].width4);
+   vector float tt = spu_mul(t, spu.texture[unit].height4);
+   vector unsigned int is = spu_convtu(ss, 0);
+   vector unsigned int it = spu_convtu(tt, 0);
+   vec_uint4 texels[4];
+
+   /* GL_REPEAT wrap mode: */
+   is = spu_and(is, spu.texture[unit].tex_size_x_mask);
+   it = spu_and(it, spu.texture[unit].tex_size_y_mask);
+
+   get_four_texels(unit, is, it, texels);
+
+   /* convert four packed ARGB pixels to float RRRR,GGGG,BBBB,AAAA */
+   spu_unpack_A8R8G8B8_transpose4(texels, colors);
+}
+
+
 vector float
 sample_texture_bilinear(uint unit, vector float texcoord)
 {
@@ -198,3 +225,93 @@ sample_texture_bilinear(uint unit, vector float texcoord)
 
    return texel_sum;
 }
+
+
+void
+sample_texture4_bilinear(vector float s, vector float t,
+                         vector float r, vector float q,
+                         uint unit, vector float colors[4])
+{
+   vector float ss = spu_madd(s, spu.texture[unit].width4,  spu_splats(-0.5f));
+   vector float tt = spu_madd(t, spu.texture[unit].height4, spu_splats(-0.5f));
+
+   vector unsigned int is0 = spu_convtu(ss, 0);
+   vector unsigned int it0 = spu_convtu(tt, 0);
+
+   /* is + 1, it + 1 */
+   vector unsigned int is1 = spu_add(is0, 1);
+   vector unsigned int it1 = spu_add(it0, 1);
+
+   /* PIPE_TEX_WRAP_REPEAT */
+   is0 = spu_and(is0, spu.texture[unit].tex_size_x_mask);
+   it0 = spu_and(it0, spu.texture[unit].tex_size_y_mask);
+   is1 = spu_and(is1, spu.texture[unit].tex_size_x_mask);
+   it1 = spu_and(it1, spu.texture[unit].tex_size_y_mask);
+
+   /* get packed int texels */
+   vector unsigned int texels[16];
+   get_four_texels(unit, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, is1, it1, texels + 12); /* lower-right */
+
+   /* XXX possibly rework following code to compute the weighted sample
+    * colors with integer arithmetic for fewer int->float conversions.
+    */
+
+   /* convert packed int texels to float colors */
+   vector float ftexels[16];
+   spu_unpack_A8R8G8B8_transpose4(texels + 0, ftexels + 0);
+   spu_unpack_A8R8G8B8_transpose4(texels + 4, ftexels + 4);
+   spu_unpack_A8R8G8B8_transpose4(texels + 8, ftexels + 8);
+   spu_unpack_A8R8G8B8_transpose4(texels + 12, ftexels + 12);
+
+   /* Compute weighting factors in [0,1]
+    * Multiply texcoord by 1024, AND with 1023, convert back to float.
+    */
+   vector float ss1024 = spu_mul(ss, spu_splats(1024.0f));
+   vector signed int iss1024 = spu_convts(ss1024, 0);
+   iss1024 = spu_and(iss1024, 1023);
+   vector float sWeights0 = spu_convtf(iss1024, 10);
+
+   vector float tt1024 = spu_mul(tt, spu_splats(1024.0f));
+   vector signed int itt1024 = spu_convts(tt1024, 0);
+   itt1024 = spu_and(itt1024, 1023);
+   vector float tWeights0 = spu_convtf(itt1024, 10);
+
+   /* 1 - sWeight and 1 - tWeight */
+   vector float sWeights1 = spu_sub(spu_splats(1.0f), sWeights0);
+   vector float tWeights1 = spu_sub(spu_splats(1.0f), tWeights0);
+
+   /* reds, for four pixels */
+   ftexels[ 0] = spu_mul(ftexels[ 0], spu_mul(sWeights1, tWeights1)); /*ul*/
+   ftexels[ 4] = spu_mul(ftexels[ 4], spu_mul(sWeights0, tWeights1)); /*ur*/
+   ftexels[ 8] = spu_mul(ftexels[ 8], spu_mul(sWeights1, tWeights0)); /*ll*/
+   ftexels[12] = spu_mul(ftexels[12], spu_mul(sWeights0, tWeights0)); /*lr*/
+   colors[0] = spu_add(spu_add(ftexels[0], ftexels[4]),
+                       spu_add(ftexels[8], ftexels[12]));
+
+   /* greens, for four pixels */
+   ftexels[ 1] = spu_mul(ftexels[ 1], spu_mul(sWeights1, tWeights1)); /*ul*/
+   ftexels[ 5] = spu_mul(ftexels[ 5], spu_mul(sWeights0, tWeights1)); /*ur*/
+   ftexels[ 9] = spu_mul(ftexels[ 9], spu_mul(sWeights1, tWeights0)); /*ll*/
+   ftexels[13] = spu_mul(ftexels[13], spu_mul(sWeights0, tWeights0)); /*lr*/
+   colors[1] = spu_add(spu_add(ftexels[1], ftexels[5]),
+                       spu_add(ftexels[9], ftexels[13]));
+
+   /* blues, for four pixels */
+   ftexels[ 2] = spu_mul(ftexels[ 2], spu_mul(sWeights1, tWeights1)); /*ul*/
+   ftexels[ 6] = spu_mul(ftexels[ 6], spu_mul(sWeights0, tWeights1)); /*ur*/
+   ftexels[10] = spu_mul(ftexels[10], spu_mul(sWeights1, tWeights0)); /*ll*/
+   ftexels[14] = spu_mul(ftexels[14], spu_mul(sWeights0, tWeights0)); /*lr*/
+   colors[2] = spu_add(spu_add(ftexels[2], ftexels[6]),
+                       spu_add(ftexels[10], ftexels[14]));
+
+   /* alphas, for four pixels */
+   ftexels[ 3] = spu_mul(ftexels[ 3], spu_mul(sWeights1, tWeights1)); /*ul*/
+   ftexels[ 7] = spu_mul(ftexels[ 7], spu_mul(sWeights0, tWeights1)); /*ur*/
+   ftexels[11] = spu_mul(ftexels[11], spu_mul(sWeights1, tWeights0)); /*ll*/
+   ftexels[15] = spu_mul(ftexels[15], spu_mul(sWeights0, tWeights0)); /*lr*/
+   colors[3] = spu_add(spu_add(ftexels[3], ftexels[7]),
+                       spu_add(ftexels[11], ftexels[15]));
+}
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index f7c9738be8..f019e7d8ef 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -40,8 +40,20 @@ extern vector float
 sample_texture_nearest(uint unit, vector float texcoord);
 
 
+extern void
+sample_texture4_nearest(vector float s, vector float t,
+                        vector float r, vector float q,
+                        uint unit, vector float colors[4]);
+
+
 extern vector float
 sample_texture_bilinear(uint unit, vector float texcoord);
 
 
+extern void
+sample_texture4_bilinear(vector float s, vector float t,
+                         vector float r, vector float q,
+                         uint unit, vector float colors[4]);
+
+
 #endif /* SPU_TEXTURE_H */
-- 
cgit v1.2.3


From c8fb3682619ea49c5fefdf8b88cdb95eac7478ff Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 13 Oct 2008 11:16:04 -0600
Subject: cell: remove old texture code

---
 src/gallium/drivers/cell/spu/spu_command.c |  2 -
 src/gallium/drivers/cell/spu/spu_funcs.c   | 19 -------
 src/gallium/drivers/cell/spu/spu_main.h    |  4 --
 src/gallium/drivers/cell/spu/spu_texture.c | 88 ++----------------------------
 src/gallium/drivers/cell/spu/spu_texture.h |  8 ---
 src/gallium/drivers/cell/spu/spu_tri.c     | 67 +----------------------
 6 files changed, 7 insertions(+), 181 deletions(-)

diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index c59be7defd..d4cc9a2146 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -302,11 +302,9 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
 
    spu.sampler[sampler->unit] = sampler->state;
    if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) {
-      spu.sample_texture[sampler->unit] = sample_texture_bilinear;
       spu.sample_texture4[sampler->unit] = sample_texture4_bilinear;
    }
    else {
-      spu.sample_texture[sampler->unit] = sample_texture_nearest;
       spu.sample_texture4[sampler->unit] = sample_texture4_nearest;
    }
 }
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index 13c234ea2e..4c90b701ee 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -104,27 +104,8 @@ static struct vec_4x4
 spu_txp(vector float s, vector float t, vector float r, vector float q,
         unsigned unit)
 {
-   //const uint unit = 0;
    struct vec_4x4 colors;
-#if 0
-   vector float coords[4];
-
-   coords[0] = s;
-   coords[1] = t;
-   coords[2] = r;
-   coords[3] = q;
-   _transpose_matrix4x4(coords, coords);
-
-   /* get four texture samples */
-   colors.v[0] = spu.sample_texture[unit](unit, coords[0]);
-   colors.v[1] = spu.sample_texture[unit](unit, coords[1]);
-   colors.v[2] = spu.sample_texture[unit](unit, coords[2]);
-   colors.v[3] = spu.sample_texture[unit](unit, coords[3]);
-
-   _transpose_matrix4x4(colors.v, colors.v);
-#else
    spu.sample_texture4[unit](s, t, r, q, unit, colors.v);
-#endif
    return colors;
 }
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 5d14be51c2..2a8cb00f8d 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -64,9 +64,6 @@ typedef union {
 
 
 /** Function for sampling textures */
-typedef vector float (*spu_sample_texture_func)(uint unit,
-                                                vector float texcoord);
-
 typedef void (*spu_sample_texture4_func)(vector float s,
                                          vector float t,
                                          vector float r,
@@ -168,7 +165,6 @@ struct spu_global
    spu_fragment_program_func fragment_program;
 
    /** Current texture sampler function */
-   spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
    spu_sample_texture4_func sample_texture4[CELL_MAX_SAMPLERS];
 
    /** Fragment program constants */
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 12e6ed1ba1..ba62ad27fd 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -120,21 +120,9 @@ get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
 }
 
 
-/**
- * Get texture sample at texcoord.
- */
-vector float
-sample_texture_nearest(uint unit, vector float texcoord)
-{
-   vector float tc = spu_mul(texcoord, spu.texture[unit].tex_size);
-   vector unsigned int itc = spu_convtu(tc, 0);  /* convert to int */
-   itc = spu_and(itc, spu.texture[unit].tex_size_mask); /* mask (GL_REPEAT) */
-   uint texel = get_texel(unit, itc);
-   return spu_unpack_A8R8G8B8(texel);
-}
-
 
 /**
+ * Do nearest texture sampling for four pixels.
  * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
  */
 void
@@ -148,7 +136,7 @@ sample_texture4_nearest(vector float s, vector float t,
    vector unsigned int it = spu_convtu(tt, 0);
    vec_uint4 texels[4];
 
-   /* GL_REPEAT wrap mode: */
+   /* PIPE_TEX_WRAP_REPEAT */
    is = spu_and(is, spu.texture[unit].tex_size_x_mask);
    it = spu_and(it, spu.texture[unit].tex_size_y_mask);
 
@@ -159,74 +147,10 @@ sample_texture4_nearest(vector float s, vector float t,
 }
 
 
-vector float
-sample_texture_bilinear(uint unit, vector float texcoord)
-{
-   static const vec_uint4 offset_x = {0, 0, 1, 1};
-   static const vec_uint4 offset_y = {0, 1, 0, 1};
-
-   vector float tc = spu_mul(texcoord, spu.texture[unit].tex_size);
-   tc = spu_add(tc, spu_splats(-0.5f));  /* half texel bias */
-
-   /* integer texcoords S,T: */
-   vec_uint4 itc = spu_convtu(tc, 0);  /* convert to int */
-
-   vec_uint4 texels[4];
-   
-   /* setup texcoords for quad:
-    *  +-----+-----+
-    *  |x0,y0|x1,y1|
-    *  +-----+-----+
-    *  |x2,y2|x3,y3|
-    *  +-----+-----+
-    */
-   vec_uint4 x = spu_splats(spu_extract(itc, 0));
-   vec_uint4 y = spu_splats(spu_extract(itc, 1));
-   x = spu_add(x, offset_x);
-   y = spu_add(y, offset_y);
-
-   /* GL_REPEAT wrap mode: */
-   x = spu_and(x, spu.texture[unit].tex_size_x_mask);
-   y = spu_and(y, spu.texture[unit].tex_size_y_mask);
-
-   get_four_texels(unit, x, y, texels);
-
-   /* integer A8R8G8B8 to float texel conversion */
-   vector float texel00 = spu_unpack_A8R8G8B8(spu_extract(texels[0], 0));
-   vector float texel01 = spu_unpack_A8R8G8B8(spu_extract(texels[1], 0));
-   vector float texel10 = spu_unpack_A8R8G8B8(spu_extract(texels[2], 0));
-   vector float texel11 = spu_unpack_A8R8G8B8(spu_extract(texels[3], 0));
-
-
-   /* Compute weighting factors in [0,1]
-    * Multiply texcoord by 1024, AND with 1023, convert back to float.
-    */
-   vector float tc1024 = spu_mul(tc, spu_splats(1024.0f));
-   vector signed int itc1024 = spu_convts(tc1024, 0);
-   itc1024 = spu_and(itc1024, spu_splats((1 << 10) - 1));
-   vector float weight = spu_convtf(itc1024, 10);
-
-   /* smeared frac and 1-frac */
-   vector float sfrac = spu_splats(spu_extract(weight, 0));
-   vector float tfrac = spu_splats(spu_extract(weight, 1));
-   vector float sfrac1 = spu_sub(spu_splats(1.0f), sfrac);
-   vector float tfrac1 = spu_sub(spu_splats(1.0f), tfrac);
-
-   /* multiply the samples (colors) by the S/T weights */
-   texel00 = spu_mul(spu_mul(texel00, sfrac1), tfrac1);
-   texel10 = spu_mul(spu_mul(texel10, sfrac ), tfrac1);
-   texel01 = spu_mul(spu_mul(texel01, sfrac1), tfrac );
-   texel11 = spu_mul(spu_mul(texel11, sfrac ), tfrac );
-
-   /* compute sum of weighted samples */
-   vector float texel_sum = spu_add(texel00, texel01);
-   texel_sum = spu_add(texel_sum, texel10);
-   texel_sum = spu_add(texel_sum, texel11);
-
-   return texel_sum;
-}
-
-
+/**
+ * Do bilinear texture sampling for four pixels.
+ * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
+ */
 void
 sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index f019e7d8ef..d576aed719 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -36,20 +36,12 @@ extern void
 invalidate_tex_cache(void);
 
 
-extern vector float
-sample_texture_nearest(uint unit, vector float texcoord);
-
-
 extern void
 sample_texture4_nearest(vector float s, vector float t,
                         vector float r, vector float q,
                         uint unit, vector float colors[4]);
 
 
-extern vector float
-sample_texture_bilinear(uint unit, vector float texcoord);
-
-
 extern void
 sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index a62d4f0f2f..022d21ba8f 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -286,72 +286,7 @@ emit_quad( int x, int y, mask_t mask)
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
       spu.cur_ztile_status = TILE_STATUS_DIRTY;
 
-      if (0/*spu.texture[0].start*/) {
-         /*
-          * Temporary texture mapping path
-          * This will go away when fragment programs support TEX inst.
-          */
-         const uint unit = 0;
-         vector float colors[4];
-         vector float texcoords[4];
-         eval_coeff(2, (float) x, (float) y, texcoords);
-
-         if (spu_extract(mask, 0))
-            colors[0] = spu.sample_texture[unit](unit, texcoords[0]);
-         if (spu_extract(mask, 1))
-            colors[1] = spu.sample_texture[unit](unit, texcoords[1]);
-         if (spu_extract(mask, 2))
-            colors[2] = spu.sample_texture[unit](unit, texcoords[2]);
-         if (spu_extract(mask, 3))
-            colors[3] = spu.sample_texture[unit](unit, texcoords[3]);
-
-
-         if (spu.texture[1].start) {
-            /* multi-texture mapping */
-            const uint unit = 1;
-            vector float colors1[4];
-
-            eval_coeff(2, (float) x, (float) y, texcoords);
-
-            if (spu_extract(mask, 0))
-               colors1[0] = spu.sample_texture[unit](unit, texcoords[0]);
-            if (spu_extract(mask, 1))
-               colors1[1] = spu.sample_texture[unit](unit, texcoords[1]);
-            if (spu_extract(mask, 2))
-               colors1[2] = spu.sample_texture[unit](unit, texcoords[2]);
-            if (spu_extract(mask, 3))
-               colors1[3] = spu.sample_texture[unit](unit, texcoords[3]);
-
-            /* hack: modulate first texture by second */
-            colors[0] = spu_mul(colors[0], colors1[0]);
-            colors[1] = spu_mul(colors[1], colors1[1]);
-            colors[2] = spu_mul(colors[2], colors1[2]);
-            colors[3] = spu_mul(colors[3], colors1[3]);
-         }
-
-         {
-            /* Convert fragment data from AoS to SoA format.
-             * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
-             * This is temporary!
-             */
-            vector float soa_frag[4];
-            _transpose_matrix4x4(soa_frag, colors);
-
-            vector float fragZ = eval_z((float) x, (float) y);
-
-            /* Do all per-fragment/quad operations here, including:
-             * alpha test, z test, stencil test, blend and framebuffer writing.
-             */
-            spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
-                             fragZ,
-                             soa_frag[0], soa_frag[1],
-                             soa_frag[2], soa_frag[3],
-                             mask,
-                             setup.facing);
-         }
-
-      }
-      else {
+      {
          /*
           * Run fragment shader, execute per-fragment ops, update fb/tile.
           */
-- 
cgit v1.2.3


From 67425aaa09df9cab76d7cc5c66e9e4595f0ccf40 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 13 Oct 2008 14:09:54 -0600
Subject: cell: bilinear texture filtering using integer arithmetic

Fewer float/int conversions involved.
---
 src/gallium/drivers/cell/spu/spu_texture.c | 144 +++++++++++++++++++++++++++++
 src/gallium/drivers/cell/spu/spu_texture.h |   5 +
 2 files changed, 149 insertions(+)

diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index ba62ad27fd..c10268131d 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -239,3 +239,147 @@ sample_texture4_bilinear(vector float s, vector float t,
    colors[3] = spu_add(spu_add(ftexels[3], ftexels[7]),
                        spu_add(ftexels[11], ftexels[15]));
 }
+
+
+
+/**
+ * Adapted from /opt/cell/sdk/usr/spu/include/transpose_matrix4x4.h
+ */
+static INLINE void
+transpose(vector unsigned int *mOut, vector unsigned int *mIn)
+{
+  vector unsigned int abcd, efgh, ijkl, mnop;	/* input vectors */
+  vector unsigned int aeim, bfjn, cgko, dhlp;	/* output vectors */
+  vector unsigned int aibj, ckdl, emfn, gohp;	/* intermediate vectors */
+
+  vector unsigned char shufflehi = ((vector unsigned char) {
+					       0x00, 0x01, 0x02, 0x03,
+					       0x10, 0x11, 0x12, 0x13,
+					       0x04, 0x05, 0x06, 0x07,
+					       0x14, 0x15, 0x16, 0x17});
+  vector unsigned char shufflelo = ((vector unsigned char) {
+					       0x08, 0x09, 0x0A, 0x0B,
+					       0x18, 0x19, 0x1A, 0x1B,
+					       0x0C, 0x0D, 0x0E, 0x0F,
+					       0x1C, 0x1D, 0x1E, 0x1F});
+  abcd = *(mIn+0);
+  efgh = *(mIn+1);
+  ijkl = *(mIn+2);
+  mnop = *(mIn+3);
+
+  aibj = spu_shuffle(abcd, ijkl, shufflehi);
+  ckdl = spu_shuffle(abcd, ijkl, shufflelo);
+  emfn = spu_shuffle(efgh, mnop, shufflehi);
+  gohp = spu_shuffle(efgh, mnop, shufflelo);
+
+  aeim = spu_shuffle(aibj, emfn, shufflehi);
+  bfjn = spu_shuffle(aibj, emfn, shufflelo);
+  cgko = spu_shuffle(ckdl, gohp, shufflehi);
+  dhlp = spu_shuffle(ckdl, gohp, shufflelo);
+
+  *(mOut+0) = aeim;
+  *(mOut+1) = bfjn;
+  *(mOut+2) = cgko;
+  *(mOut+3) = dhlp;
+}
+
+
+/**
+ * Bilinear filtering, using int instead of float arithmetic
+ */
+void
+sample_texture4_bilinear_2(vector float s, vector float t,
+                         vector float r, vector float q,
+                         uint unit, vector float colors[4])
+{
+   static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
+   /* Scale texcoords by size of texture, and add half pixel bias */
+   vector float ss = spu_madd(s, spu.texture[unit].width4, half);
+   vector float tt = spu_madd(t, spu.texture[unit].height4, half);
+
+   /* convert float coords to fixed-pt coords with 8 fraction bits */
+   vector unsigned int is = spu_convtu(ss, 8);
+   vector unsigned int it = spu_convtu(tt, 8);
+
+   /* compute integer texel weights in [0, 255] */
+   vector signed int sWeights0 = spu_and((vector signed int) is, 255);
+   vector signed int tWeights0 = spu_and((vector signed int) it, 255);
+   vector signed int sWeights1 = spu_sub(255, sWeights0);
+   vector signed int tWeights1 = spu_sub(255, tWeights0);
+
+   /* texel coords: is0 = is / 256, it0 = it / 256 */
+   vector unsigned int is0 = spu_rlmask(is, -8);
+   vector unsigned int it0 = spu_rlmask(it, -8);
+
+   /* texel coords: is1 = is0 + 1, it1 = it0 + 1 */
+   vector unsigned int is1 = spu_add(is0, 1);
+   vector unsigned int it1 = spu_add(it0, 1);
+
+   /* PIPE_TEX_WRAP_REPEAT */
+   is0 = spu_and(is0, spu.texture[unit].tex_size_x_mask);
+   it0 = spu_and(it0, spu.texture[unit].tex_size_y_mask);
+   is1 = spu_and(is1, spu.texture[unit].tex_size_x_mask);
+   it1 = spu_and(it1, spu.texture[unit].tex_size_y_mask);
+
+   /* get packed int texels */
+   vector unsigned int texels[16];
+   get_four_texels(unit, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, is1, it1, texels + 12); /* lower-right */
+
+   /* twiddle packed 32-bit ARGB pixels into RGBA as four unsigned ints */
+   {
+      static const unsigned char ZERO = 0x80;
+      int i;
+      for (i = 0; i < 16; i++) {
+         texels[i] = spu_shuffle(texels[i], texels[i],
+                                 ((vector unsigned char) {
+                                    ZERO, ZERO, ZERO, 1,
+                                    ZERO, ZERO, ZERO, 2,
+                                    ZERO, ZERO, ZERO, 3,
+                                    ZERO, ZERO, ZERO, 0}));
+      }
+   }
+
+   /* convert RGBA,RGBA,RGBA,RGBA to RRRR,GGGG,BBBB,AAAA */
+   transpose(texels + 0, texels + 0);
+   transpose(texels + 4, texels + 4);
+   transpose(texels + 8, texels + 8);
+   transpose(texels + 12, texels + 12);
+
+   /* computed weighted colors */
+   vector unsigned int c0, c1, c2, c3, cSum;
+
+   /* red */
+   c0 = (vector unsigned int) si_mpyu((qword) texels[ 0], si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpyu((qword) texels[ 4], si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpyu((qword) texels[ 8], si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpyu((qword) texels[12], si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+   colors[0] = spu_convtf(cSum, 24);
+
+   /* green */
+   c0 = (vector unsigned int) si_mpyu((qword) texels[ 1], si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpyu((qword) texels[ 5], si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpyu((qword) texels[ 9], si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpyu((qword) texels[13], si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+   colors[1] = spu_convtf(cSum, 24);
+
+   /* blue */
+   c0 = (vector unsigned int) si_mpyu((qword) texels[ 2], si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpyu((qword) texels[ 6], si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpyu((qword) texels[10], si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpyu((qword) texels[14], si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+   colors[2] = spu_convtf(cSum, 24);
+
+   /* alpha */
+   c0 = (vector unsigned int) si_mpyu((qword) texels[ 3], si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpyu((qword) texels[ 7], si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpyu((qword) texels[11], si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpyu((qword) texels[15], si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+   colors[3] = spu_convtf(cSum, 24);
+}
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index d576aed719..38a17deda2 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -47,5 +47,10 @@ sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
                          uint unit, vector float colors[4]);
 
+extern void
+sample_texture4_bilinear_2(vector float s, vector float t,
+                         vector float r, vector float q,
+                         uint unit, vector float colors[4]);
+
 
 #endif /* SPU_TEXTURE_H */
-- 
cgit v1.2.3


From 420e8cdf25501dd82e1c178e6300d7b416798e25 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 13 Oct 2008 14:10:36 -0600
Subject: cell: remove more old texture code

---
 src/gallium/drivers/cell/spu/spu_texture.c | 26 --------------------------
 1 file changed, 26 deletions(-)

diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index c10268131d..3f2280436c 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -50,32 +50,6 @@ invalidate_tex_cache(void)
 }
 
 
-/**
- * XXX look into getting texels for all four pixels in a quad at once.
- */
-static uint
-get_texel(uint unit, vec_uint4 coordinate)
-{
-   /*
-    * XXX we could do the "/ TILE_SIZE" and "% TILE_SIZE" operations as
-    * SIMD since X and Y are already in a SIMD register.
-    */
-   const unsigned texture_ea = (uintptr_t) spu.texture[unit].start;
-   ushort x = spu_extract(coordinate, 0);
-   ushort y = spu_extract(coordinate, 1);
-   unsigned tile_offset = sizeof(tile_t)
-      * ((y / TILE_SIZE * spu.texture[unit].tiles_per_row) + (x / TILE_SIZE));
-   ushort texel_offset = (ushort) 4
-      * (ushort) (((ushort) (y % TILE_SIZE) * (ushort) TILE_SIZE) + (x % TILE_SIZE));
-   vec_uint4 tmp;
-
-   spu_dcache_fetch_unaligned((qword *) & tmp,
-                              texture_ea + tile_offset + texel_offset,
-                              4);
-   return spu_extract(tmp, 0);
-}
-
-
 /**
  * Get four texels from locations (x[0], y[0]), (x[1], y[1]) ...
  *
-- 
cgit v1.2.3


From c05cabd646f1c7384b5187e3427064096aef4673 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 13 Oct 2008 14:31:11 -0600
Subject: cell: use fewer memory references in sample_texture4_bilinear_2()

---
 src/gallium/drivers/cell/spu/spu_texture.c | 56 +++++++++++++++++-------------
 1 file changed, 31 insertions(+), 25 deletions(-)

diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 3f2280436c..96ef88822a 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -220,7 +220,11 @@ sample_texture4_bilinear(vector float s, vector float t,
  * Adapted from /opt/cell/sdk/usr/spu/include/transpose_matrix4x4.h
  */
 static INLINE void
-transpose(vector unsigned int *mOut, vector unsigned int *mIn)
+transpose(vector unsigned int *mOut0,
+          vector unsigned int *mOut1,
+          vector unsigned int *mOut2,
+          vector unsigned int *mOut3,
+          vector unsigned int *mIn)
 {
   vector unsigned int abcd, efgh, ijkl, mnop;	/* input vectors */
   vector unsigned int aeim, bfjn, cgko, dhlp;	/* output vectors */
@@ -251,10 +255,10 @@ transpose(vector unsigned int *mOut, vector unsigned int *mIn)
   cgko = spu_shuffle(ckdl, gohp, shufflehi);
   dhlp = spu_shuffle(ckdl, gohp, shufflelo);
 
-  *(mOut+0) = aeim;
-  *(mOut+1) = bfjn;
-  *(mOut+2) = cgko;
-  *(mOut+3) = dhlp;
+  *mOut0 = aeim;
+  *mOut1 = bfjn;
+  *mOut2 = cgko;
+  *mOut3 = dhlp;
 }
 
 
@@ -317,43 +321,45 @@ sample_texture4_bilinear_2(vector float s, vector float t,
    }
 
    /* convert RGBA,RGBA,RGBA,RGBA to RRRR,GGGG,BBBB,AAAA */
-   transpose(texels + 0, texels + 0);
-   transpose(texels + 4, texels + 4);
-   transpose(texels + 8, texels + 8);
-   transpose(texels + 12, texels + 12);
+   vector unsigned int texel0, texel1, texel2, texel3, texel4, texel5, texel6, texel7,
+      texel8, texel9, texel10, texel11, texel12, texel13, texel14, texel15;
+   transpose(&texel0, &texel1, &texel2, &texel3, texels + 0);
+   transpose(&texel4, &texel5, &texel6, &texel7, texels + 4);
+   transpose(&texel8, &texel9, &texel10, &texel11, texels + 8);
+   transpose(&texel12, &texel13, &texel14, &texel15, texels + 12);
 
    /* computed weighted colors */
    vector unsigned int c0, c1, c2, c3, cSum;
 
    /* red */
-   c0 = (vector unsigned int) si_mpyu((qword) texels[ 0], si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
-   c1 = (vector unsigned int) si_mpyu((qword) texels[ 4], si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
-   c2 = (vector unsigned int) si_mpyu((qword) texels[ 8], si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
-   c3 = (vector unsigned int) si_mpyu((qword) texels[12], si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   c0 = (vector unsigned int) si_mpyu((qword) texel0, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpyu((qword) texel4, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpyu((qword) texel8, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpyu((qword) texel12, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
    colors[0] = spu_convtf(cSum, 24);
 
    /* green */
-   c0 = (vector unsigned int) si_mpyu((qword) texels[ 1], si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
-   c1 = (vector unsigned int) si_mpyu((qword) texels[ 5], si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
-   c2 = (vector unsigned int) si_mpyu((qword) texels[ 9], si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
-   c3 = (vector unsigned int) si_mpyu((qword) texels[13], si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   c0 = (vector unsigned int) si_mpyu((qword) texel1, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpyu((qword) texel5, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpyu((qword) texel9, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpyu((qword) texel13, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
    colors[1] = spu_convtf(cSum, 24);
 
    /* blue */
-   c0 = (vector unsigned int) si_mpyu((qword) texels[ 2], si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
-   c1 = (vector unsigned int) si_mpyu((qword) texels[ 6], si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
-   c2 = (vector unsigned int) si_mpyu((qword) texels[10], si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
-   c3 = (vector unsigned int) si_mpyu((qword) texels[14], si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   c0 = (vector unsigned int) si_mpyu((qword) texel2, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpyu((qword) texel6, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpyu((qword) texel10, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpyu((qword) texel14, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
    colors[2] = spu_convtf(cSum, 24);
 
    /* alpha */
-   c0 = (vector unsigned int) si_mpyu((qword) texels[ 3], si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
-   c1 = (vector unsigned int) si_mpyu((qword) texels[ 7], si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
-   c2 = (vector unsigned int) si_mpyu((qword) texels[11], si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
-   c3 = (vector unsigned int) si_mpyu((qword) texels[15], si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   c0 = (vector unsigned int) si_mpyu((qword) texel3, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpyu((qword) texel7, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpyu((qword) texel11, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpyu((qword) texel15, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
    colors[3] = spu_convtf(cSum, 24);
 }
-- 
cgit v1.2.3


From b0c136cfb1fcbcea35e17dc699a96acbb24738f5 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 13 Oct 2008 15:17:01 -0600
Subject: cell: remove old texture-related fields

---
 src/gallium/drivers/cell/spu/spu_command.c | 3 ---
 src/gallium/drivers/cell/spu/spu_main.h    | 2 --
 2 files changed, 5 deletions(-)

diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index d4cc9a2146..64890f6dbd 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -330,9 +330,6 @@ cmd_state_texture(const struct cell_command_texture *texture)
 
    spu.texture[unit].tiles_per_row = width / TILE_SIZE;
 
-   spu.texture[unit].tex_size = (vector float) { width, height, 0.0, 0.0};
-   spu.texture[unit].tex_size_mask = (vector unsigned int)
-         { width - 1, height - 1, 0, 0 };
    spu.texture[unit].tex_size_x_mask = spu_splats(width - 1);
    spu.texture[unit].tex_size_y_mask = spu_splats(height - 1);
 }
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 2a8cb00f8d..e3960dbe8b 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -112,10 +112,8 @@ struct spu_texture
    void *start;
    ushort width, height;
    ushort tiles_per_row;
-   vector float tex_size; /**< == {width, height, 0, 0} */
    vector float width4;   /**< == {width, width, width, width} */
    vector float height4;  /**< == {height, height, height, height} */
-   vector unsigned int tex_size_mask; /**< == {width-1, height-1, 0, 0 } */
    vector unsigned int tex_size_x_mask; /**< splat(width-1) */
    vector unsigned int tex_size_y_mask; /**< splat(height-1) */
 } ALIGN16_ATTRIB;
-- 
cgit v1.2.3


From 978799beb2a9c51550abb1f37bb6f63d06bc4717 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 13 Oct 2008 16:43:11 -0600
Subject: cell: initial work for mipmap texture filtering

---
 src/gallium/drivers/cell/common.h              |   6 +-
 src/gallium/drivers/cell/ppu/cell_screen.c     |   4 +-
 src/gallium/drivers/cell/ppu/cell_state_emit.c |  18 ++--
 src/gallium/drivers/cell/ppu/cell_texture.c    |  48 ++++++----
 src/gallium/drivers/cell/ppu/cell_texture.h    |   6 +-
 src/gallium/drivers/cell/spu/spu_command.c     |  37 +++++---
 src/gallium/drivers/cell/spu/spu_funcs.c       |   1 +
 src/gallium/drivers/cell/spu/spu_main.h        |   7 +-
 src/gallium/drivers/cell/spu/spu_texture.c     | 120 ++++++++++++++++++-------
 src/gallium/drivers/cell/spu/spu_texture.h     |   6 ++
 10 files changed, 176 insertions(+), 77 deletions(-)

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 5dc756023f..e4de9a551d 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -67,6 +67,7 @@
 #define CELL_MAX_SPUS 6
 
 #define CELL_MAX_SAMPLERS 4
+#define CELL_MAX_TEXTURE_LEVELS 12  /* 2k x 2k */
 
 #define TILE_SIZE 32
 
@@ -251,8 +252,9 @@ struct cell_command_texture
 {
    uint64_t opcode;     /**< CELL_CMD_STATE_TEXTURE */
    uint unit;
-   void *start;         /**< Address in main memory */
-   ushort width, height;
+   void *start[CELL_MAX_TEXTURE_LEVELS];   /**< Address in main memory */
+   ushort width[CELL_MAX_TEXTURE_LEVELS];
+   ushort height[CELL_MAX_TEXTURE_LEVELS];
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_screen.c b/src/gallium/drivers/cell/ppu/cell_screen.c
index 47ba6fa290..d223557950 100644
--- a/src/gallium/drivers/cell/ppu/cell_screen.c
+++ b/src/gallium/drivers/cell/ppu/cell_screen.c
@@ -76,11 +76,11 @@ cell_get_param(struct pipe_screen *screen, int param)
    case PIPE_CAP_TEXTURE_SHADOW_MAP:
       return 10;
    case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
-      return 12; /* max 2Kx2K */
+      return CELL_MAX_TEXTURE_LEVELS;
    case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
       return 8;  /* max 128x128x128 */
    case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
-      return 12; /* max 2Kx2K */
+      return CELL_MAX_TEXTURE_LEVELS;
    default:
       return 10;
    }
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index cbfa393cfb..7090b4c99f 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -211,14 +211,20 @@ cell_emit_state(struct cell_context *cell)
          texture->opcode = CELL_CMD_STATE_TEXTURE;
          texture->unit = i;
          if (cell->texture[i]) {
-            texture->start = cell->texture[i]->tiled_data;
-            texture->width = cell->texture[i]->base.width[0];
-            texture->height = cell->texture[i]->base.height[0];
+            uint level;
+            for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
+               texture->start[level] = cell->texture[i]->tiled_data[level];
+               texture->width[level] = cell->texture[i]->base.width[level];
+               texture->height[level] = cell->texture[i]->base.height[level];
+            }
          }
          else {
-            texture->start = NULL;
-            texture->width = 1;
-            texture->height = 1;
+            uint level;
+            for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
+               texture->start[level] = NULL;
+               texture->width[level] = 1;
+               texture->height[level] = 1;
+            }
          }
       }
    }
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index b6590dfb86..f5f81ac3cc 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -66,6 +66,8 @@ cell_texture_layout(struct cell_texture * spt)
       unsigned size;
       unsigned w_tile, h_tile;
 
+      assert(level < CELL_MAX_TEXTURE_LEVELS);
+
       /* width, height, rounded up to tile size */
       w_tile = align(width, TILE_SIZE);
       h_tile = align(height, TILE_SIZE);
@@ -249,33 +251,41 @@ cell_tile_texture(struct cell_context *cell,
                   struct cell_texture *texture)
 {
    struct pipe_screen *screen = cell->pipe.screen;
-   uint face = 0, level = 0, zslice = 0;
-   struct pipe_surface *surf;
-   const uint w = texture->base.width[0], h = texture->base.height[0];
+   uint face = 0, level, zslice = 0;
    const uint *src;
 
-   /* temporary restrictions: */
-   assert(w >= TILE_SIZE);
-   assert(h >= TILE_SIZE);
-   assert(w % TILE_SIZE == 0);
-   assert(h % TILE_SIZE == 0);
+   for (level = 0; level <= texture->base.last_level; level++) {
+      if (!texture->tiled_data[level]) {
+         struct pipe_surface *surf;
 
-   surf = screen->get_tex_surface(screen, &texture->base, face, level, zslice,
-                                  PIPE_BUFFER_USAGE_CPU_WRITE);
-   ASSERT(surf);
+         const uint w = texture->base.width[level], h = texture->base.height[level];
 
-   src = (const uint *) pipe_surface_map(surf, PIPE_BUFFER_USAGE_CPU_WRITE);
+         if (w < 32 || h < 32)
+            continue;
+         /* temporary restrictions: */
+         assert(w >= TILE_SIZE);
+         assert(h >= TILE_SIZE);
+         assert(w % TILE_SIZE == 0);
+         assert(h % TILE_SIZE == 0);
 
-   if (texture->tiled_data) {
-      align_free(texture->tiled_data);
-   }
-   texture->tiled_data = align_malloc(w * h * 4, 16);
+         surf = screen->get_tex_surface(screen, &texture->base, face, level, zslice,
+                                        PIPE_BUFFER_USAGE_CPU_WRITE);
+         ASSERT(surf);
+         
+         src = (const uint *) pipe_surface_map(surf, PIPE_BUFFER_USAGE_CPU_WRITE);
 
-   tile_copy_data(w, h, TILE_SIZE, texture->tiled_data, src);
+         if (texture->tiled_data[level]) {
+            align_free(texture->tiled_data[level]);
+         }
+         texture->tiled_data[level] = align_malloc(w * h * 4, 16);
 
-   pipe_surface_unmap(surf);
+         tile_copy_data(w, h, TILE_SIZE, texture->tiled_data[level], src);
 
-   pipe_surface_reference(&surf, NULL);
+         pipe_surface_unmap(surf);
+
+         pipe_surface_reference(&surf, NULL);
+      }
+   }
 }
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.h b/src/gallium/drivers/cell/ppu/cell_texture.h
index 6d37e95ebc..6d35736984 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.h
+++ b/src/gallium/drivers/cell/ppu/cell_texture.h
@@ -40,15 +40,15 @@ struct cell_texture
 {
    struct pipe_texture base;
 
-   unsigned long level_offset[PIPE_MAX_TEXTURE_LEVELS];
-   unsigned long stride[PIPE_MAX_TEXTURE_LEVELS];
+   unsigned long level_offset[CELL_MAX_TEXTURE_LEVELS];
+   unsigned long stride[CELL_MAX_TEXTURE_LEVELS];
 
    /* The data is held here:
     */
    struct pipe_buffer *buffer;
    unsigned long buffer_size;
 
-   void *tiled_data;  /* XXX this may be temporary */ /*ALIGN16*/
+   void *tiled_data[CELL_MAX_TEXTURE_LEVELS];  /* XXX this may be temporary */ /*ALIGN16*/
 };
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 64890f6dbd..089af22415 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -301,6 +301,12 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
    DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit);
 
    spu.sampler[sampler->unit] = sampler->state;
+#if 0
+   if (spu.sampler[sampler->unit].min_mip_filter != PIPE_TEX_MIPFILTER_NONE) {
+      spu.sample_texture4[sampler->unit] = sample_texture4_lod;
+   }
+   else
+#endif
    if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) {
       spu.sample_texture4[sampler->unit] = sample_texture4_bilinear;
    }
@@ -314,24 +320,29 @@ static void
 cmd_state_texture(const struct cell_command_texture *texture)
 {
    const uint unit = texture->unit;
-   const uint width = texture->width;
-   const uint height = texture->height;
+   uint i;
 
-   DEBUG_PRINTF("TEXTURE [%u] at %p  size %u x %u\n",
-             texture->unit, texture->start,
-             texture->width, texture->height);
+   DEBUG_PRINTF("TEXTURE [%u]\n", texture->unit);
 
-   spu.texture[unit].start = texture->start;
-   spu.texture[unit].width = width;
-   spu.texture[unit].height = height;
+   for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
+      uint width = texture->width[i];
+      uint height = texture->height[i];
 
-   spu.texture[unit].width4 = spu_splats((float) width);
-   spu.texture[unit].height4 = spu_splats((float) height);
+      DEBUG_PRINTF("  LEVEL %u: at %p  size[0] %u x %u\n", i,
+             texture->start[i], texture->width[i], texture->height[i]);
 
-   spu.texture[unit].tiles_per_row = width / TILE_SIZE;
+      spu.texture[unit].level[i].start = texture->start[i];
+      spu.texture[unit].level[i].width = width;
+      spu.texture[unit].level[i].height = height;
 
-   spu.texture[unit].tex_size_x_mask = spu_splats(width - 1);
-   spu.texture[unit].tex_size_y_mask = spu_splats(height - 1);
+      spu.texture[unit].level[i].tiles_per_row = width / TILE_SIZE;
+
+      spu.texture[unit].level[i].width4 = spu_splats((float) width);
+      spu.texture[unit].level[i].height4 = spu_splats((float) height);
+
+      spu.texture[unit].level[i].tex_size_x_mask = spu_splats(width - 1);
+      spu.texture[unit].level[i].tex_size_y_mask = spu_splats(height - 1);
+   }
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index 4c90b701ee..f2946010bd 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -100,6 +100,7 @@ spu_log2(vector float x)
    return spu_mul(v, k);
 }
 
+
 static struct vec_4x4
 spu_txp(vector float s, vector float t, vector float r, vector float q,
         unsigned unit)
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index e3960dbe8b..9515543efe 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -107,7 +107,7 @@ struct spu_framebuffer
 } ALIGN16_ATTRIB;
 
 
-struct spu_texture
+struct spu_texture_level
 {
    void *start;
    ushort width, height;
@@ -118,6 +118,11 @@ struct spu_texture
    vector unsigned int tex_size_y_mask; /**< splat(height-1) */
 } ALIGN16_ATTRIB;
 
+struct spu_texture
+{
+   struct spu_texture_level level[CELL_MAX_TEXTURE_LEVELS];
+} ALIGN16_ATTRIB;
+
 
 /**
  * All SPU global/context state will be in a singleton object of this type:
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 96ef88822a..96c09e3ccb 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -27,6 +27,7 @@
 
 
 #include <transpose_matrix4x4.h>
+#include <math.h>
 
 #include "pipe/p_compiler.h"
 #include "spu_main.h"
@@ -42,11 +43,12 @@
 void
 invalidate_tex_cache(void)
 {
+   uint lvl = 0;
    uint unit = 0;
-   uint bytes = 4 * spu.texture[unit].width
-      * spu.texture[unit].height;
+   uint bytes = 4 * spu.texture[unit].level[lvl].width
+      * spu.texture[unit].level[lvl].height;
 
-   spu_dcache_mark_dirty((unsigned) spu.texture[unit].start, bytes);
+   spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes);
 }
 
 
@@ -64,15 +66,17 @@ invalidate_tex_cache(void)
  * a time.
  */
 static void
-get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
+get_four_texels(uint unit, uint level, vec_uint4 x, vec_uint4 y,
+                vec_uint4 *texels)
 {
-   const unsigned texture_ea = (uintptr_t) spu.texture[unit].start;
+   const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
+   const unsigned texture_ea = (uintptr_t) tlevel->start;
    vec_uint4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
    vec_uint4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
    const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */
    const qword offset_y = si_andi((qword) y, 0x1f); /* offset_y = y & 0x1f */
 
-   const qword tiles_per_row = (qword) spu_splats(spu.texture[unit].tiles_per_row);
+   const qword tiles_per_row = (qword) spu_splats(tlevel->tiles_per_row);
    const qword tile_size = (qword) spu_splats((unsigned) sizeof(tile_t));
 
    qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x);
@@ -104,17 +108,18 @@ sample_texture4_nearest(vector float s, vector float t,
                         vector float r, vector float q,
                         uint unit, vector float colors[4])
 {
-   vector float ss = spu_mul(s, spu.texture[unit].width4);
-   vector float tt = spu_mul(t, spu.texture[unit].height4);
+   const uint lvl = 0;
+   vector float ss = spu_mul(s, spu.texture[unit].level[lvl].width4);
+   vector float tt = spu_mul(t, spu.texture[unit].level[lvl].height4);
    vector unsigned int is = spu_convtu(ss, 0);
    vector unsigned int it = spu_convtu(tt, 0);
    vec_uint4 texels[4];
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is = spu_and(is, spu.texture[unit].tex_size_x_mask);
-   it = spu_and(it, spu.texture[unit].tex_size_y_mask);
+   is = spu_and(is, spu.texture[unit].level[lvl].tex_size_x_mask);
+   it = spu_and(it, spu.texture[unit].level[lvl].tex_size_y_mask);
 
-   get_four_texels(unit, is, it, texels);
+   get_four_texels(unit, lvl, is, it, texels);
 
    /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */
    spu_unpack_A8R8G8B8_transpose4(texels, colors);
@@ -130,8 +135,9 @@ sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
                          uint unit, vector float colors[4])
 {
-   vector float ss = spu_madd(s, spu.texture[unit].width4,  spu_splats(-0.5f));
-   vector float tt = spu_madd(t, spu.texture[unit].height4, spu_splats(-0.5f));
+   const uint lvl = 0;
+   vector float ss = spu_madd(s, spu.texture[unit].level[lvl].width4,  spu_splats(-0.5f));
+   vector float tt = spu_madd(t, spu.texture[unit].level[lvl].height4, spu_splats(-0.5f));
 
    vector unsigned int is0 = spu_convtu(ss, 0);
    vector unsigned int it0 = spu_convtu(tt, 0);
@@ -141,17 +147,17 @@ sample_texture4_bilinear(vector float s, vector float t,
    vector unsigned int it1 = spu_add(it0, 1);
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is0 = spu_and(is0, spu.texture[unit].tex_size_x_mask);
-   it0 = spu_and(it0, spu.texture[unit].tex_size_y_mask);
-   is1 = spu_and(is1, spu.texture[unit].tex_size_x_mask);
-   it1 = spu_and(it1, spu.texture[unit].tex_size_y_mask);
+   is0 = spu_and(is0, spu.texture[unit].level[lvl].tex_size_x_mask);
+   it0 = spu_and(it0, spu.texture[unit].level[lvl].tex_size_y_mask);
+   is1 = spu_and(is1, spu.texture[unit].level[lvl].tex_size_x_mask);
+   it1 = spu_and(it1, spu.texture[unit].level[lvl].tex_size_y_mask);
 
    /* get packed int texels */
    vector unsigned int texels[16];
-   get_four_texels(unit, is0, it0, texels + 0);  /* upper-left */
-   get_four_texels(unit, is1, it0, texels + 4);  /* upper-right */
-   get_four_texels(unit, is0, it1, texels + 8);  /* lower-left */
-   get_four_texels(unit, is1, it1, texels + 12); /* lower-right */
+   get_four_texels(unit, lvl, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, lvl, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, lvl, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, lvl, is1, it1, texels + 12); /* lower-right */
 
    /* XXX possibly rework following code to compute the weighted sample
     * colors with integer arithmetic for fewer int->float conversions.
@@ -270,10 +276,11 @@ sample_texture4_bilinear_2(vector float s, vector float t,
                          vector float r, vector float q,
                          uint unit, vector float colors[4])
 {
+   const uint lvl = 0;
    static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
    /* Scale texcoords by size of texture, and add half pixel bias */
-   vector float ss = spu_madd(s, spu.texture[unit].width4, half);
-   vector float tt = spu_madd(t, spu.texture[unit].height4, half);
+   vector float ss = spu_madd(s, spu.texture[unit].level[lvl].width4, half);
+   vector float tt = spu_madd(t, spu.texture[unit].level[lvl].height4, half);
 
    /* convert float coords to fixed-pt coords with 8 fraction bits */
    vector unsigned int is = spu_convtu(ss, 8);
@@ -294,17 +301,17 @@ sample_texture4_bilinear_2(vector float s, vector float t,
    vector unsigned int it1 = spu_add(it0, 1);
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is0 = spu_and(is0, spu.texture[unit].tex_size_x_mask);
-   it0 = spu_and(it0, spu.texture[unit].tex_size_y_mask);
-   is1 = spu_and(is1, spu.texture[unit].tex_size_x_mask);
-   it1 = spu_and(it1, spu.texture[unit].tex_size_y_mask);
+   is0 = spu_and(is0, spu.texture[unit].level[lvl].tex_size_x_mask);
+   it0 = spu_and(it0, spu.texture[unit].level[lvl].tex_size_y_mask);
+   is1 = spu_and(is1, spu.texture[unit].level[lvl].tex_size_x_mask);
+   it1 = spu_and(it1, spu.texture[unit].level[lvl].tex_size_y_mask);
 
    /* get packed int texels */
    vector unsigned int texels[16];
-   get_four_texels(unit, is0, it0, texels + 0);  /* upper-left */
-   get_four_texels(unit, is1, it0, texels + 4);  /* upper-right */
-   get_four_texels(unit, is0, it1, texels + 8);  /* lower-left */
-   get_four_texels(unit, is1, it1, texels + 12); /* lower-right */
+   get_four_texels(unit, lvl, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, lvl, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, lvl, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, lvl, is1, it1, texels + 12); /* lower-right */
 
    /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */
    {
@@ -363,3 +370,54 @@ sample_texture4_bilinear_2(vector float s, vector float t,
    cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
    colors[3] = spu_convtf(cSum, 24);
 }
+
+
+
+/**
+ * Compute level of detail factor (log2 of the texel footprint) from the
+ * texcoords: lane 1 is treated as the +x and lane 2 as the +y neighbour of 0.
+ */
+static float
+compute_lambda(uint unit, vector float s, vector float t)
+{
+   const uint lvl = 0;  /* derivatives are scaled by the level-0 size */
+   const float width = spu.texture[unit].level[lvl].width;
+   const float height = spu.texture[unit].level[lvl].height;
+   const float dsdx = width * (spu_extract(s, 1) - spu_extract(s, 0));
+   const float dsdy = width * (spu_extract(s, 2) - spu_extract(s, 0));
+   const float dtdx = height * (spu_extract(t, 1) - spu_extract(t, 0));
+   const float dtdy = height * (spu_extract(t, 2) - spu_extract(t, 0));
+   const float x = dsdx * dsdx + dtdx * dtdx;  /* squared step length in x */
+   const float y = dsdy * dsdy + dtdy * dtdy;  /* squared step length in y */
+   const float rho = sqrtf(x > y ? x : y);
+   /* 1.442695 ~= 1/ln(2), so this returns log2(rho) */
+   return logf(rho) * 1.442695f;
+}
+
+
+
+/**
+ * Texture sampling with level of detail selection (unfinished, see below).
+ */
+void
+sample_texture4_lod(vector float s, vector float t,
+                    vector float r, vector float q,
+                    uint unit, vector float colors[4])
+{
+   float lambda = compute_lambda(unit, s, t);
+
+   if (lambda < spu.sampler[unit].min_lod)
+      lambda = spu.sampler[unit].min_lod;
+   else if (lambda > spu.sampler[unit].max_lod)
+      lambda = spu.sampler[unit].max_lod;
+
+   /* hack for now: truncate the clamped lambda and cap the level at 3 */
+   int level = (int) lambda;
+   if (level > 3)
+      level = 3;
+
+   /* XXX no sampling happens yet, so colors[] is left unwritten.  Intended:
+   sample_texture4_bilinear_2(s, t, r, q, unit, level, colors);
+   */
+}
+
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index 38a17deda2..4802f7c47c 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -53,4 +53,10 @@ sample_texture4_bilinear_2(vector float s, vector float t,
                          uint unit, vector float colors[4]);
 
 
+extern void
+sample_texture4_lod(vector float s, vector float t,
+                    vector float r, vector float q,
+                    uint unit, vector float colors[4]);
+
+
 #endif /* SPU_TEXTURE_H */
-- 
cgit v1.2.3


From dee18a147d3adaf2578d27837c8f18c92d796c9d Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 13 Oct 2008 19:41:26 -0600
Subject: cell: finish-up perspective-corrected interpolation

---
 src/gallium/drivers/cell/spu/spu_tri.c | 127 +++++++++++++++++++++------------
 1 file changed, 82 insertions(+), 45 deletions(-)

diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 022d21ba8f..3f1fb4f7c9 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -213,7 +213,7 @@ clip_emit_quad(struct setup_stage *setup)
  * Eg: four colors will be computed (in AoS format).
  */
 static INLINE void
-eval_coeff(uint slot, float x, float y, vector float result[4])
+eval_coeff(uint slot, float x, float y, vector float w, vector float result[4])
 {
    switch (spu.vertex_info.attrib[slot].interp_mode) {
    case INTERP_CONSTANT:
@@ -222,23 +222,43 @@ eval_coeff(uint slot, float x, float y, vector float result[4])
       result[QUAD_BOTTOM_LEFT] =
       result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0.v;
       break;
-
    case INTERP_LINEAR:
-      /* fall-through, for now */
-   default:
       {
-         register vector float dadx = setup.coef[slot].dadx.v;
-         register vector float dady = setup.coef[slot].dady.v;
-         register vector float topLeft
-            = spu_add(setup.coef[slot].a0.v,
-                      spu_add(spu_mul(spu_splats(x), dadx),
-                              spu_mul(spu_splats(y), dady)));
+         vector float dadx = setup.coef[slot].dadx.v;
+         vector float dady = setup.coef[slot].dady.v;
+         vector float topLeft =
+            spu_add(setup.coef[slot].a0.v,
+                    spu_add(spu_mul(spu_splats(x), dadx),
+                            spu_mul(spu_splats(y), dady)));
 
          result[QUAD_TOP_LEFT] = topLeft;
          result[QUAD_TOP_RIGHT] = spu_add(topLeft, dadx);
          result[QUAD_BOTTOM_LEFT] = spu_add(topLeft, dady);
          result[QUAD_BOTTOM_RIGHT] = spu_add(spu_add(topLeft, dadx), dady);
       }
+      break;
+   case INTERP_PERSPECTIVE:
+      {
+         vector float dadx = setup.coef[slot].dadx.v;
+         vector float dady = setup.coef[slot].dady.v;
+         vector float topLeft =
+            spu_add(setup.coef[slot].a0.v,
+                    spu_add(spu_mul(spu_splats(x), dadx),
+                            spu_mul(spu_splats(y), dady)));
+
+         vector float wInv = spu_re(w);  /* 1.0 / w */
+
+         result[QUAD_TOP_LEFT] = spu_mul(topLeft, wInv);
+         result[QUAD_TOP_RIGHT] = spu_mul(spu_add(topLeft, dadx), wInv);
+         result[QUAD_BOTTOM_LEFT] = spu_mul(spu_add(topLeft, dady), wInv);
+         result[QUAD_BOTTOM_RIGHT] = spu_mul(spu_add(spu_add(topLeft, dadx), dady), wInv);
+      }
+      break;
+   case INTERP_POS:
+   case INTERP_NONE:
+      break;
+   default:
+      ASSERT(0);
    }
 }
 
@@ -248,14 +268,14 @@ eval_coeff(uint slot, float x, float y, vector float result[4])
  * XXX this will all be re-written someday.
  */
 static INLINE void
-eval_coeff_soa(uint slot, float x, float y, vector float result[4])
+eval_coeff_soa(uint slot, float x, float y, vector float w, vector float result[4])
 {
-   eval_coeff(slot, x, y, result);
+   eval_coeff(slot, x, y, w, result);
    _transpose_matrix4x4(result, result);
 }
 
 
-
+/** Evaluate coefficients to get Z for four pixels in a quad */
 static INLINE vector float
 eval_z(float x, float y)
 {
@@ -269,6 +289,20 @@ eval_z(float x, float y)
 }
 
 
+/** Evaluate coefficients to get W for four pixels in a quad */
+static INLINE vector float
+eval_w(float x, float y)
+{
+   /* W is read from component 3 of the coefficients in slot 0 */
+   const float w_dx = setup.coef[0].dadx.f[3];
+   const float w_dy = setup.coef[0].dady.f[3];
+   const float w_base = setup.coef[0].a0.f[3] + x * w_dx + y * w_dy;
+   /* per-lane offsets from the base value: 0, +dx, +dy, +dx+dy */
+   const vector float steps = (vector float) { 0.0, w_dx, w_dy, w_dx + w_dy };
+   return spu_add(spu_splats(w_base), steps);
+}
+
+
 /**
  * Emit a quad (pass to next stage).  No clipping is done.
  * Note: about 1/5 to 1/7 of the time, mask is zero and this function
@@ -292,14 +326,15 @@ emit_quad( int x, int y, mask_t mask)
           */
          vector float inputs[4*4], outputs[2*4];
          vector float fragZ = eval_z((float) x, (float) y);
+         vector float fragW = eval_w((float) x, (float) y);
 
          /* setup inputs */
 #if 0
-         eval_coeff_soa(1, (float) x, (float) y, inputs);
+         eval_coeff_soa(1, (float) x, (float) y, fragW, inputs);
 #else
          uint i;
          for (i = 0; i < spu.vertex_info.num_attribs; i++) {
-            eval_coeff_soa(i+1, (float) x, (float) y, inputs + i * 4);
+            eval_coeff_soa(i+1, (float) x, (float) y, fragW, inputs + i * 4);
          }
 #endif
          ASSERT(spu.fragment_program);
@@ -658,7 +693,6 @@ tri_linear_coeff4(uint slot)
 
 
-#if 0
 /**
  * Compute a0, dadx and dady for a perspective-corrected interpolant,
  * for a triangle.
@@ -667,38 +701,41 @@ tri_linear_coeff4(uint slot)
  * Later, when we compute the value at a particular fragment position we'll
  * divide the interpolated value by the interpolated W at that fragment.
  */
-static void tri_persp_coeff( unsigned slot,
-                             unsigned i )
+static void
+tri_persp_coeff4(uint slot)
 {
-   /* premultiply by 1/w:
-    */
-   float mina = setup.vmin->data[slot][i] * setup.vmin->data[0][3];
-   float mida = setup.vmid->data[slot][i] * setup.vmid->data[0][3];
-   float maxa = setup.vmax->data[slot][i] * setup.vmax->data[0][3];
-
-   float botda = mida - mina;
-   float majda = maxa - mina;
-   float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
-   float b = setup.emaj.dx * botda - majda * setup.ebot.dx;
-      
-   /*
-   printf("tri persp %d,%d: %f %f %f\n", slot, i,
-          setup.vmin->data[slot][i],
-          setup.vmid->data[slot][i],
-          setup.vmax->data[slot][i]
-          );
-   */
+   const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f);
+   const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f);
+
+   const vector float vmin_w = spu_splats(spu_extract(setup.vmin->data[0], 3));
+   const vector float vmid_w = spu_splats(spu_extract(setup.vmid->data[0], 3));
+   const vector float vmax_w = spu_splats(spu_extract(setup.vmax->data[0], 3));
 
-   assert(slot < PIPE_MAX_SHADER_INPUTS);
-   assert(i <= 3);
+   vector float vmin_d = setup.vmin->data[slot];
+   vector float vmid_d = setup.vmid->data[slot];
+   vector float vmax_d = setup.vmax->data[slot];
 
-   setup.coef[slot].dadx.f[i] = a * setup.oneoverarea;
-   setup.coef[slot].dady.f[i] = b * setup.oneoverarea;
-   setup.coef[slot].a0.f[i] = (mina - 
-			    (setup.coef[slot].dadx.f[i] * (setup.vmin->data[0][0] - 0.5f) + 
-			     setup.coef[slot].dady.f[i] * (setup.vmin->data[0][1] - 0.5f)));
+   vmin_d = spu_mul(vmin_d, vmin_w);
+   vmid_d = spu_mul(vmid_d, vmid_w);
+   vmax_d = spu_mul(vmax_d, vmax_w);
+
+   vector float botda = vmid_d - vmin_d;
+   vector float majda = vmax_d - vmin_d;
+
+   vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda),
+                            spu_mul(botda, spu_splats(setup.emaj.dy)));
+   vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
+                            spu_mul(majda, spu_splats(setup.ebot.dx)));
+
+   setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneoverarea));
+   setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneoverarea));
+
+   vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx);
+   vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy);
+                         
+   setup.coef[slot].a0.v = spu_sub(vmin_d, spu_add(tempx, tempy));
 }
-#endif
+
 
 
 /**
@@ -726,7 +763,7 @@ static void setup_tri_coefficients(void)
          tri_linear_coeff4(i);
          break;
       case INTERP_PERSPECTIVE:
-         tri_linear_coeff4(i);  /* temporary */
+         tri_persp_coeff4(i);
          break;
       default:
          ASSERT(0);
-- 
cgit v1.2.3


From 5d7cc6176de09e683e5b40a69df250d1abfaf6f0 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 13 Oct 2008 19:50:20 -0600
Subject: cell: remove dead code, clean-up, reformatting

---
 src/gallium/drivers/cell/spu/spu_tri.c | 114 +++++++--------------------------
 1 file changed, 24 insertions(+), 90 deletions(-)

diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 3f1fb4f7c9..438fae84a8 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -120,19 +120,11 @@ struct setup_stage {
 
    uint facing;
 
-   uint tx, ty;
+   uint tx, ty;  /**< position of current tile (x, y) */
 
    int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy;
 
-#if 0
-   struct tgsi_interp_coef coef[PIPE_MAX_SHADER_INPUTS];
-#else
    struct interp_coef coef[PIPE_MAX_SHADER_INPUTS];
-#endif
-
-#if 0
-   struct quad_header quad; 
-#endif
 
    struct {
       int left[2];   /**< [0] = row0, [1] = row1 */
@@ -144,69 +136,9 @@ struct setup_stage {
 };
 
 
-
 static struct setup_stage setup;
 
 
-
-
-#if 0
-/**
- * Basically a cast wrapper.
- */
-static INLINE struct setup_stage *setup_stage( struct draw_stage *stage )
-{
-   return (struct setup_stage *)stage;
-}
-#endif
-
-#if 0
-/**
- * Clip setup.quad against the scissor/surface bounds.
- */
-static INLINE void
-quad_clip(struct setup_stage *setup)
-{
-   const struct pipe_scissor_state *cliprect = &setup.softpipe->cliprect;
-   const int minx = (int) cliprect->minx;
-   const int maxx = (int) cliprect->maxx;
-   const int miny = (int) cliprect->miny;
-   const int maxy = (int) cliprect->maxy;
-
-   if (setup.quad.x0 >= maxx ||
-       setup.quad.y0 >= maxy ||
-       setup.quad.x0 + 1 < minx ||
-       setup.quad.y0 + 1 < miny) {
-      /* totally clipped */
-      setup.quad.mask = 0x0;
-      return;
-   }
-   if (setup.quad.x0 < minx)
-      setup.quad.mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
-   if (setup.quad.y0 < miny)
-      setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
-   if (setup.quad.x0 == maxx - 1)
-      setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
-   if (setup.quad.y0 == maxy - 1)
-      setup.quad.mask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
-}
-#endif
-
-#if 0
-/**
- * Emit a quad (pass to next stage) with clipping.
- */
-static INLINE void
-clip_emit_quad(struct setup_stage *setup)
-{
-   quad_clip(setup);
-   if (setup.quad.mask) {
-      struct softpipe_context *sp = setup.softpipe;
-      sp->quad.first->run(sp->quad.first, &setup.quad);
-   }
-}
-#endif
-
 /**
  * Evaluate attribute coefficients (plane equations) to compute
  * attribute values for the four fragments in a quad.
@@ -363,7 +295,8 @@ emit_quad( int x, int y, mask_t mask)
  * Given an X or Y coordinate, return the block/quad coordinate that it
  * belongs to.
  */
-static INLINE int block( int x )
+static INLINE int
+block(int x)
 {
    return x & ~1;
 }
@@ -374,7 +307,8 @@ static INLINE int block( int x )
  * the triangle's bounds.
  * The mask is a uint4 vector and each element will be 0 or 0xffffffff.
  */
-static INLINE mask_t calculate_mask( int x )
+static INLINE mask_t
+calculate_mask(int x)
 {
    /* This is a little tricky.
     * Use & instead of && to avoid branches.
@@ -392,7 +326,8 @@ static INLINE mask_t calculate_mask( int x )
 /**
  * Render a horizontal span of quads
  */
-static void flush_spans( void )
+static void
+flush_spans(void)
 {
    int minleft, maxright;
    int x;
@@ -420,7 +355,6 @@ static void flush_spans( void )
       return;
    }
 
-
    /* OK, we're very likely to need the tile data now.
     * clear or finish waiting if needed.
     */
@@ -456,9 +390,7 @@ static void flush_spans( void )
     * calculate_mask() could be simplified a bit...
     */
    for (x = block(minleft); x <= block(maxright); x += 2) {
-#if 1
       emit_quad( x, setup.span.y, calculate_mask( x ));
-#endif
    }
 
    setup.span.y = 0;
@@ -467,8 +399,10 @@ static void flush_spans( void )
    setup.span.right[1] = 0;
 }
 
+
 #if DEBUG_VERTS
-static void print_vertex(const struct vertex_header *v)
+static void
+print_vertex(const struct vertex_header *v)
 {
    int i;
    fprintf(stderr, "Vertex: (%p)\n", v);
@@ -480,11 +414,11 @@ static void print_vertex(const struct vertex_header *v)
 #endif
 
 
-static boolean setup_sort_vertices(const struct vertex_header *v0,
-                                   const struct vertex_header *v1,
-                                   const struct vertex_header *v2)
+static boolean
+setup_sort_vertices(const struct vertex_header *v0,
+                    const struct vertex_header *v1,
+                    const struct vertex_header *v2)
 {
-
 #if DEBUG_VERTS
    fprintf(stderr, "Triangle:\n");
    print_vertex(v0);
@@ -692,7 +626,6 @@ tri_linear_coeff4(uint slot)
 }
 
 
-
 /**
  * Compute a0, dadx and dady for a perspective-corrected interpolant,
  * for a triangle.
@@ -742,7 +675,8 @@ tri_persp_coeff4(uint slot)
  * Compute the setup.coef[] array dadx, dady, a0 values.
  * Must be called after setup.vmin,vmid,vmax,vprovoke are initialized.
  */
-static void setup_tri_coefficients(void)
+static void
+setup_tri_coefficients(void)
 {
 #if 1
    uint i;
@@ -779,7 +713,8 @@ static void setup_tri_coefficients(void)
 }
 
 
-static void setup_tri_edges(void)
+static void
+setup_tri_edges(void)
 {
    float vmin_x = spu_extract(setup.vmin->data[0], 0) + 0.5f;
    float vmid_x = spu_extract(setup.vmid->data[0], 0) + 0.5f;
@@ -809,9 +744,8 @@ static void setup_tri_edges(void)
  * Render the upper or lower half of a triangle.
  * Scissoring/cliprect is applied here too.
  */
-static void subtriangle( struct edge *eleft,
-			 struct edge *eright,
-			 unsigned lines )
+static void
+subtriangle(struct edge *eleft, struct edge *eright, unsigned lines)
 {
    const int minx = setup.cliprect_minx;
    const int maxx = setup.cliprect_maxx;
@@ -878,10 +812,9 @@ static void subtriangle( struct edge *eleft,
    eright->sy += lines;
 }
 
+
 static float
-determinant( const float *v0,
-             const float *v1,
-             const float *v2 )
+determinant(const float *v0, const float *v1, const float *v2)
 {
    /* edge vectors e = v0 - v2, f = v1 - v2 */
    const float ex = v0[0] - v2[0];
@@ -899,7 +832,8 @@ determinant( const float *v0,
  * The tile data should have already been fetched.
  */
 boolean
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding)
+tri_draw(const float *v0, const float *v1, const float *v2,
+         uint tx, uint ty, uint front_winding)
 {
    setup.tx = tx;
    setup.ty = ty;
-- 
cgit v1.2.3


From fc562a7acd86bee4853d38961e29c8da3d56e548 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 13 Oct 2008 20:19:51 -0600
Subject: cell: more clean-up in spu_tri.c

---
 src/gallium/drivers/cell/spu/spu_tri.c | 100 ++++++---------------------------
 1 file changed, 16 insertions(+), 84 deletions(-)

diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 438fae84a8..03f094373d 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -116,7 +116,7 @@ struct setup_stage {
    struct edge etop;
    struct edge emaj;
 
-   float oneoverarea;
+   float oneOverArea;
 
    uint facing;
 
@@ -507,13 +507,13 @@ setup_sort_vertices(const struct vertex_header *v0,
     * use the prim->det value because its sign is correct.
     */
    {
-      const float area = (setup.emaj.dx * setup.ebot.dy - 
-			    setup.ebot.dx * setup.emaj.dy);
+      const float area = (setup.emaj.dx * setup.ebot.dy -
+                          setup.ebot.dx * setup.emaj.dy);
 
-      setup.oneoverarea = 1.0f / area;
+      setup.oneOverArea = 1.0f / area;
       /*
       _mesa_printf("%s one-over-area %f  area %f  det %f\n",
-                   __FUNCTION__, setup.oneoverarea, area, prim->det );
+                   __FUNCTION__, setup.oneOverArea, area, prim->det );
       */
    }
 
@@ -536,7 +536,7 @@ setup_sort_vertices(const struct vertex_header *v0,
  * \param slot  which attribute slot 
  */
 static INLINE void
-const_coeff(uint slot)
+const_coeff4(uint slot)
 {
    setup.coef[slot].dadx.v = (vector float) {0.0, 0.0, 0.0, 0.0};
    setup.coef[slot].dady.v = (vector float) {0.0, 0.0, 0.0, 0.0};
@@ -544,58 +544,6 @@ const_coeff(uint slot)
 }
 
 
-/**
- * Compute a0, dadx and dady for a linearly interpolated coefficient,
- * for a triangle.
- */
-static INLINE void
-tri_linear_coeff(uint slot, uint firstComp, uint lastComp)
-{
-   uint i;
-   const float *vmin_d = (float *) &setup.vmin->data[slot];
-   const float *vmid_d = (float *) &setup.vmid->data[slot];
-   const float *vmax_d = (float *) &setup.vmax->data[slot];
-   const float x = spu_extract(setup.vmin->data[0], 0) - 0.5f;
-   const float y = spu_extract(setup.vmin->data[0], 1) - 0.5f;
-
-   for (i = firstComp; i < lastComp; i++) {
-      float botda = vmid_d[i] - vmin_d[i];
-      float majda = vmax_d[i] - vmin_d[i];
-      float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
-      float b = setup.emaj.dx * botda - majda * setup.ebot.dx;
-   
-      ASSERT(slot < PIPE_MAX_SHADER_INPUTS);
-
-      setup.coef[slot].dadx.f[i] = a * setup.oneoverarea;
-      setup.coef[slot].dady.f[i] = b * setup.oneoverarea;
-
-      /* calculate a0 as the value which would be sampled for the
-       * fragment at (0,0), taking into account that we want to sample at
-       * pixel centers, in other words (0.5, 0.5).
-       *
-       * this is neat but unfortunately not a good way to do things for
-       * triangles with very large values of dadx or dady as it will
-       * result in the subtraction and re-addition from a0 of a very
-       * large number, which means we'll end up loosing a lot of the
-       * fractional bits and precision from a0.  the way to fix this is
-       * to define a0 as the sample at a pixel center somewhere near vmin
-       * instead - i'll switch to this later.
-       */
-      setup.coef[slot].a0.f[i] = (vmin_d[i] - 
-                                 (setup.coef[slot].dadx.f[i] * x + 
-                                  setup.coef[slot].dady.f[i] * y));
-   }
-
-   /*
-   _mesa_printf("attr[%d].%c: %f dx:%f dy:%f\n",
-		slot, "xyzw"[i], 
-		setup.coef[slot].a0[i],
-		setup.coef[slot].dadx.f[i],
-		setup.coef[slot].dady.f[i]);
-   */
-}
-
-
 /**
  * As above, but interp setup all four vector components.
  */
@@ -616,8 +564,8 @@ tri_linear_coeff4(uint slot)
    vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
                             spu_mul(majda, spu_splats(setup.ebot.dx)));
 
-   setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneoverarea));
-   setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneoverarea));
+   setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneOverArea));
+   setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneOverArea));
 
    vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx);
    vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy);
@@ -660,8 +608,8 @@ tri_persp_coeff4(uint slot)
    vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
                             spu_mul(majda, spu_splats(setup.ebot.dx)));
 
-   setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneoverarea));
-   setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneoverarea));
+   setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneOverArea));
+   setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneOverArea));
 
    vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx);
    vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy);
@@ -678,21 +626,17 @@ tri_persp_coeff4(uint slot)
 static void
 setup_tri_coefficients(void)
 {
-#if 1
    uint i;
 
    for (i = 0; i < spu.vertex_info.num_attribs; i++) {
       switch (spu.vertex_info.attrib[i].interp_mode) {
       case INTERP_NONE:
          break;
-      case INTERP_POS:
-         /*tri_linear_coeff(i, 2, 3);*/
-         /* XXX interp W if PERSPECTIVE... */
-         tri_linear_coeff4(i);
-         break;
       case INTERP_CONSTANT:
-         const_coeff(i);
+         const_coeff4(i);
          break;
+      case INTERP_POS:
+         /* fall-through */
       case INTERP_LINEAR:
          tri_linear_coeff4(i);
          break;
@@ -703,13 +647,6 @@ setup_tri_coefficients(void)
          ASSERT(0);
       }
    }
-#else
-   ASSERT(spu.vertex_info.interp_mode[0] == INTERP_POS);
-   ASSERT(spu.vertex_info.interp_mode[1] == INTERP_LINEAR ||
-          spu.vertex_info.interp_mode[1] == INTERP_CONSTANT);
-   tri_linear_coeff(0, 2, 3);  /* slot 0, z */
-   tri_linear_coeff(1, 0, 4);  /* slot 1, color */
-#endif
 }
 
 
@@ -863,19 +800,14 @@ tri_draw(const float *v0, const float *v1, const float *v2,
    setup.span.y_flags = 0;
    setup.span.right[0] = 0;
    setup.span.right[1] = 0;
-   /*   setup.span.z_mode = tri_z_mode( setup.ctx ); */
 
-   /*   init_constant_attribs( setup ); */
-      
-   if (setup.oneoverarea < 0.0) {
-      /* emaj on left:
-       */
+   if (setup.oneOverArea < 0.0) {
+      /* emaj on left */
       subtriangle( &setup.emaj, &setup.ebot, setup.ebot.lines );
       subtriangle( &setup.emaj, &setup.etop, setup.etop.lines );
    }
    else {
-      /* emaj on right:
-       */
+      /* emaj on right */
       subtriangle( &setup.ebot, &setup.emaj, setup.ebot.lines );
       subtriangle( &setup.etop, &setup.emaj, setup.etop.lines );
    }
-- 
cgit v1.2.3


From 5d4f3b841753e16fbb33e444b6a787663e54fce5 Mon Sep 17 00:00:00 2001
From: Alan Hourihane <alanh@tungstengraphics.com>
Date: Tue, 14 Oct 2008 11:54:20 +0100
Subject: dri: don't check the number of cliprects before swap, let      the
 swap handle the requirements.

---
 src/mesa/drivers/dri/common/dri_util.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/mesa/drivers/dri/common/dri_util.c b/src/mesa/drivers/dri/common/dri_util.c
index ceedd914fb..a16cb504c7 100644
--- a/src/mesa/drivers/dri/common/dri_util.c
+++ b/src/mesa/drivers/dri/common/dri_util.c
@@ -315,9 +315,6 @@ static void driSwapBuffers(__DRIdrawable *dPriv)
 {
     __DRIscreen *psp = dPriv->driScreenPriv;
 
-    if (!dPriv->numClipRects)
-        return;
-
     psp->DriverAPI.SwapBuffers(dPriv);
 
     driReportDamage(dPriv, dPriv->pClipRects, dPriv->numClipRects);
-- 
cgit v1.2.3


From b7609be0f1cc8d7a822a29a2ecc165cd848df2b3 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 14 Oct 2008 10:05:23 -0600
Subject: cell: remove old code, clean-ups, etc.

---
 src/gallium/drivers/cell/ppu/cell_context.c    |  1 -
 src/gallium/drivers/cell/ppu/cell_pipe_state.c |  2 +-
 src/gallium/drivers/cell/ppu/cell_texture.c    | 72 ++++++++++++--------------
 src/gallium/drivers/cell/ppu/cell_texture.h    |  4 --
 4 files changed, 35 insertions(+), 44 deletions(-)

diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index 35cd6874a2..b66aa9c9d9 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -128,7 +128,6 @@ cell_create_context(struct pipe_screen *screen,
    cell_init_state_functions(cell);
    cell_init_shader_functions(cell);
    cell_init_surface_functions(cell);
-   cell_init_texture_functions(cell);
    cell_init_vertex_functions(cell);
 
    cell->draw = cell_draw_create(cell);
diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
index 8c55b8e093..02721e8f38 100644
--- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
@@ -258,7 +258,7 @@ cell_set_sampler_textures(struct pipe_context *pipe,
    }
    cell->num_textures = num;
 
-   cell_update_texture_mapping(cell);
+   cell_update_texture_mapping(cell); /* XXX temporary! */
 
    cell->dirty |= CELL_NEW_TEXTURE;
 }
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index f5f81ac3cc..ad3344aacd 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -52,15 +52,15 @@ static unsigned minify( unsigned d )
 
 
 static void
-cell_texture_layout(struct cell_texture * spt)
+cell_texture_layout(struct cell_texture *ct)
 {
-   struct pipe_texture *pt = &spt->base;
+   struct pipe_texture *pt = &ct->base;
    unsigned level;
    unsigned width = pt->width[0];
    unsigned height = pt->height[0];
    unsigned depth = pt->depth[0];
 
-   spt->buffer_size = 0;
+   ct->buffer_size = 0;
 
    for ( level = 0 ; level <= pt->last_level ; level++ ) {
       unsigned size;
@@ -78,9 +78,9 @@ cell_texture_layout(struct cell_texture * spt)
       pt->nblocksx[level] = pf_get_nblocksx(&pt->block, w_tile);  
       pt->nblocksy[level] = pf_get_nblocksy(&pt->block, h_tile);  
 
-      spt->stride[level] = pt->nblocksx[level] * pt->block.size;
+      ct->stride[level] = pt->nblocksx[level] * pt->block.size;
 
-      spt->level_offset[level] = spt->buffer_size;
+      ct->level_offset[level] = ct->buffer_size;
 
       size = pt->nblocksx[level] * pt->nblocksy[level] * pt->block.size;
       if (pt->target == PIPE_TEXTURE_CUBE)
@@ -88,7 +88,7 @@ cell_texture_layout(struct cell_texture * spt)
       else
          size *= depth;
 
-      spt->buffer_size += size;
+      ct->buffer_size += size;
 
       width  = minify(width);
       height = minify(height);
@@ -102,26 +102,25 @@ cell_texture_create(struct pipe_screen *screen,
                     const struct pipe_texture *templat)
 {
    struct pipe_winsys *ws = screen->winsys;
-   struct cell_texture *spt = CALLOC_STRUCT(cell_texture);
-   if (!spt)
+   struct cell_texture *ct = CALLOC_STRUCT(cell_texture);
+   if (!ct)
       return NULL;
 
-   spt->base = *templat;
-   spt->base.refcount = 1;
-   spt->base.screen = screen;
+   ct->base = *templat;
+   ct->base.refcount = 1;
+   ct->base.screen = screen;
 
-   cell_texture_layout(spt);
+   cell_texture_layout(ct);
 
-   spt->buffer = ws->buffer_create(ws, 32,
-                                   PIPE_BUFFER_USAGE_PIXEL,
-                                   spt->buffer_size);
+   ct->buffer = ws->buffer_create(ws, 32, PIPE_BUFFER_USAGE_PIXEL,
+                                  ct->buffer_size);
 
-   if (!spt->buffer) {
-      FREE(spt);
+   if (!ct->buffer) {
+      FREE(ct);
       return NULL;
    }
 
-   return &spt->base;
+   return &ct->base;
 }
 
 
@@ -137,15 +136,15 @@ cell_texture_release(struct pipe_screen *screen,
        __FUNCTION__, (void *) *pt, (*pt)->refcount - 1);
    */
    if (--(*pt)->refcount <= 0) {
-      struct cell_texture *spt = cell_texture(*pt);
+      struct cell_texture *ct = cell_texture(*pt);
 
       /*
-      DBG("%s deleting %p\n", __FUNCTION__, (void *) spt);
+      DBG("%s deleting %p\n", __FUNCTION__, (void *) ct);
       */
 
-      pipe_buffer_reference(screen, &spt->buffer, NULL);
+      pipe_buffer_reference(screen, &ct->buffer, NULL);
 
-      FREE(spt);
+      FREE(ct);
    }
    *pt = NULL;
 }
@@ -169,22 +168,22 @@ cell_get_tex_surface(struct pipe_screen *screen,
                      unsigned usage)
 {
    struct pipe_winsys *ws = screen->winsys;
-   struct cell_texture *spt = cell_texture(pt);
+   struct cell_texture *ct = cell_texture(pt);
    struct pipe_surface *ps;
 
    ps = ws->surface_alloc(ws);
    if (ps) {
       assert(ps->refcount);
       assert(ps->winsys);
-      winsys_buffer_reference(ws, &ps->buffer, spt->buffer);
+      winsys_buffer_reference(ws, &ps->buffer, ct->buffer);
       ps->format = pt->format;
       ps->block = pt->block;
       ps->width = pt->width[level];
       ps->height = pt->height[level];
       ps->nblocksx = pt->nblocksx[level];
       ps->nblocksy = pt->nblocksy[level];
-      ps->stride = spt->stride[level];
-      ps->offset = spt->level_offset[level];
+      ps->stride = ct->stride[level];
+      ps->offset = ct->level_offset[level];
       ps->usage = usage;
 
       /* XXX may need to override usage flags (see sp_texture.c) */
@@ -268,11 +267,13 @@ cell_tile_texture(struct cell_context *cell,
          assert(w % TILE_SIZE == 0);
          assert(h % TILE_SIZE == 0);
 
-         surf = screen->get_tex_surface(screen, &texture->base, face, level, zslice,
+         surf = screen->get_tex_surface(screen, &texture->base, face,
+                                        level, zslice,
                                         PIPE_BUFFER_USAGE_CPU_WRITE);
          ASSERT(surf);
          
-         src = (const uint *) pipe_surface_map(surf, PIPE_BUFFER_USAGE_CPU_WRITE);
+         src = (const uint *) pipe_surface_map(surf,
+                                               PIPE_BUFFER_USAGE_CPU_WRITE);
 
          if (texture->tiled_data[level]) {
             align_free(texture->tiled_data[level]);
@@ -289,6 +290,7 @@ cell_tile_texture(struct cell_context *cell,
 }
 
 
+/** XXX temporary hack */
 void
 cell_update_texture_mapping(struct cell_context *cell)
 {
@@ -335,9 +337,9 @@ cell_tex_surface_release(struct pipe_screen *screen,
 
 
 static void *
-cell_surface_map( struct pipe_screen *screen,
-                  struct pipe_surface *surface,
-                  unsigned flags )
+cell_surface_map(struct pipe_screen *screen,
+                 struct pipe_surface *surface,
+                 unsigned flags)
 {
    ubyte *map;
 
@@ -364,7 +366,7 @@ cell_surface_map( struct pipe_screen *screen,
 #endif
    }
    
-   return map + surface->offset;
+   return (void *) (map + surface->offset);
 }
 
 
@@ -376,12 +378,6 @@ cell_surface_unmap(struct pipe_screen *screen,
 }
 
 
-void
-cell_init_texture_functions(struct cell_context *cell)
-{
-   /*cell->pipe.texture_update = cell_texture_update;*/
-}
-
 
 void
 cell_init_screen_texture_funcs(struct pipe_screen *screen)
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.h b/src/gallium/drivers/cell/ppu/cell_texture.h
index 6d35736984..4647509743 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.h
+++ b/src/gallium/drivers/cell/ppu/cell_texture.h
@@ -65,10 +65,6 @@ extern void
 cell_update_texture_mapping(struct cell_context *cell);
 
 
-extern void
-cell_init_texture_functions(struct cell_context *cell);
-
-
 extern void
 cell_init_screen_texture_funcs(struct pipe_screen *screen);
 
-- 
cgit v1.2.3


From 0bee156d8518419befb50ba57d22fed4037797ce Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 14 Oct 2008 10:55:38 -0600
Subject: cell: now do texture twiddling in the right way, at the right time.

Also handles images smaller than 32x32 now.
---
 src/gallium/drivers/cell/ppu/cell_pipe_state.c |   2 -
 src/gallium/drivers/cell/ppu/cell_state_emit.c |   4 +-
 src/gallium/drivers/cell/ppu/cell_texture.c    | 221 +++++++++----------------
 src/gallium/drivers/cell/ppu/cell_texture.h    |   4 -
 4 files changed, 83 insertions(+), 148 deletions(-)

diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
index 02721e8f38..2e3086c4fa 100644
--- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
@@ -258,8 +258,6 @@ cell_set_sampler_textures(struct pipe_context *pipe,
    }
    cell->num_textures = num;
 
-   cell_update_texture_mapping(cell); /* XXX temporary! */
-
    cell->dirty |= CELL_NEW_TEXTURE;
 }
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 7090b4c99f..cae546b700 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -222,8 +222,8 @@ cell_emit_state(struct cell_context *cell)
             uint level;
             for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
                texture->start[level] = NULL;
-               texture->width[level] = 1;
-               texture->height[level] = 1;
+               texture->width[level] = 0;
+               texture->height[level] = 0;
             }
          }
       }
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index ad3344aacd..4bd87590cb 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -150,15 +150,77 @@ cell_texture_release(struct pipe_screen *screen,
 }
 
 
-#if 0
+
+/**
+ * Convert image from linear layout to tiled layout.  4-byte pixels.
+ */
+static void
+swizzle_image_uint(uint w, uint h, uint tile_size, uint *dst, const uint *src)
+{
+   const uint tile_size2 = tile_size * tile_size;
+   const uint h_t = (h + tile_size - 1) / tile_size;
+   const uint w_t = (w + tile_size - 1) / tile_size;
+
+   uint it, jt;  /* tile counters */
+   uint i, j;    /* intra-tile counters */
+
+   /* loop over dest tiles */
+   for (it = 0; it < h_t; it++) {
+      for (jt = 0; jt < w_t; jt++) {
+         /* start of dest tile: */
+         uint *tdst = dst + (it * w_t + jt) * tile_size2;
+         /* loop over texels in the tile */
+         uint tile_width = MIN2(tile_size, w);
+         uint tile_height = MIN2(tile_size, h);
+         for (i = 0; i < tile_height; i++) {
+            for (j = 0; j < tile_width; j++) {
+               const uint srci = it * tile_size + i;
+               const uint srcj = jt * tile_size + j;
+               ASSERT(srci < w);
+               ASSERT(srcj < h);
+               tdst[i * TILE_SIZE + j] = src[srci * w + srcj];
+            }
+         }
+      }
+   }
+}
+
+
+/**
+ * Convert linear texture image data to tiled format for SPU usage.
+ */
 static void
-cell_texture_update(struct pipe_context *pipe, struct pipe_texture *texture,
-                    uint face, uint levelsMask)
+cell_twiddle_texture(struct pipe_screen *screen,
+                     struct pipe_surface *surface)
 {
-   /* XXX TO DO:  re-tile the texture data ... */
+   struct cell_texture *texture = cell_texture(surface->texture);
+   const uint level = surface->level;
+   const uint texWidth = texture->base.width[level];
+   const uint texHeight = texture->base.height[level];
+   const uint bufWidth = MAX2(texWidth, TILE_SIZE);
+   const uint bufHeight = MAX2(texHeight, TILE_SIZE);
+   const uint *src =
+      (const uint *) pipe_buffer_map(screen, surface->buffer,
+                                     PIPE_BUFFER_USAGE_CPU_READ);
+
+   switch (texture->base.format) {
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      /* free old tiled data */
+      if (texture->tiled_data[level]) {
+         align_free(texture->tiled_data[level]);
+      }
+      /* alloc new tiled data */
+      texture->tiled_data[level] = align_malloc(bufWidth * bufHeight * 4, 16);
+      swizzle_image_uint(texWidth, texHeight, TILE_SIZE,
+                         texture->tiled_data[level], src);
+      break;
+   default:
+      printf("Unsupported texture format\n");
+      ;
+   }
 
+   pipe_buffer_unmap(screen, surface->buffer);
 }
-#endif
 
 
 static struct pipe_surface *
@@ -207,129 +269,12 @@ cell_get_tex_surface(struct pipe_screen *screen,
 }
 
 
-
-/**
- * Copy tile data from linear layout to tiled layout.
- * XXX this should be rolled into the future surface-creation code.
- * XXX also need "untile" code...
- */
-static void
-tile_copy_data(uint w, uint h, uint tile_size, uint *dst, const uint *src)
-{
-   const uint tile_size2 = tile_size * tile_size;
-   const uint h_t = h / tile_size, w_t = w / tile_size;
-
-   uint it, jt;  /* tile counters */
-   uint i, j;    /* intra-tile counters */
-
-   /* loop over dest tiles */
-   for (it = 0; it < h_t; it++) {
-      for (jt = 0; jt < w_t; jt++) {
-         /* start of dest tile: */
-         uint *tdst = dst + (it * w_t + jt) * tile_size2;
-         /* loop over texels in the tile */
-         for (i = 0; i < tile_size; i++) {
-            for (j = 0; j < tile_size; j++) {
-               const uint srci = it * tile_size + i;
-               const uint srcj = jt * tile_size + j;
-               *tdst++ = src[srci * w + srcj];
-            }
-         }
-      }
-   }
-}
-
-
-
-/**
- * Convert linear texture image data to tiled format for SPU usage.
- * XXX recast this in terms of pipe_surfaces (aka texture views).
- */
-static void
-cell_tile_texture(struct cell_context *cell,
-                  struct cell_texture *texture)
-{
-   struct pipe_screen *screen = cell->pipe.screen;
-   uint face = 0, level, zslice = 0;
-   const uint *src;
-
-   for (level = 0; level <= texture->base.last_level; level++) {
-      if (!texture->tiled_data[level]) {
-         struct pipe_surface *surf;
-
-         const uint w = texture->base.width[level], h = texture->base.height[level];
-
-         if (w < 32 || h < 32)
-            continue;
-         /* temporary restrictions: */
-         assert(w >= TILE_SIZE);
-         assert(h >= TILE_SIZE);
-         assert(w % TILE_SIZE == 0);
-         assert(h % TILE_SIZE == 0);
-
-         surf = screen->get_tex_surface(screen, &texture->base, face,
-                                        level, zslice,
-                                        PIPE_BUFFER_USAGE_CPU_WRITE);
-         ASSERT(surf);
-         
-         src = (const uint *) pipe_surface_map(surf,
-                                               PIPE_BUFFER_USAGE_CPU_WRITE);
-
-         if (texture->tiled_data[level]) {
-            align_free(texture->tiled_data[level]);
-         }
-         texture->tiled_data[level] = align_malloc(w * h * 4, 16);
-
-         tile_copy_data(w, h, TILE_SIZE, texture->tiled_data[level], src);
-
-         pipe_surface_unmap(surf);
-
-         pipe_surface_reference(&surf, NULL);
-      }
-   }
-}
-
-
-/** XXX temporary hack */
-void
-cell_update_texture_mapping(struct cell_context *cell)
-{
-#if 0
-   uint face = 0, level = 0, zslice = 0;
-#endif
-   uint i;
-
-   for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
-      if (cell->texture[i])
-         cell_tile_texture(cell, cell->texture[i]);
-   }
-
-#if 0
-   if (cell->tex_surf && cell->tex_map) {
-      pipe_surface_unmap(cell->tex_surf);
-      cell->tex_map = NULL;
-   }
-
-   /* XXX free old surface */
-
-   cell->tex_surf = cell_get_tex_surface(&cell->pipe,
-                                         &cell->texture[0]->base,
-                                         face, level, zslice);
-
-   cell->tex_map = pipe_surface_map(cell->tex_surf);
-#endif
-}
-
-
 static void 
 cell_tex_surface_release(struct pipe_screen *screen, 
                          struct pipe_surface **s)
 {
-   /* Effectively do the texture_update work here - if texture images
-    * needed post-processing to put them into hardware layout, this is
-    * where it would happen.  For softpipe, nothing to do.
-    */
-   assert ((*s)->texture);
+   /* XXX if done rendering to teximage, re-tile */
+
    pipe_texture_reference(&(*s)->texture, NULL); 
 
    screen->winsys->surface_release(screen->winsys, s);
@@ -351,22 +296,8 @@ cell_surface_map(struct pipe_screen *screen,
    map = pipe_buffer_map( screen, surface->buffer, flags );
    if (map == NULL)
       return NULL;
-
-   /* May want to different things here depending on read/write nature
-    * of the map:
-    */
-   if (surface->texture &&
-       (flags & PIPE_BUFFER_USAGE_CPU_WRITE)) 
-   {
-      /* Do something to notify sharing contexts of a texture change.
-       * In softpipe, that would mean flushing the texture cache.
-       */
-#if 0
-      cell_screen(screen)->timestamp++;
-#endif
-   }
-   
-   return (void *) (map + surface->offset);
+   else
+      return (void *) (map + surface->offset);
 }
 
 
@@ -374,6 +305,16 @@ static void
 cell_surface_unmap(struct pipe_screen *screen,
                    struct pipe_surface *surface)
 {
+   struct cell_texture *ct = cell_texture(surface->texture);
+
+   assert(ct);
+
+   if ((ct->base.tex_usage & PIPE_TEXTURE_USAGE_SAMPLER) &&
+       (surface->usage & PIPE_BUFFER_USAGE_CPU_WRITE)) {
+      /* convert from linear to tiled layout */
+      cell_twiddle_texture(screen, surface);
+   }
+
    pipe_buffer_unmap( screen, surface->buffer );
 }
 
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.h b/src/gallium/drivers/cell/ppu/cell_texture.h
index 4647509743..a0757091b0 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.h
+++ b/src/gallium/drivers/cell/ppu/cell_texture.h
@@ -61,10 +61,6 @@ cell_texture(struct pipe_texture *pt)
 
 
-extern void
-cell_update_texture_mapping(struct cell_context *cell);
-
-
 extern void
 cell_init_screen_texture_funcs(struct pipe_screen *screen);
 
-- 
cgit v1.2.3


From 3baf83db3c60be8185bc68a0aa3adbce80d9025e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 14 Oct 2008 12:10:27 -0600
Subject: cell: fix tex image stride bugs

---
 src/gallium/drivers/cell/ppu/cell_texture.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index 4bd87590cb..608bda35f7 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -155,7 +155,8 @@ cell_texture_release(struct pipe_screen *screen,
  * Convert image from linear layout to tiled layout.  4-byte pixels.
  */
 static void
-swizzle_image_uint(uint w, uint h, uint tile_size, uint *dst, const uint *src)
+swizzle_image_uint(uint w, uint h, uint tile_size, uint *dst,
+                   uint src_stride, const uint *src)
 {
    const uint tile_size2 = tile_size * tile_size;
    const uint h_t = (h + tile_size - 1) / tile_size;
@@ -164,6 +165,8 @@ swizzle_image_uint(uint w, uint h, uint tile_size, uint *dst, const uint *src)
    uint it, jt;  /* tile counters */
    uint i, j;    /* intra-tile counters */
 
+   src_stride /= 4; /* convert from bytes to pixels */
+
    /* loop over dest tiles */
    for (it = 0; it < h_t; it++) {
       for (jt = 0; jt < w_t; jt++) {
@@ -178,7 +181,7 @@ swizzle_image_uint(uint w, uint h, uint tile_size, uint *dst, const uint *src)
                const uint srcj = jt * tile_size + j;
                ASSERT(srci < w);
                ASSERT(srcj < h);
-               tdst[i * TILE_SIZE + j] = src[srci * w + srcj];
+               tdst[i * tile_size + j] = src[srci * src_stride + srcj];
             }
          }
       }
@@ -199,9 +202,9 @@ cell_twiddle_texture(struct pipe_screen *screen,
    const uint texHeight = texture->base.height[level];
    const uint bufWidth = MAX2(texWidth, TILE_SIZE);
    const uint bufHeight = MAX2(texHeight, TILE_SIZE);
-   const uint *src =
-      (const uint *) pipe_buffer_map(screen, surface->buffer,
+   const void *map = pipe_buffer_map(screen, surface->buffer,
                                      PIPE_BUFFER_USAGE_CPU_READ);
+   const uint *src = (const uint *) ((const ubyte *) map + surface->offset);
 
    switch (texture->base.format) {
    case PIPE_FORMAT_A8R8G8B8_UNORM:
@@ -212,7 +215,8 @@ cell_twiddle_texture(struct pipe_screen *screen,
       /* alloc new tiled data */
       texture->tiled_data[level] = align_malloc(bufWidth * bufHeight * 4, 16);
       swizzle_image_uint(texWidth, texHeight, TILE_SIZE,
-                         texture->tiled_data[level], src);
+                         texture->tiled_data[level],
+                         surface->stride, src);
       break;
    default:
       printf("Unsupported texture format\n");
-- 
cgit v1.2.3


From f8bddf698d523f597fea0f721b064daee81d8005 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 14 Oct 2008 12:11:52 -0600
Subject: cell: basic mipmap filtering works now

Though, only nearest-mipmap-level selection with linear (bilinear) image
filtering, i.e. GL_LINEAR_MIPMAP_NEAREST, works right now.
---
 src/gallium/drivers/cell/spu/spu_command.c |  21 ++++--
 src/gallium/drivers/cell/spu/spu_funcs.c   |   2 +-
 src/gallium/drivers/cell/spu/spu_main.h    |   3 +-
 src/gallium/drivers/cell/spu/spu_texture.c | 106 +++++++++++++++--------------
 src/gallium/drivers/cell/spu/spu_texture.h |   8 +--
 5 files changed, 79 insertions(+), 61 deletions(-)

diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 089af22415..4e98eea338 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -301,16 +301,18 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
    DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit);
 
    spu.sampler[sampler->unit] = sampler->state;
-#if 0
+
    if (spu.sampler[sampler->unit].min_mip_filter != PIPE_TEX_MIPFILTER_NONE) {
+      /* use lambda/lod to determine min vs. mag filter */
       spu.sample_texture4[sampler->unit] = sample_texture4_lod;
    }
-   else
-#endif
-   if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) {
+   else if (spu.sampler[sampler->unit].min_img_filter
+            == PIPE_TEX_FILTER_LINEAR) {
+      /* min = mag = bilinear */
       spu.sample_texture4[sampler->unit] = sample_texture4_bilinear;
    }
    else {
+      /* min = mag = inearest */
       spu.sample_texture4[sampler->unit] = sample_texture4_nearest;
    }
 }
@@ -322,8 +324,12 @@ cmd_state_texture(const struct cell_command_texture *texture)
    const uint unit = texture->unit;
    uint i;
 
+   //if (spu.init.id==0) Debug=1;
+
    DEBUG_PRINTF("TEXTURE [%u]\n", texture->unit);
 
+   spu.texture[unit].max_level = 0;
+
    for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
       uint width = texture->width[i];
       uint height = texture->height[i];
@@ -335,14 +341,19 @@ cmd_state_texture(const struct cell_command_texture *texture)
       spu.texture[unit].level[i].width = width;
       spu.texture[unit].level[i].height = height;
 
-      spu.texture[unit].level[i].tiles_per_row = width / TILE_SIZE;
+      spu.texture[unit].level[i].tiles_per_row =
+         (width + TILE_SIZE - 1) / TILE_SIZE;
 
       spu.texture[unit].level[i].width4 = spu_splats((float) width);
       spu.texture[unit].level[i].height4 = spu_splats((float) height);
 
       spu.texture[unit].level[i].tex_size_x_mask = spu_splats(width - 1);
       spu.texture[unit].level[i].tex_size_y_mask = spu_splats(height - 1);
+
+      if (texture->start[i])
+         spu.texture[unit].max_level = i;
    }
+   //Debug=0;
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index f2946010bd..66b82f673d 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -106,7 +106,7 @@ spu_txp(vector float s, vector float t, vector float r, vector float q,
         unsigned unit)
 {
    struct vec_4x4 colors;
-   spu.sample_texture4[unit](s, t, r, q, unit, colors.v);
+   spu.sample_texture4[unit](s, t, r, q, unit, 0, colors.v);
    return colors;
 }
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 9515543efe..cfb645add0 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -68,7 +68,7 @@ typedef void (*spu_sample_texture4_func)(vector float s,
                                          vector float t,
                                          vector float r,
                                          vector float q,
-                                         uint unit,
+                                         uint unit, uint level,
                                          vector float colors[4]);
 
 
@@ -121,6 +121,7 @@ struct spu_texture_level
 struct spu_texture
 {
    struct spu_texture_level level[CELL_MAX_TEXTURE_LEVELS];
+   uint max_level;
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 96c09e3ccb..10036330c6 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -26,7 +26,6 @@
  **************************************************************************/
 
 
-#include <transpose_matrix4x4.h>
 #include <math.h>
 
 #include "pipe/p_compiler.h"
@@ -43,12 +42,14 @@
 void
 invalidate_tex_cache(void)
 {
-   uint lvl = 0;
-   uint unit = 0;
-   uint bytes = 4 * spu.texture[unit].level[lvl].width
-      * spu.texture[unit].level[lvl].height;
+   uint lvl;
+   for (lvl = 0; lvl < CELL_MAX_TEXTURE_LEVELS; lvl++) {
+      uint unit = 0;
+      uint bytes = 4 * spu.texture[unit].level[lvl].width
+         * spu.texture[unit].level[lvl].height;
 
-   spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes);
+      spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes);
+   }
 }
 
 
@@ -71,8 +72,8 @@ get_four_texels(uint unit, uint level, vec_uint4 x, vec_uint4 y,
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    const unsigned texture_ea = (uintptr_t) tlevel->start;
-   vec_uint4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
-   vec_uint4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
+   const vec_uint4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
+   const vec_uint4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
    const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */
    const qword offset_y = si_andi((qword) y, 0x1f); /* offset_y = y & 0x1f */
 
@@ -106,20 +107,19 @@ get_four_texels(uint unit, uint level, vec_uint4 x, vec_uint4 y,
 void
 sample_texture4_nearest(vector float s, vector float t,
                         vector float r, vector float q,
-                        uint unit, vector float colors[4])
+                        uint unit, uint level, vector float colors[4])
 {
-   const uint lvl = 0;
-   vector float ss = spu_mul(s, spu.texture[unit].level[lvl].width4);
-   vector float tt = spu_mul(t, spu.texture[unit].level[lvl].height4);
+   vector float ss = spu_mul(s, spu.texture[unit].level[level].width4);
+   vector float tt = spu_mul(t, spu.texture[unit].level[level].height4);
    vector unsigned int is = spu_convtu(ss, 0);
    vector unsigned int it = spu_convtu(tt, 0);
    vec_uint4 texels[4];
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is = spu_and(is, spu.texture[unit].level[lvl].tex_size_x_mask);
-   it = spu_and(it, spu.texture[unit].level[lvl].tex_size_y_mask);
+   is = spu_and(is, spu.texture[unit].level[level].tex_size_x_mask);
+   it = spu_and(it, spu.texture[unit].level[level].tex_size_y_mask);
 
-   get_four_texels(unit, lvl, is, it, texels);
+   get_four_texels(unit, level, is, it, texels);
 
    /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */
    spu_unpack_A8R8G8B8_transpose4(texels, colors);
@@ -133,11 +133,10 @@ sample_texture4_nearest(vector float s, vector float t,
 void
 sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
-                         uint unit, vector float colors[4])
+                         uint unit, uint level, vector float colors[4])
 {
-   const uint lvl = 0;
-   vector float ss = spu_madd(s, spu.texture[unit].level[lvl].width4,  spu_splats(-0.5f));
-   vector float tt = spu_madd(t, spu.texture[unit].level[lvl].height4, spu_splats(-0.5f));
+   vector float ss = spu_madd(s, spu.texture[unit].level[level].width4,  spu_splats(-0.5f));
+   vector float tt = spu_madd(t, spu.texture[unit].level[level].height4, spu_splats(-0.5f));
 
    vector unsigned int is0 = spu_convtu(ss, 0);
    vector unsigned int it0 = spu_convtu(tt, 0);
@@ -147,17 +146,17 @@ sample_texture4_bilinear(vector float s, vector float t,
    vector unsigned int it1 = spu_add(it0, 1);
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is0 = spu_and(is0, spu.texture[unit].level[lvl].tex_size_x_mask);
-   it0 = spu_and(it0, spu.texture[unit].level[lvl].tex_size_y_mask);
-   is1 = spu_and(is1, spu.texture[unit].level[lvl].tex_size_x_mask);
-   it1 = spu_and(it1, spu.texture[unit].level[lvl].tex_size_y_mask);
+   is0 = spu_and(is0, spu.texture[unit].level[level].tex_size_x_mask);
+   it0 = spu_and(it0, spu.texture[unit].level[level].tex_size_y_mask);
+   is1 = spu_and(is1, spu.texture[unit].level[level].tex_size_x_mask);
+   it1 = spu_and(it1, spu.texture[unit].level[level].tex_size_y_mask);
 
    /* get packed int texels */
    vector unsigned int texels[16];
-   get_four_texels(unit, lvl, is0, it0, texels + 0);  /* upper-left */
-   get_four_texels(unit, lvl, is1, it0, texels + 4);  /* upper-right */
-   get_four_texels(unit, lvl, is0, it1, texels + 8);  /* lower-left */
-   get_four_texels(unit, lvl, is1, it1, texels + 12); /* lower-right */
+   get_four_texels(unit, level, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, level, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, level, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, level, is1, it1, texels + 12); /* lower-right */
 
    /* XXX possibly rework following code to compute the weighted sample
     * colors with integer arithmetic for fewer int->float conversions.
@@ -273,14 +272,13 @@ transpose(vector unsigned int *mOut0,
  */
 void
 sample_texture4_bilinear_2(vector float s, vector float t,
-                         vector float r, vector float q,
-                         uint unit, vector float colors[4])
+                           vector float r, vector float q,
+                           uint unit, uint level, vector float colors[4])
 {
-   const uint lvl = 0;
    static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
    /* Scale texcoords by size of texture, and add half pixel bias */
-   vector float ss = spu_madd(s, spu.texture[unit].level[lvl].width4, half);
-   vector float tt = spu_madd(t, spu.texture[unit].level[lvl].height4, half);
+   vector float ss = spu_madd(s, spu.texture[unit].level[level].width4, half);
+   vector float tt = spu_madd(t, spu.texture[unit].level[level].height4, half);
 
    /* convert float coords to fixed-pt coords with 8 fraction bits */
    vector unsigned int is = spu_convtu(ss, 8);
@@ -301,17 +299,17 @@ sample_texture4_bilinear_2(vector float s, vector float t,
    vector unsigned int it1 = spu_add(it0, 1);
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is0 = spu_and(is0, spu.texture[unit].level[lvl].tex_size_x_mask);
-   it0 = spu_and(it0, spu.texture[unit].level[lvl].tex_size_y_mask);
-   is1 = spu_and(is1, spu.texture[unit].level[lvl].tex_size_x_mask);
-   it1 = spu_and(it1, spu.texture[unit].level[lvl].tex_size_y_mask);
+   is0 = spu_and(is0, spu.texture[unit].level[level].tex_size_x_mask);
+   it0 = spu_and(it0, spu.texture[unit].level[level].tex_size_y_mask);
+   is1 = spu_and(is1, spu.texture[unit].level[level].tex_size_x_mask);
+   it1 = spu_and(it1, spu.texture[unit].level[level].tex_size_y_mask);
 
    /* get packed int texels */
    vector unsigned int texels[16];
-   get_four_texels(unit, lvl, is0, it0, texels + 0);  /* upper-left */
-   get_four_texels(unit, lvl, is1, it0, texels + 4);  /* upper-right */
-   get_four_texels(unit, lvl, is0, it1, texels + 8);  /* lower-left */
-   get_four_texels(unit, lvl, is1, it1, texels + 12); /* lower-right */
+   get_four_texels(unit, level, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, level, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, level, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, level, is1, it1, texels + 12); /* lower-right */
 
    /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */
    {
@@ -379,9 +377,9 @@ sample_texture4_bilinear_2(vector float s, vector float t,
 static float
 compute_lambda(uint unit, vector float s, vector float t)
 {
-   uint lvl = 0;
-   float width = spu.texture[unit].level[lvl].width;
-   float height = spu.texture[unit].level[lvl].width;
+   uint baseLevel = 0;
+   float width = spu.texture[unit].level[baseLevel].width;
+   float height = spu.texture[unit].level[baseLevel].width;
    float dsdx = width * (spu_extract(s, 1) - spu_extract(s, 0));
    float dsdy = width * (spu_extract(s, 2) - spu_extract(s, 0));
    float dtdx = height * (spu_extract(t, 1) - spu_extract(t, 0));
@@ -402,22 +400,30 @@ compute_lambda(uint unit, vector float s, vector float t)
 void
 sample_texture4_lod(vector float s, vector float t,
                     vector float r, vector float q,
-                    uint unit, vector float colors[4])
+                    uint unit, uint level, vector float colors[4])
 {
+   /*
+    * Note that we're computing a lambda/lod here that's used for all
+    * four pixels in the quad.
+    */
    float lambda = compute_lambda(unit, s, t);
 
+   /* apply lod bias */
+   lambda += spu.sampler[unit].lod_bias;
+
+   /* clamp */
    if (lambda < spu.sampler[unit].min_lod)
       lambda = spu.sampler[unit].min_lod;
    else if (lambda > spu.sampler[unit].max_lod)
       lambda = spu.sampler[unit].max_lod;
 
-   /* hack for now */
-   int level = (int) lambda;
-   if (level > 3)
-      level = 3;
+   /* convert to int level */
+   level = (int) (lambda + 0.5f);
+   ASSERT(level >= 0);
+
+   if (level > spu.texture[unit].max_level)
+      level = spu.texture[unit].max_level;
 
-   /*
    sample_texture4_bilinear_2(s, t, r, q, unit, level, colors);
-   */
 }
 
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index 4802f7c47c..ec06a50b4a 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -39,24 +39,24 @@ invalidate_tex_cache(void);
 extern void
 sample_texture4_nearest(vector float s, vector float t,
                         vector float r, vector float q,
-                        uint unit, vector float colors[4]);
+                        uint unit, uint level, vector float colors[4]);
 
 
 extern void
 sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
-                         uint unit, vector float colors[4]);
+                         uint unit, uint level, vector float colors[4]);
 
 extern void
 sample_texture4_bilinear_2(vector float s, vector float t,
                          vector float r, vector float q,
-                         uint unit, vector float colors[4]);
+                           uint unit, uint level, vector float colors[4]);
 
 
 extern void
 sample_texture4_lod(vector float s, vector float t,
                     vector float r, vector float q,
-                    uint unit, vector float colors[4]);
+                    uint unit, uint level, vector float colors[4]);
 
 
 #endif /* SPU_TEXTURE_H */
-- 
cgit v1.2.3


From 58ea98dc68605130dda2538027f941df39ccd514 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 14 Oct 2008 12:41:46 -0600
Subject: cell: fix assertions

---
 src/gallium/drivers/cell/ppu/cell_texture.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index 608bda35f7..87f1598dae 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -155,7 +155,7 @@ cell_texture_release(struct pipe_screen *screen,
  * Convert image from linear layout to tiled layout.  4-byte pixels.
  */
 static void
-swizzle_image_uint(uint w, uint h, uint tile_size, uint *dst,
+twiddle_image_uint(uint w, uint h, uint tile_size, uint *dst,
                    uint src_stride, const uint *src)
 {
    const uint tile_size2 = tile_size * tile_size;
@@ -179,8 +179,8 @@ swizzle_image_uint(uint w, uint h, uint tile_size, uint *dst,
             for (j = 0; j < tile_width; j++) {
                const uint srci = it * tile_size + i;
                const uint srcj = jt * tile_size + j;
-               ASSERT(srci < w);
-               ASSERT(srcj < h);
+               ASSERT(srci < h);
+               ASSERT(srcj < w);
                tdst[i * tile_size + j] = src[srci * src_stride + srcj];
             }
          }
@@ -214,12 +214,12 @@ cell_twiddle_texture(struct pipe_screen *screen,
       }
       /* alloc new tiled data */
       texture->tiled_data[level] = align_malloc(bufWidth * bufHeight * 4, 16);
-      swizzle_image_uint(texWidth, texHeight, TILE_SIZE,
+      twiddle_image_uint(texWidth, texHeight, TILE_SIZE,
                          texture->tiled_data[level],
                          surface->stride, src);
       break;
    default:
-      printf("Unsupported texture format\n");
+      printf("Cell: twiddle unsupported texture format\n");
       ;
    }
 
-- 
cgit v1.2.3


From 6d2d5ceca21c87bea5e269e8099fb6f1d821b97a Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 14 Oct 2008 12:42:21 -0600
Subject: cell: use minify vs magnify filters

---
 src/gallium/drivers/cell/spu/spu_command.c | 50 +++++++++++++++++++++++-------
 src/gallium/drivers/cell/spu/spu_main.h    |  2 ++
 src/gallium/drivers/cell/spu/spu_texture.c | 22 +++++++------
 3 files changed, 53 insertions(+), 21 deletions(-)

diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 4e98eea338..fa78377c66 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -298,22 +298,48 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 static void
 cmd_state_sampler(const struct cell_command_sampler *sampler)
 {
-   DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit);
+   uint unit = sampler->unit;
 
-   spu.sampler[sampler->unit] = sampler->state;
+   DEBUG_PRINTF("SAMPLER [%u]\n", unit);
 
-   if (spu.sampler[sampler->unit].min_mip_filter != PIPE_TEX_MIPFILTER_NONE) {
-      /* use lambda/lod to determine min vs. mag filter */
-      spu.sample_texture4[sampler->unit] = sample_texture4_lod;
+   spu.sampler[unit] = sampler->state;
+
+   switch (spu.sampler[unit].min_img_filter) {
+   case PIPE_TEX_FILTER_LINEAR:
+      spu.min_sample_texture4[unit] = sample_texture4_bilinear;
+      break;
+   case PIPE_TEX_FILTER_ANISO:
+      /* fall-through, for now */
+   case PIPE_TEX_FILTER_NEAREST:
+      spu.min_sample_texture4[unit] = sample_texture4_nearest;
+      break;
+   default:
+      ASSERT(0);
    }
-   else if (spu.sampler[sampler->unit].min_img_filter
-            == PIPE_TEX_FILTER_LINEAR) {
-      /* min = mag = bilinear */
-      spu.sample_texture4[sampler->unit] = sample_texture4_bilinear;
+
+   switch (spu.sampler[sampler->unit].mag_img_filter) {
+   case PIPE_TEX_FILTER_LINEAR:
+      spu.mag_sample_texture4[unit] = sample_texture4_bilinear;
+      break;
+   case PIPE_TEX_FILTER_ANISO:
+      /* fall-through, for now */
+   case PIPE_TEX_FILTER_NEAREST:
+      spu.mag_sample_texture4[unit] = sample_texture4_nearest;
+      break;
+   default:
+      ASSERT(0);
    }
-   else {
-      /* min = mag = inearest */
-      spu.sample_texture4[sampler->unit] = sample_texture4_nearest;
+
+   switch (spu.sampler[sampler->unit].min_mip_filter) {
+   case PIPE_TEX_MIPFILTER_NEAREST:
+   case PIPE_TEX_MIPFILTER_LINEAR:
+      spu.sample_texture4[unit] = sample_texture4_lod;
+      break;
+   case PIPE_TEX_MIPFILTER_NONE:
+      spu.sample_texture4[unit] = spu.mag_sample_texture4[unit];
+      break;
+   default:
+      ASSERT(0);
    }
 }
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index cfb645add0..56aac655e9 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -170,6 +170,8 @@ struct spu_global
 
    /** Current texture sampler function */
    spu_sample_texture4_func sample_texture4[CELL_MAX_SAMPLERS];
+   spu_sample_texture4_func min_sample_texture4[CELL_MAX_SAMPLERS];
+   spu_sample_texture4_func mag_sample_texture4[CELL_MAX_SAMPLERS];
 
    /** Fragment program constants */
    vector float constants[4 * CELL_MAX_CONSTANTS];
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 10036330c6..267f2302f6 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -400,7 +400,7 @@ compute_lambda(uint unit, vector float s, vector float t)
 void
 sample_texture4_lod(vector float s, vector float t,
                     vector float r, vector float q,
-                    uint unit, uint level, vector float colors[4])
+                    uint unit, uint level_ignored, vector float colors[4])
 {
    /*
     * Note that we're computing a lambda/lod here that's used for all
@@ -417,13 +417,17 @@ sample_texture4_lod(vector float s, vector float t,
    else if (lambda > spu.sampler[unit].max_lod)
       lambda = spu.sampler[unit].max_lod;
 
-   /* convert to int level */
-   level = (int) (lambda + 0.5f);
-   ASSERT(level >= 0);
-
-   if (level > spu.texture[unit].max_level)
-      level = spu.texture[unit].max_level;
-
-   sample_texture4_bilinear_2(s, t, r, q, unit, level, colors);
+   if (lambda <= 0.0f) {
+      /* magnify */
+      spu.mag_sample_texture4[unit](s, t, r, q, unit, 0, colors);
+   }
+   else {
+      /* minify */
+      int level = (int) (lambda + 0.5f);
+      if (level > (int) spu.texture[unit].max_level)
+         level = spu.texture[unit].max_level;
+      spu.min_sample_texture4[unit](s, t, r, q, unit, level, colors);
+      /* XXX to do: mipmap level interpolation */
+   }
 }
 
-- 
cgit v1.2.3


From 4f56d5bbf2e52c815c820138eaad6c0fb93d47ba Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 14 Oct 2008 12:52:16 -0600
Subject: cell: fix broken negative texcoord conversion

---
 src/gallium/drivers/cell/spu/spu_texture.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 267f2302f6..83cf7dc394 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -138,8 +138,8 @@ sample_texture4_bilinear(vector float s, vector float t,
    vector float ss = spu_madd(s, spu.texture[unit].level[level].width4,  spu_splats(-0.5f));
    vector float tt = spu_madd(t, spu.texture[unit].level[level].height4, spu_splats(-0.5f));
 
-   vector unsigned int is0 = spu_convtu(ss, 0);
-   vector unsigned int it0 = spu_convtu(tt, 0);
+   vector unsigned int is0 = (vector unsigned int) spu_convts(ss, 0);
+   vector unsigned int it0 = (vector unsigned int) spu_convts(tt, 0);
 
    /* is + 1, it + 1 */
    vector unsigned int is1 = spu_add(is0, 1);
@@ -281,8 +281,8 @@ sample_texture4_bilinear_2(vector float s, vector float t,
    vector float tt = spu_madd(t, spu.texture[unit].level[level].height4, half);
 
    /* convert float coords to fixed-pt coords with 8 fraction bits */
-   vector unsigned int is = spu_convtu(ss, 8);
-   vector unsigned int it = spu_convtu(tt, 8);
+   vector unsigned int is = (vector unsigned int) spu_convts(ss, 8);
+   vector unsigned int it = (vector unsigned int) spu_convts(tt, 8);
 
    /* compute integer texel weights in [0, 255] */
    vector signed int sWeights0 = spu_and((vector signed int) is, 255);
-- 
cgit v1.2.3


From 38d396e15aceaca299c5de571c4dd5b3d9b27242 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 14 Oct 2008 14:02:07 -0600
Subject: cell: fix npot texture tiling bugs

---
 src/gallium/drivers/cell/ppu/cell_texture.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index 87f1598dae..4fd66bdea0 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -172,9 +172,17 @@ twiddle_image_uint(uint w, uint h, uint tile_size, uint *dst,
       for (jt = 0; jt < w_t; jt++) {
          /* start of dest tile: */
          uint *tdst = dst + (it * w_t + jt) * tile_size2;
+
+         /* compute size of this tile (may be smaller than tile_size) */
+         /* XXX note: a compiler bug was found here. That's why the code
+          * looks as it does.
+          */
+         uint tile_width = w - jt * tile_size;
+         tile_width = MIN2(tile_width, tile_size);
+         uint tile_height = h - it * tile_size;
+         tile_height = MIN2(tile_height, tile_size);
+
          /* loop over texels in the tile */
-         uint tile_width = MIN2(tile_size, w);
-         uint tile_height = MIN2(tile_size, h);
          for (i = 0; i < tile_height; i++) {
             for (j = 0; j < tile_width; j++) {
                const uint srci = it * tile_size + i;
@@ -200,8 +208,8 @@ cell_twiddle_texture(struct pipe_screen *screen,
    const uint level = surface->level;
    const uint texWidth = texture->base.width[level];
    const uint texHeight = texture->base.height[level];
-   const uint bufWidth = MAX2(texWidth, TILE_SIZE);
-   const uint bufHeight = MAX2(texHeight, TILE_SIZE);
+   const uint bufWidth = align(texWidth, TILE_SIZE);
+   const uint bufHeight = align(texHeight, TILE_SIZE);
    const void *map = pipe_buffer_map(screen, surface->buffer,
                                      PIPE_BUFFER_USAGE_CPU_READ);
    const uint *src = (const uint *) ((const ubyte *) map + surface->offset);
-- 
cgit v1.2.3


From 85dc1aec9c5fc63a01bb8db07215b84790d15d8f Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 14 Oct 2008 15:19:01 -0600
Subject: cell: support NPOT textures, clamp/repeat mode, normalized/unorm
 texcoords

glDrawPixels works now.
---
 src/gallium/drivers/cell/spu/spu_command.c | 48 +++++++++++++--
 src/gallium/drivers/cell/spu/spu_main.h    | 12 ++--
 src/gallium/drivers/cell/spu/spu_texture.c | 99 ++++++++++++++++++++----------
 3 files changed, 117 insertions(+), 42 deletions(-)

diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index fa78377c66..b1efe97e76 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -295,6 +295,42 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 }
 
 
+/**
+ * The texture mask_s/t and scale_s/t fields depend on the texture size and
+ * sampler wrap modes.
+ */
+static void
+update_tex_masks(struct spu_texture *texture,
+                 const struct pipe_sampler_state *sampler)
+{
+   uint i;
+
+   for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
+      int width = texture->level[i].width;
+      int height = texture->level[i].height;
+
+      if (sampler->wrap_s == PIPE_TEX_WRAP_REPEAT)
+         texture->level[i].mask_s = spu_splats(width - 1);
+      else
+         texture->level[i].mask_s = spu_splats(~0);
+
+      if (sampler->wrap_t == PIPE_TEX_WRAP_REPEAT)
+         texture->level[i].mask_t = spu_splats(height - 1);
+      else
+         texture->level[i].mask_t = spu_splats(~0);
+
+      if (sampler->normalized_coords) {
+         texture->level[i].scale_s = spu_splats((float) width);
+         texture->level[i].scale_t = spu_splats((float) height);
+      }
+      else {
+         texture->level[i].scale_s = spu_splats(1.0f);
+         texture->level[i].scale_t = spu_splats(1.0f);
+      }
+   }
+}
+
+
 static void
 cmd_state_sampler(const struct cell_command_sampler *sampler)
 {
@@ -341,6 +377,8 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
    default:
       ASSERT(0);
    }
+
+   update_tex_masks(&spu.texture[unit], &spu.sampler[unit]);
 }
 
 
@@ -370,15 +408,15 @@ cmd_state_texture(const struct cell_command_texture *texture)
       spu.texture[unit].level[i].tiles_per_row =
          (width + TILE_SIZE - 1) / TILE_SIZE;
 
-      spu.texture[unit].level[i].width4 = spu_splats((float) width);
-      spu.texture[unit].level[i].height4 = spu_splats((float) height);
-
-      spu.texture[unit].level[i].tex_size_x_mask = spu_splats(width - 1);
-      spu.texture[unit].level[i].tex_size_y_mask = spu_splats(height - 1);
+      spu.texture[unit].level[i].max_s = spu_splats((int) width - 1);
+      spu.texture[unit].level[i].max_t = spu_splats((int) height - 1);
 
       if (texture->start[i])
          spu.texture[unit].max_level = i;
    }
+
+   update_tex_masks(&spu.texture[unit], &spu.sampler[unit]);
+
    //Debug=0;
 }
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 56aac655e9..45c6f4ced1 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -107,17 +107,21 @@ struct spu_framebuffer
 } ALIGN16_ATTRIB;
 
 
+/** per-texture level info */
 struct spu_texture_level
 {
    void *start;
    ushort width, height;
    ushort tiles_per_row;
-   vector float width4;   /**< == {width, width, width, width} */
-   vector float height4;  /**< == {height, height, height, height} */
-   vector unsigned int tex_size_x_mask; /**< splat(width-1) */
-   vector unsigned int tex_size_y_mask; /**< splat(height-1) */
+   /** texcoord scale factors */
+   vector float scale_s, scale_t;
+   /** texcoord masks (if REPEAT then size-1, else ~0) */
+   vector signed int mask_s, mask_t;
+   /** texcoord clamp limits */
+   vector signed int max_s, max_t;
 } ALIGN16_ATTRIB;
 
+
 struct spu_texture
 {
    struct spu_texture_level level[CELL_MAX_TEXTURE_LEVELS];
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 83cf7dc394..b21c43a467 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -67,13 +67,13 @@ invalidate_tex_cache(void)
  * a time.
  */
 static void
-get_four_texels(uint unit, uint level, vec_uint4 x, vec_uint4 y,
+get_four_texels(uint unit, uint level, vec_int4 x, vec_int4 y,
                 vec_uint4 *texels)
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    const unsigned texture_ea = (uintptr_t) tlevel->start;
-   const vec_uint4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
-   const vec_uint4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
+   const vec_int4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
+   const vec_int4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
    const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */
    const qword offset_y = si_andi((qword) y, 0x1f); /* offset_y = y & 0x1f */
 
@@ -99,6 +99,20 @@ get_four_texels(uint unit, uint level, vec_uint4 x, vec_uint4 y,
 }
 
 
+/** clamp vec to [0, max] */
+static INLINE vector signed int
+spu_clamp(vector signed int vec, vector signed int max)
+{
+   static const vector signed int zero = {0,0,0,0};
+   vector unsigned int c;
+   c = spu_cmpgt(vec, zero);    /* c = vec > zero ? ~0 : 0 */
+   vec = spu_sel(zero, vec, c);
+   c = spu_cmpgt(vec, max);    /* c = vec > max ? ~0 : 0 */
+   vec = spu_sel(vec, max, c);
+   return vec;
+}
+
+
 
 /**
  * Do nearest texture sampling for four pixels.
@@ -109,15 +123,20 @@ sample_texture4_nearest(vector float s, vector float t,
                         vector float r, vector float q,
                         uint unit, uint level, vector float colors[4])
 {
-   vector float ss = spu_mul(s, spu.texture[unit].level[level].width4);
-   vector float tt = spu_mul(t, spu.texture[unit].level[level].height4);
-   vector unsigned int is = spu_convtu(ss, 0);
-   vector unsigned int it = spu_convtu(tt, 0);
+   const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
+   vector float ss = spu_mul(s, tlevel->scale_s);
+   vector float tt = spu_mul(t, tlevel->scale_t);
+   vector signed int is = spu_convts(ss, 0);
+   vector signed int it = spu_convts(tt, 0);
    vec_uint4 texels[4];
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is = spu_and(is, spu.texture[unit].level[level].tex_size_x_mask);
-   it = spu_and(it, spu.texture[unit].level[level].tex_size_y_mask);
+   is = spu_and(is, tlevel->mask_s);
+   it = spu_and(it, tlevel->mask_t);
+
+   /* PIPE_TEX_WRAP_CLAMP */
+   is = spu_clamp(is, tlevel->max_s);
+   it = spu_clamp(it, tlevel->max_t);
 
    get_four_texels(unit, level, is, it, texels);
 
@@ -135,21 +154,28 @@ sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
                          uint unit, uint level, vector float colors[4])
 {
-   vector float ss = spu_madd(s, spu.texture[unit].level[level].width4,  spu_splats(-0.5f));
-   vector float tt = spu_madd(t, spu.texture[unit].level[level].height4, spu_splats(-0.5f));
+   const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
+   vector float ss = spu_madd(s, tlevel->scale_s,  spu_splats(-0.5f));
+   vector float tt = spu_madd(t, tlevel->scale_t, spu_splats(-0.5f));
 
-   vector unsigned int is0 = (vector unsigned int) spu_convts(ss, 0);
-   vector unsigned int it0 = (vector unsigned int) spu_convts(tt, 0);
+   vector signed int is0 = spu_convts(ss, 0);
+   vector signed int it0 = spu_convts(tt, 0);
 
    /* is + 1, it + 1 */
-   vector unsigned int is1 = spu_add(is0, 1);
-   vector unsigned int it1 = spu_add(it0, 1);
+   vector signed int is1 = spu_add(is0, 1);
+   vector signed int it1 = spu_add(it0, 1);
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is0 = spu_and(is0, spu.texture[unit].level[level].tex_size_x_mask);
-   it0 = spu_and(it0, spu.texture[unit].level[level].tex_size_y_mask);
-   is1 = spu_and(is1, spu.texture[unit].level[level].tex_size_x_mask);
-   it1 = spu_and(it1, spu.texture[unit].level[level].tex_size_y_mask);
+   is0 = spu_and(is0, tlevel->mask_s);
+   it0 = spu_and(it0, tlevel->mask_t);
+   is1 = spu_and(is1, tlevel->mask_s);
+   it1 = spu_and(it1, tlevel->mask_t);
+
+   /* PIPE_TEX_WRAP_CLAMP */
+   is0 = spu_clamp(is0, tlevel->max_s);
+   it0 = spu_clamp(it0, tlevel->max_t);
+   is1 = spu_clamp(is1, tlevel->max_s);
+   it1 = spu_clamp(it1, tlevel->max_t);
 
    /* get packed int texels */
    vector unsigned int texels[16];
@@ -275,34 +301,41 @@ sample_texture4_bilinear_2(vector float s, vector float t,
                            vector float r, vector float q,
                            uint unit, uint level, vector float colors[4])
 {
+   const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
    /* Scale texcoords by size of texture, and add half pixel bias */
-   vector float ss = spu_madd(s, spu.texture[unit].level[level].width4, half);
-   vector float tt = spu_madd(t, spu.texture[unit].level[level].height4, half);
+   vector float ss = spu_madd(s, tlevel->scale_s, half);
+   vector float tt = spu_madd(t, tlevel->scale_t, half);
 
    /* convert float coords to fixed-pt coords with 8 fraction bits */
-   vector unsigned int is = (vector unsigned int) spu_convts(ss, 8);
-   vector unsigned int it = (vector unsigned int) spu_convts(tt, 8);
+   vector signed int is = spu_convts(ss, 8);
+   vector signed int it = spu_convts(tt, 8);
 
    /* compute integer texel weights in [0, 255] */
-   vector signed int sWeights0 = spu_and((vector signed int) is, 255);
-   vector signed int tWeights0 = spu_and((vector signed int) it, 255);
+   vector signed int sWeights0 = spu_and(is, 255);
+   vector signed int tWeights0 = spu_and(it, 255);
    vector signed int sWeights1 = spu_sub(255, sWeights0);
    vector signed int tWeights1 = spu_sub(255, tWeights0);
 
    /* texel coords: is0 = is / 256, it0 = is / 256 */
-   vector unsigned int is0 = spu_rlmask(is, -8);
-   vector unsigned int it0 = spu_rlmask(it, -8);
+   vector signed int is0 = spu_rlmask(is, -8);
+   vector signed int it0 = spu_rlmask(it, -8);
 
    /* texel coords: i1 = is0 + 1, it1 = it0 + 1 */
-   vector unsigned int is1 = spu_add(is0, 1);
-   vector unsigned int it1 = spu_add(it0, 1);
+   vector signed int is1 = spu_add(is0, 1);
+   vector signed int it1 = spu_add(it0, 1);
 
    /* PIPE_TEX_WRAP_REPEAT */
-   is0 = spu_and(is0, spu.texture[unit].level[level].tex_size_x_mask);
-   it0 = spu_and(it0, spu.texture[unit].level[level].tex_size_y_mask);
-   is1 = spu_and(is1, spu.texture[unit].level[level].tex_size_x_mask);
-   it1 = spu_and(it1, spu.texture[unit].level[level].tex_size_y_mask);
+   is0 = spu_and(is0, tlevel->mask_s);
+   it0 = spu_and(it0, tlevel->mask_t);
+   is1 = spu_and(is1, tlevel->mask_s);
+   it1 = spu_and(it1, tlevel->mask_t);
+
+   /* PIPE_TEX_WRAP_CLAMP */
+   is0 = spu_clamp(is0, tlevel->max_s);
+   it0 = spu_clamp(it0, tlevel->max_t);
+   is1 = spu_clamp(is1, tlevel->max_s);
+   it1 = spu_clamp(it1, tlevel->max_t);
 
    /* get packed int texels */
    vector unsigned int texels[16];
-- 
cgit v1.2.3


From e0931e520a8d7cc5b4db8a4b887c5cf139b2647f Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 14 Oct 2008 17:09:56 -0600
Subject: cell: fall-through case for TGSI_OPCODE_TXB

---
 src/gallium/drivers/cell/ppu/cell_gen_fp.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 68093d9e83..3dfd5f673d 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -1672,6 +1672,8 @@ emit_instruction(struct codegen *gen,
       /* fall-through for now */
    case TGSI_OPCODE_TXD:
       /* fall-through for now */
+   case TGSI_OPCODE_TXB:
+      /* fall-through for now */
    case TGSI_OPCODE_TXP:
       return emit_TXP(gen, inst);
 
-- 
cgit v1.2.3


From 8f7c6b55ae962e30f32cfec9a14a652d3b5b5943 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 14 Oct 2008 17:11:29 -0600
Subject: cell: support for cubemaps

Though, progs/demos/cubemap.c doesn't quite work right...
---
 src/gallium/drivers/cell/common.h              |   1 +
 src/gallium/drivers/cell/ppu/cell_state_emit.c |   2 +
 src/gallium/drivers/cell/ppu/cell_texture.c    |  37 ++++--
 src/gallium/drivers/cell/spu/spu_command.c     |  17 ++-
 src/gallium/drivers/cell/spu/spu_funcs.c       |   2 +-
 src/gallium/drivers/cell/spu/spu_main.h        |   4 +-
 src/gallium/drivers/cell/spu/spu_texture.c     | 171 ++++++++++++++++++++++---
 src/gallium/drivers/cell/spu/spu_texture.h     |  21 ++-
 8 files changed, 214 insertions(+), 41 deletions(-)

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index e4de9a551d..c1e78f4db3 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -251,6 +251,7 @@ struct cell_command_sampler
 struct cell_command_texture
 {
    uint64_t opcode;     /**< CELL_CMD_STATE_TEXTURE */
+   uint target;         /**< PIPE_TEXTURE_x */
    uint unit;
    void *start[CELL_MAX_TEXTURE_LEVELS];   /**< Address in main memory */
    ushort width[CELL_MAX_TEXTURE_LEVELS];
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index cae546b700..d4a867ffcf 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -217,6 +217,7 @@ cell_emit_state(struct cell_context *cell)
                texture->width[level] = cell->texture[i]->base.width[level];
                texture->height[level] = cell->texture[i]->base.height[level];
             }
+            texture->target = cell->texture[i]->base.target;
          }
          else {
             uint level;
@@ -225,6 +226,7 @@ cell_emit_state(struct cell_context *cell)
                texture->width[level] = 0;
                texture->height[level] = 0;
             }
+            texture->target = 0;
          }
       }
    }
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index 4fd66bdea0..4c92ef154f 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -137,6 +137,7 @@ cell_texture_release(struct pipe_screen *screen,
    */
    if (--(*pt)->refcount <= 0) {
       struct cell_texture *ct = cell_texture(*pt);
+      uint i;
 
       /*
       DBG("%s deleting %p\n", __FUNCTION__, (void *) ct);
@@ -144,6 +145,12 @@ cell_texture_release(struct pipe_screen *screen,
 
       pipe_buffer_reference(screen, &ct->buffer, NULL);
 
+      for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
+         if (ct->tiled_data[i]) {
+            FREE(ct->tiled_data[i]);
+         }
+      }
+
       FREE(ct);
    }
    *pt = NULL;
@@ -204,27 +211,33 @@ static void
 cell_twiddle_texture(struct pipe_screen *screen,
                      struct pipe_surface *surface)
 {
-   struct cell_texture *texture = cell_texture(surface->texture);
+   struct cell_texture *ct = cell_texture(surface->texture);
    const uint level = surface->level;
-   const uint texWidth = texture->base.width[level];
-   const uint texHeight = texture->base.height[level];
+   const uint texWidth = ct->base.width[level];
+   const uint texHeight = ct->base.height[level];
    const uint bufWidth = align(texWidth, TILE_SIZE);
    const uint bufHeight = align(texHeight, TILE_SIZE);
    const void *map = pipe_buffer_map(screen, surface->buffer,
                                      PIPE_BUFFER_USAGE_CPU_READ);
    const uint *src = (const uint *) ((const ubyte *) map + surface->offset);
 
-   switch (texture->base.format) {
+   switch (ct->base.format) {
    case PIPE_FORMAT_A8R8G8B8_UNORM:
-      /* free old tiled data */
-      if (texture->tiled_data[level]) {
-         align_free(texture->tiled_data[level]);
+      {
+         int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1;
+         int offset = bufWidth * bufHeight * 4 * surface->face;
+         uint *dst;
+
+         if (!ct->tiled_data[level]) {
+            ct->tiled_data[level] =
+               align_malloc(bufWidth * bufHeight * 4 * numFaces, 16);
+         }
+
+         dst = (uint *) ((ubyte *) ct->tiled_data[level] + offset);
+
+         twiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst,
+                            surface->stride, src);
       }
-      /* alloc new tiled data */
-      texture->tiled_data[level] = align_malloc(bufWidth * bufHeight * 4, 16);
-      twiddle_image_uint(texWidth, texHeight, TILE_SIZE,
-                         texture->tiled_data[level],
-                         surface->stride, src);
       break;
    default:
       printf("Cell: twiddle unsupported texture format\n");
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index b1efe97e76..c951fa6f31 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -301,7 +301,8 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
  */
 static void
 update_tex_masks(struct spu_texture *texture,
-                 const struct pipe_sampler_state *sampler)
+                 const struct pipe_sampler_state *sampler,
+                 uint unit)
 {
    uint i;
 
@@ -328,6 +329,11 @@ update_tex_masks(struct spu_texture *texture,
          texture->level[i].scale_t = spu_splats(1.0f);
       }
    }
+
+   /* XXX temporary hack */
+   if (texture->target == PIPE_TEXTURE_CUBE) {
+      spu.sample_texture4[unit] = sample_texture4_cube;
+   }
 }
 
 
@@ -378,7 +384,7 @@ cmd_state_sampler(const struct cell_command_sampler *sampler)
       ASSERT(0);
    }
 
-   update_tex_masks(&spu.texture[unit], &spu.sampler[unit]);
+   update_tex_masks(&spu.texture[unit], &spu.sampler[unit], unit);
 }
 
 
@@ -393,6 +399,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
    DEBUG_PRINTF("TEXTURE [%u]\n", texture->unit);
 
    spu.texture[unit].max_level = 0;
+   spu.texture[unit].target = texture->target;
 
    for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
       uint width = texture->width[i];
@@ -408,6 +415,10 @@ cmd_state_texture(const struct cell_command_texture *texture)
       spu.texture[unit].level[i].tiles_per_row =
          (width + TILE_SIZE - 1) / TILE_SIZE;
 
+      spu.texture[unit].level[i].bytes_per_image =
+         4 * ((width + TILE_SIZE - 1) & ~(TILE_SIZE-1))
+         * ((height + TILE_SIZE - 1) & ~(TILE_SIZE-1));
+
       spu.texture[unit].level[i].max_s = spu_splats((int) width - 1);
       spu.texture[unit].level[i].max_t = spu_splats((int) height - 1);
 
@@ -415,7 +426,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
          spu.texture[unit].max_level = i;
    }
 
-   update_tex_masks(&spu.texture[unit], &spu.sampler[unit]);
+   update_tex_masks(&spu.texture[unit], &spu.sampler[unit], unit);
 
    //Debug=0;
 }
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index 66b82f673d..5c3ee305d4 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -106,7 +106,7 @@ spu_txp(vector float s, vector float t, vector float r, vector float q,
         unsigned unit)
 {
    struct vec_4x4 colors;
-   spu.sample_texture4[unit](s, t, r, q, unit, 0, colors.v);
+   spu.sample_texture4[unit](s, t, r, q, unit, 0, 0, colors.v);
    return colors;
 }
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 45c6f4ced1..8781041bff 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -68,7 +68,7 @@ typedef void (*spu_sample_texture4_func)(vector float s,
                                          vector float t,
                                          vector float r,
                                          vector float q,
-                                         uint unit, uint level,
+                                         uint unit, uint level, uint face,
                                          vector float colors[4]);
 
 
@@ -113,6 +113,7 @@ struct spu_texture_level
    void *start;
    ushort width, height;
    ushort tiles_per_row;
+   uint bytes_per_image;
    /** texcoord scale factors */
    vector float scale_s, scale_t;
    /** texcoord masks (if REPEAT then size-1, else ~0) */
@@ -126,6 +127,7 @@ struct spu_texture
 {
    struct spu_texture_level level[CELL_MAX_TEXTURE_LEVELS];
    uint max_level;
+   uint target;  /**< PIPE_TEXTURE_x */
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index b21c43a467..2570f02c73 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -48,6 +48,9 @@ invalidate_tex_cache(void)
       uint bytes = 4 * spu.texture[unit].level[lvl].width
          * spu.texture[unit].level[lvl].height;
 
+      if (spu.texture[unit].target == PIPE_TEXTURE_CUBE)
+         bytes *= 6;
+
       spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes);
    }
 }
@@ -67,11 +70,11 @@ invalidate_tex_cache(void)
  * a time.
  */
 static void
-get_four_texels(uint unit, uint level, vec_int4 x, vec_int4 y,
+get_four_texels(uint unit, uint level, uint face, vec_int4 x, vec_int4 y,
                 vec_uint4 *texels)
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
-   const unsigned texture_ea = (uintptr_t) tlevel->start;
+   unsigned texture_ea = (uintptr_t) tlevel->start;
    const vec_int4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
    const vec_int4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
    const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */
@@ -88,6 +91,8 @@ get_four_texels(uint unit, uint level, vec_int4 x, vec_int4 y,
    
    vec_uint4 offset = (vec_uint4) si_a(tile_offset, texel_offset);
    
+   texture_ea = texture_ea + face * tlevel->bytes_per_image;
+
    spu_dcache_fetch_unaligned((qword *) & texels[0],
                               texture_ea + spu_extract(offset, 0), 4);
    spu_dcache_fetch_unaligned((qword *) & texels[1],
@@ -121,7 +126,8 @@ spu_clamp(vector signed int vec, vector signed int max)
 void
 sample_texture4_nearest(vector float s, vector float t,
                         vector float r, vector float q,
-                        uint unit, uint level, vector float colors[4])
+                        uint unit, uint level, uint face,
+                        vector float colors[4])
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    vector float ss = spu_mul(s, tlevel->scale_s);
@@ -138,7 +144,7 @@ sample_texture4_nearest(vector float s, vector float t,
    is = spu_clamp(is, tlevel->max_s);
    it = spu_clamp(it, tlevel->max_t);
 
-   get_four_texels(unit, level, is, it, texels);
+   get_four_texels(unit, level, face, is, it, texels);
 
    /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */
    spu_unpack_A8R8G8B8_transpose4(texels, colors);
@@ -152,11 +158,14 @@ sample_texture4_nearest(vector float s, vector float t,
 void
 sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
-                         uint unit, uint level, vector float colors[4])
+                         uint unit, uint level, uint face,
+                         vector float colors[4])
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
-   vector float ss = spu_madd(s, tlevel->scale_s,  spu_splats(-0.5f));
-   vector float tt = spu_madd(t, tlevel->scale_t, spu_splats(-0.5f));
+   static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
+
+   vector float ss = spu_madd(s, tlevel->scale_s, half);
+   vector float tt = spu_madd(t, tlevel->scale_t, half);
 
    vector signed int is0 = spu_convts(ss, 0);
    vector signed int it0 = spu_convts(tt, 0);
@@ -179,10 +188,10 @@ sample_texture4_bilinear(vector float s, vector float t,
 
    /* get packed int texels */
    vector unsigned int texels[16];
-   get_four_texels(unit, level, is0, it0, texels + 0);  /* upper-left */
-   get_four_texels(unit, level, is1, it0, texels + 4);  /* upper-right */
-   get_four_texels(unit, level, is0, it1, texels + 8);  /* lower-left */
-   get_four_texels(unit, level, is1, it1, texels + 12); /* lower-right */
+   get_four_texels(unit, level, face, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, level, face, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, level, face, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, level, face, is1, it1, texels + 12); /* lower-right */
 
    /* XXX possibly rework following code to compute the weighted sample
     * colors with integer arithmetic for fewer int->float conversions.
@@ -299,10 +308,12 @@ transpose(vector unsigned int *mOut0,
 void
 sample_texture4_bilinear_2(vector float s, vector float t,
                            vector float r, vector float q,
-                           uint unit, uint level, vector float colors[4])
+                           uint unit, uint level, uint face,
+                           vector float colors[4])
 {
    const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
    static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
+
    /* Scale texcoords by size of texture, and add half pixel bias */
    vector float ss = spu_madd(s, tlevel->scale_s, half);
    vector float tt = spu_madd(t, tlevel->scale_t, half);
@@ -339,10 +350,10 @@ sample_texture4_bilinear_2(vector float s, vector float t,
 
    /* get packed int texels */
    vector unsigned int texels[16];
-   get_four_texels(unit, level, is0, it0, texels + 0);  /* upper-left */
-   get_four_texels(unit, level, is1, it0, texels + 4);  /* upper-right */
-   get_four_texels(unit, level, is0, it1, texels + 8);  /* lower-left */
-   get_four_texels(unit, level, is1, it1, texels + 12); /* lower-right */
+   get_four_texels(unit, level, face, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, level, face, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, level, face, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, level, face, is1, it1, texels + 12); /* lower-right */
 
    /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */
    {
@@ -433,7 +444,8 @@ compute_lambda(uint unit, vector float s, vector float t)
 void
 sample_texture4_lod(vector float s, vector float t,
                     vector float r, vector float q,
-                    uint unit, uint level_ignored, vector float colors[4])
+                    uint unit, uint level_ignored, uint face,
+                    vector float colors[4])
 {
    /*
     * Note that we're computing a lambda/lod here that's used for all
@@ -452,15 +464,136 @@ sample_texture4_lod(vector float s, vector float t,
 
    if (lambda <= 0.0f) {
       /* magnify */
-      spu.mag_sample_texture4[unit](s, t, r, q, unit, 0, colors);
+      spu.mag_sample_texture4[unit](s, t, r, q, unit, 0, 0, colors);
    }
    else {
       /* minify */
       int level = (int) (lambda + 0.5f);
       if (level > (int) spu.texture[unit].max_level)
          level = spu.texture[unit].max_level;
-      spu.min_sample_texture4[unit](s, t, r, q, unit, level, colors);
+      spu.min_sample_texture4[unit](s, t, r, q, unit, level, 0, colors);
       /* XXX to do: mipmap level interpolation */
    }
 }
 
+/** Pick the cube map face addressed by direction (rx, ry, rz) and store the
+ *  face-local coords in *newS, *newT.  XXX need a SIMD version of this */
+static unsigned
+choose_cube_face(float rx, float ry, float rz, float *newS, float *newT)
+{
+   /* Per-face choice of sc, tc and ma (the code below sets ma = |major axis|):
+      major axis
+      direction     target                             sc     tc    ma
+      ----------    -------------------------------    ---    ---   ---
+       +rx          TEXTURE_CUBE_MAP_POSITIVE_X_EXT    -rz    -ry   rx
+       -rx          TEXTURE_CUBE_MAP_NEGATIVE_X_EXT    +rz    -ry   rx
+       +ry          TEXTURE_CUBE_MAP_POSITIVE_Y_EXT    +rx    +rz   ry
+       -ry          TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT    +rx    -rz   ry
+       +rz          TEXTURE_CUBE_MAP_POSITIVE_Z_EXT    +rx    -ry   rz
+       -rz          TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT    -rx    -ry   rz
+   */
+   const float arx = fabsf(rx);
+   const float ary = fabsf(ry);
+   const float arz = fabsf(rz);
+   unsigned face;
+   float sc, tc, ma;
+
+   if (arx > ary && arx > arz) {  /* X magnitude strictly largest */
+      if (rx >= 0.0F) {
+         face = PIPE_TEX_FACE_POS_X;
+         sc = -rz;
+         tc = -ry;
+         ma = arx;
+      }
+      else {
+         face = PIPE_TEX_FACE_NEG_X;
+         sc = rz;
+         tc = -ry;
+         ma = arx;
+      }
+   }
+   else if (ary > arx && ary > arz) {  /* Y magnitude strictly largest */
+      if (ry >= 0.0F) {
+         face = PIPE_TEX_FACE_POS_Y;
+         sc = rx;
+         tc = rz;
+         ma = ary;
+      }
+      else {
+         face = PIPE_TEX_FACE_NEG_Y;
+         sc = rx;
+         tc = -rz;
+         ma = ary;
+      }
+   }
+   else {  /* Z-major, plus every tie between axes (including all zero) */
+      if (rz > 0.0F) {  /* NOTE(review): strict '>' here; the X/Y tests use '>=' */
+         face = PIPE_TEX_FACE_POS_Z;
+         sc = rx;
+         tc = -ry;
+         ma = arz;
+      }
+      else {
+         face = PIPE_TEX_FACE_NEG_Z;
+         sc = -rx;
+         tc = -ry;
+         ma = arz;
+      }
+   }
+
+   *newS = (sc / ma + 1.0F) * 0.5F;  /* (x + 1) / 2 remaps -1..+1 onto 0..1; ma == 0 is not guarded */
+   *newT = (tc / ma + 1.0F) * 0.5F;  /* same remapping for the t coord */
+
+   return face;
+}
+
+/** Cube map lookup for four texcoord sets: (s,t,r) select a face per set, then
+ *  sample_texture4_nearest() does the fetch.  q and face_ignored are not read. */
+void
+sample_texture4_cube(vector float s, vector float t,
+                     vector float r, vector float q,
+                     uint unit, uint level, int face_ignored,
+                     vector float colors[4])
+{
+   static const vector float zero = {0.0f, 0.0f, 0.0f, 0.0f};
+   uint p, faces[4];
+   float newS[4], newT[4];
+
+   /* Compute cube face referenced by the four sets of texcoords.
+    * XXX we should SIMD-ize this.
+    */
+   for (p = 0; p < 4; p++) {      /* one iteration per texcoord set */
+      float rx = spu_extract(s, p);
+      float ry = spu_extract(t, p);
+      float rz = spu_extract(r, p);
+      faces[p] = choose_cube_face(rx, ry, rz, &newS[p], &newT[p]);
+   }
+
+   if (faces[0] == faces[1] &&
+       faces[0] == faces[2] &&
+       faces[0] == faces[3]) {
+      /* GOOD!  All four texcoords refer to the same cube face */
+      s = (vector float) {newS[0], newS[1], newS[2], newS[3]};
+      t = (vector float) {newT[0], newT[1], newT[2], newT[3]};
+      sample_texture4_nearest(s, t, zero, zero, unit, level, faces[0], colors);
+   }
+   else {
+      /* BAD!  The four texcoords refer to different faces */
+      for (p = 0; p < 4; p++) {      /* one full sampler call per texcoord set */
+         vector float c[4];  /* scratch result; only element p of each entry is kept */
+
+         sample_texture4_nearest(spu_splats(newS[p]), spu_splats(newT[p]),
+                                 zero, zero, unit, level, faces[p], c);
+
+         float red = spu_extract(c[0], p);
+         float green = spu_extract(c[1], p);
+         float blue = spu_extract(c[2], p);
+         float alpha = spu_extract(c[3], p);
+
+         colors[0] = spu_insert(red,   colors[0], p);  /* other elements of colors[] are left as they were */
+         colors[1] = spu_insert(green, colors[1], p);
+         colors[2] = spu_insert(blue,  colors[2], p);
+         colors[3] = spu_insert(alpha, colors[3], p);
+      }
+   }
+}
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index ec06a50b4a..08b891a4a8 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -39,24 +39,35 @@ invalidate_tex_cache(void);
 extern void
 sample_texture4_nearest(vector float s, vector float t,
                         vector float r, vector float q,
-                        uint unit, uint level, vector float colors[4]);
+                        uint unit, uint level, uint face,
+                        vector float colors[4]);
 
 
 extern void
 sample_texture4_bilinear(vector float s, vector float t,
                          vector float r, vector float q,
-                         uint unit, uint level, vector float colors[4]);
+                         uint unit, uint level, uint face,
+                         vector float colors[4]);
 
 extern void
 sample_texture4_bilinear_2(vector float s, vector float t,
-                         vector float r, vector float q,
-                           uint unit, uint level, vector float colors[4]);
+                           vector float r, vector float q,
+                           uint unit, uint level, uint face,
+                           vector float colors[4]);
 
 
 extern void
 sample_texture4_lod(vector float s, vector float t,
                     vector float r, vector float q,
-                    uint unit, uint level, vector float colors[4]);
+                    uint unit, uint level, uint face,
+                    vector float colors[4]);
+
+
+extern void
+sample_texture4_cube(vector float s, vector float t,
+                     vector float r, vector float q,
+                     uint unit, uint level_ignored, int face_ignored,
+                     vector float colors[4]);
 
 
 #endif /* SPU_TEXTURE_H */
-- 
cgit v1.2.3


From e42a394ed5ca00a9d0a51a0c26d4fef9959ba43c Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 14 Oct 2008 17:19:57 -0600
Subject: cell: fix incorrect parameter type

---
 src/gallium/drivers/cell/spu/spu_texture.c | 2 +-
 src/gallium/drivers/cell/spu/spu_texture.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 2570f02c73..9e25094d13 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -552,7 +552,7 @@ choose_cube_face(float rx, float ry, float rz, float *newS, float *newT)
 void
 sample_texture4_cube(vector float s, vector float t,
                      vector float r, vector float q,
-                     uint unit, uint level, int face_ignored,
+                     uint unit, uint level, uint face_ignored,
                      vector float colors[4])
 {
    static const vector float zero = {0.0f, 0.0f, 0.0f, 0.0f};
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index 08b891a4a8..387484c3ad 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -66,7 +66,7 @@ sample_texture4_lod(vector float s, vector float t,
 extern void
 sample_texture4_cube(vector float s, vector float t,
                      vector float r, vector float q,
-                     uint unit, uint level_ignored, int face_ignored,
+                     uint unit, uint level_ignored, uint face_ignored,
                      vector float colors[4]);
 
 
-- 
cgit v1.2.3


From 6c017c2c3c3649650cd0dc89a3b4946eab0e5a8c Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 14 Oct 2008 17:22:06 -0600
Subject: cell: replace FREE() with align_free()

---
 src/gallium/drivers/cell/ppu/cell_texture.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index 4c92ef154f..230e192573 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -147,7 +147,7 @@ cell_texture_release(struct pipe_screen *screen,
 
       for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
          if (ct->tiled_data[i]) {
-            FREE(ct->tiled_data[i]);
+            align_free(ct->tiled_data[i]);
          }
       }
 
-- 
cgit v1.2.3


From 41ccdde767e7aba6e8e6a9a035eacd6338c03a95 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 14 Oct 2008 17:22:40 -0600
Subject: cell: initial bits for 3D texture support

---
 src/gallium/drivers/cell/common.h              |  1 +
 src/gallium/drivers/cell/ppu/cell_state_emit.c |  2 ++
 src/gallium/drivers/cell/spu/spu_command.c     | 13 +++++++++++--
 src/gallium/drivers/cell/spu/spu_main.h        |  8 ++++----
 src/gallium/drivers/cell/spu/spu_texture.c     |  2 ++
 5 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index c1e78f4db3..b0169b8e32 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -256,6 +256,7 @@ struct cell_command_texture
    void *start[CELL_MAX_TEXTURE_LEVELS];   /**< Address in main memory */
    ushort width[CELL_MAX_TEXTURE_LEVELS];
    ushort height[CELL_MAX_TEXTURE_LEVELS];
+   ushort depth[CELL_MAX_TEXTURE_LEVELS];
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index d4a867ffcf..bb694aa107 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -216,6 +216,7 @@ cell_emit_state(struct cell_context *cell)
                texture->start[level] = cell->texture[i]->tiled_data[level];
                texture->width[level] = cell->texture[i]->base.width[level];
                texture->height[level] = cell->texture[i]->base.height[level];
+               texture->depth[level] = cell->texture[i]->base.depth[level];
             }
             texture->target = cell->texture[i]->base.target;
          }
@@ -225,6 +226,7 @@ cell_emit_state(struct cell_context *cell)
                texture->start[level] = NULL;
                texture->width[level] = 0;
                texture->height[level] = 0;
+               texture->depth[level] = 0;
             }
             texture->target = 0;
          }
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index c951fa6f31..c28677ebf8 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -59,6 +59,14 @@ static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
 
 
+static INLINE int
+align(int value, int alignment)
+{
+   return (value + alignment - 1) & ~(alignment - 1);  /* round value up to a multiple of alignment; the mask is only right for power-of-two alignments */
+}
+
+
+
 /**
  * Tell the PPU that this SPU has finished copying a buffer to
  * local store and that it may be reused by the PPU.
@@ -404,6 +412,7 @@ cmd_state_texture(const struct cell_command_texture *texture)
    for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
       uint width = texture->width[i];
       uint height = texture->height[i];
+      uint depth = texture->depth[i];
 
       DEBUG_PRINTF("  LEVEL %u: at %p  size[0] %u x %u\n", i,
              texture->start[i], texture->width[i], texture->height[i]);
@@ -411,13 +420,13 @@ cmd_state_texture(const struct cell_command_texture *texture)
       spu.texture[unit].level[i].start = texture->start[i];
       spu.texture[unit].level[i].width = width;
       spu.texture[unit].level[i].height = height;
+      spu.texture[unit].level[i].depth = depth;
 
       spu.texture[unit].level[i].tiles_per_row =
          (width + TILE_SIZE - 1) / TILE_SIZE;
 
       spu.texture[unit].level[i].bytes_per_image =
-         4 * ((width + TILE_SIZE - 1) & ~(TILE_SIZE-1))
-         * ((height + TILE_SIZE - 1) & ~(TILE_SIZE-1));
+         4 * align(width, TILE_SIZE) * align(height, TILE_SIZE) * depth;
 
       spu.texture[unit].level[i].max_s = spu_splats((int) width - 1);
       spu.texture[unit].level[i].max_t = spu_splats((int) height - 1);
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 8781041bff..eff43b870c 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -111,15 +111,15 @@ struct spu_framebuffer
 struct spu_texture_level
 {
    void *start;
-   ushort width, height;
+   ushort width, height, depth;
    ushort tiles_per_row;
    uint bytes_per_image;
    /** texcoord scale factors */
-   vector float scale_s, scale_t;
+   vector float scale_s, scale_t, scale_r;
    /** texcoord masks (if REPEAT then size-1, else ~0) */
-   vector signed int mask_s, mask_t;
+   vector signed int mask_s, mask_t, mask_r;
    /** texcoord clamp limits */
-   vector signed int max_s, max_t;
+   vector signed int max_s, max_t, max_r;
 } ALIGN16_ATTRIB;
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 9e25094d13..42eb06a362 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -50,6 +50,8 @@ invalidate_tex_cache(void)
 
       if (spu.texture[unit].target == PIPE_TEXTURE_CUBE)
          bytes *= 6;
+      else if (spu.texture[unit].target == PIPE_TEXTURE_3D)
+         bytes *= spu.texture[unit].level[lvl].depth;
 
       spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes);
    }
-- 
cgit v1.2.3