From 7888a2f82200738ac03c78d9900eb028d48725a1 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Mon, 20 Sep 2010 10:50:15 -0600
Subject: llvmpipe: fix query bug when there's no scene

---
 src/gallium/drivers/llvmpipe/lp_query.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_query.c b/src/gallium/drivers/llvmpipe/lp_query.c
index ff0e207a54..84c66dd36e 100644
--- a/src/gallium/drivers/llvmpipe/lp_query.c
+++ b/src/gallium/drivers/llvmpipe/lp_query.c
@@ -92,8 +92,9 @@ llvmpipe_get_query_result(struct pipe_context *pipe,
    int i;
 
    if (!pq->fence) {
-      assert(0);                /* query not in issued state */
-      return FALSE;
+      /* no fence because there was no scene, so the result is zero */
+      *result = 0;
+      return TRUE;
    }
 
    if (!lp_fence_signalled(pq->fence)) {
-- 
cgit v1.2.3


From 955d76c3d2004c058c326d68eddc5a06d1611a41 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Mon, 20 Sep 2010 12:52:16 -0600
Subject: llvmpipe: maintain fragment shader state for draw module

---
 src/gallium/drivers/llvmpipe/lp_state_fs.c | 12 ++++++++++++
 src/gallium/drivers/llvmpipe/lp_state_fs.h |  2 ++
 2 files changed, 14 insertions(+)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index e54dd9f0a3..fb673db6d0 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -886,6 +886,7 @@ static void *
 llvmpipe_create_fs_state(struct pipe_context *pipe,
                          const struct pipe_shader_state *templ)
 {
+   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
    struct lp_fragment_shader *shader;
    int nr_samplers;
 
@@ -902,6 +903,12 @@ llvmpipe_create_fs_state(struct pipe_context *pipe,
    /* we need to keep a local copy of the tokens */
    shader->base.tokens = tgsi_dup_tokens(templ->tokens);
 
+   shader->draw_data = draw_create_fragment_shader(llvmpipe->draw, templ);
+   if (shader->draw_data == NULL) {
+      FREE((void *) shader->base.tokens);
+      return NULL;
+   }
+
    nr_samplers = shader->info.file_max[TGSI_FILE_SAMPLER] + 1;
 
    shader->variant_key_size = Offset(struct lp_fragment_shader_variant_key,
@@ -938,6 +945,9 @@ llvmpipe_bind_fs_state(struct pipe_context *pipe, void *fs)
 
    draw_flush(llvmpipe->draw);
 
+   draw_bind_fragment_shader(llvmpipe->draw,
+                             (llvmpipe->fs ? llvmpipe->fs->draw_data : NULL));
+
    llvmpipe->fs = fs;
 
    llvmpipe->dirty |= LP_NEW_FS;
@@ -995,6 +1005,8 @@ llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
       li = next;
    }
 
+   draw_delete_fragment_shader(llvmpipe->draw, shader->draw_data);
+
    assert(shader->variants_cached == 0);
    FREE((void *) shader->base.tokens);
    FREE(shader);
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.h b/src/gallium/drivers/llvmpipe/lp_state_fs.h
index 2914e7d7ef..4999b8dca1 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.h
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.h
@@ -100,6 +100,8 @@ struct lp_fragment_shader
 
    struct lp_fs_variant_list_item variants;
 
+   struct draw_fragment_shader *draw_data;
+
    /* For debugging/profiling purposes */
    unsigned variant_key_size;
    unsigned no;
-- 
cgit v1.2.3


From ebba92875aca586b661f6547888a2ed95e70e0ff Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Mon, 20 Sep 2010 12:55:29 -0600
Subject: llvmpipe: indentation fix

---
 src/gallium/drivers/llvmpipe/lp_state_rasterizer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c b/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
index 0bad7320f3..b81c2cfd15 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
@@ -79,7 +79,7 @@ llvmpipe_bind_rasterizer_state(struct pipe_context *pipe, void *handle)
                    llvmpipe->rasterizer->point_size,
                    llvmpipe->rasterizer->point_size_per_vertex,
                    llvmpipe->rasterizer->sprite_coord_enable);
-       }
+   }
 
    llvmpipe->dirty |= LP_NEW_RASTERIZER;
 }
-- 
cgit v1.2.3


From 924c18da95bbc62492f8e54bd8273a4981a919dc Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Mon, 20 Sep 2010 13:07:59 -0600
Subject: llvmpipe: reformatting, remove trailing whitespace, etc

---
 src/gallium/drivers/llvmpipe/lp_setup_point.c | 44 ++++++++++++++-------------
 1 file changed, 23 insertions(+), 21 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c
index 5538987151..5521cbbe87 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_point.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c
@@ -52,26 +52,29 @@ struct point_info {
 /**
  * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
  */
-static void constant_coef( struct lp_setup_context *setup,
-                           struct lp_rast_triangle *point,
-                           unsigned slot,
-                           const float value,
-                           unsigned i )
+static void
+constant_coef(struct lp_setup_context *setup,
+              struct lp_rast_triangle *point,
+              unsigned slot,
+              const float value,
+              unsigned i)
 {
    point->inputs.a0[slot][i] = value;
    point->inputs.dadx[slot][i] = 0.0f;
    point->inputs.dady[slot][i] = 0.0f;
 }
 
-static void perspective_coef( struct lp_setup_context *setup,
-                              struct lp_rast_triangle *point,
-                              const struct point_info *info,
-                              unsigned slot,
-                              unsigned vert_attr,
-                              unsigned i)
+
+static void
+perspective_coef(struct lp_setup_context *setup,
+                 struct lp_rast_triangle *point,
+                 const struct point_info *info,
+                 unsigned slot,
+                 unsigned vert_attr,
+                 unsigned i)
 {
-   if (i == 0) {   
-      float dadx = FIXED_ONE / (float)info->dx12;  
+   if (i == 0) {
+      float dadx = FIXED_ONE / (float)info->dx12;
       float dady =  0.0f;
       point->inputs.dadx[slot][i] = dadx;
       point->inputs.dady[slot][i] = dady;
@@ -79,30 +82,26 @@ static void perspective_coef( struct lp_setup_context *setup,
                                   (dadx * ((float)info->v0[0][0] - setup->pixel_offset) +
                                    dady * ((float)info->v0[0][1] - setup->pixel_offset)));
    }
-
    else if (i == 1) {
-      float dadx =  0.0f; 
+      float dadx =  0.0f;
       float dady =  FIXED_ONE / (float)info->dx12;
-   
+
       point->inputs.dadx[slot][i] = dadx;
       point->inputs.dady[slot][i] = dady;
       point->inputs.a0[slot][i] = (0.5 -
                                   (dadx * ((float)info->v0[0][0] - setup->pixel_offset) +
                                    dady * ((float)info->v0[0][1] - setup->pixel_offset)));
    }
-
    else if (i == 2) {
       point->inputs.a0[slot][i] = 0.0f;
       point->inputs.dadx[slot][i] = 0.0f;
       point->inputs.dady[slot][i] = 0.0f;
    }
-      
    else if (i == 3) {
       point->inputs.a0[slot][i] = 1.0f;
       point->inputs.dadx[slot][i] = 0.0f;
       point->inputs.dady[slot][i] = 0.0f;
    }
-
 }
 
 
@@ -144,6 +143,7 @@ setup_point_fragcoord_coef(struct lp_setup_context *setup,
    }
 }
 
+
 /**
  * Compute the point->coef[] array dadx, dady, a0 values.
  */
@@ -203,6 +203,7 @@ setup_point_coefficients( struct lp_setup_context *setup,
                               fragcoord_usage_mask);
 }
 
+
 static INLINE int
 subpixel_snap(float a)
 {
@@ -322,8 +323,9 @@ try_setup_point( struct lp_setup_context *setup,
 }
 
 
-static void lp_setup_point( struct lp_setup_context *setup,
-                           const float (*v0)[4] )
+static void 
+lp_setup_point(struct lp_setup_context *setup,
+               const float (*v0)[4])
 {
    if (!try_setup_point( setup, v0 ))
    {
-- 
cgit v1.2.3


From b7a5eac1f3723a369885bad369a04c456bdf1565 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Mon, 20 Sep 2010 13:26:27 -0600
Subject: llvmpipe: clean-up, comments in setup_point_coefficient()

---
 src/gallium/drivers/llvmpipe/lp_setup_point.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c
index 5521cbbe87..fb4fb2c436 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_point.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c
@@ -152,6 +152,7 @@ setup_point_coefficients( struct lp_setup_context *setup,
                           struct lp_rast_triangle *point,
                           const struct point_info *info)
 {
+   const struct lp_fragment_shader *shader = setup->fs.current.variant->shader;
    unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ;
    unsigned slot;
 
@@ -172,12 +173,16 @@ setup_point_coefficients( struct lp_setup_context *setup,
          fragcoord_usage_mask |= usage_mask;
          break;
 
+      case LP_INTERP_LINEAR:
+         /* Sprite tex coords may use linear interpolation someday */
+         /* fall-through */
+
       case LP_INTERP_PERSPECTIVE:
-         /* For point sprite textures */        
-         if (setup->fs.current.variant->shader->info.input_semantic_name[slot] 
-             == TGSI_SEMANTIC_GENERIC) 
-         {
-            int index = setup->fs.current.variant->shader->info.input_semantic_index[slot];
+         /* check if the sprite coord flag is set for this attribute.
+          * If so, set it up so x and y vary from 0 to 1.
+          */
+         if (shader->info.input_semantic_name[slot] == TGSI_SEMANTIC_GENERIC) {
+            const int index = shader->info.input_semantic_index[slot];
             
             if (setup->sprite & (1 << index)) {
                for (i = 0; i < NUM_CHANNELS; i++)
-- 
cgit v1.2.3


From c3982c6bcdeb88f7fb1b20f8bd300db31cd7288d Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Mon, 20 Sep 2010 13:29:55 -0600
Subject: llvmpipe: rename sprite field, add sprite_coord_origin

---
 src/gallium/drivers/llvmpipe/lp_setup.c            | 6 ++++--
 src/gallium/drivers/llvmpipe/lp_setup.h            | 3 ++-
 src/gallium/drivers/llvmpipe/lp_setup_context.h    | 2 +-
 src/gallium/drivers/llvmpipe/lp_setup_point.c      | 2 +-
 src/gallium/drivers/llvmpipe/lp_state_rasterizer.c | 3 ++-
 5 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index 6674d281d1..ea7002aafc 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -490,12 +490,14 @@ void
 lp_setup_set_point_state( struct lp_setup_context *setup,
                           float point_size,                          
                           boolean point_size_per_vertex,
-                          uint sprite)
+                          uint sprite_coord_enable,
+                          uint sprite_coord_origin)
 {
    LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
 
    setup->point_size = point_size;
-   setup->sprite = sprite;
+   setup->sprite_coord_enable = sprite_coord_enable;
+   setup->sprite_coord_origin = sprite_coord_origin;
    setup->point_size_per_vertex = point_size_per_vertex;
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.h b/src/gallium/drivers/llvmpipe/lp_setup.h
index b94061b7d4..81ff43f8ad 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.h
+++ b/src/gallium/drivers/llvmpipe/lp_setup.h
@@ -107,7 +107,8 @@ void
 lp_setup_set_point_state( struct lp_setup_context *setup,
                           float point_size,                          
                           boolean point_size_per_vertex,
-                          uint sprite);
+                          uint sprite_coord_enable,
+                          uint sprite_coord_origin);
 
 void
 lp_setup_set_fs_inputs( struct lp_setup_context *setup,
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_context.h b/src/gallium/drivers/llvmpipe/lp_setup_context.h
index 80b356476a..8506ed2dc9 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_setup_context.h
@@ -73,7 +73,7 @@ struct lp_setup_context
    uint prim;
    uint vertex_size;
    uint nr_vertices;
-   uint sprite;
+   uint sprite_coord_enable, sprite_coord_origin;
    uint vertex_buffer_size;
    void *vertex_buffer;
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c
index fb4fb2c436..f8f411f4f1 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_point.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c
@@ -184,7 +184,7 @@ setup_point_coefficients( struct lp_setup_context *setup,
          if (shader->info.input_semantic_name[slot] == TGSI_SEMANTIC_GENERIC) {
             const int index = shader->info.input_semantic_index[slot];
             
-            if (setup->sprite & (1 << index)) {
+            if (setup->sprite_coord_enable & (1 << index)) {
                for (i = 0; i < NUM_CHANNELS; i++)
                   if (usage_mask & (1 << i))
                      perspective_coef(setup, point, info, slot+1, vert_attr, i);
diff --git a/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c b/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
index b81c2cfd15..dbd73812e4 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
@@ -78,7 +78,8 @@ llvmpipe_bind_rasterizer_state(struct pipe_context *pipe, void *handle)
       lp_setup_set_point_state( llvmpipe->setup,
                    llvmpipe->rasterizer->point_size,
                    llvmpipe->rasterizer->point_size_per_vertex,
-                   llvmpipe->rasterizer->sprite_coord_enable);
+                   llvmpipe->rasterizer->sprite_coord_enable,
+                   llvmpipe->rasterizer->sprite_coord_mode);
    }
 
    llvmpipe->dirty |= LP_NEW_RASTERIZER;
-- 
cgit v1.2.3


From 61fcd9aaa2bf91eb400eeb4df2ab2c7e48b3bb6c Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Mon, 20 Sep 2010 13:48:02 -0600
Subject: llvmpipe: implement sprite coord origin modes

---
 src/gallium/drivers/llvmpipe/lp_setup_point.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c
index f8f411f4f1..bb6b88069b 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_point.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c
@@ -71,7 +71,8 @@ perspective_coef(struct lp_setup_context *setup,
                  const struct point_info *info,
                  unsigned slot,
                  unsigned vert_attr,
-                 unsigned i)
+                 unsigned i,
+                 unsigned sprite_coord_origin)
 {
    if (i == 0) {
       float dadx = FIXED_ONE / (float)info->dx12;
@@ -83,14 +84,18 @@ perspective_coef(struct lp_setup_context *setup,
                                    dady * ((float)info->v0[0][1] - setup->pixel_offset)));
    }
    else if (i == 1) {
-      float dadx =  0.0f;
-      float dady =  FIXED_ONE / (float)info->dx12;
+      float dadx = 0.0f;
+      float dady = FIXED_ONE / (float)info->dx12;
+
+      if (sprite_coord_origin == PIPE_SPRITE_COORD_LOWER_LEFT) {
+         dady = -dady;
+      }
 
       point->inputs.dadx[slot][i] = dadx;
       point->inputs.dady[slot][i] = dady;
       point->inputs.a0[slot][i] = (0.5 -
-                                  (dadx * ((float)info->v0[0][0] - setup->pixel_offset) +
-                                   dady * ((float)info->v0[0][1] - setup->pixel_offset)));
+                                   (dadx * ((float)info->v0[0][0] - setup->pixel_offset) +
+                                    dady * ((float)info->v0[0][1] - setup->pixel_offset)));
    }
    else if (i == 2) {
       point->inputs.a0[slot][i] = 0.0f;
@@ -187,7 +192,8 @@ setup_point_coefficients( struct lp_setup_context *setup,
             if (setup->sprite_coord_enable & (1 << index)) {
                for (i = 0; i < NUM_CHANNELS; i++)
                   if (usage_mask & (1 << i))
-                     perspective_coef(setup, point, info, slot+1, vert_attr, i);
+                     perspective_coef(setup, point, info, slot+1, vert_attr, i,
+                                      setup->sprite_coord_origin);
                fragcoord_usage_mask |= TGSI_WRITEMASK_W;
                break;                     
             }
-- 
cgit v1.2.3


From 1662c317032cf280701d7e55b028b7f0dc8afc65 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Mon, 20 Sep 2010 15:33:49 -0600
Subject: llvmpipe: check bitshift against PIPE_MAX_SHADER_OUTPUTS

---
 src/gallium/drivers/llvmpipe/lp_setup_point.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c
index bb6b88069b..774a3c80da 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_point.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c
@@ -188,8 +188,11 @@ setup_point_coefficients( struct lp_setup_context *setup,
           */
          if (shader->info.input_semantic_name[slot] == TGSI_SEMANTIC_GENERIC) {
             const int index = shader->info.input_semantic_index[slot];
-            
-            if (setup->sprite_coord_enable & (1 << index)) {
+            /* Note that sprite_coord_enable is a bitfield of
+             * PIPE_MAX_SHADER_OUTPUTS bits.
+             */
+            if (index < PIPE_MAX_SHADER_OUTPUTS &&
+                (setup->sprite_coord_enable & (1 << index))) {
                for (i = 0; i < NUM_CHANNELS; i++)
                   if (usage_mask & (1 << i))
                      perspective_coef(setup, point, info, slot+1, vert_attr, i,
-- 
cgit v1.2.3


From 2ec86793bd43fe15d8f79d04e32d6c524e8ad844 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Tue, 21 Sep 2010 14:28:51 +0100
Subject: llvmpipe: fix flatshading in new line code

Calculate interpolants before rearranging the vertices.
---
 src/gallium/drivers/llvmpipe/lp_setup_line.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c
index 9f090d1992..829eb8a5a0 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_line.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
@@ -292,6 +292,7 @@ try_setup_line( struct lp_setup_context *setup,
    float x2diff;
    float y2diff;
    float dx, dy;
+   float area;
 
    boolean draw_start;
    boolean draw_end;
@@ -311,6 +312,18 @@ try_setup_line( struct lp_setup_context *setup,
 
    dx = v1[0][0] - v2[0][0];
    dy = v1[0][1] - v2[0][1];
+   area = (dx * dx  + dy * dy);
+   if (area == 0) {
+      LP_COUNT(nr_culled_tris);
+      return TRUE;
+   }
+
+   info.oneoverarea = 1.0f / area;
+   info.dx = dx;
+   info.dy = dy;
+   info.v1 = v1;
+   info.v2 = v2;
+
   
    /* X-MAJOR LINE */
    if (fabsf(dx) >= fabsf(dy)) {
@@ -573,12 +586,6 @@ try_setup_line( struct lp_setup_context *setup,
    line->plane[3].dcdx = y[3] - y[0];
 
 
-   info.oneoverarea = 1.0f / (dx * dx  + dy * dy);    
-   info.dx = dx;
-   info.dy = dy;
-   info.v1 = v1;
-   info.v2 = v2;
-
    /* Setup parameter interpolants:
     */
    setup_line_coefficients( setup, line, &info); 
-- 
cgit v1.2.3


From 388c94195af41c2084f4882ab414c86b575818fb Mon Sep 17 00:00:00 2001
From: José Fonseca <jfonseca@vmware.com>
Date: Tue, 21 Sep 2010 17:50:30 +0100
Subject: llvmpipe: Describe how to profile llvmpipe.

---
 src/gallium/drivers/llvmpipe/README | 38 +++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/README b/src/gallium/drivers/llvmpipe/README
index 8b5539d2c5..ec30d4d708 100644
--- a/src/gallium/drivers/llvmpipe/README
+++ b/src/gallium/drivers/llvmpipe/README
@@ -131,6 +131,44 @@ replacing the native ICD driver, but it's quite an advanced usage, so if you
 need to ask, don't even try it.
 
 
+Profiling
+=========
+
+To profile llvmpipe you should pass the options
+
+  scons debug=no profile=yes <same-as-before>
+
+This will ensure that frame pointers are used both in C and JIT functions, and
+that no tail call optimizations are done by gcc.
+
+
+To better profile JIT code you'll need to build LLVM with oprofile integration.
+
+  source_dir=$PWD/llvm-2.6
+  build_dir=$source_dir/build/profile
+  install_dir=$source_dir-profile
+
+  mkdir -p "$build_dir"
+  cd "$build_dir" && \
+  $source_dir/configure \
+      --prefix=$install_dir \
+      --enable-optimized \
+      --disable-profiling \
+      --enable-targets=host-only \
+      --with-oprofile
+
+  make -C "$build_dir"
+  make -C "$build_dir" install
+
+  find "$install_dir/lib" -iname '*.a' -print0 | xargs -0 strip --strip-debug
+
+Then you should define
+
+  export LLVM=/path/to/llvm-2.6-profile
+
+and rebuild.
+
+
 Unit testing
 ============
 
-- 
cgit v1.2.3


From b556bb7c44236a9fae54f58cc03e1d05eaa2124f Mon Sep 17 00:00:00 2001
From: José Fonseca <jfonseca@vmware.com>
Date: Tue, 21 Sep 2010 17:51:06 +0100
Subject: llvmpipe: When failing free fs shader too.

---
 src/gallium/drivers/llvmpipe/lp_state_fs.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index fb673db6d0..4277c47eeb 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -906,6 +906,7 @@ llvmpipe_create_fs_state(struct pipe_context *pipe,
    shader->draw_data = draw_create_fragment_shader(llvmpipe->draw, templ);
    if (shader->draw_data == NULL) {
       FREE((void *) shader->base.tokens);
+      FREE(shader);
       return NULL;
    }
 
-- 
cgit v1.2.3


From 9a8e9f4595b66ea094b293da1afcded8f06ab3d6 Mon Sep 17 00:00:00 2001
From: José Fonseca <jfonseca@vmware.com>
Date: Wed, 22 Sep 2010 14:48:28 +0100
Subject: llvmpipe: Special case complementary and identical blend factors in
 SoA.

One multiplication instead of two.

Also fix floating point random number generation and verification.

TODO: Do the same for AoS blending.
---
 src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c |  3 --
 src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c | 46 +++++++++++++++++++++++--
 src/gallium/drivers/llvmpipe/lp_test_blend.c    | 41 ++++++++--------------
 src/gallium/drivers/llvmpipe/lp_test_main.c     | 33 +++++++++++-------
 4 files changed, 79 insertions(+), 44 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
index b5924cbb7d..d1c9b88f9b 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
@@ -320,9 +320,6 @@ lp_build_blend_aos(LLVMBuilderRef builder,
    if(!blend->rt[rt].blend_enable)
       return src;
 
-   /* It makes no sense to blend unless values are normalized */
-   assert(type.norm);
-
    /* Setup build context */
    memset(&bld, 0, sizeof bld);
    lp_build_context_init(&bld.base, builder, type);
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c
index b9c7a6ceed..30d261e979 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_soa.c
@@ -1,6 +1,6 @@
 /**************************************************************************
  *
- * Copyright 2009 VMware, Inc.
+ * Copyright 2009-2010 VMware, Inc.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -195,6 +195,13 @@ lp_build_blend_soa_factor(struct lp_build_blend_soa_context *bld,
 }
 
 
+static boolean
+lp_build_blend_factor_complementary(unsigned src_factor, unsigned dst_factor)
+{
+   return dst_factor == (src_factor ^ 0x10);
+}
+
+
 /**
  * Generate blend code in SOA mode.
  * \param rt  render target index (to index the blend / colormask state)
@@ -243,8 +250,41 @@ lp_build_blend_soa(LLVMBuilderRef builder,
             unsigned func = i < 3 ? blend->rt[rt].rgb_func : blend->rt[rt].alpha_func;
             boolean func_commutative = lp_build_blend_func_commutative(func);
 
-            /* It makes no sense to blend unless values are normalized */
-            assert(type.norm);
+	    if (func == PIPE_BLEND_ADD &&
+		lp_build_blend_factor_complementary(src_factor, dst_factor) && 0) {
+               /*
+                * Special case linear interpolation, (i.e., complementary factors).
+                */
+
+	       LLVMValueRef weight;
+	       if (src_factor < dst_factor) {
+		  weight = lp_build_blend_soa_factor(&bld, src_factor, i);
+		  res[i] = lp_build_lerp(&bld.base, weight, dst[i], src[i]);
+	       } else {
+		  weight = lp_build_blend_soa_factor(&bld, dst_factor, i);
+		  res[i] = lp_build_lerp(&bld.base, weight, src[i], dst[i]);
+	       }
+	       continue;
+	    }
+
+	    if ((func == PIPE_BLEND_ADD ||
+                 func == PIPE_BLEND_SUBTRACT ||
+                 func == PIPE_BLEND_REVERSE_SUBTRACT) &&
+		src_factor == dst_factor &&
+                type.floating) {
+               /*
+                * Special common factor.
+                *
+                * XXX: Only for floating points for now, since saturation will
+                * cause different results.
+                */
+
+	       LLVMValueRef factor;
+               factor = lp_build_blend_soa_factor(&bld, src_factor, i);
+               res[i] = lp_build_blend_func(&bld.base, func, src[i], dst[i]);
+               res[i] = lp_build_mul(&bld.base, res[i], factor);
+	       continue;
+	    }
 
             /*
              * Compute src/dst factors.
diff --git a/src/gallium/drivers/llvmpipe/lp_test_blend.c b/src/gallium/drivers/llvmpipe/lp_test_blend.c
index d0389f0cb0..8b6b5e1298 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_blend.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_blend.c
@@ -243,19 +243,6 @@ add_blend_test(LLVMModuleRef module,
 }
 
 
-/** Add and limit result to ceiling of 1.0 */
-#define ADD_SAT(R, A, B) \
-do { \
-   R = (A) + (B);  if (R > 1.0f) R = 1.0f; \
-} while (0)
-
-/** Subtract and limit result to floor of 0.0 */
-#define SUB_SAT(R, A, B) \
-do { \
-   R = (A) - (B);  if (R < 0.0f) R = 0.0f; \
-} while (0)
-
-
 static void
 compute_blend_ref_term(unsigned rgb_factor,
                        unsigned alpha_factor,
@@ -423,19 +410,19 @@ compute_blend_ref(const struct pipe_blend_state *blend,
     */
    switch (blend->rt[0].rgb_func) {
    case PIPE_BLEND_ADD:
-      ADD_SAT(res[0], src_term[0], dst_term[0]); /* R */
-      ADD_SAT(res[1], src_term[1], dst_term[1]); /* G */
-      ADD_SAT(res[2], src_term[2], dst_term[2]); /* B */
+      res[0] = src_term[0] + dst_term[0]; /* R */
+      res[1] = src_term[1] + dst_term[1]; /* G */
+      res[2] = src_term[2] + dst_term[2]; /* B */
       break;
    case PIPE_BLEND_SUBTRACT:
-      SUB_SAT(res[0], src_term[0], dst_term[0]); /* R */
-      SUB_SAT(res[1], src_term[1], dst_term[1]); /* G */
-      SUB_SAT(res[2], src_term[2], dst_term[2]); /* B */
+      res[0] = src_term[0] - dst_term[0]; /* R */
+      res[1] = src_term[1] - dst_term[1]; /* G */
+      res[2] = src_term[2] - dst_term[2]; /* B */
       break;
    case PIPE_BLEND_REVERSE_SUBTRACT:
-      SUB_SAT(res[0], dst_term[0], src_term[0]); /* R */
-      SUB_SAT(res[1], dst_term[1], src_term[1]); /* G */
-      SUB_SAT(res[2], dst_term[2], src_term[2]); /* B */
+      res[0] = dst_term[0] - src_term[0]; /* R */
+      res[1] = dst_term[1] - src_term[1]; /* G */
+      res[2] = dst_term[2] - src_term[2]; /* B */
       break;
    case PIPE_BLEND_MIN:
       res[0] = MIN2(src_term[0], dst_term[0]); /* R */
@@ -456,13 +443,13 @@ compute_blend_ref(const struct pipe_blend_state *blend,
     */
    switch (blend->rt[0].alpha_func) {
    case PIPE_BLEND_ADD:
-      ADD_SAT(res[3], src_term[3], dst_term[3]); /* A */
+      res[3] = src_term[3] + dst_term[3]; /* A */
       break;
    case PIPE_BLEND_SUBTRACT:
-      SUB_SAT(res[3], src_term[3], dst_term[3]); /* A */
+      res[3] = src_term[3] - dst_term[3]; /* A */
       break;
    case PIPE_BLEND_REVERSE_SUBTRACT:
-      SUB_SAT(res[3], dst_term[3], src_term[3]); /* A */
+      res[3] = dst_term[3] - src_term[3]; /* A */
       break;
    case PIPE_BLEND_MIN:
       res[3] = MIN2(src_term[3], dst_term[3]); /* A */
@@ -676,6 +663,8 @@ test_one(unsigned verbose,
                fprintf(stderr, "  Ref%c: ", channel);
                dump_vec(stderr, type, ref + j*stride);
                fprintf(stderr, "\n");
+
+               fprintf(stderr, "\n");
             }
          }
       }
@@ -773,7 +762,7 @@ blend_funcs[] = {
 
 const struct lp_type blend_types[] = {
    /* float, fixed,  sign,  norm, width, len */
-   {   TRUE, FALSE, FALSE,  TRUE,    32,   4 }, /* f32 x 4 */
+   {   TRUE, FALSE,  TRUE, FALSE,    32,   4 }, /* f32 x 4 */
    {  FALSE, FALSE, FALSE,  TRUE,     8,  16 }, /* u8n x 16 */
 };
 
diff --git a/src/gallium/drivers/llvmpipe/lp_test_main.c b/src/gallium/drivers/llvmpipe/lp_test_main.c
index 7bbbc61d4c..7a0d06ae2c 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_main.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_main.c
@@ -205,16 +205,19 @@ random_elem(struct lp_type type, void *dst, unsigned index)
    assert(index < type.length);
    value = (double)rand()/(double)RAND_MAX;
    if(!type.norm) {
-      unsigned long long mask;
-      if (type.floating)
-         mask = ~(unsigned long long)0;
-      else if (type.fixed)
-         mask = ((unsigned long long)1 << (type.width / 2)) - 1;
-      else if (type.sign)
-         mask = ((unsigned long long)1 << (type.width - 1)) - 1;
-      else
-         mask = ((unsigned long long)1 << type.width) - 1;
-      value += (double)(mask & rand());
+      if (type.floating) {
+         value *= 2.0;
+      }
+      else {
+         unsigned long long mask;
+	 if (type.fixed)
+            mask = ((unsigned long long)1 << (type.width / 2)) - 1;
+         else if (type.sign)
+            mask = ((unsigned long long)1 << (type.width - 1)) - 1;
+         else
+            mask = ((unsigned long long)1 << type.width) - 1;
+         value += (double)(mask & rand());
+      }
    }
    if(!type.sign)
       if(rand() & 1)
@@ -261,12 +264,18 @@ boolean
 compare_vec_with_eps(struct lp_type type, const void *res, const void *ref, double eps)
 {
    unsigned i;
+   eps *= type.floating ? 8.0 : 2.0;
    for (i = 0; i < type.length; ++i) {
       double res_elem = read_elem(type, res, i);
       double ref_elem = read_elem(type, ref, i);
-      double delta = fabs(res_elem - ref_elem);
-      if(delta >= 2.0*eps)
+      double delta = res_elem - ref_elem;
+      if (ref_elem < -1.0 || ref_elem > 1.0) {
+	 delta /= ref_elem;
+      }
+      delta = fabs(delta);
+      if (delta >= eps) {
          return FALSE;
+      }
    }
 
    return TRUE;
-- 
cgit v1.2.3


From 87267c71f67d02fcdd59a899fd0eea6d64e523b5 Mon Sep 17 00:00:00 2001
From: José Fonseca <jfonseca@vmware.com>
Date: Wed, 22 Sep 2010 15:02:10 +0100
Subject: llvmpipe: Make rgb/alpha blend func/factors match, when there is no
 alpha.

Makes AoS blending easier, and state more canonical.
---
 src/gallium/drivers/llvmpipe/lp_state_fs.c | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 4277c47eeb..f0a15e11b9 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -1049,7 +1049,7 @@ llvmpipe_set_constant_buffer(struct pipe_context *pipe,
  * Return the blend factor equivalent to a destination alpha of one.
  */
 static INLINE unsigned
-force_dst_alpha_one(unsigned factor, boolean alpha)
+force_dst_alpha_one(unsigned factor)
 {
    switch(factor) {
    case PIPE_BLENDFACTOR_DST_ALPHA:
@@ -1060,15 +1060,6 @@ force_dst_alpha_one(unsigned factor, boolean alpha)
       return PIPE_BLENDFACTOR_ZERO;
    }
 
-   if (alpha) {
-      switch(factor) {
-      case PIPE_BLENDFACTOR_DST_COLOR:
-         return PIPE_BLENDFACTOR_ONE;
-      case PIPE_BLENDFACTOR_INV_DST_COLOR:
-         return PIPE_BLENDFACTOR_ZERO;
-      }
-   }
-
    return factor;
 }
 
@@ -1145,12 +1136,15 @@ make_variant_key(struct llvmpipe_context *lp,
        *
        * TODO: This should be generalized to all channels for better
        * performance, but only alpha causes correctness issues.
+       *
+       * Also, force rgb/alpha func/factors to match, to make AoS blending easier.
        */
       if (format_desc->swizzle[3] > UTIL_FORMAT_SWIZZLE_W) {
-         blend_rt->rgb_src_factor = force_dst_alpha_one(blend_rt->rgb_src_factor, FALSE);
-         blend_rt->rgb_dst_factor = force_dst_alpha_one(blend_rt->rgb_dst_factor, FALSE);
-         blend_rt->alpha_src_factor = force_dst_alpha_one(blend_rt->alpha_src_factor, TRUE);
-         blend_rt->alpha_dst_factor = force_dst_alpha_one(blend_rt->alpha_dst_factor, TRUE);
+         blend_rt->rgb_src_factor   = force_dst_alpha_one(blend_rt->rgb_src_factor);
+         blend_rt->rgb_dst_factor   = force_dst_alpha_one(blend_rt->rgb_dst_factor);
+         blend_rt->alpha_func       = blend_rt->rgb_func;
+         blend_rt->alpha_src_factor = blend_rt->rgb_src_factor;
+         blend_rt->alpha_dst_factor = blend_rt->rgb_dst_factor;
       }
    }
 
-- 
cgit v1.2.3


From 516ac2bd50ad1e71bd2a359d247532d9f18bcf99 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Wed, 22 Sep 2010 11:20:48 -0600
Subject: llvmpipe: fix sprite texcoord setup for non-projective texturing

Normally the Mesa state tracker uses TXP instructions for texturing.
But if a fragment shader uses texture2D() that's a TEX instruction.
In that case we were incorrectly computing the texcoord coefficients
in the point sprite setup code.  Some new comments in the code explain
things.
---
 src/gallium/drivers/llvmpipe/lp_setup_point.c | 85 ++++++++++++++++++---------
 1 file changed, 58 insertions(+), 27 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c
index 774a3c80da..2c354d1d0e 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_point.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c
@@ -65,47 +65,77 @@ constant_coef(struct lp_setup_context *setup,
 }
 
 
+/**
+ * Setup automatic texcoord coefficients (for sprite rendering).
+ * \param slot  the vertex attribute slot to setup
+ * \param i  the attribute channel in [0,3]
+ * \param sprite_coord_origin  one of PIPE_SPRITE_COORD_x
+ * \param perspective_proj  will the TEX instruction do a divide by Q?
+ */
 static void
-perspective_coef(struct lp_setup_context *setup,
-                 struct lp_rast_triangle *point,
-                 const struct point_info *info,
-                 unsigned slot,
-                 unsigned vert_attr,
-                 unsigned i,
-                 unsigned sprite_coord_origin)
+texcoord_coef(struct lp_setup_context *setup,
+              struct lp_rast_triangle *point,
+              const struct point_info *info,
+              unsigned slot,
+              unsigned i,
+              unsigned sprite_coord_origin,
+              boolean perspective_proj)
 {
+   assert(i < 4);
+
    if (i == 0) {
       float dadx = FIXED_ONE / (float)info->dx12;
       float dady =  0.0f;
-      point->inputs.dadx[slot][i] = dadx;
-      point->inputs.dady[slot][i] = dady;
-      point->inputs.a0[slot][i] = (0.5 -
-                                  (dadx * ((float)info->v0[0][0] - setup->pixel_offset) +
-                                   dady * ((float)info->v0[0][1] - setup->pixel_offset)));
+      float x0 = info->v0[0][0] - setup->pixel_offset;
+      float y0 = info->v0[0][1] - setup->pixel_offset;
+
+      point->inputs.dadx[slot][0] = dadx;
+      point->inputs.dady[slot][0] = dady;
+      point->inputs.a0[slot][0] = 0.5 - (dadx * x0 + dady * y0);
+
+      if (!perspective_proj) {
+         /* Divide coefficients by vertex.w here.
+          *
+          * It would be clearer to always multiply by w0 above and
+          * then divide it out for perspective projection here, but
+          * doing it this way involves less algebra.
+          */
+         float w0 = info->v0[0][3];
+         point->inputs.dadx[slot][0] *= w0;
+         point->inputs.dady[slot][0] *= w0;
+         point->inputs.a0[slot][0] *= w0;
+      }
    }
    else if (i == 1) {
       float dadx = 0.0f;
       float dady = FIXED_ONE / (float)info->dx12;
+      float x0 = info->v0[0][0] - setup->pixel_offset;
+      float y0 = info->v0[0][1] - setup->pixel_offset;
 
       if (sprite_coord_origin == PIPE_SPRITE_COORD_LOWER_LEFT) {
          dady = -dady;
       }
 
-      point->inputs.dadx[slot][i] = dadx;
-      point->inputs.dady[slot][i] = dady;
-      point->inputs.a0[slot][i] = (0.5 -
-                                   (dadx * ((float)info->v0[0][0] - setup->pixel_offset) +
-                                    dady * ((float)info->v0[0][1] - setup->pixel_offset)));
+      point->inputs.dadx[slot][1] = dadx;
+      point->inputs.dady[slot][1] = dady;
+      point->inputs.a0[slot][1] = 0.5 - (dadx * x0 + dady * y0);
+
+      if (!perspective_proj) {
+         float w0 = info->v0[0][3];
+         point->inputs.dadx[slot][1] *= w0;
+         point->inputs.dady[slot][1] *= w0;
+         point->inputs.a0[slot][1] *= w0;
+      }
    }
    else if (i == 2) {
-      point->inputs.a0[slot][i] = 0.0f;
-      point->inputs.dadx[slot][i] = 0.0f;
-      point->inputs.dady[slot][i] = 0.0f;
+      point->inputs.a0[slot][2] = 0.0f;
+      point->inputs.dadx[slot][2] = 0.0f;
+      point->inputs.dady[slot][2] = 0.0f;
    }
-   else if (i == 3) {
-      point->inputs.a0[slot][i] = 1.0f;
-      point->inputs.dadx[slot][i] = 0.0f;
-      point->inputs.dady[slot][i] = 0.0f;
+   else {
+      point->inputs.a0[slot][3] = 1.0f;
+      point->inputs.dadx[slot][3] = 0.0f;
+      point->inputs.dady[slot][3] = 0.0f;
    }
 }
 
@@ -184,7 +214,7 @@ setup_point_coefficients( struct lp_setup_context *setup,
 
       case LP_INTERP_PERSPECTIVE:
          /* check if the sprite coord flag is set for this attribute.
-          * If so, set it up so it up so x any y vary from 0 to 1.
+          * If so, set it up so x and y vary from 0 to 1.
           */
          if (shader->info.input_semantic_name[slot] == TGSI_SEMANTIC_GENERIC) {
             const int index = shader->info.input_semantic_index[slot];
@@ -195,8 +225,9 @@ setup_point_coefficients( struct lp_setup_context *setup,
                 (setup->sprite_coord_enable & (1 << index))) {
                for (i = 0; i < NUM_CHANNELS; i++)
                   if (usage_mask & (1 << i))
-                     perspective_coef(setup, point, info, slot+1, vert_attr, i,
-                                      setup->sprite_coord_origin);
+                     texcoord_coef(setup, point, info, slot + 1, i,
+                                   setup->sprite_coord_origin,
+                                   (usage_mask & TGSI_WRITEMASK_W));
                fragcoord_usage_mask |= TGSI_WRITEMASK_W;
                break;                     
             }
-- 
cgit v1.2.3


From 61b7da074e2faebf03d3dfc30e910ee1367bcd5a Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 23 Sep 2010 18:18:40 -0600
Subject: llvmpipe: make min/max lod and lod bias dynamic state

Before, changing any of these sampler values triggered generation
of new JIT code.  Added a new flag for the special case of
min_lod == max_lod which is hit during auto mipmap generation.
---
 src/gallium/auxiliary/gallivm/lp_bld_sample.c     | 35 ++++++++++++-----------
 src/gallium/auxiliary/gallivm/lp_bld_sample.h     | 18 +++++++++++-
 src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c |  2 +-
 src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c |  2 +-
 src/gallium/drivers/llvmpipe/lp_jit.c             | 15 ++++++++++
 src/gallium/drivers/llvmpipe/lp_jit.h             |  7 +++++
 src/gallium/drivers/llvmpipe/lp_setup.c           |  8 +++++-
 src/gallium/drivers/llvmpipe/lp_setup.h           |  3 +-
 src/gallium/drivers/llvmpipe/lp_state_derived.c   |  6 ++--
 src/gallium/drivers/llvmpipe/lp_tex_sample.c      |  7 +++++
 10 files changed, 79 insertions(+), 24 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index e89ee7c230..caf1c7e865 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -99,21 +99,21 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
       state->min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
    }
 
+   /* If min_lod == max_lod we can greatly simplify mipmap selection.
+    * This is a case that occurs during automatic mipmap generation.
+    */
+   if (sampler->min_lod == sampler->max_lod) {
+      state->min_max_lod_equal = 1;
+      state->min_max_lod = sampler->min_lod;
+   }
+
    state->compare_mode      = sampler->compare_mode;
    if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE) {
       state->compare_func   = sampler->compare_func;
    }
 
    state->normalized_coords = sampler->normalized_coords;
-   state->lod_bias          = sampler->lod_bias;
-   if (!view->last_level &&
-       sampler->min_img_filter == sampler->mag_img_filter) {
-      state->min_lod        = 0.0f;
-      state->max_lod        = 0.0f;
-   } else {
-      state->min_lod        = MAX2(sampler->min_lod, 0.0f);
-      state->max_lod        = sampler->max_lod;
-   }
+
    state->border_color[0]   = sampler->border_color[0];
    state->border_color[1]   = sampler->border_color[1];
    state->border_color[2]   = sampler->border_color[2];
@@ -140,6 +140,7 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
  */
 LLVMValueRef
 lp_build_lod_selector(struct lp_build_sample_context *bld,
+                      unsigned unit,
                       const LLVMValueRef ddx[4],
                       const LLVMValueRef ddy[4],
                       LLVMValueRef lod_bias, /* optional */
@@ -149,20 +150,20 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
                       LLVMValueRef depth)
 
 {
-   if (bld->static_state->min_lod == bld->static_state->max_lod) {
+   if (bld->static_state->min_max_lod_equal) {
       /* User is forcing sampling from a particular mipmap level.
        * This is hit during mipmap generation.
        */
-      return LLVMConstReal(LLVMFloatType(), bld->static_state->min_lod);
+      return LLVMConstReal(LLVMFloatType(), bld->static_state->min_max_lod);
    }
    else {
       struct lp_build_context *float_bld = &bld->float_bld;
-      LLVMValueRef sampler_lod_bias = LLVMConstReal(LLVMFloatType(),
-                                                    bld->static_state->lod_bias);
-      LLVMValueRef min_lod = LLVMConstReal(LLVMFloatType(),
-                                           bld->static_state->min_lod);
-      LLVMValueRef max_lod = LLVMConstReal(LLVMFloatType(),
-                                           bld->static_state->max_lod);
+      LLVMValueRef sampler_lod_bias =
+         bld->dynamic_state->lod_bias(bld->dynamic_state, bld->builder, unit);
+      LLVMValueRef min_lod =
+         bld->dynamic_state->min_lod(bld->dynamic_state, bld->builder, unit);
+      LLVMValueRef max_lod =
+         bld->dynamic_state->max_lod(bld->dynamic_state, bld->builder, unit);
       LLVMValueRef index0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
       LLVMValueRef lod;
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index 8b042d5242..661f35f6de 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -82,8 +82,9 @@ struct lp_sampler_static_state
    unsigned compare_mode:1;
    unsigned compare_func:3;
    unsigned normalized_coords:1;
-   float lod_bias, min_lod, max_lod;
    float border_color[4];
+   unsigned min_max_lod_equal:1;  /**< min_lod == max_lod ? */
+   float min_max_lod;             /**< only valid when min_max_lod_equal=1 */
 
    /* Aero hacks */
    unsigned force_nearest_s:1;
@@ -143,6 +144,20 @@ struct lp_sampler_dynamic_state
                 LLVMBuilderRef builder,
                 unsigned unit);
 
+   /** Obtain texture min lod */
+   LLVMValueRef
+   (*min_lod)(const struct lp_sampler_dynamic_state *state,
+              LLVMBuilderRef builder, unsigned unit);
+
+   /** Obtain texture max lod */
+   LLVMValueRef
+   (*max_lod)(const struct lp_sampler_dynamic_state *state,
+              LLVMBuilderRef builder, unsigned unit);
+
+   /** Obtain texture lod bias */
+   LLVMValueRef
+   (*lod_bias)(const struct lp_sampler_dynamic_state *state,
+               LLVMBuilderRef builder, unsigned unit);
 };
 
 
@@ -248,6 +263,7 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
 
 LLVMValueRef
 lp_build_lod_selector(struct lp_build_sample_context *bld,
+                      unsigned unit,
                       const LLVMValueRef ddx[4],
                       const LLVMValueRef ddy[4],
                       LLVMValueRef lod_bias, /* optional */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
index 7e064900e7..000d4938a0 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
@@ -970,7 +970,7 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
       /* Need to compute lod either to choose mipmap levels or to
        * distinguish between minification/magnification with one mipmap level.
        */
-      lod = lp_build_lod_selector(bld, ddx, ddy,
+      lod = lp_build_lod_selector(bld, unit, ddx, ddy,
                                   lod_bias, explicit_lod,
                                   width, height, depth);
    }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 91fab18e4e..cbae1188a5 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -937,7 +937,7 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
       /* Need to compute lod either to choose mipmap levels or to
        * distinguish between minification/magnification with one mipmap level.
        */
-      lod = lp_build_lod_selector(bld, ddx, ddy,
+      lod = lp_build_lod_selector(bld, unit, ddx, ddy,
                                   lod_bias, explicit_lod,
                                   width, height, depth);
    }
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c
index 8e6dfb293d..4c7089892e 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.c
+++ b/src/gallium/drivers/llvmpipe/lp_jit.c
@@ -65,6 +65,11 @@ lp_jit_init_globals(struct llvmpipe_screen *screen)
          LLVMArrayType(LLVMPointerType(LLVMInt8Type(), 0),
                        LP_MAX_TEXTURE_LEVELS);
 
+      elem_types[LP_JIT_TEXTURE_MIN_LOD] = LLVMFloatType();
+      elem_types[LP_JIT_TEXTURE_MAX_LOD] = LLVMFloatType();
+      elem_types[LP_JIT_TEXTURE_LOD_BIAS] = LLVMFloatType();
+
+
       texture_type = LLVMStructType(elem_types, Elements(elem_types), 0);
 
       LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, width,
@@ -88,6 +93,16 @@ lp_jit_init_globals(struct llvmpipe_screen *screen)
       LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, data,
                              screen->target, texture_type,
                              LP_JIT_TEXTURE_DATA);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, min_lod,
+                             screen->target, texture_type,
+                             LP_JIT_TEXTURE_MIN_LOD);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, max_lod,
+                             screen->target, texture_type,
+                             LP_JIT_TEXTURE_MAX_LOD);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, lod_bias,
+                             screen->target, texture_type,
+                             LP_JIT_TEXTURE_LOD_BIAS);
+
       LP_CHECK_STRUCT_SIZE(struct lp_jit_texture,
                            screen->target, texture_type);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h
index c94189413a..e94d758e20 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.h
+++ b/src/gallium/drivers/llvmpipe/lp_jit.h
@@ -54,6 +54,10 @@ struct lp_jit_texture
    uint32_t row_stride[LP_MAX_TEXTURE_LEVELS];
    uint32_t img_stride[LP_MAX_TEXTURE_LEVELS];
    const void *data[LP_MAX_TEXTURE_LEVELS];
+   /* sampler state, actually */
+   float min_lod;
+   float max_lod;
+   float lod_bias;
 };
 
 
@@ -65,6 +69,9 @@ enum {
    LP_JIT_TEXTURE_ROW_STRIDE,
    LP_JIT_TEXTURE_IMG_STRIDE,
    LP_JIT_TEXTURE_DATA,
+   LP_JIT_TEXTURE_MIN_LOD,
+   LP_JIT_TEXTURE_MAX_LOD,
+   LP_JIT_TEXTURE_LOD_BIAS,
    LP_JIT_TEXTURE_NUM_FIELDS  /* number of fields above */
 };
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index ea7002aafc..28d202bd65 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -617,7 +617,8 @@ lp_setup_set_vertex_info( struct lp_setup_context *setup,
 void
 lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup,
                                     unsigned num,
-                                    struct pipe_sampler_view **views)
+                                    struct pipe_sampler_view **views,
+                                    const struct pipe_sampler_state **samplers)
 {
    unsigned i;
 
@@ -638,6 +639,11 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup,
          jit_tex->depth = tex->depth0;
          jit_tex->last_level = tex->last_level;
 
+         /* sampler state */
+         jit_tex->min_lod = samplers[i]->min_lod;
+         jit_tex->max_lod = samplers[i]->max_lod;
+         jit_tex->lod_bias = samplers[i]->lod_bias;
+
          /* We're referencing the texture's internal data, so save a
           * reference to it.
           */
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.h b/src/gallium/drivers/llvmpipe/lp_setup.h
index 81ff43f8ad..868bd3ad2f 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.h
+++ b/src/gallium/drivers/llvmpipe/lp_setup.h
@@ -143,7 +143,8 @@ lp_setup_set_scissor( struct lp_setup_context *setup,
 void
 lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup,
                                     unsigned num,
-                                    struct pipe_sampler_view **views);
+                                    struct pipe_sampler_view **views,
+                                    const struct pipe_sampler_state **samplers);
 
 unsigned
 lp_setup_is_resource_referenced( const struct lp_setup_context *setup,
diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c
index edd723f65f..d2be22d7fc 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_derived.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c
@@ -208,10 +208,12 @@ void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe )
       lp_setup_set_fs_constants(llvmpipe->setup, 
                                 llvmpipe->constants[PIPE_SHADER_FRAGMENT][0]);
 
-   if (llvmpipe->dirty & LP_NEW_SAMPLER_VIEW)
+   if (llvmpipe->dirty & (LP_NEW_SAMPLER_VIEW |
+                          LP_NEW_SAMPLER))
       lp_setup_set_fragment_sampler_views(llvmpipe->setup,
                                           llvmpipe->num_fragment_sampler_views,
-                                          llvmpipe->fragment_sampler_views);
+                                          llvmpipe->fragment_sampler_views,
+                                          llvmpipe->sampler);
 
    llvmpipe->dirty = 0;
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample.c b/src/gallium/drivers/llvmpipe/lp_tex_sample.c
index 4e026cc8ff..151fe93cfb 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_sample.c
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample.c
@@ -151,6 +151,9 @@ LP_LLVM_TEXTURE_MEMBER(last_level, LP_JIT_TEXTURE_LAST_LEVEL, TRUE)
 LP_LLVM_TEXTURE_MEMBER(row_stride, LP_JIT_TEXTURE_ROW_STRIDE, FALSE)
 LP_LLVM_TEXTURE_MEMBER(img_stride, LP_JIT_TEXTURE_IMG_STRIDE, FALSE)
 LP_LLVM_TEXTURE_MEMBER(data_ptr,   LP_JIT_TEXTURE_DATA, FALSE)
+LP_LLVM_TEXTURE_MEMBER(min_lod,    LP_JIT_TEXTURE_MIN_LOD, TRUE)
+LP_LLVM_TEXTURE_MEMBER(max_lod,    LP_JIT_TEXTURE_MAX_LOD, TRUE)
+LP_LLVM_TEXTURE_MEMBER(lod_bias,   LP_JIT_TEXTURE_LOD_BIAS, TRUE)
 
 
 static void
@@ -217,6 +220,10 @@ lp_llvm_sampler_soa_create(const struct lp_sampler_static_state *static_state,
    sampler->dynamic_state.base.row_stride = lp_llvm_texture_row_stride;
    sampler->dynamic_state.base.img_stride = lp_llvm_texture_img_stride;
    sampler->dynamic_state.base.data_ptr = lp_llvm_texture_data_ptr;
+   sampler->dynamic_state.base.min_lod = lp_llvm_texture_min_lod;
+   sampler->dynamic_state.base.max_lod = lp_llvm_texture_max_lod;
+   sampler->dynamic_state.base.lod_bias = lp_llvm_texture_lod_bias;
+
    sampler->dynamic_state.static_state = static_state;
    sampler->dynamic_state.context_ptr = context_ptr;
 
-- 
cgit v1.2.3


From d1a4dd4217a4b8b018d4d9a161afece640d75694 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 23 Sep 2010 19:16:33 -0600
Subject: llvmpipe: make texture border_color dynamic state

---
 src/gallium/auxiliary/gallivm/lp_bld_sample.c     |  5 --
 src/gallium/auxiliary/gallivm/lp_bld_sample.h     |  9 +++-
 src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c | 64 ++++++++++++++++-------
 src/gallium/drivers/llvmpipe/lp_jit.c             |  7 ++-
 src/gallium/drivers/llvmpipe/lp_jit.h             |  2 +
 src/gallium/drivers/llvmpipe/lp_setup.c           |  1 +
 src/gallium/drivers/llvmpipe/lp_tex_sample.c      |  2 +
 7 files changed, 63 insertions(+), 27 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index caf1c7e865..19e380a8dc 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -114,11 +114,6 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
 
    state->normalized_coords = sampler->normalized_coords;
 
-   state->border_color[0]   = sampler->border_color[0];
-   state->border_color[1]   = sampler->border_color[1];
-   state->border_color[2]   = sampler->border_color[2];
-   state->border_color[3]   = sampler->border_color[3];
-
    /*
     * FIXME: Handle the remainder of pipe_sampler_view.
     */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index 661f35f6de..9a19e87571 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -82,7 +82,6 @@ struct lp_sampler_static_state
    unsigned compare_mode:1;
    unsigned compare_func:3;
    unsigned normalized_coords:1;
-   float border_color[4];
    unsigned min_max_lod_equal:1;  /**< min_lod == max_lod ? */
    float min_max_lod;             /**< only valid when min_max_lod_equal=1 */
 
@@ -158,6 +157,11 @@ struct lp_sampler_dynamic_state
    LLVMValueRef
    (*lod_bias)(const struct lp_sampler_dynamic_state *state,
                LLVMBuilderRef builder, unsigned unit);
+
+   /** Obtain texture border color */
+   LLVMValueRef
+   (*border_color)(const struct lp_sampler_dynamic_state *state,
+                   LLVMBuilderRef builder, unsigned unit);
 };
 
 
@@ -178,6 +182,9 @@ struct lp_build_sample_context
    struct lp_type float_type;
    struct lp_build_context float_bld;
 
+   /** float vector type */
+   struct lp_build_context float_vec_bld;
+
    /** regular scalar float type */
    struct lp_type int_type;
    struct lp_build_context int_bld;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index cbae1188a5..db2a6a0b22 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -54,6 +54,7 @@
 #include "lp_bld_format.h"
 #include "lp_bld_sample.h"
 #include "lp_bld_sample_aos.h"
+#include "lp_bld_struct.h"
 #include "lp_bld_quad.h"
 
 
@@ -93,6 +94,7 @@ wrap_mode_uses_border_color(unsigned mode)
  */
 static void
 lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
+                          unsigned unit,
                           LLVMValueRef width,
                           LLVMValueRef height,
                           LLVMValueRef depth,
@@ -188,13 +190,18 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
 
    if (use_border) {
       /* select texel color or border color depending on use_border */
+      LLVMValueRef border_color_ptr = 
+         bld->dynamic_state->border_color(bld->dynamic_state,
+                                          bld->builder, unit);
       int chan;
       for (chan = 0; chan < 4; chan++) {
          LLVMValueRef border_chan =
-            lp_build_const_vec(bld->texel_type,
-                                  bld->static_state->border_color[chan]);
+            lp_build_array_get(bld->builder, border_color_ptr,
+                               lp_build_const_int32(chan));
+         LLVMValueRef border_chan_vec =
+            lp_build_broadcast_scalar(&bld->float_vec_bld, border_chan);
          texel_out[chan] = lp_build_select(&bld->texel_bld, use_border,
-                                           border_chan, texel_out[chan]);
+                                           border_chan_vec, texel_out[chan]);
       }
    }
 }
@@ -567,6 +574,7 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
  */
 static void
 lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
+                              unsigned unit,
                               LLVMValueRef width_vec,
                               LLVMValueRef height_vec,
                               LLVMValueRef depth_vec,
@@ -615,7 +623,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
    /*
     * Get texture colors.
     */
-   lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+   lp_build_sample_texel_soa(bld, unit,
+                             width_vec, height_vec, depth_vec,
                              x, y, z,
                              row_stride_vec, img_stride_vec,
                              data_ptr, colors_out);
@@ -628,6 +637,7 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
  */
 static void
 lp_build_sample_image_linear(struct lp_build_sample_context *bld,
+                             unsigned unit,
                              LLVMValueRef width_vec,
                              LLVMValueRef height_vec,
                              LLVMValueRef depth_vec,
@@ -689,11 +699,13 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
     * Get texture colors.
     */
    /* get x0/x1 texels */
-   lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+   lp_build_sample_texel_soa(bld, unit,
+                             width_vec, height_vec, depth_vec,
                              x0, y0, z0,
                              row_stride_vec, img_stride_vec,
                              data_ptr, neighbors[0][0]);
-   lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+   lp_build_sample_texel_soa(bld, unit,
+                             width_vec, height_vec, depth_vec,
                              x1, y0, z0,
                              row_stride_vec, img_stride_vec,
                              data_ptr, neighbors[0][1]);
@@ -711,11 +723,13 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
       LLVMValueRef colors0[4];
 
       /* get x0/x1 texels at y1 */
-      lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+      lp_build_sample_texel_soa(bld, unit,
+                                width_vec, height_vec, depth_vec,
                                 x0, y1, z0,
                                 row_stride_vec, img_stride_vec,
                                 data_ptr, neighbors[1][0]);
-      lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+      lp_build_sample_texel_soa(bld, unit,
+                                width_vec, height_vec, depth_vec,
                                 x1, y1, z0,
                                 row_stride_vec, img_stride_vec,
                                 data_ptr, neighbors[1][1]);
@@ -735,19 +749,23 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
          LLVMValueRef colors1[4];
 
          /* get x0/x1/y0/y1 texels at z1 */
-         lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+         lp_build_sample_texel_soa(bld, unit,
+                                   width_vec, height_vec, depth_vec,
                                    x0, y0, z1,
                                    row_stride_vec, img_stride_vec,
                                    data_ptr, neighbors1[0][0]);
-         lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+         lp_build_sample_texel_soa(bld, unit,
+                                   width_vec, height_vec, depth_vec,
                                    x1, y0, z1,
                                    row_stride_vec, img_stride_vec,
                                    data_ptr, neighbors1[0][1]);
-         lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+         lp_build_sample_texel_soa(bld, unit,
+                                   width_vec, height_vec, depth_vec,
                                    x0, y1, z1,
                                    row_stride_vec, img_stride_vec,
                                    data_ptr, neighbors1[1][0]);
-         lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+         lp_build_sample_texel_soa(bld, unit,
+                                   width_vec, height_vec, depth_vec,
                                    x1, y1, z1,
                                    row_stride_vec, img_stride_vec,
                                    data_ptr, neighbors1[1][1]);
@@ -787,6 +805,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
  */
 static void
 lp_build_sample_mipmap(struct lp_build_sample_context *bld,
+                       unsigned unit,
                        unsigned img_filter,
                        unsigned mip_filter,
                        LLVMValueRef s,
@@ -812,14 +831,14 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
 
    if (img_filter == PIPE_TEX_FILTER_NEAREST) {
       /* sample the first mipmap level */
-      lp_build_sample_image_nearest(bld,
+      lp_build_sample_image_nearest(bld, unit,
                                     width0_vec, height0_vec, depth0_vec,
                                     row_stride0_vec, img_stride0_vec,
                                     data_ptr0, s, t, r, colors0);
 
       if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
          /* sample the second mipmap level */
-         lp_build_sample_image_nearest(bld,
+         lp_build_sample_image_nearest(bld, unit,
                                        width1_vec, height1_vec, depth1_vec,
                                        row_stride1_vec, img_stride1_vec,
                                        data_ptr1, s, t, r, colors1);
@@ -829,14 +848,14 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
       assert(img_filter == PIPE_TEX_FILTER_LINEAR);
 
       /* sample the first mipmap level */
-      lp_build_sample_image_linear(bld,
+      lp_build_sample_image_linear(bld, unit,
                                    width0_vec, height0_vec, depth0_vec,
                                    row_stride0_vec, img_stride0_vec,
                                    data_ptr0, s, t, r, colors0);
 
       if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
          /* sample the second mipmap level */
-         lp_build_sample_image_linear(bld,
+         lp_build_sample_image_linear(bld, unit,
                                       width1_vec, height1_vec, depth1_vec,
                                       row_stride1_vec, img_stride1_vec,
                                       data_ptr1, s, t, r, colors1);
@@ -995,7 +1014,8 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
     */
    if (min_filter == mag_filter) {
       /* no need to distinquish between minification and magnification */
-      lp_build_sample_mipmap(bld, min_filter, mip_filter, s, t, r, lod_fpart,
+      lp_build_sample_mipmap(bld, unit,
+                             min_filter, mip_filter, s, t, r, lod_fpart,
                              width0_vec, width1_vec,
                              height0_vec, height1_vec,
                              depth0_vec, depth1_vec,
@@ -1027,7 +1047,8 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
       lp_build_if(&if_ctx, flow_ctx, bld->builder, minify);
       {
          /* Use the minification filter */
-         lp_build_sample_mipmap(bld, min_filter, mip_filter,
+         lp_build_sample_mipmap(bld, unit,
+                                min_filter, mip_filter,
                                 s, t, r, lod_fpart,
                                 width0_vec, width1_vec,
                                 height0_vec, height1_vec,
@@ -1040,7 +1061,8 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
       lp_build_else(&if_ctx);
       {
          /* Use the magnification filter */
-         lp_build_sample_mipmap(bld, mag_filter, mip_filter,
+         lp_build_sample_mipmap(bld, unit,
+                                mag_filter, mip_filter,
                                 s, t, r, lod_fpart,
                                 width0_vec, width1_vec,
                                 height0_vec, height1_vec,
@@ -1146,6 +1168,7 @@ lp_build_sample_soa(LLVMBuilderRef builder,
    LLVMValueRef s;
    LLVMValueRef t;
    LLVMValueRef r;
+   struct lp_type float_vec_type;
 
    if (0) {
       enum pipe_format fmt = static_state->format;
@@ -1168,7 +1191,10 @@ lp_build_sample_soa(LLVMBuilderRef builder,
    bld.int_coord_type = lp_int_type(type);
    bld.texel_type = type;
 
+   float_vec_type = lp_type_float_vec(32);
+
    lp_build_context_init(&bld.float_bld, builder, bld.float_type);
+   lp_build_context_init(&bld.float_vec_bld, builder, float_vec_type);
    lp_build_context_init(&bld.int_bld, builder, bld.int_type);
    lp_build_context_init(&bld.coord_bld, builder, bld.coord_type);
    lp_build_context_init(&bld.uint_coord_bld, builder, bld.uint_coord_type);
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c
index 4c7089892e..04b12dedcc 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.c
+++ b/src/gallium/drivers/llvmpipe/lp_jit.c
@@ -64,11 +64,11 @@ lp_jit_init_globals(struct llvmpipe_screen *screen)
       elem_types[LP_JIT_TEXTURE_DATA] =
          LLVMArrayType(LLVMPointerType(LLVMInt8Type(), 0),
                        LP_MAX_TEXTURE_LEVELS);
-
       elem_types[LP_JIT_TEXTURE_MIN_LOD] = LLVMFloatType();
       elem_types[LP_JIT_TEXTURE_MAX_LOD] = LLVMFloatType();
       elem_types[LP_JIT_TEXTURE_LOD_BIAS] = LLVMFloatType();
-
+      elem_types[LP_JIT_TEXTURE_BORDER_COLOR] = 
+         LLVMArrayType(LLVMFloatType(), 4);
 
       texture_type = LLVMStructType(elem_types, Elements(elem_types), 0);
 
@@ -102,6 +102,9 @@ lp_jit_init_globals(struct llvmpipe_screen *screen)
       LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, lod_bias,
                              screen->target, texture_type,
                              LP_JIT_TEXTURE_LOD_BIAS);
+      LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, border_color,
+                             screen->target, texture_type,
+                             LP_JIT_TEXTURE_BORDER_COLOR);
 
       LP_CHECK_STRUCT_SIZE(struct lp_jit_texture,
                            screen->target, texture_type);
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h
index e94d758e20..16e04fce0c 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.h
+++ b/src/gallium/drivers/llvmpipe/lp_jit.h
@@ -58,6 +58,7 @@ struct lp_jit_texture
    float min_lod;
    float max_lod;
    float lod_bias;
+   float border_color[4];
 };
 
 
@@ -72,6 +73,7 @@ enum {
    LP_JIT_TEXTURE_MIN_LOD,
    LP_JIT_TEXTURE_MAX_LOD,
    LP_JIT_TEXTURE_LOD_BIAS,
+   LP_JIT_TEXTURE_BORDER_COLOR,
    LP_JIT_TEXTURE_NUM_FIELDS  /* number of fields above */
 };
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index 28d202bd65..eade400087 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -643,6 +643,7 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup,
          jit_tex->min_lod = samplers[i]->min_lod;
          jit_tex->max_lod = samplers[i]->max_lod;
          jit_tex->lod_bias = samplers[i]->lod_bias;
+         COPY_4V(jit_tex->border_color, samplers[i]->border_color);
 
          /* We're referencing the texture's internal data, so save a
           * reference to it.
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample.c b/src/gallium/drivers/llvmpipe/lp_tex_sample.c
index 151fe93cfb..f417fc8a9e 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_sample.c
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample.c
@@ -154,6 +154,7 @@ LP_LLVM_TEXTURE_MEMBER(data_ptr,   LP_JIT_TEXTURE_DATA, FALSE)
 LP_LLVM_TEXTURE_MEMBER(min_lod,    LP_JIT_TEXTURE_MIN_LOD, TRUE)
 LP_LLVM_TEXTURE_MEMBER(max_lod,    LP_JIT_TEXTURE_MAX_LOD, TRUE)
 LP_LLVM_TEXTURE_MEMBER(lod_bias,   LP_JIT_TEXTURE_LOD_BIAS, TRUE)
+LP_LLVM_TEXTURE_MEMBER(border_color, LP_JIT_TEXTURE_BORDER_COLOR, FALSE)
 
 
 static void
@@ -223,6 +224,7 @@ lp_llvm_sampler_soa_create(const struct lp_sampler_static_state *static_state,
    sampler->dynamic_state.base.min_lod = lp_llvm_texture_min_lod;
    sampler->dynamic_state.base.max_lod = lp_llvm_texture_max_lod;
    sampler->dynamic_state.base.lod_bias = lp_llvm_texture_lod_bias;
+   sampler->dynamic_state.base.border_color = lp_llvm_texture_border_color;
 
    sampler->dynamic_state.static_state = static_state;
    sampler->dynamic_state.context_ptr = context_ptr;
-- 
cgit v1.2.3


From 60a45b03c389f708c513bb2b70c5973175f01068 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Fri, 24 Sep 2010 10:30:52 +0100
Subject: llvmpipe: handle FACING interpolants in line and point setup

---
 src/gallium/drivers/llvmpipe/lp_setup_line.c  |  6 ++++++
 src/gallium/drivers/llvmpipe/lp_setup_point.c | 16 +++++++++++++---
 2 files changed, 19 insertions(+), 3 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c
index 829eb8a5a0..156bd63375 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_line.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
@@ -208,6 +208,12 @@ static void setup_line_coefficients( struct lp_setup_context *setup,
          fragcoord_usage_mask |= usage_mask;
          break;
 
+      case LP_INTERP_FACING:
+         for (i = 0; i < NUM_CHANNELS; i++)
+            if (usage_mask & (1 << i))
+               constant_coef(setup, tri, slot+1, 1.0, i);
+         break;
+
       default:
          assert(0);
       }
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c
index 2c354d1d0e..a95c15751c 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_point.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c
@@ -232,13 +232,23 @@ setup_point_coefficients( struct lp_setup_context *setup,
                break;                     
             }
          }
-
-         /* Otherwise fallthrough */
-      default:
+         /* FALLTHROUGH */
+      case LP_INTERP_CONSTANT:
          for (i = 0; i < NUM_CHANNELS; i++) {
             if (usage_mask & (1 << i))
                constant_coef(setup, point, slot+1, info->v0[vert_attr][i], i);
          }
+         break;
+
+      case LP_INTERP_FACING:
+         for (i = 0; i < NUM_CHANNELS; i++)
+            if (usage_mask & (1 << i))
+               constant_coef(setup, point, slot+1, 1.0, i);
+         break;
+
+      default:
+         assert(0);
+         break;
       }
    }
 
-- 
cgit v1.2.3


From 72258387786332c49b3275b8136a99be7591bf7f Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Fri, 24 Sep 2010 11:18:38 +0100
Subject: llvmpipe: handle up to 8 planes in triangle binner

---
 src/gallium/drivers/llvmpipe/lp_setup_tri.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index 5090f82ab5..9016bb8e24 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -160,8 +160,9 @@ lp_setup_print_triangle(struct lp_setup_context *setup,
 }
 
 
+#define MAX_PLANES 8
 static unsigned
-lp_rast_tri_tab[9] = {
+lp_rast_tri_tab[MAX_PLANES+1] = {
    0,               /* should be impossible */
    LP_RAST_OP_TRIANGLE_1,
    LP_RAST_OP_TRIANGLE_2,
@@ -531,11 +532,11 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
    }
    else
    {
-      int c[7];
-      int ei[7];
-      int eo[7];
-      int xstep[7];
-      int ystep[7];
+      int c[MAX_PLANES];
+      int ei[MAX_PLANES];
+      int eo[MAX_PLANES];
+      int xstep[MAX_PLANES];
+      int ystep[MAX_PLANES];
       int x, y;
 
       int ix0 = bbox->x0 / TILE_SIZE;
@@ -564,7 +565,7 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
       for (y = iy0; y <= iy1; y++)
       {
 	 boolean in = FALSE;  /* are we inside the triangle? */
-	 int cx[7];
+	 int cx[MAX_PLANES];
 
          for (i = 0; i < nr_planes; i++)
             cx[i] = c[i];
-- 
cgit v1.2.3


From fdcc168a16d59bf2b7fd291383f214834c2546f6 Mon Sep 17 00:00:00 2001
From: José Fonseca <jfonseca@vmware.com>
Date: Wed, 29 Sep 2010 12:05:19 +0100
Subject: llvmpipe: Decouple sampler view and sampler state updates.

Fixes glean pbo crash.

It would be possible to avoid crashing without decoupling, but given
that state trackers give no guarantee that number of views is consistent,
that would likely cause too many state updates (or miss some).
---
 src/gallium/drivers/llvmpipe/lp_setup.c         | 43 +++++++++++++++++++------
 src/gallium/drivers/llvmpipe/lp_setup.h         |  6 +++-
 src/gallium/drivers/llvmpipe/lp_state_derived.c |  9 ++++--
 3 files changed, 45 insertions(+), 13 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index eade400087..5ff11a3363 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -617,8 +617,7 @@ lp_setup_set_vertex_info( struct lp_setup_context *setup,
 void
 lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup,
                                     unsigned num,
-                                    struct pipe_sampler_view **views,
-                                    const struct pipe_sampler_state **samplers)
+                                    struct pipe_sampler_view **views)
 {
    unsigned i;
 
@@ -629,7 +628,7 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup,
    for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
       struct pipe_sampler_view *view = i < num ? views[i] : NULL;
 
-      if(view) {
+      if (view) {
          struct pipe_resource *tex = view->texture;
          struct llvmpipe_resource *lp_tex = llvmpipe_resource(tex);
          struct lp_jit_texture *jit_tex;
@@ -639,12 +638,6 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup,
          jit_tex->depth = tex->depth0;
          jit_tex->last_level = tex->last_level;
 
-         /* sampler state */
-         jit_tex->min_lod = samplers[i]->min_lod;
-         jit_tex->max_lod = samplers[i]->max_lod;
-         jit_tex->lod_bias = samplers[i]->lod_bias;
-         COPY_4V(jit_tex->border_color, samplers[i]->border_color);
-
          /* We're referencing the texture's internal data, so save a
           * reference to it.
           */
@@ -693,6 +686,38 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup,
 }
 
 
+/**
+ * Called during state validation when LP_NEW_SAMPLER is set.
+ */
+void
+lp_setup_set_fragment_sampler_state(struct lp_setup_context *setup,
+                                    unsigned num,
+                                    const struct pipe_sampler_state **samplers)
+{
+   unsigned i;
+
+   LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
+
+   assert(num <= PIPE_MAX_SAMPLERS);
+
+   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
+      const struct pipe_sampler_state *sampler = i < num ? samplers[i] : NULL;
+
+      if (sampler) {
+         struct lp_jit_texture *jit_tex;
+         jit_tex = &setup->fs.current.jit_context.textures[i];
+
+         jit_tex->min_lod = sampler->min_lod;
+         jit_tex->max_lod = sampler->max_lod;
+         jit_tex->lod_bias = sampler->lod_bias;
+         COPY_4V(jit_tex->border_color, sampler->border_color);
+      }
+   }
+
+   setup->dirty |= LP_SETUP_NEW_FS;
+}
+
+
 /**
  * Is the given texture referenced by any scene?
  * Note: we have to check all scenes including any scenes currently
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.h b/src/gallium/drivers/llvmpipe/lp_setup.h
index 868bd3ad2f..25dab78f64 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.h
+++ b/src/gallium/drivers/llvmpipe/lp_setup.h
@@ -143,7 +143,11 @@ lp_setup_set_scissor( struct lp_setup_context *setup,
 void
 lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup,
                                     unsigned num,
-                                    struct pipe_sampler_view **views,
+                                    struct pipe_sampler_view **views);
+
+void
+lp_setup_set_fragment_sampler_state(struct lp_setup_context *setup,
+                                    unsigned num,
                                     const struct pipe_sampler_state **samplers);
 
 unsigned
diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c
index d2be22d7fc..bb059d0459 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_derived.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c
@@ -208,11 +208,14 @@ void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe )
       lp_setup_set_fs_constants(llvmpipe->setup, 
                                 llvmpipe->constants[PIPE_SHADER_FRAGMENT][0]);
 
-   if (llvmpipe->dirty & (LP_NEW_SAMPLER_VIEW |
-                          LP_NEW_SAMPLER))
+   if (llvmpipe->dirty & (LP_NEW_SAMPLER_VIEW))
       lp_setup_set_fragment_sampler_views(llvmpipe->setup,
                                           llvmpipe->num_fragment_sampler_views,
-                                          llvmpipe->fragment_sampler_views,
+                                          llvmpipe->fragment_sampler_views);
+
+   if (llvmpipe->dirty & (LP_NEW_SAMPLER))
+      lp_setup_set_fragment_sampler_state(llvmpipe->setup,
+                                          llvmpipe->num_samplers,
                                           llvmpipe->sampler);
 
    llvmpipe->dirty = 0;
-- 
cgit v1.2.3


From d2149f6f2256deda180fd1a4c38cb436660e7407 Mon Sep 17 00:00:00 2001
From: Nicolas Kaiser <nikai@nikai.net>
Date: Thu, 30 Sep 2010 07:29:49 -0700
Subject: gallium/llvmpipe: remove duplicated include

Remove duplicated include.

Signed-off-by: Brian Paul <brianp@vmware.com>
---
 src/gallium/drivers/llvmpipe/lp_setup_point.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c
index a95c15751c..3b217f9544 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_point.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c
@@ -33,7 +33,6 @@
 #include "util/u_math.h"
 #include "util/u_memory.h"
 #include "lp_perf.h"
-#include "lp_setup_context.h"
 #include "lp_rast.h"
 #include "lp_state_fs.h"
 #include "tgsi/tgsi_scan.h"
-- 
cgit v1.2.3


From 591e1bc34f5a5dd065614deae41b59682f59ac08 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Sun, 3 Oct 2010 11:39:02 +0100
Subject: llvmpipe: make debug_fs_variant respect variant->nr_samplers

---
 src/gallium/drivers/llvmpipe/lp_state_fs.c | 48 ++++++++++++++----------------
 1 file changed, 23 insertions(+), 25 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index f0a15e11b9..5a561f9abd 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -782,31 +782,29 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key)
       debug_printf("blend.alpha_dst_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].alpha_dst_factor, TRUE));
    }
    debug_printf("blend.colormask = 0x%x\n", key->blend.rt[0].colormask);
-   for (i = 0; i < PIPE_MAX_SAMPLERS; ++i) {
-      if (key->sampler[i].format) {
-         debug_printf("sampler[%u] = \n", i);
-         debug_printf("  .format = %s\n",
-                      util_format_name(key->sampler[i].format));
-         debug_printf("  .target = %s\n",
-                      util_dump_tex_target(key->sampler[i].target, TRUE));
-         debug_printf("  .pot = %u %u %u\n",
-                      key->sampler[i].pot_width,
-                      key->sampler[i].pot_height,
-                      key->sampler[i].pot_depth);
-         debug_printf("  .wrap = %s %s %s\n",
-                      util_dump_tex_wrap(key->sampler[i].wrap_s, TRUE),
-                      util_dump_tex_wrap(key->sampler[i].wrap_t, TRUE),
-                      util_dump_tex_wrap(key->sampler[i].wrap_r, TRUE));
-         debug_printf("  .min_img_filter = %s\n",
-                      util_dump_tex_filter(key->sampler[i].min_img_filter, TRUE));
-         debug_printf("  .min_mip_filter = %s\n",
-                      util_dump_tex_mipfilter(key->sampler[i].min_mip_filter, TRUE));
-         debug_printf("  .mag_img_filter = %s\n",
-                      util_dump_tex_filter(key->sampler[i].mag_img_filter, TRUE));
-         if (key->sampler[i].compare_mode != PIPE_TEX_COMPARE_NONE)
-            debug_printf("  .compare_func = %s\n", util_dump_func(key->sampler[i].compare_func, TRUE));
-         debug_printf("  .normalized_coords = %u\n", key->sampler[i].normalized_coords);
-      }
+   for (i = 0; i < key->nr_samplers; ++i) {
+      debug_printf("sampler[%u] = \n", i);
+      debug_printf("  .format = %s\n",
+                   util_format_name(key->sampler[i].format));
+      debug_printf("  .target = %s\n",
+                   util_dump_tex_target(key->sampler[i].target, TRUE));
+      debug_printf("  .pot = %u %u %u\n",
+                   key->sampler[i].pot_width,
+                   key->sampler[i].pot_height,
+                   key->sampler[i].pot_depth);
+      debug_printf("  .wrap = %s %s %s\n",
+                   util_dump_tex_wrap(key->sampler[i].wrap_s, TRUE),
+                   util_dump_tex_wrap(key->sampler[i].wrap_t, TRUE),
+                   util_dump_tex_wrap(key->sampler[i].wrap_r, TRUE));
+      debug_printf("  .min_img_filter = %s\n",
+                   util_dump_tex_filter(key->sampler[i].min_img_filter, TRUE));
+      debug_printf("  .min_mip_filter = %s\n",
+                   util_dump_tex_mipfilter(key->sampler[i].min_mip_filter, TRUE));
+      debug_printf("  .mag_img_filter = %s\n",
+                   util_dump_tex_filter(key->sampler[i].mag_img_filter, TRUE));
+      if (key->sampler[i].compare_mode != PIPE_TEX_COMPARE_NONE)
+         debug_printf("  .compare_func = %s\n", util_dump_func(key->sampler[i].compare_func, TRUE));
+      debug_printf("  .normalized_coords = %u\n", key->sampler[i].normalized_coords);
    }
 }
 
-- 
cgit v1.2.3


From 446dbb921762710d678486f3f5a6dfdf318fe34c Mon Sep 17 00:00:00 2001
From: José Fonseca <jfonseca@vmware.com>
Date: Tue, 5 Oct 2010 10:50:02 +0100
Subject: llvmpipe: Dump a few missing shader key flags.

---
 src/gallium/drivers/llvmpipe/lp_state_fs.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 5a561f9abd..e50768445f 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -746,6 +746,9 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key)
 
    debug_printf("fs variant %p:\n", (void *) key);
 
+   if (key->flatshade) {
+      debug_printf("flatshade = 1\n");
+   }
    for (i = 0; i < key->nr_cbufs; ++i) {
       debug_printf("cbuf_format[%u] = %s\n", i, util_format_name(key->cbuf_format[i]));
    }
@@ -770,6 +773,10 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key)
       debug_printf("alpha.func = %s\n", util_dump_func(key->alpha.func, TRUE));
    }
 
+   if (key->occlusion_count) {
+      debug_printf("occlusion_count = 1\n");
+   }
+
    if (key->blend.logicop_enable) {
       debug_printf("blend.logicop_func = %s\n", util_dump_logicop(key->blend.logicop_func, TRUE));
    }
-- 
cgit v1.2.3


From e74955eba3fc22fcf6e9111a4e5bbc095d34d357 Mon Sep 17 00:00:00 2001
From: José Fonseca <jfonseca@vmware.com>
Date: Sun, 26 Sep 2010 11:22:20 +0100
Subject: llvmpipe: Fix perspective interpolation for point sprites.

Once a fragment is generated with LP_INTERP_PERSPECTIVE set for an input,
it will do a divide by w for that input. Therefore it's not OK to treat LP_INTERP_PERSPECTIVE as
LP_INTERP_LINEAR or vice-versa, even if the attribute is known to not
vary.

A better strategy would be to take the primitive into consideration when
generating the fragment shader key, and therefore avoid the per-fragment
perspective divide.
---
 src/gallium/drivers/llvmpipe/lp_setup_point.c | 71 ++++++++++++++++++++-------
 1 file changed, 54 insertions(+), 17 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c
index 3b217f9544..c91e85f915 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_point.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c
@@ -64,12 +64,37 @@ constant_coef(struct lp_setup_context *setup,
 }
 
 
+static void
+point_persp_coeff(struct lp_setup_context *setup,
+                  struct lp_rast_triangle *point,
+                  const struct point_info *info,
+                  unsigned slot,
+                  unsigned i)
+{
+   /*
+    * Fragment shader expects pre-multiplied w for LP_INTERP_PERSPECTIVE. A
+    * better strategy would be to take the primitive into consideration when
+    * generating the fragment shader key, and therefore avoid the per-fragment
+    * perspective divide.
+    */
+
+   float w0 = info->v0[0][3];
+
+   assert(i < 4);
+
+   point->inputs.a0[slot][i] = info->v0[slot][i]*w0;
+   point->inputs.dadx[slot][i] = 0.0f;
+   point->inputs.dady[slot][i] = 0.0f;
+}
+
+
 /**
  * Setup automatic texcoord coefficients (for sprite rendering).
  * \param slot  the vertex attribute slot to setup
  * \param i  the attribute channel in [0,3]
  * \param sprite_coord_origin  one of PIPE_SPRITE_COORD_x
- * \param perspective_proj  will the TEX instruction do a divide by Q?
+ * \param perspective  does the shader expect pre-multiplied w, i.e.,
+ *    LP_INTERP_PERSPECTIVE is specified in the shader key
  */
 static void
 texcoord_coef(struct lp_setup_context *setup,
@@ -78,7 +103,7 @@ texcoord_coef(struct lp_setup_context *setup,
               unsigned slot,
               unsigned i,
               unsigned sprite_coord_origin,
-              boolean perspective_proj)
+              boolean perspective)
 {
    assert(i < 4);
 
@@ -92,7 +117,7 @@ texcoord_coef(struct lp_setup_context *setup,
       point->inputs.dady[slot][0] = dady;
       point->inputs.a0[slot][0] = 0.5 - (dadx * x0 + dady * y0);
 
-      if (!perspective_proj) {
+      if (perspective) {
          /* Divide coefficients by vertex.w here.
           *
           * It would be clearer to always multiply by w0 above and
@@ -119,7 +144,7 @@ texcoord_coef(struct lp_setup_context *setup,
       point->inputs.dady[slot][1] = dady;
       point->inputs.a0[slot][1] = 0.5 - (dadx * x0 + dady * y0);
 
-      if (!perspective_proj) {
+      if (perspective) {
          float w0 = info->v0[0][3];
          point->inputs.dadx[slot][1] *= w0;
          point->inputs.dady[slot][1] *= w0;
@@ -193,11 +218,17 @@ setup_point_coefficients( struct lp_setup_context *setup,
    /* setup interpolation for all the remaining attributes:
     */
    for (slot = 0; slot < setup->fs.nr_inputs; slot++) {
+      enum lp_interp interp = setup->fs.input[slot].interp;
+      boolean perspective = !!(interp == LP_INTERP_PERSPECTIVE);
       unsigned vert_attr = setup->fs.input[slot].src_index;
       unsigned usage_mask = setup->fs.input[slot].usage_mask;
       unsigned i;
+
+      if (perspective & usage_mask) {
+         fragcoord_usage_mask |= TGSI_WRITEMASK_W;
+      }
       
-      switch (setup->fs.input[slot].interp) {
+      switch (interp) {
       case LP_INTERP_POSITION:
          /*
           * The generated pixel interpolators will pick up the coeffs from
@@ -210,32 +241,38 @@ setup_point_coefficients( struct lp_setup_context *setup,
       case LP_INTERP_LINEAR:
          /* Sprite tex coords may use linear interpolation someday */
          /* fall-through */
-
       case LP_INTERP_PERSPECTIVE:
          /* check if the sprite coord flag is set for this attribute.
           * If so, set it up so it up so x and y vary from 0 to 1.
           */
          if (shader->info.input_semantic_name[slot] == TGSI_SEMANTIC_GENERIC) {
-            const int index = shader->info.input_semantic_index[slot];
+            unsigned semantic_index = shader->info.input_semantic_index[slot];
             /* Note that sprite_coord enable is a bitfield of
              * PIPE_MAX_SHADER_OUTPUTS bits.
              */
-            if (index < PIPE_MAX_SHADER_OUTPUTS &&
-                (setup->sprite_coord_enable & (1 << index))) {
-               for (i = 0; i < NUM_CHANNELS; i++)
-                  if (usage_mask & (1 << i))
+            if (semantic_index < PIPE_MAX_SHADER_OUTPUTS &&
+                (setup->sprite_coord_enable & (1 << semantic_index))) {
+               for (i = 0; i < NUM_CHANNELS; i++) {
+                  if (usage_mask & (1 << i)) {
                      texcoord_coef(setup, point, info, slot + 1, i,
                                    setup->sprite_coord_origin,
-                                   (usage_mask & TGSI_WRITEMASK_W));
-               fragcoord_usage_mask |= TGSI_WRITEMASK_W;
-               break;                     
+                                   perspective);
+                  }
+               }
+               break;
             }
          }
-         /* FALLTHROUGH */
+         /* fall-through */
       case LP_INTERP_CONSTANT:
          for (i = 0; i < NUM_CHANNELS; i++) {
-            if (usage_mask & (1 << i))
-               constant_coef(setup, point, slot+1, info->v0[vert_attr][i], i);
+            if (usage_mask & (1 << i)) {
+               if (perspective) {
+                  point_persp_coeff(setup, point, info, slot+1, i);
+               }
+               else {
+                  constant_coef(setup, point, slot+1, info->v0[vert_attr][i], i);
+               }
+            }
          }
          break;
 
-- 
cgit v1.2.3


From 06472ad7e835813ef7c9bf8a5cd8b62a25fa9cc3 Mon Sep 17 00:00:00 2001
From: José Fonseca <jfonseca@vmware.com>
Date: Wed, 6 Oct 2010 09:40:51 +0100
Subject: llvmpipe: Fix sprite coord perspective interpolation of Q.

Q coordinate's coefficients also need to be multiplied by w, otherwise
it will have 1/w, causing problems with TXP.
---
 src/gallium/drivers/llvmpipe/lp_setup_point.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c
index c91e85f915..1295aeecd8 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_point.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c
@@ -105,6 +105,8 @@ texcoord_coef(struct lp_setup_context *setup,
               unsigned sprite_coord_origin,
               boolean perspective)
 {
+   float w0 = info->v0[0][3];
+
    assert(i < 4);
 
    if (i == 0) {
@@ -118,13 +120,6 @@ texcoord_coef(struct lp_setup_context *setup,
       point->inputs.a0[slot][0] = 0.5 - (dadx * x0 + dady * y0);
 
       if (perspective) {
-         /* Divide coefficients by vertex.w here.
-          *
-          * It would be clearer to always multiply by w0 above and
-          * then divide it out for perspective projection here, but
-          * doing it this way involves less algebra.
-          */
-         float w0 = info->v0[0][3];
          point->inputs.dadx[slot][0] *= w0;
          point->inputs.dady[slot][0] *= w0;
          point->inputs.a0[slot][0] *= w0;
@@ -145,7 +140,6 @@ texcoord_coef(struct lp_setup_context *setup,
       point->inputs.a0[slot][1] = 0.5 - (dadx * x0 + dady * y0);
 
       if (perspective) {
-         float w0 = info->v0[0][3];
          point->inputs.dadx[slot][1] *= w0;
          point->inputs.dady[slot][1] *= w0;
          point->inputs.a0[slot][1] *= w0;
@@ -157,7 +151,7 @@ texcoord_coef(struct lp_setup_context *setup,
       point->inputs.dady[slot][2] = 0.0f;
    }
    else {
-      point->inputs.a0[slot][3] = 1.0f;
+      point->inputs.a0[slot][3] = perspective ? w0 : 1.0f;
       point->inputs.dadx[slot][3] = 0.0f;
       point->inputs.dady[slot][3] = 0.0f;
    }
-- 
cgit v1.2.3


From 1c32583581ef5aee59901d9dd8e56cc1a125f0d4 Mon Sep 17 00:00:00 2001
From: José Fonseca <jfonseca@vmware.com>
Date: Wed, 6 Oct 2010 14:53:19 +0100
Subject: gallivm: Only apply min/max_lod when necessary.

---
 src/gallium/auxiliary/gallivm/lp_bld_sample.c | 51 +++++++++++++++++++--------
 src/gallium/auxiliary/gallivm/lp_bld_sample.h |  2 ++
 src/gallium/drivers/llvmpipe/lp_state_fs.c    |  4 +++
 3 files changed, 42 insertions(+), 15 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index 2227a062d0..c1c98bf859 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -126,21 +126,32 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
    state->wrap_r            = sampler->wrap_r;
    state->min_img_filter    = sampler->min_img_filter;
    state->mag_img_filter    = sampler->mag_img_filter;
-   if (view->last_level) {
+
+   if (view->last_level && sampler->max_lod > 0.0f) {
       state->min_mip_filter = sampler->min_mip_filter;
    } else {
       state->min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
    }
 
-   if (sampler->lod_bias != 0.0) {
-      state->lod_bias_non_zero = 1;
-   }
+   if (state->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) {
+      if (sampler->lod_bias != 0.0f) {
+         state->lod_bias_non_zero = 1;
+      }
 
-   /* If min_lod == max_lod we can greatly simplify mipmap selection.
-    * This is a case that occurs during automatic mipmap generation.
-    */
-   if (sampler->min_lod == sampler->max_lod) {
-      state->min_max_lod_equal = 1;
+      /* If min_lod == max_lod we can greatly simplify mipmap selection.
+       * This is a case that occurs during automatic mipmap generation.
+       */
+      if (sampler->min_lod == sampler->max_lod) {
+         state->min_max_lod_equal = 1;
+      } else {
+         if (sampler->min_lod > 0.0f) {
+            state->apply_min_lod = 1;
+         }
+
+         if (sampler->max_lod < (float)view->last_level) {
+            state->apply_max_lod = 1;
+         }
+      }
    }
 
    state->compare_mode      = sampler->compare_mode;
@@ -181,21 +192,19 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
                       LLVMValueRef depth)
 
 {
-   LLVMValueRef min_lod =
-      bld->dynamic_state->min_lod(bld->dynamic_state, bld->builder, unit);
-
    if (bld->static_state->min_max_lod_equal) {
       /* User is forcing sampling from a particular mipmap level.
        * This is hit during mipmap generation.
        */
+      LLVMValueRef min_lod =
+         bld->dynamic_state->min_lod(bld->dynamic_state, bld->builder, unit);
+
       return min_lod;
    }
    else {
       struct lp_build_context *float_bld = &bld->float_bld;
       LLVMValueRef sampler_lod_bias =
          bld->dynamic_state->lod_bias(bld->dynamic_state, bld->builder, unit);
-      LLVMValueRef max_lod =
-         bld->dynamic_state->max_lod(bld->dynamic_state, bld->builder, unit);
       LLVMValueRef index0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
       LLVMValueRef lod;
 
@@ -265,8 +274,20 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
       if (bld->static_state->lod_bias_non_zero)
          lod = LLVMBuildFAdd(bld->builder, lod, sampler_lod_bias, "sampler_lod_bias");
 
+
       /* clamp lod */
-      lod = lp_build_clamp(float_bld, lod, min_lod, max_lod);
+      if (bld->static_state->apply_max_lod) {
+         LLVMValueRef max_lod =
+            bld->dynamic_state->max_lod(bld->dynamic_state, bld->builder, unit);
+
+         lod = lp_build_min(float_bld, lod, max_lod);
+      }
+      if (bld->static_state->apply_min_lod) {
+         LLVMValueRef min_lod =
+            bld->dynamic_state->min_lod(bld->dynamic_state, bld->builder, unit);
+
+         lod = lp_build_max(float_bld, lod, min_lod);
+      }
 
       return lod;
    }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index bb1c8c8dce..bb83ede931 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -84,6 +84,8 @@ struct lp_sampler_static_state
    unsigned normalized_coords:1;
    unsigned min_max_lod_equal:1;  /**< min_lod == max_lod ? */
    unsigned lod_bias_non_zero:1;
+   unsigned apply_min_lod:1;  /**< min_lod > 0 ? */
+   unsigned apply_max_lod:1;  /**< max_lod < last_level ? */
 };
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index e50768445f..3ce8be5a0a 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -812,6 +812,10 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key)
       if (key->sampler[i].compare_mode != PIPE_TEX_COMPARE_NONE)
          debug_printf("  .compare_func = %s\n", util_dump_func(key->sampler[i].compare_func, TRUE));
       debug_printf("  .normalized_coords = %u\n", key->sampler[i].normalized_coords);
+      debug_printf("  .min_max_lod_equal = %u\n", key->sampler[i].min_max_lod_equal);
+      debug_printf("  .lod_bias_non_zero = %u\n", key->sampler[i].lod_bias_non_zero);
+      debug_printf("  .apply_min_lod = %u\n", key->sampler[i].apply_min_lod);
+      debug_printf("  .apply_max_lod = %u\n", key->sampler[i].apply_max_lod);
    }
 }
 
-- 
cgit v1.2.3


From 9fe510ef35a783a244d0d54baa50f959a6b781dc Mon Sep 17 00:00:00 2001
From: José Fonseca <jfonseca@vmware.com>
Date: Wed, 6 Oct 2010 10:11:15 +0100
Subject: llvmpipe: Cleanup depth-stencil clears.

Only cosmetic changes. No actual practical difference.
---
 src/gallium/drivers/llvmpipe/lp_rast.c  | 34 ++++++++++++++++++++++++---------
 src/gallium/drivers/llvmpipe/lp_rast.h  |  4 ++--
 src/gallium/drivers/llvmpipe/lp_setup.c | 11 +++++++----
 3 files changed, 34 insertions(+), 15 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c
index d7e6415e13..790d88a745 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@ -211,8 +211,8 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task,
                        const union lp_rast_cmd_arg arg)
 {
    const struct lp_scene *scene = task->scene;
-   unsigned clear_value = arg.clear_zstencil.value;
-   unsigned clear_mask = arg.clear_zstencil.mask;
+   uint32_t clear_value = arg.clear_zstencil.value;
+   uint32_t clear_mask = arg.clear_zstencil.mask;
    const unsigned height = TILE_SIZE / TILE_VECTOR_HEIGHT;
    const unsigned width = TILE_SIZE * TILE_VECTOR_HEIGHT;
    const unsigned block_size = scene->zsbuf.blocksize;
@@ -220,7 +220,8 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task,
    uint8_t *dst;
    unsigned i, j;
 
-   LP_DBG(DEBUG_RAST, "%s 0x%x%x\n", __FUNCTION__, clear_value, clear_mask);
+   LP_DBG(DEBUG_RAST, "%s: value=0x%08x, mask=0x%08x\n",
+           __FUNCTION__, clear_value, clear_mask);
 
    /*
     * Clear the aera of the swizzled depth/depth buffer matching this tile, in
@@ -232,16 +233,31 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task,
 
    dst = task->depth_tile;
 
+   clear_value &= clear_mask;
+
    switch (block_size) {
    case 1:
+      assert(clear_mask == 0xff);
       memset(dst, (uint8_t) clear_value, height * width);
       break;
    case 2:
-      for (i = 0; i < height; i++) {
-         uint16_t *row = (uint16_t *)dst;
-         for (j = 0; j < width; j++)
-            *row++ = (uint16_t) clear_value;
-         dst += dst_stride;
+      if (clear_mask == 0xffff) {
+         for (i = 0; i < height; i++) {
+            uint16_t *row = (uint16_t *)dst;
+            for (j = 0; j < width; j++)
+               *row++ = (uint16_t) clear_value;
+            dst += dst_stride;
+         }
+      }
+      else {
+         for (i = 0; i < height; i++) {
+            uint16_t *row = (uint16_t *)dst;
+            for (j = 0; j < width; j++) {
+               uint16_t tmp = ~clear_mask & *row;
+               *row++ = clear_value | tmp;
+            }
+            dst += dst_stride;
+         }
       }
       break;
    case 4:
@@ -258,7 +274,7 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task,
             uint32_t *row = (uint32_t *)dst;
             for (j = 0; j < width; j++) {
                uint32_t tmp = ~clear_mask & *row;
-               *row++ = (clear_value & clear_mask) | tmp;
+               *row++ = clear_value | tmp;
             }
             dst += dst_stride;
          }
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
index c55b97a9d1..0f62377c07 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -149,8 +149,8 @@ union lp_rast_cmd_arg {
    const struct lp_rast_state *set_state;
    uint8_t clear_color[4];
    struct {
-      unsigned value;
-      unsigned mask;
+      uint32_t value;
+      uint32_t mask;
    } clear_zstencil;
    struct lp_fence *fence;
    struct llvmpipe_query *query_obj;
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index 5ff11a3363..e72ead0def 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -377,16 +377,19 @@ lp_setup_try_clear( struct lp_setup_context *setup,
    }
 
    if (flags & PIPE_CLEAR_DEPTHSTENCIL) {
-      unsigned zmask = (flags & PIPE_CLEAR_DEPTH) ? ~0 : 0;
-      unsigned smask = (flags & PIPE_CLEAR_STENCIL) ? ~0 : 0;
+      uint32_t zmask = (flags & PIPE_CLEAR_DEPTH) ? ~0 : 0;
+      uint32_t smask = (flags & PIPE_CLEAR_STENCIL) ? ~0 : 0;
 
       zsvalue = util_pack_z_stencil(setup->fb.zsbuf->format,
                                     depth,
                                     stencil);
 
-      zsmask = util_pack_uint_z_stencil(setup->fb.zsbuf->format,
+
+      zsmask = util_pack_mask_z_stencil(setup->fb.zsbuf->format,
                                         zmask,
                                         smask);
+
+      zsvalue &= zsmask;
    }
 
    if (setup->state == SETUP_ACTIVE) {
@@ -431,7 +434,7 @@ lp_setup_try_clear( struct lp_setup_context *setup,
       if (flags & PIPE_CLEAR_COLOR) {
          memcpy(setup->clear.color.clear_color,
                 &color_arg,
-                sizeof color_arg);
+                sizeof setup->clear.color.clear_color);
       }
    }
    
-- 
cgit v1.2.3


From ad6730fadbbeacea96322e31064ede9ea7ebad6f Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Fri, 8 Oct 2010 17:01:16 +0100
Subject: llvmpipe: fail gracefully on oom in scene creation

---
 src/gallium/drivers/llvmpipe/lp_setup.c         | 97 ++++++++++++++++++-------
 src/gallium/drivers/llvmpipe/lp_setup_context.h |  6 +-
 src/gallium/drivers/llvmpipe/lp_setup_line.c    |  5 +-
 src/gallium/drivers/llvmpipe/lp_setup_point.c   |  5 +-
 src/gallium/drivers/llvmpipe/lp_setup_tri.c     | 15 ++--
 src/gallium/drivers/llvmpipe/lp_setup_vbuf.c    |  6 +-
 6 files changed, 92 insertions(+), 42 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index e72ead0def..e96f012f1b 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -56,7 +56,7 @@
 #include "draw/draw_vbuf.h"
 
 
-static void set_scene_state( struct lp_setup_context *, enum setup_state,
+static boolean set_scene_state( struct lp_setup_context *, enum setup_state,
                              const char *reason);
 static boolean try_update_scene_state( struct lp_setup_context *setup );
 
@@ -167,7 +167,7 @@ lp_setup_rasterize_scene( struct lp_setup_context *setup )
 
 
-static void
+static boolean
 begin_binning( struct lp_setup_context *setup )
 {
    struct lp_scene *scene = setup->scene;
@@ -181,6 +181,8 @@ begin_binning( struct lp_setup_context *setup )
    /* Always create a fence:
     */
    scene->fence = lp_fence_create(MAX2(1, setup->num_threads));
+   if (!scene->fence)
+      return FALSE;
 
    /* Initialize the bin flags and x/y coords:
     */
@@ -192,7 +194,8 @@ begin_binning( struct lp_setup_context *setup )
    }
 
    ok = try_update_scene_state(setup);
-   assert(ok);
+   if (!ok)
+      return FALSE;
 
    if (setup->fb.zsbuf &&
        ((setup->clear.flags & PIPE_CLEAR_DEPTHSTENCIL) != PIPE_CLEAR_DEPTHSTENCIL) &&
@@ -208,7 +211,8 @@ begin_binning( struct lp_setup_context *setup )
          ok = lp_scene_bin_everywhere( scene, 
                                        LP_RAST_OP_CLEAR_COLOR, 
                                        setup->clear.color );
-         assert(ok);
+         if (!ok)
+            return FALSE;
       }
    }
 
@@ -216,12 +220,14 @@ begin_binning( struct lp_setup_context *setup )
       if (setup->clear.flags & PIPE_CLEAR_DEPTHSTENCIL) {
          if (!need_zsload)
             scene->has_depthstencil_clear = TRUE;
+
          ok = lp_scene_bin_everywhere( scene,
                                        LP_RAST_OP_CLEAR_ZSTENCIL,
                                        lp_rast_arg_clearzs(
                                           setup->clear.zsvalue,
                                           setup->clear.zsmask));
-         assert(ok);
+         if (!ok)
+            return FALSE;
       }
    }
 
@@ -229,15 +235,16 @@ begin_binning( struct lp_setup_context *setup )
       ok = lp_scene_bin_everywhere( scene,
                                     LP_RAST_OP_BEGIN_QUERY,
                                     lp_rast_arg_query(setup->active_query) );
-      assert(ok);
+      if (!ok)
+         return FALSE;
    }
-      
 
    setup->clear.flags = 0;
    setup->clear.zsmask = 0;
    setup->clear.zsvalue = 0;
 
    LP_DBG(DEBUG_SETUP, "%s done\n", __FUNCTION__);
+   return TRUE;
 }
 
 
@@ -246,12 +253,12 @@ begin_binning( struct lp_setup_context *setup )
  *
  * TODO: fast path for fullscreen clears and no triangles.
  */
-static void
+static boolean
 execute_clears( struct lp_setup_context *setup )
 {
    LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
 
-   begin_binning( setup );
+   return begin_binning( setup );
 }
 
 const char *states[] = {
@@ -262,7 +269,7 @@ const char *states[] = {
 };
 
 
-static void
+static boolean
 set_scene_state( struct lp_setup_context *setup,
                  enum setup_state new_state,
                  const char *reason)
@@ -270,7 +277,7 @@ set_scene_state( struct lp_setup_context *setup,
    unsigned old_state = setup->state;
 
    if (old_state == new_state)
-      return;
+      return TRUE;
    
    if (LP_DEBUG & DEBUG_SCENE) {
       debug_printf("%s old %s new %s%s%s\n",
@@ -294,12 +301,14 @@ set_scene_state( struct lp_setup_context *setup,
       break;
 
    case SETUP_ACTIVE:
-      begin_binning( setup );
+      if (!begin_binning( setup ))
+         goto fail;
       break;
 
    case SETUP_FLUSHED:
       if (old_state == SETUP_CLEARED)
-         execute_clears( setup );
+         if (!execute_clears( setup ))
+            goto fail;
 
       lp_setup_rasterize_scene( setup );
       assert(setup->scene == NULL);
@@ -307,9 +316,21 @@ set_scene_state( struct lp_setup_context *setup,
 
    default:
       assert(0 && "invalid setup state mode");
+      goto fail;
    }
 
    setup->state = new_state;
+   return TRUE;
+
+fail:
+   if (setup->scene) {
+      lp_scene_end_rasterization(setup->scene);
+      setup->scene = NULL;
+   }
+
+   setup->state = SETUP_FLUSHED;
+   lp_setup_reset( setup );
+   return FALSE;
 }
 
 
@@ -878,7 +899,7 @@ try_update_scene_state( struct lp_setup_context *setup )
    return TRUE;
 }
 
-void
+boolean
 lp_setup_update_state( struct lp_setup_context *setup,
                        boolean update_scene )
 {
@@ -902,20 +923,38 @@ lp_setup_update_state( struct lp_setup_context *setup,
       assert(lp->dirty == 0);
    }
 
-   if (update_scene)
-      set_scene_state( setup, SETUP_ACTIVE, __FUNCTION__ );
+   if (update_scene) {
+      if (!set_scene_state( setup, SETUP_ACTIVE, __FUNCTION__ ))
+         return FALSE;
+   }
 
    /* Only call into update_scene_state() if we already have a
     * scene:
     */
    if (update_scene && setup->scene) {
       assert(setup->state == SETUP_ACTIVE);
-      if (!try_update_scene_state(setup)) {
-         lp_setup_flush_and_restart(setup);
-         if (!try_update_scene_state(setup))
-            assert(0);
-      }
+
+      if (try_update_scene_state(setup))
+         return TRUE;
+
+      /* Update failed, try to restart the scene.
+       *
+       * Cannot call lp_setup_flush_and_restart() directly here
+       * because of potential recursion.
+       */
+      if (!set_scene_state(setup, SETUP_FLUSHED, __FUNCTION__))
+         return FALSE;
+
+      if (!set_scene_state(setup, SETUP_ACTIVE, __FUNCTION__))
+         return FALSE;
+
+      if (!setup->scene)
+         return FALSE;
+
+      return try_update_scene_state(setup);
    }
+
+   return TRUE;
 }
 
 
@@ -1019,12 +1058,12 @@ lp_setup_begin_query(struct lp_setup_context *setup,
                                    LP_RAST_OP_BEGIN_QUERY,
                                    lp_rast_arg_query(pq))) {
 
-         lp_setup_flush_and_restart(setup);
+         if (!lp_setup_flush_and_restart(setup))
+            return;
 
          if (!lp_scene_bin_everywhere(setup->scene,
                                       LP_RAST_OP_BEGIN_QUERY,
                                       lp_rast_arg_query(pq))) {
-            assert(0);
             return;
          }
       }
@@ -1068,14 +1107,20 @@ lp_setup_end_query(struct lp_setup_context *setup, struct llvmpipe_query *pq)
 }
 
 
-void
+boolean
 lp_setup_flush_and_restart(struct lp_setup_context *setup)
 {
    if (0) debug_printf("%s\n", __FUNCTION__);
 
    assert(setup->state == SETUP_ACTIVE);
-   set_scene_state(setup, SETUP_FLUSHED, __FUNCTION__);
-   lp_setup_update_state(setup, TRUE);
+
+   if (!set_scene_state(setup, SETUP_FLUSHED, __FUNCTION__))
+      return FALSE;
+   
+   if (!lp_setup_update_state(setup, TRUE))
+      return FALSE;
+
+   return TRUE;
 }
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_context.h b/src/gallium/drivers/llvmpipe/lp_setup_context.h
index 8506ed2dc9..e7b425ebcb 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_setup_context.h
@@ -160,12 +160,12 @@ void lp_setup_choose_point( struct lp_setup_context *setup );
 
 void lp_setup_init_vbuf(struct lp_setup_context *setup);
 
-void lp_setup_update_state( struct lp_setup_context *setup,
+boolean lp_setup_update_state( struct lp_setup_context *setup,
                             boolean update_scene);
 
 void lp_setup_destroy( struct lp_setup_context *setup );
 
-void lp_setup_flush_and_restart(struct lp_setup_context *setup);
+boolean lp_setup_flush_and_restart(struct lp_setup_context *setup);
 
 void
 lp_setup_print_triangle(struct lp_setup_context *setup,
@@ -191,6 +191,4 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
                        const struct u_rect *bbox,
                        int nr_planes );
 
-void lp_setup_flush_and_restart(struct lp_setup_context *setup);
-
 #endif
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c
index 156bd63375..4d7d6235b0 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_line.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
@@ -712,10 +712,11 @@ static void lp_setup_line( struct lp_setup_context *setup,
 {
    if (!try_setup_line( setup, v0, v1 ))
    {
-      lp_setup_flush_and_restart(setup);
+      if (!lp_setup_flush_and_restart(setup))
+         return;
 
       if (!try_setup_line( setup, v0, v1 ))
-         assert(0);
+         return;
    }
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c
index 1295aeecd8..31d85f43c2 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_point.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c
@@ -414,10 +414,11 @@ lp_setup_point(struct lp_setup_context *setup,
 {
    if (!try_setup_point( setup, v0 ))
    {
-      lp_setup_flush_and_restart(setup);
+      if (!lp_setup_flush_and_restart(setup))
+         return;
 
       if (!try_setup_point( setup, v0 ))
-         assert(0);
+         return;
    }
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index 9016bb8e24..eeffbb5fee 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -644,27 +644,30 @@ static void triangle_cw( struct lp_setup_context *setup,
 {
    if (!do_triangle_ccw( setup, v1, v0, v2, !setup->ccw_is_frontface ))
    {
-      lp_setup_flush_and_restart(setup);
+      if (!lp_setup_flush_and_restart(setup))
+         return;
 
       if (!do_triangle_ccw( setup, v1, v0, v2, !setup->ccw_is_frontface ))
-         assert(0);
+         return;
    }
 }
 
 
 /**
- * Draw triangle if it's CCW, cull otherwise.
+ * Draw triangle if it's CW, cull otherwise.
  */
-static void triangle_ccw( struct lp_setup_context *setup,
+static void triangle_cw( struct lp_setup_context *setup,
 			 const float (*v0)[4],
 			 const float (*v1)[4],
 			 const float (*v2)[4] )
 {
    if (!do_triangle_ccw( setup, v0, v1, v2, setup->ccw_is_frontface ))
    {
-      lp_setup_flush_and_restart(setup);
+      if (!lp_setup_flush_and_restart(setup))
+         return;
+
       if (!do_triangle_ccw( setup, v0, v1, v2, setup->ccw_is_frontface ))
-         assert(0);
+         return;
    }
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c b/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c
index 6308561f24..9c1f0fe793 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c
@@ -141,7 +141,8 @@ lp_setup_draw_elements(struct vbuf_render *vbr, const ushort *indices, uint nr)
    const boolean flatshade_first = setup->flatshade_first;
    unsigned i;
 
-   lp_setup_update_state(setup, TRUE);
+   if (!lp_setup_update_state(setup, TRUE))
+      return;
 
    switch (setup->prim) {
    case PIPE_PRIM_POINTS:
@@ -338,7 +339,8 @@ lp_setup_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
    const boolean flatshade_first = setup->flatshade_first;
    unsigned i;
 
-   lp_setup_update_state(setup, TRUE);
+   if (!lp_setup_update_state(setup, TRUE))
+      return;
 
    switch (setup->prim) {
    case PIPE_PRIM_POINTS:
-- 
cgit v1.2.3


From 29d6a1483d6c4ecb9c34989423e025b3784ec019 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Fri, 8 Oct 2010 17:06:05 +0100
Subject: llvmpipe: avoid overflow in triangle culling

Avoid multiplying fixed-point values.  Calculate triangle area in
floating point and use that for culling.

Lift area calculations up a level as we are already doing this in the
triangle_both() case.

Would like to share the calculated area with attribute interpolation,
but the way the code is structured makes this difficult.
---
 src/gallium/drivers/llvmpipe/lp_setup_tri.c | 79 ++++++++++++++---------------
 1 file changed, 39 insertions(+), 40 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index eeffbb5fee..0d57f13f61 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -228,7 +228,6 @@ do_triangle_ccw(struct lp_setup_context *setup,
    struct lp_rast_triangle *tri;
    int x[3];
    int y[3];
-   int area;
    struct u_rect bbox;
    unsigned tri_bytes;
    int i;
@@ -312,21 +311,8 @@ do_triangle_ccw(struct lp_setup_context *setup,
    tri->plane[1].dcdx = y[1] - y[2];
    tri->plane[2].dcdx = y[2] - y[0];
 
-   area = (tri->plane[0].dcdy * tri->plane[2].dcdx -
-           tri->plane[2].dcdy * tri->plane[0].dcdx);
-
    LP_COUNT(nr_tris);
 
-   /* Cull non-ccw and zero-sized triangles. 
-    *
-    * XXX: subject to overflow??
-    */
-   if (area <= 0) {
-      lp_scene_putback_data( scene, tri_bytes );
-      LP_COUNT(nr_culled_tris);
-      return TRUE;
-   }
-
    /* Setup parameter interpolants:
     */
    lp_setup_tri_coef( setup, &tri->inputs, v0, v1, v2, frontfacing );
@@ -635,23 +621,36 @@ fail:
 
 
 /**
- * Draw triangle if it's CW, cull otherwise.
+ * Try to draw the triangle, restart the scene on failure.
  */
-static void triangle_cw( struct lp_setup_context *setup,
-			 const float (*v0)[4],
-			 const float (*v1)[4],
-			 const float (*v2)[4] )
+static void retry_triangle_ccw( struct lp_setup_context *setup,
+                                const float (*v0)[4],
+                                const float (*v1)[4],
+                                const float (*v2)[4],
+                                boolean front)
 {
-   if (!do_triangle_ccw( setup, v1, v0, v2, !setup->ccw_is_frontface ))
+   if (!do_triangle_ccw( setup, v0, v1, v2, front ))
    {
       if (!lp_setup_flush_and_restart(setup))
          return;
 
-      if (!do_triangle_ccw( setup, v1, v0, v2, !setup->ccw_is_frontface ))
+      if (!do_triangle_ccw( setup, v0, v1, v2, front ))
          return;
    }
 }
 
+static INLINE float
+calc_area(const float (*v0)[4],
+          const float (*v1)[4],
+          const float (*v2)[4])
+{
+   float dx01 = v0[0][0] - v1[0][0];
+   float dy01 = v0[0][1] - v1[0][1];
+   float dx20 = v2[0][0] - v0[0][0];
+   float dy20 = v2[0][1] - v0[0][1];
+   return dx01 * dy20 - dx20 * dy01;
+}
+
 
 /**
  * Draw triangle if it's CW, cull otherwise.
@@ -661,17 +660,23 @@ static void triangle_cw( struct lp_setup_context *setup,
 			 const float (*v1)[4],
 			 const float (*v2)[4] )
 {
-   if (!do_triangle_ccw( setup, v0, v1, v2, setup->ccw_is_frontface ))
-   {
-      if (!lp_setup_flush_and_restart(setup))
-         return;
+   float area = calc_area(v0, v1, v2);
 
-      if (!do_triangle_ccw( setup, v0, v1, v2, setup->ccw_is_frontface ))
-         return;
-   }
+   if (area < 0.0f) 
+      retry_triangle_ccw(setup, v0, v2, v1, !setup->ccw_is_frontface);
 }
 
 
+static void triangle_ccw( struct lp_setup_context *setup,
+                          const float (*v0)[4],
+                          const float (*v1)[4],
+                          const float (*v2)[4])
+{
+   float area = calc_area(v0, v1, v2);
+
+   if (area > 0.0f) 
+      retry_triangle_ccw(setup, v0, v1, v2, setup->ccw_is_frontface);
+}
 
 /**
  * Draw triangle whether it's CW or CCW.
@@ -681,18 +686,12 @@ static void triangle_both( struct lp_setup_context *setup,
 			   const float (*v1)[4],
 			   const float (*v2)[4] )
 {
-   /* edge vectors e = v0 - v2, f = v1 - v2 */
-   const float ex = v0[0][0] - v2[0][0];
-   const float ey = v0[0][1] - v2[0][1];
-   const float fx = v1[0][0] - v2[0][0];
-   const float fy = v1[0][1] - v2[0][1];
-
-   /* det = cross(e,f).z */
-   const float det = ex * fy - ey * fx;
-   if (det < 0.0f) 
-      triangle_ccw( setup, v0, v1, v2 );
-   else if (det > 0.0f)
-      triangle_cw( setup, v0, v1, v2 );
+   float area = calc_area(v0, v1, v2);
+
+   if (area > 0.0f) 
+      retry_triangle_ccw( setup, v0, v1, v2, setup->ccw_is_frontface );
+   else if (area < 0.0f)
+      retry_triangle_ccw( setup, v0, v2, v1, !setup->ccw_is_frontface );
 }
 
 
-- 
cgit v1.2.3


From eeb13e2352d7a44881b011cb3232bb80aee0c826 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Thu, 23 Sep 2010 19:56:48 +0100
Subject: llvmpipe: clean up setup_tri a little

---
 src/gallium/drivers/llvmpipe/lp_setup_tri.c | 53 ++++++++++++++---------------
 1 file changed, 26 insertions(+), 27 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index 0d57f13f61..9f871011d8 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -473,33 +473,6 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
    int sz = floor_pot((bbox->x1 - (bbox->x0 & ~3)) |
 		      (bbox->y1 - (bbox->y0 & ~3)));
 
-   if (nr_planes == 3) {
-      if (sz < 4 && dx < 64)
-      {
-	 /* Triangle is contained in a single 4x4 stamp:
-	  */
-	 int mask = (bbox->x0 & 63 & ~3) | ((bbox->y0 & 63 & ~3) << 8);
-
-	 return lp_scene_bin_command( scene,
-				      bbox->x0/64, bbox->y0/64,
-				      LP_RAST_OP_TRIANGLE_3_4,
-				      lp_rast_arg_triangle(tri, mask) );
-      }
-
-      if (sz < 16 && dx < 64)
-      {
-	 int mask = (bbox->x0 & 63 & ~3) | ((bbox->y0 & 63 & ~3) << 8);
-
-	 /* Triangle is contained in a single 16x16 block:
-	  */
-	 return lp_scene_bin_command( scene,
-				      bbox->x0/64, bbox->y0/64,
-                                      LP_RAST_OP_TRIANGLE_3_16,
-                                      lp_rast_arg_triangle(tri, mask) );
-      }
-   }
-
-
    /* Determine which tile(s) intersect the triangle's bounding box
     */
    if (dx < TILE_SIZE)
@@ -510,6 +483,32 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
       assert(iy0 == bbox->y1 / TILE_SIZE &&
 	     ix0 == bbox->x1 / TILE_SIZE);
 
+      if (nr_planes == 3) {
+         int px = bbox->x0 & 63 & ~3;
+         int py = bbox->y0 & 63 & ~3;
+	 int mask = px | (py << 8);
+
+         if (sz < 4)
+         {
+            /* Triangle is contained in a single 4x4 stamp:
+             */
+
+            return lp_scene_bin_command( scene, ix0, iy0,
+                                         LP_RAST_OP_TRIANGLE_3_4,
+                                         lp_rast_arg_triangle(tri, mask) );
+         }
+
+         if (sz < 16)
+         {
+            /* Triangle is contained in a single 16x16 block:
+             */
+            return lp_scene_bin_command( scene, ix0, iy0,
+                                         LP_RAST_OP_TRIANGLE_3_16,
+                                         lp_rast_arg_triangle(tri, mask) );
+         }
+      }
+
+
       /* Triangle is contained in a single tile:
        */
       return lp_scene_bin_command( scene, ix0, iy0,
-- 
cgit v1.2.3


From 0ff132e5a633170afaed0aea54d01438c895b8ab Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Fri, 8 Oct 2010 17:21:03 +0100
Subject: llvmpipe: add rast_tri_4_16 for small lines and points

---
 src/gallium/drivers/llvmpipe/lp_rast.c         |   1 +
 src/gallium/drivers/llvmpipe/lp_rast.h         |  11 +-
 src/gallium/drivers/llvmpipe/lp_rast_debug.c   |   1 +
 src/gallium/drivers/llvmpipe/lp_rast_priv.h    |   4 +
 src/gallium/drivers/llvmpipe/lp_rast_tri.c     | 152 +++----------------------
 src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h | 127 +++++++++++++++++++++
 src/gallium/drivers/llvmpipe/lp_setup_tri.c    |  13 ++-
 7 files changed, 161 insertions(+), 148 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c
index 790d88a745..db9b2f9b12 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@ -597,6 +597,7 @@ static lp_rast_cmd_func dispatch[LP_RAST_OP_MAX] =
    lp_rast_triangle_8,
    lp_rast_triangle_3_4,
    lp_rast_triangle_3_16,
+   lp_rast_triangle_4_16,
    lp_rast_shade_tile,
    lp_rast_shade_tile_opaque,
    lp_rast_begin_query,
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
index 0f62377c07..df0bea04b9 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -238,12 +238,13 @@ lp_rast_arg_null( void )
 #define LP_RAST_OP_TRIANGLE_8        0x9
 #define LP_RAST_OP_TRIANGLE_3_4      0xa
 #define LP_RAST_OP_TRIANGLE_3_16     0xb
-#define LP_RAST_OP_SHADE_TILE        0xc
-#define LP_RAST_OP_SHADE_TILE_OPAQUE 0xd
-#define LP_RAST_OP_BEGIN_QUERY       0xe
-#define LP_RAST_OP_END_QUERY         0xf
+#define LP_RAST_OP_TRIANGLE_4_16     0xc
+#define LP_RAST_OP_SHADE_TILE        0xd
+#define LP_RAST_OP_SHADE_TILE_OPAQUE 0xe
+#define LP_RAST_OP_BEGIN_QUERY       0xf
+#define LP_RAST_OP_END_QUERY         0x10
 
-#define LP_RAST_OP_MAX               0x10
+#define LP_RAST_OP_MAX               0x11
 #define LP_RAST_OP_MASK              0xff
 
 void
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_debug.c b/src/gallium/drivers/llvmpipe/lp_rast_debug.c
index 9fc78645a3..6f4ba1c6fe 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_debug.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_debug.c
@@ -42,6 +42,7 @@ static const char *cmd_names[LP_RAST_OP_MAX] =
    "triangle_8",
    "triangle_3_4",
    "triangle_3_16",
+   "triangle_4_16",
    "shade_tile",
    "shade_tile_opaque",
    "begin_query",
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
index 7370119e96..104000a040 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_priv.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
@@ -293,6 +293,10 @@ void lp_rast_triangle_3_4(struct lp_rasterizer_task *,
 
 void lp_rast_triangle_3_16( struct lp_rasterizer_task *, 
                             const union lp_rast_cmd_arg );
+
+void lp_rast_triangle_4_16( struct lp_rasterizer_task *, 
+                            const union lp_rast_cmd_arg );
+
 void
 lp_debug_bin( const struct cmd_bin *bin );
 
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index a1f309d4b0..f870a187db 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -122,6 +122,16 @@ lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
    lp_rast_triangle_3(task, arg2);
 }
 
+void
+lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
+                      const union lp_rast_cmd_arg arg)
+{
+   union lp_rast_cmd_arg arg2;
+   arg2.triangle.tri = arg.triangle.tri;
+   arg2.triangle.plane_mask = (1<<4)-1;   /* replace packed x/y offset with "all 4 planes" */
+   lp_rast_triangle_4(task, arg2);        /* 4 planes enabled, so use the 4-plane rasterizer */
+}
+
 void
 lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
                       const union lp_rast_cmd_arg arg)
@@ -229,145 +239,6 @@ sign_bits4(const __m128i *cstep, int cdiff)
    return _mm_movemask_epi8(result);
 }
 
-
-/* Special case for 3 plane triangle which is contained entirely
- * within a 16x16 block.
- */
-void
-lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
-                      const union lp_rast_cmd_arg arg)
-{
-   const struct lp_rast_triangle *tri = arg.triangle.tri;
-   const struct lp_rast_plane *plane = tri->plane;
-   unsigned mask = arg.triangle.plane_mask;
-   const int x = task->x + (mask & 0xff);
-   const int y = task->y + (mask >> 8);
-   unsigned outmask, inmask, partmask, partial_mask;
-   unsigned j;
-   __m128i cstep4[3][4];
-
-   outmask = 0;                 /* outside one or more trivial reject planes */
-   partmask = 0;                /* outside one or more trivial accept planes */
-
-   for (j = 0; j < 3; j++) {
-      const int dcdx = -plane[j].dcdx * 4;
-      const int dcdy = plane[j].dcdy * 4;
-      __m128i xdcdy = _mm_set1_epi32(dcdy);
-
-      cstep4[j][0] = _mm_setr_epi32(0, dcdx, dcdx*2, dcdx*3);
-      cstep4[j][1] = _mm_add_epi32(cstep4[j][0], xdcdy);
-      cstep4[j][2] = _mm_add_epi32(cstep4[j][1], xdcdy);
-      cstep4[j][3] = _mm_add_epi32(cstep4[j][2], xdcdy);
-
-      {
-	 const int c = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x;
-	 const int cox = plane[j].eo * 4;
-	 const int cio = plane[j].ei * 4 - 1;
-
-	 outmask |= sign_bits4(cstep4[j], c + cox);
-	 partmask |= sign_bits4(cstep4[j], c + cio);
-      }
-   }
-
-   if (outmask == 0xffff)
-      return;
-
-   /* Mask of sub-blocks which are inside all trivial accept planes:
-    */
-   inmask = ~partmask & 0xffff;
-
-   /* Mask of sub-blocks which are inside all trivial reject planes,
-    * but outside at least one trivial accept plane:
-    */
-   partial_mask = partmask & ~outmask;
-
-   assert((partial_mask & inmask) == 0);
-
-   /* Iterate over partials:
-    */
-   while (partial_mask) {
-      int i = ffs(partial_mask) - 1;
-      int ix = (i & 3) * 4;
-      int iy = (i >> 2) * 4;
-      int px = x + ix;
-      int py = y + iy; 
-      unsigned mask = 0xffff;
-
-      partial_mask &= ~(1 << i);
-
-      for (j = 0; j < 3; j++) {
-         const int cx = (plane[j].c 
-			 - plane[j].dcdx * px
-			 + plane[j].dcdy * py) * 4;
-
-	 mask &= ~sign_bits4(cstep4[j], cx);
-      }
-
-      if (mask)
-	 lp_rast_shade_quads_mask(task, &tri->inputs, px, py, mask);
-   }
-
-   /* Iterate over fulls: 
-    */
-   while (inmask) {
-      int i = ffs(inmask) - 1;
-      int ix = (i & 3) * 4;
-      int iy = (i >> 2) * 4;
-      int px = x + ix;
-      int py = y + iy; 
-
-      inmask &= ~(1 << i);
-
-      block_full_4(task, tri, px, py);
-   }
-}
-
-
-void
-lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
-		     const union lp_rast_cmd_arg arg)
-{
-   const struct lp_rast_triangle *tri = arg.triangle.tri;
-   const struct lp_rast_plane *plane = tri->plane;
-   unsigned mask = arg.triangle.plane_mask;
-   const int x = task->x + (mask & 0xff);
-   const int y = task->y + (mask >> 8);
-   unsigned j;
-
-   /* Iterate over partials:
-    */
-   {
-      unsigned mask = 0xffff;
-
-      for (j = 0; j < 3; j++) {
-	 const int cx = (plane[j].c 
-			 - plane[j].dcdx * x
-			 + plane[j].dcdy * y);
-
-	 const int dcdx = -plane[j].dcdx;
-	 const int dcdy = plane[j].dcdy;
-	 __m128i xdcdy = _mm_set1_epi32(dcdy);
-
-	 __m128i cstep0 = _mm_setr_epi32(cx, cx + dcdx, cx + dcdx*2, cx + dcdx*3);
-	 __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
-	 __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
-	 __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);
-
-	 __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
-	 __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
-	 __m128i result = _mm_packs_epi16(cstep01, cstep23);
-
-	 /* Extract the sign bits
-	  */
-	 mask &= ~_mm_movemask_epi8(result);
-      }
-
-      if (mask)
-	 lp_rast_shade_quads_mask(task, &tri->inputs, x, y, mask);
-   }
-}
-
-
 #endif
 
 
@@ -383,10 +254,13 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
 
 #define TAG(x) x##_3
 #define NR_PLANES 3
+#define TRI_4 lp_rast_triangle_3_4
+#define TRI_16 lp_rast_triangle_3_16
 #include "lp_rast_tri_tmp.h"
 
 #define TAG(x) x##_4
 #define NR_PLANES 4
+#define TRI_16 lp_rast_triangle_4_16
 #include "lp_rast_tri_tmp.h"
 
 #define TAG(x) x##_5
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
index 9830a43ba5..c8f9956fda 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
@@ -245,6 +245,133 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
    }
 }
 
+#if defined(PIPE_ARCH_SSE) && defined(TRI_16)
+/* XXX: special case this when intersection is not required.
+ *      - tile completely within bbox,
+ *      - bbox completely within tile.
+ */
+void
+TRI_16(struct lp_rasterizer_task *task,
+       const union lp_rast_cmd_arg arg)
+{
+   const struct lp_rast_triangle *tri = arg.triangle.tri;
+   const struct lp_rast_plane *plane = tri->plane;
+   unsigned mask = arg.triangle.plane_mask;   /* read as packed offset: x in low 8 bits, y above */
+   unsigned outmask, partial_mask;
+   unsigned j;
+   __m128i cstep4[NR_PLANES][4];
+
+   int x = (mask & 0xff);
+   int y = (mask >> 8);
+
+   outmask = 0;                 /* outside one or more trivial reject planes */
+   
+   x += task->x;
+   y += task->y;
+
+   for (j = 0; j < NR_PLANES; j++) {
+      const int dcdx = -plane[j].dcdx * 4;
+      const int dcdy = plane[j].dcdy * 4;
+      __m128i xdcdy = _mm_set1_epi32(dcdy);
+
+      cstep4[j][0] = _mm_setr_epi32(0, dcdx, dcdx*2, dcdx*3);
+      cstep4[j][1] = _mm_add_epi32(cstep4[j][0], xdcdy);
+      cstep4[j][2] = _mm_add_epi32(cstep4[j][1], xdcdy);
+      cstep4[j][3] = _mm_add_epi32(cstep4[j][2], xdcdy);
+
+      {
+	 const int c = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x;
+	 const int cox = plane[j].eo * 4;
+
+	 outmask |= sign_bits4(cstep4[j], c + cox);
+      }
+   }
+
+   if (outmask == 0xffff)
+      return;
+
+
+   /* Mask of 4x4 sub-blocks not outside any trivial reject plane.  No
+    * trivial accept test is done here, so all of them are treated as partial:
+    */
+   partial_mask = 0xffff & ~outmask;
+
+   /* Visit each remaining sub-block (lowest set bit first) and shade it:
+    */
+   while (partial_mask) {
+      int i = ffs(partial_mask) - 1;
+      int ix = (i & 3) * 4;
+      int iy = (i >> 2) * 4;
+      int px = x + ix;
+      int py = y + iy; 
+      unsigned mask = 0xffff;
+
+      partial_mask &= ~(1 << i);
+
+      for (j = 0; j < NR_PLANES; j++) {
+         const int cx = (plane[j].c 
+			 - plane[j].dcdx * px
+			 + plane[j].dcdy * py) * 4;
+
+	 mask &= ~sign_bits4(cstep4[j], cx);
+      }
+
+      if (mask)
+	 lp_rast_shade_quads_mask(task, &tri->inputs, px, py, mask);
+   }
+}
+#endif
+
+#if defined(PIPE_ARCH_SSE) && defined(TRI_4)
+void
+TRI_4(struct lp_rasterizer_task *task,
+      const union lp_rast_cmd_arg arg)
+{
+   const struct lp_rast_triangle *tri = arg.triangle.tri;
+   const struct lp_rast_plane *plane = tri->plane;
+   unsigned mask = arg.triangle.plane_mask;   /* read as packed offset: x in low 8 bits, y above */
+   const int x = task->x + (mask & 0xff);
+   const int y = task->y + (mask >> 8);
+   unsigned j;
+
+   /* Evaluate the single 4x4 block at (x, y) against every plane:
+    */
+   {
+      unsigned mask = 0xffff;
+
+      for (j = 0; j < NR_PLANES; j++) {
+	 const int cx = (plane[j].c 
+			 - plane[j].dcdx * x
+			 + plane[j].dcdy * y);
+
+	 const int dcdx = -plane[j].dcdx;
+	 const int dcdy = plane[j].dcdy;
+	 __m128i xdcdy = _mm_set1_epi32(dcdy);
+
+	 __m128i cstep0 = _mm_setr_epi32(cx, cx + dcdx, cx + dcdx*2, cx + dcdx*3);
+	 __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
+	 __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
+	 __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);
+
+	 __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
+	 __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
+	 __m128i result = _mm_packs_epi16(cstep01, cstep23);
+
+	 /* Extract the sign bits; positions with a negative value are cleared
+	  */
+	 mask &= ~_mm_movemask_epi8(result);
+      }
+
+      if (mask)
+	 lp_rast_shade_quads_mask(task, &tri->inputs, x, y, mask);
+   }
+}
+#endif
+
+
+
 #undef TAG
+#undef TRI_4
+#undef TRI_16
 #undef NR_PLANES
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index 9f871011d8..8fd034666c 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -479,15 +479,14 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
    {
       int ix0 = bbox->x0 / TILE_SIZE;
       int iy0 = bbox->y0 / TILE_SIZE;
+      int px = bbox->x0 & 63 & ~3;
+      int py = bbox->y0 & 63 & ~3;
+      int mask = px | (py << 8);
 
       assert(iy0 == bbox->y1 / TILE_SIZE &&
 	     ix0 == bbox->x1 / TILE_SIZE);
 
       if (nr_planes == 3) {
-         int px = bbox->x0 & 63 & ~3;
-         int py = bbox->y0 & 63 & ~3;
-	 int mask = px | (py << 8);
-
          if (sz < 4)
          {
             /* Triangle is contained in a single 4x4 stamp:
@@ -507,6 +506,12 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
                                          lp_rast_arg_triangle(tri, mask) );
          }
       }
+      else if (nr_planes == 4 && sz < 16) 
+      {
+         return lp_scene_bin_command( scene, ix0, iy0,
+                                      LP_RAST_OP_TRIANGLE_4_16,
+                                      lp_rast_arg_triangle(tri, mask) );
+      }
 
 
       /* Triangle is contained in a single tile:
-- 
cgit v1.2.3


From ef3407672ed4c2c6d070384ea763e73b3da2240a Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Tue, 5 Oct 2010 16:50:22 +0100
Subject: llvmpipe: fix off-by-one in tri_16

---
 src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
index c8f9956fda..2f03229512 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
@@ -309,7 +309,7 @@ TRI_16(struct lp_rasterizer_task *task,
       partial_mask &= ~(1 << i);
 
       for (j = 0; j < NR_PLANES; j++) {
-         const int cx = (plane[j].c 
+         const int cx = (plane[j].c - 1
 			 - plane[j].dcdx * px
 			 + plane[j].dcdy * py) * 4;
 
-- 
cgit v1.2.3


From d5ef59d8b0ce2ea8f0ad983951e696d1679e3eb7 Mon Sep 17 00:00:00 2001
From: José Fonseca <jfonseca@vmware.com>
Date: Fri, 8 Oct 2010 16:56:45 +0100
Subject: gallivm: Avoid control flow for two-sided stencil test.

---
 src/gallium/drivers/llvmpipe/lp_bld_depth.c | 150 +++++++++++-----------------
 1 file changed, 58 insertions(+), 92 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index 7561899a74..7eabe0508d 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -128,57 +128,32 @@ lp_build_stencil_test_single(struct lp_build_context *bld,
 /**
  * Do the one or two-sided stencil test comparison.
  * \sa lp_build_stencil_test_single
- * \param face  an integer indicating front (+) or back (-) facing polygon.
- *              If NULL, assume front-facing.
+ * \param front_facing  an integer vector mask, indicating front (~0) or back
+ *                      (0) facing polygon. If NULL, assume front-facing.
  */
 static LLVMValueRef
 lp_build_stencil_test(struct lp_build_context *bld,
                       const struct pipe_stencil_state stencil[2],
                       LLVMValueRef stencilRefs[2],
                       LLVMValueRef stencilVals,
-                      LLVMValueRef face)
+                      LLVMValueRef front_facing)
 {
    LLVMValueRef res;
 
    assert(stencil[0].enabled);
 
-   if (stencil[1].enabled && face) {
-      /* do two-sided test */
-      struct lp_build_flow_context *flow_ctx;
-      struct lp_build_if_state if_ctx;
-      LLVMValueRef front_facing;
-      LLVMValueRef zero = LLVMConstReal(LLVMFloatType(), 0.0);
-      LLVMValueRef result = bld->undef;
+   /* do front face test */
+   res = lp_build_stencil_test_single(bld, &stencil[0],
+                                      stencilRefs[0], stencilVals);
 
-      flow_ctx = lp_build_flow_create(bld->builder);
-      lp_build_flow_scope_begin(flow_ctx);
+   if (stencil[1].enabled && front_facing) {
+      /* do back face test */
+      LLVMValueRef back_res;
 
-      lp_build_flow_scope_declare(flow_ctx, &result);
+      back_res = lp_build_stencil_test_single(bld, &stencil[1],
+                                              stencilRefs[1], stencilVals);
 
-      /* front_facing = face > 0.0 */
-      front_facing = LLVMBuildFCmp(bld->builder, LLVMRealUGT, face, zero, "");
-
-      lp_build_if(&if_ctx, flow_ctx, bld->builder, front_facing);
-      {
-         result = lp_build_stencil_test_single(bld, &stencil[0],
-                                               stencilRefs[0], stencilVals);
-      }
-      lp_build_else(&if_ctx);
-      {
-         result = lp_build_stencil_test_single(bld, &stencil[1],
-                                               stencilRefs[1], stencilVals);
-      }
-      lp_build_endif(&if_ctx);
-
-      lp_build_flow_scope_end(flow_ctx);
-      lp_build_flow_destroy(flow_ctx);
-
-      res = result;
-   }
-   else {
-      /* do single-side test */
-      res = lp_build_stencil_test_single(bld, &stencil[0],
-                                         stencilRefs[0], stencilVals);
+      res = lp_build_select(bld, front_facing, res, back_res);
    }
 
    return res;
@@ -195,14 +170,12 @@ lp_build_stencil_op_single(struct lp_build_context *bld,
                            const struct pipe_stencil_state *stencil,
                            enum stencil_op op,
                            LLVMValueRef stencilRef,
-                           LLVMValueRef stencilVals,
-                           LLVMValueRef mask)
+                           LLVMValueRef stencilVals)
 
 {
-   const unsigned stencilMax = 255; /* XXX fix */
    struct lp_type type = bld->type;
    LLVMValueRef res;
-   LLVMValueRef max = lp_build_const_int_vec(type, stencilMax);
+   LLVMValueRef max = lp_build_const_int_vec(type, 0xff);
    unsigned stencil_op;
 
    assert(type.sign);
@@ -255,19 +228,7 @@ lp_build_stencil_op_single(struct lp_build_context *bld,
       break;
    default:
       assert(0 && "bad stencil op mode");
-      res = NULL;
-   }
-
-   if (stencil->writemask != stencilMax) {
-      /* mask &= stencil->writemask */
-      LLVMValueRef writemask = lp_build_const_int_vec(type, stencil->writemask);
-      mask = LLVMBuildAnd(bld->builder, mask, writemask, "");
-      /* res = (res & mask) | (stencilVals & ~mask) */
-      res = lp_build_select_bitwise(bld, writemask, res, stencilVals);
-   }
-   else {
-      /* res = mask ? res : stencilVals */
-      res = lp_build_select(bld, mask, res, stencilVals);
+      res = bld->undef;
    }
 
    return res;
@@ -284,49 +245,40 @@ lp_build_stencil_op(struct lp_build_context *bld,
                     LLVMValueRef stencilRefs[2],
                     LLVMValueRef stencilVals,
                     LLVMValueRef mask,
-                    LLVMValueRef face)
+                    LLVMValueRef front_facing)
 
 {
-   assert(stencil[0].enabled);
+   LLVMValueRef res;
 
-   if (stencil[1].enabled && face) {
-      /* do two-sided op */
-      struct lp_build_flow_context *flow_ctx;
-      struct lp_build_if_state if_ctx;
-      LLVMValueRef front_facing;
-      LLVMValueRef zero = LLVMConstReal(LLVMFloatType(), 0.0);
-      LLVMValueRef result = bld->undef;
+   assert(stencil[0].enabled);
 
-      flow_ctx = lp_build_flow_create(bld->builder);
-      lp_build_flow_scope_begin(flow_ctx);
+   /* do front face op */
+   res = lp_build_stencil_op_single(bld, &stencil[0], op,
+                                     stencilRefs[0], stencilVals);
 
-      lp_build_flow_scope_declare(flow_ctx, &result);
+   if (stencil[1].enabled && front_facing) {
+      /* do back face op */
+      LLVMValueRef back_res;
 
-      /* front_facing = face > 0.0 */
-      front_facing = LLVMBuildFCmp(bld->builder, LLVMRealUGT, face, zero, "");
+      back_res = lp_build_stencil_op_single(bld, &stencil[1], op,
+                                            stencilRefs[1], stencilVals);
 
-      lp_build_if(&if_ctx, flow_ctx, bld->builder, front_facing);
-      {
-         result = lp_build_stencil_op_single(bld, &stencil[0], op,
-                                             stencilRefs[0], stencilVals, mask);
-      }
-      lp_build_else(&if_ctx);
-      {
-         result = lp_build_stencil_op_single(bld, &stencil[1], op,
-                                             stencilRefs[1], stencilVals, mask);
-      }
-      lp_build_endif(&if_ctx);
-
-      lp_build_flow_scope_end(flow_ctx);
-      lp_build_flow_destroy(flow_ctx);
+      res = lp_build_select(bld, front_facing, res, back_res);
+   }
 
-      return result;
+   if (stencil->writemask != 0xff) {
+      /* mask &= stencil->writemask */
+      LLVMValueRef writemask = lp_build_const_int_vec(bld->type, stencil->writemask);
+      mask = LLVMBuildAnd(bld->builder, mask, writemask, "");
+      /* res = (res & mask) | (stencilVals & ~mask) */
+      res = lp_build_select_bitwise(bld, writemask, res, stencilVals);
    }
    else {
-      /* do single-sided op */
-      return lp_build_stencil_op_single(bld, &stencil[0], op,
-                                        stencilRefs[0], stencilVals, mask);
+      /* res = mask ? res : stencilVals */
+      res = lp_build_select(bld, mask, res, stencilVals);
    }
+
+   return res;
 }
 
 
@@ -519,6 +471,7 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
    LLVMValueRef z_bitmask = NULL, stencil_shift = NULL;
    LLVMValueRef z_pass = NULL, s_pass_mask = NULL;
    LLVMValueRef orig_mask = mask->value;
+   LLVMValueRef front_facing = NULL;
 
    /* Sanity checking */
    {
@@ -616,21 +569,34 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
       }
    }
 
-
    if (stencil[0].enabled) {
+
+      if (face) {
+         LLVMValueRef zero = LLVMConstReal(LLVMFloatType(), 0.0);
+
+         /* front_facing = face > 0.0 ? ~0 : 0 */
+         front_facing = LLVMBuildFCmp(builder, LLVMRealUGT, face, zero, "");
+         front_facing = LLVMBuildSExt(builder, front_facing,
+                                      LLVMIntType(bld.type.length*bld.type.width),
+                                      "");
+         front_facing = LLVMBuildBitCast(builder, front_facing,
+                                         bld.int_vec_type, "");
+      }
+
       /* convert scalar stencil refs into vectors */
       stencil_refs[0] = lp_build_broadcast_scalar(&bld, stencil_refs[0]);
       stencil_refs[1] = lp_build_broadcast_scalar(&bld, stencil_refs[1]);
 
       s_pass_mask = lp_build_stencil_test(&sbld, stencil,
-                                          stencil_refs, stencil_vals, face);
+                                          stencil_refs, stencil_vals,
+                                          front_facing);
 
       /* apply stencil-fail operator */
       {
          LLVMValueRef s_fail_mask = lp_build_andnot(&bld, orig_mask, s_pass_mask);
          stencil_vals = lp_build_stencil_op(&sbld, stencil, S_FAIL_OP,
                                             stencil_refs, stencil_vals,
-                                            s_fail_mask, face);
+                                            s_fail_mask, front_facing);
       }
    }
 
@@ -676,13 +642,13 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
          z_fail_mask = lp_build_andnot(&bld, orig_mask, z_pass);
          stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_FAIL_OP,
                                             stencil_refs, stencil_vals,
-                                            z_fail_mask, face);
+                                            z_fail_mask, front_facing);
 
          /* apply Z-pass operator */
          z_pass_mask = LLVMBuildAnd(bld.builder, orig_mask, z_pass, "");
          stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_PASS_OP,
                                             stencil_refs, stencil_vals,
-                                            z_pass_mask, face);
+                                            z_pass_mask, front_facing);
       }
    }
    else {
@@ -692,7 +658,7 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
       s_pass_mask = LLVMBuildAnd(bld.builder, orig_mask, s_pass_mask, "");
       stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_PASS_OP,
                                          stencil_refs, stencil_vals,
-                                         s_pass_mask, face);
+                                         s_pass_mask, front_facing);
    }
 
    /* The Z bits are already in the right place but we may need to shift the
-- 
cgit v1.2.3


From 6316d540564d116460bfd1382e3eee98480e28ff Mon Sep 17 00:00:00 2001
From: Zack Rusin <zackr@vmware.com>
Date: Thu, 7 Oct 2010 16:26:17 -0400
Subject: llvmpipe: fix rasterization of vertical lines on pixel boundaries

---
 src/gallium/drivers/llvmpipe/lp_setup_line.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c
index 4d7d6235b0..693ac28175 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_line.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
@@ -475,7 +475,7 @@ try_setup_line( struct lp_setup_context *setup,
       else {
          /* do intersection test */
          float xintersect = fracf(v2[0][0]) + y2diff * dxdy;
-         draw_end = (xintersect < 1.0 && xintersect > 0.0);
+         draw_end = (xintersect < 1.0 && xintersect >= 0.0);
       }
 
       /* Are we already drawing start/end?
@@ -513,7 +513,7 @@ try_setup_line( struct lp_setup_context *setup,
             x_offset_end = y_offset_end * dxdy;
          }
       }
- 
+
       /* x/y positions in fixed point */
       x[0] = subpixel_snap(v1[0][0] + x_offset     - setup->pixel_offset) - fixed_width/2;
       x[1] = subpixel_snap(v2[0][0] + x_offset_end - setup->pixel_offset) - fixed_width/2;
-- 
cgit v1.2.3


From d0bfb3c5144a9434efd4d53ced149d42016b5bdc Mon Sep 17 00:00:00 2001
From: José Fonseca <jfonseca@vmware.com>
Date: Wed, 6 Oct 2010 20:42:30 +0100
Subject: llvmpipe: Prevent z > 1.0

The current interpolation scheme causes precision loss.

Changing the operation order helps, but does not completely avoid the
problem.

The only short term solution is to clamp z to 1.0.

This is unfortunate, but probably unavoidable until interpolation is
improved.
---
 src/gallium/drivers/llvmpipe/lp_bld_interp.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.c b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
index 2a374f8c39..ee92ce3cdc 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
@@ -206,7 +206,7 @@ coeffs_init(struct lp_build_interp_soa_context *bld,
             dadq2 = LLVMBuildFAdd(builder, dadq, dadq, "");
 
             /*
-             * a = a0 + x * dadx + y * dady
+             * a = a0 + (x * dadx + y * dady)
              */
 
             if (attrib == 0 && chan == 0) {
@@ -219,11 +219,11 @@ coeffs_init(struct lp_build_interp_soa_context *bld,
                a = a0;
                if (interp != LP_INTERP_CONSTANT &&
                    interp != LP_INTERP_FACING) {
-                  LLVMValueRef tmp;
-                  tmp = LLVMBuildFMul(builder, bld->x, dadx, "");
-                  a = LLVMBuildFAdd(builder, a, tmp, "");
-                  tmp = LLVMBuildFMul(builder, bld->y, dady, "");
-                  a = LLVMBuildFAdd(builder, a, tmp, "");
+                  LLVMValueRef ax, ay, axy;
+                  ax = LLVMBuildFMul(builder, bld->x, dadx, "");
+                  ay = LLVMBuildFMul(builder, bld->y, dady, "");
+                  axy = LLVMBuildFAdd(builder, ax, ay, "");
+                  a = LLVMBuildFAdd(builder, a, axy, "");
                }
             }
 
@@ -350,6 +350,14 @@ attribs_update(struct lp_build_interp_soa_context *bld, int quad_index)
                }
 #endif
 
+               if (attrib == 0 && chan == 2) {
+                  /* FIXME: Depth values can exceed 1.0, due to the fact that
+                   * setup interpolation coefficients refer to (0,0) which causes
+                   * precision loss. So we must clamp to 1.0 here to avoid artifacts
+                   */
+                  a = lp_build_min(coeff_bld, a, coeff_bld->one);
+               }
+
                attrib_name(a, attrib, chan, "");
             }
             bld->attribs[attrib][chan] = a;
-- 
cgit v1.2.3


From 8009886b0092df2783472deaac1bcaad4a802c19 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Wed, 6 Oct 2010 22:25:48 +0100
Subject: llvmpipe: defer attribute interpolation until after mask and ztest

Don't calculate 1/w for quads which aren't visible...
---
 src/gallium/drivers/llvmpipe/lp_bld_interp.c | 25 ++++++++++++++++++-------
 src/gallium/drivers/llvmpipe/lp_bld_interp.h |  6 +++++-
 src/gallium/drivers/llvmpipe/lp_state_fs.c   | 17 +++++++++++------
 3 files changed, 34 insertions(+), 14 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.c b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
index ee92ce3cdc..c9da8900d0 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
@@ -272,7 +272,10 @@ coeffs_init(struct lp_build_interp_soa_context *bld,
  * This is called when we move from one quad to the next.
  */
 static void
-attribs_update(struct lp_build_interp_soa_context *bld, int quad_index)
+attribs_update(struct lp_build_interp_soa_context *bld,
+               int quad_index,
+               int start,
+               int end)
 {
    struct lp_build_context *coeff_bld = &bld->coeff_bld;
    LLVMValueRef shuffle = lp_build_const_int_vec(coeff_bld->type, quad_index);
@@ -282,7 +285,7 @@ attribs_update(struct lp_build_interp_soa_context *bld, int quad_index)
 
    assert(quad_index < 4);
 
-   for(attrib = 0; attrib < bld->num_attribs; ++attrib) {
+   for(attrib = start; attrib < end; ++attrib) {
       const unsigned mask = bld->mask[attrib];
       const unsigned interp = bld->interp[attrib];
       for(chan = 0; chan < NUM_CHANNELS; ++chan) {
@@ -442,8 +445,6 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
    pos_init(bld, x0, y0);
 
    coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr);
-
-   attribs_update(bld, 0);
 }
 
 
@@ -451,10 +452,20 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
  * Advance the position and inputs to the given quad within the block.
  */
 void
-lp_build_interp_soa_update(struct lp_build_interp_soa_context *bld,
-                           int quad_index)
+lp_build_interp_soa_update_inputs(struct lp_build_interp_soa_context *bld,
+                                  int quad_index)
+{
+   assert(quad_index < 4);
+
+   attribs_update(bld, quad_index, 1, bld->num_attribs);
+}
+
+void
+lp_build_interp_soa_update_pos(struct lp_build_interp_soa_context *bld,
+                                  int quad_index)
 {
    assert(quad_index < 4);
 
-   attribs_update(bld, quad_index);
+   attribs_update(bld, quad_index, 0, 1);
 }
+
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.h b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
index 3054030f73..6588f7f275 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
@@ -89,7 +89,11 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
                          LLVMValueRef y);
 
 void
-lp_build_interp_soa_update(struct lp_build_interp_soa_context *bld,
+lp_build_interp_soa_update_inputs(struct lp_build_interp_soa_context *bld,
+                           int quad_index);
+
+void
+lp_build_interp_soa_update_pos(struct lp_build_interp_soa_context *bld,
                            int quad_index);
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 3ce8be5a0a..0530c61323 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -262,7 +262,7 @@ generate_fs(struct llvmpipe_context *lp,
             struct lp_type type,
             LLVMValueRef context_ptr,
             unsigned i,
-            const struct lp_build_interp_soa_context *interp,
+            struct lp_build_interp_soa_context *interp,
             struct lp_build_sampler_soa *sampler,
             LLVMValueRef *pmask,
             LLVMValueRef (*color)[4],
@@ -276,7 +276,7 @@ generate_fs(struct llvmpipe_context *lp,
    LLVMTypeRef vec_type;
    LLVMValueRef consts_ptr;
    LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][NUM_CHANNELS];
-   LLVMValueRef z = interp->pos[2];
+   LLVMValueRef z;
    LLVMValueRef stencil_refs[2];
    struct lp_build_flow_context *flow;
    struct lp_build_mask_context mask;
@@ -307,7 +307,6 @@ generate_fs(struct llvmpipe_context *lp,
 	 lp_build_flow_scope_declare(flow, &color[cbuf][chan]);
       }
    }
-   lp_build_flow_scope_declare(flow, &z);
 
    /* do triangle edge testing */
    if (partial_mask) {
@@ -321,6 +320,13 @@ generate_fs(struct llvmpipe_context *lp,
    /* 'mask' will control execution based on quad's pixel alive/killed state */
    lp_build_mask_begin(&mask, flow, type, *pmask);
 
+   lp_build_interp_soa_update_pos(interp, i);
+
+   /* Try to avoid the 1/w for quads where mask is zero.  TODO: avoid
+    * this for depth-fail quads also.
+    */
+   z = interp->pos[2];
+
    early_depth_stencil_test =
       (key->depth.enabled || key->stencil[0].enabled) &&
       !key->alpha.enabled &&
@@ -332,6 +338,8 @@ generate_fs(struct llvmpipe_context *lp,
                              type, &mask,
                              stencil_refs, z, depth_ptr, facing, counter);
 
+   lp_build_interp_soa_update_inputs(interp, i);
+
    lp_build_tgsi_soa(builder, tokens, type, &mask,
                      consts_ptr, interp->pos, interp->inputs,
                      outputs, sampler, &shader->info);
@@ -621,9 +629,6 @@ generate_fragment(struct llvmpipe_context *lp,
       LLVMValueRef out_color[PIPE_MAX_COLOR_BUFS][NUM_CHANNELS];
       LLVMValueRef depth_ptr_i;
 
-      if(i != 0)
-         lp_build_interp_soa_update(&interp, i);
-
       depth_ptr_i = LLVMBuildGEP(builder, depth_ptr, &index, 1, "");
 
       generate_fs(lp, shader, key,
-- 
cgit v1.2.3


From 40d7be52619fbff2479dcdf56929e3e0c5b12e72 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Thu, 7 Oct 2010 18:59:54 +0100
Subject: llvmpipe: use alloca for fs color outputs

Don't try to emit our own phis; let llvm mem2reg do it for us.
---
 src/gallium/drivers/llvmpipe/lp_state_fs.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 0530c61323..f75ae284cb 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -303,8 +303,7 @@ generate_fs(struct llvmpipe_context *lp,
    /* Declare the color and z variables */
    for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
       for(chan = 0; chan < NUM_CHANNELS; ++chan) {
-	 color[cbuf][chan] = LLVMGetUndef(vec_type);
-	 lp_build_flow_scope_declare(flow, &color[cbuf][chan]);
+	 color[cbuf][chan] = lp_build_alloca(builder, vec_type, "color");
       }
    }
 
@@ -369,7 +368,7 @@ generate_fs(struct llvmpipe_context *lp,
                                          &mask, alpha, alpha_ref_value);
                   }
 
-		  color[cbuf][chan] = out;
+                  LLVMBuildStore(builder, out, color[cbuf][chan]);
                   break;
                }
 
@@ -665,9 +664,18 @@ generate_fragment(struct llvmpipe_context *lp,
        * Convert the fs's output color and mask to fit to the blending type. 
        */
       for(chan = 0; chan < NUM_CHANNELS; ++chan) {
+         LLVMValueRef fs_color_vals[LP_MAX_VECTOR_LENGTH];
+         
+         for (i = 0; i < num_fs; i++) {
+            fs_color_vals[i] =
+               LLVMBuildLoad(builder, fs_out_color[cbuf][chan][i], "fs_color_vals");
+         }
+
 	 lp_build_conv(builder, fs_type, blend_type,
-		       fs_out_color[cbuf][chan], num_fs,
+                       fs_color_vals,
+                       num_fs,
 		       &blend_in_color[chan], 1);
+
 	 lp_build_name(blend_in_color[chan], "color%d.%c", cbuf, "rgba"[chan]);
       }
 
-- 
cgit v1.2.3


From 954965366fee3fa2eec8a11b6663d4cf218e1d5d Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Thu, 7 Oct 2010 19:01:12 +0100
Subject: llvmpipe: dump fragment shader ir and asm when LP_DEBUG=fs

Better than GALLIVM_DEBUG if you're only interested in fragment shaders.
---
 src/gallium/drivers/llvmpipe/lp_state_fs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index f75ae284cb..07b4f74dbc 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -730,7 +730,7 @@ generate_fragment(struct llvmpipe_context *lp,
    /* Apply optimizations to LLVM IR */
    LLVMRunFunctionPassManager(screen->pass, function);
 
-   if (gallivm_debug & GALLIVM_DEBUG_IR) {
+   if ((gallivm_debug & GALLIVM_DEBUG_IR) || (LP_DEBUG & DEBUG_FS)) {
       /* Print the LLVM IR to stderr */
       lp_debug_dump_value(function);
       debug_printf("\n");
@@ -744,7 +744,7 @@ generate_fragment(struct llvmpipe_context *lp,
 
       variant->jit_function[partial_mask] = (lp_jit_frag_func)pointer_to_func(f);
 
-      if (gallivm_debug & GALLIVM_DEBUG_ASM) {
+      if ((gallivm_debug & GALLIVM_DEBUG_ASM) || (LP_DEBUG & DEBUG_FS)) {
          lp_disassemble(f);
       }
       lp_func_delete_body(function);
-- 
cgit v1.2.3


From d2cf757f44f4ee5554243f3279483a25886d9927 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Wed, 6 Oct 2010 18:21:56 +0100
Subject: gallivm: specialized x8z24 depthtest path

Avoid unnecessary masking of non-existent stencil component.
---
 src/gallium/drivers/llvmpipe/lp_bld_depth.c | 95 ++++++++++++++++++++++++++++-
 src/gallium/drivers/llvmpipe/lp_state_fs.c  | 30 +--------
 2 files changed, 94 insertions(+), 31 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index 7eabe0508d..09b82fbe9b 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -71,6 +71,7 @@
 #include "gallivm/lp_bld_arit.h"
 #include "gallivm/lp_bld_bitarit.h"
 #include "gallivm/lp_bld_const.h"
+#include "gallivm/lp_bld_conv.h"
 #include "gallivm/lp_bld_logic.h"
 #include "gallivm/lp_bld_flow.h"
 #include "gallivm/lp_bld_intr.h"
@@ -446,7 +447,7 @@ lp_build_occlusion_count(LLVMBuilderRef builder,
  * \param format_desc  description of the depth/stencil surface
  * \param mask  the alive/dead pixel mask for the quad (vector)
  * \param stencil_refs  the front/back stencil ref values (scalar)
- * \param z_src  the incoming depth/stencil values (a 2x2 quad)
+ * \param z_src  the incoming depth/stencil values (a 2x2 quad, float32)
  * \param zs_dst_ptr  pointer to depth/stencil values in framebuffer
  * \param facing  contains float value indicating front/back facing polygon
  */
@@ -454,7 +455,7 @@ void
 lp_build_depth_stencil_test(LLVMBuilderRef builder,
                             const struct pipe_depth_state *depth,
                             const struct pipe_stencil_state stencil[2],
-                            struct lp_type type,
+                            struct lp_type z_src_type,
                             const struct util_format_description *format_desc,
                             struct lp_build_mask_context *mask,
                             LLVMValueRef stencil_refs[2],
@@ -463,6 +464,7 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
                             LLVMValueRef face,
                             LLVMValueRef counter)
 {
+   struct lp_type type;
    struct lp_build_context bld;
    struct lp_build_context sbld;
    struct lp_type s_type;
@@ -473,6 +475,95 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
    LLVMValueRef orig_mask = mask->value;
    LLVMValueRef front_facing = NULL;
 
+   /* Prototype a simpler path:
+    */
+   if (z_src_type.floating &&
+       format_desc->format == PIPE_FORMAT_X8Z24_UNORM &&
+       depth->enabled) 
+   {
+      LLVMValueRef zscaled;
+      LLVMValueRef const_ffffff_float;
+      LLVMValueRef const_8_int;
+      LLVMTypeRef int32_vec_type;
+
+      /* We know the values in z_dst are all >= 0, so allow
+       * lp_build_compare to use signed compare intrinsics:
+       */
+      type.floating = 0;
+      type.fixed = 0;
+      type.sign = 1;
+      type.norm = 1;
+      type.width = 32;
+      type.length = z_src_type.length;
+
+      int32_vec_type = LLVMVectorType(LLVMInt32Type(), z_src_type.length);
+
+      const_8_int = lp_build_const_int_vec(type, 8);
+      const_ffffff_float = lp_build_const_vec(z_src_type, (float)0xffffff);
+
+      zscaled = LLVMBuildFMul(builder, z_src, const_ffffff_float, "zscaled");
+      z_src = LLVMBuildFPToSI(builder, zscaled, int32_vec_type, "z_src");
+      
+      /* Load current z/stencil value from z/stencil buffer */
+      z_dst = LLVMBuildLoad(builder, zs_dst_ptr, "zsbufval");
+      z_dst = LLVMBuildLShr(builder, z_dst, const_8_int, "z_dst");
+
+      /* compare src Z to dst Z, returning 'pass' mask */
+      z_pass = lp_build_compare(builder,
+                                type,
+                                depth->func, z_src, z_dst);
+
+      lp_build_mask_update(mask, z_pass);
+
+      /* No need to worry about old stencil contents, just blend the
+       * old and new values and shift into the correct position for
+       * storage.
+       */
+      if (depth->writemask) {
+         type.sign = 0;
+         lp_build_context_init(&bld, builder, type);
+
+         z_dst = lp_build_select(&bld, mask->value, z_src, z_dst);
+         z_dst = LLVMBuildShl(builder, z_dst, const_8_int, "z_dst");
+         LLVMBuildStore(builder, z_dst, zs_dst_ptr);
+      }
+
+      if (counter)
+         lp_build_occlusion_count(builder, type, mask->value, counter);
+
+      return;
+   }
+
+   /*
+    * Depths are expected to be between 0 and 1, even if they are stored in
+    * floats. Setting these bits here will ensure that the lp_build_conv() call
+    * below won't try to unnecessarily clamp the incoming values.
+    */
+   if(z_src_type.floating) {
+      z_src_type.sign = FALSE;
+      z_src_type.norm = TRUE;
+   }
+   else {
+      assert(!z_src_type.sign);
+      assert(z_src_type.norm);
+   }
+
+   /* Pick the depth type. */
+   type = lp_depth_type(format_desc, z_src_type.width*z_src_type.length);
+
+   /* FIXME: Cope with a depth test type with a different bit width. */
+   assert(type.width == z_src_type.width);
+   assert(type.length == z_src_type.length);
+
+   /* Convert fragment Z from float to integer */
+   lp_build_conv(builder, z_src_type, type, &z_src, 1, &z_src, 1);
+
+   zs_dst_ptr = LLVMBuildBitCast(builder,
+                                 zs_dst_ptr,
+                                 LLVMPointerType(lp_build_vec_type(type), 0), "");
+
+
+
    /* Sanity checking */
    {
       const unsigned z_swizzle = format_desc->swizzle[0];
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 07b4f74dbc..b7a51cd667 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -119,7 +119,6 @@ generate_depth_stencil(LLVMBuilderRef builder,
                        LLVMValueRef counter)
 {
    const struct util_format_description *format_desc;
-   struct lp_type dst_type;
 
    if (!key->depth.enabled && !key->stencil[0].enabled && !key->stencil[1].enabled)
       return;
@@ -127,37 +126,10 @@ generate_depth_stencil(LLVMBuilderRef builder,
    format_desc = util_format_description(key->zsbuf_format);
    assert(format_desc);
 
-   /*
-    * Depths are expected to be between 0 and 1, even if they are stored in
-    * floats. Setting these bits here will ensure that the lp_build_conv() call
-    * below won't try to unnecessarily clamp the incoming values.
-    */
-   if(src_type.floating) {
-      src_type.sign = FALSE;
-      src_type.norm = TRUE;
-   }
-   else {
-      assert(!src_type.sign);
-      assert(src_type.norm);
-   }
-
-   /* Pick the depth type. */
-   dst_type = lp_depth_type(format_desc, src_type.width*src_type.length);
-
-   /* FIXME: Cope with a depth test type with a different bit width. */
-   assert(dst_type.width == src_type.width);
-   assert(dst_type.length == src_type.length);
-
-   /* Convert fragment Z from float to integer */
-   lp_build_conv(builder, src_type, dst_type, &src, 1, &src, 1);
-
-   dst_ptr = LLVMBuildBitCast(builder,
-                              dst_ptr,
-                              LLVMPointerType(lp_build_vec_type(dst_type), 0), "");
    lp_build_depth_stencil_test(builder,
                                &key->depth,
                                key->stencil,
-                               dst_type,
+                               src_type,
                                format_desc,
                                mask,
                                stencil_refs,
-- 
cgit v1.2.3


From aa4cb5e2d8d48c7dcc9653c61a9e25494e3e7b2a Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Thu, 7 Oct 2010 15:01:07 +0100
Subject: llvmpipe: try to be sensible about whether to branch after mask
 updates

Don't branch more than once in quick succession.  Don't branch at the
end of the shader.
---
 src/gallium/auxiliary/gallivm/lp_bld_flow.c     |  6 +--
 src/gallium/auxiliary/gallivm/lp_bld_flow.h     |  3 ++
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c | 11 +++-
 src/gallium/drivers/llvmpipe/lp_bld_alpha.c     |  6 ++-
 src/gallium/drivers/llvmpipe/lp_bld_alpha.h     |  3 +-
 src/gallium/drivers/llvmpipe/lp_bld_depth.c     | 14 ++++-
 src/gallium/drivers/llvmpipe/lp_bld_depth.h     |  3 +-
 src/gallium/drivers/llvmpipe/lp_state_fs.c      | 69 ++++++++++++++++---------
 8 files changed, 80 insertions(+), 35 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.c b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
index cd5fbc2463..1ec33c742e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_flow.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
@@ -450,7 +450,7 @@ lp_build_flow_skip_end(struct lp_build_flow_context *flow)
 /**
  * Check if the mask predicate is zero.  If so, jump to the end of the block.
  */
-static void
+void
 lp_build_mask_check(struct lp_build_mask_context *mask)
 {
    LLVMBuilderRef builder = mask->flow->builder;
@@ -490,8 +490,6 @@ lp_build_mask_begin(struct lp_build_mask_context *mask,
    lp_build_flow_scope_begin(flow);
    lp_build_flow_scope_declare(flow, &mask->value);
    lp_build_flow_skip_begin(flow);
-
-   lp_build_mask_check(mask);
 }
 
 
@@ -505,8 +503,6 @@ lp_build_mask_update(struct lp_build_mask_context *mask,
                      LLVMValueRef value)
 {
    mask->value = LLVMBuildAnd( mask->flow->builder, mask->value, value, "");
-
-   lp_build_mask_check(mask);
 }
 
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.h b/src/gallium/auxiliary/gallivm/lp_bld_flow.h
index fffb493a93..095c781ec5 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_flow.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.h
@@ -94,6 +94,9 @@ void
 lp_build_mask_update(struct lp_build_mask_context *mask,
                      LLVMValueRef value);
 
+void
+lp_build_mask_check(struct lp_build_mask_context *mask);
+
 LLVMValueRef
 lp_build_mask_end(struct lp_build_mask_context *mask);
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 441aebae29..03020a62f8 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -959,8 +959,13 @@ emit_kil(
       }
    }
 
-   if(mask)
+   if(mask) {
       lp_build_mask_update(bld->mask, mask);
+
+      /* XXX: figure out if we are at the end of the shader and skip this:
+       */
+      lp_build_mask_check(bld->mask);
+   }
 }
 
 
@@ -987,6 +992,10 @@ emit_kilp(struct lp_build_tgsi_soa_context *bld,
    }
 
    lp_build_mask_update(bld->mask, mask);
+
+   /* XXX: figure out if we are at the end of the shader and skip this:
+    */
+   lp_build_mask_check(bld->mask);
 }
 
 static void
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_alpha.c b/src/gallium/drivers/llvmpipe/lp_bld_alpha.c
index e28efe778f..e50643790c 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_alpha.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_alpha.c
@@ -48,7 +48,8 @@ lp_build_alpha_test(LLVMBuilderRef builder,
                     struct lp_type type,
                     struct lp_build_mask_context *mask,
                     LLVMValueRef alpha,
-                    LLVMValueRef ref)
+                    LLVMValueRef ref,
+                    boolean do_branch)
 {
    struct lp_build_context bld;
    LLVMValueRef test;
@@ -60,4 +61,7 @@ lp_build_alpha_test(LLVMBuilderRef builder,
    lp_build_name(test, "alpha_mask");
 
    lp_build_mask_update(mask, test);
+
+   if (do_branch)
+      lp_build_mask_check(mask);
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_alpha.h b/src/gallium/drivers/llvmpipe/lp_bld_alpha.h
index 44603b418c..27ca8aad4d 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_alpha.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_alpha.h
@@ -48,7 +48,8 @@ lp_build_alpha_test(LLVMBuilderRef builder,
                     struct lp_type type,
                     struct lp_build_mask_context *mask,
                     LLVMValueRef alpha,
-                    LLVMValueRef ref);
+                    LLVMValueRef ref,
+                    boolean do_branch);
 
 
 #endif /* !LP_BLD_ALPHA_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index 09b82fbe9b..6b8ffb6ca2 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -462,7 +462,8 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
                             LLVMValueRef z_src,
                             LLVMValueRef zs_dst_ptr,
                             LLVMValueRef face,
-                            LLVMValueRef counter)
+                            LLVMValueRef counter,
+                            boolean do_branch)
 {
    struct lp_type type;
    struct lp_build_context bld;
@@ -515,6 +516,9 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
 
       lp_build_mask_update(mask, z_pass);
 
+      if (do_branch)
+         lp_build_mask_check(mask);
+
       /* No need to worry about old stencil contents, just blend the
        * old and new values and shift into the correct position for
        * storage.
@@ -701,6 +705,11 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
           * buffer values.  Don't need to update Z buffer values.
           */
          lp_build_mask_update(mask, z_pass);
+
+         if (do_branch) {
+            lp_build_mask_check(mask);
+            do_branch = FALSE;
+         }
       }
 
       if (depth->writemask) {
@@ -779,6 +788,9 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
    if (depth->enabled && stencil[0].enabled)
       lp_build_mask_update(mask, z_pass);
 
+   if (do_branch)
+      lp_build_mask_check(mask);
+
    if (counter)
       lp_build_occlusion_count(builder, type, mask->value, counter);
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.h b/src/gallium/drivers/llvmpipe/lp_bld_depth.h
index e257a5bd7d..2a63bb9378 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.h
@@ -61,7 +61,8 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
                             LLVMValueRef zs_src,
                             LLVMValueRef zs_dst_ptr,
                             LLVMValueRef facing,
-                            LLVMValueRef counter);
+                            LLVMValueRef counter,
+                            boolean do_branch);
 
 
 #endif /* !LP_BLD_DEPTH_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index b7a51cd667..df5dd83c87 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -116,7 +116,8 @@ generate_depth_stencil(LLVMBuilderRef builder,
                        LLVMValueRef src,
                        LLVMValueRef dst_ptr,
                        LLVMValueRef facing,
-                       LLVMValueRef counter)
+                       LLVMValueRef counter,
+                       boolean do_branch)
 {
    const struct util_format_description *format_desc;
 
@@ -136,7 +137,8 @@ generate_depth_stencil(LLVMBuilderRef builder,
                                src,
                                dst_ptr,
                                facing,
-                               counter);
+                               counter,
+                               do_branch);
 }
 
 
@@ -253,6 +255,9 @@ generate_fs(struct llvmpipe_context *lp,
    struct lp_build_flow_context *flow;
    struct lp_build_mask_context mask;
    boolean early_depth_stencil_test;
+   boolean simple_shader = (shader->info.file_count[TGSI_FILE_SAMPLER] == 0 &&
+                            shader->info.num_inputs < 3 &&
+                            shader->info.num_instructions < 8);
    unsigned attrib;
    unsigned chan;
    unsigned cbuf;
@@ -288,15 +293,6 @@ generate_fs(struct llvmpipe_context *lp,
       *pmask = lp_build_const_int_vec(type, ~0);
    }
 
-   /* 'mask' will control execution based on quad's pixel alive/killed state */
-   lp_build_mask_begin(&mask, flow, type, *pmask);
-
-   lp_build_interp_soa_update_pos(interp, i);
-
-   /* Try to avoid the 1/w for quads where mask is zero.  TODO: avoid
-    * this for depth-fail quads also.
-    */
-   z = interp->pos[2];
 
    early_depth_stencil_test =
       (key->depth.enabled || key->stencil[0].enabled) &&
@@ -304,10 +300,22 @@ generate_fs(struct llvmpipe_context *lp,
       !shader->info.uses_kill &&
       !shader->info.writes_z;
 
+   /* 'mask' will control execution based on quad's pixel alive/killed state */
+   lp_build_mask_begin(&mask, flow, type, *pmask);
+
+   if (!early_depth_stencil_test && !simple_shader)
+      lp_build_mask_check(&mask);
+
+   lp_build_interp_soa_update_pos(interp, i);
+   z = interp->pos[2];
+
    if (early_depth_stencil_test)
       generate_depth_stencil(builder, key,
                              type, &mask,
-                             stencil_refs, z, depth_ptr, facing, counter);
+                             stencil_refs, 
+                             z, depth_ptr,
+                             facing, counter,
+                             !simple_shader);
 
    lp_build_interp_soa_update_inputs(interp, i);
 
@@ -337,7 +345,7 @@ generate_fs(struct llvmpipe_context *lp,
                      alpha_ref_value = lp_jit_context_alpha_ref_value(builder, context_ptr);
                      alpha_ref_value = lp_build_broadcast(builder, vec_type, alpha_ref_value);
                      lp_build_alpha_test(builder, key->alpha.func, type,
-                                         &mask, alpha, alpha_ref_value);
+                                         &mask, alpha, alpha_ref_value, FALSE);
                   }
 
                   LLVMBuildStore(builder, out, color[cbuf][chan]);
@@ -356,7 +364,8 @@ generate_fs(struct llvmpipe_context *lp,
    if (!early_depth_stencil_test)
       generate_depth_stencil(builder, key,
                              type, &mask,
-                             stencil_refs, z, depth_ptr, facing, counter);
+                             stencil_refs, z, depth_ptr,
+                             facing, counter, FALSE);
 
    lp_build_mask_end(&mask);
 
@@ -386,7 +395,8 @@ generate_blend(const struct pipe_blend_state *blend,
                LLVMValueRef context_ptr,
                LLVMValueRef mask,
                LLVMValueRef *src,
-               LLVMValueRef dst_ptr)
+               LLVMValueRef dst_ptr,
+               boolean do_branch)
 {
    struct lp_build_context bld;
    struct lp_build_flow_context *flow;
@@ -401,9 +411,9 @@ generate_blend(const struct pipe_blend_state *blend,
    lp_build_context_init(&bld, builder, type);
 
    flow = lp_build_flow_create(builder);
-
-   /* we'll use this mask context to skip blending if all pixels are dead */
    lp_build_mask_begin(&mask_ctx, flow, type, mask);
+   if (do_branch)
+      lp_build_mask_check(&mask_ctx);
 
    vec_type = lp_build_vec_type(type);
 
@@ -670,14 +680,23 @@ generate_fragment(struct llvmpipe_context *lp,
       /*
        * Blending.
        */
-      generate_blend(&key->blend,
-                     rt,
-		     builder,
-		     blend_type,
-		     context_ptr,
-		     blend_mask,
-		     blend_in_color,
-		     color_ptr);
+      {
+         /* Could the 4x4 have been killed?
+          */
+         boolean do_branch = ((key->depth.enabled || key->stencil[0].enabled) &&
+                              !key->alpha.enabled &&
+                              !shader->info.uses_kill);
+
+         generate_blend(&key->blend,
+                        rt,
+                        builder,
+                        blend_type,
+                        context_ptr,
+                        blend_mask,
+                        blend_in_color,
+                        color_ptr,
+                        do_branch);
+      }
    }
 
 #ifdef PIPE_ARCH_X86
-- 
cgit v1.2.3


From 5b7eb868fde98388d80601d8dea39e679828f42f Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Sat, 9 Oct 2010 11:28:00 +0100
Subject: llvmpipe: clean up shader pre/postamble, try to catch more early-z

Specifically, can do early-depth-test even when alphatest or
kill-pixel are active, providing we defer the actual z write until the
final mask is available.

Improves demos/fire.c especially in the case where you get close to
the trees.
---
 src/gallium/drivers/llvmpipe/lp_bld_depth.c |  40 +++--
 src/gallium/drivers/llvmpipe/lp_bld_depth.h |  15 +-
 src/gallium/drivers/llvmpipe/lp_state_fs.c  | 241 +++++++++++++++++-----------
 3 files changed, 193 insertions(+), 103 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index 6b8ffb6ca2..8d9be2ebbb 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -410,7 +410,7 @@ get_s_shift_and_mask(const struct util_format_description *format_desc,
  * \param maskvalue is the depth test mask.
  * \param counter is a pointer of the uint32 counter.
  */
-static void
+void
 lp_build_occlusion_count(LLVMBuilderRef builder,
                          struct lp_type type,
                          LLVMValueRef maskvalue,
@@ -462,7 +462,7 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
                             LLVMValueRef z_src,
                             LLVMValueRef zs_dst_ptr,
                             LLVMValueRef face,
-                            LLVMValueRef counter,
+                            LLVMValueRef *zs_value,
                             boolean do_branch)
 {
    struct lp_type type;
@@ -524,17 +524,14 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
        * storage.
        */
       if (depth->writemask) {
-         type.sign = 0;
+         type.sign = 1;
          lp_build_context_init(&bld, builder, type);
 
          z_dst = lp_build_select(&bld, mask->value, z_src, z_dst);
          z_dst = LLVMBuildShl(builder, z_dst, const_8_int, "z_dst");
-         LLVMBuildStore(builder, z_dst, zs_dst_ptr);
+         *zs_value = z_dst;
       }
 
-      if (counter)
-         lp_build_occlusion_count(builder, type, mask->value, counter);
-
       return;
    }
 
@@ -779,7 +776,7 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
       else
          zs_dst = stencil_vals;
 
-      LLVMBuildStore(builder, zs_dst, zs_dst_ptr);
+      *zs_value = zs_dst;
    }
 
    if (s_pass_mask)
@@ -791,6 +788,29 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
    if (do_branch)
       lp_build_mask_check(mask);
 
-   if (counter)
-      lp_build_occlusion_count(builder, type, mask->value, counter);
+}
+
+
+
+void
+lp_build_deferred_depth_write(LLVMBuilderRef builder,
+                              struct lp_type z_src_type,
+                              const struct util_format_description *format_desc,
+                              struct lp_build_mask_context *mask,
+                              LLVMValueRef zs_dst_ptr,
+                              LLVMValueRef zs_value)
+{
+   struct lp_type type;
+   struct lp_build_context bld;
+   LLVMValueRef z_dst;
+
+   /* XXX: pointlessly redo type logic:
+    */
+   type = lp_depth_type(format_desc, z_src_type.width*z_src_type.length);
+   lp_build_context_init(&bld, builder, type);
+
+   z_dst = LLVMBuildLoad(builder, zs_dst_ptr, "zsbufval");
+   z_dst = lp_build_select(&bld, mask->value, zs_value, z_dst);
+
+   LLVMBuildStore(builder, z_dst, zs_dst_ptr);
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.h b/src/gallium/drivers/llvmpipe/lp_bld_depth.h
index 2a63bb9378..0f89668123 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.h
@@ -61,8 +61,21 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
                             LLVMValueRef zs_src,
                             LLVMValueRef zs_dst_ptr,
                             LLVMValueRef facing,
-                            LLVMValueRef counter,
+                            LLVMValueRef *zs_value,
                             boolean do_branch);
 
+void
+lp_build_deferred_depth_write(LLVMBuilderRef builder,
+                              struct lp_type z_src_type,
+                              const struct util_format_description *format_desc,
+                              struct lp_build_mask_context *mask,
+                              LLVMValueRef zs_dst_ptr,
+                              LLVMValueRef zs_value);
+
+void
+lp_build_occlusion_count(LLVMBuilderRef builder,
+                         struct lp_type type,
+                         LLVMValueRef maskvalue,
+                         LLVMValueRef counter);
 
 #endif /* !LP_BLD_DEPTH_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index df5dd83c87..f45f36f633 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -104,43 +104,6 @@
 static unsigned fs_no = 0;
 
 
-/**
- * Generate the depth /stencil test code.
- */
-static void
-generate_depth_stencil(LLVMBuilderRef builder,
-                       const struct lp_fragment_shader_variant_key *key,
-                       struct lp_type src_type,
-                       struct lp_build_mask_context *mask,
-                       LLVMValueRef stencil_refs[2],
-                       LLVMValueRef src,
-                       LLVMValueRef dst_ptr,
-                       LLVMValueRef facing,
-                       LLVMValueRef counter,
-                       boolean do_branch)
-{
-   const struct util_format_description *format_desc;
-
-   if (!key->depth.enabled && !key->stencil[0].enabled && !key->stencil[1].enabled)
-      return;
-
-   format_desc = util_format_description(key->zsbuf_format);
-   assert(format_desc);
-
-   lp_build_depth_stencil_test(builder,
-                               &key->depth,
-                               key->stencil,
-                               src_type,
-                               format_desc,
-                               mask,
-                               stencil_refs,
-                               src,
-                               dst_ptr,
-                               facing,
-                               counter,
-                               do_branch);
-}
-
 
 /**
  * Expand the relevent bits of mask_input to a 4-dword mask for the 
@@ -222,6 +185,26 @@ generate_quad_mask(LLVMBuilderRef builder,
 }
 
 
+#define EARLY_DEPTH_TEST  0x1
+#define LATE_DEPTH_TEST   0x2
+#define EARLY_DEPTH_WRITE 0x4
+#define LATE_DEPTH_WRITE  0x8
+
+static int
+find_output_by_semantic( const struct tgsi_shader_info *info,
+			 unsigned semantic,
+			 unsigned index )
+{
+   int i;
+
+   for (i = 0; i < info->num_outputs; i++)
+      if (info->output_semantic_name[i] == semantic &&
+	  info->output_semantic_index[i] == index)
+	 return i;
+
+   return -1;
+}
+
 
 /**
  * Generate the fragment shader, depth/stencil test, and alpha tests.
@@ -246,21 +229,53 @@ generate_fs(struct llvmpipe_context *lp,
             LLVMValueRef mask_input,
             LLVMValueRef counter)
 {
+   const struct util_format_description *zs_format_desc = NULL;
    const struct tgsi_token *tokens = shader->base.tokens;
    LLVMTypeRef vec_type;
    LLVMValueRef consts_ptr;
    LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][NUM_CHANNELS];
    LLVMValueRef z;
+   LLVMValueRef zs_value = NULL;
    LLVMValueRef stencil_refs[2];
    struct lp_build_flow_context *flow;
    struct lp_build_mask_context mask;
-   boolean early_depth_stencil_test;
    boolean simple_shader = (shader->info.file_count[TGSI_FILE_SAMPLER] == 0 &&
                             shader->info.num_inputs < 3 &&
                             shader->info.num_instructions < 8);
    unsigned attrib;
    unsigned chan;
    unsigned cbuf;
+   unsigned depth_mode;
+
+   if (key->depth.enabled ||
+       key->stencil[0].enabled ||
+       key->stencil[1].enabled) {
+
+      zs_format_desc = util_format_description(key->zsbuf_format);
+      assert(zs_format_desc);
+
+      if (!shader->info.writes_z) {
+         if (key->alpha.enabled || shader->info.uses_kill)
+            /* With alpha test and kill, can do the depth test early
+             * and hopefully eliminate some quads.  But need to do a
+             * special deferred depth write once the final mask value
+             * is known.
+             */
+            depth_mode = EARLY_DEPTH_TEST | LATE_DEPTH_WRITE;
+         else
+            depth_mode = EARLY_DEPTH_TEST | EARLY_DEPTH_WRITE;
+      }
+      else {
+         depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
+      }
+
+      if (!(key->depth.enabled && key->depth.writemask) &&
+          !(key->stencil[0].enabled && key->stencil[0].writemask))
+         depth_mode &= ~(LATE_DEPTH_WRITE | EARLY_DEPTH_WRITE);
+   }
+   else {
+      depth_mode = 0;
+   }
 
    assert(i < 4);
 
@@ -293,79 +308,121 @@ generate_fs(struct llvmpipe_context *lp,
       *pmask = lp_build_const_int_vec(type, ~0);
    }
 
-
-   early_depth_stencil_test =
-      (key->depth.enabled || key->stencil[0].enabled) &&
-      !key->alpha.enabled &&
-      !shader->info.uses_kill &&
-      !shader->info.writes_z;
-
    /* 'mask' will control execution based on quad's pixel alive/killed state */
    lp_build_mask_begin(&mask, flow, type, *pmask);
 
-   if (!early_depth_stencil_test && !simple_shader)
+   if (!(depth_mode & EARLY_DEPTH_TEST) && !simple_shader)
       lp_build_mask_check(&mask);
 
    lp_build_interp_soa_update_pos(interp, i);
    z = interp->pos[2];
 
-   if (early_depth_stencil_test)
-      generate_depth_stencil(builder, key,
-                             type, &mask,
-                             stencil_refs, 
-                             z, depth_ptr,
-                             facing, counter,
-                             !simple_shader);
+   if (depth_mode & EARLY_DEPTH_TEST) {
+      lp_build_depth_stencil_test(builder,
+                                  &key->depth,
+                                  key->stencil,
+                                  type,
+                                  zs_format_desc,
+                                  &mask,
+                                  stencil_refs,
+                                  z,
+                                  depth_ptr, facing,
+                                  &zs_value,
+                                  !simple_shader);
+
+      if (depth_mode & EARLY_DEPTH_WRITE)
+         LLVMBuildStore(builder, zs_value, depth_ptr);
+   }
 
    lp_build_interp_soa_update_inputs(interp, i);
-
+   
+   /* Build the actual shader */
    lp_build_tgsi_soa(builder, tokens, type, &mask,
                      consts_ptr, interp->pos, interp->inputs,
                      outputs, sampler, &shader->info);
 
-   /* loop over fragment shader outputs/results */
-   for (attrib = 0; attrib < shader->info.num_outputs; ++attrib) {
-      for(chan = 0; chan < NUM_CHANNELS; ++chan) {
-         if(outputs[attrib][chan]) {
+
+   /* Alpha test */
+   if (key->alpha.enabled) {
+      int color0 = find_output_by_semantic(&shader->info,
+                                           TGSI_SEMANTIC_COLOR,
+                                           0);
+
+      if (color0 != -1) {
+         LLVMValueRef alpha = LLVMBuildLoad(builder, outputs[color0][3], "alpha");
+         LLVMValueRef alpha_ref_value;
+
+         alpha_ref_value = lp_jit_context_alpha_ref_value(builder, context_ptr);
+         alpha_ref_value = lp_build_broadcast(builder, vec_type, alpha_ref_value);
+
+         lp_build_alpha_test(builder, key->alpha.func, type,
+                             &mask, alpha, alpha_ref_value,
+                             (depth_mode & LATE_DEPTH_TEST) != 0);
+      }
+   }
+
+   /* Late Z test */
+   if (depth_mode & LATE_DEPTH_TEST) { 
+      int pos0 = find_output_by_semantic(&shader->info,
+                                         TGSI_SEMANTIC_POSITION,
+                                         0);
+         
+      if (pos0 != -1) {
+         z = LLVMBuildLoad(builder, outputs[pos0][2], "z");
+         lp_build_name(z, "output%u.%u.%c", i, pos0, "xyzw"[chan]);
+      }
+
+      lp_build_depth_stencil_test(builder,
+                                  &key->depth,
+                                  key->stencil,
+                                  type,
+                                  zs_format_desc,
+                                  &mask,
+                                  stencil_refs,
+                                  z,
+                                  depth_ptr, facing,
+                                  &zs_value,
+                                  !simple_shader);
+      /* Late Z write */
+      if (depth_mode & LATE_DEPTH_WRITE)
+         LLVMBuildStore(builder, zs_value, depth_ptr);
+   }
+   else if ((depth_mode & EARLY_DEPTH_TEST) &&
+            (depth_mode & LATE_DEPTH_WRITE))
+   {
+      /* Need to apply a reduced mask to the depth write.  Reload the
+       * depth value, update from zs_value with the new mask value and
+       * write that out.
+       */
+      lp_build_deferred_depth_write(builder,
+                                    type,
+                                    zs_format_desc,
+                                    &mask,
+                                    depth_ptr,
+                                    zs_value);
+   }
+
+
+   /* Color write  */
+   for (attrib = 0; attrib < shader->info.num_outputs; ++attrib)
+   {
+      if (shader->info.output_semantic_name[attrib] == TGSI_SEMANTIC_COLOR)
+      {
+         unsigned cbuf = shader->info.output_semantic_index[attrib];
+         for(chan = 0; chan < NUM_CHANNELS; ++chan)
+         {
+            /* XXX: just initialize outputs to point at colors[] and
+             * skip this.
+             */
             LLVMValueRef out = LLVMBuildLoad(builder, outputs[attrib][chan], "");
-            lp_build_name(out, "output%u.%u.%c", i, attrib, "xyzw"[chan]);
-
-            switch (shader->info.output_semantic_name[attrib]) {
-            case TGSI_SEMANTIC_COLOR:
-               {
-                  unsigned cbuf = shader->info.output_semantic_index[attrib];
-
-                  lp_build_name(out, "color%u.%u.%c", i, attrib, "rgba"[chan]);
-
-                  /* Alpha test */
-		  /* XXX: should only test the final assignment to alpha */
-                  if (cbuf == 0 && chan == 3 && key->alpha.enabled) {
-                     LLVMValueRef alpha = out;
-                     LLVMValueRef alpha_ref_value;
-                     alpha_ref_value = lp_jit_context_alpha_ref_value(builder, context_ptr);
-                     alpha_ref_value = lp_build_broadcast(builder, vec_type, alpha_ref_value);
-                     lp_build_alpha_test(builder, key->alpha.func, type,
-                                         &mask, alpha, alpha_ref_value, FALSE);
-                  }
-
-                  LLVMBuildStore(builder, out, color[cbuf][chan]);
-                  break;
-               }
-
-            case TGSI_SEMANTIC_POSITION:
-               if(chan == 2)
-                  z = out;
-               break;
-            }
+            lp_build_name(out, "color%u.%u.%c", i, attrib, "rgba"[chan]);
+            LLVMBuildStore(builder, out, color[cbuf][chan]);
          }
       }
    }
 
-   if (!early_depth_stencil_test)
-      generate_depth_stencil(builder, key,
-                             type, &mask,
-                             stencil_refs, z, depth_ptr,
-                             facing, counter, FALSE);
+   if (counter)
+      lp_build_occlusion_count(builder, type, mask.value, counter);
 
    lp_build_mask_end(&mask);
 
-- 
cgit v1.2.3


From 2de720dc8ff89676aa7bb5eb74aeb6d44e028fa2 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Fri, 1 Oct 2010 15:13:51 +0100
Subject: llvmpipe: simplified SSE2 swz/unswz routines

We've been using these in the linear path for a while now.  Based on
Chris's SSSE3 code, but using only SSE2 opcodes.  Speed seems to be
identical, but the code is simpler and removes the dependency on SSSE3.

Should be easier to extend to other rgba8 formats.
---
 src/gallium/drivers/llvmpipe/SConscript     |   8 +-
 src/gallium/drivers/llvmpipe/lp_tile_soa.py | 245 ++++++++++++----------------
 2 files changed, 107 insertions(+), 146 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/SConscript b/src/gallium/drivers/llvmpipe/SConscript
index 650435f0f1..774ad91a07 100644
--- a/src/gallium/drivers/llvmpipe/SConscript
+++ b/src/gallium/drivers/llvmpipe/SConscript
@@ -27,13 +27,7 @@ env.Depends('lp_tile_soa.c', [
 ])
 
 
-# Only enable SSSE3 for lp_tile_soa_sse3.c
-ssse3_env = env.Clone()
-if env['gcc'] \
-   and distutils.version.LooseVersion(env['CCVERSION']) >= distutils.version.LooseVersion('4.3') \
-   and env['machine'] in ('x86', 'x86_64') :
-    ssse3_env.Append(CCFLAGS = ['-mssse3'])
-lp_tile_soa_os = ssse3_env.SharedObject('lp_tile_soa.c')
+lp_tile_soa_os = env.SharedObject('lp_tile_soa.c')
 
 
 llvmpipe = env.ConvenienceLibrary(
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_soa.py b/src/gallium/drivers/llvmpipe/lp_tile_soa.py
index 2ba39052ab..c76549cdad 100644
--- a/src/gallium/drivers/llvmpipe/lp_tile_soa.py
+++ b/src/gallium/drivers/llvmpipe/lp_tile_soa.py
@@ -295,87 +295,98 @@ def generate_ssse3():
 
 #include "util/u_sse.h"
 
+static INLINE void swz4( __m128i x, 
+                         __m128i y, 
+                         __m128i z, 
+                         __m128i w, 
+                         __m128i *a, 
+                         __m128i *b, 
+                         __m128i *c, 
+                         __m128i *d)
+{
+   __m128i i, j, k, l;
+   __m128i m, n, o, p;
+   __m128i e, f, g, h;
+
+   m = _mm_unpacklo_epi8(x,y);
+   n = _mm_unpackhi_epi8(x,y);
+   o = _mm_unpacklo_epi8(z,w);
+   p = _mm_unpackhi_epi8(z,w);
+
+   i = _mm_unpacklo_epi16(m,n);
+   j = _mm_unpackhi_epi16(m,n);
+   k = _mm_unpacklo_epi16(o,p);
+   l = _mm_unpackhi_epi16(o,p);
+
+   e = _mm_unpacklo_epi8(i,j);
+   f = _mm_unpackhi_epi8(i,j);
+   g = _mm_unpacklo_epi8(k,l);
+   h = _mm_unpackhi_epi8(k,l);
+
+   *a = _mm_unpacklo_epi64(e,g);
+   *b = _mm_unpackhi_epi64(e,g);
+   *c = _mm_unpacklo_epi64(f,h);
+   *d = _mm_unpackhi_epi64(f,h);
+}
+
+static INLINE void unswz4( __m128i a, 
+                           __m128i b, 
+                           __m128i c, 
+                           __m128i d, 
+                           __m128i *x, 
+                           __m128i *y, 
+                           __m128i *z, 
+                           __m128i *w)
+{
+   __m128i i, j, k, l;
+   __m128i m, n, o, p;
+
+   i = _mm_unpacklo_epi8(a,b);
+   j = _mm_unpackhi_epi8(a,b);
+   k = _mm_unpacklo_epi8(c,d);
+   l = _mm_unpackhi_epi8(c,d);
+
+   m = _mm_unpacklo_epi16(i,k);
+   n = _mm_unpackhi_epi16(i,k);
+   o = _mm_unpacklo_epi16(j,l);
+   p = _mm_unpackhi_epi16(j,l);
+
+   *x = _mm_unpacklo_epi64(m,n);
+   *y = _mm_unpackhi_epi64(m,n);
+   *z = _mm_unpacklo_epi64(o,p);
+   *w = _mm_unpackhi_epi64(o,p);
+}
+
 static void
 lp_tile_b8g8r8a8_unorm_swizzle_4ub_ssse3(uint8_t *dst,
                                          const uint8_t *src, unsigned src_stride,
                                          unsigned x0, unsigned y0)
 {
-
+   __m128i *dst128 = (__m128i *) dst;
    unsigned x, y;
-   __m128i *pdst = (__m128i*) dst;
-   const uint8_t *ysrc0 = src + y0*src_stride + x0*sizeof(uint32_t);
-   unsigned int tile_stridex = src_stride*(TILE_VECTOR_HEIGHT - 1) - sizeof(uint32_t)*TILE_VECTOR_WIDTH;
-   unsigned int tile_stridey = src_stride*TILE_VECTOR_HEIGHT;
-
-   const __m128i shuffle00 = _mm_setr_epi8(0x02,0x06,0xff,0xff,0x0a,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
-   const __m128i shuffle01 = _mm_setr_epi8(0x01,0x05,0xff,0xff,0x09,0x0d,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
-   const __m128i shuffle02 = _mm_setr_epi8(0x00,0x04,0xff,0xff,0x08,0x0c,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
-   const __m128i shuffle03 = _mm_setr_epi8(0x03,0x07,0xff,0xff,0x0b,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
-
-   const __m128i shuffle10 = _mm_setr_epi8(0xff,0xff,0x02,0x06,0xff,0xff,0x0a,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
-   const __m128i shuffle11 = _mm_setr_epi8(0xff,0xff,0x01,0x05,0xff,0xff,0x09,0x0d,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
-   const __m128i shuffle12 = _mm_setr_epi8(0xff,0xff,0x00,0x04,0xff,0xff,0x08,0x0c,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
-   const __m128i shuffle13 = _mm_setr_epi8(0xff,0xff,0x03,0x07,0xff,0xff,0x0b,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
-
-   const __m128i shuffle20 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x02,0x06,0xff,0xff,0x0a,0x0e,0xff,0xff);
-   const __m128i shuffle21 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x01,0x05,0xff,0xff,0x09,0x0d,0xff,0xff);
-   const __m128i shuffle22 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x04,0xff,0xff,0x08,0x0c,0xff,0xff);
-   const __m128i shuffle23 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x03,0x07,0xff,0xff,0x0b,0x0f,0xff,0xff);
-
-   const __m128i shuffle30 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x02,0x06,0xff,0xff,0x0a,0x0e);
-   const __m128i shuffle31 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x01,0x05,0xff,0xff,0x09,0x0d);
-   const __m128i shuffle32 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x04,0xff,0xff,0x08,0x0c);
-   const __m128i shuffle33 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x03,0x07,0xff,0xff,0x0b,0x0f);
-
-   for (y = 0; y < TILE_SIZE; y += TILE_VECTOR_HEIGHT) {
-      __m128i line0 = *(__m128i*)ysrc0;
-      const uint8_t *ysrc = ysrc0 + src_stride;
-      ysrc0 += tile_stridey;
-
-      for (x = 0; x < TILE_SIZE; x += TILE_VECTOR_WIDTH) {
-         __m128i r, g, b, a, line1;
-         line1 = *(__m128i*)ysrc;
-         PIPE_READ_WRITE_BARRIER();
-         ysrc += src_stride;
-         r = _mm_shuffle_epi8(line0, shuffle00);
-         g = _mm_shuffle_epi8(line0, shuffle01);
-         b = _mm_shuffle_epi8(line0, shuffle02);
-         a = _mm_shuffle_epi8(line0, shuffle03);
-
-         line0 = *(__m128i*)ysrc;
-         PIPE_READ_WRITE_BARRIER();
-         ysrc += src_stride;
-         r = _mm_or_si128(r, _mm_shuffle_epi8(line1, shuffle10));
-         g = _mm_or_si128(g, _mm_shuffle_epi8(line1, shuffle11));
-         b = _mm_or_si128(b, _mm_shuffle_epi8(line1, shuffle12));
-         a = _mm_or_si128(a, _mm_shuffle_epi8(line1, shuffle13));
-
-         line1 = *(__m128i*)ysrc;
-         PIPE_READ_WRITE_BARRIER();
-         ysrc -= tile_stridex;
-         r = _mm_or_si128(r, _mm_shuffle_epi8(line0, shuffle20));
-         g = _mm_or_si128(g, _mm_shuffle_epi8(line0, shuffle21));
-         b = _mm_or_si128(b, _mm_shuffle_epi8(line0, shuffle22));
-         a = _mm_or_si128(a, _mm_shuffle_epi8(line0, shuffle23));
-
-         if (x + 1 < TILE_SIZE) {
-            line0 = *(__m128i*)ysrc;
-            ysrc += src_stride;
-         }
-
-         PIPE_READ_WRITE_BARRIER();
-         r = _mm_or_si128(r, _mm_shuffle_epi8(line1, shuffle30));
-         g = _mm_or_si128(g, _mm_shuffle_epi8(line1, shuffle31));
-         b = _mm_or_si128(b, _mm_shuffle_epi8(line1, shuffle32));
-         a = _mm_or_si128(a, _mm_shuffle_epi8(line1, shuffle33));
-
-         *pdst++ = r;
-         *pdst++ = g;
-         *pdst++ = b;
-         *pdst++ = a;
+   
+   src += y0 * src_stride;
+   src += x0 * sizeof(uint32_t);
+
+   for (y = 0; y < TILE_SIZE; y += 4) {
+      const uint8_t *src_row = src;
+
+      for (x = 0; x < TILE_SIZE; x += 4) {
+         swz4(*(__m128i *) (src_row + 0 * src_stride),
+              *(__m128i *) (src_row + 1 * src_stride),
+              *(__m128i *) (src_row + 2 * src_stride),
+              *(__m128i *) (src_row + 3 * src_stride),
+              dst128 + 2,     /* b */
+              dst128 + 1,     /* g */
+              dst128 + 0,     /* r */
+              dst128 + 3);    /* a */
+
+         dst128 += 4;
+         src_row += sizeof(__m128i);
       }
-   }
 
+      src += 4 * src_stride;
+   }
 }
 
 static void
@@ -384,73 +395,29 @@ lp_tile_b8g8r8a8_unorm_unswizzle_4ub_ssse3(const uint8_t *src,
                                           unsigned x0, unsigned y0)
 {
    unsigned int x, y;
-   const __m128i *psrc = (__m128i*) src;
-   const __m128i *end = (__m128i*) (src + (y0 + TILE_SIZE - 1)*dst_stride + (x0 + TILE_SIZE - 1)*sizeof(uint32_t));
-   uint8_t *pdst = dst + y0 * dst_stride + x0 * sizeof(uint32_t);
-   __m128i c0 = *psrc++;
-   __m128i c1;
-
-   const __m128i shuffle00 = _mm_setr_epi8(0xff,0xff,0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05,0xff);
-   const __m128i shuffle01 = _mm_setr_epi8(0xff,0xff,0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07,0xff);
-   const __m128i shuffle02 = _mm_setr_epi8(0xff,0xff,0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d,0xff);
-   const __m128i shuffle03 = _mm_setr_epi8(0xff,0xff,0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f,0xff);
-
-   const __m128i shuffle10 = _mm_setr_epi8(0xff,0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05,0xff,0xff);
-   const __m128i shuffle11 = _mm_setr_epi8(0xff,0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07,0xff,0xff);
-   const __m128i shuffle12 = _mm_setr_epi8(0xff,0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d,0xff,0xff);
-   const __m128i shuffle13 = _mm_setr_epi8(0xff,0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f,0xff,0xff);
-
-   const __m128i shuffle20 = _mm_setr_epi8(0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05,0xff,0xff,0xff);
-   const __m128i shuffle21 = _mm_setr_epi8(0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07,0xff,0xff,0xff);
-   const __m128i shuffle22 = _mm_setr_epi8(0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d,0xff,0xff,0xff);
-   const __m128i shuffle23 = _mm_setr_epi8(0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f,0xff,0xff,0xff);
-
-   const __m128i shuffle30 = _mm_setr_epi8(0xff,0xff,0xff,0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05);
-   const __m128i shuffle31 = _mm_setr_epi8(0xff,0xff,0xff,0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07);
-   const __m128i shuffle32 = _mm_setr_epi8(0xff,0xff,0xff,0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d);
-   const __m128i shuffle33 = _mm_setr_epi8(0xff,0xff,0xff,0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f);
-
-   for (y = 0; y < TILE_SIZE; y += TILE_VECTOR_HEIGHT) {
-      __m128i *tile = (__m128i*) pdst;
-      pdst += dst_stride * TILE_VECTOR_HEIGHT;
-      for (x = 0; x < TILE_SIZE; x += TILE_VECTOR_WIDTH) {
-         uint8_t *linep = (uint8_t*) (tile++);
-         __m128i line0, line1, line2, line3;
-
-         c1 = *psrc++; /* r */
-         PIPE_READ_WRITE_BARRIER();
-         line0 = _mm_shuffle_epi8(c0, shuffle00);
-         line1 = _mm_shuffle_epi8(c0, shuffle01);
-         line2 = _mm_shuffle_epi8(c0, shuffle02);
-         line3 = _mm_shuffle_epi8(c0, shuffle03);
-
-         c0 = *psrc++; /* g */
-         PIPE_READ_WRITE_BARRIER();
-         line0 = _mm_or_si128(line0, _mm_shuffle_epi8(c1, shuffle10));
-         line1 = _mm_or_si128(line1, _mm_shuffle_epi8(c1, shuffle11));
-         line2 = _mm_or_si128(line2, _mm_shuffle_epi8(c1, shuffle12));
-         line3 = _mm_or_si128(line3, _mm_shuffle_epi8(c1, shuffle13));
-
-         c1 = *psrc++; /* b */
-         PIPE_READ_WRITE_BARRIER();
-         line0 = _mm_or_si128(line0, _mm_shuffle_epi8(c0, shuffle20));
-         line1 = _mm_or_si128(line1, _mm_shuffle_epi8(c0, shuffle21));
-         line2 = _mm_or_si128(line2, _mm_shuffle_epi8(c0, shuffle22));
-         line3 = _mm_or_si128(line3, _mm_shuffle_epi8(c0, shuffle23));
-
-         if (psrc != end)
-                 c0 = *psrc++; /* a */
-         PIPE_READ_WRITE_BARRIER();
-         line0 = _mm_or_si128(line0, _mm_shuffle_epi8(c1, shuffle30));
-         line1 = _mm_or_si128(line1, _mm_shuffle_epi8(c1, shuffle31));
-         line2 = _mm_or_si128(line2, _mm_shuffle_epi8(c1, shuffle32));
-         line3 = _mm_or_si128(line3, _mm_shuffle_epi8(c1, shuffle33));
-
-         *(__m128i*) (linep) = line0;
-         *(__m128i*) (((char*)linep) + dst_stride) = line1;
-         *(__m128i*) (((char*)linep) + 2 * dst_stride) = line2;
-         *(__m128i*) (((char*)linep) + 3 * dst_stride) = line3;
+   const __m128i *src128 = (const __m128i *) src;
+   
+   dst += y0 * dst_stride;
+   dst += x0 * sizeof(uint32_t);
+   
+   for (y = 0; y < TILE_SIZE; y += 4) {
+      const uint8_t *dst_row = dst;
+
+      for (x = 0; x < TILE_SIZE; x += 4) {
+         unswz4( src128[2],     /* b */
+                 src128[1],     /* g */
+                 src128[0],     /* r */
+                 src128[3],     /* a */
+                 (__m128i *) (dst_row + 0 * dst_stride),
+                 (__m128i *) (dst_row + 1 * dst_stride),
+                 (__m128i *) (dst_row + 2 * dst_stride),
+                 (__m128i *) (dst_row + 3 * dst_stride));
+
+         src128 += 4;
+         dst_row += sizeof(__m128i);;
       }
+
+      dst += 4 * dst_stride;
    }
 }
 
-- 
cgit v1.2.3


From edba53024f85a27fcbca7cbe139ceda172406653 Mon Sep 17 00:00:00 2001
From: José Fonseca <jfonseca@vmware.com>
Date: Wed, 6 Oct 2010 21:01:38 +0100
Subject: llvmpipe: Fix MSVC build. Enable the new SSE2 code on non-SSSE3
 systems.

---
 src/gallium/drivers/llvmpipe/lp_tile_soa.py | 86 +++++++++++++++--------------
 1 file changed, 44 insertions(+), 42 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_tile_soa.py b/src/gallium/drivers/llvmpipe/lp_tile_soa.py
index c76549cdad..e49f9c62fe 100644
--- a/src/gallium/drivers/llvmpipe/lp_tile_soa.py
+++ b/src/gallium/drivers/llvmpipe/lp_tile_soa.py
@@ -289,29 +289,30 @@ def generate_format_write(format, src_channel, src_native_type, src_suffix):
     print
     
 
-def generate_ssse3():
+def generate_sse2():
     print '''
 #if defined(PIPE_ARCH_SSE)
 
 #include "util/u_sse.h"
 
-static INLINE void swz4( __m128i x, 
-                         __m128i y, 
-                         __m128i z, 
-                         __m128i w, 
-                         __m128i *a, 
-                         __m128i *b, 
-                         __m128i *c, 
-                         __m128i *d)
+static ALWAYS_INLINE void 
+swz4( const __m128i * restrict x, 
+      const __m128i * restrict y, 
+      const __m128i * restrict z, 
+      const __m128i * restrict w, 
+      __m128i * restrict a, 
+      __m128i * restrict b, 
+      __m128i * restrict c, 
+      __m128i * restrict d)
 {
    __m128i i, j, k, l;
    __m128i m, n, o, p;
    __m128i e, f, g, h;
 
-   m = _mm_unpacklo_epi8(x,y);
-   n = _mm_unpackhi_epi8(x,y);
-   o = _mm_unpacklo_epi8(z,w);
-   p = _mm_unpackhi_epi8(z,w);
+   m = _mm_unpacklo_epi8(*x,*y);
+   n = _mm_unpackhi_epi8(*x,*y);
+   o = _mm_unpacklo_epi8(*z,*w);
+   p = _mm_unpackhi_epi8(*z,*w);
 
    i = _mm_unpacklo_epi16(m,n);
    j = _mm_unpackhi_epi16(m,n);
@@ -329,22 +330,23 @@ static INLINE void swz4( __m128i x,
    *d = _mm_unpackhi_epi64(f,h);
 }
 
-static INLINE void unswz4( __m128i a, 
-                           __m128i b, 
-                           __m128i c, 
-                           __m128i d, 
-                           __m128i *x, 
-                           __m128i *y, 
-                           __m128i *z, 
-                           __m128i *w)
+static ALWAYS_INLINE void
+unswz4( const __m128i * restrict a, 
+        const __m128i * restrict b, 
+        const __m128i * restrict c, 
+        const __m128i * restrict d, 
+        __m128i * restrict x, 
+        __m128i * restrict y, 
+        __m128i * restrict z, 
+        __m128i * restrict w)
 {
    __m128i i, j, k, l;
    __m128i m, n, o, p;
 
-   i = _mm_unpacklo_epi8(a,b);
-   j = _mm_unpackhi_epi8(a,b);
-   k = _mm_unpacklo_epi8(c,d);
-   l = _mm_unpackhi_epi8(c,d);
+   i = _mm_unpacklo_epi8(*a,*b);
+   j = _mm_unpackhi_epi8(*a,*b);
+   k = _mm_unpacklo_epi8(*c,*d);
+   l = _mm_unpackhi_epi8(*c,*d);
 
    m = _mm_unpacklo_epi16(i,k);
    n = _mm_unpackhi_epi16(i,k);
@@ -358,9 +360,9 @@ static INLINE void unswz4( __m128i a,
 }
 
 static void
-lp_tile_b8g8r8a8_unorm_swizzle_4ub_ssse3(uint8_t *dst,
-                                         const uint8_t *src, unsigned src_stride,
-                                         unsigned x0, unsigned y0)
+lp_tile_b8g8r8a8_unorm_swizzle_4ub_sse2(uint8_t * restrict dst,
+                                        const uint8_t * restrict src, unsigned src_stride,
+                                        unsigned x0, unsigned y0)
 {
    __m128i *dst128 = (__m128i *) dst;
    unsigned x, y;
@@ -372,10 +374,10 @@ lp_tile_b8g8r8a8_unorm_swizzle_4ub_ssse3(uint8_t *dst,
       const uint8_t *src_row = src;
 
       for (x = 0; x < TILE_SIZE; x += 4) {
-         swz4(*(__m128i *) (src_row + 0 * src_stride),
-              *(__m128i *) (src_row + 1 * src_stride),
-              *(__m128i *) (src_row + 2 * src_stride),
-              *(__m128i *) (src_row + 3 * src_stride),
+         swz4((const __m128i *) (src_row + 0 * src_stride),
+              (const __m128i *) (src_row + 1 * src_stride),
+              (const __m128i *) (src_row + 2 * src_stride),
+              (const __m128i *) (src_row + 3 * src_stride),
               dst128 + 2,     /* b */
               dst128 + 1,     /* g */
               dst128 + 0,     /* r */
@@ -390,8 +392,8 @@ lp_tile_b8g8r8a8_unorm_swizzle_4ub_ssse3(uint8_t *dst,
 }
 
 static void
-lp_tile_b8g8r8a8_unorm_unswizzle_4ub_ssse3(const uint8_t *src,
-                                          uint8_t *dst, unsigned dst_stride,
+lp_tile_b8g8r8a8_unorm_unswizzle_4ub_sse2(const uint8_t * restrict src,
+                                          uint8_t * restrict dst, unsigned dst_stride,
                                           unsigned x0, unsigned y0)
 {
    unsigned int x, y;
@@ -404,10 +406,10 @@ lp_tile_b8g8r8a8_unorm_unswizzle_4ub_ssse3(const uint8_t *src,
       const uint8_t *dst_row = dst;
 
       for (x = 0; x < TILE_SIZE; x += 4) {
-         unswz4( src128[2],     /* b */
-                 src128[1],     /* g */
-                 src128[0],     /* r */
-                 src128[3],     /* a */
+         unswz4( &src128[2],     /* b */
+                 &src128[1],     /* g */
+                 &src128[0],     /* r */
+                 &src128[3],     /* a */
                  (__m128i *) (dst_row + 0 * dst_stride),
                  (__m128i *) (dst_row + 1 * dst_stride),
                  (__m128i *) (dst_row + 2 * dst_stride),
@@ -421,7 +423,7 @@ lp_tile_b8g8r8a8_unorm_unswizzle_4ub_ssse3(const uint8_t *src,
    }
 }
 
-#endif /* PIPE_ARCH_SSSE3 */
+#endif /* PIPE_ARCH_SSE */
 '''
 
 
@@ -446,7 +448,7 @@ def generate_swizzle(formats, dst_channel, dst_native_type, dst_suffix):
             func_name = 'lp_tile_%s_swizzle_%s' % (format.short_name(), dst_suffix)
             if format.name == 'PIPE_FORMAT_B8G8R8A8_UNORM':
                 print '#ifdef PIPE_ARCH_SSE'
-                print '      func = util_cpu_caps.has_ssse3 ? %s_ssse3 : %s;' % (func_name, func_name)
+                print '      func = util_cpu_caps.has_sse2 ? %s_sse2 : %s;' % (func_name, func_name)
                 print '#else'
                 print '      func = %s;' % (func_name,)
                 print '#endif'
@@ -484,7 +486,7 @@ def generate_unswizzle(formats, src_channel, src_native_type, src_suffix):
             func_name = 'lp_tile_%s_unswizzle_%s' % (format.short_name(), src_suffix)
             if format.name == 'PIPE_FORMAT_B8G8R8A8_UNORM':
                 print '#ifdef PIPE_ARCH_SSE'
-                print '      func = util_cpu_caps.has_ssse3 ? %s_ssse3 : %s;' % (func_name, func_name)
+                print '      func = util_cpu_caps.has_sse2 ? %s_sse2 : %s;' % (func_name, func_name)
                 print '#else'
                 print '      func = %s;' % (func_name,)
                 print '#endif'
@@ -544,7 +546,7 @@ def main():
     print '};'
     print
 
-    generate_ssse3()
+    generate_sse2()
 
     channel = Channel(UNSIGNED, True, 8)
     native_type = 'uint8_t'
-- 
cgit v1.2.3


From cc40abad519cc0f765c6d8f6fad4154bed8dd9c2 Mon Sep 17 00:00:00 2001
From: José Fonseca <jfonseca@vmware.com>
Date: Sat, 9 Oct 2010 12:55:31 +0100
Subject: gallivm: Don't generate Phis for execution mask.

---
 src/gallium/auxiliary/gallivm/lp_bld_flow.c | 28 +++++++++++++++++++++-------
 src/gallium/auxiliary/gallivm/lp_bld_flow.h |  5 ++++-
 src/gallium/drivers/llvmpipe/lp_bld_depth.c |  8 ++++----
 src/gallium/drivers/llvmpipe/lp_state_fs.c  |  8 +++-----
 4 files changed, 32 insertions(+), 17 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.c b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
index 1ec33c742e..a5d65e9b39 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_flow.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
@@ -454,12 +454,15 @@ void
 lp_build_mask_check(struct lp_build_mask_context *mask)
 {
    LLVMBuilderRef builder = mask->flow->builder;
+   LLVMValueRef value;
    LLVMValueRef cond;
 
+   value = lp_build_mask_value(mask);
+
    /* cond = (mask == 0) */
    cond = LLVMBuildICmp(builder,
                         LLVMIntEQ,
-                        LLVMBuildBitCast(builder, mask->value, mask->reg_type, ""),
+                        LLVMBuildBitCast(builder, value, mask->reg_type, ""),
                         LLVMConstNull(mask->reg_type),
                         "");
 
@@ -485,14 +488,23 @@ lp_build_mask_begin(struct lp_build_mask_context *mask,
 
    mask->flow = flow;
    mask->reg_type = LLVMIntType(type.width * type.length);
-   mask->value = value;
+   mask->var = lp_build_alloca(flow->builder,
+                               lp_build_int_vec_type(type),
+                               "execution_mask");
+
+   LLVMBuildStore(flow->builder, value, mask->var);
 
-   lp_build_flow_scope_begin(flow);
-   lp_build_flow_scope_declare(flow, &mask->value);
    lp_build_flow_skip_begin(flow);
 }
 
 
+LLVMValueRef
+lp_build_mask_value(struct lp_build_mask_context *mask)
+{
+   return LLVMBuildLoad(mask->flow->builder, mask->var, "");
+}
+
+
 /**
  * Update boolean mask with given value (bitwise AND).
  * Typically used to update the quad's pixel alive/killed mask
@@ -502,7 +514,10 @@ void
 lp_build_mask_update(struct lp_build_mask_context *mask,
                      LLVMValueRef value)
 {
-   mask->value = LLVMBuildAnd( mask->flow->builder, mask->value, value, "");
+   value = LLVMBuildAnd(mask->flow->builder,
+                        lp_build_mask_value(mask),
+                        value, "");
+   LLVMBuildStore(mask->flow->builder, value, mask->var);
 }
 
 
@@ -513,8 +528,7 @@ LLVMValueRef
 lp_build_mask_end(struct lp_build_mask_context *mask)
 {
    lp_build_flow_skip_end(mask->flow);
-   lp_build_flow_scope_end(mask->flow);
-   return mask->value;
+   return lp_build_mask_value(mask);
 }
 
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.h b/src/gallium/auxiliary/gallivm/lp_bld_flow.h
index 095c781ec5..0fc6317b33 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_flow.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.h
@@ -77,7 +77,7 @@ struct lp_build_mask_context
 
    LLVMTypeRef reg_type;
 
-   LLVMValueRef value;
+   LLVMValueRef var;
 };
 
 
@@ -87,6 +87,9 @@ lp_build_mask_begin(struct lp_build_mask_context *mask,
                     struct lp_type type,
                     LLVMValueRef value);
 
+LLVMValueRef
+lp_build_mask_value(struct lp_build_mask_context *mask);
+
 /**
  * Bitwise AND the mask with the given value, if a previous mask was set.
  */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index 8d9be2ebbb..e768493103 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -473,7 +473,7 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
    LLVMValueRef stencil_vals = NULL;
    LLVMValueRef z_bitmask = NULL, stencil_shift = NULL;
    LLVMValueRef z_pass = NULL, s_pass_mask = NULL;
-   LLVMValueRef orig_mask = mask->value;
+   LLVMValueRef orig_mask = lp_build_mask_value(mask);
    LLVMValueRef front_facing = NULL;
 
    /* Prototype a simpler path:
@@ -527,7 +527,7 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
          type.sign = 1;
          lp_build_context_init(&bld, builder, type);
 
-         z_dst = lp_build_select(&bld, mask->value, z_src, z_dst);
+         z_dst = lp_build_select(&bld, lp_build_mask_value(mask), z_src, z_dst);
          z_dst = LLVMBuildShl(builder, z_dst, const_8_int, "z_dst");
          *zs_value = z_dst;
       }
@@ -710,7 +710,7 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
       }
 
       if (depth->writemask) {
-         LLVMValueRef zselectmask = mask->value;
+         LLVMValueRef zselectmask = lp_build_mask_value(mask);
 
          /* mask off bits that failed Z test */
          zselectmask = LLVMBuildAnd(builder, zselectmask, z_pass, "");
@@ -810,7 +810,7 @@ lp_build_deferred_depth_write(LLVMBuilderRef builder,
    lp_build_context_init(&bld, builder, type);
 
    z_dst = LLVMBuildLoad(builder, zs_dst_ptr, "zsbufval");
-   z_dst = lp_build_select(&bld, mask->value, zs_value, z_dst);
+   z_dst = lp_build_select(&bld, lp_build_mask_value(mask), zs_value, z_dst);
 
    LLVMBuildStore(builder, z_dst, zs_dst_ptr);
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index f45f36f633..cf07cb4976 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -422,16 +422,14 @@ generate_fs(struct llvmpipe_context *lp,
    }
 
    if (counter)
-      lp_build_occlusion_count(builder, type, mask.value, counter);
+      lp_build_occlusion_count(builder, type,
+                               lp_build_mask_value(&mask), counter);
 
-   lp_build_mask_end(&mask);
+   *pmask = lp_build_mask_end(&mask);
 
    lp_build_flow_scope_end(flow);
 
    lp_build_flow_destroy(flow);
-
-   *pmask = mask.value;
-
 }
 
 
-- 
cgit v1.2.3


From d45c379027054e563c4f4379fb69fc9f68612f75 Mon Sep 17 00:00:00 2001
From: José Fonseca <jfonseca@vmware.com>
Date: Sat, 9 Oct 2010 20:14:03 +0100
Subject: gallivm: Remove support for Phi generation.

Simply rely on mem2reg pass. It's easier and more reliable.
---
 src/gallium/auxiliary/gallivm/lp_bld_flow.c | 211 ----------------------------
 src/gallium/auxiliary/gallivm/lp_bld_flow.h |  10 --
 src/gallium/drivers/llvmpipe/lp_state_fs.c  |   4 -
 3 files changed, 225 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.c b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
index a5d65e9b39..22c2db8c44 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_flow.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
@@ -45,22 +45,11 @@
  * Enumeration of all possible flow constructs.
  */
 enum lp_build_flow_construct_kind {
-   LP_BUILD_FLOW_SCOPE,
    LP_BUILD_FLOW_SKIP,
    LP_BUILD_FLOW_IF
 };
 
 
-/**
- * Variable declaration scope.
- */
-struct lp_build_flow_scope
-{
-   /** Number of variables declared in this scope */
-   unsigned num_variables;
-};
-
-
 /**
  * Early exit. Useful to skip to the end of a function or block when
  * the execution mask becomes zero or when there is an error condition.
@@ -69,11 +58,6 @@ struct lp_build_flow_skip
 {
    /** Block to skip to */
    LLVMBasicBlockRef block;
-
-   /** Number of variables declared at the beginning */
-   unsigned num_variables;
-
-   LLVMValueRef *phi;  /**< array [num_variables] */
 };
 
 
@@ -82,10 +66,6 @@ struct lp_build_flow_skip
  */
 struct lp_build_flow_if
 {
-   unsigned num_variables;
-
-   LLVMValueRef *phi;  /**< array [num_variables] */
-
    LLVMValueRef condition;
    LLVMBasicBlockRef entry_block, true_block, false_block, merge_block;
 };
@@ -96,7 +76,6 @@ struct lp_build_flow_if
  */
 union lp_build_flow_construct_data
 {
-   struct lp_build_flow_scope scope;
    struct lp_build_flow_skip skip;
    struct lp_build_flow_if ifthen;
 };
@@ -127,12 +106,6 @@ struct lp_build_flow_context
     */
    struct lp_build_flow_construct constructs[LP_BUILD_FLOW_MAX_DEPTH];
    unsigned num_constructs;
-
-   /**
-    * Variable stack
-    */
-   LLVMValueRef *variables[LP_BUILD_FLOW_MAX_VARIABLES];
-   unsigned num_variables;
 };
 
 
@@ -155,7 +128,6 @@ void
 lp_build_flow_destroy(struct lp_build_flow_context *flow)
 {
    assert(flow->num_constructs == 0);
-   assert(flow->num_variables == 0);
    FREE(flow);
 }
 
@@ -217,93 +189,6 @@ lp_build_flow_pop(struct lp_build_flow_context *flow,
 }
 
 
-/**
- * Begin a variable scope.
- *
- *
- */
-void
-lp_build_flow_scope_begin(struct lp_build_flow_context *flow)
-{
-   struct lp_build_flow_scope *scope;
-
-   scope = &lp_build_flow_push(flow, LP_BUILD_FLOW_SCOPE)->scope;
-   if(!scope)
-      return;
-
-   scope->num_variables = 0;
-}
-
-
-/**
- * Declare a variable.
- *
- * A variable is a named entity which can have different LLVMValueRef's at
- * different points of the program. This is relevant for control flow because
- * when there are multiple branches to a same location we need to replace
- * the variable's value with a Phi function as explained in
- * http://en.wikipedia.org/wiki/Static_single_assignment_form .
- *
- * We keep track of variables by keeping around a pointer to where they're
- * current.
- *
- * There are a few cautions to observe:
- *
- * - Variable's value must not be NULL. If there is no initial value then
- *   LLVMGetUndef() should be used.
- *
- * - Variable's value must be kept up-to-date. If the variable is going to be
- *   modified by a function then a pointer should be passed so that its value
- *   is accurate. Failure to do this will cause some of the variables'
- *   transient values to be lost, leading to wrong results.
- *
- * - A program should be written from top to bottom, by always appending
- *   instructions to the bottom with a single LLVMBuilderRef. Inserting and/or
- *   modifying existing statements will most likely lead to wrong results.
- *
- */
-void
-lp_build_flow_scope_declare(struct lp_build_flow_context *flow,
-                            LLVMValueRef *variable)
-{
-   struct lp_build_flow_scope *scope;
-
-   scope = &lp_build_flow_peek(flow, LP_BUILD_FLOW_SCOPE)->scope;
-   if(!scope)
-      return;
-
-   assert(*variable);
-   if(!*variable)
-      return;
-
-   assert(flow->num_variables < LP_BUILD_FLOW_MAX_VARIABLES);
-   if(flow->num_variables >= LP_BUILD_FLOW_MAX_VARIABLES)
-      return;
-
-   flow->variables[flow->num_variables++] = variable;
-   ++scope->num_variables;
-}
-
-
-void
-lp_build_flow_scope_end(struct lp_build_flow_context *flow)
-{
-   struct lp_build_flow_scope *scope;
-
-   scope = &lp_build_flow_pop(flow, LP_BUILD_FLOW_SCOPE)->scope;
-   if(!scope)
-      return;
-
-   assert(flow->num_variables >= scope->num_variables);
-   if(flow->num_variables < scope->num_variables) {
-      flow->num_variables = 0;
-      return;
-   }
-
-   flow->num_variables -= scope->num_variables;
-}
-
-
 /**
  * Note: this function has no dependencies on the flow code and could
  * be used elsewhere.
@@ -350,7 +235,6 @@ lp_build_flow_skip_begin(struct lp_build_flow_context *flow)
 {
    struct lp_build_flow_skip *skip;
    LLVMBuilderRef builder;
-   unsigned i;
 
    skip = &lp_build_flow_push(flow, LP_BUILD_FLOW_SKIP)->skip;
    if(!skip)
@@ -359,26 +243,9 @@ lp_build_flow_skip_begin(struct lp_build_flow_context *flow)
    /* create new basic block */
    skip->block = lp_build_flow_insert_block(flow);
 
-   skip->num_variables = flow->num_variables;
-   if(!skip->num_variables) {
-      skip->phi = NULL;
-      return;
-   }
-
-   /* Allocate a Phi node for each variable in this skip scope */
-   skip->phi = MALLOC(skip->num_variables * sizeof *skip->phi);
-   if(!skip->phi) {
-      skip->num_variables = 0;
-      return;
-   }
-
    builder = LLVMCreateBuilder();
    LLVMPositionBuilderAtEnd(builder, skip->block);
 
-   /* create a Phi node for each variable */
-   for(i = 0; i < skip->num_variables; ++i)
-      skip->phi[i] = LLVMBuildPhi(builder, LLVMTypeOf(*flow->variables[i]), "");
-
    LLVMDisposeBuilder(builder);
 }
 
@@ -392,25 +259,14 @@ lp_build_flow_skip_cond_break(struct lp_build_flow_context *flow,
                               LLVMValueRef cond)
 {
    struct lp_build_flow_skip *skip;
-   LLVMBasicBlockRef current_block;
    LLVMBasicBlockRef new_block;
-   unsigned i;
 
    skip = &lp_build_flow_peek(flow, LP_BUILD_FLOW_SKIP)->skip;
    if(!skip)
       return;
 
-   current_block = LLVMGetInsertBlock(flow->builder);
-
    new_block = lp_build_flow_insert_block(flow);
 
-   /* for each variable, update the Phi node with a (variable, block) pair */
-   for(i = 0; i < skip->num_variables; ++i) {
-      assert(*flow->variables[i]);
-      assert(LLVMTypeOf(skip->phi[i]) == LLVMTypeOf(*flow->variables[i]));
-      LLVMAddIncoming(skip->phi[i], flow->variables[i], &current_block, 1);
-   }
-
    /* if cond is true, goto skip->block, else goto new_block */
    LLVMBuildCondBr(flow->builder, cond, skip->block, new_block);
 
@@ -422,28 +278,14 @@ void
 lp_build_flow_skip_end(struct lp_build_flow_context *flow)
 {
    struct lp_build_flow_skip *skip;
-   LLVMBasicBlockRef current_block;
-   unsigned i;
 
    skip = &lp_build_flow_pop(flow, LP_BUILD_FLOW_SKIP)->skip;
    if(!skip)
       return;
 
-   current_block = LLVMGetInsertBlock(flow->builder);
-
-   /* add (variable, block) tuples to the phi nodes */
-   for(i = 0; i < skip->num_variables; ++i) {
-      assert(*flow->variables[i]);
-      assert(LLVMTypeOf(skip->phi[i]) == LLVMTypeOf(*flow->variables[i]));
-      LLVMAddIncoming(skip->phi[i], flow->variables[i], &current_block, 1);
-      *flow->variables[i] = skip->phi[i];
-   }
-
    /* goto block */
    LLVMBuildBr(flow->builder, skip->block);
    LLVMPositionBuilderAtEnd(flow->builder, skip->block);
-
-   FREE(skip->phi);
 }
 
 
@@ -659,7 +501,6 @@ lp_build_if(struct lp_build_if_state *ctx,
 {
    LLVMBasicBlockRef block = LLVMGetInsertBlock(builder);
    struct lp_build_flow_if *ifthen;
-   unsigned i;
 
    memset(ctx, 0, sizeof(*ctx));
    ctx->builder = builder;
@@ -669,31 +510,13 @@ lp_build_if(struct lp_build_if_state *ctx,
    ifthen = &lp_build_flow_push(flow, LP_BUILD_FLOW_IF)->ifthen;
    assert(ifthen);
 
-   ifthen->num_variables = flow->num_variables;
    ifthen->condition = condition;
    ifthen->entry_block = block;
 
-   /* create a Phi node for each variable in this flow scope */
-   ifthen->phi = MALLOC(ifthen->num_variables * sizeof(*ifthen->phi));
-   if (!ifthen->phi) {
-      ifthen->num_variables = 0;
-      return;
-   }
-
    /* create endif/merge basic block for the phi functions */
    ifthen->merge_block = lp_build_insert_new_block(builder, "endif-block");
    LLVMPositionBuilderAtEnd(builder, ifthen->merge_block);
 
-   /* create a phi node for each variable */
-   for (i = 0; i < flow->num_variables; i++) {
-      ifthen->phi[i] = LLVMBuildPhi(builder, LLVMTypeOf(*flow->variables[i]), "");
-
-      /* add add the initial value of the var from the entry block */
-      if (!LLVMIsUndef(*flow->variables[i]))
-         LLVMAddIncoming(ifthen->phi[i], flow->variables[i],
-                         &ifthen->entry_block, 1);
-   }
-
    /* create/insert true_block before merge_block */
    ifthen->true_block = LLVMInsertBasicBlock(ifthen->merge_block, "if-true-block");
 
@@ -710,18 +533,10 @@ lp_build_else(struct lp_build_if_state *ctx)
 {
    struct lp_build_flow_context *flow = ctx->flow;
    struct lp_build_flow_if *ifthen;
-   unsigned i;
 
    ifthen = &lp_build_flow_peek(flow, LP_BUILD_FLOW_IF)->ifthen;
    assert(ifthen);
 
-   /* for each variable, update the Phi node with a (variable, block) pair */
-   LLVMPositionBuilderAtEnd(ctx->builder, ifthen->merge_block);
-   for (i = 0; i < flow->num_variables; i++) {
-      assert(*flow->variables[i]);
-      LLVMAddIncoming(ifthen->phi[i], flow->variables[i], &ifthen->true_block, 1);
-   }
-
    /* create/insert false_block before the merge block */
    ifthen->false_block = LLVMInsertBasicBlock(ifthen->merge_block, "if-false-block");
 
@@ -738,8 +553,6 @@ lp_build_endif(struct lp_build_if_state *ctx)
 {
    struct lp_build_flow_context *flow = ctx->flow;
    struct lp_build_flow_if *ifthen;
-   LLVMBasicBlockRef curBlock = LLVMGetInsertBlock(ctx->builder);
-   unsigned i;
 
    ifthen = &lp_build_flow_pop(flow, LP_BUILD_FLOW_IF)->ifthen;
    assert(ifthen);
@@ -747,30 +560,6 @@ lp_build_endif(struct lp_build_if_state *ctx)
    /* Insert branch to the merge block from current block */
    LLVMBuildBr(ctx->builder, ifthen->merge_block);
 
-   if (ifthen->false_block) {
-      LLVMPositionBuilderAtEnd(ctx->builder, ifthen->merge_block);
-      /* for each variable, update the Phi node with a (variable, block) pair */
-      for (i = 0; i < flow->num_variables; i++) {
-         assert(*flow->variables[i]);
-         LLVMAddIncoming(ifthen->phi[i], flow->variables[i], &curBlock, 1);
-         /* replace the variable ref with the phi function */
-         *flow->variables[i] = ifthen->phi[i];
-      }
-   }
-   else {
-      /* no else clause */
-      LLVMPositionBuilderAtEnd(ctx->builder, ifthen->merge_block);
-      for (i = 0; i < flow->num_variables; i++) {
-         assert(*flow->variables[i]);
-         LLVMAddIncoming(ifthen->phi[i], flow->variables[i], &ifthen->true_block, 1);
-
-         /* replace the variable ref with the phi function */
-         *flow->variables[i] = ifthen->phi[i];
-      }
-   }
-
-   FREE(ifthen->phi);
-
    /***
     *** Now patch in the various branch instructions.
     ***/
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.h b/src/gallium/auxiliary/gallivm/lp_bld_flow.h
index 0fc6317b33..403e46e52e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_flow.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.h
@@ -50,16 +50,6 @@ lp_build_flow_create(LLVMBuilderRef builder);
 void
 lp_build_flow_destroy(struct lp_build_flow_context *flow);
 
-void
-lp_build_flow_scope_begin(struct lp_build_flow_context *flow);
-
-void
-lp_build_flow_scope_declare(struct lp_build_flow_context *flow,
-                            LLVMValueRef *variable);
-
-void
-lp_build_flow_scope_end(struct lp_build_flow_context *flow);
-
 void
 lp_build_flow_skip_begin(struct lp_build_flow_context *flow);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index cf07cb4976..3b0706e3ec 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -290,8 +290,6 @@ generate_fs(struct llvmpipe_context *lp,
 
    memset(outputs, 0, sizeof outputs);
 
-   lp_build_flow_scope_begin(flow);
-
    /* Declare the color and z variables */
    for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
       for(chan = 0; chan < NUM_CHANNELS; ++chan) {
@@ -427,8 +425,6 @@ generate_fs(struct llvmpipe_context *lp,
 
    *pmask = lp_build_mask_end(&mask);
 
-   lp_build_flow_scope_end(flow);
-
    lp_build_flow_destroy(flow);
 }
 
-- 
cgit v1.2.3


From 307df6a858dcab1bc10f3f52d9968acb3ea6d74f Mon Sep 17 00:00:00 2001
From: José Fonseca <jfonseca@vmware.com>
Date: Sat, 9 Oct 2010 21:39:14 +0100
Subject: gallivm: Cleanup the rest of the flow module.

---
 src/gallium/auxiliary/gallivm/lp_bld_flow.c | 210 +++-------------------------
 src/gallium/auxiliary/gallivm/lp_bld_flow.h |  28 ++--
 src/gallium/drivers/llvmpipe/lp_state_fs.c  |  12 +-
 3 files changed, 39 insertions(+), 211 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.c b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
index ac63bd544f..99a49df317 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_flow.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
@@ -38,146 +38,6 @@
 #include "lp_bld_flow.h"
 
 
-#define LP_BUILD_FLOW_MAX_VARIABLES 64
-#define LP_BUILD_FLOW_MAX_DEPTH 32
-
-/**
- * Enumeration of all possible flow constructs.
- */
-enum lp_build_flow_construct_kind {
-   LP_BUILD_FLOW_SKIP,
-   LP_BUILD_FLOW_IF
-};
-
-
-/**
- * Early exit. Useful to skip to the end of a function or block when
- * the execution mask becomes zero or when there is an error condition.
- */
-struct lp_build_flow_skip
-{
-   /** Block to skip to */
-   LLVMBasicBlockRef block;
-};
-
-
-/**
- * Union of all possible flow constructs' data
- */
-union lp_build_flow_construct_data
-{
-   struct lp_build_flow_skip skip;
-};
-
-
-/**
- * Element of the flow construct stack.
- */
-struct lp_build_flow_construct
-{
-   enum lp_build_flow_construct_kind kind;
-   union lp_build_flow_construct_data data;
-};
-
-
-/**
- * All necessary data to generate LLVM control flow constructs.
- *
- * Besides keeping track of the control flow construct themselves we also
- * need to keep track of variables in order to generate SSA Phi values.
- */
-struct lp_build_flow_context
-{
-   LLVMBuilderRef builder;
-
-   /**
-    * Control flow stack.
-    */
-   struct lp_build_flow_construct constructs[LP_BUILD_FLOW_MAX_DEPTH];
-   unsigned num_constructs;
-};
-
-
-struct lp_build_flow_context *
-lp_build_flow_create(LLVMBuilderRef builder)
-{
-   struct lp_build_flow_context *flow;
-
-   flow = CALLOC_STRUCT(lp_build_flow_context);
-   if(!flow)
-      return NULL;
-
-   flow->builder = builder;
-
-   return flow;
-}
-
-
-void
-lp_build_flow_destroy(struct lp_build_flow_context *flow)
-{
-   assert(flow->num_constructs == 0);
-   FREE(flow);
-}
-
-
-/**
- * Begin/push a new flow control construct, such as a loop, skip block
- * or variable scope.
- */
-static union lp_build_flow_construct_data *
-lp_build_flow_push(struct lp_build_flow_context *flow,
-                   enum lp_build_flow_construct_kind kind)
-{
-   assert(flow->num_constructs < LP_BUILD_FLOW_MAX_DEPTH);
-   if(flow->num_constructs >= LP_BUILD_FLOW_MAX_DEPTH)
-      return NULL;
-
-   flow->constructs[flow->num_constructs].kind = kind;
-   return &flow->constructs[flow->num_constructs++].data;
-}
-
-
-/**
- * Return the current/top flow control construct on the stack.
- * \param kind  the expected type of the top-most construct
- */
-static union lp_build_flow_construct_data *
-lp_build_flow_peek(struct lp_build_flow_context *flow,
-                   enum lp_build_flow_construct_kind kind)
-{
-   assert(flow->num_constructs);
-   if(!flow->num_constructs)
-      return NULL;
-
-   assert(flow->constructs[flow->num_constructs - 1].kind == kind);
-   if(flow->constructs[flow->num_constructs - 1].kind != kind)
-      return NULL;
-
-   return &flow->constructs[flow->num_constructs - 1].data;
-}
-
-
-/**
- * End/pop the current/top flow control construct on the stack.
- * \param kind  the expected type of the top-most construct
- */
-static union lp_build_flow_construct_data *
-lp_build_flow_pop(struct lp_build_flow_context *flow,
-                  enum lp_build_flow_construct_kind kind)
-{
-   assert(flow->num_constructs);
-   if(!flow->num_constructs)
-      return NULL;
-
-   assert(flow->constructs[flow->num_constructs - 1].kind == kind);
-   if(flow->constructs[flow->num_constructs - 1].kind != kind)
-      return NULL;
-
-   return &flow->constructs[--flow->num_constructs].data;
-}
-
-
 /**
  * Note: this function has no dependencies on the flow code and could
  * be used elsewhere.
@@ -208,34 +68,18 @@ lp_build_insert_new_block(LLVMBuilderRef builder, const char *name)
 }
 
 
-static LLVMBasicBlockRef
-lp_build_flow_insert_block(struct lp_build_flow_context *flow)
-{
-   return lp_build_insert_new_block(flow->builder, "");
-}
-
-
 /**
  * Begin a "skip" block.  Inside this block we can test a condition and
  * skip to the end of the block if the condition is false.
  */
 void
-lp_build_flow_skip_begin(struct lp_build_flow_context *flow)
+lp_build_flow_skip_begin(struct lp_build_skip_context *skip,
+                         LLVMBuilderRef builder)
 {
-   struct lp_build_flow_skip *skip;
-   LLVMBuilderRef builder;
-
-   skip = &lp_build_flow_push(flow, LP_BUILD_FLOW_SKIP)->skip;
-   if(!skip)
-      return;
+   skip->builder = builder;
 
    /* create new basic block */
-   skip->block = lp_build_flow_insert_block(flow);
-
-   builder = LLVMCreateBuilder();
-   LLVMPositionBuilderAtEnd(builder, skip->block);
-
-   LLVMDisposeBuilder(builder);
+   skip->block = lp_build_insert_new_block(skip->builder, "skip");
 }
 
 
@@ -244,37 +88,26 @@ lp_build_flow_skip_begin(struct lp_build_flow_context *flow)
  * skip block if the condition is true.
  */
 void
-lp_build_flow_skip_cond_break(struct lp_build_flow_context *flow,
+lp_build_flow_skip_cond_break(struct lp_build_skip_context *skip,
                               LLVMValueRef cond)
 {
-   struct lp_build_flow_skip *skip;
    LLVMBasicBlockRef new_block;
 
-   skip = &lp_build_flow_peek(flow, LP_BUILD_FLOW_SKIP)->skip;
-   if(!skip)
-      return;
-
-   new_block = lp_build_flow_insert_block(flow);
+   new_block = lp_build_insert_new_block(skip->builder, "");
 
    /* if cond is true, goto skip->block, else goto new_block */
-   LLVMBuildCondBr(flow->builder, cond, skip->block, new_block);
+   LLVMBuildCondBr(skip->builder, cond, skip->block, new_block);
 
-   LLVMPositionBuilderAtEnd(flow->builder, new_block);
+   LLVMPositionBuilderAtEnd(skip->builder, new_block);
 }
 
 
 void
-lp_build_flow_skip_end(struct lp_build_flow_context *flow)
+lp_build_flow_skip_end(struct lp_build_skip_context *skip)
 {
-   struct lp_build_flow_skip *skip;
-
-   skip = &lp_build_flow_pop(flow, LP_BUILD_FLOW_SKIP)->skip;
-   if(!skip)
-      return;
-
    /* goto block */
-   LLVMBuildBr(flow->builder, skip->block);
-   LLVMPositionBuilderAtEnd(flow->builder, skip->block);
+   LLVMBuildBr(skip->builder, skip->block);
+   LLVMPositionBuilderAtEnd(skip->builder, skip->block);
 }
 
 
@@ -284,7 +117,7 @@ lp_build_flow_skip_end(struct lp_build_flow_context *flow)
 void
 lp_build_mask_check(struct lp_build_mask_context *mask)
 {
-   LLVMBuilderRef builder = mask->flow->builder;
+   LLVMBuilderRef builder = mask->skip.builder;
    LLVMValueRef value;
    LLVMValueRef cond;
 
@@ -298,7 +131,7 @@ lp_build_mask_check(struct lp_build_mask_context *mask)
                         "");
 
    /* if cond, goto end of block */
-   lp_build_flow_skip_cond_break(mask->flow, cond);
+   lp_build_flow_skip_cond_break(&mask->skip, cond);
 }
 
 
@@ -311,28 +144,27 @@ lp_build_mask_check(struct lp_build_mask_context *mask)
  */
 void
 lp_build_mask_begin(struct lp_build_mask_context *mask,
-                    struct lp_build_flow_context *flow,
+                    LLVMBuilderRef builder,
                     struct lp_type type,
                     LLVMValueRef value)
 {
    memset(mask, 0, sizeof *mask);
 
-   mask->flow = flow;
    mask->reg_type = LLVMIntType(type.width * type.length);
-   mask->var = lp_build_alloca(flow->builder,
+   mask->var = lp_build_alloca(builder,
                                lp_build_int_vec_type(type),
                                "execution_mask");
 
-   LLVMBuildStore(flow->builder, value, mask->var);
+   LLVMBuildStore(builder, value, mask->var);
 
-   lp_build_flow_skip_begin(flow);
+   lp_build_flow_skip_begin(&mask->skip, builder);
 }
 
 
 LLVMValueRef
 lp_build_mask_value(struct lp_build_mask_context *mask)
 {
-   return LLVMBuildLoad(mask->flow->builder, mask->var, "");
+   return LLVMBuildLoad(mask->skip.builder, mask->var, "");
 }
 
 
@@ -345,10 +177,10 @@ void
 lp_build_mask_update(struct lp_build_mask_context *mask,
                      LLVMValueRef value)
 {
-   value = LLVMBuildAnd(mask->flow->builder,
+   value = LLVMBuildAnd(mask->skip.builder,
                         lp_build_mask_value(mask),
                         value, "");
-   LLVMBuildStore(mask->flow->builder, value, mask->var);
+   LLVMBuildStore(mask->skip.builder, value, mask->var);
 }
 
 
@@ -358,7 +190,7 @@ lp_build_mask_update(struct lp_build_mask_context *mask,
 LLVMValueRef
 lp_build_mask_end(struct lp_build_mask_context *mask)
 {
-   lp_build_flow_skip_end(mask->flow);
+   lp_build_flow_skip_end(&mask->skip);
    return lp_build_mask_value(mask);
 }
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.h b/src/gallium/auxiliary/gallivm/lp_bld_flow.h
index a4fc8d1955..e21d9de280 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_flow.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.h
@@ -41,29 +41,33 @@
 struct lp_type;
 
 
-struct lp_build_flow_context;
-
-
-struct lp_build_flow_context *
-lp_build_flow_create(LLVMBuilderRef builder);
+/**
+ * Early exit. Useful to skip to the end of a function or block when
+ * the execution mask becomes zero or when there is an error condition.
+ */
+struct lp_build_skip_context
+{
+   LLVMBuilderRef builder;
 
-void
-lp_build_flow_destroy(struct lp_build_flow_context *flow);
+   /** Block to skip to */
+   LLVMBasicBlockRef block;
+};
 
 void
-lp_build_flow_skip_begin(struct lp_build_flow_context *flow);
+lp_build_flow_skip_begin(struct lp_build_skip_context *ctx,
+                         LLVMBuilderRef builder);
 
 void
-lp_build_flow_skip_cond_break(struct lp_build_flow_context *flow,
+lp_build_flow_skip_cond_break(struct lp_build_skip_context *ctx,
                               LLVMValueRef cond);
 
 void
-lp_build_flow_skip_end(struct lp_build_flow_context *flow);
+lp_build_flow_skip_end(struct lp_build_skip_context *ctx);
 
 
 struct lp_build_mask_context
 {
-   struct lp_build_flow_context *flow;
+   struct lp_build_skip_context skip;
 
    LLVMTypeRef reg_type;
 
@@ -73,7 +77,7 @@ struct lp_build_mask_context
 
 void
 lp_build_mask_begin(struct lp_build_mask_context *mask,
-                    struct lp_build_flow_context *flow,
+                    LLVMBuilderRef builder,
                     struct lp_type type,
                     LLVMValueRef value);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 3b0706e3ec..6bfd02061d 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -237,7 +237,6 @@ generate_fs(struct llvmpipe_context *lp,
    LLVMValueRef z;
    LLVMValueRef zs_value = NULL;
    LLVMValueRef stencil_refs[2];
-   struct lp_build_flow_context *flow;
    struct lp_build_mask_context mask;
    boolean simple_shader = (shader->info.file_count[TGSI_FILE_SAMPLER] == 0 &&
                             shader->info.num_inputs < 3 &&
@@ -286,8 +285,6 @@ generate_fs(struct llvmpipe_context *lp,
 
    consts_ptr = lp_jit_context_constants(builder, context_ptr);
 
-   flow = lp_build_flow_create(builder);
-
    memset(outputs, 0, sizeof outputs);
 
    /* Declare the color and z variables */
@@ -307,7 +304,7 @@ generate_fs(struct llvmpipe_context *lp,
    }
 
    /* 'mask' will control execution based on quad's pixel alive/killed state */
-   lp_build_mask_begin(&mask, flow, type, *pmask);
+   lp_build_mask_begin(&mask, builder, type, *pmask);
 
    if (!(depth_mode & EARLY_DEPTH_TEST) && !simple_shader)
       lp_build_mask_check(&mask);
@@ -424,8 +421,6 @@ generate_fs(struct llvmpipe_context *lp,
                                lp_build_mask_value(&mask), counter);
 
    *pmask = lp_build_mask_end(&mask);
-
-   lp_build_flow_destroy(flow);
 }
 
 
@@ -450,7 +445,6 @@ generate_blend(const struct pipe_blend_state *blend,
                boolean do_branch)
 {
    struct lp_build_context bld;
-   struct lp_build_flow_context *flow;
    struct lp_build_mask_context mask_ctx;
    LLVMTypeRef vec_type;
    LLVMValueRef const_ptr;
@@ -461,8 +455,7 @@ generate_blend(const struct pipe_blend_state *blend,
 
    lp_build_context_init(&bld, builder, type);
 
-   flow = lp_build_flow_create(builder);
-   lp_build_mask_begin(&mask_ctx, flow, type, mask);
+   lp_build_mask_begin(&mask_ctx, builder, type, mask);
    if (do_branch)
       lp_build_mask_check(&mask_ctx);
 
@@ -497,7 +490,6 @@ generate_blend(const struct pipe_blend_state *blend,
    }
 
    lp_build_mask_end(&mask_ctx);
-   lp_build_flow_destroy(flow);
 }
 
 
-- 
cgit v1.2.3


From b18fecbd0ea33c9db7e3fd676ed7b5877ebb1bd5 Mon Sep 17 00:00:00 2001
From: José Fonseca <jfonseca@vmware.com>
Date: Sun, 10 Oct 2010 23:36:14 +0100
Subject: llvmpipe: Remove outdated comment about stencil testing.

---
 src/gallium/drivers/llvmpipe/lp_bld_depth.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index e768493103..264fce8d6a 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -1,6 +1,6 @@
 /**************************************************************************
  *
- * Copyright 2009 VMware, Inc.
+ * Copyright 2009-2010 VMware, Inc.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -53,15 +53,8 @@
  *  ... ... ... ... ... ... ... ... ...
  *
  *
- * Stencil test:
- * Two-sided stencil test is supported but probably not as efficient as
- * it could be.  Currently, we use if/then/else constructs to do the
- * operations for front vs. back-facing polygons.  We could probably do
- * both the front and back arithmetic then use a Select() instruction to
- * choose the result depending on polyon orientation.  We'd have to
- * measure performance both ways and see which is better.
- *
  * @author Jose Fonseca <jfonseca@vmware.com>
+ * @author Brian Paul <brianp@vmware.com>
  */
 
 #include "pipe/p_state.h"
-- 
cgit v1.2.3


From 986cb9d5cf60bc11c7facc19017b5432b17240f7 Mon Sep 17 00:00:00 2001
From: José Fonseca <jfonseca@vmware.com>
Date: Thu, 2 Sep 2010 16:30:23 +0100
Subject: llvmpipe: Use lp_tgsi_info.

---
 src/gallium/drivers/llvmpipe/lp_setup_point.c   |  4 +--
 src/gallium/drivers/llvmpipe/lp_state_derived.c | 16 +++++------
 src/gallium/drivers/llvmpipe/lp_state_fs.c      | 38 ++++++++++++-------------
 src/gallium/drivers/llvmpipe/lp_state_fs.h      |  3 +-
 4 files changed, 31 insertions(+), 30 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c
index 31d85f43c2..64b24a88d5 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_point.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c
@@ -239,8 +239,8 @@ setup_point_coefficients( struct lp_setup_context *setup,
          /* check if the sprite coord flag is set for this attribute.
           * If so, set it up so it up so x and y vary from 0 to 1.
           */
-         if (shader->info.input_semantic_name[slot] == TGSI_SEMANTIC_GENERIC) {
-            unsigned semantic_index = shader->info.input_semantic_index[slot];
+         if (shader->info.base.input_semantic_name[slot] == TGSI_SEMANTIC_GENERIC) {
+            unsigned semantic_index = shader->info.base.input_semantic_index[slot];
             /* Note that sprite_coord enable is a bitfield of
              * PIPE_MAX_SHADER_OUTPUTS bits.
              */
diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c
index bb059d0459..7f68818ab4 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_derived.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c
@@ -66,14 +66,14 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
 
    draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index);
 
-   for (i = 0; i < lpfs->info.num_inputs; i++) {
+   for (i = 0; i < lpfs->info.base.num_inputs; i++) {
       /*
        * Search for each input in current vs output:
        */
 
       vs_index = draw_find_shader_output(llvmpipe->draw,
-                                         lpfs->info.input_semantic_name[i],
-                                         lpfs->info.input_semantic_index[i]);
+                                         lpfs->info.base.input_semantic_name[i],
+                                         lpfs->info.base.input_semantic_index[i]);
       if (vs_index < 0) {
          /*
           * This can happen with sprite coordinates - the vertex
@@ -86,9 +86,9 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
 
       /* This can be pre-computed, except for flatshade:
        */
-      inputs[i].usage_mask = lpfs->info.input_usage_mask[i];
+      inputs[i].usage_mask = lpfs->info.base.input_usage_mask[i];
 
-      switch (lpfs->info.input_interpolate[i]) {
+      switch (lpfs->info.base.input_interpolate[i]) {
       case TGSI_INTERPOLATE_CONSTANT:
          inputs[i].interp = LP_INTERP_CONSTANT;
          break;
@@ -103,7 +103,7 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
          break;
       }
 
-      switch (lpfs->info.input_semantic_name[i]) {
+      switch (lpfs->info.base.input_semantic_name[i]) {
       case TGSI_SEMANTIC_FACE:
          inputs[i].interp = LP_INTERP_FACING;
          break;
@@ -145,7 +145,7 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
       draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
    }
 
-   llvmpipe->num_inputs = lpfs->info.num_inputs;
+   llvmpipe->num_inputs = lpfs->info.base.num_inputs;
 
    draw_compute_vertex_size(vinfo);
 
@@ -153,7 +153,7 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
 
    lp_setup_set_fs_inputs(llvmpipe->setup,
                           inputs,
-                          lpfs->info.num_inputs);
+                          lpfs->info.base.num_inputs);
 }
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 6bfd02061d..6872f2d3c6 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -238,9 +238,9 @@ generate_fs(struct llvmpipe_context *lp,
    LLVMValueRef zs_value = NULL;
    LLVMValueRef stencil_refs[2];
    struct lp_build_mask_context mask;
-   boolean simple_shader = (shader->info.file_count[TGSI_FILE_SAMPLER] == 0 &&
-                            shader->info.num_inputs < 3 &&
-                            shader->info.num_instructions < 8);
+   boolean simple_shader = (shader->info.base.file_count[TGSI_FILE_SAMPLER] == 0 &&
+                            shader->info.base.num_inputs < 3 &&
+                            shader->info.base.num_instructions < 8);
    unsigned attrib;
    unsigned chan;
    unsigned cbuf;
@@ -253,8 +253,8 @@ generate_fs(struct llvmpipe_context *lp,
       zs_format_desc = util_format_description(key->zsbuf_format);
       assert(zs_format_desc);
 
-      if (!shader->info.writes_z) {
-         if (key->alpha.enabled || shader->info.uses_kill)
+      if (!shader->info.base.writes_z) {
+         if (key->alpha.enabled || shader->info.base.uses_kill)
             /* With alpha test and kill, can do the depth test early
              * and hopefully eliminate some quads.  But need to do a
              * special deferred depth write once the final mask value
@@ -334,12 +334,12 @@ generate_fs(struct llvmpipe_context *lp,
    /* Build the actual shader */
    lp_build_tgsi_soa(builder, tokens, type, &mask,
                      consts_ptr, interp->pos, interp->inputs,
-                     outputs, sampler, &shader->info);
+                     outputs, sampler, &shader->info.base);
 
 
    /* Alpha test */
    if (key->alpha.enabled) {
-      int color0 = find_output_by_semantic(&shader->info,
+      int color0 = find_output_by_semantic(&shader->info.base,
                                            TGSI_SEMANTIC_COLOR,
                                            0);
 
@@ -358,7 +358,7 @@ generate_fs(struct llvmpipe_context *lp,
 
    /* Late Z test */
    if (depth_mode & LATE_DEPTH_TEST) { 
-      int pos0 = find_output_by_semantic(&shader->info,
+      int pos0 = find_output_by_semantic(&shader->info.base,
                                          TGSI_SEMANTIC_POSITION,
                                          0);
          
@@ -399,11 +399,11 @@ generate_fs(struct llvmpipe_context *lp,
 
 
    /* Color write  */
-   for (attrib = 0; attrib < shader->info.num_outputs; ++attrib)
+   for (attrib = 0; attrib < shader->info.base.num_outputs; ++attrib)
    {
-      if (shader->info.output_semantic_name[attrib] == TGSI_SEMANTIC_COLOR)
+      if (shader->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_COLOR)
       {
-         unsigned cbuf = shader->info.output_semantic_index[attrib];
+         unsigned cbuf = shader->info.base.output_semantic_index[attrib];
          for(chan = 0; chan < NUM_CHANNELS; ++chan)
          {
             /* XXX: just initialize outputs to point at colors[] and
@@ -728,7 +728,7 @@ generate_fragment(struct llvmpipe_context *lp,
           */
          boolean do_branch = ((key->depth.enabled || key->stencil[0].enabled) &&
                               !key->alpha.enabled &&
-                              !shader->info.uses_kill);
+                              !shader->info.base.uses_kill);
 
          generate_blend(&key->blend,
                         rt,
@@ -917,7 +917,7 @@ generate_variant(struct llvmpipe_context *lp,
          !key->stencil[0].enabled &&
          !key->alpha.enabled &&
          !key->depth.enabled &&
-         !shader->info.uses_kill
+         !shader->info.base.uses_kill
          ? TRUE : FALSE;
 
 
@@ -954,7 +954,7 @@ llvmpipe_create_fs_state(struct pipe_context *pipe,
    make_empty_list(&shader->variants);
 
    /* get/save the summary info for this shader */
-   tgsi_scan_shader(templ->tokens, &shader->info);
+   lp_build_tgsi_info(templ->tokens, &shader->info);
 
    /* we need to keep a local copy of the tokens */
    shader->base.tokens = tgsi_dup_tokens(templ->tokens);
@@ -966,7 +966,7 @@ llvmpipe_create_fs_state(struct pipe_context *pipe,
       return NULL;
    }
 
-   nr_samplers = shader->info.file_max[TGSI_FILE_SAMPLER] + 1;
+   nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1;
 
    shader->variant_key_size = Offset(struct lp_fragment_shader_variant_key,
 				     sampler[nr_samplers]);
@@ -976,8 +976,8 @@ llvmpipe_create_fs_state(struct pipe_context *pipe,
       debug_printf("llvmpipe: Create fragment shader #%u %p:\n", shader->no, (void *) shader);
       tgsi_dump(templ->tokens, 0);
       debug_printf("usage masks:\n");
-      for (attrib = 0; attrib < shader->info.num_inputs; ++attrib) {
-         unsigned usage_mask = shader->info.input_usage_mask[attrib];
+      for (attrib = 0; attrib < shader->info.base.num_inputs; ++attrib) {
+         unsigned usage_mask = shader->info.base.input_usage_mask[attrib];
          debug_printf("  IN[%u].%s%s%s%s\n",
                       attrib,
                       usage_mask & TGSI_WRITEMASK_X ? "x" : "",
@@ -1206,10 +1206,10 @@ make_variant_key(struct llvmpipe_context *lp,
 
    /* This value will be the same for all the variants of a given shader:
     */
-   key->nr_samplers = shader->info.file_max[TGSI_FILE_SAMPLER] + 1;
+   key->nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1;
 
    for(i = 0; i < key->nr_samplers; ++i) {
-      if(shader->info.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) {
+      if(shader->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) {
          lp_sampler_static_state(&key->sampler[i],
 				 lp->fragment_sampler_views[i],
 				 lp->sampler[i]);
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.h b/src/gallium/drivers/llvmpipe/lp_state_fs.h
index 4999b8dca1..ddad117aca 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.h
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.h
@@ -34,6 +34,7 @@
 #include "pipe/p_state.h"
 #include "tgsi/tgsi_scan.h" /* for tgsi_shader_info */
 #include "gallivm/lp_bld_sample.h" /* for struct lp_sampler_static_state */
+#include "gallivm/lp_bld_tgsi.h" /* for lp_tgsi_info */
 
 
 struct tgsi_token;
@@ -96,7 +97,7 @@ struct lp_fragment_shader
 {
    struct pipe_shader_state base;
 
-   struct tgsi_shader_info info;
+   struct lp_tgsi_info info;
 
    struct lp_fs_variant_list_item variants;
 
-- 
cgit v1.2.3


From 4cb3b4ced80891ce8760cf5a0c06db9dbee36b76 Mon Sep 17 00:00:00 2001
From: José Fonseca <jfonseca@vmware.com>
Date: Mon, 11 Oct 2010 19:45:52 +0100
Subject: llvmpipe: Do not dispose the execution engine.

The engine is a global owned by gallivm module.
---
 src/gallium/drivers/llvmpipe/lp_jit.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c
index 04b12dedcc..e09ec504ab 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.c
+++ b/src/gallium/drivers/llvmpipe/lp_jit.c
@@ -162,9 +162,6 @@ lp_jit_init_globals(struct llvmpipe_screen *screen)
 void
 lp_jit_screen_cleanup(struct llvmpipe_screen *screen)
 {
-   if(screen->engine)
-      LLVMDisposeExecutionEngine(screen->engine);
-
    if(screen->pass)
       LLVMDisposePassManager(screen->pass);
 }
-- 
cgit v1.2.3


From 2cf98d5a6dccba3fd69b8469e67f66dfb5fc9651 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Mon, 11 Oct 2010 16:30:14 +0100
Subject: llvmpipe: try to do more of rast_tri_3_16 with intrinsics

There was actually a large quantity of scalar code in these functions
previously.  This tries to move more into intrinsics.

Introduce an sse2 mm_mullo_epi32 replacement to avoid sse4 dependency
in the new rasterization code.
---
 src/gallium/drivers/llvmpipe/lp_rast.h     |  16 +-
 src/gallium/drivers/llvmpipe/lp_rast_tri.c | 264 ++++++++++++++++++++++++++++-
 2 files changed, 271 insertions(+), 9 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
index df0bea04b9..e2bcc45016 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -89,19 +89,21 @@ struct lp_rast_shader_inputs {
    const struct lp_rast_state *state;
 };
 
-
+/* Note: the order of these values is important as they are loaded by
+ * sse code in rasterization:
+ */
 struct lp_rast_plane {
-   /* one-pixel sized trivial accept offsets for each plane */
-   int ei;
-
-   /* one-pixel sized trivial reject offsets for each plane */
-   int eo;
-
    /* edge function values at minx,miny ?? */
    int c;
 
    int dcdx;
    int dcdy;
+
+   /* one-pixel sized trivial reject offsets for each plane */
+   int eo;
+
+   /* one-pixel sized trivial accept offsets for each plane */
+   int ei;
 };
 
 /**
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index f870a187db..7a6cbb8b63 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -32,6 +32,7 @@
 #include <limits.h>
 #include "util/u_math.h"
 #include "lp_debug.h"
+#include "lp_debug_intrin.h"
 #include "lp_perf.h"
 #include "lp_rast_priv.h"
 #include "lp_tile_soa.h"
@@ -254,8 +255,8 @@ sign_bits4(const __m128i *cstep, int cdiff)
 
 #define TAG(x) x##_3
 #define NR_PLANES 3
-#define TRI_4 lp_rast_triangle_3_4
-#define TRI_16 lp_rast_triangle_3_16
+/*#define TRI_4 lp_rast_triangle_3_4*/
+/*#define TRI_16 lp_rast_triangle_3_16*/
 #include "lp_rast_tri_tmp.h"
 
 #define TAG(x) x##_4
@@ -279,3 +280,262 @@ sign_bits4(const __m128i *cstep, int cdiff)
 #define NR_PLANES 8
 #include "lp_rast_tri_tmp.h"
 
+
+static INLINE void
+transpose4_epi32(__m128i a,
+                 __m128i b,
+                 __m128i c,
+                 __m128i d,
+                 __m128i *o,
+                 __m128i *p,
+                 __m128i *q,
+                 __m128i *r)
+{
+  __m128i t0 = _mm_unpacklo_epi32(a, b);
+  __m128i t1 = _mm_unpacklo_epi32(c, d);
+  __m128i t2 = _mm_unpackhi_epi32(a, b);
+  __m128i t3 = _mm_unpackhi_epi32(c, d);
+
+  *o = _mm_unpacklo_epi64(t0, t1);
+  *p = _mm_unpackhi_epi64(t0, t1);
+  *q = _mm_unpacklo_epi64(t2, t3);
+  *r = _mm_unpackhi_epi64(t2, t3);
+}
+
+
+#define SCALAR_EPI32(m, i) _mm_shuffle_epi32((m), _MM_SHUFFLE(i,i,i,i))
+
+#define NR_PLANES 3
+
+
+
+/* Provide an SSE2 implementation of _mm_mullo_epi32() in terms of
+ * _mm_mul_epu32().
+ *
+ * I suspect this works fine for us because one of our operands is
+ * always positive, but not sure that this can be used for general
+ * signed integer multiplication.
+ *
+ * This seems close enough to the speed of SSE4 and the real
+ * _mm_mullo_epi32() intrinsic as to not justify adding an sse4
+ * dependency at this point.
+ */
+static INLINE __m128i mm_mullo_epi32(const __m128i a, const __m128i b)
+{
+   __m128i a4   = _mm_srli_si128(a, 4);  /* shift by one dword */
+   __m128i b4   = _mm_srli_si128(b, 4);  /* shift by one dword */
+   __m128i ba   = _mm_mul_epu32(b, a);   /* multply dwords 0, 2 */
+   __m128i b4a4 = _mm_mul_epu32(b4, a4); /* multiply dwords 1, 3 */
+
+   /* Interleave the results, either with shuffles or (slightly
+    * faster) direct bit operations:
+    */
+#if 0
+   __m128i ba8             = _mm_shuffle_epi32(ba, 8);
+   __m128i b4a48           = _mm_shuffle_epi32(b4a4, 8);
+   __m128i result          = _mm_unpacklo_epi32(ba8, b4a48);
+#else
+   __m128i mask            = _mm_setr_epi32(~0,0,~0,0);
+   __m128i ba_mask         = _mm_and_si128(ba, mask);
+   __m128i b4a4_mask       = _mm_and_si128(b4a4, mask);
+   __m128i b4a4_mask_shift = _mm_slli_si128(b4a4_mask, 4);
+   __m128i result          = _mm_or_si128(ba_mask, b4a4_mask_shift);
+#endif
+
+   return result;
+}
+
+
+
+
+void
+lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
+                      const union lp_rast_cmd_arg arg)
+{
+   const struct lp_rast_triangle *tri = arg.triangle.tri;
+   const struct lp_rast_plane *plane = tri->plane;
+   int x = (arg.triangle.plane_mask & 0xff) + task->x;
+   int y = (arg.triangle.plane_mask >> 8) + task->y;
+   unsigned i, j;
+
+   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
+   unsigned nr = 0;
+
+   __m128i p0 = _mm_loadu_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
+   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
+   __m128i p2 = _mm_loadu_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
+   __m128i zero = _mm_setzero_si128();
+
+   __m128i c;
+   __m128i dcdx;
+   __m128i dcdy;
+   __m128i rej4;
+
+   __m128i dcdx2;
+   __m128i dcdx3;
+   
+   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
+   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
+   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
+   __m128i unused;
+   
+   transpose4_epi32(p0, p1, p2, zero,
+                   &c, &dcdx, &dcdy, &rej4);
+
+   /* Adjust dcdx;
+    */
+   dcdx = _mm_sub_epi32(zero, dcdx);
+
+   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
+   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
+   rej4 = _mm_slli_epi32(rej4, 2);
+
+   dcdx2 = _mm_add_epi32(dcdx, dcdx);
+   dcdx3 = _mm_add_epi32(dcdx2, dcdx);
+
+   transpose4_epi32(zero, dcdx, dcdx2, dcdx3,
+                   &span_0, &span_1, &span_2, &unused);
+
+   for (i = 0; i < 4; i++) {
+      __m128i cx = c;
+
+      for (j = 0; j < 4; j++) {
+         __m128i c4rej = _mm_add_epi32(cx, rej4);
+         __m128i rej_masks = _mm_srai_epi32(c4rej, 31);
+
+         /* if (is_zero(rej_masks)) */
+         if (_mm_movemask_epi8(rej_masks) == 0) {
+            __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0);
+            __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1);
+            __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2);
+
+            __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);
+
+            __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
+            __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
+            __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));
+
+            __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
+            __m128i c_01 = _mm_packs_epi32(c_0, c_1);
+
+            __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
+            __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
+            __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));
+
+            __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);
+
+            __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
+            __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
+            __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));
+
+            __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
+            __m128i c_23 = _mm_packs_epi32(c_2, c_3);
+            __m128i c_0123 = _mm_packs_epi16(c_01, c_23);
+
+            unsigned mask = _mm_movemask_epi8(c_0123);
+
+            out[nr].i = i;
+            out[nr].j = j;
+            out[nr].mask = mask;
+            if (mask != 0xffff)
+               nr++;
+         }
+         cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2));
+      }
+
+      c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2));
+   }
+
+   for (i = 0; i < nr; i++)
+      lp_rast_shade_quads_mask(task,
+                               &tri->inputs,
+                               x + 4 * out[i].j,
+                               y + 4 * out[i].i,
+                               0xffff & ~out[i].mask);
+}
+
+
+
+
+
+void
+lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
+                     const union lp_rast_cmd_arg arg)
+{
+   const struct lp_rast_triangle *tri = arg.triangle.tri;
+   const struct lp_rast_plane *plane = tri->plane;
+   int x = (arg.triangle.plane_mask & 0xff) + task->x;
+   int y = (arg.triangle.plane_mask >> 8) + task->y;
+
+   __m128i p0 = _mm_loadu_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
+   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
+   __m128i p2 = _mm_loadu_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
+   __m128i zero = _mm_setzero_si128();
+
+   __m128i c;
+   __m128i dcdx;
+   __m128i dcdy;
+
+   __m128i dcdx2;
+   __m128i dcdx3;
+   
+   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
+   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
+   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
+   __m128i unused;
+   
+   transpose4_epi32(p0, p1, p2, zero,
+                    &c, &dcdx, &dcdy, &unused);
+
+   /* Adjust dcdx;
+    */
+   dcdx = _mm_sub_epi32(zero, dcdx);
+
+   c = _mm_add_epi32(c, _mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
+   c = _mm_add_epi32(c, _mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
+
+   dcdx2 = _mm_add_epi32(dcdx, dcdx);
+   dcdx3 = _mm_add_epi32(dcdx2, dcdx);
+
+   transpose4_epi32(zero, dcdx, dcdx2, dcdx3,
+                    &span_0, &span_1, &span_2, &unused);
+
+
+   {
+      __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0);
+      __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1);
+      __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2);
+      
+      __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);
+
+      __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
+      __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
+      __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));
+
+      __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
+      __m128i c_01 = _mm_packs_epi32(c_0, c_1);
+
+      __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
+      __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
+      __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));
+
+      __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);
+
+      __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
+      __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
+      __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));
+
+      __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
+      __m128i c_23 = _mm_packs_epi32(c_2, c_3);
+      __m128i c_0123 = _mm_packs_epi16(c_01, c_23);
+
+      unsigned mask = _mm_movemask_epi8(c_0123);
+
+      if (mask != 0xffff)
+         lp_rast_shade_quads_mask(task,
+                                  &tri->inputs,
+                                  x,
+                                  y,
+                                  0xffff & ~mask);
+   }
+}
-- 
cgit v1.2.3


From 9d59e148f86c1de2c69639d389398d7435cc193e Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Mon, 11 Oct 2010 18:20:02 +0100
Subject: llvmpipe: add debug helpers for epi32 etc

---
 src/gallium/drivers/llvmpipe/lp_debug_intrin.h | 115 +++++++++++++++++++++++++
 1 file changed, 115 insertions(+)
 create mode 100644 src/gallium/drivers/llvmpipe/lp_debug_intrin.h

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_debug_intrin.h b/src/gallium/drivers/llvmpipe/lp_debug_intrin.h
new file mode 100644
index 0000000000..5237e7893b
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_debug_intrin.h
@@ -0,0 +1,115 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+#ifndef _LP_DEBUG_INTRIN_H_
+#define _LP_DEBUG_INTRIN_H_
+
+#include "pipe/p_config.h"
+
+#if defined(PIPE_ARCH_SSE)
+
+#include <emmintrin.h>
+
+static INLINE void print_epi8(const char *name, __m128i r)
+{
+   union { __m128i m; ubyte ub[16]; } u;
+   u.m = r;
+
+   debug_printf("%s: "
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x\n",
+                name,
+                u.ub[0],  u.ub[1],  u.ub[2],  u.ub[3],
+                u.ub[4],  u.ub[5],  u.ub[6],  u.ub[7],
+                u.ub[8],  u.ub[9],  u.ub[10], u.ub[11],
+                u.ub[12], u.ub[13], u.ub[14], u.ub[15]);
+}
+
+static INLINE void print_epi16(const char *name, __m128i r)
+{
+   union { __m128i m; ushort us[8]; } u;
+   u.m = r;
+
+   debug_printf("%s: "
+                "%04x/"
+                "%04x/"
+                "%04x/"
+                "%04x/"
+                "%04x/"
+                "%04x/"
+                "%04x/"
+                "%04x\n",
+                name,
+                u.us[0],  u.us[1],  u.us[2],  u.us[3],
+                u.us[4],  u.us[5],  u.us[6],  u.us[7]);
+}
+
+static INLINE void print_epi32(const char *name, __m128i r)
+{
+   union { __m128i m; uint ui[4]; } u;
+   u.m = r;
+
+   debug_printf("%s: "
+                "%08x/"
+                "%08x/"
+                "%08x/"
+                "%08x\n",
+                name,
+                u.ui[0],  u.ui[1],  u.ui[2],  u.ui[3]);
+}
+
+static INLINE void print_ps(const char *name, __m128 r)
+{
+   union { __m128 m; float f[4]; } u;
+   u.m = r;
+
+   debug_printf("%s: "
+                "%f/"
+                "%f/"
+                "%f/"
+                "%f\n",
+                name,
+                u.f[0],  u.f[1],  u.f[2],  u.f[3]);
+}
+
+ 
+#endif
+#endif
-- 
cgit v1.2.3


From 9773722c2b09d5f0615a47cecf4347859474dc56 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Tue, 12 Oct 2010 11:02:19 +0100
Subject: llvmpipe: try to keep plane c values small

Avoid accumulating more and more fixed point bits.
---
 src/gallium/drivers/llvmpipe/lp_setup_line.c |  3 +--
 src/gallium/drivers/llvmpipe/lp_setup_tri.c  | 38 +++++++++++++++++-----------
 2 files changed, 24 insertions(+), 17 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c
index 693ac28175..c940860850 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_line.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
@@ -640,8 +640,7 @@ try_setup_line( struct lp_setup_context *setup,
          }
       }
 
-      plane->dcdx *= FIXED_ONE;
-      plane->dcdy *= FIXED_ONE;
+      plane->c = (plane->c + (FIXED_ONE-1)) / FIXED_ONE;
 
       /* find trivial reject offsets for each edge for a single-pixel
        * sized block.  These will be scaled up at each recursive level to
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index 8fd034666c..dfe1bd11ea 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -343,26 +343,34 @@ do_triangle_ccw(struct lp_setup_context *setup,
        * Also, sometimes (in FBO cases) GL will render upside down
        * to its usual method, in which case it will probably want
        * to use the opposite, top-left convention.
+       *
+       * XXX: Chances are this will get stripped away.  In fact this
+       * is only meaningful if:
+       *
+       *          (plane->c & (FIXED_ONE-1)) == 0
+       *
        */         
-      if (plane->dcdx < 0) {
-         /* both fill conventions want this - adjust for left edges */
-         plane->c++;            
-      }
-      else if (plane->dcdx == 0) {
-         if (setup->pixel_offset == 0) {
-            /* correct for top-left fill convention:
-             */
-            if (plane->dcdy > 0) plane->c++;
+      if ((plane->c & (FIXED_ONE-1)) == 0) {
+         if (plane->dcdx < 0) {
+            /* both fill conventions want this - adjust for left edges */
+            plane->c++;            
          }
-         else {
-            /* correct for bottom-left fill convention:
-             */
-            if (plane->dcdy < 0) plane->c++;
+         else if (plane->dcdx == 0) {
+            if (setup->pixel_offset == 0) {
+               /* correct for top-left fill convention:
+                */
+               if (plane->dcdy > 0) plane->c++;
+            }
+            else {
+               /* correct for bottom-left fill convention:
+                */
+               if (plane->dcdy < 0) plane->c++;
+            }
          }
       }
 
-      plane->dcdx *= FIXED_ONE;
-      plane->dcdy *= FIXED_ONE;
+      plane->c = (plane->c + (FIXED_ONE-1)) / FIXED_ONE;
+
 
       /* find trivial reject offsets for each edge for a single-pixel
        * sized block.  These will be scaled up at each recursive level to
-- 
cgit v1.2.3


From b4277bc5843aca7f9e0ecc7e956733f1becd6ad6 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Tue, 12 Oct 2010 11:51:28 +0100
Subject: llvmpipe: fix typo in last commit

---
 src/gallium/drivers/llvmpipe/lp_rast_tri.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index 7a6cbb8b63..854fd5cc1e 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -491,8 +491,8 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
     */
    dcdx = _mm_sub_epi32(zero, dcdx);
 
-   c = _mm_add_epi32(c, _mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
-   c = _mm_add_epi32(c, _mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
+   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
+   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
 
    dcdx2 = _mm_add_epi32(dcdx, dcdx);
    dcdx3 = _mm_add_epi32(dcdx2, dcdx);
-- 
cgit v1.2.3


From 39331be44efc5b5ae749df3f6987626837c7b8ff Mon Sep 17 00:00:00 2001
From: José Fonseca <jfonseca@vmware.com>
Date: Tue, 12 Oct 2010 12:27:55 +0100
Subject: llvmpipe: Fix MSVC build.

MSVC doesn't accept more than 3 __m128i arguments.
---
 src/gallium/drivers/llvmpipe/lp_rast_tri.c | 36 +++++++++++++++---------------
 1 file changed, 18 insertions(+), 18 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index 854fd5cc1e..5e8918b1d8 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -282,19 +282,19 @@ sign_bits4(const __m128i *cstep, int cdiff)
 
 
 static INLINE void
-transpose4_epi32(__m128i a,
-                 __m128i b,
-                 __m128i c,
-                 __m128i d,
-                 __m128i *o,
-                 __m128i *p,
-                 __m128i *q,
-                 __m128i *r)
+transpose4_epi32(const __m128i * restrict a,
+                 const __m128i * restrict b,
+                 const __m128i * restrict c,
+                 const __m128i * restrict d,
+                 __m128i * restrict o,
+                 __m128i * restrict p,
+                 __m128i * restrict q,
+                 __m128i * restrict r)
 {
-  __m128i t0 = _mm_unpacklo_epi32(a, b);
-  __m128i t1 = _mm_unpacklo_epi32(c, d);
-  __m128i t2 = _mm_unpackhi_epi32(a, b);
-  __m128i t3 = _mm_unpackhi_epi32(c, d);
+  __m128i t0 = _mm_unpacklo_epi32(*a, *b);
+  __m128i t1 = _mm_unpacklo_epi32(*c, *d);
+  __m128i t2 = _mm_unpackhi_epi32(*a, *b);
+  __m128i t3 = _mm_unpackhi_epi32(*c, *d);
 
   *o = _mm_unpacklo_epi64(t0, t1);
   *p = _mm_unpackhi_epi64(t0, t1);
@@ -379,8 +379,8 @@ lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
    __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
    __m128i unused;
    
-   transpose4_epi32(p0, p1, p2, zero,
-                   &c, &dcdx, &dcdy, &rej4);
+   transpose4_epi32(&p0, &p1, &p2, &zero,
+                    &c, &dcdx, &dcdy, &rej4);
 
    /* Adjust dcdx;
     */
@@ -393,8 +393,8 @@ lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
    dcdx2 = _mm_add_epi32(dcdx, dcdx);
    dcdx3 = _mm_add_epi32(dcdx2, dcdx);
 
-   transpose4_epi32(zero, dcdx, dcdx2, dcdx3,
-                   &span_0, &span_1, &span_2, &unused);
+   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
+                    &span_0, &span_1, &span_2, &unused);
 
    for (i = 0; i < 4; i++) {
       __m128i cx = c;
@@ -484,7 +484,7 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
    __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
    __m128i unused;
    
-   transpose4_epi32(p0, p1, p2, zero,
+   transpose4_epi32(&p0, &p1, &p2, &zero,
                     &c, &dcdx, &dcdy, &unused);
 
    /* Adjust dcdx;
@@ -497,7 +497,7 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
    dcdx2 = _mm_add_epi32(dcdx, dcdx);
    dcdx3 = _mm_add_epi32(dcdx2, dcdx);
 
-   transpose4_epi32(zero, dcdx, dcdx2, dcdx3,
+   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                     &span_0, &span_1, &span_2, &unused);
 
 
-- 
cgit v1.2.3


From 1a574afabc8840da82e68ac643ec3a7b05afb631 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Tue, 12 Oct 2010 13:02:28 +0100
Subject: gallium: move sse intrinsics debug helpers to u_sse.h

---
 src/gallium/auxiliary/util/u_sse.h             |  80 ++++++++++++++++-
 src/gallium/drivers/llvmpipe/lp_debug_intrin.h | 115 -------------------------
 src/gallium/drivers/llvmpipe/lp_rast_tri.c     |   1 -
 3 files changed, 79 insertions(+), 117 deletions(-)
 delete mode 100644 src/gallium/drivers/llvmpipe/lp_debug_intrin.h

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/auxiliary/util/u_sse.h b/src/gallium/auxiliary/util/u_sse.h
index 03198c91da..8fd0e52a3a 100644
--- a/src/gallium/auxiliary/util/u_sse.h
+++ b/src/gallium/auxiliary/util/u_sse.h
@@ -72,6 +72,84 @@ _mm_castps_si128(__m128 a)
 #endif /* defined(_MSC_VER) && _MSC_VER < 1500 */
 
 
+static INLINE void u_print_epi8(const char *name, __m128i r)
+{
+   union { __m128i m; ubyte ub[16]; } u;
+   u.m = r;
+
+   debug_printf("%s: "
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x\n",
+                name,
+                u.ub[0],  u.ub[1],  u.ub[2],  u.ub[3],
+                u.ub[4],  u.ub[5],  u.ub[6],  u.ub[7],
+                u.ub[8],  u.ub[9],  u.ub[10], u.ub[11],
+                u.ub[12], u.ub[13], u.ub[14], u.ub[15]);
+}
+
+static INLINE void u_print_epi16(const char *name, __m128i r)
+{
+   union { __m128i m; ushort us[8]; } u;
+   u.m = r;
+
+   debug_printf("%s: "
+                "%04x/"
+                "%04x/"
+                "%04x/"
+                "%04x/"
+                "%04x/"
+                "%04x/"
+                "%04x/"
+                "%04x\n",
+                name,
+                u.us[0],  u.us[1],  u.us[2],  u.us[3],
+                u.us[4],  u.us[5],  u.us[6],  u.us[7]);
+}
+
+static INLINE void u_print_epi32(const char *name, __m128i r)
+{
+   union { __m128i m; uint ui[4]; } u;
+   u.m = r;
+
+   debug_printf("%s: "
+                "%08x/"
+                "%08x/"
+                "%08x/"
+                "%08x\n",
+                name,
+                u.ui[0],  u.ui[1],  u.ui[2],  u.ui[3]);
+}
+
+static INLINE void u_print_ps(const char *name, __m128 r)
+{
+   union { __m128 m; float f[4]; } u;
+   u.m = r;
+
+   debug_printf("%s: "
+                "%f/"
+                "%f/"
+                "%f/"
+                "%f\n",
+                name,
+                u.f[0],  u.f[1],  u.f[2],  u.f[3]);
+}
+
+
+
 #if defined(PIPE_ARCH_SSSE3)
 
 #include <tmmintrin.h>
@@ -98,6 +176,6 @@ _mm_shuffle_epi8(__m128i a, __m128i mask)
 #endif /* !PIPE_ARCH_SSSE3 */
 
 
-#endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */
+#endif /* PIPE_ARCH_SSE */
 
 #endif /* U_SSE_H_ */
diff --git a/src/gallium/drivers/llvmpipe/lp_debug_intrin.h b/src/gallium/drivers/llvmpipe/lp_debug_intrin.h
deleted file mode 100644
index 5237e7893b..0000000000
--- a/src/gallium/drivers/llvmpipe/lp_debug_intrin.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2010 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- **************************************************************************/
-
-#ifndef _LP_DEBUG_INTRIN_H_
-#define _LP_DEBUG_INTRIN_H_
-
-#include "pipe/p_config.h"
-
-#if defined(PIPE_ARCH_SSE)
-
-#include <emmintrin.h>
-
-static INLINE void print_epi8(const char *name, __m128i r)
-{
-   union { __m128i m; ubyte ub[16]; } u;
-   u.m = r;
-
-   debug_printf("%s: "
-                "%02x/"
-                "%02x/"
-                "%02x/"
-                "%02x/"
-                "%02x/"
-                "%02x/"
-                "%02x/"
-                "%02x/"
-                "%02x/"
-                "%02x/"
-                "%02x/"
-                "%02x/"
-                "%02x/"
-                "%02x/"
-                "%02x/"
-                "%02x\n",
-                name,
-                u.ub[0],  u.ub[1],  u.ub[2],  u.ub[3],
-                u.ub[4],  u.ub[5],  u.ub[6],  u.ub[7],
-                u.ub[8],  u.ub[9],  u.ub[10], u.ub[11],
-                u.ub[12], u.ub[13], u.ub[14], u.ub[15]);
-}
-
-static INLINE void print_epi16(const char *name, __m128i r)
-{
-   union { __m128i m; ushort us[8]; } u;
-   u.m = r;
-
-   debug_printf("%s: "
-                "%04x/"
-                "%04x/"
-                "%04x/"
-                "%04x/"
-                "%04x/"
-                "%04x/"
-                "%04x/"
-                "%04x\n",
-                name,
-                u.us[0],  u.us[1],  u.us[2],  u.us[3],
-                u.us[4],  u.us[5],  u.us[6],  u.us[7]);
-}
-
-static INLINE void print_epi32(const char *name, __m128i r)
-{
-   union { __m128i m; uint ui[4]; } u;
-   u.m = r;
-
-   debug_printf("%s: "
-                "%08x/"
-                "%08x/"
-                "%08x/"
-                "%08x\n",
-                name,
-                u.ui[0],  u.ui[1],  u.ui[2],  u.ui[3]);
-}
-
-static INLINE void print_ps(const char *name, __m128 r)
-{
-   union { __m128 m; float f[4]; } u;
-   u.m = r;
-
-   debug_printf("%s: "
-                "%f/"
-                "%f/"
-                "%f/"
-                "%f\n",
-                name,
-                u.f[0],  u.f[1],  u.f[2],  u.f[3]);
-}
-
- 
-#endif
-#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index 5e8918b1d8..19b0bd686a 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -32,7 +32,6 @@
 #include <limits.h>
 #include "util/u_math.h"
 #include "lp_debug.h"
-#include "lp_debug_intrin.h"
 #include "lp_perf.h"
 #include "lp_rast_priv.h"
 #include "lp_tile_soa.h"
-- 
cgit v1.2.3


From 0ca0382d1bfd1e9128fa4b588ce1411f7b8a85df Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Tue, 12 Oct 2010 13:20:39 +0100
Subject: Revert "llvmpipe: try to keep plane c values small"

This reverts commit 9773722c2b09d5f0615a47cecf4347859474dc56.

Looks like there are some floor/rounding issues here that need
to be better understood.
---
 src/gallium/drivers/llvmpipe/lp_setup_line.c |  3 ++-
 src/gallium/drivers/llvmpipe/lp_setup_tri.c  | 38 +++++++++++-----------------
 2 files changed, 17 insertions(+), 24 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c
index c940860850..693ac28175 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_line.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
@@ -640,7 +640,8 @@ try_setup_line( struct lp_setup_context *setup,
          }
       }
 
-      plane->c = (plane->c + (FIXED_ONE-1)) / FIXED_ONE;
+      plane->dcdx *= FIXED_ONE;
+      plane->dcdy *= FIXED_ONE;
 
       /* find trivial reject offsets for each edge for a single-pixel
        * sized block.  These will be scaled up at each recursive level to
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index dfe1bd11ea..8fd034666c 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -343,34 +343,26 @@ do_triangle_ccw(struct lp_setup_context *setup,
        * Also, sometimes (in FBO cases) GL will render upside down
        * to its usual method, in which case it will probably want
        * to use the opposite, top-left convention.
-       *
-       * XXX: Chances are this will get stripped away.  In fact this
-       * is only meaningful if:
-       *
-       *          (plane->c & (FIXED_ONE-1)) == 0
-       *
        */         
-      if ((plane->c & (FIXED_ONE-1)) == 0) {
-         if (plane->dcdx < 0) {
-            /* both fill conventions want this - adjust for left edges */
-            plane->c++;            
+      if (plane->dcdx < 0) {
+         /* both fill conventions want this - adjust for left edges */
+         plane->c++;            
+      }
+      else if (plane->dcdx == 0) {
+         if (setup->pixel_offset == 0) {
+            /* correct for top-left fill convention:
+             */
+            if (plane->dcdy > 0) plane->c++;
          }
-         else if (plane->dcdx == 0) {
-            if (setup->pixel_offset == 0) {
-               /* correct for top-left fill convention:
-                */
-               if (plane->dcdy > 0) plane->c++;
-            }
-            else {
-               /* correct for bottom-left fill convention:
-                */
-               if (plane->dcdy < 0) plane->c++;
-            }
+         else {
+            /* correct for bottom-left fill convention:
+             */
+            if (plane->dcdy < 0) plane->c++;
          }
       }
 
-      plane->c = (plane->c + (FIXED_ONE-1)) / FIXED_ONE;
-
+      plane->dcdx *= FIXED_ONE;
+      plane->dcdy *= FIXED_ONE;
 
       /* find trivial reject offsets for each edge for a single-pixel
        * sized block.  These will be scaled up at each recursive level to
-- 
cgit v1.2.3


From e3ec0fdd546259005c9ed2bf7b05cead2ab95b43 Mon Sep 17 00:00:00 2001
From: José Fonseca <jfonseca@vmware.com>
Date: Tue, 12 Oct 2010 14:15:59 +0100
Subject: llvmpipe: improve mm_mullo_epi32

Apply Jose's suggestions for a small but measurable improvement in
isosurf.
---
 src/gallium/drivers/llvmpipe/lp_rast_tri.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index 19b0bd686a..c3eefb724c 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -321,8 +321,8 @@ transpose4_epi32(const __m128i * restrict a,
  */
 static INLINE __m128i mm_mullo_epi32(const __m128i a, const __m128i b)
 {
-   __m128i a4   = _mm_srli_si128(a, 4);  /* shift by one dword */
-   __m128i b4   = _mm_srli_si128(b, 4);  /* shift by one dword */
+   __m128i a4   = _mm_srli_epi64(a, 32);  /* shift by one dword */
+   __m128i b4   = _mm_srli_epi64(b, 32);  /* shift by one dword */
    __m128i ba   = _mm_mul_epu32(b, a);   /* multply dwords 0, 2 */
    __m128i b4a4 = _mm_mul_epu32(b4, a4); /* multiply dwords 1, 3 */
 
@@ -336,8 +336,7 @@ static INLINE __m128i mm_mullo_epi32(const __m128i a, const __m128i b)
 #else
    __m128i mask            = _mm_setr_epi32(~0,0,~0,0);
    __m128i ba_mask         = _mm_and_si128(ba, mask);
-   __m128i b4a4_mask       = _mm_and_si128(b4a4, mask);
-   __m128i b4a4_mask_shift = _mm_slli_si128(b4a4_mask, 4);
+   __m128i b4a4_mask_shift = _mm_slli_epi64(b4a4, 32);
    __m128i result          = _mm_or_si128(ba_mask, b4a4_mask_shift);
 #endif
 
-- 
cgit v1.2.3


From 7533c374570b333b5e0d626d36d18c41d4611cb5 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Tue, 12 Oct 2010 18:26:41 +0100
Subject: llvmpipe: make sure intrinsics code is guarded with PIPE_ARCH_SSE

---
 src/gallium/drivers/llvmpipe/lp_rast_tri.c | 82 +++++++++++++++---------------
 1 file changed, 42 insertions(+), 40 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index c3eefb724c..bae772b9c5 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -239,46 +239,6 @@ sign_bits4(const __m128i *cstep, int cdiff)
    return _mm_movemask_epi8(result);
 }
 
-#endif
-
-
-
-
-#define TAG(x) x##_1
-#define NR_PLANES 1
-#include "lp_rast_tri_tmp.h"
-
-#define TAG(x) x##_2
-#define NR_PLANES 2
-#include "lp_rast_tri_tmp.h"
-
-#define TAG(x) x##_3
-#define NR_PLANES 3
-/*#define TRI_4 lp_rast_triangle_3_4*/
-/*#define TRI_16 lp_rast_triangle_3_16*/
-#include "lp_rast_tri_tmp.h"
-
-#define TAG(x) x##_4
-#define NR_PLANES 4
-#define TRI_16 lp_rast_triangle_4_16
-#include "lp_rast_tri_tmp.h"
-
-#define TAG(x) x##_5
-#define NR_PLANES 5
-#include "lp_rast_tri_tmp.h"
-
-#define TAG(x) x##_6
-#define NR_PLANES 6
-#include "lp_rast_tri_tmp.h"
-
-#define TAG(x) x##_7
-#define NR_PLANES 7
-#include "lp_rast_tri_tmp.h"
-
-#define TAG(x) x##_8
-#define NR_PLANES 8
-#include "lp_rast_tri_tmp.h"
-
 
 static INLINE void
 transpose4_epi32(const __m128i * restrict a,
@@ -537,3 +497,45 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
                                   0xffff & ~mask);
    }
 }
+
+#undef NR_PLANES
+#endif
+
+
+
+
+#define TAG(x) x##_1
+#define NR_PLANES 1
+#include "lp_rast_tri_tmp.h"
+
+#define TAG(x) x##_2
+#define NR_PLANES 2
+#include "lp_rast_tri_tmp.h"
+
+#define TAG(x) x##_3
+#define NR_PLANES 3
+/*#define TRI_4 lp_rast_triangle_3_4*/
+/*#define TRI_16 lp_rast_triangle_3_16*/
+#include "lp_rast_tri_tmp.h"
+
+#define TAG(x) x##_4
+#define NR_PLANES 4
+#define TRI_16 lp_rast_triangle_4_16
+#include "lp_rast_tri_tmp.h"
+
+#define TAG(x) x##_5
+#define NR_PLANES 5
+#include "lp_rast_tri_tmp.h"
+
+#define TAG(x) x##_6
+#define NR_PLANES 6
+#include "lp_rast_tri_tmp.h"
+
+#define TAG(x) x##_7
+#define NR_PLANES 7
+#include "lp_rast_tri_tmp.h"
+
+#define TAG(x) x##_8
+#define NR_PLANES 8
+#include "lp_rast_tri_tmp.h"
+
-- 
cgit v1.2.3


From 048a90c1cb926fdeae47392582cb91b0a689905f Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 12 Oct 2010 18:38:22 -0600
Subject: draw/llvmpipe: replace DRAW_MAX_TEXTURE_LEVELS with
 PIPE_MAX_TEXTURE_LEVELS

There's no apparent reason for the former to exist.  And they didn't
even have the same value.
---
 src/gallium/auxiliary/draw/draw_context.c       |  6 +++---
 src/gallium/auxiliary/draw/draw_context.h       |  7 +++----
 src/gallium/auxiliary/draw/draw_llvm.c          | 12 ++++++------
 src/gallium/auxiliary/draw/draw_llvm.h          | 13 ++++++-------
 src/gallium/drivers/llvmpipe/lp_state_sampler.c |  6 +++---
 5 files changed, 21 insertions(+), 23 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index 032fcbbc70..40f654643b 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -721,9 +721,9 @@ draw_set_mapped_texture(struct draw_context *draw,
                         unsigned sampler_idx,
                         uint32_t width, uint32_t height, uint32_t depth,
                         uint32_t last_level,
-                        uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS],
-                        uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS],
-                        const void *data[DRAW_MAX_TEXTURE_LEVELS])
+                        uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS],
+                        uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS],
+                        const void *data[PIPE_MAX_TEXTURE_LEVELS])
 {
 #ifdef HAVE_LLVM
    if(draw->llvm)
diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h
index 1f27cbf488..ff4f753604 100644
--- a/src/gallium/auxiliary/draw/draw_context.h
+++ b/src/gallium/auxiliary/draw/draw_context.h
@@ -49,7 +49,6 @@ struct draw_geometry_shader;
 struct draw_fragment_shader;
 struct tgsi_sampler;
 
-#define DRAW_MAX_TEXTURE_LEVELS 13  /* 4K x 4K for now */
 
 struct draw_context *draw_create( struct pipe_context *pipe );
 
@@ -120,9 +119,9 @@ draw_set_mapped_texture(struct draw_context *draw,
                         unsigned sampler_idx,
                         uint32_t width, uint32_t height, uint32_t depth,
                         uint32_t last_level,
-                        uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS],
-                        uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS],
-                        const void *data[DRAW_MAX_TEXTURE_LEVELS]);
+                        uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS],
+                        uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS],
+                        const void *data[PIPE_MAX_TEXTURE_LEVELS]);
 
 
 /*
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
index 7fb86d7cb2..d94340367c 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -72,12 +72,12 @@ init_globals(struct draw_llvm *llvm)
       elem_types[DRAW_JIT_TEXTURE_DEPTH] = LLVMInt32Type();
       elem_types[DRAW_JIT_TEXTURE_LAST_LEVEL] = LLVMInt32Type();
       elem_types[DRAW_JIT_TEXTURE_ROW_STRIDE] =
-         LLVMArrayType(LLVMInt32Type(), DRAW_MAX_TEXTURE_LEVELS);
+         LLVMArrayType(LLVMInt32Type(), PIPE_MAX_TEXTURE_LEVELS);
       elem_types[DRAW_JIT_TEXTURE_IMG_STRIDE] =
-         LLVMArrayType(LLVMInt32Type(), DRAW_MAX_TEXTURE_LEVELS);
+         LLVMArrayType(LLVMInt32Type(), PIPE_MAX_TEXTURE_LEVELS);
       elem_types[DRAW_JIT_TEXTURE_DATA] =
          LLVMArrayType(LLVMPointerType(LLVMInt8Type(), 0),
-                       DRAW_MAX_TEXTURE_LEVELS);
+                       PIPE_MAX_TEXTURE_LEVELS);
       elem_types[DRAW_JIT_TEXTURE_MIN_LOD] = LLVMFloatType();
       elem_types[DRAW_JIT_TEXTURE_MAX_LOD] = LLVMFloatType();
       elem_types[DRAW_JIT_TEXTURE_LOD_BIAS] = LLVMFloatType();
@@ -1066,9 +1066,9 @@ draw_llvm_set_mapped_texture(struct draw_context *draw,
                              unsigned sampler_idx,
                              uint32_t width, uint32_t height, uint32_t depth,
                              uint32_t last_level,
-                             uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS],
-                             uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS],
-                             const void *data[DRAW_MAX_TEXTURE_LEVELS])
+                             uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS],
+                             uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS],
+                             const void *data[PIPE_MAX_TEXTURE_LEVELS])
 {
    unsigned j;
    struct draw_jit_texture *jit_tex;
diff --git a/src/gallium/auxiliary/draw/draw_llvm.h b/src/gallium/auxiliary/draw/draw_llvm.h
index d0a68ae412..de89b657f3 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.h
+++ b/src/gallium/auxiliary/draw/draw_llvm.h
@@ -41,7 +41,6 @@
 #include <llvm-c/Target.h>
 #include <llvm-c/ExecutionEngine.h>
 
-#define DRAW_MAX_TEXTURE_LEVELS 13  /* 4K x 4K for now */
 
 struct draw_llvm;
 struct llvm_vertex_shader;
@@ -52,9 +51,9 @@ struct draw_jit_texture
    uint32_t height;
    uint32_t depth;
    uint32_t last_level;
-   uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS];
-   uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS];
-   const void *data[DRAW_MAX_TEXTURE_LEVELS];
+   uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS];
+   uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS];
+   const void *data[PIPE_MAX_TEXTURE_LEVELS];
    float min_lod;
    float max_lod;
    float lod_bias;
@@ -290,8 +289,8 @@ draw_llvm_set_mapped_texture(struct draw_context *draw,
                              unsigned sampler_idx,
                              uint32_t width, uint32_t height, uint32_t depth,
                              uint32_t last_level,
-                             uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS],
-                             uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS],
-                             const void *data[DRAW_MAX_TEXTURE_LEVELS]);
+                             uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS],
+                             uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS],
+                             const void *data[PIPE_MAX_TEXTURE_LEVELS]);
 
 #endif
diff --git a/src/gallium/drivers/llvmpipe/lp_state_sampler.c b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
index 17a4a0ed02..1dd866195d 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_sampler.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
@@ -246,9 +246,9 @@ llvmpipe_prepare_vertex_sampling(struct llvmpipe_context *lp,
                                  struct pipe_sampler_view **views)
 {
    unsigned i;
-   uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS];
-   uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS];
-   const void *data[DRAW_MAX_TEXTURE_LEVELS];
+   uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS];
+   uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS];
+   const void *data[PIPE_MAX_TEXTURE_LEVELS];
 
    assert(num <= PIPE_MAX_VERTEX_SAMPLERS);
    if (!num)
-- 
cgit v1.2.3


From 95c18abb03b035c6fa029cd0852f07fb39951279 Mon Sep 17 00:00:00 2001
From: José Fonseca <jfonseca@vmware.com>
Date: Wed, 13 Oct 2010 14:28:51 +0100
Subject: llvmpipe: Unbreak Z32_FLOAT.

Z32_FLOAT uses <4 x float> as intermediate/destination type,
instead of <4 x i32>.

The necessary bitcasts were removed by commit
5b7eb868fde98388d80601d8dea39e679828f42f.

Also use depth/stencil type and build contexts consistently, and
make the depth pointer argument an ordinary <i8 *>, to catch this
sort of issue in the future (and also to pave the way for Z16 and
Z32_FLOAT_S8_X24 support).
---
 src/gallium/drivers/llvmpipe/lp_bld_depth.c | 130 ++++++++++++++++------------
 src/gallium/drivers/llvmpipe/lp_bld_depth.h |   6 ++
 src/gallium/drivers/llvmpipe/lp_state_fs.c  |  21 +++--
 3 files changed, 93 insertions(+), 64 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index 264fce8d6a..3162f3e1c2 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -458,9 +458,9 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
                             LLVMValueRef *zs_value,
                             boolean do_branch)
 {
-   struct lp_type type;
-   struct lp_build_context bld;
-   struct lp_build_context sbld;
+   struct lp_type z_type;
+   struct lp_build_context z_bld;
+   struct lp_build_context s_bld;
    struct lp_type s_type;
    LLVMValueRef zs_dst, z_dst = NULL;
    LLVMValueRef stencil_vals = NULL;
@@ -483,28 +483,31 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
       /* We know the values in z_dst are all >= 0, so allow
        * lp_build_compare to use signed compare intrinsics:
        */
-      type.floating = 0;
-      type.fixed = 0;
-      type.sign = 1;
-      type.norm = 1;
-      type.width = 32;
-      type.length = z_src_type.length;
+      z_type.floating = 0;
+      z_type.fixed = 0;
+      z_type.sign = 1;
+      z_type.norm = 1;
+      z_type.width = 32;
+      z_type.length = z_src_type.length;
 
       int32_vec_type = LLVMVectorType(LLVMInt32Type(), z_src_type.length);
 
-      const_8_int = lp_build_const_int_vec(type, 8);
+      const_8_int = lp_build_const_int_vec(z_type, 8);
       const_ffffff_float = lp_build_const_vec(z_src_type, (float)0xffffff);
 
       zscaled = LLVMBuildFMul(builder, z_src, const_ffffff_float, "zscaled");
       z_src = LLVMBuildFPToSI(builder, zscaled, int32_vec_type, "z_src");
       
       /* Load current z/stencil value from z/stencil buffer */
+      zs_dst_ptr = LLVMBuildBitCast(builder,
+                                    zs_dst_ptr,
+                                    LLVMPointerType(int32_vec_type, 0), "");
       z_dst = LLVMBuildLoad(builder, zs_dst_ptr, "zsbufval");
       z_dst = LLVMBuildLShr(builder, z_dst, const_8_int, "z_dst");
 
       /* compare src Z to dst Z, returning 'pass' mask */
       z_pass = lp_build_compare(builder,
-                                type,
+                                z_type,
                                 depth->func, z_src, z_dst);
 
       lp_build_mask_update(mask, z_pass);
@@ -517,10 +520,10 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
        * storage.
        */
       if (depth->writemask) {
-         type.sign = 1;
-         lp_build_context_init(&bld, builder, type);
+         z_type.sign = 1;
+         lp_build_context_init(&z_bld, builder, z_type);
 
-         z_dst = lp_build_select(&bld, lp_build_mask_value(mask), z_src, z_dst);
+         z_dst = lp_build_select(&z_bld, lp_build_mask_value(mask), z_src, z_dst);
          z_dst = LLVMBuildShl(builder, z_dst, const_8_int, "z_dst");
          *zs_value = z_dst;
       }
@@ -543,19 +546,14 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
    }
 
    /* Pick the depth type. */
-   type = lp_depth_type(format_desc, z_src_type.width*z_src_type.length);
+   z_type = lp_depth_type(format_desc, z_src_type.width*z_src_type.length);
 
    /* FIXME: Cope with a depth test type with a different bit width. */
-   assert(type.width == z_src_type.width);
-   assert(type.length == z_src_type.length);
+   assert(z_type.width == z_src_type.width);
+   assert(z_type.length == z_src_type.length);
 
    /* Convert fragment Z from float to integer */
-   lp_build_conv(builder, z_src_type, type, &z_src, 1, &z_src, 1);
-
-   zs_dst_ptr = LLVMBuildBitCast(builder,
-                                 zs_dst_ptr,
-                                 LLVMPointerType(lp_build_vec_type(type), 0), "");
-
+   lp_build_conv(builder, z_src_type, z_type, &z_src, 1, &z_src, 1);
 
 
    /* Sanity checking */
@@ -578,8 +576,8 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
       }
 
       assert(z_swizzle < 4);
-      assert(format_desc->block.bits == type.width);
-      if (type.floating) {
+      assert(format_desc->block.bits == z_type.width);
+      if (z_type.floating) {
          assert(z_swizzle == 0);
          assert(format_desc->channel[z_swizzle].type ==
                 UTIL_FORMAT_TYPE_FLOAT);
@@ -590,21 +588,24 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
          assert(format_desc->channel[z_swizzle].type ==
                 UTIL_FORMAT_TYPE_UNSIGNED);
          assert(format_desc->channel[z_swizzle].normalized);
-         assert(!type.fixed);
-         assert(!type.sign);
-         assert(type.norm);
+         assert(!z_type.fixed);
+         assert(!z_type.sign);
+         assert(z_type.norm);
       }
    }
 
 
    /* Setup build context for Z vals */
-   lp_build_context_init(&bld, builder, type);
+   lp_build_context_init(&z_bld, builder, z_type);
 
    /* Setup build context for stencil vals */
-   s_type = lp_type_int_vec(type.width);
-   lp_build_context_init(&sbld, builder, s_type);
+   s_type = lp_type_int_vec(z_type.width);
+   lp_build_context_init(&s_bld, builder, s_type);
 
    /* Load current z/stencil value from z/stencil buffer */
+   zs_dst_ptr = LLVMBuildBitCast(builder,
+                                 zs_dst_ptr,
+                                 LLVMPointerType(z_bld.vec_type, 0), "");
    zs_dst = LLVMBuildLoad(builder, zs_dst_ptr, "");
 
    lp_build_name(zs_dst, "zsbufval");
@@ -618,12 +619,12 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
 
       if (get_z_shift_and_mask(format_desc, &z_shift, &z_mask)) {
          if (z_shift) {
-            LLVMValueRef shift = lp_build_const_int_vec(type, z_shift);
+            LLVMValueRef shift = lp_build_const_int_vec(z_type, z_shift);
             z_src = LLVMBuildLShr(builder, z_src, shift, "");
          }
 
          if (z_mask != 0xffffffff) {
-            LLVMValueRef mask = lp_build_const_int_vec(type, z_mask);
+            LLVMValueRef mask = lp_build_const_int_vec(z_type, z_mask);
             z_src = LLVMBuildAnd(builder, z_src, mask, "");
             z_dst = LLVMBuildAnd(builder, zs_dst, mask, "");
             z_bitmask = mask;  /* used below */
@@ -637,7 +638,7 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
 
       if (get_s_shift_and_mask(format_desc, &s_shift, &s_mask)) {
          if (s_shift) {
-            LLVMValueRef shift = lp_build_const_int_vec(type, s_shift);
+            LLVMValueRef shift = lp_build_const_int_vec(s_type, s_shift);
             stencil_vals = LLVMBuildLShr(builder, zs_dst, shift, "");
             stencil_shift = shift;  /* used below */
          }
@@ -646,7 +647,7 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
          }
 
          if (s_mask != 0xffffffff) {
-            LLVMValueRef mask = lp_build_const_int_vec(type, s_mask);
+            LLVMValueRef mask = lp_build_const_int_vec(s_type, s_mask);
             stencil_vals = LLVMBuildAnd(builder, stencil_vals, mask, "");
          }
 
@@ -662,24 +663,24 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
          /* front_facing = face > 0.0 ? ~0 : 0 */
          front_facing = LLVMBuildFCmp(builder, LLVMRealUGT, face, zero, "");
          front_facing = LLVMBuildSExt(builder, front_facing,
-                                      LLVMIntType(bld.type.length*bld.type.width),
+                                      LLVMIntType(s_bld.type.length*s_bld.type.width),
                                       "");
          front_facing = LLVMBuildBitCast(builder, front_facing,
-                                         bld.int_vec_type, "");
+                                         s_bld.int_vec_type, "");
       }
 
       /* convert scalar stencil refs into vectors */
-      stencil_refs[0] = lp_build_broadcast_scalar(&bld, stencil_refs[0]);
-      stencil_refs[1] = lp_build_broadcast_scalar(&bld, stencil_refs[1]);
+      stencil_refs[0] = lp_build_broadcast_scalar(&s_bld, stencil_refs[0]);
+      stencil_refs[1] = lp_build_broadcast_scalar(&s_bld, stencil_refs[1]);
 
-      s_pass_mask = lp_build_stencil_test(&sbld, stencil,
+      s_pass_mask = lp_build_stencil_test(&s_bld, stencil,
                                           stencil_refs, stencil_vals,
                                           front_facing);
 
       /* apply stencil-fail operator */
       {
-         LLVMValueRef s_fail_mask = lp_build_andnot(&bld, orig_mask, s_pass_mask);
-         stencil_vals = lp_build_stencil_op(&sbld, stencil, S_FAIL_OP,
+         LLVMValueRef s_fail_mask = lp_build_andnot(&s_bld, orig_mask, s_pass_mask);
+         stencil_vals = lp_build_stencil_op(&s_bld, stencil, S_FAIL_OP,
                                             stencil_refs, stencil_vals,
                                             s_fail_mask, front_facing);
       }
@@ -687,7 +688,7 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
 
    if (depth->enabled) {
       /* compare src Z to dst Z, returning 'pass' mask */
-      z_pass = lp_build_cmp(&bld, depth->func, z_src, z_dst);
+      z_pass = lp_build_cmp(&z_bld, depth->func, z_src, z_dst);
 
       if (!stencil[0].enabled) {
          /* We can potentially skip all remaining operations here, but only
@@ -721,7 +722,7 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
          /* Mix the old and new Z buffer values.
           * z_dst[i] = (zselectmask[i] & z_src[i]) | (~zselectmask[i] & z_dst[i])
           */
-         z_dst = lp_build_select_bitwise(&bld, zselectmask, z_src, z_dst);
+         z_dst = lp_build_select_bitwise(&z_bld, zselectmask, z_src, z_dst);
       }
 
       if (stencil[0].enabled) {
@@ -729,14 +730,14 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
          LLVMValueRef z_fail_mask, z_pass_mask;
 
          /* apply Z-fail operator */
-         z_fail_mask = lp_build_andnot(&bld, orig_mask, z_pass);
-         stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_FAIL_OP,
+         z_fail_mask = lp_build_andnot(&z_bld, orig_mask, z_pass);
+         stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_FAIL_OP,
                                             stencil_refs, stencil_vals,
                                             z_fail_mask, front_facing);
 
          /* apply Z-pass operator */
-         z_pass_mask = LLVMBuildAnd(bld.builder, orig_mask, z_pass, "");
-         stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_PASS_OP,
+         z_pass_mask = LLVMBuildAnd(z_bld.builder, orig_mask, z_pass, "");
+         stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
                                             stencil_refs, stencil_vals,
                                             z_pass_mask, front_facing);
       }
@@ -745,8 +746,8 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
       /* No depth test: apply Z-pass operator to stencil buffer values which
        * passed the stencil test.
        */
-      s_pass_mask = LLVMBuildAnd(bld.builder, orig_mask, s_pass_mask, "");
-      stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_PASS_OP,
+      s_pass_mask = LLVMBuildAnd(s_bld.builder, orig_mask, s_pass_mask, "");
+      stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
                                          stencil_refs, stencil_vals,
                                          s_pass_mask, front_facing);
    }
@@ -755,7 +756,7 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
     * stencil bits before ORing Z with Stencil to make the final pixel value.
     */
    if (stencil_vals && stencil_shift)
-      stencil_vals = LLVMBuildShl(bld.builder, stencil_vals,
+      stencil_vals = LLVMBuildShl(s_bld.builder, stencil_vals,
                                   stencil_shift, "");
 
    /* Finally, merge/store the z/stencil values */
@@ -763,7 +764,7 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
        (stencil[0].enabled && stencil[0].writemask)) {
 
       if (z_dst && stencil_vals)
-         zs_dst = LLVMBuildOr(bld.builder, z_dst, stencil_vals, "");
+         zs_dst = LLVMBuildOr(z_bld.builder, z_dst, stencil_vals, "");
       else if (z_dst)
          zs_dst = z_dst;
       else
@@ -784,6 +785,18 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
 }
 
 
+void
+lp_build_depth_write(LLVMBuilderRef builder,
+                     const struct util_format_description *format_desc,
+                     LLVMValueRef zs_dst_ptr,
+                     LLVMValueRef zs_value)
+{
+   zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr,
+                                 LLVMPointerType(LLVMTypeOf(zs_value), 0), "");
+
+   LLVMBuildStore(builder, zs_value, zs_dst_ptr);
+}
+
 
 void
 lp_build_deferred_depth_write(LLVMBuilderRef builder,
@@ -793,17 +806,20 @@ lp_build_deferred_depth_write(LLVMBuilderRef builder,
                               LLVMValueRef zs_dst_ptr,
                               LLVMValueRef zs_value)
 {
-   struct lp_type type;
-   struct lp_build_context bld;
+   struct lp_type z_type;
+   struct lp_build_context z_bld;
    LLVMValueRef z_dst;
 
    /* XXX: pointlessly redo type logic:
     */
-   type = lp_depth_type(format_desc, z_src_type.width*z_src_type.length);
-   lp_build_context_init(&bld, builder, type);
+   z_type = lp_depth_type(format_desc, z_src_type.width*z_src_type.length);
+   lp_build_context_init(&z_bld, builder, z_type);
+
+   zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr,
+                                 LLVMPointerType(z_bld.vec_type, 0), "");
 
    z_dst = LLVMBuildLoad(builder, zs_dst_ptr, "zsbufval");
-   z_dst = lp_build_select(&bld, lp_build_mask_value(mask), zs_value, z_dst);
+   z_dst = lp_build_select(&z_bld, lp_build_mask_value(mask), zs_value, z_dst);
 
    LLVMBuildStore(builder, z_dst, zs_dst_ptr);
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.h b/src/gallium/drivers/llvmpipe/lp_bld_depth.h
index 0f89668123..a54ef3a711 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.h
@@ -64,6 +64,12 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
                             LLVMValueRef *zs_value,
                             boolean do_branch);
 
+void
+lp_build_depth_write(LLVMBuilderRef builder,
+                     const struct util_format_description *format_desc,
+                     LLVMValueRef zs_dst_ptr,
+                     LLVMValueRef zs_value);
+
 void
 lp_build_deferred_depth_write(LLVMBuilderRef builder,
                               struct lp_type z_src_type,
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 6872f2d3c6..c09835635d 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -325,8 +325,9 @@ generate_fs(struct llvmpipe_context *lp,
                                   &zs_value,
                                   !simple_shader);
 
-      if (depth_mode & EARLY_DEPTH_WRITE)
-         LLVMBuildStore(builder, zs_value, depth_ptr);
+      if (depth_mode & EARLY_DEPTH_WRITE) {
+         lp_build_depth_write(builder, zs_format_desc, depth_ptr, zs_value);
+      }
    }
 
    lp_build_interp_soa_update_inputs(interp, i);
@@ -379,8 +380,9 @@ generate_fs(struct llvmpipe_context *lp,
                                   &zs_value,
                                   !simple_shader);
       /* Late Z write */
-      if (depth_mode & LATE_DEPTH_WRITE)
-         LLVMBuildStore(builder, zs_value, depth_ptr);
+      if (depth_mode & LATE_DEPTH_WRITE) {
+         lp_build_depth_write(builder, zs_format_desc, depth_ptr, zs_value);
+      }
    }
    else if ((depth_mode & EARLY_DEPTH_TEST) &&
             (depth_mode & LATE_DEPTH_WRITE))
@@ -534,6 +536,7 @@ generate_fragment(struct llvmpipe_context *lp,
    LLVMValueRef blend_mask;
    LLVMValueRef function;
    LLVMValueRef facing;
+   const struct util_format_description *zs_format_desc;
    unsigned num_fs;
    unsigned i;
    unsigned chan;
@@ -579,7 +582,7 @@ generate_fragment(struct llvmpipe_context *lp,
    arg_types[5] = LLVMPointerType(fs_elem_type, 0);    /* dadx */
    arg_types[6] = LLVMPointerType(fs_elem_type, 0);    /* dady */
    arg_types[7] = LLVMPointerType(LLVMPointerType(blend_vec_type, 0), 0);  /* color */
-   arg_types[8] = LLVMPointerType(fs_int_vec_type, 0); /* depth */
+   arg_types[8] = LLVMPointerType(LLVMInt8Type(), 0);  /* depth */
    arg_types[9] = LLVMInt32Type();                     /* mask_input */
    arg_types[10] = LLVMPointerType(LLVMInt32Type(), 0);/* counter */
 
@@ -648,12 +651,16 @@ generate_fragment(struct llvmpipe_context *lp,
    sampler = lp_llvm_sampler_soa_create(key->sampler, context_ptr);
 
    /* loop over quads in the block */
+   zs_format_desc = util_format_description(key->zsbuf_format);
+
    for(i = 0; i < num_fs; ++i) {
-      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      LLVMValueRef depth_offset = LLVMConstInt(LLVMInt32Type(),
+                                               i*fs_type.length*zs_format_desc->block.bits/8,
+                                               0);
       LLVMValueRef out_color[PIPE_MAX_COLOR_BUFS][NUM_CHANNELS];
       LLVMValueRef depth_ptr_i;
 
-      depth_ptr_i = LLVMBuildGEP(builder, depth_ptr, &index, 1, "");
+      depth_ptr_i = LLVMBuildGEP(builder, depth_ptr, &depth_offset, 1, "");
 
       generate_fs(lp, shader, key,
                   builder,
-- 
cgit v1.2.3


From ae00e34e4b0d3be247b0538b60810176397c7915 Mon Sep 17 00:00:00 2001
From: José Fonseca <jfonseca@vmware.com>
Date: Wed, 13 Oct 2010 20:25:17 +0100
Subject: llvmpipe: Generalize the x8z24 fast path to all depth formats.

Together with the previous commit, this generalizes the benefits of
d2cf757f44f4ee5554243f3279483a25886d9927 to all depth formats, in
particular:
- simpler float -> 24unorm conversion
- avoid unsigned comparisons (not directly supported on SSE) by aligning
to the least significant bit
- avoid unnecessary/repeated mask ANDing

Verified with trivial/tri-z that the exact same assembly is produced for
X8Z24.
---
 src/gallium/drivers/llvmpipe/lp_bld_depth.c | 193 ++++++++++++----------------
 1 file changed, 82 insertions(+), 111 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index 3162f3e1c2..e4cfa97aa3 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -304,8 +304,13 @@ lp_depth_type(const struct util_format_description *format_desc,
    }
    else if(format_desc->channel[swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED) {
       assert(format_desc->block.bits <= 32);
-      if(format_desc->channel[swizzle].normalized)
-         type.norm = TRUE;
+      assert(format_desc->channel[swizzle].normalized);
+      if (format_desc->channel[swizzle].size < format_desc->block.bits) {
+         /* Prefer signed integers when possible, as SSE has less support
+          * for unsigned comparison;
+          */
+         type.sign = TRUE;
+      }
    }
    else
       assert(0);
@@ -325,9 +330,9 @@ lp_depth_type(const struct util_format_description *format_desc,
  * in the Z buffer (typically 0xffffff00 or 0x00ffffff).  That lets us
  * get by with fewer bit twiddling steps.
  */
-static boolean
+static void
 get_z_shift_and_mask(const struct util_format_description *format_desc,
-                     unsigned *shift, unsigned *mask)
+                     unsigned *shift, unsigned *width, unsigned *mask)
 {
    const unsigned total_bits = format_desc->block.bits;
    unsigned z_swizzle;
@@ -340,15 +345,16 @@ get_z_shift_and_mask(const struct util_format_description *format_desc,
 
    z_swizzle = format_desc->swizzle[0];
 
-   if (z_swizzle == UTIL_FORMAT_SWIZZLE_NONE)
-      return FALSE;
+   assert(z_swizzle != UTIL_FORMAT_SWIZZLE_NONE);
+
+   *width = format_desc->channel[z_swizzle].size;
 
    padding_right = 0;
    for (chan = 0; chan < z_swizzle; ++chan)
       padding_right += format_desc->channel[chan].size;
 
    padding_left =
-      total_bits - (padding_right + format_desc->channel[z_swizzle].size);
+      total_bits - (padding_right + *width);
 
    if (padding_left || padding_right) {
       unsigned long long mask_left = (1ULL << (total_bits - padding_left)) - 1;
@@ -359,9 +365,7 @@ get_z_shift_and_mask(const struct util_format_description *format_desc,
       *mask = 0xffffffff;
    }
 
-   *shift = padding_left;
-
-   return TRUE;
+   *shift = padding_right;
 }
 
 
@@ -462,6 +466,7 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
    struct lp_build_context z_bld;
    struct lp_build_context s_bld;
    struct lp_type s_type;
+   unsigned z_shift, z_width, z_mask;
    LLVMValueRef zs_dst, z_dst = NULL;
    LLVMValueRef stencil_vals = NULL;
    LLVMValueRef z_bitmask = NULL, stencil_shift = NULL;
@@ -469,67 +474,6 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
    LLVMValueRef orig_mask = lp_build_mask_value(mask);
    LLVMValueRef front_facing = NULL;
 
-   /* Prototype a simpler path:
-    */
-   if (z_src_type.floating &&
-       format_desc->format == PIPE_FORMAT_X8Z24_UNORM &&
-       depth->enabled) 
-   {
-      LLVMValueRef zscaled;
-      LLVMValueRef const_ffffff_float;
-      LLVMValueRef const_8_int;
-      LLVMTypeRef int32_vec_type;
-
-      /* We know the values in z_dst are all >= 0, so allow
-       * lp_build_compare to use signed compare intrinsics:
-       */
-      z_type.floating = 0;
-      z_type.fixed = 0;
-      z_type.sign = 1;
-      z_type.norm = 1;
-      z_type.width = 32;
-      z_type.length = z_src_type.length;
-
-      int32_vec_type = LLVMVectorType(LLVMInt32Type(), z_src_type.length);
-
-      const_8_int = lp_build_const_int_vec(z_type, 8);
-      const_ffffff_float = lp_build_const_vec(z_src_type, (float)0xffffff);
-
-      zscaled = LLVMBuildFMul(builder, z_src, const_ffffff_float, "zscaled");
-      z_src = LLVMBuildFPToSI(builder, zscaled, int32_vec_type, "z_src");
-      
-      /* Load current z/stencil value from z/stencil buffer */
-      zs_dst_ptr = LLVMBuildBitCast(builder,
-                                    zs_dst_ptr,
-                                    LLVMPointerType(int32_vec_type, 0), "");
-      z_dst = LLVMBuildLoad(builder, zs_dst_ptr, "zsbufval");
-      z_dst = LLVMBuildLShr(builder, z_dst, const_8_int, "z_dst");
-
-      /* compare src Z to dst Z, returning 'pass' mask */
-      z_pass = lp_build_compare(builder,
-                                z_type,
-                                depth->func, z_src, z_dst);
-
-      lp_build_mask_update(mask, z_pass);
-
-      if (do_branch)
-         lp_build_mask_check(mask);
-
-      /* No need to worry about old stencil contents, just blend the
-       * old and new values and shift into the correct position for
-       * storage.
-       */
-      if (depth->writemask) {
-         z_type.sign = 1;
-         lp_build_context_init(&z_bld, builder, z_type);
-
-         z_dst = lp_build_select(&z_bld, lp_build_mask_value(mask), z_src, z_dst);
-         z_dst = LLVMBuildShl(builder, z_dst, const_8_int, "z_dst");
-         *zs_value = z_dst;
-      }
-
-      return;
-   }
 
    /*
     * Depths are expected to be between 0 and 1, even if they are stored in
@@ -552,10 +496,6 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
    assert(z_type.width == z_src_type.width);
    assert(z_type.length == z_src_type.length);
 
-   /* Convert fragment Z from float to integer */
-   lp_build_conv(builder, z_src_type, z_type, &z_src, 1, &z_src, 1);
-
-
    /* Sanity checking */
    {
       const unsigned z_swizzle = format_desc->swizzle[0];
@@ -589,8 +529,6 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
                 UTIL_FORMAT_TYPE_UNSIGNED);
          assert(format_desc->channel[z_swizzle].normalized);
          assert(!z_type.fixed);
-         assert(!z_type.sign);
-         assert(z_type.norm);
       }
    }
 
@@ -608,34 +546,14 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
                                  LLVMPointerType(z_bld.vec_type, 0), "");
    zs_dst = LLVMBuildLoad(builder, zs_dst_ptr, "");
 
-   lp_build_name(zs_dst, "zsbufval");
+   lp_build_name(zs_dst, "zs_dst");
 
 
    /* Compute and apply the Z/stencil bitmasks and shifts.
     */
    {
-      unsigned z_shift, z_mask;
       unsigned s_shift, s_mask;
 
-      if (get_z_shift_and_mask(format_desc, &z_shift, &z_mask)) {
-         if (z_shift) {
-            LLVMValueRef shift = lp_build_const_int_vec(z_type, z_shift);
-            z_src = LLVMBuildLShr(builder, z_src, shift, "");
-         }
-
-         if (z_mask != 0xffffffff) {
-            LLVMValueRef mask = lp_build_const_int_vec(z_type, z_mask);
-            z_src = LLVMBuildAnd(builder, z_src, mask, "");
-            z_dst = LLVMBuildAnd(builder, zs_dst, mask, "");
-            z_bitmask = mask;  /* used below */
-         }
-         else {
-            z_dst = zs_dst;
-         }
-
-         lp_build_name(z_dst, "zsbuf.z");
-      }
-
       if (get_s_shift_and_mask(format_desc, &s_shift, &s_mask)) {
          if (s_shift) {
             LLVMValueRef shift = lp_build_const_int_vec(s_type, s_shift);
@@ -651,7 +569,7 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
             stencil_vals = LLVMBuildAnd(builder, stencil_vals, mask, "");
          }
 
-         lp_build_name(stencil_vals, "stencil");
+         lp_build_name(stencil_vals, "s_dst");
       }
    }
 
@@ -687,6 +605,62 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
    }
 
    if (depth->enabled) {
+      get_z_shift_and_mask(format_desc, &z_shift, &z_width, &z_mask);
+
+      /*
+       * Convert fragment Z to the desired type, aligning the LSB to the right.
+       */
+
+      assert(z_type.width == z_src_type.width);
+      assert(z_type.length == z_src_type.length);
+      assert(lp_check_value(z_src_type, z_src));
+      if (z_src_type.floating) {
+         /*
+          * Convert from floating point values
+          */
+
+         if (!z_type.floating) {
+            z_src = lp_build_clamped_float_to_unsigned_norm(builder,
+                                                            z_src_type,
+                                                            z_width,
+                                                            z_src);
+         }
+      } else {
+         /*
+          * Convert from unsigned normalized values.
+          */
+
+         assert(!z_src_type.sign);
+         assert(!z_src_type.fixed);
+         assert(z_src_type.norm);
+         assert(!z_type.floating);
+         if (z_src_type.width > z_width) {
+            LLVMValueRef shift = lp_build_const_int_vec(z_src_type,
+                                                        z_src_type.width - z_width);
+            z_src = LLVMBuildLShr(builder, z_src, shift, "");
+         }
+      }
+      assert(lp_check_value(z_type, z_src));
+
+      lp_build_name(z_src, "z_src");
+
+      if (z_mask != 0xffffffff) {
+         z_bitmask = lp_build_const_int_vec(z_type, z_mask);
+      }
+
+      /*
+       * Align the framebuffer Z 's LSB to the right.
+       */
+      if (z_shift) {
+         LLVMValueRef shift = lp_build_const_int_vec(z_type, z_shift);
+         z_dst = LLVMBuildLShr(builder, zs_dst, shift, "z_dst");
+      } else if (z_bitmask) {
+         z_dst = LLVMBuildAnd(builder, zs_dst, z_bitmask, "z_dst");
+      } else {
+         z_dst = zs_dst;
+         lp_build_name(z_dst, "z_dst");
+      }
+
       /* compare src Z to dst Z, returning 'pass' mask */
       z_pass = lp_build_cmp(&z_bld, depth->func, z_src, z_dst);
 
@@ -704,25 +678,20 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
       }
 
       if (depth->writemask) {
-         LLVMValueRef zselectmask = lp_build_mask_value(mask);
+         LLVMValueRef zselectmask;
 
          /* mask off bits that failed Z test */
-         zselectmask = LLVMBuildAnd(builder, zselectmask, z_pass, "");
+         zselectmask = LLVMBuildAnd(builder, orig_mask, z_pass, "");
 
          /* mask off bits that failed stencil test */
          if (s_pass_mask) {
             zselectmask = LLVMBuildAnd(builder, zselectmask, s_pass_mask, "");
          }
 
-         /* if combined Z/stencil format, mask off the stencil bits */
-         if (z_bitmask) {
-            zselectmask = LLVMBuildAnd(builder, zselectmask, z_bitmask, "");
-         }
-
          /* Mix the old and new Z buffer values.
-          * z_dst[i] = (zselectmask[i] & z_src[i]) | (~zselectmask[i] & z_dst[i])
+          * z_dst[i] = zselectmask[i] ? z_src[i] : z_dst[i]
           */
-         z_dst = lp_build_select_bitwise(&z_bld, zselectmask, z_src, z_dst);
+         z_dst = lp_build_select(&z_bld, zselectmask, z_src, z_dst);
       }
 
       if (stencil[0].enabled) {
@@ -752,9 +721,11 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
                                          s_pass_mask, front_facing);
    }
 
-   /* The Z bits are already in the right place but we may need to shift the
-    * stencil bits before ORing Z with Stencil to make the final pixel value.
-    */
+   /* Put Z and ztencil bits in the right place */
+   if (z_dst && z_shift) {
+      LLVMValueRef shift = lp_build_const_int_vec(z_type, z_shift);
+      z_dst = LLVMBuildShl(builder, z_dst, shift, "");
+   }
    if (stencil_vals && stencil_shift)
       stencil_vals = LLVMBuildShl(s_bld.builder, stencil_vals,
                                   stencil_shift, "");
-- 
cgit v1.2.3


From f0bd76f28d17da6eabf977a7e619e4ff943a70c7 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Thu, 14 Oct 2010 13:15:28 +0100
Subject: llvmpipe: don't try to emit non-existent color outputs

---
 src/gallium/drivers/llvmpipe/lp_state_fs.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index c09835635d..6e3c27e78e 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -406,14 +406,15 @@ generate_fs(struct llvmpipe_context *lp,
       if (shader->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_COLOR)
       {
          unsigned cbuf = shader->info.base.output_semantic_index[attrib];
-         for(chan = 0; chan < NUM_CHANNELS; ++chan)
-         {
-            /* XXX: just initialize outputs to point at colors[] and
-             * skip this.
-             */
-            LLVMValueRef out = LLVMBuildLoad(builder, outputs[attrib][chan], "");
-            lp_build_name(out, "color%u.%u.%c", i, attrib, "rgba"[chan]);
-            LLVMBuildStore(builder, out, color[cbuf][chan]);
+         for(chan = 0; chan < NUM_CHANNELS; ++chan) {
+            if(outputs[attrib][chan]) {
+               /* XXX: just initialize outputs to point at colors[] and
+                * skip this.
+                */
+               LLVMValueRef out = LLVMBuildLoad(builder, outputs[attrib][chan], "");
+               lp_build_name(out, "color%u.%u.%c", i, attrib, "rgba"[chan]);
+               LLVMBuildStore(builder, out, color[cbuf][chan]);
+            }
          }
       }
    }
-- 
cgit v1.2.3


From 3d7479d70568c84354338d0da0b7bed4d296c169 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 14 Oct 2010 16:31:54 -0600
Subject: llvmpipe: code to dump bytecode to file (disabled)

---
 src/gallium/drivers/llvmpipe/lp_state_fs.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 6e3c27e78e..d2fbe27708 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -99,6 +99,7 @@
 
 
 #include <llvm-c/Analysis.h>
+#include <llvm-c/BitWriter.h>
 
 
 static unsigned fs_no = 0;
@@ -778,6 +779,11 @@ generate_fragment(struct llvmpipe_context *lp,
       debug_printf("\n");
    }
 
+   /* Dump byte code to a file */
+   if (0) {
+      LLVMWriteBitcodeToFile(lp_build_module, "llvmpipe.bc");
+   }
+
    /*
     * Translate the LLVM IR into machine code.
     */
-- 
cgit v1.2.3


From 4195febeecd2d2f5571afdb90cbb185a4759f50a Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Thu, 14 Oct 2010 23:28:10 +0100
Subject: llvmpipe: reintroduce SET_STATE binner command

But bin lazily only into bins which are receiving geometry.
---
 src/gallium/drivers/llvmpipe/lp_rast.c        | 13 +++++--
 src/gallium/drivers/llvmpipe/lp_rast.h        |  6 ++--
 src/gallium/drivers/llvmpipe/lp_rast_debug.c  | 35 +++++++++++-------
 src/gallium/drivers/llvmpipe/lp_rast_priv.h   |  7 +++-
 src/gallium/drivers/llvmpipe/lp_scene.c       |  4 ++-
 src/gallium/drivers/llvmpipe/lp_scene.h       | 28 ++++++++++++++-
 src/gallium/drivers/llvmpipe/lp_setup_line.c  |  1 -
 src/gallium/drivers/llvmpipe/lp_setup_point.c |  1 -
 src/gallium/drivers/llvmpipe/lp_setup_tri.c   | 51 +++++++++++++++------------
 9 files changed, 100 insertions(+), 46 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c
index db9b2f9b12..35e2f731e8 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@ -334,7 +334,7 @@ lp_rast_shade_tile(struct lp_rasterizer_task *task,
 {
    const struct lp_scene *scene = task->scene;
    const struct lp_rast_shader_inputs *inputs = arg.shade_tile;
-   const struct lp_rast_state *state = inputs->state;
+   const struct lp_rast_state *state = task->state;
    struct lp_fragment_shader_variant *variant = state->variant;
    const unsigned tile_x = task->x, tile_y = task->y;
    unsigned x, y;
@@ -414,7 +414,7 @@ lp_rast_shade_quads_mask(struct lp_rasterizer_task *task,
                          unsigned x, unsigned y,
                          unsigned mask)
 {
-   const struct lp_rast_state *state = inputs->state;
+   const struct lp_rast_state *state = task->state;
    struct lp_fragment_shader_variant *variant = state->variant;
    const struct lp_scene *scene = task->scene;
    uint8_t *color[PIPE_MAX_COLOR_BUFS];
@@ -490,6 +490,14 @@ lp_rast_end_query(struct lp_rasterizer_task *task,
 }
 
 
+void
+lp_rast_set_state(struct lp_rasterizer_task *task,
+                  const union lp_rast_cmd_arg arg)
+{
+   task->state = arg.state;
+}
+
+
 
 /**
  * Set top row and left column of the tile's pixels to white.  For debugging.
@@ -602,6 +610,7 @@ static lp_rast_cmd_func dispatch[LP_RAST_OP_MAX] =
    lp_rast_shade_tile_opaque,
    lp_rast_begin_query,
    lp_rast_end_query,
+   lp_rast_set_state,
 };
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
index e2bcc45016..f74b198a66 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -85,8 +85,6 @@ struct lp_rast_shader_inputs {
    float (*a0)[4];
    float (*dadx)[4];
    float (*dady)[4];
-
-   const struct lp_rast_state *state;
 };
 
 /* Note: the order of these values is important as they are loaded by
@@ -154,6 +152,7 @@ union lp_rast_cmd_arg {
       uint32_t value;
       uint32_t mask;
    } clear_zstencil;
+   const struct lp_rast_state *state;
    struct lp_fence *fence;
    struct llvmpipe_query *query_obj;
 };
@@ -245,8 +244,9 @@ lp_rast_arg_null( void )
 #define LP_RAST_OP_SHADE_TILE_OPAQUE 0xe
 #define LP_RAST_OP_BEGIN_QUERY       0xf
 #define LP_RAST_OP_END_QUERY         0x10
+#define LP_RAST_OP_SET_STATE         0x11
 
-#define LP_RAST_OP_MAX               0x11
+#define LP_RAST_OP_MAX               0x12
 #define LP_RAST_OP_MASK              0xff
 
 void
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_debug.c b/src/gallium/drivers/llvmpipe/lp_rast_debug.c
index 6f4ba1c6fe..3113e196c4 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_debug.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_debug.c
@@ -12,6 +12,7 @@ static INLINE int u_bit_scan(unsigned *mask)
 struct tile {
    int coverage;
    int overdraw;
+   const struct lp_rast_state *state;
    char data[TILE_SIZE][TILE_SIZE];
 };
 
@@ -47,6 +48,7 @@ static const char *cmd_names[LP_RAST_OP_MAX] =
    "shade_tile_opaque",
    "begin_query",
    "end_query",
+   "set_state",
 };
 
 static const char *cmd_name(unsigned cmd)
@@ -56,31 +58,31 @@ static const char *cmd_name(unsigned cmd)
 }
 
 static const struct lp_fragment_shader_variant *
-get_variant(  const struct cmd_block *block,
-              int k )
+get_variant( const struct lp_rast_state *state,
+             const struct cmd_block *block,
+             int k )
 {
    if (block->cmd[k] == LP_RAST_OP_SHADE_TILE ||
-       block->cmd[k] == LP_RAST_OP_SHADE_TILE_OPAQUE)
-      return  block->arg[k].shade_tile->state->variant;
-
-   if (block->cmd[k] == LP_RAST_OP_TRIANGLE_1 ||
+       block->cmd[k] == LP_RAST_OP_SHADE_TILE_OPAQUE ||
+       block->cmd[k] == LP_RAST_OP_TRIANGLE_1 ||
        block->cmd[k] == LP_RAST_OP_TRIANGLE_2 ||
        block->cmd[k] == LP_RAST_OP_TRIANGLE_3 ||
        block->cmd[k] == LP_RAST_OP_TRIANGLE_4 ||
        block->cmd[k] == LP_RAST_OP_TRIANGLE_5 ||
        block->cmd[k] == LP_RAST_OP_TRIANGLE_6 ||
        block->cmd[k] == LP_RAST_OP_TRIANGLE_7)
-      return block->arg[k].triangle.tri->inputs.state->variant;
+      return state->variant;
 
    return NULL;
 }
 
 
 static boolean
-is_blend( const struct cmd_block *block,
+is_blend( const struct lp_rast_state *state,
+          const struct cmd_block *block,
           int k )
 {
-   const struct lp_fragment_shader_variant *variant = get_variant(block, k);
+   const struct lp_fragment_shader_variant *variant = get_variant(state, block, k);
 
    if (variant)
       return  variant->key.blend.rt[0].blend_enable;
@@ -93,6 +95,7 @@ is_blend( const struct cmd_block *block,
 static void
 debug_bin( const struct cmd_bin *bin )
 {
+   const struct lp_rast_state *state;
    const struct cmd_block *head = bin->head;
    int i, j = 0;
 
@@ -100,9 +103,12 @@ debug_bin( const struct cmd_bin *bin )
                 
    while (head) {
       for (i = 0; i < head->count; i++, j++) {
+         if (head->cmd[i] == LP_RAST_OP_SET_STATE)
+            state = head->arg[i].state;
+
          debug_printf("%d: %s %s\n", j,
                       cmd_name(head->cmd[i]),
-                      is_blend(head, i) ? "blended" : "");
+                      is_blend(state, head, i) ? "blended" : "");
       }
       head = head->next;
    }
@@ -134,7 +140,7 @@ debug_shade_tile(int x, int y,
                  char val)
 {
    const struct lp_rast_shader_inputs *inputs = arg.shade_tile;
-   boolean blend = inputs->state->variant->key.blend.rt[0].blend_enable;
+   boolean blend = tile->state->variant->key.blend.rt[0].blend_enable;
    unsigned i,j;
 
    if (inputs->disable)
@@ -176,7 +182,7 @@ debug_triangle(int tilex, int tiley,
    int x, y;
    int count = 0;
    unsigned i, nr_planes = 0;
-   boolean blend = tri->inputs.state->variant->key.blend.rt[0].blend_enable;
+   boolean blend = tile->state->variant->key.blend.rt[0].blend_enable;
 
    if (tri->inputs.disable) {
       /* This triangle was partially binned and has been disabled */
@@ -236,12 +242,15 @@ do_debug_bin( struct tile *tile,
 
    for (block = bin->head; block; block = block->next) {
       for (k = 0; k < block->count; k++, j++) {
-         boolean blend = is_blend(block, k);
+         boolean blend = is_blend(tile->state, block, k);
          char val = get_label(j);
          int count = 0;
             
          if (print_cmds)
             debug_printf("%c: %15s", val, cmd_name(block->cmd[k]));
+
+         if (block->cmd[k] == LP_RAST_OP_SET_STATE)
+            tile->state = block->arg[k].state;
          
          if (block->cmd[k] == LP_RAST_OP_CLEAR_COLOR ||
              block->cmd[k] == LP_RAST_OP_CLEAR_ZSTENCIL)
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
index 104000a040..7ffd735def 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_priv.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
@@ -77,6 +77,7 @@ struct cmd_bin;
 struct lp_rasterizer_task
 {
    const struct cmd_bin *bin;
+   const struct lp_rast_state *state;
 
    struct lp_scene *scene;
    unsigned x, y;          /**< Pos of this tile in framebuffer, in pixels */
@@ -244,7 +245,7 @@ lp_rast_shade_quads_all( struct lp_rasterizer_task *task,
                          unsigned x, unsigned y )
 {
    const struct lp_scene *scene = task->scene;
-   const struct lp_rast_state *state = inputs->state;
+   const struct lp_rast_state *state = task->state;
    struct lp_fragment_shader_variant *variant = state->variant;
    uint8_t *color[PIPE_MAX_COLOR_BUFS];
    void *depth;
@@ -297,6 +298,10 @@ void lp_rast_triangle_3_16( struct lp_rasterizer_task *,
 void lp_rast_triangle_4_16( struct lp_rasterizer_task *, 
                             const union lp_rast_cmd_arg );
 
+void
+lp_rast_set_state(struct lp_rasterizer_task *task,
+                  const union lp_rast_cmd_arg arg);
+ 
 void
 lp_debug_bin( const struct cmd_bin *bin );
 
diff --git a/src/gallium/drivers/llvmpipe/lp_scene.c b/src/gallium/drivers/llvmpipe/lp_scene.c
index 8b504f23a3..a4fdf7cff3 100644
--- a/src/gallium/drivers/llvmpipe/lp_scene.c
+++ b/src/gallium/drivers/llvmpipe/lp_scene.c
@@ -203,7 +203,9 @@ lp_scene_end_rasterization(struct lp_scene *scene )
    for (i = 0; i < scene->tiles_x; i++) {
       for (j = 0; j < scene->tiles_y; j++) {
          struct cmd_bin *bin = lp_scene_get_bin(scene, i, j);
-         bin->head = bin->tail = NULL;
+         bin->head = NULL;
+         bin->tail = NULL;
+         bin->last_state = NULL;
       }
    }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_scene.h b/src/gallium/drivers/llvmpipe/lp_scene.h
index dbef7692e4..622c522f11 100644
--- a/src/gallium/drivers/llvmpipe/lp_scene.h
+++ b/src/gallium/drivers/llvmpipe/lp_scene.h
@@ -41,6 +41,7 @@
 #include "lp_debug.h"
 
 struct lp_scene_queue;
+struct lp_rast_state;
 
 /* We're limited to 2K by 2K for 32bit fixed point rasterization.
  * Will need a 64-bit version for larger framebuffers.
@@ -94,6 +95,7 @@ struct data_block {
 struct cmd_bin {
    ushort x;
    ushort y;
+   const struct lp_rast_state *last_state;       /* most recent state set in bin */
    struct cmd_block *head;
    struct cmd_block *tail;
 };
@@ -297,7 +299,7 @@ lp_scene_bin_command( struct lp_scene *scene,
 
    assert(x < scene->tiles_x);
    assert(y < scene->tiles_y);
-   assert(cmd <= LP_RAST_OP_END_QUERY);
+   assert(cmd < LP_RAST_OP_MAX);
 
    if (tail == NULL || tail->count == CMD_BLOCK_MAX) {
       tail = lp_scene_new_cmd_block( scene, bin );
@@ -318,6 +320,30 @@ lp_scene_bin_command( struct lp_scene *scene,
 }
 
 
+static INLINE boolean
+lp_scene_bin_cmd_with_state( struct lp_scene *scene,
+                             unsigned x, unsigned y,
+                             const struct lp_rast_state *state,
+                             unsigned cmd,
+                             union lp_rast_cmd_arg arg )
+{
+   struct cmd_bin *bin = lp_scene_get_bin(scene, x, y);
+
+   if (state != bin->last_state) {
+      bin->last_state = state;
+      if (!lp_scene_bin_command(scene, x, y,
+                                LP_RAST_OP_SET_STATE,
+                                lp_rast_arg_state(state)))
+         return FALSE;
+   }
+
+   if (!lp_scene_bin_command( scene, x, y, cmd, arg ))
+      return FALSE;
+
+   return TRUE;
+}
+
+
 /* Add a command to all active bins.
  */
 static INLINE boolean
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c
index 693ac28175..e4cff9aa42 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_line.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
@@ -597,7 +597,6 @@ try_setup_line( struct lp_setup_context *setup,
    setup_line_coefficients( setup, line, &info); 
 
    line->inputs.facing = 1.0F;
-   line->inputs.state = setup->fs.stored;
    line->inputs.disable = FALSE;
    line->inputs.opaque = FALSE;
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c
index 64b24a88d5..93c3efe347 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_point.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c
@@ -374,7 +374,6 @@ try_setup_point( struct lp_setup_context *setup,
    setup_point_coefficients(setup, point, &info);
 
    point->inputs.facing = 1.0F;
-   point->inputs.state = setup->fs.stored;
    point->inputs.disable = FALSE;
    point->inputs.opaque = FALSE;
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index 8fd034666c..bc48eb8d1b 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -200,14 +200,16 @@ lp_setup_whole_tile(struct lp_setup_context *setup,
       }
 
       LP_COUNT(nr_shade_opaque_64);
-      return lp_scene_bin_command( scene, tx, ty,
-                                   LP_RAST_OP_SHADE_TILE_OPAQUE,
-                                   lp_rast_arg_inputs(inputs) );
+      return lp_scene_bin_cmd_with_state( scene, tx, ty,
+                                          setup->fs.stored,
+                                          LP_RAST_OP_SHADE_TILE_OPAQUE,
+                                          lp_rast_arg_inputs(inputs) );
    } else {
       LP_COUNT(nr_shade_64);
-      return lp_scene_bin_command( scene, tx, ty,
-                                   LP_RAST_OP_SHADE_TILE,
-                                   lp_rast_arg_inputs(inputs) );
+      return lp_scene_bin_cmd_with_state( scene, tx, ty,
+                                          setup->fs.stored, 
+                                          LP_RAST_OP_SHADE_TILE,
+                                          lp_rast_arg_inputs(inputs) );
    }
 }
 
@@ -320,7 +322,6 @@ do_triangle_ccw(struct lp_setup_context *setup,
    tri->inputs.facing = frontfacing ? 1.0F : -1.0F;
    tri->inputs.disable = FALSE;
    tri->inputs.opaque = setup->fs.current.variant->opaque;
-   tri->inputs.state = setup->fs.stored;
 
   
    for (i = 0; i < 3; i++) {
@@ -491,34 +492,36 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
          {
             /* Triangle is contained in a single 4x4 stamp:
              */
-
-            return lp_scene_bin_command( scene, ix0, iy0,
-                                         LP_RAST_OP_TRIANGLE_3_4,
-                                         lp_rast_arg_triangle(tri, mask) );
+            return lp_scene_bin_cmd_with_state( scene, ix0, iy0,
+                                                setup->fs.stored,
+                                                LP_RAST_OP_TRIANGLE_3_4,
+                                                lp_rast_arg_triangle(tri, mask) );
          }
 
          if (sz < 16)
          {
             /* Triangle is contained in a single 16x16 block:
              */
-            return lp_scene_bin_command( scene, ix0, iy0,
-                                         LP_RAST_OP_TRIANGLE_3_16,
-                                         lp_rast_arg_triangle(tri, mask) );
+            return lp_scene_bin_cmd_with_state( scene, ix0, iy0,
+                                                setup->fs.stored,
+                                                LP_RAST_OP_TRIANGLE_3_16,
+                                                lp_rast_arg_triangle(tri, mask) );
          }
       }
       else if (nr_planes == 4 && sz < 16) 
       {
-         return lp_scene_bin_command( scene, ix0, iy0,
-                                      LP_RAST_OP_TRIANGLE_4_16,
-                                      lp_rast_arg_triangle(tri, mask) );
+         return lp_scene_bin_cmd_with_state(scene, ix0, iy0,
+                                            setup->fs.stored,
+                                            LP_RAST_OP_TRIANGLE_4_16,
+                                            lp_rast_arg_triangle(tri, mask) );
       }
 
 
       /* Triangle is contained in a single tile:
        */
-      return lp_scene_bin_command( scene, ix0, iy0,
-                                   lp_rast_tri_tab[nr_planes], 
-                                   lp_rast_arg_triangle(tri, (1<<nr_planes)-1) );
+      return lp_scene_bin_cmd_with_state( scene, ix0, iy0, setup->fs.stored,
+                                          lp_rast_tri_tab[nr_planes], 
+                                          lp_rast_arg_triangle(tri, (1<<nr_planes)-1) );
    }
    else
    {
@@ -584,9 +587,11 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
                 */
                int count = util_bitcount(partial);
                in = TRUE;
-               if (!lp_scene_bin_command( scene, x, y,
-                                          lp_rast_tri_tab[count], 
-                                          lp_rast_arg_triangle(tri, partial) ))
+               
+               if (!lp_scene_bin_cmd_with_state( scene, x, y,
+                                                 setup->fs.stored,
+                                                 lp_rast_tri_tab[count], 
+                                                 lp_rast_arg_triangle(tri, partial) ))
                   goto fail;
 
                LP_COUNT(nr_partially_covered_64);
-- 
cgit v1.2.3


From 0a1c9001037a13b69b157994e7983aa3dee158d3 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Fri, 15 Oct 2010 00:12:19 +0100
Subject: llvmpipe: don't pass frontfacing as a float

---
 src/gallium/drivers/llvmpipe/lp_bld_depth.c   | 8 ++++----
 src/gallium/drivers/llvmpipe/lp_jit.h         | 2 +-
 src/gallium/drivers/llvmpipe/lp_rast.c        | 4 ++--
 src/gallium/drivers/llvmpipe/lp_rast.h        | 2 +-
 src/gallium/drivers/llvmpipe/lp_rast_priv.h   | 2 +-
 src/gallium/drivers/llvmpipe/lp_setup_line.c  | 2 +-
 src/gallium/drivers/llvmpipe/lp_setup_point.c | 2 +-
 src/gallium/drivers/llvmpipe/lp_setup_tri.c   | 2 +-
 src/gallium/drivers/llvmpipe/lp_state_fs.c    | 2 +-
 9 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index e4cfa97aa3..ddf7da0b14 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -446,7 +446,7 @@ lp_build_occlusion_count(LLVMBuilderRef builder,
  * \param stencil_refs  the front/back stencil ref values (scalar)
  * \param z_src  the incoming depth/stencil values (a 2x2 quad, float32)
  * \param zs_dst_ptr  pointer to depth/stencil values in framebuffer
- * \param facing  contains float value indicating front/back facing polygon
+ * \param facing  contains boolean value indicating front/back facing polygon
  */
 void
 lp_build_depth_stencil_test(LLVMBuilderRef builder,
@@ -576,10 +576,10 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
    if (stencil[0].enabled) {
 
       if (face) {
-         LLVMValueRef zero = LLVMConstReal(LLVMFloatType(), 0.0);
+         LLVMValueRef zero = LLVMConstInt(LLVMInt32Type(), 0, 0);
 
-         /* front_facing = face > 0.0 ? ~0 : 0 */
-         front_facing = LLVMBuildFCmp(builder, LLVMRealUGT, face, zero, "");
+         /* front_facing = face != 0 ? ~0 : 0 */
+         front_facing = LLVMBuildICmp(builder, LLVMIntNE, face, zero, "");
          front_facing = LLVMBuildSExt(builder, front_facing,
                                       LLVMIntType(s_bld.type.length*s_bld.type.width),
                                       "");
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h
index 16e04fce0c..114f21f2d1 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.h
+++ b/src/gallium/drivers/llvmpipe/lp_jit.h
@@ -144,7 +144,7 @@ typedef void
 (*lp_jit_frag_func)(const struct lp_jit_context *context,
                     uint32_t x,
                     uint32_t y,
-                    float facing,
+                    uint32_t facing,
                     const void *a0,
                     const void *dadx,
                     const void *dady,
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c
index 35e2f731e8..8e9be755e0 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@ -365,7 +365,7 @@ lp_rast_shade_tile(struct lp_rasterizer_task *task,
          BEGIN_JIT_CALL(state);
          variant->jit_function[RAST_WHOLE]( &state->jit_context,
                                             tile_x + x, tile_y + y,
-                                            inputs->facing,
+                                            inputs->frontfacing,
                                             inputs->a0,
                                             inputs->dadx,
                                             inputs->dady,
@@ -446,7 +446,7 @@ lp_rast_shade_quads_mask(struct lp_rasterizer_task *task,
    BEGIN_JIT_CALL(state);
    variant->jit_function[RAST_EDGE_TEST](&state->jit_context,
                                          x, y,
-                                         inputs->facing,
+                                         inputs->frontfacing,
                                          inputs->a0,
                                          inputs->dadx,
                                          inputs->dady,
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
index f74b198a66..c5fb15484c 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -78,7 +78,7 @@ struct lp_rast_state {
  * These pointers point into the bin data buffer.
  */
 struct lp_rast_shader_inputs {
-   float facing;     /** Positive for front-facing, negative for back-facing */
+   unsigned frontfacing;     /** One for front-facing */
    unsigned disable:1;  /** Partially binned, disable this command */
    unsigned opaque:1;   /** Is opaque */
 
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
index 7ffd735def..e5d04c65b0 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_priv.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
@@ -261,7 +261,7 @@ lp_rast_shade_quads_all( struct lp_rasterizer_task *task,
    BEGIN_JIT_CALL(state);
    variant->jit_function[RAST_WHOLE]( &state->jit_context,
                                       x, y,
-                                      inputs->facing,
+                                      inputs->frontfacing,
                                       inputs->a0,
                                       inputs->dadx,
                                       inputs->dady,
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c
index e4cff9aa42..efc48eecfe 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_line.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
@@ -596,7 +596,7 @@ try_setup_line( struct lp_setup_context *setup,
     */
    setup_line_coefficients( setup, line, &info); 
 
-   line->inputs.facing = 1.0F;
+   line->inputs.frontfacing = TRUE;
    line->inputs.disable = FALSE;
    line->inputs.opaque = FALSE;
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c
index 93c3efe347..108c831e66 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_point.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c
@@ -373,7 +373,7 @@ try_setup_point( struct lp_setup_context *setup,
     */
    setup_point_coefficients(setup, point, &info);
 
-   point->inputs.facing = 1.0F;
+   point->inputs.frontfacing = TRUE;
    point->inputs.disable = FALSE;
    point->inputs.opaque = FALSE;
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index bc48eb8d1b..3bf0b2d252 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -319,7 +319,7 @@ do_triangle_ccw(struct lp_setup_context *setup,
     */
    lp_setup_tri_coef( setup, &tri->inputs, v0, v1, v2, frontfacing );
 
-   tri->inputs.facing = frontfacing ? 1.0F : -1.0F;
+   tri->inputs.frontfacing = frontfacing;
    tri->inputs.disable = FALSE;
    tri->inputs.opaque = setup->fs.current.variant->opaque;
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index d2fbe27708..8df807cec8 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -579,7 +579,7 @@ generate_fragment(struct llvmpipe_context *lp,
    arg_types[0] = screen->context_ptr_type;            /* context */
    arg_types[1] = LLVMInt32Type();                     /* x */
    arg_types[2] = LLVMInt32Type();                     /* y */
-   arg_types[3] = LLVMFloatType();                     /* facing */
+   arg_types[3] = LLVMInt32Type();                     /* facing */
    arg_types[4] = LLVMPointerType(fs_elem_type, 0);    /* a0 */
    arg_types[5] = LLVMPointerType(fs_elem_type, 0);    /* dadx */
    arg_types[6] = LLVMPointerType(fs_elem_type, 0);    /* dady */
-- 
cgit v1.2.3


From 9bf8a55c4b29d55320fc2e7875ecf0e9ca164ee8 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Fri, 15 Oct 2010 12:23:22 +0100
Subject: llvmpipe: slightly shrink the size of a binned triangle

---
 src/gallium/drivers/llvmpipe/lp_rast.c             |  12 +-
 src/gallium/drivers/llvmpipe/lp_rast.h             |  30 ++--
 src/gallium/drivers/llvmpipe/lp_rast_debug.c       |   3 +-
 src/gallium/drivers/llvmpipe/lp_rast_priv.h        |   6 +-
 src/gallium/drivers/llvmpipe/lp_rast_tri.c         |   4 +-
 src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h     |   7 +-
 src/gallium/drivers/llvmpipe/lp_setup_coef.c       |  67 ++++-----
 src/gallium/drivers/llvmpipe/lp_setup_coef.h       |   4 +
 .../drivers/llvmpipe/lp_setup_coef_intrin.c        |  52 ++++---
 src/gallium/drivers/llvmpipe/lp_setup_line.c       | 154 +++++++++++----------
 src/gallium/drivers/llvmpipe/lp_setup_point.c      | 141 ++++++++++---------
 src/gallium/drivers/llvmpipe/lp_setup_tri.c        | 128 ++++++++---------
 12 files changed, 316 insertions(+), 292 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c
index 8e9be755e0..d358a98394 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@ -366,9 +366,9 @@ lp_rast_shade_tile(struct lp_rasterizer_task *task,
          variant->jit_function[RAST_WHOLE]( &state->jit_context,
                                             tile_x + x, tile_y + y,
                                             inputs->frontfacing,
-                                            inputs->a0,
-                                            inputs->dadx,
-                                            inputs->dady,
+                                            GET_A0(inputs),
+                                            GET_DADX(inputs),
+                                            GET_DADY(inputs),
                                             color,
                                             depth,
                                             0xffff,
@@ -447,9 +447,9 @@ lp_rast_shade_quads_mask(struct lp_rasterizer_task *task,
    variant->jit_function[RAST_EDGE_TEST](&state->jit_context,
                                          x, y,
                                          inputs->frontfacing,
-                                         inputs->a0,
-                                         inputs->dadx,
-                                         inputs->dady,
+                                         GET_A0(inputs),
+                                         GET_DADX(inputs),
+                                         GET_DADY(inputs),
                                          color,
                                          depth,
                                          mask,
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
index c5fb15484c..8d8b6210ec 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -78,13 +78,14 @@ struct lp_rast_state {
  * These pointers point into the bin data buffer.
  */
 struct lp_rast_shader_inputs {
-   unsigned frontfacing;     /** One for front-facing */
-   unsigned disable:1;  /** Partially binned, disable this command */
-   unsigned opaque:1;   /** Is opaque */
-
-   float (*a0)[4];
-   float (*dadx)[4];
-   float (*dady)[4];
+   unsigned frontfacing:1;      /** True for front-facing */
+   unsigned disable:1;          /** Partially binned, disable this command */
+   unsigned opaque:1;           /** Is opaque */
+   unsigned pad0:29;            /* wasted space */
+   unsigned stride;             /* how much to advance data between a0, dadx, dady */
+   unsigned pad2;               /* wasted space */
+   unsigned pad3;               /* wasted space */
+   /* followed by a0, dadx, dady and planes[] */
 };
 
 /* Note: the order of these values is important as they are loaded by
@@ -111,17 +112,24 @@ struct lp_rast_plane {
  * Objects of this type are put into the lp_setup_context::data buffer.
  */
 struct lp_rast_triangle {
-   /* inputs for the shader */
-   struct lp_rast_shader_inputs inputs;
-
 #ifdef DEBUG
    float v[3][2];
+   float pad0;
+   float pad1;
 #endif
 
-   struct lp_rast_plane plane[8]; /* NOTE: may allocate fewer planes */
+   /* inputs for the shader */
+   struct lp_rast_shader_inputs inputs;
+   /* planes are also allocated here */
 };
 
 
+#define GET_A0(inputs) ((float (*)[4])((inputs)+1))
+#define GET_DADX(inputs) ((float (*)[4])((char *)((inputs) + 1) + (inputs)->stride))
+#define GET_DADY(inputs) ((float (*)[4])((char *)((inputs) + 1) + 2 * (inputs)->stride))
+#define GET_PLANES(tri) ((struct lp_rast_plane *)((char *)(&(tri)->inputs + 1) + 3 * (tri)->inputs.stride))
+
+
 
 struct lp_rasterizer *
 lp_rast_create( unsigned num_threads );
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_debug.c b/src/gallium/drivers/llvmpipe/lp_rast_debug.c
index 3113e196c4..e2783aa568 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_debug.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_debug.c
@@ -178,6 +178,7 @@ debug_triangle(int tilex, int tiley,
 {
    const struct lp_rast_triangle *tri = arg.triangle.tri;
    unsigned plane_mask = arg.triangle.plane_mask;
+   const struct lp_rast_plane *tri_plane = GET_PLANES(tri);
    struct lp_rast_plane plane[8];
    int x, y;
    int count = 0;
@@ -190,7 +191,7 @@ debug_triangle(int tilex, int tiley,
    }
 
    while (plane_mask) {
-      plane[nr_planes] = tri->plane[u_bit_scan(&plane_mask)];
+      plane[nr_planes] = tri_plane[u_bit_scan(&plane_mask)];
       plane[nr_planes].c = (plane[nr_planes].c +
                             plane[nr_planes].dcdy * tiley -
                             plane[nr_planes].dcdx * tilex);
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
index e5d04c65b0..b30408f097 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_priv.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
@@ -262,9 +262,9 @@ lp_rast_shade_quads_all( struct lp_rasterizer_task *task,
    variant->jit_function[RAST_WHOLE]( &state->jit_context,
                                       x, y,
                                       inputs->frontfacing,
-                                      inputs->a0,
-                                      inputs->dadx,
-                                      inputs->dady,
+                                      GET_A0(inputs),
+                                      GET_DADX(inputs),
+                                      GET_DADY(inputs),
                                       color,
                                       depth,
                                       0xffff,
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index bae772b9c5..5bdf19712f 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -311,7 +311,7 @@ lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
                       const union lp_rast_cmd_arg arg)
 {
    const struct lp_rast_triangle *tri = arg.triangle.tri;
-   const struct lp_rast_plane *plane = tri->plane;
+   const struct lp_rast_plane *plane = GET_PLANES(tri);
    int x = (arg.triangle.plane_mask & 0xff) + task->x;
    int y = (arg.triangle.plane_mask >> 8) + task->y;
    unsigned i, j;
@@ -421,7 +421,7 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
 {
    const struct lp_rast_triangle *tri = arg.triangle.tri;
-   const struct lp_rast_plane *plane = tri->plane;
+   const struct lp_rast_plane *plane = GET_PLANES(tri);
    int x = (arg.triangle.plane_mask & 0xff) + task->x;
    int y = (arg.triangle.plane_mask >> 8) + task->y;
 
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
index 2f03229512..9976996719 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
@@ -156,6 +156,7 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
 {
    const struct lp_rast_triangle *tri = arg.triangle.tri;
    unsigned plane_mask = arg.triangle.plane_mask;
+   const struct lp_rast_plane *tri_plane = GET_PLANES(tri);
    const int x = task->x, y = task->y;
    struct lp_rast_plane plane[NR_PLANES];
    int c[NR_PLANES];
@@ -172,7 +173,7 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
 
    while (plane_mask) {
       int i = ffs(plane_mask) - 1;
-      plane[j] = tri->plane[i];
+      plane[j] = tri_plane[i];
       plane_mask &= ~(1 << i);
       c[j] = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x;
 
@@ -255,7 +256,7 @@ TRI_16(struct lp_rasterizer_task *task,
        const union lp_rast_cmd_arg arg)
 {
    const struct lp_rast_triangle *tri = arg.triangle.tri;
-   const struct lp_rast_plane *plane = tri->plane;
+   const struct lp_rast_plane *plane = GET_PLANES(tri);
    unsigned mask = arg.triangle.plane_mask;
    unsigned outmask, partial_mask;
    unsigned j;
@@ -328,7 +329,7 @@ TRI_4(struct lp_rasterizer_task *task,
       const union lp_rast_cmd_arg arg)
 {
    const struct lp_rast_triangle *tri = arg.triangle.tri;
-   const struct lp_rast_plane *plane = tri->plane;
+   const struct lp_rast_plane *plane = GET_PLANES(tri);
    unsigned mask = arg.triangle.plane_mask;
    const int x = task->x + (mask & 0xff);
    const int y = task->y + (mask >> 8);
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_coef.c b/src/gallium/drivers/llvmpipe/lp_setup_coef.c
index 8dc2688ddb..a835df6af2 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_coef.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_coef.c
@@ -42,20 +42,19 @@
 /**
  * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
  */
-static void constant_coef( struct lp_rast_shader_inputs *inputs,
+static void constant_coef( struct lp_tri_info *info,
                            unsigned slot,
 			   const float value,
                            unsigned i )
 {
-   inputs->a0[slot][i] = value;
-   inputs->dadx[slot][i] = 0.0f;
-   inputs->dady[slot][i] = 0.0f;
+   info->a0[slot][i] = value;
+   info->dadx[slot][i] = 0.0f;
+   info->dady[slot][i] = 0.0f;
 }
 
 
-static void linear_coef( struct lp_rast_shader_inputs *inputs,
-                         const struct lp_tri_info *info,
+static void linear_coef( struct lp_tri_info *info,
                          unsigned slot,
                          unsigned vert_attr,
                          unsigned i)
@@ -69,8 +68,8 @@ static void linear_coef( struct lp_rast_shader_inputs *inputs,
    float dadx = (da01 * info->dy20_ooa - info->dy01_ooa * da20);
    float dady = (da20 * info->dx01_ooa - info->dx20_ooa * da01);
 
-   inputs->dadx[slot][i] = dadx;
-   inputs->dady[slot][i] = dady;
+   info->dadx[slot][i] = dadx;
+   info->dady[slot][i] = dady;
 
    /* calculate a0 as the value which would be sampled for the
     * fragment at (0,0), taking into account that we want to sample at
@@ -84,7 +83,7 @@ static void linear_coef( struct lp_rast_shader_inputs *inputs,
     * to define a0 as the sample at a pixel center somewhere near vmin
     * instead - i'll switch to this later.
     */
-   inputs->a0[slot][i] = a0 - (dadx * info->x0_center +
+   info->a0[slot][i] = a0 - (dadx * info->x0_center +
 				   dady * info->y0_center);
 }
 
@@ -97,8 +96,7 @@ static void linear_coef( struct lp_rast_shader_inputs *inputs,
  * Later, when we compute the value at a particular fragment position we'll
  * divide the interpolated value by the interpolated W at that fragment.
  */
-static void perspective_coef( struct lp_rast_shader_inputs *inputs,
-                              const struct lp_tri_info *info,
+static void perspective_coef( struct lp_tri_info *info,
                               unsigned slot,
 			      unsigned vert_attr,
                               unsigned i)
@@ -113,9 +111,9 @@ static void perspective_coef( struct lp_rast_shader_inputs *inputs,
    float dadx = da01 * info->dy20_ooa - info->dy01_ooa * da20;
    float dady = da20 * info->dx01_ooa - info->dx20_ooa * da01;
 
-   inputs->dadx[slot][i] = dadx;
-   inputs->dady[slot][i] = dady;
-   inputs->a0[slot][i] = a0 - (dadx * info->x0_center +
+   info->dadx[slot][i] = dadx;
+   info->dady[slot][i] = dady;
+   info->a0[slot][i] = a0 - (dadx * info->x0_center +
 				   dady * info->y0_center);
 }
 
@@ -127,23 +125,22 @@ static void perspective_coef( struct lp_rast_shader_inputs *inputs,
  * We could do a bit less work if we'd examine gl_FragCoord's swizzle mask.
  */
 static void
-setup_fragcoord_coef(struct lp_rast_shader_inputs *inputs,
-                     const struct lp_tri_info *info,
+setup_fragcoord_coef(struct lp_tri_info *info,
                      unsigned slot,
                      unsigned usage_mask)
 {
    /*X*/
    if (usage_mask & TGSI_WRITEMASK_X) {
-      inputs->a0[slot][0] = 0.0;
-      inputs->dadx[slot][0] = 1.0;
-      inputs->dady[slot][0] = 0.0;
+      info->a0[slot][0] = 0.0;
+      info->dadx[slot][0] = 1.0;
+      info->dady[slot][0] = 0.0;
    }
 
    /*Y*/
    if (usage_mask & TGSI_WRITEMASK_Y) {
-      inputs->a0[slot][1] = 0.0;
-      inputs->dadx[slot][1] = 0.0;
-      inputs->dady[slot][1] = 1.0;
+      info->a0[slot][1] = 0.0;
+      info->dadx[slot][1] = 0.0;
+      info->dady[slot][1] = 1.0;
    }
 
    /*Z*/
@@ -162,23 +159,23 @@ setup_fragcoord_coef(struct lp_rast_shader_inputs *inputs,
  * Setup the fragment input attribute with the front-facing value.
  * \param frontface  is the triangle front facing?
  */
-static void setup_facing_coef( struct lp_rast_shader_inputs *inputs,
+static void setup_facing_coef( struct lp_tri_info *info,
                                unsigned slot,
                                boolean frontface,
                                unsigned usage_mask)
 {
    /* convert TRUE to 1.0 and FALSE to -1.0 */
    if (usage_mask & TGSI_WRITEMASK_X)
-      constant_coef( inputs, slot, 2.0f * frontface - 1.0f, 0 );
+      constant_coef( info, slot, 2.0f * frontface - 1.0f, 0 );
 
    if (usage_mask & TGSI_WRITEMASK_Y)
-      constant_coef( inputs, slot, 0.0f, 1 ); /* wasted */
+      constant_coef( info, slot, 0.0f, 1 ); /* wasted */
 
    if (usage_mask & TGSI_WRITEMASK_Z)
-      constant_coef( inputs, slot, 0.0f, 2 ); /* wasted */
+      constant_coef( info, slot, 0.0f, 2 ); /* wasted */
 
    if (usage_mask & TGSI_WRITEMASK_W)
-      constant_coef( inputs, slot, 0.0f, 3 ); /* wasted */
+      constant_coef( info, slot, 0.0f, 3 ); /* wasted */
 }
 
 
@@ -212,6 +209,10 @@ void lp_setup_tri_coef( struct lp_setup_context *setup,
    info.dx20_ooa  = dx20 * oneoverarea;
    info.dy01_ooa  = dy01 * oneoverarea;
    info.dy20_ooa  = dy20 * oneoverarea;
+   info.a0 = GET_A0(inputs);
+   info.dadx = GET_DADX(inputs);
+   info.dady = GET_DADY(inputs);
+      
 
 
    /* setup interpolation for all the remaining attributes:
@@ -225,25 +226,25 @@ void lp_setup_tri_coef( struct lp_setup_context *setup,
          if (setup->flatshade_first) {
             for (i = 0; i < NUM_CHANNELS; i++)
                if (usage_mask & (1 << i))
-                  constant_coef(inputs, slot+1, info.v0[vert_attr][i], i);
+                  constant_coef(&info, slot+1, info.v0[vert_attr][i], i);
          }
          else {
             for (i = 0; i < NUM_CHANNELS; i++)
                if (usage_mask & (1 << i))
-                  constant_coef(inputs, slot+1, info.v2[vert_attr][i], i);
+                  constant_coef(&info, slot+1, info.v2[vert_attr][i], i);
          }
          break;
 
       case LP_INTERP_LINEAR:
          for (i = 0; i < NUM_CHANNELS; i++)
             if (usage_mask & (1 << i))
-               linear_coef(inputs, &info, slot+1, vert_attr, i);
+               linear_coef(&info, slot+1, vert_attr, i);
          break;
 
       case LP_INTERP_PERSPECTIVE:
          for (i = 0; i < NUM_CHANNELS; i++)
             if (usage_mask & (1 << i))
-               perspective_coef(inputs, &info, slot+1, vert_attr, i);
+               perspective_coef(&info, slot+1, vert_attr, i);
          fragcoord_usage_mask |= TGSI_WRITEMASK_W;
          break;
 
@@ -257,7 +258,7 @@ void lp_setup_tri_coef( struct lp_setup_context *setup,
          break;
 
       case LP_INTERP_FACING:
-         setup_facing_coef(inputs, slot+1, info.frontfacing, usage_mask);
+         setup_facing_coef(&info, slot+1, info.frontfacing, usage_mask);
          break;
 
       default:
@@ -267,7 +268,7 @@ void lp_setup_tri_coef( struct lp_setup_context *setup,
 
    /* The internal position input is in slot zero:
     */
-   setup_fragcoord_coef(inputs, &info, 0, fragcoord_usage_mask);
+   setup_fragcoord_coef(&info, 0, fragcoord_usage_mask);
 }
 
 #else
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_coef.h b/src/gallium/drivers/llvmpipe/lp_setup_coef.h
index 87a3255ccc..7b5b78edd5 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_coef.h
+++ b/src/gallium/drivers/llvmpipe/lp_setup_coef.h
@@ -52,6 +52,10 @@ struct lp_tri_info {
    const float (*v2)[4];
 
    boolean frontfacing;		/* remove eventually */
+
+   float (*a0)[4];
+   float (*dadx)[4];
+   float (*dady)[4];
 };
 
 void lp_setup_tri_coef( struct lp_setup_context *setup,
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_coef_intrin.c b/src/gallium/drivers/llvmpipe/lp_setup_coef_intrin.c
index 3742fd672b..29714e2768 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_coef_intrin.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_coef_intrin.c
@@ -40,14 +40,13 @@
 #include <emmintrin.h>
 
 
-static void constant_coef4( struct lp_rast_shader_inputs *inputs,
-			    const struct lp_tri_info *info,
+static void constant_coef4( struct lp_tri_info *info,
 			    unsigned slot,
 			    const float *attr)
 {
-   *(__m128 *)inputs->a0[slot]   = *(__m128 *)attr;
-   *(__m128 *)inputs->dadx[slot] = _mm_set1_ps(0.0);
-   *(__m128 *)inputs->dady[slot] = _mm_set1_ps(0.0);
+   *(__m128 *)info->a0[slot]   = *(__m128 *)attr;
+   *(__m128 *)info->dadx[slot] = _mm_set1_ps(0.0);
+   *(__m128 *)info->dady[slot] = _mm_set1_ps(0.0);
 }
 
 
@@ -56,8 +55,7 @@ static void constant_coef4( struct lp_rast_shader_inputs *inputs,
  * Setup the fragment input attribute with the front-facing value.
  * \param frontface  is the triangle front facing?
  */
-static void setup_facing_coef( struct lp_rast_shader_inputs *inputs,
-			       const struct lp_tri_info *info,
+static void setup_facing_coef( struct lp_tri_info *info,
 			       unsigned slot )
 {
    /* XXX: just pass frontface directly to the shader, don't bother
@@ -66,15 +64,14 @@ static void setup_facing_coef( struct lp_rast_shader_inputs *inputs,
    __m128 a0 = _mm_setr_ps(info->frontfacing ? 1.0 : -1.0,
 			   0, 0, 0);
 
-   *(__m128 *)inputs->a0[slot]   = a0;
-   *(__m128 *)inputs->dadx[slot] = _mm_set1_ps(0.0);
-   *(__m128 *)inputs->dady[slot] = _mm_set1_ps(0.0);
+   *(__m128 *)info->a0[slot]   = a0;
+   *(__m128 *)info->dadx[slot] = _mm_set1_ps(0.0);
+   *(__m128 *)info->dady[slot] = _mm_set1_ps(0.0);
 }
 
 
-static void calc_coef4( struct lp_rast_shader_inputs *inputs,
-			const struct lp_tri_info *info,
+static void calc_coef4(	const struct lp_tri_info *info,
 			unsigned slot,
 			__m128 a0,
 			__m128 a1,
@@ -96,14 +93,13 @@ static void calc_coef4( struct lp_rast_shader_inputs *inputs,
    __m128 attr_v0       = _mm_add_ps(dadx_x0, dady_y0);
    __m128 attr_0        = _mm_sub_ps(a0, attr_v0);
 
-   *(__m128 *)inputs->a0[slot]   = attr_0;
-   *(__m128 *)inputs->dadx[slot] = dadx;
-   *(__m128 *)inputs->dady[slot] = dady;
+   *(__m128 *)info->a0[slot]   = attr_0;
+   *(__m128 *)info->dadx[slot] = dadx;
+   *(__m128 *)info->dady[slot] = dady;
 }
 
 
-static void linear_coef( struct lp_rast_shader_inputs *inputs,
-                         const struct lp_tri_info *info,
+static void linear_coef( struct lp_tri_info *info,
                          unsigned slot,
                          unsigned vert_attr)
 {
@@ -111,7 +107,7 @@ static void linear_coef( struct lp_rast_shader_inputs *inputs,
    __m128 a1 = *(const __m128 *)info->v1[vert_attr];
    __m128 a2 = *(const __m128 *)info->v2[vert_attr];
 
-   calc_coef4(inputs, info, slot, a0, a1, a2);
+   calc_coef4(info, slot, a0, a1, a2);
 }
 
 
@@ -124,8 +120,7 @@ static void linear_coef( struct lp_rast_shader_inputs *inputs,
  * Later, when we compute the value at a particular fragment position we'll
  * divide the interpolated value by the interpolated W at that fragment.
  */
-static void perspective_coef( struct lp_rast_shader_inputs *inputs,
-                              const struct lp_tri_info *info,
+static void perspective_coef( const struct lp_tri_info *info,
                               unsigned slot,
 			      unsigned vert_attr)
 {
@@ -139,7 +134,7 @@ static void perspective_coef( struct lp_rast_shader_inputs *inputs,
    __m128 a1_oow = _mm_mul_ps(a1, _mm_set1_ps(info->v1[0][3]));
    __m128 a2_oow = _mm_mul_ps(a2, _mm_set1_ps(info->v2[0][3]));
 
-   calc_coef4(inputs, info, slot, a0_oow, a1_oow, a2_oow);
+   calc_coef4(info, slot, a0_oow, a1_oow, a2_oow);
 }
 
 
@@ -174,11 +169,14 @@ void lp_setup_tri_coef( struct lp_setup_context *setup,
    info.dx20_ooa  = dx20 * oneoverarea;
    info.dy01_ooa  = dy01 * oneoverarea;
    info.dy20_ooa  = dy20 * oneoverarea;
+   info.a0 = GET_A0(inputs);
+   info.dadx = GET_DADX(inputs);
+   info.dady = GET_DADY(inputs);
 
 
    /* The internal position input is in slot zero:
     */
-   linear_coef(inputs, &info, 0, 0);
+   linear_coef(&info, 0, 0);
 
    /* setup interpolation for all the remaining attributes:
     */
@@ -188,19 +186,19 @@ void lp_setup_tri_coef( struct lp_setup_context *setup,
       switch (setup->fs.input[slot].interp) {
       case LP_INTERP_CONSTANT:
          if (setup->flatshade_first) {
-	    constant_coef4(inputs, &info, slot+1, info.v0[vert_attr]);
+	    constant_coef4(&info, slot+1, info.v0[vert_attr]);
          }
          else {
-	    constant_coef4(inputs, &info, slot+1, info.v2[vert_attr]);
+	    constant_coef4(&info, slot+1, info.v2[vert_attr]);
          }
          break;
 
       case LP_INTERP_LINEAR:
-	 linear_coef(inputs, &info, slot+1, vert_attr);
+	 linear_coef(&info, slot+1, vert_attr);
          break;
 
       case LP_INTERP_PERSPECTIVE:
-	 perspective_coef(inputs, &info, slot+1, vert_attr);
+	 perspective_coef(&info, slot+1, vert_attr);
          break;
 
       case LP_INTERP_POSITION:
@@ -211,7 +209,7 @@ void lp_setup_tri_coef( struct lp_setup_context *setup,
          break;
 
       case LP_INTERP_FACING:
-         setup_facing_coef(inputs, &info, slot+1);
+         setup_facing_coef(&info, slot+1);
          break;
 
       default:
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c
index efc48eecfe..2fd9f2e2f2 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_line.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
@@ -46,6 +46,10 @@ struct lp_line_info {
 
    const float (*v1)[4];
    const float (*v2)[4];
+
+   float (*a0)[4];
+   float (*dadx)[4];
+   float (*dady)[4];
 };
 
 
@@ -53,14 +57,14 @@ struct lp_line_info {
  * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
  */
 static void constant_coef( struct lp_setup_context *setup,
-                           struct lp_rast_triangle *tri,
+                           struct lp_line_info *info,
                            unsigned slot,
                            const float value,
                            unsigned i )
 {
-   tri->inputs.a0[slot][i] = value;
-   tri->inputs.dadx[slot][i] = 0.0f;
-   tri->inputs.dady[slot][i] = 0.0f;
+   info->a0[slot][i] = value;
+   info->dadx[slot][i] = 0.0f;
+   info->dady[slot][i] = 0.0f;
 }
 
 
@@ -69,7 +73,6 @@ static void constant_coef( struct lp_setup_context *setup,
  * for a triangle.
  */
 static void linear_coef( struct lp_setup_context *setup,
-                         struct lp_rast_triangle *tri,
                          struct lp_line_info *info,
                          unsigned slot,
                          unsigned vert_attr,
@@ -82,10 +85,10 @@ static void linear_coef( struct lp_setup_context *setup,
    float dadx = da21 * info->dx * info->oneoverarea;
    float dady = da21 * info->dy * info->oneoverarea;
 
-   tri->inputs.dadx[slot][i] = dadx;
-   tri->inputs.dady[slot][i] = dady;  
+   info->dadx[slot][i] = dadx;
+   info->dady[slot][i] = dady;  
    
-   tri->inputs.a0[slot][i] = (a1 -
+   info->a0[slot][i] = (a1 -
                               (dadx * (info->v1[0][0] - setup->pixel_offset) +
                                dady * (info->v1[0][1] - setup->pixel_offset)));
 }
@@ -100,7 +103,6 @@ static void linear_coef( struct lp_setup_context *setup,
  * divide the interpolated value by the interpolated W at that fragment.
  */
 static void perspective_coef( struct lp_setup_context *setup,
-                              struct lp_rast_triangle *tri,
                               struct lp_line_info *info,
                               unsigned slot,
                               unsigned vert_attr,
@@ -115,43 +117,42 @@ static void perspective_coef( struct lp_setup_context *setup,
    float dadx = da21 * info->dx * info->oneoverarea;
    float dady = da21 * info->dy * info->oneoverarea;
 
-   tri->inputs.dadx[slot][i] = dadx;
-   tri->inputs.dady[slot][i] = dady;
+   info->dadx[slot][i] = dadx;
+   info->dady[slot][i] = dady;
    
-   tri->inputs.a0[slot][i] = (a1 -
-                              (dadx * (info->v1[0][0] - setup->pixel_offset) +
-                               dady * (info->v1[0][1] - setup->pixel_offset)));
+   info->a0[slot][i] = (a1 -
+                        (dadx * (info->v1[0][0] - setup->pixel_offset) +
+                         dady * (info->v1[0][1] - setup->pixel_offset)));
 }
 
 static void
 setup_fragcoord_coef( struct lp_setup_context *setup,
-                      struct lp_rast_triangle *tri,
                       struct lp_line_info *info,
                       unsigned slot,
                       unsigned usage_mask)
 {
    /*X*/
    if (usage_mask & TGSI_WRITEMASK_X) {
-      tri->inputs.a0[slot][0] = 0.0;
-      tri->inputs.dadx[slot][0] = 1.0;
-      tri->inputs.dady[slot][0] = 0.0;
+      info->a0[slot][0] = 0.0;
+      info->dadx[slot][0] = 1.0;
+      info->dady[slot][0] = 0.0;
    }
 
    /*Y*/
    if (usage_mask & TGSI_WRITEMASK_Y) {
-      tri->inputs.a0[slot][1] = 0.0;
-      tri->inputs.dadx[slot][1] = 0.0;
-      tri->inputs.dady[slot][1] = 1.0;
+      info->a0[slot][1] = 0.0;
+      info->dadx[slot][1] = 0.0;
+      info->dady[slot][1] = 1.0;
    }
 
    /*Z*/
    if (usage_mask & TGSI_WRITEMASK_Z) {
-      linear_coef(setup, tri, info, slot, 0, 2);
+      linear_coef(setup, info, slot, 0, 2);
    }
 
    /*W*/
    if (usage_mask & TGSI_WRITEMASK_W) {
-      linear_coef(setup, tri, info, slot, 0, 3);
+      linear_coef(setup, info, slot, 0, 3);
    }
 }
 
@@ -159,7 +160,6 @@ setup_fragcoord_coef( struct lp_setup_context *setup,
  * Compute the tri->coef[] array dadx, dady, a0 values.
  */
 static void setup_line_coefficients( struct lp_setup_context *setup,
-                                     struct lp_rast_triangle *tri,
                                      struct lp_line_info *info)
 {
    unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ;
@@ -177,25 +177,25 @@ static void setup_line_coefficients( struct lp_setup_context *setup,
          if (setup->flatshade_first) {
             for (i = 0; i < NUM_CHANNELS; i++)
                if (usage_mask & (1 << i))
-                  constant_coef(setup, tri, slot+1, info->v1[vert_attr][i], i);
+                  constant_coef(setup, info, slot+1, info->v1[vert_attr][i], i);
          }
          else {
             for (i = 0; i < NUM_CHANNELS; i++)
                if (usage_mask & (1 << i))
-                  constant_coef(setup, tri, slot+1, info->v2[vert_attr][i], i);
+                  constant_coef(setup, info, slot+1, info->v2[vert_attr][i], i);
          }
          break;
 
       case LP_INTERP_LINEAR:
          for (i = 0; i < NUM_CHANNELS; i++)
             if (usage_mask & (1 << i))
-               linear_coef(setup, tri, info, slot+1, vert_attr, i);
+               linear_coef(setup, info, slot+1, vert_attr, i);
          break;
 
       case LP_INTERP_PERSPECTIVE:
          for (i = 0; i < NUM_CHANNELS; i++)
             if (usage_mask & (1 << i))
-               perspective_coef(setup, tri, info, slot+1, vert_attr, i);
+               perspective_coef(setup, info, slot+1, vert_attr, i);
          fragcoord_usage_mask |= TGSI_WRITEMASK_W;
          break;
 
@@ -211,7 +211,7 @@ static void setup_line_coefficients( struct lp_setup_context *setup,
       case LP_INTERP_FACING:
          for (i = 0; i < NUM_CHANNELS; i++)
             if (usage_mask & (1 << i))
-               constant_coef(setup, tri, slot+1, 1.0, i);
+               constant_coef(setup, info, slot+1, 1.0, i);
          break;
 
       default:
@@ -221,7 +221,7 @@ static void setup_line_coefficients( struct lp_setup_context *setup,
 
    /* The internal position input is in slot zero:
     */
-   setup_fragcoord_coef(setup, tri, info, 0,
+   setup_fragcoord_coef(setup, info, 0,
                         fragcoord_usage_mask);
 }
 
@@ -276,6 +276,7 @@ try_setup_line( struct lp_setup_context *setup,
 {
    struct lp_scene *scene = setup->scene;
    struct lp_rast_triangle *line;
+   struct lp_rast_plane *plane;
    struct lp_line_info info;
    float width = MAX2(1.0, setup->line_width);
    struct u_rect bbox;
@@ -581,32 +582,35 @@ try_setup_line( struct lp_setup_context *setup,
 #endif
 
    /* calculate the deltas */
-   line->plane[0].dcdy = x[0] - x[1];
-   line->plane[1].dcdy = x[1] - x[2];
-   line->plane[2].dcdy = x[2] - x[3];
-   line->plane[3].dcdy = x[3] - x[0];
+   plane = GET_PLANES(line);
+   plane[0].dcdy = x[0] - x[1];
+   plane[1].dcdy = x[1] - x[2];
+   plane[2].dcdy = x[2] - x[3];
+   plane[3].dcdy = x[3] - x[0];
 
-   line->plane[0].dcdx = y[0] - y[1];
-   line->plane[1].dcdx = y[1] - y[2];
-   line->plane[2].dcdx = y[2] - y[3];
-   line->plane[3].dcdx = y[3] - y[0];
+   plane[0].dcdx = y[0] - y[1];
+   plane[1].dcdx = y[1] - y[2];
+   plane[2].dcdx = y[2] - y[3];
+   plane[3].dcdx = y[3] - y[0];
 
 
    /* Setup parameter interpolants:
     */
-   setup_line_coefficients( setup, line, &info); 
+   info.a0 = GET_A0(&line->inputs);
+   info.dadx = GET_DADX(&line->inputs);
+   info.dady = GET_DADY(&line->inputs);
+   setup_line_coefficients(setup, &info); 
 
    line->inputs.frontfacing = TRUE;
    line->inputs.disable = FALSE;
    line->inputs.opaque = FALSE;
 
    for (i = 0; i < 4; i++) {
-      struct lp_rast_plane *plane = &line->plane[i];
 
       /* half-edge constants, will be interated over the whole render
        * target.
        */
-      plane->c = plane->dcdx * x[i] - plane->dcdy * y[i];
+      plane[i].c = plane[i].dcdx * x[i] - plane[i].dcdy * y[i];
 
       
       /* correct for top-left vs. bottom-left fill convention.  
@@ -622,38 +626,38 @@ try_setup_line( struct lp_setup_context *setup,
        * to its usual method, in which case it will probably want
        * to use the opposite, top-left convention.
        */         
-      if (plane->dcdx < 0) {
+      if (plane[i].dcdx < 0) {
          /* both fill conventions want this - adjust for left edges */
-         plane->c++;            
+         plane[i].c++;            
       }
-      else if (plane->dcdx == 0) {
+      else if (plane[i].dcdx == 0) {
          if (setup->pixel_offset == 0) {
             /* correct for top-left fill convention:
              */
-            if (plane->dcdy > 0) plane->c++;
+            if (plane[i].dcdy > 0) plane[i].c++;
          }
          else {
             /* correct for bottom-left fill convention:
              */
-            if (plane->dcdy < 0) plane->c++;
+            if (plane[i].dcdy < 0) plane[i].c++;
          }
       }
 
-      plane->dcdx *= FIXED_ONE;
-      plane->dcdy *= FIXED_ONE;
+      plane[i].dcdx *= FIXED_ONE;
+      plane[i].dcdy *= FIXED_ONE;
 
       /* find trivial reject offsets for each edge for a single-pixel
        * sized block.  These will be scaled up at each recursive level to
        * match the active blocksize.  Scaling in this way works best if
        * the blocks are square.
        */
-      plane->eo = 0;
-      if (plane->dcdx < 0) plane->eo -= plane->dcdx;
-      if (plane->dcdy > 0) plane->eo += plane->dcdy;
+      plane[i].eo = 0;
+      if (plane[i].dcdx < 0) plane[i].eo -= plane[i].dcdx;
+      if (plane[i].dcdy > 0) plane[i].eo += plane[i].dcdy;
 
       /* Calculate trivial accept offsets from the above.
        */
-      plane->ei = plane->dcdy - plane->dcdx - plane->eo;
+      plane[i].ei = plane[i].dcdy - plane[i].dcdx - plane[i].eo;
    }
 
 
@@ -676,29 +680,29 @@ try_setup_line( struct lp_setup_context *setup,
     * these planes elsewhere.
     */
    if (nr_planes == 8) {
-      line->plane[4].dcdx = -1;
-      line->plane[4].dcdy = 0;
-      line->plane[4].c = 1-bbox.x0;
-      line->plane[4].ei = 0;
-      line->plane[4].eo = 1;
-
-      line->plane[5].dcdx = 1;
-      line->plane[5].dcdy = 0;
-      line->plane[5].c = bbox.x1+1;
-      line->plane[5].ei = -1;
-      line->plane[5].eo = 0;
-
-      line->plane[6].dcdx = 0;
-      line->plane[6].dcdy = 1;
-      line->plane[6].c = 1-bbox.y0;
-      line->plane[6].ei = 0;
-      line->plane[6].eo = 1;
-
-      line->plane[7].dcdx = 0;
-      line->plane[7].dcdy = -1;
-      line->plane[7].c = bbox.y1+1;
-      line->plane[7].ei = -1;
-      line->plane[7].eo = 0;
+      plane[4].dcdx = -1;
+      plane[4].dcdy = 0;
+      plane[4].c = 1-bbox.x0;
+      plane[4].ei = 0;
+      plane[4].eo = 1;
+
+      plane[5].dcdx = 1;
+      plane[5].dcdy = 0;
+      plane[5].c = bbox.x1+1;
+      plane[5].ei = -1;
+      plane[5].eo = 0;
+
+      plane[6].dcdx = 0;
+      plane[6].dcdy = 1;
+      plane[6].c = 1-bbox.y0;
+      plane[6].ei = 0;
+      plane[6].eo = 1;
+
+      plane[7].dcdx = 0;
+      plane[7].dcdy = -1;
+      plane[7].c = bbox.y1+1;
+      plane[7].ei = -1;
+      plane[7].eo = 0;
    }
 
    return lp_setup_bin_triangle(setup, line, &bbox, nr_planes);
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c
index 108c831e66..e30e70e16d 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_point.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c
@@ -45,6 +45,10 @@ struct point_info {
    int dx01, dx12;
 
    const float (*v0)[4];
+
+   float (*a0)[4];
+   float (*dadx)[4];
+   float (*dady)[4];
 };   
 
 
@@ -53,20 +57,19 @@ struct point_info {
  */
 static void
 constant_coef(struct lp_setup_context *setup,
-              struct lp_rast_triangle *point,
+              struct point_info *info,
               unsigned slot,
               const float value,
               unsigned i)
 {
-   point->inputs.a0[slot][i] = value;
-   point->inputs.dadx[slot][i] = 0.0f;
-   point->inputs.dady[slot][i] = 0.0f;
+   info->a0[slot][i] = value;
+   info->dadx[slot][i] = 0.0f;
+   info->dady[slot][i] = 0.0f;
 }
 
 
 static void
 point_persp_coeff(struct lp_setup_context *setup,
-                  struct lp_rast_triangle *point,
                   const struct point_info *info,
                   unsigned slot,
                   unsigned i)
@@ -82,9 +85,9 @@ point_persp_coeff(struct lp_setup_context *setup,
 
    assert(i < 4);
 
-   point->inputs.a0[slot][i] = info->v0[slot][i]*w0;
-   point->inputs.dadx[slot][i] = 0.0f;
-   point->inputs.dady[slot][i] = 0.0f;
+   info->a0[slot][i] = info->v0[slot][i]*w0;
+   info->dadx[slot][i] = 0.0f;
+   info->dady[slot][i] = 0.0f;
 }
 
 
@@ -98,7 +101,6 @@ point_persp_coeff(struct lp_setup_context *setup,
  */
 static void
 texcoord_coef(struct lp_setup_context *setup,
-              struct lp_rast_triangle *point,
               const struct point_info *info,
               unsigned slot,
               unsigned i,
@@ -115,14 +117,14 @@ texcoord_coef(struct lp_setup_context *setup,
       float x0 = info->v0[0][0] - setup->pixel_offset;
       float y0 = info->v0[0][1] - setup->pixel_offset;
 
-      point->inputs.dadx[slot][0] = dadx;
-      point->inputs.dady[slot][0] = dady;
-      point->inputs.a0[slot][0] = 0.5 - (dadx * x0 + dady * y0);
+      info->dadx[slot][0] = dadx;
+      info->dady[slot][0] = dady;
+      info->a0[slot][0] = 0.5 - (dadx * x0 + dady * y0);
 
       if (perspective) {
-         point->inputs.dadx[slot][0] *= w0;
-         point->inputs.dady[slot][0] *= w0;
-         point->inputs.a0[slot][0] *= w0;
+         info->dadx[slot][0] *= w0;
+         info->dady[slot][0] *= w0;
+         info->a0[slot][0] *= w0;
       }
    }
    else if (i == 1) {
@@ -135,25 +137,25 @@ texcoord_coef(struct lp_setup_context *setup,
          dady = -dady;
       }
 
-      point->inputs.dadx[slot][1] = dadx;
-      point->inputs.dady[slot][1] = dady;
-      point->inputs.a0[slot][1] = 0.5 - (dadx * x0 + dady * y0);
+      info->dadx[slot][1] = dadx;
+      info->dady[slot][1] = dady;
+      info->a0[slot][1] = 0.5 - (dadx * x0 + dady * y0);
 
       if (perspective) {
-         point->inputs.dadx[slot][1] *= w0;
-         point->inputs.dady[slot][1] *= w0;
-         point->inputs.a0[slot][1] *= w0;
+         info->dadx[slot][1] *= w0;
+         info->dady[slot][1] *= w0;
+         info->a0[slot][1] *= w0;
       }
    }
    else if (i == 2) {
-      point->inputs.a0[slot][2] = 0.0f;
-      point->inputs.dadx[slot][2] = 0.0f;
-      point->inputs.dady[slot][2] = 0.0f;
+      info->a0[slot][2] = 0.0f;
+      info->dadx[slot][2] = 0.0f;
+      info->dady[slot][2] = 0.0f;
    }
    else {
-      point->inputs.a0[slot][3] = perspective ? w0 : 1.0f;
-      point->inputs.dadx[slot][3] = 0.0f;
-      point->inputs.dady[slot][3] = 0.0f;
+      info->a0[slot][3] = perspective ? w0 : 1.0f;
+      info->dadx[slot][3] = 0.0f;
+      info->dady[slot][3] = 0.0f;
    }
 }
 
@@ -166,33 +168,32 @@ texcoord_coef(struct lp_setup_context *setup,
  */
 static void
 setup_point_fragcoord_coef(struct lp_setup_context *setup,
-                           struct lp_rast_triangle *point,
-                           const struct point_info *info,
+                           struct point_info *info,
                            unsigned slot,
                            unsigned usage_mask)
 {
    /*X*/
    if (usage_mask & TGSI_WRITEMASK_X) {
-      point->inputs.a0[slot][0] = 0.0;
-      point->inputs.dadx[slot][0] = 1.0;
-      point->inputs.dady[slot][0] = 0.0;
+      info->a0[slot][0] = 0.0;
+      info->dadx[slot][0] = 1.0;
+      info->dady[slot][0] = 0.0;
    }
 
    /*Y*/
    if (usage_mask & TGSI_WRITEMASK_Y) {
-      point->inputs.a0[slot][1] = 0.0;
-      point->inputs.dadx[slot][1] = 0.0;
-      point->inputs.dady[slot][1] = 1.0;
+      info->a0[slot][1] = 0.0;
+      info->dadx[slot][1] = 0.0;
+      info->dady[slot][1] = 1.0;
    }
 
    /*Z*/
    if (usage_mask & TGSI_WRITEMASK_Z) {
-      constant_coef(setup, point, slot, info->v0[0][2], 2);
+      constant_coef(setup, info, slot, info->v0[0][2], 2);
    }
 
    /*W*/
    if (usage_mask & TGSI_WRITEMASK_W) {
-      constant_coef(setup, point, slot, info->v0[0][3], 3);
+      constant_coef(setup, info, slot, info->v0[0][3], 3);
    }
 }
 
@@ -202,8 +203,7 @@ setup_point_fragcoord_coef(struct lp_setup_context *setup,
  */
 static void   
 setup_point_coefficients( struct lp_setup_context *setup,
-                          struct lp_rast_triangle *point,
-                          const struct point_info *info)
+                          struct point_info *info)
 {
    const struct lp_fragment_shader *shader = setup->fs.current.variant->shader;
    unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ;
@@ -248,7 +248,7 @@ setup_point_coefficients( struct lp_setup_context *setup,
                 (setup->sprite_coord_enable & (1 << semantic_index))) {
                for (i = 0; i < NUM_CHANNELS; i++) {
                   if (usage_mask & (1 << i)) {
-                     texcoord_coef(setup, point, info, slot + 1, i,
+                     texcoord_coef(setup, info, slot + 1, i,
                                    setup->sprite_coord_origin,
                                    perspective);
                   }
@@ -261,10 +261,10 @@ setup_point_coefficients( struct lp_setup_context *setup,
          for (i = 0; i < NUM_CHANNELS; i++) {
             if (usage_mask & (1 << i)) {
                if (perspective) {
-                  point_persp_coeff(setup, point, info, slot+1, i);
+                  point_persp_coeff(setup, info, slot+1, i);
                }
                else {
-                  constant_coef(setup, point, slot+1, info->v0[vert_attr][i], i);
+                  constant_coef(setup, info, slot+1, info->v0[vert_attr][i], i);
                }
             }
          }
@@ -273,7 +273,7 @@ setup_point_coefficients( struct lp_setup_context *setup,
       case LP_INTERP_FACING:
          for (i = 0; i < NUM_CHANNELS; i++)
             if (usage_mask & (1 << i))
-               constant_coef(setup, point, slot+1, 1.0, i);
+               constant_coef(setup, info, slot+1, 1.0, i);
          break;
 
       default:
@@ -284,7 +284,7 @@ setup_point_coefficients( struct lp_setup_context *setup,
 
    /* The internal position input is in slot zero:
     */
-   setup_point_fragcoord_coef(setup, point, info, 0,
+   setup_point_fragcoord_coef(setup, info, 0,
                               fragcoord_usage_mask);
 }
 
@@ -368,39 +368,44 @@ try_setup_point( struct lp_setup_context *setup,
    info.dx12 = fixed_width;
    info.dy01 = fixed_width;
    info.dy12 = 0;
+   info.a0 = GET_A0(&point->inputs);
+   info.dadx = GET_DADX(&point->inputs);
+   info.dady = GET_DADY(&point->inputs);
    
    /* Setup parameter interpolants:
     */
-   setup_point_coefficients(setup, point, &info);
+   setup_point_coefficients(setup, &info);
 
    point->inputs.frontfacing = TRUE;
    point->inputs.disable = FALSE;
    point->inputs.opaque = FALSE;
 
    {
-      point->plane[0].dcdx = -1;
-      point->plane[0].dcdy = 0;
-      point->plane[0].c = 1-bbox.x0;
-      point->plane[0].ei = 0;
-      point->plane[0].eo = 1;
-
-      point->plane[1].dcdx = 1;
-      point->plane[1].dcdy = 0;
-      point->plane[1].c = bbox.x1+1;
-      point->plane[1].ei = -1;
-      point->plane[1].eo = 0;
-
-      point->plane[2].dcdx = 0;
-      point->plane[2].dcdy = 1;
-      point->plane[2].c = 1-bbox.y0;
-      point->plane[2].ei = 0;
-      point->plane[2].eo = 1;
-
-      point->plane[3].dcdx = 0;
-      point->plane[3].dcdy = -1;
-      point->plane[3].c = bbox.y1+1;
-      point->plane[3].ei = -1;
-      point->plane[3].eo = 0;
+      struct lp_rast_plane *plane = GET_PLANES(point);
+
+      plane[0].dcdx = -1;
+      plane[0].dcdy = 0;
+      plane[0].c = 1-bbox.x0;
+      plane[0].ei = 0;
+      plane[0].eo = 1;
+
+      plane[1].dcdx = 1;
+      plane[1].dcdy = 0;
+      plane[1].c = bbox.x1+1;
+      plane[1].ei = -1;
+      plane[1].eo = 0;
+
+      plane[2].dcdx = 0;
+      plane[2].dcdy = 1;
+      plane[2].c = 1-bbox.y0;
+      plane[2].ei = 0;
+      plane[2].eo = 1;
+
+      plane[3].dcdx = 0;
+      plane[3].dcdy = -1;
+      plane[3].c = bbox.y1+1;
+      plane[3].ei = -1;
+      plane[3].eo = 0;
    }
 
    return lp_setup_bin_triangle(setup, point, &bbox, nr_planes);
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index 3bf0b2d252..937821b4c3 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -75,24 +75,25 @@ lp_setup_alloc_triangle(struct lp_scene *scene,
                         unsigned *tri_size)
 {
    unsigned input_array_sz = NUM_CHANNELS * (nr_inputs + 1) * sizeof(float);
+   unsigned plane_sz = nr_planes * sizeof(struct lp_rast_plane);
    struct lp_rast_triangle *tri;
-   unsigned tri_bytes, bytes;
-   char *inputs;
 
-   tri_bytes = align(Offset(struct lp_rast_triangle, plane[nr_planes]), 16);
-   bytes = tri_bytes + (3 * input_array_sz);
-
-   tri = lp_scene_alloc_aligned( scene, bytes, 16 );
+   *tri_size = (sizeof(struct lp_rast_triangle) +
+                3 * input_array_sz +
+                plane_sz);
 
+   tri = lp_scene_alloc_aligned( scene, *tri_size, 16 );
    if (tri) {
-      inputs = ((char *)tri) + tri_bytes;
-      tri->inputs.a0   = (float (*)[4]) inputs;
-      tri->inputs.dadx = (float (*)[4]) (inputs + input_array_sz);
-      tri->inputs.dady = (float (*)[4]) (inputs + 2 * input_array_sz);
+      char *a = (char *)tri;
+      char *b;
 
-      *tri_size = bytes;
+      /* sanity-check the layout only when the allocation succeeded */
+      tri->inputs.stride = input_array_sz;
+      b = (char *)&GET_PLANES(tri)[nr_planes];
+      assert(b - a == *tri_size);
    }
 
+
    return tri;
 }
 
@@ -228,6 +229,7 @@ do_triangle_ccw(struct lp_setup_context *setup,
 {
    struct lp_scene *scene = setup->scene;
    struct lp_rast_triangle *tri;
+   struct lp_rast_plane *plane;
    int x[3];
    int y[3];
    struct u_rect bbox;
@@ -296,7 +298,7 @@ do_triangle_ccw(struct lp_setup_context *setup,
    if (!tri)
       return FALSE;
 
-#ifdef DEBUG
+#if 0
    tri->v[0][0] = v0[0][0];
    tri->v[1][0] = v1[0][0];
    tri->v[2][0] = v2[0][0];
@@ -305,13 +307,14 @@ do_triangle_ccw(struct lp_setup_context *setup,
    tri->v[2][1] = v2[0][1];
 #endif
 
-   tri->plane[0].dcdy = x[0] - x[1];
-   tri->plane[1].dcdy = x[1] - x[2];
-   tri->plane[2].dcdy = x[2] - x[0];
+   plane = GET_PLANES(tri);
+   plane[0].dcdy = x[0] - x[1];
+   plane[1].dcdy = x[1] - x[2];
+   plane[2].dcdy = x[2] - x[0];
 
-   tri->plane[0].dcdx = y[0] - y[1];
-   tri->plane[1].dcdx = y[1] - y[2];
-   tri->plane[2].dcdx = y[2] - y[0];
+   plane[0].dcdx = y[0] - y[1];
+   plane[1].dcdx = y[1] - y[2];
+   plane[2].dcdx = y[2] - y[0];
 
    LP_COUNT(nr_tris);
 
@@ -325,12 +328,10 @@ do_triangle_ccw(struct lp_setup_context *setup,
 
   
    for (i = 0; i < 3; i++) {
-      struct lp_rast_plane *plane = &tri->plane[i];
-
       /* half-edge constants, will be interated over the whole render
        * target.
        */
-      plane->c = plane->dcdx * x[i] - plane->dcdy * y[i];
+      plane[i].c = plane[i].dcdx * x[i] - plane[i].dcdy * y[i];
 
       /* correct for top-left vs. bottom-left fill convention.  
        *
@@ -345,38 +346,38 @@ do_triangle_ccw(struct lp_setup_context *setup,
        * to its usual method, in which case it will probably want
        * to use the opposite, top-left convention.
        */         
-      if (plane->dcdx < 0) {
+      if (plane[i].dcdx < 0) {
          /* both fill conventions want this - adjust for left edges */
-         plane->c++;            
+         plane[i].c++;            
       }
-      else if (plane->dcdx == 0) {
+      else if (plane[i].dcdx == 0) {
          if (setup->pixel_offset == 0) {
             /* correct for top-left fill convention:
              */
-            if (plane->dcdy > 0) plane->c++;
+            if (plane[i].dcdy > 0) plane[i].c++;
          }
          else {
             /* correct for bottom-left fill convention:
              */
-            if (plane->dcdy < 0) plane->c++;
+            if (plane[i].dcdy < 0) plane[i].c++;
          }
       }
 
-      plane->dcdx *= FIXED_ONE;
-      plane->dcdy *= FIXED_ONE;
+      plane[i].dcdx *= FIXED_ONE;
+      plane[i].dcdy *= FIXED_ONE;
 
       /* find trivial reject offsets for each edge for a single-pixel
        * sized block.  These will be scaled up at each recursive level to
        * match the active blocksize.  Scaling in this way works best if
        * the blocks are square.
        */
-      plane->eo = 0;
-      if (plane->dcdx < 0) plane->eo -= plane->dcdx;
-      if (plane->dcdy > 0) plane->eo += plane->dcdy;
+      plane[i].eo = 0;
+      if (plane[i].dcdx < 0) plane[i].eo -= plane[i].dcdx;
+      if (plane[i].dcdy > 0) plane[i].eo += plane[i].dcdy;
 
       /* Calculate trivial accept offsets from the above.
        */
-      plane->ei = plane->dcdy - plane->dcdx - plane->eo;
+      plane[i].ei = plane[i].dcdy - plane[i].dcdx - plane[i].eo;
    }
 
 
@@ -399,29 +400,29 @@ do_triangle_ccw(struct lp_setup_context *setup,
     * these planes elsewhere.
     */
    if (nr_planes == 7) {
-      tri->plane[3].dcdx = -1;
-      tri->plane[3].dcdy = 0;
-      tri->plane[3].c = 1-bbox.x0;
-      tri->plane[3].ei = 0;
-      tri->plane[3].eo = 1;
-
-      tri->plane[4].dcdx = 1;
-      tri->plane[4].dcdy = 0;
-      tri->plane[4].c = bbox.x1+1;
-      tri->plane[4].ei = -1;
-      tri->plane[4].eo = 0;
-
-      tri->plane[5].dcdx = 0;
-      tri->plane[5].dcdy = 1;
-      tri->plane[5].c = 1-bbox.y0;
-      tri->plane[5].ei = 0;
-      tri->plane[5].eo = 1;
-
-      tri->plane[6].dcdx = 0;
-      tri->plane[6].dcdy = -1;
-      tri->plane[6].c = bbox.y1+1;
-      tri->plane[6].ei = -1;
-      tri->plane[6].eo = 0;
+      plane[3].dcdx = -1;
+      plane[3].dcdy = 0;
+      plane[3].c = 1-bbox.x0;
+      plane[3].ei = 0;
+      plane[3].eo = 1;
+
+      plane[4].dcdx = 1;
+      plane[4].dcdy = 0;
+      plane[4].c = bbox.x1+1;
+      plane[4].ei = -1;
+      plane[4].eo = 0;
+
+      plane[5].dcdx = 0;
+      plane[5].dcdy = 1;
+      plane[5].c = 1-bbox.y0;
+      plane[5].ei = 0;
+      plane[5].eo = 1;
+
+      plane[6].dcdx = 0;
+      plane[6].dcdy = -1;
+      plane[6].c = bbox.y1+1;
+      plane[6].ei = -1;
+      plane[6].eo = 0;
    }
 
    return lp_setup_bin_triangle( setup, tri, &bbox, nr_planes );
@@ -525,6 +526,7 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
    }
    else
    {
+      struct lp_rast_plane *plane = GET_PLANES(tri);
       int c[MAX_PLANES];
       int ei[MAX_PLANES];
       int eo[MAX_PLANES];
@@ -538,14 +540,14 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
       int iy1 = bbox->y1 / TILE_SIZE;
       
       for (i = 0; i < nr_planes; i++) {
-         c[i] = (tri->plane[i].c + 
-                 tri->plane[i].dcdy * iy0 * TILE_SIZE - 
-                 tri->plane[i].dcdx * ix0 * TILE_SIZE);
-
-         ei[i] = tri->plane[i].ei << TILE_ORDER;
-         eo[i] = tri->plane[i].eo << TILE_ORDER;
-         xstep[i] = -(tri->plane[i].dcdx << TILE_ORDER);
-         ystep[i] = tri->plane[i].dcdy << TILE_ORDER;
+         c[i] = (plane[i].c + 
+                 plane[i].dcdy * iy0 * TILE_SIZE - 
+                 plane[i].dcdx * ix0 * TILE_SIZE);
+
+         ei[i] = plane[i].ei << TILE_ORDER;
+         eo[i] = plane[i].eo << TILE_ORDER;
+         xstep[i] = -(plane[i].dcdx << TILE_ORDER);
+         ystep[i] = plane[i].dcdy << TILE_ORDER;
       }
 
 
-- 
cgit v1.2.3


From 8965f042b327ad8697963e757f4607f4bb13a045 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Fri, 15 Oct 2010 13:04:19 +0100
Subject: llvmpipe: don't store plane.ei value in binned data

Further reduce the size of a binned triangle.
---
 src/gallium/drivers/llvmpipe/lp_rast.h         |  3 ---
 src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h |  6 ++++--
 src/gallium/drivers/llvmpipe/lp_setup_line.c   |  8 --------
 src/gallium/drivers/llvmpipe/lp_setup_point.c  |  4 ----
 src/gallium/drivers/llvmpipe/lp_setup_tri.c    | 13 ++++---------
 5 files changed, 8 insertions(+), 26 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
index 8d8b6210ec..a64c152cf8 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -100,9 +100,6 @@ struct lp_rast_plane {
 
    /* one-pixel sized trivial reject offsets for each plane */
    int eo;
-
-   /* one-pixel sized trivial accept offsets for each plane */
-   int ei;
 };
 
 /**
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
index 9976996719..4825d651c0 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
@@ -82,7 +82,8 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
       const int dcdx = -plane[j].dcdx * 4;
       const int dcdy = plane[j].dcdy * 4;
       const int cox = plane[j].eo * 4;
-      const int cio = plane[j].ei * 4 - 1;
+      const int ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo;
+      const int cio = ei * 4 - 1;
 
       build_masks(c[j] + cox,
 		  cio - cox,
@@ -181,7 +182,8 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
 	 const int dcdx = -plane[j].dcdx * 16;
 	 const int dcdy = plane[j].dcdy * 16;
 	 const int cox = plane[j].eo * 16;
-	 const int cio = plane[j].ei * 16 - 1;
+         const int ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo;
+         const int cio = ei * 16 - 1;
 
 	 build_masks(c[j] + cox,
 		     cio - cox,
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c
index 2fd9f2e2f2..ece8638b5a 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_line.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
@@ -654,10 +654,6 @@ try_setup_line( struct lp_setup_context *setup,
       plane[i].eo = 0;
       if (plane[i].dcdx < 0) plane[i].eo -= plane[i].dcdx;
       if (plane[i].dcdy > 0) plane[i].eo += plane[i].dcdy;
-
-      /* Calculate trivial accept offsets from the above.
-       */
-      plane[i].ei = plane[i].dcdy - plane[i].dcdx - plane[i].eo;
    }
 
 
@@ -683,25 +679,21 @@ try_setup_line( struct lp_setup_context *setup,
       plane[4].dcdx = -1;
       plane[4].dcdy = 0;
       plane[4].c = 1-bbox.x0;
-      plane[4].ei = 0;
       plane[4].eo = 1;
 
       plane[5].dcdx = 1;
       plane[5].dcdy = 0;
       plane[5].c = bbox.x1+1;
-      plane[5].ei = -1;
       plane[5].eo = 0;
 
       plane[6].dcdx = 0;
       plane[6].dcdy = 1;
       plane[6].c = 1-bbox.y0;
-      plane[6].ei = 0;
       plane[6].eo = 1;
 
       plane[7].dcdx = 0;
       plane[7].dcdy = -1;
       plane[7].c = bbox.y1+1;
-      plane[7].ei = -1;
       plane[7].eo = 0;
    }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c
index e30e70e16d..16d21df35e 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_point.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c
@@ -386,25 +386,21 @@ try_setup_point( struct lp_setup_context *setup,
       plane[0].dcdx = -1;
       plane[0].dcdy = 0;
       plane[0].c = 1-bbox.x0;
-      plane[0].ei = 0;
       plane[0].eo = 1;
 
       plane[1].dcdx = 1;
       plane[1].dcdy = 0;
       plane[1].c = bbox.x1+1;
-      plane[1].ei = -1;
       plane[1].eo = 0;
 
       plane[2].dcdx = 0;
       plane[2].dcdy = 1;
       plane[2].c = 1-bbox.y0;
-      plane[2].ei = 0;
       plane[2].eo = 1;
 
       plane[3].dcdx = 0;
       plane[3].dcdy = -1;
       plane[3].c = bbox.y1+1;
-      plane[3].ei = -1;
       plane[3].eo = 0;
    }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index 937821b4c3..6ceda80a71 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -374,10 +374,6 @@ do_triangle_ccw(struct lp_setup_context *setup,
       plane[i].eo = 0;
       if (plane[i].dcdx < 0) plane[i].eo -= plane[i].dcdx;
       if (plane[i].dcdy > 0) plane[i].eo += plane[i].dcdy;
-
-      /* Calculate trivial accept offsets from the above.
-       */
-      plane[i].ei = plane[i].dcdy - plane[i].dcdx - plane[i].eo;
    }
 
 
@@ -403,25 +399,21 @@ do_triangle_ccw(struct lp_setup_context *setup,
       plane[3].dcdx = -1;
       plane[3].dcdy = 0;
       plane[3].c = 1-bbox.x0;
-      plane[3].ei = 0;
       plane[3].eo = 1;
 
       plane[4].dcdx = 1;
       plane[4].dcdy = 0;
       plane[4].c = bbox.x1+1;
-      plane[4].ei = -1;
       plane[4].eo = 0;
 
       plane[5].dcdx = 0;
       plane[5].dcdy = 1;
       plane[5].c = 1-bbox.y0;
-      plane[5].ei = 0;
       plane[5].eo = 1;
 
       plane[6].dcdx = 0;
       plane[6].dcdy = -1;
       plane[6].c = bbox.y1+1;
-      plane[6].ei = -1;
       plane[6].eo = 0;
    }
 
@@ -544,7 +536,10 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
                  plane[i].dcdy * iy0 * TILE_SIZE - 
                  plane[i].dcdx * ix0 * TILE_SIZE);
 
-         ei[i] = plane[i].ei << TILE_ORDER;
+         ei[i] = (plane[i].dcdy - 
+                  plane[i].dcdx - 
+                  plane[i].eo) << TILE_ORDER;
+
          eo[i] = plane[i].eo << TILE_ORDER;
          xstep[i] = -(plane[i].dcdx << TILE_ORDER);
          ystep[i] = plane[i].dcdy << TILE_ORDER;
-- 
cgit v1.2.3


From 15f4e3a8b98b5f4ca2833c02192ed9e6c237c5c7 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Tue, 12 Oct 2010 18:58:05 +0100
Subject: gallium: move some intrinsics helpers to u_sse.h

---
 src/gallium/auxiliary/util/u_sse.h         | 74 ++++++++++++++++++++++++++++++
 src/gallium/drivers/llvmpipe/lp_rast_tri.c | 58 -----------------------
 2 files changed, 74 insertions(+), 58 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/auxiliary/util/u_sse.h b/src/gallium/auxiliary/util/u_sse.h
index 8fd0e52a3a..1df6c87267 100644
--- a/src/gallium/auxiliary/util/u_sse.h
+++ b/src/gallium/auxiliary/util/u_sse.h
@@ -71,6 +71,12 @@ _mm_castps_si128(__m128 a)
 
 #endif /* defined(_MSC_VER) && _MSC_VER < 1500 */
 
+union m128i {   /* overlays an __m128i with ubyte/ushort/uint arrays */
+   __m128i m;
+   ubyte ub[16];
+   ushort us[8];
+   uint ui[4];
+};
 
 static INLINE void u_print_epi8(const char *name, __m128i r)
 {
@@ -149,6 +155,12 @@ static INLINE void u_print_ps(const char *name, __m128 r)
 }
 
 
+#define U_DUMP_EPI32(a) u_print_epi32(#a, a)
+#define U_DUMP_EPI16(a) u_print_epi16(#a, a)
+#define U_DUMP_EPI8(a)  u_print_epi8(#a, a)
+#define U_DUMP_PS(a)    u_print_ps(#a, a)
+
+
 
 #if defined(PIPE_ARCH_SSSE3)
 
@@ -176,6 +188,68 @@ _mm_shuffle_epi8(__m128i a, __m128i mask)
 #endif /* !PIPE_ARCH_SSSE3 */
 
 
+
+
+/* Provide an SSE2 implementation of _mm_mullo_epi32() in terms of
+ * _mm_mul_epu32().
+ *
+ * Only the low 32 bits of each 64-bit product are kept, and those
+ * bits are the same for signed and unsigned (two's complement)
+ * multiplication, so this is valid for signed operands as well.
+ *
+ * This seems close enough to the speed of SSE4 and the real
+ * _mm_mullo_epi32() intrinsic as to not justify adding an sse4
+ * dependency at this point.
+ */
+static INLINE __m128i mm_mullo_epi32(const __m128i a, const __m128i b)
+{
+   __m128i a4   = _mm_srli_epi64(a, 32);  /* shift by one dword */
+   __m128i b4   = _mm_srli_epi64(b, 32);  /* shift by one dword */
+   __m128i ba   = _mm_mul_epu32(b, a);   /* multiply dwords 0, 2 */
+   __m128i b4a4 = _mm_mul_epu32(b4, a4); /* multiply dwords 1, 3 */
+
+   /* Interleave the results, either with shuffles or (slightly
+    * faster) direct bit operations:
+    */
+#if 0
+   __m128i ba8             = _mm_shuffle_epi32(ba, 8);
+   __m128i b4a48           = _mm_shuffle_epi32(b4a4, 8);
+   __m128i result          = _mm_unpacklo_epi32(ba8, b4a48);
+#else
+   __m128i mask            = _mm_setr_epi32(~0,0,~0,0); /* selects dwords 0, 2 */
+   __m128i ba_mask         = _mm_and_si128(ba, mask); /* low halves of products 0, 2 */
+   __m128i b4a4_mask_shift = _mm_slli_epi64(b4a4, 32); /* low halves of products 1, 3 -> dwords 1, 3 */
+   __m128i result          = _mm_or_si128(ba_mask, b4a4_mask_shift);
+#endif
+
+   return result;
+}
+
+
+static INLINE void /* 4x4 transpose of 32-bit lanes: rows a,b,c,d -> o,p,q,r */
+transpose4_epi32(const __m128i * restrict a,
+                 const __m128i * restrict b,
+                 const __m128i * restrict c,
+                 const __m128i * restrict d,
+                 __m128i * restrict o,
+                 __m128i * restrict p,
+                 __m128i * restrict q,
+                 __m128i * restrict r)
+{
+  __m128i t0 = _mm_unpacklo_epi32(*a, *b); /* a0 b0 a1 b1 */
+  __m128i t1 = _mm_unpacklo_epi32(*c, *d); /* c0 d0 c1 d1 */
+  __m128i t2 = _mm_unpackhi_epi32(*a, *b); /* a2 b2 a3 b3 */
+  __m128i t3 = _mm_unpackhi_epi32(*c, *d); /* c2 d2 c3 d3 */
+
+  *o = _mm_unpacklo_epi64(t0, t1); /* a0 b0 c0 d0 */
+  *p = _mm_unpackhi_epi64(t0, t1); /* a1 b1 c1 d1 */
+  *q = _mm_unpacklo_epi64(t2, t3); /* a2 b2 c2 d2 */
+  *r = _mm_unpackhi_epi64(t2, t3); /* a3 b3 c3 d3 */
+}
+
+#define SCALAR_EPI32(m, i) _mm_shuffle_epi32((m), _MM_SHUFFLE(i,i,i,i))
+
+
 #endif /* PIPE_ARCH_SSE */
 
 #endif /* U_SSE_H_ */
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index 5bdf19712f..659eb1cac3 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -240,68 +240,10 @@ sign_bits4(const __m128i *cstep, int cdiff)
 }
 
 
-static INLINE void
-transpose4_epi32(const __m128i * restrict a,
-                 const __m128i * restrict b,
-                 const __m128i * restrict c,
-                 const __m128i * restrict d,
-                 __m128i * restrict o,
-                 __m128i * restrict p,
-                 __m128i * restrict q,
-                 __m128i * restrict r)
-{
-  __m128i t0 = _mm_unpacklo_epi32(*a, *b);
-  __m128i t1 = _mm_unpacklo_epi32(*c, *d);
-  __m128i t2 = _mm_unpackhi_epi32(*a, *b);
-  __m128i t3 = _mm_unpackhi_epi32(*c, *d);
-
-  *o = _mm_unpacklo_epi64(t0, t1);
-  *p = _mm_unpackhi_epi64(t0, t1);
-  *q = _mm_unpacklo_epi64(t2, t3);
-  *r = _mm_unpackhi_epi64(t2, t3);
-}
-
-
-#define SCALAR_EPI32(m, i) _mm_shuffle_epi32((m), _MM_SHUFFLE(i,i,i,i))
-
 #define NR_PLANES 3
 
 
-/* Provide an SSE2 implementation of _mm_mullo_epi32() in terms of
- * _mm_mul_epu32().
- *
- * I suspect this works fine for us because one of our operands is
- * always positive, but not sure that this can be used for general
- * signed integer multiplication.
- *
- * This seems close enough to the speed of SSE4 and the real
- * _mm_mullo_epi32() intrinsic as to not justify adding an sse4
- * dependency at this point.
- */
-static INLINE __m128i mm_mullo_epi32(const __m128i a, const __m128i b)
-{
-   __m128i a4   = _mm_srli_epi64(a, 32);  /* shift by one dword */
-   __m128i b4   = _mm_srli_epi64(b, 32);  /* shift by one dword */
-   __m128i ba   = _mm_mul_epu32(b, a);   /* multply dwords 0, 2 */
-   __m128i b4a4 = _mm_mul_epu32(b4, a4); /* multiply dwords 1, 3 */
-
-   /* Interleave the results, either with shuffles or (slightly
-    * faster) direct bit operations:
-    */
-#if 0
-   __m128i ba8             = _mm_shuffle_epi32(ba, 8);
-   __m128i b4a48           = _mm_shuffle_epi32(b4a4, 8);
-   __m128i result          = _mm_unpacklo_epi32(ba8, b4a48);
-#else
-   __m128i mask            = _mm_setr_epi32(~0,0,~0,0);
-   __m128i ba_mask         = _mm_and_si128(ba, mask);
-   __m128i b4a4_mask_shift = _mm_slli_epi64(b4a4, 32);
-   __m128i result          = _mm_or_si128(ba_mask, b4a4_mask_shift);
-#endif
-
-   return result;
-}
 
 
-- 
cgit v1.2.3


From 9f9a17eba8d6080bf30f17c8a7eaed97b10a559f Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Tue, 12 Oct 2010 18:59:15 +0100
Subject: llvmpipe: do plane calculations with intrinsics

This is a step towards moving this code into the rasterizer.
---
 src/gallium/drivers/llvmpipe/lp_setup_tri.c | 205 ++++++++++++++++++++--------
 1 file changed, 148 insertions(+), 57 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index 6ceda80a71..49ded9e045 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -32,6 +32,7 @@
 #include "util/u_math.h"
 #include "util/u_memory.h"
 #include "util/u_rect.h"
+#include "util/u_sse.h"
 #include "lp_perf.h"
 #include "lp_setup_context.h"
 #include "lp_setup_coef.h"
@@ -40,7 +41,9 @@
 
 #define NUM_CHANNELS 4
 
-
+#if defined(PIPE_ARCH_SSE)
+#include <emmintrin.h>
+#endif
    
 static INLINE int
 subpixel_snap(float a)
@@ -230,11 +233,10 @@ do_triangle_ccw(struct lp_setup_context *setup,
    struct lp_scene *scene = setup->scene;
    struct lp_rast_triangle *tri;
    struct lp_rast_plane *plane;
-   int x[3];
-   int y[3];
+   int x[4];
+   int y[4];
    struct u_rect bbox;
    unsigned tri_bytes;
-   int i;
    int nr_planes = 3;
 
    if (0)
@@ -251,10 +253,12 @@ do_triangle_ccw(struct lp_setup_context *setup,
    x[0] = subpixel_snap(v0[0][0] - setup->pixel_offset);
    x[1] = subpixel_snap(v1[0][0] - setup->pixel_offset);
    x[2] = subpixel_snap(v2[0][0] - setup->pixel_offset);
+   x[3] = 0;
    y[0] = subpixel_snap(v0[0][1] - setup->pixel_offset);
    y[1] = subpixel_snap(v1[0][1] - setup->pixel_offset);
    y[2] = subpixel_snap(v2[0][1] - setup->pixel_offset);
-
+   y[3] = 0;
+   
 
    /* Bounding rectangle (in pixels) */
    {
@@ -307,15 +311,6 @@ do_triangle_ccw(struct lp_setup_context *setup,
    tri->v[2][1] = v2[0][1];
 #endif
 
-   plane = GET_PLANES(tri);
-   plane[0].dcdy = x[0] - x[1];
-   plane[1].dcdy = x[1] - x[2];
-   plane[2].dcdy = x[2] - x[0];
-
-   plane[0].dcdx = y[0] - y[1];
-   plane[1].dcdx = y[1] - y[2];
-   plane[2].dcdx = y[2] - y[0];
-
    LP_COUNT(nr_tris);
 
    /* Setup parameter interpolants:
@@ -326,54 +321,150 @@ do_triangle_ccw(struct lp_setup_context *setup,
    tri->inputs.disable = FALSE;
    tri->inputs.opaque = setup->fs.current.variant->opaque;
 
-  
-   for (i = 0; i < 3; i++) {
-      /* half-edge constants, will be interated over the whole render
-       * target.
+   plane = GET_PLANES(tri);
+
+#if defined(PIPE_ARCH_SSE)
+   {
+      __m128i vertx, verty;
+      __m128i shufx, shufy;
+      __m128i dcdx, dcdy, c;
+      __m128i unused;
+      __m128i dcdx_neg_mask;
+      __m128i dcdy_neg_mask;
+      __m128i dcdx_zero_mask;
+      __m128i top_left_flag;
+      __m128i c_inc_mask, c_inc;
+      __m128i eo, p0, p1, p2;
+      __m128i zero = _mm_setzero_si128();
+
+      vertx = _mm_loadu_si128((__m128i *)x); /* vertex x coords */
+      verty = _mm_loadu_si128((__m128i *)y); /* vertex y coords */
+
+      shufx = _mm_shuffle_epi32(vertx, _MM_SHUFFLE(3,0,2,1));
+      shufy = _mm_shuffle_epi32(verty, _MM_SHUFFLE(3,0,2,1));
+
+      dcdx = _mm_sub_epi32(verty, shufy);
+      dcdy = _mm_sub_epi32(vertx, shufx);
+
+      dcdx_neg_mask = _mm_srai_epi32(dcdx, 31);
+      dcdx_zero_mask = _mm_cmpeq_epi32(dcdx, zero);
+      dcdy_neg_mask = _mm_srai_epi32(dcdy, 31);
+
+      top_left_flag = _mm_set1_epi32((setup->pixel_offset == 0) ? ~0 : 0);
+
+      c_inc_mask = _mm_or_si128(dcdx_neg_mask,
+                                _mm_and_si128(dcdx_zero_mask,
+                                              _mm_xor_si128(dcdy_neg_mask,
+                                                            top_left_flag)));
+
+      c_inc = _mm_srli_epi32(c_inc_mask, 31);
+
+      c = _mm_sub_epi32(mm_mullo_epi32(dcdx, vertx),
+                        mm_mullo_epi32(dcdy, verty));
+
+      c = _mm_add_epi32(c, c_inc);
+
+      /* Scale up to match c:
        */
-      plane[i].c = plane[i].dcdx * x[i] - plane[i].dcdy * y[i];
-
-      /* correct for top-left vs. bottom-left fill convention.  
-       *
-       * note that we're overloading gl_rasterization_rules to mean
-       * both (0.5,0.5) pixel centers *and* bottom-left filling
-       * convention.
-       *
-       * GL actually has a top-left filling convention, but GL's
-       * notion of "top" differs from gallium's...
-       *
-       * Also, sometimes (in FBO cases) GL will render upside down
-       * to its usual method, in which case it will probably want
-       * to use the opposite, top-left convention.
-       */         
-      if (plane[i].dcdx < 0) {
-         /* both fill conventions want this - adjust for left edges */
-         plane[i].c++;            
-      }
-      else if (plane[i].dcdx == 0) {
-         if (setup->pixel_offset == 0) {
-            /* correct for top-left fill convention:
-             */
-            if (plane[i].dcdy > 0) plane[i].c++;
+      dcdx = _mm_slli_epi32(dcdx, FIXED_ORDER);
+      dcdy = _mm_slli_epi32(dcdy, FIXED_ORDER);
+
+      /* Calculate trivial reject values:
+       */
+      eo = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy),
+                         _mm_and_si128(dcdx_neg_mask, dcdx));
+
+      /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */
+
+      /* Pointless transpose which gets undone immediately in
+       * rasterization:
+       */
+      transpose4_epi32(&c, &dcdx, &dcdy, &eo,
+                       &p0, &p1, &p2, &unused);
+
+      _mm_storeu_si128((__m128i *)&plane[0], p0);
+      _mm_storeu_si128((__m128i *)&plane[1], p1);
+      _mm_storeu_si128((__m128i *)&plane[2], p2);
+   }
+#else
+   {
+      int i;
+      plane[0].dcdy = x[0] - x[1];
+      plane[1].dcdy = x[1] - x[2];
+      plane[2].dcdy = x[2] - x[0];
+      plane[0].dcdx = y[0] - y[1];
+      plane[1].dcdx = y[1] - y[2];
+      plane[2].dcdx = y[2] - y[0];
+  
+      for (i = 0; i < 3; i++) {
+         /* half-edge constants, will be iterated over the whole render
+          * target.
+          */
+         plane[i].c = plane[i].dcdx * x[i] - plane[i].dcdy * y[i];
+
+         /* correct for top-left vs. bottom-left fill convention.  
+          *
+          * note that we're overloading gl_rasterization_rules to mean
+          * both (0.5,0.5) pixel centers *and* bottom-left filling
+          * convention.
+          *
+          * GL actually has a top-left filling convention, but GL's
+          * notion of "top" differs from gallium's...
+          *
+          * Also, sometimes (in FBO cases) GL will render upside down
+          * to its usual method, in which case it will probably want
+          * to use the opposite, top-left convention.
+          */         
+         if (plane[i].dcdx < 0) {
+            /* both fill conventions want this - adjust for left edges */
+            plane[i].c++;            
          }
-         else {
-            /* correct for bottom-left fill convention:
-             */
-            if (plane[i].dcdy < 0) plane[i].c++;
+         else if (plane[i].dcdx == 0) {
+            if (setup->pixel_offset == 0) {
+               /* correct for top-left fill convention:
+                */
+               if (plane[i].dcdy > 0) plane[i].c++;
+            }
+            else {
+               /* correct for bottom-left fill convention:
+                */
+               if (plane[i].dcdy < 0) plane[i].c++;
+            }
          }
-      }
 
-      plane[i].dcdx *= FIXED_ONE;
-      plane[i].dcdy *= FIXED_ONE;
+         plane[i].dcdx *= FIXED_ONE;
+         plane[i].dcdy *= FIXED_ONE;
 
-      /* find trivial reject offsets for each edge for a single-pixel
-       * sized block.  These will be scaled up at each recursive level to
-       * match the active blocksize.  Scaling in this way works best if
-       * the blocks are square.
-       */
-      plane[i].eo = 0;
-      if (plane[i].dcdx < 0) plane[i].eo -= plane[i].dcdx;
-      if (plane[i].dcdy > 0) plane[i].eo += plane[i].dcdy;
+         /* find trivial reject offsets for each edge for a single-pixel
+          * sized block.  These will be scaled up at each recursive level to
+          * match the active blocksize.  Scaling in this way works best if
+          * the blocks are square.
+          */
+         plane[i].eo = 0;
+         if (plane[i].dcdx < 0) plane[i].eo -= plane[i].dcdx;
+         if (plane[i].dcdy > 0) plane[i].eo += plane[i].dcdy;
+      }
+   }
+#endif
+
+   if (0) { /* debug aid: dump c/dcdx/dcdy/eo for the three edge planes */
+      debug_printf("p0: %08x/%08x/%08x/%08x\n",
+                   plane[0].c,
+                   plane[0].dcdx,
+                   plane[0].dcdy,
+                   plane[0].eo);
+
+      debug_printf("p1: %08x/%08x/%08x/%08x\n",
+                   plane[1].c,
+                   plane[1].dcdx,
+                   plane[1].dcdy,
+                   plane[1].eo);
+
+      /* label matches the plane index being printed */
+      debug_printf("p2: %08x/%08x/%08x/%08x\n",
+                   plane[2].c,
+                   plane[2].dcdx,
+                   plane[2].dcdy,
+                   plane[2].eo);
+   }
 
 
-- 
cgit v1.2.3


From 392b0954c265fdd66b2de99ab677d2e662935682 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Fri, 15 Oct 2010 13:52:00 +0100
Subject: llvmpipe: use aligned loads/stores for plane values

---
 src/gallium/drivers/llvmpipe/lp_rast_tri.c  | 12 ++++++------
 src/gallium/drivers/llvmpipe/lp_setup_tri.c |  6 +++---
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index 659eb1cac3..042c315635 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -261,9 +261,9 @@ lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
    struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
    unsigned nr = 0;
 
-   __m128i p0 = _mm_loadu_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
-   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
-   __m128i p2 = _mm_loadu_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
+   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
+   __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
+   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
    __m128i zero = _mm_setzero_si128();
 
    __m128i c;
@@ -367,9 +367,9 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
    int x = (arg.triangle.plane_mask & 0xff) + task->x;
    int y = (arg.triangle.plane_mask >> 8) + task->y;
 
-   __m128i p0 = _mm_loadu_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
-   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
-   __m128i p2 = _mm_loadu_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
+   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
+   __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
+   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
    __m128i zero = _mm_setzero_si128();
 
    __m128i c;
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index 49ded9e045..c6cb9afda4 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -382,9 +382,9 @@ do_triangle_ccw(struct lp_setup_context *setup,
       transpose4_epi32(&c, &dcdx, &dcdy, &eo,
                        &p0, &p1, &p2, &unused);
 
-      _mm_storeu_si128((__m128i *)&plane[0], p0);
-      _mm_storeu_si128((__m128i *)&plane[1], p1);
-      _mm_storeu_si128((__m128i *)&plane[2], p2);
+      _mm_store_si128((__m128i *)&plane[0], p0);
+      _mm_store_si128((__m128i *)&plane[1], p1);
+      _mm_store_si128((__m128i *)&plane[2], p2);
    }
 #else
    {
-- 
cgit v1.2.3


From 39185efd3a891b0d66b1ded10d165dd9aee94464 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Fri, 15 Oct 2010 14:11:22 +0100
Subject: llvmpipe: fix non-sse build after recent changes

---
 src/gallium/drivers/llvmpipe/lp_setup_coef.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_setup_coef.c b/src/gallium/drivers/llvmpipe/lp_setup_coef.c
index a835df6af2..95d6615bb9 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_coef.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_coef.c
@@ -145,12 +145,12 @@ setup_fragcoord_coef(struct lp_tri_info *info,
 
    /*Z*/
    if (usage_mask & TGSI_WRITEMASK_Z) {
-      linear_coef(inputs, info, slot, 0, 2);
+      linear_coef(info, slot, 0, 2);
    }
 
    /*W*/
    if (usage_mask & TGSI_WRITEMASK_W) {
-      linear_coef(inputs, info, slot, 0, 3);
+      linear_coef(info, slot, 0, 3);
    }
 }
 
-- 
cgit v1.2.3


From ffab84c9a27a229e6fa14c3de63868bb843c0f3e Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Fri, 15 Oct 2010 13:23:05 +0100
Subject: llvmpipe: check shader outputs are non-null before using

---
 src/gallium/drivers/llvmpipe/lp_state_fs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 8df807cec8..c4b1b868b6 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -345,7 +345,7 @@ generate_fs(struct llvmpipe_context *lp,
                                            TGSI_SEMANTIC_COLOR,
                                            0);
 
-      if (color0 != -1) {
+      if (color0 != -1 && outputs[color0][3]) {
          LLVMValueRef alpha = LLVMBuildLoad(builder, outputs[color0][3], "alpha");
          LLVMValueRef alpha_ref_value;
 
@@ -364,7 +364,7 @@ generate_fs(struct llvmpipe_context *lp,
                                          TGSI_SEMANTIC_POSITION,
                                          0);
          
-      if (pos0 != -1) {
+      if (pos0 != -1 && outputs[pos0][2]) {
          z = LLVMBuildLoad(builder, outputs[pos0][2], "z");
          lp_build_name(z, "output%u.%u.%c", i, pos0, "xyzw"[chan]);
       }
-- 
cgit v1.2.3


From ac98519c4eed0daf770a9ba380056978e4420352 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Fri, 15 Oct 2010 13:23:30 +0100
Subject: llvmpipe: validate color outputs against key->nr_cbufs

---
 src/gallium/drivers/llvmpipe/lp_state_fs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index c4b1b868b6..c070b55d3d 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -404,7 +404,8 @@ generate_fs(struct llvmpipe_context *lp,
    /* Color write  */
    for (attrib = 0; attrib < shader->info.base.num_outputs; ++attrib)
    {
-      if (shader->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_COLOR)
+      if (shader->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_COLOR &&
+          shader->info.base.output_semantic_index[attrib] < key->nr_cbufs)
       {
          unsigned cbuf = shader->info.base.output_semantic_index[attrib];
          for(chan = 0; chan < NUM_CHANNELS; ++chan) {
-- 
cgit v1.2.3


From 992e7c72797545e5d7dac11c4714c107be07d41c Mon Sep 17 00:00:00 2001
From: Jakob Bornecrantz <jakob@vmware.com>
Date: Tue, 12 Oct 2010 18:41:24 +0100
Subject: llvmpipe: Move makefile include to before targets

Or plain make inside of the directory won't build libllvmpipe.a
---
 src/gallium/drivers/llvmpipe/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/Makefile b/src/gallium/drivers/llvmpipe/Makefile
index 55b877b4ab..d71f09eeb3 100644
--- a/src/gallium/drivers/llvmpipe/Makefile
+++ b/src/gallium/drivers/llvmpipe/Makefile
@@ -63,12 +63,12 @@ PROGS := lp_test_format	\
 # Need this for the lp_test_*.o files
 CLEAN_EXTRA = *.o
 
+include ../../Makefile.template
+
 lp_test_sincos.o : sse_mathfun.h
 
 PROGS_DEPS := ../../auxiliary/libgallium.a
 
-include ../../Makefile.template
-
 lp_tile_soa.c: lp_tile_soa.py ../../auxiliary/util/u_format_parse.py ../../auxiliary/util/u_format_pack.py ../../auxiliary/util/u_format.csv
 	python lp_tile_soa.py ../../auxiliary/util/u_format.csv > $@
 
-- 
cgit v1.2.3


From 914b0d34e89ee53ef3d7d8aac4baa794492a2064 Mon Sep 17 00:00:00 2001
From: José Fonseca <jfonseca@vmware.com>
Date: Sun, 17 Oct 2010 07:15:58 -0700
Subject: llvmpipe: Fix depth-stencil regression.

If stencil is enabled then we need to load the z_dst, even if depth
testing is disabled.

This fixes the reflect Mesa demo.
---
 src/gallium/drivers/llvmpipe/lp_bld_depth.c | 47 ++++++++++++++++-------------
 1 file changed, 26 insertions(+), 21 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index ddf7da0b14..167ac0ed2e 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -330,7 +330,7 @@ lp_depth_type(const struct util_format_description *format_desc,
  * in the Z buffer (typically 0xffffff00 or 0x00ffffff).  That lets us
  * get by with fewer bit twiddling steps.
  */
-static void
+static boolean
 get_z_shift_and_mask(const struct util_format_description *format_desc,
                      unsigned *shift, unsigned *width, unsigned *mask)
 {
@@ -345,7 +345,8 @@ get_z_shift_and_mask(const struct util_format_description *format_desc,
 
    z_swizzle = format_desc->swizzle[0];
 
-   assert(z_swizzle != UTIL_FORMAT_SWIZZLE_NONE);
+   if (z_swizzle == UTIL_FORMAT_SWIZZLE_NONE)
+      return FALSE;
 
    *width = format_desc->channel[z_swizzle].size;
 
@@ -366,6 +367,8 @@ get_z_shift_and_mask(const struct util_format_description *format_desc,
    }
 
    *shift = padding_right;
+
+   return TRUE;
 }
 
 
@@ -554,6 +557,27 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
    {
       unsigned s_shift, s_mask;
 
+      if (get_z_shift_and_mask(format_desc, &z_shift, &z_width, &z_mask)) {
+         if (z_mask != 0xffffffff) {
+            z_bitmask = lp_build_const_int_vec(z_type, z_mask);
+         }
+
+         /*
+          * Align the framebuffer Z 's LSB to the right.
+          */
+         if (z_shift) {
+            LLVMValueRef shift = lp_build_const_int_vec(z_type, z_shift);
+            z_dst = LLVMBuildLShr(builder, zs_dst, shift, "z_dst");
+         } else if (z_bitmask) {
+	    /* TODO: Instead of loading a mask from memory and ANDing, it's
+	     * probably faster to just shake the bits with two shifts. */
+            z_dst = LLVMBuildAnd(builder, zs_dst, z_bitmask, "z_dst");
+         } else {
+            z_dst = zs_dst;
+            lp_build_name(z_dst, "z_dst");
+         }
+      }
+
       if (get_s_shift_and_mask(format_desc, &s_shift, &s_mask)) {
          if (s_shift) {
             LLVMValueRef shift = lp_build_const_int_vec(s_type, s_shift);
@@ -605,8 +629,6 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
    }
 
    if (depth->enabled) {
-      get_z_shift_and_mask(format_desc, &z_shift, &z_width, &z_mask);
-
       /*
        * Convert fragment Z to the desired type, aligning the LSB to the right.
        */
@@ -644,23 +666,6 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
 
       lp_build_name(z_src, "z_src");
 
-      if (z_mask != 0xffffffff) {
-         z_bitmask = lp_build_const_int_vec(z_type, z_mask);
-      }
-
-      /*
-       * Align the framebuffer Z 's LSB to the right.
-       */
-      if (z_shift) {
-         LLVMValueRef shift = lp_build_const_int_vec(z_type, z_shift);
-         z_dst = LLVMBuildLShr(builder, zs_dst, shift, "z_dst");
-      } else if (z_bitmask) {
-         z_dst = LLVMBuildAnd(builder, zs_dst, z_bitmask, "z_dst");
-      } else {
-         z_dst = zs_dst;
-         lp_build_name(z_dst, "z_dst");
-      }
-
       /* compare src Z to dst Z, returning 'pass' mask */
       z_pass = lp_build_cmp(&z_bld, depth->func, z_src, z_dst);
 
-- 
cgit v1.2.3


From 709699d2e25146ac16af7e939de4398fdc50307e Mon Sep 17 00:00:00 2001
From: José Fonseca <jfonseca@vmware.com>
Date: Sun, 17 Oct 2010 07:45:08 -0700
Subject: llvmpipe: Ensure z_shift and z_width are initialized.

---
 src/gallium/drivers/llvmpipe/lp_bld_depth.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index 167ac0ed2e..7eb76d4fb3 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -469,7 +469,7 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
    struct lp_build_context z_bld;
    struct lp_build_context s_bld;
    struct lp_type s_type;
-   unsigned z_shift, z_width, z_mask;
+   unsigned z_shift = 0, z_width = 0, z_mask = 0;
    LLVMValueRef zs_dst, z_dst = NULL;
    LLVMValueRef stencil_vals = NULL;
    LLVMValueRef z_bitmask = NULL, stencil_shift = NULL;
-- 
cgit v1.2.3


From a0add0446ca9dce6d4a96014c42ba6cf3a73a44a Mon Sep 17 00:00:00 2001
From: José Fonseca <jfonseca@vmware.com>
Date: Sun, 17 Oct 2010 09:58:04 -0700
Subject: llvmpipe: Fix bad refactoring.

'i' and 'chan' have random values here, which could cause a buffer
overflow in debug builds, if chan > 4.
---
 src/gallium/drivers/llvmpipe/lp_state_fs.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index c070b55d3d..7acbe7e86c 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -365,8 +365,7 @@ generate_fs(struct llvmpipe_context *lp,
                                          0);
          
       if (pos0 != -1 && outputs[pos0][2]) {
-         z = LLVMBuildLoad(builder, outputs[pos0][2], "z");
-         lp_build_name(z, "output%u.%u.%c", i, pos0, "xyzw"[chan]);
+         z = LLVMBuildLoad(builder, outputs[pos0][2], "output.z");
       }
 
       lp_build_depth_stencil_test(builder,
-- 
cgit v1.2.3


From 4afad7d3edcaaa62b748cc9bd4d88a626ac0920a Mon Sep 17 00:00:00 2001
From: José Fonseca <jfonseca@vmware.com>
Date: Sun, 17 Oct 2010 10:15:15 -0700
Subject: llvmpipe: Initialize bld ctx via lp_build_context_init instead of
 ad-hoc and broken code.

---
 src/gallium/drivers/llvmpipe/lp_test_round.c  | 5 +----
 src/gallium/drivers/llvmpipe/lp_test_sincos.c | 5 +----
 2 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_test_round.c b/src/gallium/drivers/llvmpipe/lp_test_round.c
index 57b0ee5776..0770c7ab9a 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_round.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_round.c
@@ -75,10 +75,7 @@ add_test(LLVMModuleRef module, const char *name, lp_func_t lp_func)
    LLVMValueRef ret;
    struct lp_build_context bld;
 
-   bld.builder = builder;
-   bld.type.floating = 1;
-   bld.type.width = 32;
-   bld.type.length = 4;
+   lp_build_context_init(&bld, builder, lp_float32_vec4_type());
 
    LLVMSetFunctionCallConv(func, LLVMCCallConv);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_test_sincos.c b/src/gallium/drivers/llvmpipe/lp_test_sincos.c
index 7ab357f162..79939b1a39 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_sincos.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_sincos.c
@@ -72,10 +72,7 @@ add_sincos_test(LLVMModuleRef module, boolean sin)
    LLVMValueRef ret;
    struct lp_build_context bld;
 
-   bld.builder = builder;
-   bld.type.floating = 1;
-   bld.type.width = 32;
-   bld.type.length = 4;
+   lp_build_context_init(&bld, builder, lp_float32_vec4_type());
 
    LLVMSetFunctionCallConv(func, LLVMCCallConv);
 
-- 
cgit v1.2.3


From c9d297162ad7efb87ab3e29c14259147d891baf9 Mon Sep 17 00:00:00 2001
From: Vinson Lee <vlee@vmware.com>
Date: Sun, 17 Oct 2010 14:09:53 -0700
Subject: llvmpipe: Return non-zero exit code for lp_test_round failures.

---
 src/gallium/drivers/llvmpipe/lp_test_round.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_test_round.c b/src/gallium/drivers/llvmpipe/lp_test_round.c
index 0770c7ab9a..0ca2791592 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_round.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_round.c
@@ -97,9 +97,10 @@ printv(char* string, v4sf value)
            f[0], f[1], f[2], f[3]);
 }
 
-static void
+static boolean
 compare(v4sf x, v4sf y)
 {
+   boolean success = TRUE;
    float *xp = (float *) &x;
    float *yp = (float *) &y;
    if (xp[0] != yp[0] ||
@@ -107,7 +108,9 @@ compare(v4sf x, v4sf y)
        xp[2] != yp[2] ||
        xp[3] != yp[3]) {
       printf(" Incorrect result! ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ \n");
+      success = FALSE;
    }
+   return success;
 }
 
 
@@ -188,7 +191,7 @@ test_round(unsigned verbose, FILE *fp)
       y = round_func(x);
       printv("C round(x)   ", ref);
       printv("LLVM round(x)", y);
-      compare(ref, y);
+      success = success && compare(ref, y);
 
       refp[0] = trunc(xp[0]);
       refp[1] = trunc(xp[1]);
@@ -197,7 +200,7 @@ test_round(unsigned verbose, FILE *fp)
       y = trunc_func(x);
       printv("C trunc(x)   ", ref);
       printv("LLVM trunc(x)", y);
-      compare(ref, y);
+      success = success && compare(ref, y);
 
       refp[0] = floor(xp[0]);
       refp[1] = floor(xp[1]);
@@ -206,7 +209,7 @@ test_round(unsigned verbose, FILE *fp)
       y = floor_func(x);
       printv("C floor(x)   ", ref);
       printv("LLVM floor(x)", y);
-      compare(ref, y);
+      success = success && compare(ref, y);
 
       refp[0] = ceil(xp[0]);
       refp[1] = ceil(xp[1]);
@@ -215,7 +218,7 @@ test_round(unsigned verbose, FILE *fp)
       y = ceil_func(x);
       printv("C ceil(x)    ", ref);
       printv("LLVM ceil(x) ", y);
-      compare(ref, y);
+      success = success && compare(ref, y);
    }
 
    LLVMFreeMachineCodeForFunction(engine, test_round);
@@ -244,11 +247,7 @@ test_round(unsigned verbose, FILE *fp)
 boolean
 test_all(unsigned verbose, FILE *fp)
 {
-   boolean success = TRUE;
-
-   test_round(verbose, fp);
-
-   return success;
+   return test_round(verbose, fp);
 }
 
 
-- 
cgit v1.2.3


From ca2b2ac131933b4171b519813df1aaa3a81621cd Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Sun, 17 Oct 2010 18:48:11 -0700
Subject: llvmpipe: fail cleanly on malloc failure in lp_setup_alloc_triangle

---
 src/gallium/drivers/llvmpipe/lp_setup_tri.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'src/gallium/drivers/llvmpipe')

diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index c6cb9afda4..15c414d8c3 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -86,9 +86,10 @@ lp_setup_alloc_triangle(struct lp_scene *scene,
                 plane_sz);
 
    tri = lp_scene_alloc_aligned( scene, *tri_size, 16 );
-   if (tri) {
-      tri->inputs.stride = input_array_sz;
-   }
+   if (tri == NULL)
+      return NULL;
+
+   tri->inputs.stride = input_array_sz;
 
    {
       char *a = (char *)tri;
@@ -96,7 +97,6 @@ lp_setup_alloc_triangle(struct lp_scene *scene,
       assert(b - a == *tri_size);
    }
 
-
    return tri;
 }
 
-- 
cgit v1.2.3