35 files changed, 2274 insertions, 604 deletions
diff --git a/src/gallium/drivers/llvmpipe/Makefile b/src/gallium/drivers/llvmpipe/Makefile
index 2892b62920..dec874623e 100644
--- a/src/gallium/drivers/llvmpipe/Makefile
+++ b/src/gallium/drivers/llvmpipe/Makefile
@@ -27,6 +27,8 @@ C_SOURCES = \
 	lp_scene_queue.c \
 	lp_screen.c \
 	lp_setup.c \
+	lp_setup_coef.c \
+	lp_setup_coef_intrin.c \
 	lp_setup_line.c \
 	lp_setup_point.c \
 	lp_setup_tri.c \
diff --git a/src/gallium/drivers/llvmpipe/SConscript b/src/gallium/drivers/llvmpipe/SConscript
index 5583fca38e..8d57db72cf 100644
--- a/src/gallium/drivers/llvmpipe/SConscript
+++ b/src/gallium/drivers/llvmpipe/SConscript
@@ -63,6 +63,8 @@ llvmpipe = env.ConvenienceLibrary(
 		'lp_setup_line.c',
 		'lp_setup_point.c',
 		'lp_setup_tri.c',
+		'lp_setup_coef.c',
+		'lp_setup_coef_intrin.c',
 		'lp_setup_vbuf.c',
 		'lp_state_blend.c',
 		'lp_state_clip.c',
diff --git a/src/gallium/drivers/llvmpipe/lp_context.c b/src/gallium/drivers/llvmpipe/lp_context.c
index 7543bd7b2b..39f2c6085e 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.c
+++ b/src/gallium/drivers/llvmpipe/lp_context.c
@@ -85,6 +85,14 @@ static void llvmpipe_destroy( struct pipe_context *pipe )
    align_free( llvmpipe );
 }
 
+static void
+do_flush( struct pipe_context *pipe,
+          unsigned flags,
+          struct pipe_fence_handle **fence)
+{
+   llvmpipe_flush(pipe, flags, fence, __FUNCTION__);
+}
+
 
 struct pipe_context *
 llvmpipe_create_context( struct pipe_screen *screen, void *priv )
@@ -109,7 +117,7 @@ llvmpipe_create_context( struct pipe_screen *screen, void *priv )
    llvmpipe->pipe.destroy = llvmpipe_destroy;
    llvmpipe->pipe.set_framebuffer_state = llvmpipe_set_framebuffer_state;
    llvmpipe->pipe.clear = llvmpipe_clear;
-   llvmpipe->pipe.flush = llvmpipe_flush;
+   llvmpipe->pipe.flush = do_flush;
 
    llvmpipe_init_blend_funcs(llvmpipe);
    llvmpipe_init_clip_funcs(llvmpipe);
@@ -147,9 +155,13 @@ llvmpipe_create_context( struct pipe_screen *screen, void *priv )
    draw_install_aapoint_stage(llvmpipe->draw, &llvmpipe->pipe);
    draw_install_pstipple_stage(llvmpipe->draw, &llvmpipe->pipe);
 
-   /* convert points and lines into triangles: */
-   draw_wide_point_threshold(llvmpipe->draw, 0.0);
-   draw_wide_line_threshold(llvmpipe->draw, 0.0);
+   /* convert points and lines into triangles: 
+    * (otherwise, draw points and lines natively)
+    */
+   draw_wide_point_sprites(llvmpipe->draw, FALSE);
+   draw_enable_point_sprites(llvmpipe->draw, FALSE);
+   draw_wide_point_threshold(llvmpipe->draw, 10000.0);
+   draw_wide_line_threshold(llvmpipe->draw, 10000.0);
 
 #if USE_DRAW_STAGE_PSTIPPLE
    /* Do polygon stipple w/ texture map + frag prog? */
diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h
index 50f9091c3c..34fa20e204 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_context.h
@@ -101,6 +101,9 @@ struct llvmpipe_context {
    
    /** Vertex format */
    struct vertex_info vertex_info;
+   
+   /** Which vertex shader output slot contains point size */
+   int psize_slot;
 
    /** Fragment shader input interpolation info */
    unsigned num_inputs;
diff --git a/src/gallium/drivers/llvmpipe/lp_debug.h b/src/gallium/drivers/llvmpipe/lp_debug.h
index 92fb2b3ee5..a928ee38be 100644
--- a/src/gallium/drivers/llvmpipe/lp_debug.h
+++ b/src/gallium/drivers/llvmpipe/lp_debug.h
@@ -46,6 +46,8 @@ st_print_current(void);
 #define DEBUG_SHOW_TILES    0x200
 #define DEBUG_SHOW_SUBTILES 0x400
 #define DEBUG_COUNTERS      0x800
+#define DEBUG_SCENE         0x1000
+#define DEBUG_FENCE         0x2000
 
 
 #ifdef DEBUG
diff --git a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
index e73b431cb4..3af5c8d5c5 100644
--- a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
+++ b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
@@ -68,25 +68,17 @@ llvmpipe_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    }
 
    /* Map index buffer, if present */
-   if (info->indexed && lp->index_buffer.buffer) {
-      char *indices = (char *) llvmpipe_resource_data(lp->index_buffer.buffer);
-      mapped_indices = (void *) (indices + lp->index_buffer.offset);
-   }
+   if (info->indexed && lp->index_buffer.buffer)
+      mapped_indices = llvmpipe_resource_data(lp->index_buffer.buffer);
 
-   draw_set_mapped_element_buffer_range(draw, (mapped_indices) ?
-                                        lp->index_buffer.index_size : 0,
-                                        info->index_bias,
-                                        info->min_index,
-                                        info->max_index,
-                                        mapped_indices);
+   draw_set_mapped_index_buffer(draw, mapped_indices);
 
    llvmpipe_prepare_vertex_sampling(lp,
                                     lp->num_vertex_sampler_views,
                                     lp->vertex_sampler_views);
 
    /* draw! */
-   draw_arrays_instanced(draw, info->mode, info->start, info->count,
-         info->start_instance, info->instance_count);
+   draw_vbo(draw, info);
 
    /*
     * unmap vertex/index buffers
@@ -95,7 +87,7 @@ llvmpipe_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       draw_set_mapped_vertex_buffer(draw, i, NULL);
    }
    if (mapped_indices) {
-      draw_set_mapped_element_buffer(draw, 0, 0, NULL);
+      draw_set_mapped_index_buffer(draw, NULL);
    }
    llvmpipe_cleanup_vertex_sampling(lp);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_fence.c b/src/gallium/drivers/llvmpipe/lp_fence.c
index f9805e5d68..3a55e76bc3 100644
--- a/src/gallium/drivers/llvmpipe/lp_fence.c
+++ b/src/gallium/drivers/llvmpipe/lp_fence.c
@@ -44,6 +44,7 @@
 struct lp_fence *
 lp_fence_create(unsigned rank)
 {
+   static int fence_id;
    struct lp_fence *fence = CALLOC_STRUCT(lp_fence);
 
    pipe_reference_init(&fence->reference, 1);
@@ -51,8 +52,12 @@ lp_fence_create(unsigned rank)
    pipe_mutex_init(fence->mutex);
    pipe_condvar_init(fence->signalled);
 
+   fence->id = fence_id++;
    fence->rank = rank;
 
+   if (LP_DEBUG & DEBUG_FENCE)
+      debug_printf("%s %d\n", __FUNCTION__, fence->id);
+
    return fence;
 }
 
@@ -61,6 +66,9 @@ lp_fence_create(unsigned rank)
 void
 lp_fence_destroy(struct lp_fence *fence)
 {
+   if (LP_DEBUG & DEBUG_FENCE)
+      debug_printf("%s %d\n", __FUNCTION__, fence->id);
+
    pipe_mutex_destroy(fence->mutex);
    pipe_condvar_destroy(fence->signalled);
    FREE(fence);
@@ -68,82 +76,49 @@ lp_fence_destroy(struct lp_fence *fence)
 
 
 /**
- * For reference counting.
- * This is a Gallium API function.
- */
-static void
-llvmpipe_fence_reference(struct pipe_screen *screen,
-                         struct pipe_fence_handle **ptr,
-                         struct pipe_fence_handle *fence)
-{
-   struct lp_fence **old = (struct lp_fence **) ptr;
-   struct lp_fence *f = (struct lp_fence *) fence;
-
-   lp_fence_reference(old, f);
-}
-
-
-/**
- * Has the fence been executed/finished?
- * This is a Gallium API function.
- */
-static int
-llvmpipe_fence_signalled(struct pipe_screen *screen,
-                         struct pipe_fence_handle *fence,
-                         unsigned flag)
-{
-   struct lp_fence *f = (struct lp_fence *) fence;
-
-   return f->count == f->rank;
-}
-
-
-/**
- * Wait for the fence to finish.
- * This is a Gallium API function.
- */
-static int
-llvmpipe_fence_finish(struct pipe_screen *screen,
-                      struct pipe_fence_handle *fence_handle,
-                      unsigned flag)
-{
-   struct lp_fence *fence = (struct lp_fence *) fence_handle;
-
-   pipe_mutex_lock(fence->mutex);
-   while (fence->count < fence->rank) {
-      pipe_condvar_wait(fence->signalled, fence->mutex);
-   }
-   pipe_mutex_unlock(fence->mutex);
-
-   return 0;
-}
-
-
-/**
  * Called by the rendering threads to increment the fence counter.
  * When the counter == the rank, the fence is finished.
  */
 void
 lp_fence_signal(struct lp_fence *fence)
 {
+   if (LP_DEBUG & DEBUG_FENCE)
+      debug_printf("%s %d\n", __FUNCTION__, fence->id);
+
    pipe_mutex_lock(fence->mutex);
 
    fence->count++;
    assert(fence->count <= fence->rank);
 
-   LP_DBG(DEBUG_RAST, "%s count=%u rank=%u\n", __FUNCTION__,
-          fence->count, fence->rank);
+   if (LP_DEBUG & DEBUG_FENCE)
+      debug_printf("%s count=%u rank=%u\n", __FUNCTION__,
+                   fence->count, fence->rank);
 
-   pipe_condvar_signal(fence->signalled);
+   /* Wakeup all threads waiting on the mutex:
+    */
+   pipe_condvar_broadcast(fence->signalled);
 
    pipe_mutex_unlock(fence->mutex);
 }
 
+boolean
+lp_fence_signalled(struct lp_fence *f)
+{
+   return f->count == f->rank;
+}
 
 void
-llvmpipe_init_screen_fence_funcs(struct pipe_screen *screen)
+lp_fence_wait(struct lp_fence *f)
 {
-   screen->fence_reference = llvmpipe_fence_reference;
-   screen->fence_signalled = llvmpipe_fence_signalled;
-   screen->fence_finish = llvmpipe_fence_finish;
+   if (LP_DEBUG & DEBUG_FENCE)
+      debug_printf("%s %d\n", __FUNCTION__, f->id);
+
+   pipe_mutex_lock(f->mutex);
+   assert(f->issued);
+   while (f->count < f->rank) {
+      pipe_condvar_wait(f->signalled, f->mutex);
+   }
+   pipe_mutex_unlock(f->mutex);
 }
+
+
diff --git a/src/gallium/drivers/llvmpipe/lp_fence.h b/src/gallium/drivers/llvmpipe/lp_fence.h
index 13358fb99f..3c59118780 100644
--- a/src/gallium/drivers/llvmpipe/lp_fence.h
+++ b/src/gallium/drivers/llvmpipe/lp_fence.h
@@ -41,10 +41,12 @@ struct pipe_screen;
 struct lp_fence
 {
    struct pipe_reference reference;
+   unsigned id;
 
    pipe_mutex mutex;
    pipe_condvar signalled;
 
+   boolean issued;
    unsigned rank;
    unsigned count;
 };
@@ -57,6 +59,11 @@ lp_fence_create(unsigned rank);
 void
 lp_fence_signal(struct lp_fence *fence);
 
+boolean
+lp_fence_signalled(struct lp_fence *fence);
+
+void
+lp_fence_wait(struct lp_fence *fence);
 
 void
 llvmpipe_init_screen_fence_funcs(struct pipe_screen *screen);
@@ -78,5 +85,11 @@ lp_fence_reference(struct lp_fence **ptr,
    *ptr = f;
 }
 
+static INLINE boolean
+lp_fence_issued(const struct lp_fence *fence)
+{
+   return fence->issued;
+}
+
 
 #endif /* LP_FENCE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_flush.c b/src/gallium/drivers/llvmpipe/lp_flush.c
index 845292f4ab..e2c723b7a8 100644
--- a/src/gallium/drivers/llvmpipe/lp_flush.c
+++ b/src/gallium/drivers/llvmpipe/lp_flush.c
@@ -31,6 +31,7 @@
 
 
 #include "pipe/p_defines.h"
+#include "pipe/p_screen.h"
 #include "util/u_string.h"
 #include "draw/draw_context.h"
 #include "lp_flush.h"
@@ -45,14 +46,15 @@
 void
 llvmpipe_flush( struct pipe_context *pipe,
                 unsigned flags,
-                struct pipe_fence_handle **fence )
+                struct pipe_fence_handle **fence,
+                const char *reason)
 {
    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
 
    draw_flush(llvmpipe->draw);
 
    /* ask the setup module to flush */
-   lp_setup_flush(llvmpipe->setup, flags, fence);
+   lp_setup_flush(llvmpipe->setup, flags, fence, reason);
 
    /* Enable to dump BMPs of the color/depth buffers each frame */
    if (0) {
@@ -76,6 +78,17 @@ llvmpipe_flush( struct pipe_context *pipe,
    }
 }
 
+void
+llvmpipe_finish( struct pipe_context *pipe,
+                 const char *reason )
+{
+   struct pipe_fence_handle *fence = NULL;
+   llvmpipe_flush(pipe, 0, &fence, reason);
+   if (fence) {
+      pipe->screen->fence_finish(pipe->screen, fence, 0);
+      pipe->screen->fence_reference(pipe->screen, &fence, NULL);
+   }
+}
 
 /**
  * Flush context if necessary.
@@ -93,7 +106,8 @@ llvmpipe_flush_resource(struct pipe_context *pipe,
                         unsigned flush_flags,
                         boolean read_only,
                         boolean cpu_access,
-                        boolean do_not_block)
+                        boolean do_not_block,
+                        const char *reason)
 {
    unsigned referenced;
 
@@ -106,31 +120,16 @@ llvmpipe_flush_resource(struct pipe_context *pipe,
          /*
           * Flush and wait.
           */
-
-         struct pipe_fence_handle *fence = NULL;
-
          if (do_not_block)
             return FALSE;
 
-         /*
-          * Do the unswizzling in parallel.
-          *
-          * XXX: Don't abuse the PIPE_FLUSH_FRAME flag for this.
-          */
-         flush_flags |= PIPE_FLUSH_FRAME;
-
-         llvmpipe_flush(pipe, flush_flags, &fence);
-
-         if (fence) {
-            pipe->screen->fence_finish(pipe->screen, fence, 0);
-            pipe->screen->fence_reference(pipe->screen, &fence, NULL);
-         }
+         llvmpipe_finish(pipe, reason);
       } else {
          /*
           * Just flush.
           */
 
-         llvmpipe_flush(pipe, flush_flags, NULL);
+         llvmpipe_flush(pipe, flush_flags, NULL, reason);
       }
    }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_flush.h b/src/gallium/drivers/llvmpipe/lp_flush.h
index 7b605681a9..bb538b2bd8 100644
--- a/src/gallium/drivers/llvmpipe/lp_flush.h
+++ b/src/gallium/drivers/llvmpipe/lp_flush.h
@@ -34,8 +34,14 @@ struct pipe_context;
 struct pipe_fence_handle;
 
 void
-llvmpipe_flush(struct pipe_context *pipe, unsigned flags,
-               struct pipe_fence_handle **fence);
+llvmpipe_flush(struct pipe_context *pipe,
+               unsigned flags,
+               struct pipe_fence_handle **fence,
+               const char *reason);
+
+void
+llvmpipe_finish( struct pipe_context *pipe,
+                 const char *reason );
 
 boolean
 llvmpipe_flush_resource(struct pipe_context *pipe,
@@ -45,6 +51,7 @@ llvmpipe_flush_resource(struct pipe_context *pipe,
                         unsigned flush_flags,
                         boolean read_only,
                         boolean cpu_access,
-                        boolean do_not_block);
+                        boolean do_not_block,
+                        const char *reason);
 
 #endif
diff --git a/src/gallium/drivers/llvmpipe/lp_perf.c b/src/gallium/drivers/llvmpipe/lp_perf.c
index 083e7e30a5..e22532f25c 100644
--- a/src/gallium/drivers/llvmpipe/lp_perf.c
+++ b/src/gallium/drivers/llvmpipe/lp_perf.c
@@ -46,7 +46,7 @@ lp_print_counters(void)
 {
    if (LP_DEBUG & DEBUG_COUNTERS) {
       unsigned total_64, total_16, total_4;
-      float p1, p2, p3, p4;
+      float p1, p2, p3, p5, p6;
 
       debug_printf("llvmpipe: nr_triangles:                 %9u\n", lp_count.nr_tris);
       debug_printf("llvmpipe: nr_culled_triangles:          %9u\n", lp_count.nr_culled_tris);
@@ -58,11 +58,15 @@ lp_print_counters(void)
       p1 = 100.0 * (float) lp_count.nr_empty_64 / (float) total_64;
       p2 = 100.0 * (float) lp_count.nr_fully_covered_64 / (float) total_64;
       p3 = 100.0 * (float) lp_count.nr_partially_covered_64 / (float) total_64;
-      p4 = 100.0 * (float) lp_count.nr_shade_opaque_64 / (float) total_64;
+      p5 = 100.0 * (float) lp_count.nr_shade_opaque_64 / (float) total_64;
+      p6 = 100.0 * (float) lp_count.nr_shade_64 / (float) total_64;
 
       debug_printf("llvmpipe: nr_64x64:                     %9u\n", total_64);
       debug_printf("llvmpipe:   nr_fully_covered_64x64:     %9u (%3.0f%% of %u)\n", lp_count.nr_fully_covered_64, p2, total_64);
-      debug_printf("llvmpipe:     nr_shade_opaque_64x64:    %9u (%3.0f%% of %u)\n", lp_count.nr_shade_opaque_64, p4, total_64);
+      debug_printf("llvmpipe:     nr_shade_opaque_64x64:    %9u (%3.0f%% of %u)\n", lp_count.nr_shade_opaque_64, p5, total_64);
+      debug_printf("llvmpipe:        nr_pure_shade_opaque:  %9u (%3.0f%% of %u)\n", lp_count.nr_pure_shade_opaque_64, 0.0, lp_count.nr_shade_opaque_64);
+      debug_printf("llvmpipe:     nr_shade_64x64:           %9u (%3.0f%% of %u)\n", lp_count.nr_shade_64, p6, total_64);
+      debug_printf("llvmpipe:        nr_pure_shade:         %9u (%3.0f%% of %u)\n", lp_count.nr_pure_shade_64, 0.0, lp_count.nr_shade_64);
       debug_printf("llvmpipe:   nr_partially_covered_64x64: %9u (%3.0f%% of %u)\n", lp_count.nr_partially_covered_64, p3, total_64);
       debug_printf("llvmpipe:   nr_empty_64x64:             %9u (%3.0f%% of %u)\n", lp_count.nr_empty_64, p1, total_64);
 
@@ -79,12 +83,17 @@ lp_print_counters(void)
       debug_printf("llvmpipe:   nr_partially_covered_16x16: %9u (%3.0f%% of %u)\n", lp_count.nr_partially_covered_16, p3, total_16);
       debug_printf("llvmpipe:   nr_empty_16x16:             %9u (%3.0f%% of %u)\n", lp_count.nr_empty_16, p1, total_16);
 
-      total_4 = (lp_count.nr_empty_4 + lp_count.nr_non_empty_4);
+      total_4 = (lp_count.nr_empty_4 +
+                 lp_count.nr_fully_covered_4 +
+                 lp_count.nr_partially_covered_4);
 
       p1 = 100.0 * (float) lp_count.nr_empty_4 / (float) total_4;
-      p2 = 100.0 * (float) lp_count.nr_non_empty_4 / (float) total_4;
+      p2 = 100.0 * (float) lp_count.nr_fully_covered_4 / (float) total_4;
+      p3 = 100.0 * (float) lp_count.nr_partially_covered_4 / (float) total_4;
 
-      debug_printf("llvmpipe: nr_4x4:                       %9u\n", total_4);
+      debug_printf("llvmpipe: nr_tri_4x4:                   %9u\n", total_4);
+      debug_printf("llvmpipe:   nr_fully_covered_4x4:       %9u (%3.0f%% of %u)\n", lp_count.nr_fully_covered_4, p2, total_4);
+      debug_printf("llvmpipe:   nr_partially_covered_4x4:   %9u (%3.0f%% of %u)\n", lp_count.nr_partially_covered_4, p3, total_4);
       debug_printf("llvmpipe:   nr_empty_4x4:               %9u (%3.0f%% of %u)\n", lp_count.nr_empty_4, p1, total_4);
       debug_printf("llvmpipe:   nr_non_empty_4x4:           %9u (%3.0f%% of %u)\n", lp_count.nr_non_empty_4, p2, total_4);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_perf.h b/src/gallium/drivers/llvmpipe/lp_perf.h
index 4774f64550..c28652fc30 100644
--- a/src/gallium/drivers/llvmpipe/lp_perf.h
+++ b/src/gallium/drivers/llvmpipe/lp_perf.h
@@ -44,11 +44,16 @@ struct lp_counters
    unsigned nr_empty_64;
    unsigned nr_fully_covered_64;
    unsigned nr_partially_covered_64;
+   unsigned nr_pure_shade_opaque_64;
+   unsigned nr_pure_shade_64;
+   unsigned nr_shade_64;
    unsigned nr_shade_opaque_64;
    unsigned nr_empty_16;
    unsigned nr_fully_covered_16;
    unsigned nr_partially_covered_16;
    unsigned nr_empty_4;
+   unsigned nr_fully_covered_4;
+   unsigned nr_partially_covered_4;
    unsigned nr_non_empty_4;
    unsigned nr_llvm_compiles;
    int64_t llvm_compile_time;  /**< total, in microseconds */
@@ -66,9 +71,11 @@ extern struct lp_counters lp_count;
 #ifdef DEBUG
 #define LP_COUNT(counter) lp_count.counter++
 #define LP_COUNT_ADD(counter, incr)  lp_count.counter += (incr)
+#define LP_COUNT_GET(counter) (lp_count.counter)
 #else
 #define LP_COUNT(counter)
 #define LP_COUNT_ADD(counter, incr) (void) incr
+#define LP_COUNT_GET(counter) 0
 #endif
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_query.c b/src/gallium/drivers/llvmpipe/lp_query.c
index 02eeaf6487..67fd797af2 100644
--- a/src/gallium/drivers/llvmpipe/lp_query.c
+++ b/src/gallium/drivers/llvmpipe/lp_query.c
@@ -35,9 +35,8 @@
 #include "util/u_memory.h"
 #include "lp_context.h"
 #include "lp_flush.h"
+#include "lp_fence.h"
 #include "lp_query.h"
-#include "lp_rast.h"
-#include "lp_rast_priv.h"
 #include "lp_state.h"
 
 
@@ -69,12 +68,7 @@ llvmpipe_destroy_query(struct pipe_context *pipe, struct pipe_query *q)
    struct llvmpipe_query *pq = llvmpipe_query(q);
    /* query might still be in process if we never waited for the result */
    if (!pq->done) {
-     struct pipe_fence_handle *fence = NULL;
-     llvmpipe_flush(pipe, 0, &fence);
-     if (fence) {
-         pipe->screen->fence_finish(pipe->screen, fence, 0);
-         pipe->screen->fence_reference(pipe->screen, &fence, NULL);
-      }
+      llvmpipe_finish(pipe, __FUNCTION__);
    }
 
    pipe_mutex_destroy(pq->mutex);
@@ -93,16 +87,11 @@ llvmpipe_get_query_result(struct pipe_context *pipe,
 
    if (!pq->done) {
       if (wait) {
-         struct pipe_fence_handle *fence = NULL;
-         llvmpipe_flush(pipe, 0, &fence);
-         if (fence) {
-            pipe->screen->fence_finish(pipe->screen, fence, 0);
-            pipe->screen->fence_reference(pipe->screen, &fence, NULL);
-         }
+         llvmpipe_finish(pipe, __FUNCTION__);
       }
       /* this is a bit inconsequent but should be ok */
       else {
-         llvmpipe_flush(pipe, 0, NULL);
+         llvmpipe_flush(pipe, 0, NULL, __FUNCTION__);
       }
    }
 
@@ -125,12 +114,7 @@ llvmpipe_begin_query(struct pipe_context *pipe, struct pipe_query *q)
     * frame of rendering.
     */
    if (pq->binned) {
-      struct pipe_fence_handle *fence;
-      llvmpipe_flush(pipe, 0, &fence);
-      if (fence) {
-         pipe->screen->fence_finish(pipe->screen, fence, 0);
-         pipe->screen->fence_reference(pipe->screen, &fence, NULL);
-      }
+      llvmpipe_finish(pipe, __FUNCTION__);
    }
 
    lp_setup_begin_query(llvmpipe->setup, pq);
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c
index 3215d0f652..b1c306bbe9 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@ -316,43 +316,6 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task,
 }
 
 
-/**
- * Load tile color from the framebuffer surface.
- * This is a bin command called during bin processing.
- */
-#if 0
-void
-lp_rast_load_color(struct lp_rasterizer_task *task,
-                   const union lp_rast_cmd_arg arg)
-{
-   struct lp_rasterizer *rast = task->rast;
-   unsigned buf;
-   enum lp_texture_usage usage;
-
-   LP_DBG(DEBUG_RAST, "%s at %u, %u\n", __FUNCTION__, x, y);
-
-   if (scene->has_color_clear)
-      usage = LP_TEX_USAGE_WRITE_ALL;
-   else
-      usage = LP_TEX_USAGE_READ_WRITE;
-
-   /* Get pointers to color tile(s).
-    * This will convert linear data to tiled if needed.
-    */
-   for (buf = 0; buf < rast->state.nr_cbufs; buf++) {
-      struct pipe_surface *cbuf = rast->curr_scene->fb.cbufs[buf];
-      struct llvmpipe_texture *lpt;
-      assert(cbuf);
-      lpt = llvmpipe_texture(cbuf->texture);
-      task->color_tiles[buf] = llvmpipe_get_texture_tile(lpt,
-                                                         cbuf->face + cbuf->zslice,
-                                                         cbuf->level,
-                                                         usage,
-                                                         task->x, task->y);
-      assert(task->color_tiles[buf]);
-   }
-}
-#endif
 
 
 /**
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
index 44319a0ad6..b4564ef33b 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -120,7 +120,7 @@ struct lp_rast_triangle {
    float v[3][2];
 #endif
 
-   struct lp_rast_plane plane[7]; /* NOTE: may allocate fewer planes */
+   struct lp_rast_plane plane[8]; /* NOTE: may allocate fewer planes */
 };
 
 
@@ -236,6 +236,8 @@ void lp_rast_triangle_6( struct lp_rasterizer_task *,
                          const union lp_rast_cmd_arg );
 void lp_rast_triangle_7( struct lp_rasterizer_task *, 
                          const union lp_rast_cmd_arg );
+void lp_rast_triangle_8( struct lp_rasterizer_task *, 
+                         const union lp_rast_cmd_arg );
 
 void lp_rast_shade_tile( struct lp_rasterizer_task *,
                          const union lp_rast_cmd_arg );
@@ -256,5 +258,9 @@ void lp_rast_begin_query(struct lp_rasterizer_task *,
 void lp_rast_end_query(struct lp_rasterizer_task *,
                        const union lp_rast_cmd_arg );
 
+void
+lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
+                      const union lp_rast_cmd_arg arg);
+
 
 #endif
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index 980c18c024..dbaa8e023a 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -67,7 +67,7 @@ block_full_16(struct lp_rasterizer_task *task,
 	 block_full_4(task, tri, x + ix, y + iy);
 }
 
-
+#if !defined(PIPE_ARCH_SSE)
 static INLINE unsigned
 build_mask(int c, int dcdx, int dcdy)
 {
@@ -98,6 +98,7 @@ build_mask(int c, int dcdx, int dcdy)
    return mask;
 }
 
+
 static INLINE unsigned
 build_mask_linear(int c, int dcdx, int dcdy)
 {
@@ -129,6 +130,137 @@ build_mask_linear(int c, int dcdx, int dcdy)
 }
 
 
+static INLINE void
+build_masks(int c, 
+	    int cdiff,
+	    int dcdx,
+	    int dcdy,
+	    unsigned *outmask,
+	    unsigned *partmask)
+{
+   *outmask |= build_mask_linear(c, dcdx, dcdy);
+   *partmask |= build_mask_linear(c + cdiff, dcdx, dcdy);
+}
+
+#else
+#include <emmintrin.h>
+#include "util/u_sse.h"
+
+
+static INLINE void
+build_masks(int c, 
+	    int cdiff,
+	    int dcdx,
+	    int dcdy,
+	    unsigned *outmask,
+	    unsigned *partmask)
+{
+   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
+   __m128i xdcdy = _mm_set1_epi32(dcdy);
+
+   /* Get values across the quad
+    */
+   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
+   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
+   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);
+
+   {
+      __m128i cstep01, cstep23, result;
+
+      cstep01 = _mm_packs_epi32(cstep0, cstep1);
+      cstep23 = _mm_packs_epi32(cstep2, cstep3);
+      result = _mm_packs_epi16(cstep01, cstep23);
+
+      *outmask |= _mm_movemask_epi8(result);
+   }
+
+
+   {
+      __m128i cio4 = _mm_set1_epi32(cdiff);
+      __m128i cstep01, cstep23, result;
+
+      cstep0 = _mm_add_epi32(cstep0, cio4);
+      cstep1 = _mm_add_epi32(cstep1, cio4);
+      cstep2 = _mm_add_epi32(cstep2, cio4);
+      cstep3 = _mm_add_epi32(cstep3, cio4);
+
+      cstep01 = _mm_packs_epi32(cstep0, cstep1);
+      cstep23 = _mm_packs_epi32(cstep2, cstep3);
+      result = _mm_packs_epi16(cstep01, cstep23);
+
+      *partmask |= _mm_movemask_epi8(result);
+   }
+}
+
+
+static INLINE unsigned
+build_mask_linear(int c, int dcdx, int dcdy)
+{
+   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
+   __m128i xdcdy = _mm_set1_epi32(dcdy);
+
+   /* Get values across the quad
+    */
+   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
+   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
+   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);
+
+   /* pack pairs of results into epi16
+    */
+   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
+   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
+
+   /* pack into epi8, preserving sign bits
+    */
+   __m128i result = _mm_packs_epi16(cstep01, cstep23);
+
+   /* extract sign bits to create mask
+    */
+   return _mm_movemask_epi8(result);
+}
+
+static INLINE unsigned
+build_mask(int c, int dcdx, int dcdy)
+{
+   __m128i step = _mm_setr_epi32(0, dcdx, dcdy, dcdx + dcdy);
+   __m128i c0 = _mm_set1_epi32(c);
+
+   /* Get values across the quad
+    */
+   __m128i cstep0 = _mm_add_epi32(c0, step);
+
+   /* Scale up step for moving between quads.
+    */
+   __m128i step4 = _mm_add_epi32(step, step);
+
+   /* Get values for the remaining quads:
+    */
+   __m128i cstep1 = _mm_add_epi32(cstep0, 
+				  _mm_shuffle_epi32(step4, _MM_SHUFFLE(1,1,1,1)));
+   __m128i cstep2 = _mm_add_epi32(cstep0,
+				  _mm_shuffle_epi32(step4, _MM_SHUFFLE(2,2,2,2)));
+   __m128i cstep3 = _mm_add_epi32(cstep2,
+				  _mm_shuffle_epi32(step4, _MM_SHUFFLE(1,1,1,1)));
+
+   /* pack pairs of results into epi16
+    */
+   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
+   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
+
+   /* pack into epi8, preserving sign bits
+    */
+   __m128i result = _mm_packs_epi16(cstep01, cstep23);
+
+   /* extract sign bits to create mask
+    */
+   return _mm_movemask_epi8(result);
+}
+
+#endif
+
+
+
+
 #define TAG(x) x##_1
 #define NR_PLANES 1
 #include "lp_rast_tri_tmp.h"
@@ -157,3 +289,92 @@ build_mask_linear(int c, int dcdx, int dcdy)
 #define NR_PLANES 7
 #include "lp_rast_tri_tmp.h"
 
+#define TAG(x) x##_8
+#define NR_PLANES 8
+#include "lp_rast_tri_tmp.h"
+
+
+/* Special case for 3 plane triangle which is contained entirely
+ * within a 16x16 block.
+ */
+void
+lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
+                      const union lp_rast_cmd_arg arg)
+{
+   const struct lp_rast_triangle *tri = arg.triangle.tri;
+   const struct lp_rast_plane *plane = tri->plane;
+   unsigned mask = arg.triangle.plane_mask;
+   const int x = task->x + (mask & 0xf) * 16;
+   const int y = task->y + (mask >> 4) * 16;
+   unsigned outmask, inmask, partmask, partial_mask;
+   unsigned j;
+   int c[3];
+
+   outmask = 0;                 /* outside one or more trivial reject planes */
+   partmask = 0;                /* outside one or more trivial accept planes */
+
+   for (j = 0; j < 3; j++) {
+      c[j] = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x;
+
+      {
+	 const int dcdx = -plane[j].dcdx * 4;
+	 const int dcdy = plane[j].dcdy * 4;
+	 const int cox = plane[j].eo * 4;
+	 const int cio = plane[j].ei * 4 - 1;
+
+	 build_masks(c[j] + cox,
+		     cio - cox,
+		     dcdx, dcdy, 
+		     &outmask,   /* sign bits from c[i][0..15] + cox */
+		     &partmask); /* sign bits from c[i][0..15] + cio */
+      }
+   }
+
+   if (outmask == 0xffff)
+      return;
+
+   /* Mask of sub-blocks which are inside all trivial accept planes:
+    */
+   inmask = ~partmask & 0xffff;
+
+   /* Mask of sub-blocks which are inside all trivial reject planes,
+    * but outside at least one trivial accept plane:
+    */
+   partial_mask = partmask & ~outmask;
+
+   assert((partial_mask & inmask) == 0);
+
+   /* Iterate over partials:
+    */
+   while (partial_mask) {
+      int i = ffs(partial_mask) - 1;
+      int ix = (i & 3) * 4;
+      int iy = (i >> 2) * 4;
+      int px = x + ix;
+      int py = y + iy; 
+      int cx[3];
+
+      partial_mask &= ~(1 << i);
+
+      for (j = 0; j < 3; j++)
+         cx[j] = (c[j] 
+		  - plane[j].dcdx * ix
+		  + plane[j].dcdy * iy);
+
+      do_block_4_3(task, tri, plane, px, py, cx);
+   }
+
+   /* Iterate over fulls: 
+    */
+   while (inmask) {
+      int i = ffs(inmask) - 1;
+      int ix = (i & 3) * 4;
+      int iy = (i >> 2) * 4;
+      int px = x + ix;
+      int py = y + iy; 
+
+      inmask &= ~(1 << i);
+
+      block_full_4(task, tri, px, py);
+   }
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
index 43f72d8ca8..99a0bae45d 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
@@ -32,7 +32,7 @@
 
 
 /**
- * Prototype for a 7 plane rasterizer function.  Will codegenerate
+ * Prototype for a 8 plane rasterizer function.  Will codegenerate
  * several of these.
  *
  * XXX: Varients for more/fewer planes.
@@ -81,11 +81,14 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
    for (j = 0; j < NR_PLANES; j++) {
       const int dcdx = -plane[j].dcdx * 4;
       const int dcdy = plane[j].dcdy * 4;
-      const int cox = c[j] + plane[j].eo * 4;
-      const int cio = c[j] + plane[j].ei * 4 - 1;
-
-      outmask |= build_mask_linear(cox, dcdx, dcdy);
-      partmask |= build_mask_linear(cio, dcdx, dcdy);
+      const int cox = plane[j].eo * 4;
+      const int cio = plane[j].ei * 4 - 1;
+
+      build_masks(c[j] + cox,
+		  cio - cox,
+		  dcdx, dcdy, 
+		  &outmask,   /* sign bits from c[i][0..15] + cox */
+		  &partmask); /* sign bits from c[i][0..15] + cio */
    }
 
    if (outmask == 0xffff)
@@ -102,6 +105,8 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
 
    assert((partial_mask & inmask) == 0);
 
+   LP_COUNT_ADD(nr_empty_4, util_bitcount(0xffff & ~(partial_mask | inmask)));
+
    /* Iterate over partials:
     */
    while (partial_mask) {
@@ -114,6 +119,8 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
 
       partial_mask &= ~(1 << i);
 
+      LP_COUNT(nr_partially_covered_4);
+
       for (j = 0; j < NR_PLANES; j++)
          cx[j] = (c[j] 
 		  - plane[j].dcdx * ix
@@ -133,6 +140,7 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
 
       inmask &= ~(1 << i);
 
+      LP_COUNT(nr_fully_covered_4);
       block_full_4(task, tri, px, py);
    }
 }
@@ -166,11 +174,14 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
       {
 	 const int dcdx = -plane[j].dcdx * 16;
 	 const int dcdy = plane[j].dcdy * 16;
-	 const int cox = c[j] + plane[j].eo * 16;
-	 const int cio = c[j] + plane[j].ei * 16 - 1;
-
-	 outmask |= build_mask_linear(cox, dcdx, dcdy);
-	 partmask |= build_mask_linear(cio, dcdx, dcdy);
+	 const int cox = plane[j].eo * 16;
+	 const int cio = plane[j].ei * 16 - 1;
+
+	 build_masks(c[j] + cox,
+		     cio - cox,
+		     dcdx, dcdy, 
+		     &outmask,   /* sign bits from c[i][0..15] + cox */
+		     &partmask); /* sign bits from c[i][0..15] + cio */
       }
 
       j++;
@@ -190,6 +201,8 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
 
    assert((partial_mask & inmask) == 0);
 
+   LP_COUNT_ADD(nr_empty_16, util_bitcount(0xffff & ~(partial_mask | inmask)));
+
    /* Iterate over partials:
     */
    while (partial_mask) {
diff --git a/src/gallium/drivers/llvmpipe/lp_scene.c b/src/gallium/drivers/llvmpipe/lp_scene.c
index f88a759fe7..15a09b7100 100644
--- a/src/gallium/drivers/llvmpipe/lp_scene.c
+++ b/src/gallium/drivers/llvmpipe/lp_scene.c
@@ -163,12 +163,15 @@ lp_scene_reset(struct lp_scene *scene )
 
    /* Free all but last binner command lists:
     */
-   for (i = 0; i < TILES_X; i++) {
-      for (j = 0; j < TILES_Y; j++) {
+   for (i = 0; i < scene->tiles_x; i++) {
+      for (j = 0; j < scene->tiles_y; j++) {
          lp_scene_bin_reset(scene, i, j);
       }
    }
 
+   /* If there are any bins which weren't cleared by the loop above,
+    * they will be caught (on debug builds at least) by this assert:
+    */
    assert(lp_scene_is_empty(scene));
 
    /* Free all but last binned data block:
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 167cb2ee2e..1e65a91fc6 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -61,6 +61,8 @@ static const struct debug_named_value lp_debug_flags[] = {
    { "show_tiles",    DEBUG_SHOW_TILES, NULL },
    { "show_subtiles", DEBUG_SHOW_SUBTILES, NULL },
    { "counters", DEBUG_COUNTERS, NULL },
+   { "scene", DEBUG_SCENE, NULL },
+   { "fence", DEBUG_FENCE, NULL },
    DEBUG_NAMED_VALUE_END
 };
 #endif
@@ -87,7 +89,14 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
       return PIPE_MAX_SAMPLERS;
    case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS:
-      return PIPE_MAX_VERTEX_SAMPLERS;
+      /* At this time, the draw module and llvmpipe driver only
+       * support vertex shader texture lookups when LLVM is enabled in
+       * the draw module.
+       */
+      if (debug_get_bool_option("DRAW_USE_LLVM", TRUE))
+         return PIPE_MAX_VERTEX_SAMPLERS;
+      else
+         return 0;
    case PIPE_CAP_MAX_COMBINED_SAMPLERS:
       return PIPE_MAX_SAMPLERS + PIPE_MAX_VERTEX_SAMPLERS;
    case PIPE_CAP_NPOT_TEXTURES:
@@ -230,6 +239,7 @@ llvmpipe_is_format_supported( struct pipe_screen *_screen,
    assert(target == PIPE_BUFFER ||
           target == PIPE_TEXTURE_1D ||
           target == PIPE_TEXTURE_2D ||
+          target == PIPE_TEXTURE_RECT ||
           target == PIPE_TEXTURE_3D ||
           target == PIPE_TEXTURE_CUBE);
 
@@ -314,6 +324,51 @@ llvmpipe_destroy_screen( struct pipe_screen *_screen )
 
 
 
+
+/**
+ * Fence reference counting.
+ */
+static void
+llvmpipe_fence_reference(struct pipe_screen *screen,
+                         struct pipe_fence_handle **ptr,
+                         struct pipe_fence_handle *fence)
+{
+   struct lp_fence **old = (struct lp_fence **) ptr;
+   struct lp_fence *f = (struct lp_fence *) fence;
+
+   lp_fence_reference(old, f);
+}
+
+
+/**
+ * Has the fence been executed/finished?
+ */
+static int
+llvmpipe_fence_signalled(struct pipe_screen *screen,
+                         struct pipe_fence_handle *fence,
+                         unsigned flag)
+{
+   struct lp_fence *f = (struct lp_fence *) fence;
+   return lp_fence_signalled(f);
+}
+
+
+/**
+ * Wait for the fence to finish.
+ */
+static int
+llvmpipe_fence_finish(struct pipe_screen *screen,
+                      struct pipe_fence_handle *fence_handle,
+                      unsigned flag)
+{
+   struct lp_fence *f = (struct lp_fence *) fence_handle;
+
+   lp_fence_wait(f);
+   return 0;
+}
+
+
+
 /**
  * Create a new pipe_screen object
  * Note: we're not presently subclassing pipe_screen (no llvmpipe_screen).
@@ -351,9 +406,11 @@ llvmpipe_create_screen(struct sw_winsys *winsys)
 
    screen->base.context_create = llvmpipe_create_context;
    screen->base.flush_frontbuffer = llvmpipe_flush_frontbuffer;
+   screen->base.fence_reference = llvmpipe_fence_reference;
+   screen->base.fence_signalled = llvmpipe_fence_signalled;
+   screen->base.fence_finish = llvmpipe_fence_finish;
 
    llvmpipe_init_screen_resource_funcs(&screen->base);
-   llvmpipe_init_screen_fence_funcs(&screen->base);
 
    lp_jit_screen_init(screen);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index 556e571585..3da9097154 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -275,9 +275,10 @@ set_scene_state( struct lp_setup_context *setup,
 void
 lp_setup_flush( struct lp_setup_context *setup,
                 unsigned flags,
-                struct pipe_fence_handle **fence)
+                struct pipe_fence_handle **fence,
+                const char *reason)
 {
-   LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
+   LP_DBG(DEBUG_SETUP, "%s %s\n", __FUNCTION__, reason);
 
    if (setup->scene) {
       if (fence) {
@@ -287,6 +288,8 @@ lp_setup_flush( struct lp_setup_context *setup,
          *fence = lp_setup_fence( setup );
       }
 
+      if (setup->scene->fence)
+         setup->scene->fence->issued = TRUE;
    }
 
    set_scene_state( setup, SETUP_FLUSHED );
@@ -312,6 +315,11 @@ lp_setup_bind_framebuffer( struct lp_setup_context *setup,
     * scene.
     */
    util_copy_framebuffer_state(&setup->fb, fb);
+   setup->framebuffer.x0 = 0;
+   setup->framebuffer.y0 = 0;
+   setup->framebuffer.x1 = fb->width-1;
+   setup->framebuffer.y1 = fb->height-1;
+   setup->dirty |= LP_SETUP_NEW_SCISSOR;
 }
 
 
@@ -469,11 +477,35 @@ lp_setup_set_triangle_state( struct lp_setup_context *setup,
    setup->ccw_is_frontface = ccw_is_frontface;
    setup->cullmode = cull_mode;
    setup->triangle = first_triangle;
-   setup->scissor_test = scissor;
    setup->pixel_offset = gl_rasterization_rules ? 0.5f : 0.0f;
+
+   if (setup->scissor_test != scissor) {
+      setup->dirty |= LP_SETUP_NEW_SCISSOR;
+      setup->scissor_test = scissor;
+   }
 }
 
+void 
+lp_setup_set_line_state( struct lp_setup_context *setup,
+			 float line_width)
+{
+   LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
 
+   setup->line_width = line_width;
+}
+
+void 
+lp_setup_set_point_state( struct lp_setup_context *setup,
+                          float point_size,                          
+                          boolean point_size_per_vertex,
+                          uint sprite)
+{
+   LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
+
+   setup->point_size = point_size;
+   setup->sprite = sprite;
+   setup->point_size_per_vertex = point_size_per_vertex;
+}
 
 void
 lp_setup_set_fs_inputs( struct lp_setup_context *setup,
@@ -559,10 +591,11 @@ lp_setup_set_scissor( struct lp_setup_context *setup,
 
    assert(scissor);
 
-   if (memcmp(&setup->scissor.current, scissor, sizeof(*scissor)) != 0) {
-      setup->scissor.current = *scissor; /* struct copy */
-      setup->dirty |= LP_SETUP_NEW_SCISSOR;
-   }
+   setup->scissor.x0 = scissor->minx;
+   setup->scissor.x1 = scissor->maxx-1;
+   setup->scissor.y0 = scissor->miny;
+   setup->scissor.y1 = scissor->maxy-1;
+   setup->dirty |= LP_SETUP_NEW_SCISSOR;
 }
 
 
@@ -713,6 +746,12 @@ lp_setup_update_state( struct lp_setup_context *setup )
     */
    {
       struct llvmpipe_context *lp = llvmpipe_context(scene->pipe);
+
+      /* Will probably need to move this somewhere else, just need  
+       * to know about vertex shader point size attribute.
+       */
+      setup->psize = lp->psize_slot;
+
       if (lp->dirty) {
          llvmpipe_update_derived(lp);
       }
@@ -806,6 +845,14 @@ lp_setup_update_state( struct lp_setup_context *setup )
       }
    }
 
+   if (setup->dirty & LP_SETUP_NEW_SCISSOR) {
+      setup->draw_region = setup->framebuffer;
+      if (setup->scissor_test) {
+         u_rect_possible_intersection(&setup->scissor,
+                                      &setup->draw_region);
+      }
+   }
+                                      
    setup->dirty = 0;
 
    assert(setup->fs.stored);
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.h b/src/gallium/drivers/llvmpipe/lp_setup.h
index 73b1c85325..821ebb1087 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.h
+++ b/src/gallium/drivers/llvmpipe/lp_setup.h
@@ -85,7 +85,8 @@ lp_setup_fence( struct lp_setup_context *setup );
 void
 lp_setup_flush( struct lp_setup_context *setup,
                 unsigned flags,
-                struct pipe_fence_handle **fence);
+                struct pipe_fence_handle **fence,
+                const char *reason);
 
 
 void
@@ -99,6 +100,16 @@ lp_setup_set_triangle_state( struct lp_setup_context *setup,
                              boolean scissor,
                              boolean gl_rasterization_rules );
 
+void 
+lp_setup_set_line_state( struct lp_setup_context *setup,
+                         float line_width);
+
+void 
+lp_setup_set_point_state( struct lp_setup_context *setup,
+                          float point_size,                          
+                          boolean point_size_per_vertex,
+                          uint sprite);
+
 void
 lp_setup_set_fs_inputs( struct lp_setup_context *setup,
                         const struct lp_shader_input *interp,
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_coef.c b/src/gallium/drivers/llvmpipe/lp_setup_coef.c
new file mode 100644
index 0000000000..95e3e8fffe
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_setup_coef.c
@@ -0,0 +1,258 @@
+/**************************************************************************
+ *
+ * Copyright 2010, VMware.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/*
+ * Binning code for triangles
+ */
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "lp_perf.h"
+#include "lp_setup_context.h"
+#include "lp_setup_coef.h"
+#include "lp_rast.h"
+#include "lp_state_fs.h"
+
+#if !defined(PIPE_ARCH_SSE)
+
+/**
+ * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
+ */
+static void constant_coef( struct lp_rast_shader_inputs *inputs,
+                           unsigned slot,
+			   const float value,
+                           unsigned i )
+{
+   inputs->a0[slot][i] = value;
+   inputs->dadx[slot][i] = 0.0f;
+   inputs->dady[slot][i] = 0.0f;
+}
+
+
+
+static void linear_coef( struct lp_rast_shader_inputs *inputs,
+                         const struct lp_tri_info *info,
+                         unsigned slot,
+                         unsigned vert_attr,
+                         unsigned i)
+{
+   float a0 = info->v0[vert_attr][i];
+   float a1 = info->v1[vert_attr][i];
+   float a2 = info->v2[vert_attr][i];
+
+   float da01 = a0 - a1;
+   float da20 = a2 - a0;
+   float dadx = (da01 * info->dy20_ooa - info->dy01_ooa * da20);
+   float dady = (da20 * info->dx01_ooa - info->dx20_ooa * da01);
+
+   inputs->dadx[slot][i] = dadx;
+   inputs->dady[slot][i] = dady;
+
+   /* calculate a0 as the value which would be sampled for the
+    * fragment at (0,0), taking into account that we want to sample at
+    * pixel centers, in other words (0.5, 0.5).
+    *
+    * this is neat but unfortunately not a good way to do things for
+    * triangles with very large values of dadx or dady as it will
+    * result in the subtraction and re-addition from a0 of a very
+    * large number, which means we'll end up loosing a lot of the
+    * fractional bits and precision from a0.  the way to fix this is
+    * to define a0 as the sample at a pixel center somewhere near vmin
+    * instead - i'll switch to this later.
+    */
+   inputs->a0[slot][i] = a0 - (dadx * info->x0_center +
+				   dady * info->y0_center);
+}
+
+
+/**
+ * Compute a0, dadx and dady for a perspective-corrected interpolant,
+ * for a triangle.
+ * We basically multiply the vertex value by 1/w before computing
+ * the plane coefficients (a0, dadx, dady).
+ * Later, when we compute the value at a particular fragment position we'll
+ * divide the interpolated value by the interpolated W at that fragment.
+ */
+static void perspective_coef( struct lp_rast_shader_inputs *inputs,
+                              const struct lp_tri_info *info,
+                              unsigned slot,
+			      unsigned vert_attr,
+                              unsigned i)
+{
+   /* premultiply by 1/w  (v[0][3] is always 1/w):
+    */
+   float a0 = info->v0[vert_attr][i] * info->v0[0][3];
+   float a1 = info->v1[vert_attr][i] * info->v1[0][3];
+   float a2 = info->v2[vert_attr][i] * info->v2[0][3];
+   float da01 = a0 - a1;
+   float da20 = a2 - a0;
+   float dadx = da01 * info->dy20_ooa - info->dy01_ooa * da20;
+   float dady = da20 * info->dx01_ooa - info->dx20_ooa * da01;
+
+   inputs->dadx[slot][i] = dadx;
+   inputs->dady[slot][i] = dady;
+   inputs->a0[slot][i] = a0 - (dadx * info->x0_center +
+				   dady * info->y0_center);
+}
+
+
+/**
+ * Special coefficient setup for gl_FragCoord.
+ * X and Y are trivial
+ * Z and W are copied from position_coef which should have already been computed.
+ * We could do a bit less work if we'd examine gl_FragCoord's swizzle mask.
+ */
+static void
+setup_fragcoord_coef(struct lp_rast_shader_inputs *inputs,
+                     const struct lp_tri_info *info,
+                     unsigned slot,
+                     unsigned usage_mask)
+{
+   /*X*/
+   if (usage_mask & TGSI_WRITEMASK_X) {
+      inputs->a0[slot][0] = 0.0;
+      inputs->dadx[slot][0] = 1.0;
+      inputs->dady[slot][0] = 0.0;
+   }
+
+   /*Y*/
+   if (usage_mask & TGSI_WRITEMASK_Y) {
+      inputs->a0[slot][1] = 0.0;
+      inputs->dadx[slot][1] = 0.0;
+      inputs->dady[slot][1] = 1.0;
+   }
+
+   /*Z*/
+   if (usage_mask & TGSI_WRITEMASK_Z) {
+      linear_coef(inputs, info, slot, 0, 2);
+   }
+
+   /*W*/
+   if (usage_mask & TGSI_WRITEMASK_W) {
+      linear_coef(inputs, info, slot, 0, 3);
+   }
+}
+
+
+/**
+ * Setup the fragment input attribute with the front-facing value.
+ * \param frontface  is the triangle front facing?
+ */
+static void setup_facing_coef( struct lp_rast_shader_inputs *inputs,
+                               unsigned slot,
+                               boolean frontface,
+                               unsigned usage_mask)
+{
+   /* convert TRUE to 1.0 and FALSE to -1.0 */
+   if (usage_mask & TGSI_WRITEMASK_X)
+      constant_coef( inputs, slot, 2.0f * frontface - 1.0f, 0 );
+
+   if (usage_mask & TGSI_WRITEMASK_Y)
+      constant_coef( inputs, slot, 0.0f, 1 ); /* wasted */
+
+   if (usage_mask & TGSI_WRITEMASK_Z)
+      constant_coef( inputs, slot, 0.0f, 2 ); /* wasted */
+
+   if (usage_mask & TGSI_WRITEMASK_W)
+      constant_coef( inputs, slot, 0.0f, 3 ); /* wasted */
+}
+
+
+/**
+ * Compute the tri->coef[] array dadx, dady, a0 values.
+ */
+void lp_setup_tri_coef( struct lp_setup_context *setup,
+			struct lp_rast_shader_inputs *inputs,
+			const struct lp_tri_info *info)
+{
+   unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ;
+   unsigned slot;
+   unsigned i;
+
+   /* setup interpolation for all the remaining attributes:
+    */
+   for (slot = 0; slot < setup->fs.nr_inputs; slot++) {
+      unsigned vert_attr = setup->fs.input[slot].src_index;
+      unsigned usage_mask = setup->fs.input[slot].usage_mask;
+
+      switch (setup->fs.input[slot].interp) {
+      case LP_INTERP_CONSTANT:
+         if (setup->flatshade_first) {
+            for (i = 0; i < NUM_CHANNELS; i++)
+               if (usage_mask & (1 << i))
+                  constant_coef(inputs, slot+1, info->v0[vert_attr][i], i);
+         }
+         else {
+            for (i = 0; i < NUM_CHANNELS; i++)
+               if (usage_mask & (1 << i))
+                  constant_coef(inputs, slot+1, info->v2[vert_attr][i], i);
+         }
+         break;
+
+      case LP_INTERP_LINEAR:
+         for (i = 0; i < NUM_CHANNELS; i++)
+            if (usage_mask & (1 << i))
+               linear_coef(inputs, info, slot+1, vert_attr, i);
+         break;
+
+      case LP_INTERP_PERSPECTIVE:
+         for (i = 0; i < NUM_CHANNELS; i++)
+            if (usage_mask & (1 << i))
+               perspective_coef(inputs, info, slot+1, vert_attr, i);
+         fragcoord_usage_mask |= TGSI_WRITEMASK_W;
+         break;
+
+      case LP_INTERP_POSITION:
+         /*
+          * The generated pixel interpolators will pick up the coeffs from
+          * slot 0, so all need to ensure that the usage mask is covers all
+          * usages.
+          */
+         fragcoord_usage_mask |= usage_mask;
+         break;
+
+      case LP_INTERP_FACING:
+         setup_facing_coef(inputs, slot+1, info->frontfacing, usage_mask);
+         break;
+
+      default:
+         assert(0);
+      }
+   }
+
+   /* The internal position input is in slot zero:
+    */
+   setup_fragcoord_coef(inputs, info, 0, fragcoord_usage_mask);
+}
+
+#else
+extern void lp_setup_coef_dummy(void);
+void lp_setup_coef_dummy(void)
+{
+}
+
+#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_coef.h b/src/gallium/drivers/llvmpipe/lp_setup_coef.h
new file mode 100644
index 0000000000..d68b39c603
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_setup_coef.h
@@ -0,0 +1,61 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * The setup code is concerned with point/line/triangle setup and
+ * putting commands/data into the bins.
+ */
+
+
+#ifndef LP_SETUP_COEF_H
+#define LP_SETUP_COEF_H
+
+
+struct lp_tri_info {
+
+   float x0_center;
+   float y0_center;
+
+   /* turn these into an aligned float[4] */
+   float dy01_ooa;
+   float dy20_ooa;
+   float dx01_ooa;
+   float dx20_ooa;
+
+   const float (*v0)[4];
+   const float (*v1)[4];
+   const float (*v2)[4];
+
+   boolean frontfacing;		/* remove eventually */
+};
+
+void lp_setup_tri_coef( struct lp_setup_context *setup,
+			struct lp_rast_shader_inputs *inputs,
+			const struct lp_tri_info *info);
+
+#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_coef_intrin.c b/src/gallium/drivers/llvmpipe/lp_setup_coef_intrin.c
new file mode 100644
index 0000000000..73fb70599c
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_setup_coef_intrin.c
@@ -0,0 +1,207 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/*
+ * Binning code for triangles
+ */
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "lp_perf.h"
+#include "lp_setup_context.h"
+#include "lp_setup_coef.h"
+#include "lp_rast.h"
+
+#if defined(PIPE_ARCH_SSE)
+#include <emmintrin.h>
+
+
+static void constant_coef4( struct lp_rast_shader_inputs *inputs,
+			    const struct lp_tri_info *info,
+			    unsigned slot,
+			    const float *attr)
+{
+   *(__m128 *)inputs->a0[slot]   = *(__m128 *)attr;
+   *(__m128 *)inputs->dadx[slot] = _mm_set1_ps(0.0);
+   *(__m128 *)inputs->dady[slot] = _mm_set1_ps(0.0);
+}
+
+
+
+/**
+ * Setup the fragment input attribute with the front-facing value.
+ * \param frontface  is the triangle front facing?
+ */
+static void setup_facing_coef( struct lp_rast_shader_inputs *inputs,
+			       const struct lp_tri_info *info,
+			       unsigned slot )
+{
+   /* XXX: just pass frontface directly to the shader, don't bother
+    * treating it as an input.
+    */
+   __m128 a0 = _mm_setr_ps(info->frontfacing ? 1.0 : -1.0,
+			   0, 0, 0);
+
+   *(__m128 *)inputs->a0[slot]   = a0;
+   *(__m128 *)inputs->dadx[slot] = _mm_set1_ps(0.0);
+   *(__m128 *)inputs->dady[slot] = _mm_set1_ps(0.0);
+}
+
+
+
+static void calc_coef4( struct lp_rast_shader_inputs *inputs,
+			const struct lp_tri_info *info,
+			unsigned slot,
+			__m128 a0,
+			__m128 a1,
+			__m128 a2)
+{
+   __m128 da01          = _mm_sub_ps(a0, a1);
+   __m128 da20          = _mm_sub_ps(a2, a0);
+
+   __m128 da01_dy20_ooa = _mm_mul_ps(da01, _mm_set1_ps(info->dy20_ooa));
+   __m128 da20_dy01_ooa = _mm_mul_ps(da20, _mm_set1_ps(info->dy01_ooa));   
+   __m128 dadx          = _mm_sub_ps(da01_dy20_ooa, da20_dy01_ooa);
+
+   __m128 da01_dx20_ooa = _mm_mul_ps(da01, _mm_set1_ps(info->dx20_ooa));
+   __m128 da20_dx01_ooa = _mm_mul_ps(da20, _mm_set1_ps(info->dx01_ooa));
+   __m128 dady          = _mm_sub_ps(da20_dx01_ooa, da01_dx20_ooa);
+
+   __m128 dadx_x0       = _mm_mul_ps(dadx, _mm_set1_ps(info->x0_center));
+   __m128 dady_y0       = _mm_mul_ps(dady, _mm_set1_ps(info->y0_center));
+   __m128 attr_v0       = _mm_add_ps(dadx_x0, dady_y0);
+   __m128 attr_0        = _mm_sub_ps(a0, attr_v0);
+
+   *(__m128 *)inputs->a0[slot]   = attr_0;
+   *(__m128 *)inputs->dadx[slot] = dadx;
+   *(__m128 *)inputs->dady[slot] = dady;
+}
+
+
+static void linear_coef( struct lp_rast_shader_inputs *inputs,
+                         const struct lp_tri_info *info,
+                         unsigned slot,
+                         unsigned vert_attr)
+{
+   __m128 a0 = *(const __m128 *)info->v0[vert_attr];
+   __m128 a1 = *(const __m128 *)info->v1[vert_attr];
+   __m128 a2 = *(const __m128 *)info->v2[vert_attr];
+
+   calc_coef4(inputs, info, slot, a0, a1, a2);
+}
+
+
+
+/**
+ * Compute a0, dadx and dady for a perspective-corrected interpolant,
+ * for a triangle.
+ * We basically multiply the vertex value by 1/w before computing
+ * the plane coefficients (a0, dadx, dady).
+ * Later, when we compute the value at a particular fragment position we'll
+ * divide the interpolated value by the interpolated W at that fragment.
+ */
+static void perspective_coef( struct lp_rast_shader_inputs *inputs,
+                              const struct lp_tri_info *info,
+                              unsigned slot,
+			      unsigned vert_attr)
+{
+   /* premultiply by 1/w  (v[0][3] is always 1/w):
+    */
+   __m128 a0 = *(const __m128 *)info->v0[vert_attr];
+   __m128 a1 = *(const __m128 *)info->v1[vert_attr];
+   __m128 a2 = *(const __m128 *)info->v2[vert_attr];
+
+   __m128 a0_oow = _mm_mul_ps(a0, _mm_set1_ps(info->v0[0][3]));
+   __m128 a1_oow = _mm_mul_ps(a1, _mm_set1_ps(info->v1[0][3]));
+   __m128 a2_oow = _mm_mul_ps(a2, _mm_set1_ps(info->v2[0][3]));
+
+   calc_coef4(inputs, info, slot, a0_oow, a1_oow, a2_oow);
+}
+
+
+
+
+
+/**
+ * Compute the inputs-> dadx, dady, a0 values.
+ */
+void lp_setup_tri_coef( struct lp_setup_context *setup,
+			struct lp_rast_shader_inputs *inputs,
+			const struct lp_tri_info *info)
+{
+   unsigned slot;
+
+   /* The internal position input is in slot zero:
+    */
+   linear_coef(inputs, info, 0, 0);
+
+   /* setup interpolation for all the remaining attributes:
+    */
+   for (slot = 0; slot < setup->fs.nr_inputs; slot++) {
+      unsigned vert_attr = setup->fs.input[slot].src_index;
+
+      switch (setup->fs.input[slot].interp) {
+      case LP_INTERP_CONSTANT:
+         if (setup->flatshade_first) {
+	    constant_coef4(inputs, info, slot+1, info->v0[vert_attr]);
+         }
+         else {
+	    constant_coef4(inputs, info, slot+1, info->v2[vert_attr]);
+         }
+         break;
+
+      case LP_INTERP_LINEAR:
+	 linear_coef(inputs, info, slot+1, vert_attr);
+         break;
+
+      case LP_INTERP_PERSPECTIVE:
+	 perspective_coef(inputs, info, slot+1, vert_attr);
+         break;
+
+      case LP_INTERP_POSITION:
+         /*
+          * The generated pixel interpolators will pick up the coeffs from
+          * slot 0.
+          */
+         break;
+
+      case LP_INTERP_FACING:
+         setup_facing_coef(inputs, info, slot+1);
+         break;
+
+      default:
+         assert(0);
+      }
+   }
+}
+
+#else
+extern void lp_setup_coef_dummy(void);
+void lp_setup_coef_dummy(void)
+{
+}
+#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_context.h b/src/gallium/drivers/llvmpipe/lp_setup_context.h
index a0606f5034..877a492c6d 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_setup_context.h
@@ -41,6 +41,7 @@
 #include "lp_scene.h"
 
 #include "draw/draw_vbuf.h"
+#include "util/u_rect.h"
 
 #define LP_SETUP_NEW_FS          0x01
 #define LP_SETUP_NEW_CONSTANTS   0x02
@@ -73,6 +74,7 @@ struct lp_setup_context
    uint prim;
    uint vertex_size;
    uint nr_vertices;
+   uint sprite;
    uint vertex_buffer_size;
    void *vertex_buffer;
 
@@ -88,10 +90,17 @@ struct lp_setup_context
    boolean flatshade_first;
    boolean ccw_is_frontface;
    boolean scissor_test;
+   boolean point_size_per_vertex;
    unsigned cullmode;
    float pixel_offset;
+   float line_width;
+   float point_size;
+   float psize;
 
    struct pipe_framebuffer_state fb;
+   struct u_rect framebuffer;
+   struct u_rect scissor;
+   struct u_rect draw_region;   /* intersection of fb & scissor */
 
    struct {
       unsigned flags;
@@ -127,9 +136,6 @@ struct lp_setup_context
       uint8_t *stored;
    } blend_color;
 
-   struct {
-      struct pipe_scissor_state current;
-   } scissor;
 
    unsigned dirty;   /**< bitmask of LP_SETUP_NEW_x bits */
 
@@ -158,4 +164,29 @@ void lp_setup_update_state( struct lp_setup_context *setup );
 
 void lp_setup_destroy( struct lp_setup_context *setup );
 
+void
+lp_setup_print_triangle(struct lp_setup_context *setup,
+                        const float (*v0)[4],
+                        const float (*v1)[4],
+                        const float (*v2)[4]);
+
+void
+lp_setup_print_vertex(struct lp_setup_context *setup,
+                      const char *name,
+                      const float (*v)[4]);
+
+
+struct lp_rast_triangle *
+lp_setup_alloc_triangle(struct lp_scene *scene,
+                        unsigned nr_inputs,
+                        unsigned nr_planes,
+                        unsigned *tri_size);
+
+void
+lp_setup_bin_triangle( struct lp_setup_context *setup,
+                       struct lp_rast_triangle *tri,
+                       const struct u_rect *bbox,
+                       int nr_planes );
+
 #endif
+
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c
index be41c44e6f..ce2da55cf4 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_line.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
@@ -29,19 +29,671 @@
  * Binning code for lines
  */
 
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "lp_perf.h"
 #include "lp_setup_context.h"
+#include "lp_rast.h"
+#include "lp_state_fs.h"
 
-static void line_nop( struct lp_setup_context *setup,
-                      const float (*v0)[4],
-                      const float (*v1)[4] )
+#define NUM_CHANNELS 4
+
+struct lp_line_info {
+
+   float dx;
+   float dy;
+   float oneoverarea;
+
+   const float (*v1)[4];
+   const float (*v2)[4];
+};
+
+
+/**
+ * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
+ */
+static void constant_coef( struct lp_setup_context *setup,
+                           struct lp_rast_triangle *tri,
+                           unsigned slot,
+                           const float value,
+                           unsigned i )
+{
+   tri->inputs.a0[slot][i] = value;
+   tri->inputs.dadx[slot][i] = 0.0f;
+   tri->inputs.dady[slot][i] = 0.0f;
+}
+
+
+/**
+ * Compute a0, dadx and dady for a linearly interpolated coefficient,
+ * for a triangle.
+ */
+static void linear_coef( struct lp_setup_context *setup,
+                         struct lp_rast_triangle *tri,
+                         struct lp_line_info *info,
+                         unsigned slot,
+                         unsigned vert_attr,
+                         unsigned i)
+{
+   float a1 = info->v1[vert_attr][i]; 
+   float a2 = info->v2[vert_attr][i];
+      
+   float da21 = a1 - a2;   
+   float dadx = da21 * info->dx * info->oneoverarea;
+   float dady = da21 * info->dy * info->oneoverarea;
+
+   tri->inputs.dadx[slot][i] = dadx;
+   tri->inputs.dady[slot][i] = dady;  
+   
+   tri->inputs.a0[slot][i] = (a1 -
+                              (dadx * (info->v1[0][0] - setup->pixel_offset) +
+                               dady * (info->v1[0][1] - setup->pixel_offset)));
+}
+
+
+/**
+ * Compute a0, dadx and dady for a perspective-corrected interpolant,
+ * for a triangle.
+ * We basically multiply the vertex value by 1/w before computing
+ * the plane coefficients (a0, dadx, dady).
+ * Later, when we compute the value at a particular fragment position we'll
+ * divide the interpolated value by the interpolated W at that fragment.
+ */
+static void perspective_coef( struct lp_setup_context *setup,
+                              struct lp_rast_triangle *tri,
+                              struct lp_line_info *info,
+                              unsigned slot,
+                              unsigned vert_attr,
+                              unsigned i)
+{
+   /* premultiply by 1/w  (v[0][3] is always 1/w):
+    */
+   float a1 = info->v1[vert_attr][i] * info->v1[0][3];
+   float a2 = info->v2[vert_attr][i] * info->v2[0][3];
+
+   float da21 = a1 - a2;   
+   float dadx = da21 * info->dx * info->oneoverarea;
+   float dady = da21 * info->dy * info->oneoverarea;
+
+   tri->inputs.dadx[slot][i] = dadx;
+   tri->inputs.dady[slot][i] = dady;
+   
+   tri->inputs.a0[slot][i] = (a1 -
+                              (dadx * (info->v1[0][0] - setup->pixel_offset) +
+                               dady * (info->v1[0][1] - setup->pixel_offset)));
+}
+
+static void
+setup_fragcoord_coef( struct lp_setup_context *setup,
+                      struct lp_rast_triangle *tri,
+                      struct lp_line_info *info,
+                      unsigned slot,
+                      unsigned usage_mask)
+{
+   /*X*/
+   if (usage_mask & TGSI_WRITEMASK_X) {
+      tri->inputs.a0[slot][0] = 0.0;
+      tri->inputs.dadx[slot][0] = 1.0;
+      tri->inputs.dady[slot][0] = 0.0;
+   }
+
+   /*Y*/
+   if (usage_mask & TGSI_WRITEMASK_Y) {
+      tri->inputs.a0[slot][1] = 0.0;
+      tri->inputs.dadx[slot][1] = 0.0;
+      tri->inputs.dady[slot][1] = 1.0;
+   }
+
+   /*Z*/
+   if (usage_mask & TGSI_WRITEMASK_Z) {
+      linear_coef(setup, tri, info, slot, 0, 2);
+   }
+
+   /*W*/
+   if (usage_mask & TGSI_WRITEMASK_W) {
+      linear_coef(setup, tri, info, slot, 0, 3);
+   }
+}
+
+/**
+ * Compute the tri->coef[] array dadx, dady, a0 values.
+ */
+static void setup_line_coefficients( struct lp_setup_context *setup,
+                                     struct lp_rast_triangle *tri,
+                                     struct lp_line_info *info)
+{
+   unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ;
+   unsigned slot;
+
+   /* setup interpolation for all the remaining attributes:
+    */
+   for (slot = 0; slot < setup->fs.nr_inputs; slot++) {
+      unsigned vert_attr = setup->fs.input[slot].src_index;
+      unsigned usage_mask = setup->fs.input[slot].usage_mask;
+      unsigned i;
+           
+      switch (setup->fs.input[slot].interp) {
+      case LP_INTERP_CONSTANT:
+         if (setup->flatshade_first) {
+            for (i = 0; i < NUM_CHANNELS; i++)
+               if (usage_mask & (1 << i))
+                  constant_coef(setup, tri, slot+1, info->v1[vert_attr][i], i);
+         }
+         else {
+            for (i = 0; i < NUM_CHANNELS; i++)
+               if (usage_mask & (1 << i))
+                  constant_coef(setup, tri, slot+1, info->v2[vert_attr][i], i);
+         }
+         break;
+
+      case LP_INTERP_LINEAR:
+         for (i = 0; i < NUM_CHANNELS; i++)
+            if (usage_mask & (1 << i))
+               linear_coef(setup, tri, info, slot+1, vert_attr, i);
+         break;
+
+      case LP_INTERP_PERSPECTIVE:
+         for (i = 0; i < NUM_CHANNELS; i++)
+            if (usage_mask & (1 << i))
+               perspective_coef(setup, tri, info, slot+1, vert_attr, i);
+         fragcoord_usage_mask |= TGSI_WRITEMASK_W;
+         break;
+
+      case LP_INTERP_POSITION:
+         /*
+          * The generated pixel interpolators will pick up the coeffs from
+          * slot 0, so all need to ensure that the usage mask is covers all
+          * usages.
+          */
+         fragcoord_usage_mask |= usage_mask;
+         break;
+
+      default:
+         assert(0);
+      }
+   }
+
+   /* The internal position input is in slot zero:
+    */
+   setup_fragcoord_coef(setup, tri, info, 0,
+                        fragcoord_usage_mask);
+}
+
+
+
+static INLINE int subpixel_snap( float a )
+{
+   return util_iround(FIXED_ONE * a);
+}
+
+
+/**
+ * Print line vertex attribs (for debug).
+ */
+static void
+print_line(struct lp_setup_context *setup,
+           const float (*v1)[4],
+           const float (*v2)[4])
+{
+   uint i;
+
+   debug_printf("llvmpipe line\n");
+   for (i = 0; i < 1 + setup->fs.nr_inputs; i++) {
+      debug_printf("  v1[%d]:  %f %f %f %f\n", i,
+                   v1[i][0], v1[i][1], v1[i][2], v1[i][3]);
+   }
+   for (i = 0; i < 1 + setup->fs.nr_inputs; i++) {
+      debug_printf("  v2[%d]:  %f %f %f %f\n", i,
+                   v2[i][0], v2[i][1], v2[i][2], v2[i][3]);
+   }
+}
+
+
+static INLINE boolean sign(float x){
+   return x >= 0;  
+}  
+
+
+/* Used on positive floats only:
+ */
+static INLINE float fracf(float f)
 {
+   return f - floorf(f);
 }
 
 
-void 
-lp_setup_choose_line( struct lp_setup_context *setup )
+
+static void
+lp_setup_line( struct lp_setup_context *setup,
+               const float (*v1)[4],
+               const float (*v2)[4])
 {
-   setup->line = line_nop;
+   struct lp_scene *scene = lp_setup_get_current_scene(setup);
+   struct lp_rast_triangle *line;
+   struct lp_line_info info;
+   float width = MAX2(1.0, setup->line_width);
+   struct u_rect bbox;
+   unsigned tri_bytes;
+   int x[4]; 
+   int y[4];
+   int i;
+   int nr_planes = 4;
+   
+   /* linewidth should be interpreted as integer */
+   int fixed_width = util_iround(width) * FIXED_ONE;
+
+   float x_offset=0;
+   float y_offset=0;
+   float x_offset_end=0;
+   float y_offset_end=0;
+      
+   float x1diff;
+   float y1diff;
+   float x2diff;
+   float y2diff;
+   float dx, dy;
+
+   boolean draw_start;
+   boolean draw_end;
+   boolean will_draw_start;
+   boolean will_draw_end;
+
+   if (0)
+      print_line(setup, v1, v2);
+
+   if (setup->scissor_test) {
+      nr_planes = 8;
+   }
+   else {
+      nr_planes = 4;
+   }
+
+
+   dx = v1[0][0] - v2[0][0];
+   dy = v1[0][1] - v2[0][1];
+  
+   /* X-MAJOR LINE */
+   if (fabsf(dx) >= fabsf(dy)) {
+      float dydx = dy / dx;
+
+      x1diff = v1[0][0] - (float) floor(v1[0][0]) - 0.5;
+      y1diff = v1[0][1] - (float) floor(v1[0][1]) - 0.5;
+      x2diff = v2[0][0] - (float) floor(v2[0][0]) - 0.5;
+      y2diff = v2[0][1] - (float) floor(v2[0][1]) - 0.5;
+
+      if (y2diff==-0.5 && dy<0){
+         y2diff = 0.5;
+      }
+      
+      /* 
+       * Diamond exit rule test for starting point 
+       */    
+      if (fabsf(x1diff) + fabsf(y1diff) < 0.5) {
+         draw_start = TRUE;
+      }
+      else if (sign(x1diff) == sign(-dx)) {
+         draw_start = FALSE;
+      }
+      else if (sign(-y1diff) != sign(dy)) {
+         draw_start = TRUE;
+      }
+      else {
+         /* do intersection test */
+         float yintersect = fracf(v1[0][1]) + x1diff * dydx;
+         draw_start = (yintersect < 1.0 && yintersect > 0.0);
+      }
+
+
+      /* 
+       * Diamond exit rule test for ending point 
+       */    
+      if (fabsf(x2diff) + fabsf(y2diff) < 0.5) {
+         draw_end = FALSE;
+      }
+      else if (sign(x2diff) != sign(-dx)) {
+         draw_end = FALSE;
+      }
+      else if (sign(-y2diff) == sign(dy)) {
+         draw_end = TRUE;
+      }
+      else {
+         /* do intersection test */
+         float yintersect = fracf(v2[0][1]) + x2diff * dydx;
+         draw_end = (yintersect < 1.0 && yintersect > 0.0);
+      }
+
+      /* Are we already drawing start/end?
+       */
+      will_draw_start = sign(-x1diff) != sign(dx);
+      will_draw_end = (sign(x2diff) == sign(-dx)) || x2diff==0;
+
+      if (dx < 0) {
+         /* if v2 is to the right of v1, swap pointers */
+         const float (*temp)[4] = v1;
+         v1 = v2;
+         v2 = temp;
+         dx = -dx;
+         dy = -dy;
+         /* Otherwise shift planes appropriately */
+         if (will_draw_start != draw_start) {
+            x_offset_end = - x1diff - 0.5;
+            y_offset_end = x_offset_end * dydx;
+
+         }
+         if (will_draw_end != draw_end) {
+            x_offset = - x2diff - 0.5;
+            y_offset = x_offset * dydx;
+         }
+
+      }
+      else{
+         /* Otherwise shift planes appropriately */
+         if (will_draw_start != draw_start) {
+            x_offset = - x1diff + 0.5;
+            y_offset = x_offset * dydx;
+         }
+         if (will_draw_end != draw_end) {
+            x_offset_end = - x2diff + 0.5;
+            y_offset_end = x_offset_end * dydx;
+         }
+      }
+  
+      /* x/y positions in fixed point */
+      x[0] = subpixel_snap(v1[0][0] + x_offset     - setup->pixel_offset);
+      x[1] = subpixel_snap(v2[0][0] + x_offset_end - setup->pixel_offset);
+      x[2] = subpixel_snap(v2[0][0] + x_offset_end - setup->pixel_offset);
+      x[3] = subpixel_snap(v1[0][0] + x_offset     - setup->pixel_offset);
+      
+      y[0] = subpixel_snap(v1[0][1] + y_offset     - setup->pixel_offset) - fixed_width/2;
+      y[1] = subpixel_snap(v2[0][1] + y_offset_end - setup->pixel_offset) - fixed_width/2;
+      y[2] = subpixel_snap(v2[0][1] + y_offset_end - setup->pixel_offset) + fixed_width/2;
+      y[3] = subpixel_snap(v1[0][1] + y_offset     - setup->pixel_offset) + fixed_width/2;
+      
+   }
+   else {
+      const float dxdy = dx / dy;
+
+      /* Y-MAJOR LINE */      
+      x1diff = v1[0][0] - (float) floor(v1[0][0]) - 0.5;
+      y1diff = v1[0][1] - (float) floor(v1[0][1]) - 0.5;
+      x2diff = v2[0][0] - (float) floor(v2[0][0]) - 0.5;
+      y2diff = v2[0][1] - (float) floor(v2[0][1]) - 0.5;
+
+      if (x2diff==-0.5 && dx<0) {
+         x2diff = 0.5;
+      }
+
+      /* 
+       * Diamond exit rule test for starting point 
+       */    
+      if (fabsf(x1diff) + fabsf(y1diff) < 0.5) {
+         draw_start = TRUE;
+      }
+      else if (sign(-y1diff) == sign(dy)) {
+         draw_start = FALSE;
+      }
+      else if (sign(x1diff) != sign(-dx)) {
+         draw_start = TRUE;
+      }
+      else {
+         /* do intersection test */
+         float xintersect = fracf(v1[0][0]) + y1diff * dxdy;
+         draw_start = (xintersect < 1.0 && xintersect > 0.0);
+      }
+
+      /* 
+       * Diamond exit rule test for ending point 
+       */    
+      if (fabsf(x2diff) + fabsf(y2diff) < 0.5) {
+         draw_end = FALSE;
+      }
+      else if (sign(-y2diff) != sign(dy) ) {
+         draw_end = FALSE;
+      }
+      else if (sign(x2diff) == sign(-dx) ) {
+         draw_end = TRUE;
+      }
+      else {
+         /* do intersection test */
+         float xintersect = fracf(v2[0][0]) + y2diff * dxdy;
+         draw_end = (xintersect < 1.0 && xintersect > 0.0);
+      }
+
+      /* Are we already drawing start/end?
+       */
+      will_draw_start = sign(y1diff) == sign(dy);
+      will_draw_end = (sign(-y2diff) == sign(dy)) || y2diff==0;
+
+      if (dy > 0) {
+         /* if v2 is on top of v1, swap pointers */
+         const float (*temp)[4] = v1;
+         v1 = v2;
+         v2 = temp; 
+         dx = -dx;
+         dy = -dy;
+
+         /* Otherwise shift planes appropriately */
+         if (will_draw_start != draw_start) {
+            y_offset_end = - y1diff + 0.5;
+            x_offset_end = y_offset_end * dxdy;
+         }
+         if (will_draw_end != draw_end) {
+            y_offset = - y2diff + 0.5;
+            x_offset = y_offset * dxdy;
+         }
+      }
+      else {
+         /* Otherwise shift planes appropriately */
+         if (will_draw_start != draw_start) {
+            y_offset = - y1diff - 0.5;
+            x_offset = y_offset * dxdy;
+                     
+         }
+         if (will_draw_end != draw_end) {
+            y_offset_end = - y2diff - 0.5;
+            x_offset_end = y_offset_end * dxdy;
+         }
+      }
+ 
+      /* x/y positions in fixed point */
+      x[0] = subpixel_snap(v1[0][0] + x_offset     - setup->pixel_offset) - fixed_width/2;
+      x[1] = subpixel_snap(v2[0][0] + x_offset_end - setup->pixel_offset) - fixed_width/2;
+      x[2] = subpixel_snap(v2[0][0] + x_offset_end - setup->pixel_offset) + fixed_width/2;
+      x[3] = subpixel_snap(v1[0][0] + x_offset     - setup->pixel_offset) + fixed_width/2;
+     
+      y[0] = subpixel_snap(v1[0][1] + y_offset     - setup->pixel_offset); 
+      y[1] = subpixel_snap(v2[0][1] + y_offset_end - setup->pixel_offset);
+      y[2] = subpixel_snap(v2[0][1] + y_offset_end - setup->pixel_offset);
+      y[3] = subpixel_snap(v1[0][1] + y_offset     - setup->pixel_offset);
+   }
+
+
+
+   LP_COUNT(nr_tris);
+
+ 
+   /* Bounding rectangle (in pixels) */
+   {
+      /* Yes this is necessary to accurately calculate bounding boxes
+       * with the two fill-conventions we support.  GL (normally) ends
+       * up needing a bottom-left fill convention, which requires
+       * slightly different rounding.
+       */
+      int adj = (setup->pixel_offset != 0) ? 1 : 0;
+
+      bbox.x0 = (MIN4(x[0], x[1], x[2], x[3]) + (FIXED_ONE-1)) >> FIXED_ORDER;
+      bbox.x1 = (MAX4(x[0], x[1], x[2], x[3]) + (FIXED_ONE-1)) >> FIXED_ORDER;
+      bbox.y0 = (MIN4(y[0], y[1], y[2], y[3]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER;
+      bbox.y1 = (MAX4(y[0], y[1], y[2], y[3]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER;
+
+      /* Inclusive coordinates:
+       */
+      bbox.x1--;
+      bbox.y1--;
+   }
+
+   if (bbox.x1 < bbox.x0 ||
+       bbox.y1 < bbox.y0) {
+      if (0) debug_printf("empty bounding box\n");
+      LP_COUNT(nr_culled_tris);
+      return;
+   }
+
+   if (!u_rect_test_intersection(&setup->draw_region, &bbox)) {
+      if (0) debug_printf("offscreen\n");
+      LP_COUNT(nr_culled_tris);
+      return;
+   }
+
+   u_rect_find_intersection(&setup->draw_region, &bbox);
+
+   line = lp_setup_alloc_triangle(scene,
+                                  setup->fs.nr_inputs,
+                                  nr_planes,
+                                  &tri_bytes);
+   if (!line)
+      return;
+
+#ifdef DEBUG
+   line->v[0][0] = v1[0][0];
+   line->v[1][0] = v2[0][0];   
+   line->v[0][1] = v1[0][1];
+   line->v[1][1] = v2[0][1];
+#endif
+
+   /* calculate the deltas */
+   line->plane[0].dcdy = x[0] - x[1];
+   line->plane[1].dcdy = x[1] - x[2];
+   line->plane[2].dcdy = x[2] - x[3];
+   line->plane[3].dcdy = x[3] - x[0];
+
+   line->plane[0].dcdx = y[0] - y[1];
+   line->plane[1].dcdx = y[1] - y[2];
+   line->plane[2].dcdx = y[2] - y[3];
+   line->plane[3].dcdx = y[3] - y[0];
+
+
+   info.oneoverarea = 1.0f / (dx * dx  + dy * dy);    
+   info.dx = dx;
+   info.dy = dy;
+   info.v1 = v1;
+   info.v2 = v2;
+
+   /* Setup parameter interpolants:
+    */
+   setup_line_coefficients( setup, line, &info); 
+
+   line->inputs.facing = 1.0F;
+   line->inputs.state = setup->fs.stored;
+
+   for (i = 0; i < 4; i++) {
+      struct lp_rast_plane *plane = &line->plane[i];
+
+      /* half-edge constants, will be interated over the whole render
+       * target.
+       */
+      plane->c = plane->dcdx * x[i] - plane->dcdy * y[i];
+
+      
+      /* correct for top-left vs. bottom-left fill convention.  
+       *
+       * note that we're overloading gl_rasterization_rules to mean
+       * both (0.5,0.5) pixel centers *and* bottom-left filling
+       * convention.
+       *
+       * GL actually has a top-left filling convention, but GL's
+       * notion of "top" differs from gallium's...
+       *
+       * Also, sometimes (in FBO cases) GL will render upside down
+       * to its usual method, in which case it will probably want
+       * to use the opposite, top-left convention.
+       */         
+      if (plane->dcdx < 0) {
+         /* both fill conventions want this - adjust for left edges */
+         plane->c++;            
+      }
+      else if (plane->dcdx == 0) {
+         if (setup->pixel_offset == 0) {
+            /* correct for top-left fill convention:
+             */
+            if (plane->dcdy > 0) plane->c++;
+         }
+         else {
+            /* correct for bottom-left fill convention:
+             */
+            if (plane->dcdy < 0) plane->c++;
+         }
+      }
+
+      plane->dcdx *= FIXED_ONE;
+      plane->dcdy *= FIXED_ONE;
+
+      /* find trivial reject offsets for each edge for a single-pixel
+       * sized block.  These will be scaled up at each recursive level to
+       * match the active blocksize.  Scaling in this way works best if
+       * the blocks are square.
+       */
+      plane->eo = 0;
+      if (plane->dcdx < 0) plane->eo -= plane->dcdx;
+      if (plane->dcdy > 0) plane->eo += plane->dcdy;
+
+      /* Calculate trivial accept offsets from the above.
+       */
+      plane->ei = plane->dcdy - plane->dcdx - plane->eo;
+   }
+
+
+   /* 
+    * When rasterizing scissored tris, use the intersection of the
+    * triangle bounding box and the scissor rect to generate the
+    * scissor planes.
+    *
+    * This permits us to cut off the triangle "tails" that are present
+    * in the intermediate recursive levels caused when two of the
+    * triangles edges don't diverge quickly enough to trivially reject
+    * exterior blocks from the triangle.
+    *
+    * It's not really clear if it's worth worrying about these tails,
+    * but since we generate the planes for each scissored tri, it's
+    * free to trim them in this case.
+    * 
+    * Note that otherwise, the scissor planes only vary in 'C' value,
+    * and even then only on state-changes.  Could alternatively store
+    * these planes elsewhere.
+    */
+   if (nr_planes == 8) {
+      line->plane[4].dcdx = -1;
+      line->plane[4].dcdy = 0;
+      line->plane[4].c = 1-bbox.x0;
+      line->plane[4].ei = 0;
+      line->plane[4].eo = 1;
+
+      line->plane[5].dcdx = 1;
+      line->plane[5].dcdy = 0;
+      line->plane[5].c = bbox.x1+1;
+      line->plane[5].ei = -1;
+      line->plane[5].eo = 0;
+
+      line->plane[6].dcdx = 0;
+      line->plane[6].dcdy = 1;
+      line->plane[6].c = 1-bbox.y0;
+      line->plane[6].ei = 0;
+      line->plane[6].eo = 1;
+
+      line->plane[7].dcdx = 0;
+      line->plane[7].dcdy = -1;
+      line->plane[7].c = bbox.y1+1;
+      line->plane[7].ei = -1;
+      line->plane[7].eo = 0;
+   }
+
+   lp_setup_bin_triangle(setup, line, &bbox, nr_planes);
+}
+   
+
+void lp_setup_choose_line( struct lp_setup_context *setup ) 
+{ 
+   setup->line = lp_setup_line;
 }
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c
index 9f69e6c5ce..6ae318d328 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_point.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c
@@ -1,6 +1,6 @@
 /**************************************************************************
  *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2010, VMware Inc.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -18,7 +18,7 @@
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -30,17 +30,299 @@
  */
 
 #include "lp_setup_context.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "lp_perf.h"
+#include "lp_setup_context.h"
+#include "lp_rast.h"
+#include "lp_state_fs.h"
+#include "tgsi/tgsi_scan.h"
+
+#define NUM_CHANNELS 4
+
+struct point_info {
+   /* x,y deltas */
+   int dy01, dy12;
+   int dx01, dx12;
+
+   const float (*v0)[4];
+};   
+
+
+/**
+ * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
+ */
+static void constant_coef( struct lp_setup_context *setup,
+                           struct lp_rast_triangle *point,
+                           unsigned slot,
+                           const float value,
+                           unsigned i )
+{
+   point->inputs.a0[slot][i] = value;
+   point->inputs.dadx[slot][i] = 0.0f;
+   point->inputs.dady[slot][i] = 0.0f;
+}
+
+static void perspective_coef( struct lp_setup_context *setup,
+                              struct lp_rast_triangle *point,
+                              const struct point_info *info,
+                              unsigned slot,
+                              unsigned vert_attr,
+                              unsigned i)
+{
+   if (i == 0) {   
+      float dadx = FIXED_ONE / (float)info->dx12;  
+      float dady =  0.0f;
+      point->inputs.dadx[slot][i] = dadx;
+      point->inputs.dady[slot][i] = dady;
+      point->inputs.a0[slot][i] = (0.5 -
+                                  (dadx * ((float)info->v0[0][0] - setup->pixel_offset) +
+                                   dady * ((float)info->v0[0][1] - setup->pixel_offset)));
+   }
+
+   else if (i == 1) {
+      float dadx =  0.0f; 
+      float dady =  FIXED_ONE / (float)info->dx12;
+   
+      point->inputs.dadx[slot][i] = dadx;
+      point->inputs.dady[slot][i] = dady;
+      point->inputs.a0[slot][i] = (0.5 -
+                                  (dadx * ((float)info->v0[0][0] - setup->pixel_offset) +
+                                   dady * ((float)info->v0[0][1] - setup->pixel_offset)));
+   }
+
+   else if (i == 2) {
+      point->inputs.a0[slot][i] = 0.0f;
+      point->inputs.dadx[slot][i] = 0.0f;
+      point->inputs.dady[slot][i] = 0.0f;
+   }
+      
+   else if (i == 3) {
+      point->inputs.a0[slot][i] = 1.0f;
+      point->inputs.dadx[slot][i] = 0.0f;
+      point->inputs.dady[slot][i] = 0.0f;
+   }
+
+}
+
+
+/**
+ * Special coefficient setup for gl_FragCoord.
+ * X and Y are trivial
+ * Z and W are copied from position_coef which should have already been computed.
+ * We could do a bit less work if we'd examine gl_FragCoord's swizzle mask.
+ */
+static void
+setup_point_fragcoord_coef(struct lp_setup_context *setup,
+                           struct lp_rast_triangle *point,
+                           const struct point_info *info,
+                           unsigned slot,
+                           unsigned usage_mask)
+{
+   /*X*/
+   if (usage_mask & TGSI_WRITEMASK_X) {
+      point->inputs.a0[slot][0] = 0.0;
+      point->inputs.dadx[slot][0] = 1.0;
+      point->inputs.dady[slot][0] = 0.0;
+   }
+
+   /*Y*/
+   if (usage_mask & TGSI_WRITEMASK_Y) {
+      point->inputs.a0[slot][1] = 0.0;
+      point->inputs.dadx[slot][1] = 0.0;
+      point->inputs.dady[slot][1] = 1.0;
+   }
+
+   /*Z*/
+   if (usage_mask & TGSI_WRITEMASK_Z) {
+      constant_coef(setup, point, slot, info->v0[0][2], 2);
+   }
+
+   /*W*/
+   if (usage_mask & TGSI_WRITEMASK_W) {
+      constant_coef(setup, point, slot, info->v0[0][3], 3);
+   }
+}
+
+/**
+ * Compute the point->coef[] array dadx, dady, a0 values.
+ */
+static void   
+setup_point_coefficients( struct lp_setup_context *setup,
+                          struct lp_rast_triangle *point,
+                          const struct point_info *info)
+{
+   unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ;
+   unsigned slot;
+
+   /* setup interpolation for all the remaining attributes:
+    */
+   for (slot = 0; slot < setup->fs.nr_inputs; slot++) {
+      unsigned vert_attr = setup->fs.input[slot].src_index;
+      unsigned usage_mask = setup->fs.input[slot].usage_mask;
+      unsigned i;
+      
+      switch (setup->fs.input[slot].interp) {
+      case LP_INTERP_POSITION:
+         /*
+          * The generated pixel interpolators will pick up the coeffs from
+          * slot 0, so all need to ensure that the usage mask is covers all
+          * usages.
+          */
+         fragcoord_usage_mask |= usage_mask;
+         break;
+
+      case LP_INTERP_PERSPECTIVE:
+         /* For point sprite textures */        
+         if (setup->fs.current.variant->shader->info.input_semantic_name[slot] 
+             == TGSI_SEMANTIC_GENERIC) 
+         {
+            int index = setup->fs.current.variant->shader->info.input_semantic_index[slot];
+            
+            if (setup->sprite & (1 << index)) {
+               for (i = 0; i < NUM_CHANNELS; i++)
+                  if (usage_mask & (1 << i))
+                     perspective_coef(setup, point, info, slot+1, vert_attr, i);
+               fragcoord_usage_mask |= TGSI_WRITEMASK_W;
+               break;                     
+            }
+         }
+
+         /* Otherwise fallthrough */
+      default:
+         for (i = 0; i < NUM_CHANNELS; i++) {
+            if (usage_mask & (1 << i))
+               constant_coef(setup, point, slot+1, info->v0[vert_attr][i], i);
+         }
+      }
+   }
 
-static void point_nop( struct lp_setup_context *setup,
-                       const float (*v0)[4] )
+   /* The internal position input is in slot zero:
+    */
+   setup_point_fragcoord_coef(setup, point, info, 0,
+                              fragcoord_usage_mask);
+}
+
+static INLINE int
+subpixel_snap(float a)
 {
+   return util_iround(FIXED_ONE * a);
+}
+
+
+static void lp_setup_point( struct lp_setup_context *setup,
+                            const float (*v0)[4] )
+{
+   /* x/y positions in fixed point */
+   const int sizeAttr = setup->psize;
+   const float size
+      = (setup->point_size_per_vertex && sizeAttr > 0) ? v0[sizeAttr][0]
+      : setup->point_size;
+   
+   /* Point size as fixed point integer, remove rounding errors 
+    * and gives minimum width for very small points
+    */
+   int fixed_width = MAX2(FIXED_ONE,
+                          (subpixel_snap(size) + FIXED_ONE/2 - 1) & ~(FIXED_ONE-1));
+
+   const int x0 = subpixel_snap(v0[0][0] - setup->pixel_offset) - fixed_width/2;
+   const int y0 = subpixel_snap(v0[0][1] - setup->pixel_offset) - fixed_width/2;
+     
+   struct lp_scene *scene = lp_setup_get_current_scene(setup);
+   struct lp_rast_triangle *point;
+   unsigned bytes;
+   struct u_rect bbox;
+   unsigned nr_planes = 4;
+   struct point_info info;
+
+
+   /* Bounding rectangle (in pixels) */
+   {
+      /* Yes this is necessary to accurately calculate bounding boxes
+       * with the two fill-conventions we support.  GL (normally) ends
+       * up needing a bottom-left fill convention, which requires
+       * slightly different rounding.
+       */
+      int adj = (setup->pixel_offset != 0) ? 1 : 0;
+
+      bbox.x0 = (x0 + (FIXED_ONE-1) + adj) >> FIXED_ORDER;
+      bbox.x1 = (x0 + fixed_width + (FIXED_ONE-1) + adj) >> FIXED_ORDER;
+      bbox.y0 = (y0 + (FIXED_ONE-1)) >> FIXED_ORDER;
+      bbox.y1 = (y0 + fixed_width + (FIXED_ONE-1)) >> FIXED_ORDER;
+
+      /* Inclusive coordinates:
+       */
+      bbox.x1--;
+      bbox.y1--;
+   }
+   
+   if (!u_rect_test_intersection(&setup->draw_region, &bbox)) {
+      if (0) debug_printf("offscreen\n");
+      LP_COUNT(nr_culled_tris);
+      return;
+   }
+
+   u_rect_find_intersection(&setup->draw_region, &bbox);
+
+   point = lp_setup_alloc_triangle(scene,
+                                   setup->fs.nr_inputs,
+                                   nr_planes,
+                                   &bytes);
+   if (!point)
+      return;
+
+#ifdef DEBUG
+   point->v[0][0] = v0[0][0];
+   point->v[0][1] = v0[0][1];
+#endif
+
+   info.v0 = v0;
+   info.dx01 = 0;
+   info.dx12 = fixed_width;
+   info.dy01 = fixed_width;
+   info.dy12 = 0;
+   
+   /* Setup parameter interpolants:
+    */
+   setup_point_coefficients(setup, point, &info);
+
+   point->inputs.facing = 1.0F;
+   point->inputs.state = setup->fs.stored;
+
+   {
+      point->plane[0].dcdx = -1;
+      point->plane[0].dcdy = 0;
+      point->plane[0].c = 1-bbox.x0;
+      point->plane[0].ei = 0;
+      point->plane[0].eo = 1;
+
+      point->plane[1].dcdx = 1;
+      point->plane[1].dcdy = 0;
+      point->plane[1].c = bbox.x1+1;
+      point->plane[1].ei = -1;
+      point->plane[1].eo = 0;
+
+      point->plane[2].dcdx = 0;
+      point->plane[2].dcdy = 1;
+      point->plane[2].c = 1-bbox.y0;
+      point->plane[2].ei = 0;
+      point->plane[2].eo = 1;
+
+      point->plane[3].dcdx = 0;
+      point->plane[3].dcdy = -1;
+      point->plane[3].c = bbox.y1+1;
+      point->plane[3].ei = -1;
+      point->plane[3].eo = 0;
+   }
+
+   lp_setup_bin_triangle(setup, point, &bbox, nr_planes);
 }
 
 
 void 
 lp_setup_choose_point( struct lp_setup_context *setup )
 {
-   setup->point = point_nop;
+   setup->point = lp_setup_point;
 }
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index 393533ebee..0180d95090 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -31,35 +31,15 @@
 
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "util/u_rect.h"
 #include "lp_perf.h"
 #include "lp_setup_context.h"
+#include "lp_setup_coef.h"
 #include "lp_rast.h"
 #include "lp_state_fs.h"
 
 #define NUM_CHANNELS 4
 
-struct tri_info {
-
-   float pixel_offset;
-
-   /* fixed point vertex coordinates */
-   int x[3];
-   int y[3];
-
-   /* float x,y deltas - all from the original coordinates
-    */
-   float dy01, dy20;
-   float dx01, dx20;
-   float oneoverarea;
-
-   const float (*v0)[4];
-   const float (*v1)[4];
-   const float (*v2)[4];
-
-   boolean frontfacing;
-};
-
-
 
    
 static INLINE int
@@ -76,247 +56,6 @@ fixed_to_float(int a)
 
 
 
-/**
- * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
- */
-static void constant_coef( struct lp_rast_triangle *tri,
-                           unsigned slot,
-			   const float value,
-                           unsigned i )
-{
-   tri->inputs.a0[slot][i] = value;
-   tri->inputs.dadx[slot][i] = 0.0f;
-   tri->inputs.dady[slot][i] = 0.0f;
-}
-
-
-
-static void linear_coef( struct lp_rast_triangle *tri,
-                         const struct tri_info *info,
-                         unsigned slot,
-                         unsigned vert_attr,
-                         unsigned i)
-{
-   float a0 = info->v0[vert_attr][i];
-   float a1 = info->v1[vert_attr][i];
-   float a2 = info->v2[vert_attr][i];
-
-   float da01 = a0 - a1;
-   float da20 = a2 - a0;
-   float dadx = (da01 * info->dy20 - info->dy01 * da20) * info->oneoverarea;
-   float dady = (da20 * info->dx01 - info->dx20 * da01) * info->oneoverarea;
-
-   tri->inputs.dadx[slot][i] = dadx;
-   tri->inputs.dady[slot][i] = dady;
-
-   /* calculate a0 as the value which would be sampled for the
-    * fragment at (0,0), taking into account that we want to sample at
-    * pixel centers, in other words (0.5, 0.5).
-    *
-    * this is neat but unfortunately not a good way to do things for
-    * triangles with very large values of dadx or dady as it will
-    * result in the subtraction and re-addition from a0 of a very
-    * large number, which means we'll end up loosing a lot of the
-    * fractional bits and precision from a0.  the way to fix this is
-    * to define a0 as the sample at a pixel center somewhere near vmin
-    * instead - i'll switch to this later.
-    */
-   tri->inputs.a0[slot][i] = (a0 -
-                              (dadx * (info->v0[0][0] - info->pixel_offset) +
-                               dady * (info->v0[0][1] - info->pixel_offset)));
-}
-
-
-/**
- * Compute a0, dadx and dady for a perspective-corrected interpolant,
- * for a triangle.
- * We basically multiply the vertex value by 1/w before computing
- * the plane coefficients (a0, dadx, dady).
- * Later, when we compute the value at a particular fragment position we'll
- * divide the interpolated value by the interpolated W at that fragment.
- */
-static void perspective_coef( struct lp_rast_triangle *tri,
-                              const struct tri_info *info,
-                              unsigned slot,
-			      unsigned vert_attr,
-                              unsigned i)
-{
-   /* premultiply by 1/w  (v[0][3] is always 1/w):
-    */
-   float a0 = info->v0[vert_attr][i] * info->v0[0][3];
-   float a1 = info->v1[vert_attr][i] * info->v1[0][3];
-   float a2 = info->v2[vert_attr][i] * info->v2[0][3];
-   float da01 = a0 - a1;
-   float da20 = a2 - a0;
-   float dadx = (da01 * info->dy20 - info->dy01 * da20) * info->oneoverarea;
-   float dady = (da20 * info->dx01 - info->dx20 * da01) * info->oneoverarea;
-
-   tri->inputs.dadx[slot][i] = dadx;
-   tri->inputs.dady[slot][i] = dady;
-   tri->inputs.a0[slot][i] = (a0 -
-                              (dadx * (info->v0[0][0] - info->pixel_offset) +
-                               dady * (info->v0[0][1] - info->pixel_offset)));
-}
-
-
-/**
- * Special coefficient setup for gl_FragCoord.
- * X and Y are trivial
- * Z and W are copied from position_coef which should have already been computed.
- * We could do a bit less work if we'd examine gl_FragCoord's swizzle mask.
- */
-static void
-setup_fragcoord_coef(struct lp_rast_triangle *tri,
-                     const struct tri_info *info,
-                     unsigned slot,
-                     unsigned usage_mask)
-{
-   /*X*/
-   if (usage_mask & TGSI_WRITEMASK_X) {
-      tri->inputs.a0[slot][0] = 0.0;
-      tri->inputs.dadx[slot][0] = 1.0;
-      tri->inputs.dady[slot][0] = 0.0;
-   }
-
-   /*Y*/
-   if (usage_mask & TGSI_WRITEMASK_Y) {
-      tri->inputs.a0[slot][1] = 0.0;
-      tri->inputs.dadx[slot][1] = 0.0;
-      tri->inputs.dady[slot][1] = 1.0;
-   }
-
-   /*Z*/
-   if (usage_mask & TGSI_WRITEMASK_Z) {
-      linear_coef(tri, info, slot, 0, 2);
-   }
-
-   /*W*/
-   if (usage_mask & TGSI_WRITEMASK_W) {
-      linear_coef(tri, info, slot, 0, 3);
-   }
-}
-
-
-/**
- * Setup the fragment input attribute with the front-facing value.
- * \param frontface  is the triangle front facing?
- */
-static void setup_facing_coef( struct lp_rast_triangle *tri,
-                               unsigned slot,
-                               boolean frontface,
-                               unsigned usage_mask)
-{
-   /* convert TRUE to 1.0 and FALSE to -1.0 */
-   if (usage_mask & TGSI_WRITEMASK_X)
-      constant_coef( tri, slot, 2.0f * frontface - 1.0f, 0 );
-
-   if (usage_mask & TGSI_WRITEMASK_Y)
-      constant_coef( tri, slot, 0.0f, 1 ); /* wasted */
-
-   if (usage_mask & TGSI_WRITEMASK_Z)
-      constant_coef( tri, slot, 0.0f, 2 ); /* wasted */
-
-   if (usage_mask & TGSI_WRITEMASK_W)
-      constant_coef( tri, slot, 0.0f, 3 ); /* wasted */
-}
-
-
-/**
- * Compute the tri->coef[] array dadx, dady, a0 values.
- */
-static void setup_tri_coefficients( struct lp_setup_context *setup,
-				    struct lp_rast_triangle *tri,
-                                    const struct tri_info *info)
-{
-   unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ;
-   unsigned slot;
-   unsigned i;
-
-   /* setup interpolation for all the remaining attributes:
-    */
-   for (slot = 0; slot < setup->fs.nr_inputs; slot++) {
-      unsigned vert_attr = setup->fs.input[slot].src_index;
-      unsigned usage_mask = setup->fs.input[slot].usage_mask;
-
-      switch (setup->fs.input[slot].interp) {
-      case LP_INTERP_CONSTANT:
-         if (setup->flatshade_first) {
-            for (i = 0; i < NUM_CHANNELS; i++)
-               if (usage_mask & (1 << i))
-                  constant_coef(tri, slot+1, info->v0[vert_attr][i], i);
-         }
-         else {
-            for (i = 0; i < NUM_CHANNELS; i++)
-               if (usage_mask & (1 << i))
-                  constant_coef(tri, slot+1, info->v2[vert_attr][i], i);
-         }
-         break;
-
-      case LP_INTERP_LINEAR:
-         for (i = 0; i < NUM_CHANNELS; i++)
-            if (usage_mask & (1 << i))
-               linear_coef(tri, info, slot+1, vert_attr, i);
-         break;
-
-      case LP_INTERP_PERSPECTIVE:
-         for (i = 0; i < NUM_CHANNELS; i++)
-            if (usage_mask & (1 << i))
-               perspective_coef(tri, info, slot+1, vert_attr, i);
-         fragcoord_usage_mask |= TGSI_WRITEMASK_W;
-         break;
-
-      case LP_INTERP_POSITION:
-         /*
-          * The generated pixel interpolators will pick up the coeffs from
-          * slot 0, so all need to ensure that the usage mask is covers all
-          * usages.
-          */
-         fragcoord_usage_mask |= usage_mask;
-         break;
-
-      case LP_INTERP_FACING:
-         setup_facing_coef(tri, slot+1, info->frontfacing, usage_mask);
-         break;
-
-      default:
-         assert(0);
-      }
-   }
-
-   /* The internal position input is in slot zero:
-    */
-   setup_fragcoord_coef(tri, info, 0, fragcoord_usage_mask);
-
-   if (0) {
-      for (i = 0; i < NUM_CHANNELS; i++) {
-         float a0   = tri->inputs.a0  [0][i];
-         float dadx = tri->inputs.dadx[0][i];
-         float dady = tri->inputs.dady[0][i];
-
-         debug_printf("POS.%c: a0 = %f, dadx = %f, dady = %f\n",
-                      "xyzw"[i],
-                      a0, dadx, dady);
-      }
-
-      for (slot = 0; slot < setup->fs.nr_inputs; slot++) {
-         unsigned usage_mask = setup->fs.input[slot].usage_mask;
-         for (i = 0; i < NUM_CHANNELS; i++) {
-            if (usage_mask & (1 << i)) {
-               float a0   = tri->inputs.a0  [1 + slot][i];
-               float dadx = tri->inputs.dadx[1 + slot][i];
-               float dady = tri->inputs.dady[1 + slot][i];
-
-               debug_printf("IN[%u].%c: a0 = %f, dadx = %f, dady = %f\n",
-                            slot,
-                            "xyzw"[i],
-                            a0, dadx, dady);
-            }
-         }
-      }
-   }
-}
-
-
 
 
 
@@ -329,11 +68,11 @@ static void setup_tri_coefficients( struct lp_setup_context *setup,
  * \param nr_inputs  number of fragment shader inputs
  * \return pointer to triangle space
  */
-static INLINE struct lp_rast_triangle *
-alloc_triangle(struct lp_scene *scene,
-               unsigned nr_inputs,
-               unsigned nr_planes,
-               unsigned *tri_size)
+struct lp_rast_triangle *
+lp_setup_alloc_triangle(struct lp_scene *scene,
+                        unsigned nr_inputs,
+                        unsigned nr_planes,
+                        unsigned *tri_size)
 {
    unsigned input_array_sz = NUM_CHANNELS * (nr_inputs + 1) * sizeof(float);
    struct lp_rast_triangle *tri;
@@ -357,35 +96,71 @@ alloc_triangle(struct lp_scene *scene,
    return tri;
 }
 
+void
+lp_setup_print_vertex(struct lp_setup_context *setup,
+                      const char *name,
+                      const float (*v)[4])
+{
+   int i, j;
+
+   debug_printf("   wpos (%s[0]) xyzw %f %f %f %f\n",
+                name,
+                v[0][0], v[0][1], v[0][2], v[0][3]);
+
+   for (i = 0; i < setup->fs.nr_inputs; i++) {
+      const float *in = v[setup->fs.input[i].src_index];
+
+      debug_printf("  in[%d] (%s[%d]) %s%s%s%s ",
+                   i, 
+                   name, setup->fs.input[i].src_index,
+                   (setup->fs.input[i].usage_mask & 0x1) ? "x" : " ",
+                   (setup->fs.input[i].usage_mask & 0x2) ? "y" : " ",
+                   (setup->fs.input[i].usage_mask & 0x4) ? "z" : " ",
+                   (setup->fs.input[i].usage_mask & 0x8) ? "w" : " ");
+
+      for (j = 0; j < 4; j++)
+         if (setup->fs.input[i].usage_mask & (1<<j))
+            debug_printf("%.5f ", in[j]);
+
+      debug_printf("\n");
+   }
+}
+
 
 /**
  * Print triangle vertex attribs (for debug).
  */
-static void
-print_triangle(struct lp_setup_context *setup,
-               const float (*v1)[4],
-               const float (*v2)[4],
-               const float (*v3)[4])
+void
+lp_setup_print_triangle(struct lp_setup_context *setup,
+                        const float (*v0)[4],
+                        const float (*v1)[4],
+                        const float (*v2)[4])
 {
-   uint i;
+   debug_printf("triangle\n");
 
-   debug_printf("llvmpipe triangle\n");
-   for (i = 0; i < 1 + setup->fs.nr_inputs; i++) {
-      debug_printf("  v1[%d]:  %f %f %f %f\n", i,
-                   v1[i][0], v1[i][1], v1[i][2], v1[i][3]);
-   }
-   for (i = 0; i < 1 + setup->fs.nr_inputs; i++) {
-      debug_printf("  v2[%d]:  %f %f %f %f\n", i,
-                   v2[i][0], v2[i][1], v2[i][2], v2[i][3]);
-   }
-   for (i = 0; i < 1 + setup->fs.nr_inputs; i++) {
-      debug_printf("  v3[%d]:  %f %f %f %f\n", i,
-                   v3[i][0], v3[i][1], v3[i][2], v3[i][3]);
+   {
+      const float ex = v0[0][0] - v2[0][0];
+      const float ey = v0[0][1] - v2[0][1];
+      const float fx = v1[0][0] - v2[0][0];
+      const float fy = v1[0][1] - v2[0][1];
+
+      /* det = cross(e,f).z */
+      const float det = ex * fy - ey * fx;
+      if (det < 0.0f) 
+         debug_printf("   - ccw\n");
+      else if (det > 0.0f)
+         debug_printf("   - cw\n");
+      else
+         debug_printf("   - zero area\n");
    }
+
+   lp_setup_print_vertex(setup, "v0", v0);
+   lp_setup_print_vertex(setup, "v1", v1);
+   lp_setup_print_vertex(setup, "v2", v2);
 }
 
 
-lp_rast_cmd lp_rast_tri_tab[8] = {
+lp_rast_cmd lp_rast_tri_tab[9] = {
    NULL,               /* should be impossible */
    lp_rast_triangle_1,
    lp_rast_triangle_2,
@@ -393,7 +168,8 @@ lp_rast_cmd lp_rast_tri_tab[8] = {
    lp_rast_triangle_4,
    lp_rast_triangle_5,
    lp_rast_triangle_6,
-   lp_rast_triangle_7
+   lp_rast_triangle_7,
+   lp_rast_triangle_8
 };
 
 /**
@@ -403,25 +179,27 @@ lp_rast_cmd lp_rast_tri_tab[8] = {
  */
 static void
 do_triangle_ccw(struct lp_setup_context *setup,
+		const float (*v0)[4],
 		const float (*v1)[4],
 		const float (*v2)[4],
-		const float (*v3)[4],
 		boolean frontfacing )
 {
-
    struct lp_scene *scene = lp_setup_get_current_scene(setup);
-   struct lp_fragment_shader_variant *variant = setup->fs.current.variant;
    struct lp_rast_triangle *tri;
-   struct tri_info info;
+   int x[3];
+   int y[3];
+   float dy01, dy20;
+   float dx01, dx20;
+   float oneoverarea;
+   struct lp_tri_info info;
    int area;
-   int minx, maxx, miny, maxy;
-   int ix0, ix1, iy0, iy1;
+   struct u_rect bbox;
    unsigned tri_bytes;
    int i;
    int nr_planes = 3;
       
    if (0)
-      print_triangle(setup, v1, v2, v3);
+      lp_setup_print_triangle(setup, v0, v1, v2);
 
    if (setup->scissor_test) {
       nr_planes = 7;
@@ -430,38 +208,73 @@ do_triangle_ccw(struct lp_setup_context *setup,
       nr_planes = 3;
    }
 
+   /* x/y positions in fixed point */
+   x[0] = subpixel_snap(v0[0][0] - setup->pixel_offset);
+   x[1] = subpixel_snap(v1[0][0] - setup->pixel_offset);
+   x[2] = subpixel_snap(v2[0][0] - setup->pixel_offset);
+   y[0] = subpixel_snap(v0[0][1] - setup->pixel_offset);
+   y[1] = subpixel_snap(v1[0][1] - setup->pixel_offset);
+   y[2] = subpixel_snap(v2[0][1] - setup->pixel_offset);
+
+
+   /* Bounding rectangle (in pixels) */
+   {
+      /* Yes this is necessary to accurately calculate bounding boxes
+       * with the two fill-conventions we support.  GL (normally) ends
+       * up needing a bottom-left fill convention, which requires
+       * slightly different rounding.
+       */
+      int adj = (setup->pixel_offset != 0) ? 1 : 0;
+
+      bbox.x0 = (MIN3(x[0], x[1], x[2]) + (FIXED_ONE-1)) >> FIXED_ORDER;
+      bbox.x1 = (MAX3(x[0], x[1], x[2]) + (FIXED_ONE-1)) >> FIXED_ORDER;
+      bbox.y0 = (MIN3(y[0], y[1], y[2]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER;
+      bbox.y1 = (MAX3(y[0], y[1], y[2]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER;
+
+      /* Inclusive coordinates:
+       */
+      bbox.x1--;
+      bbox.y1--;
+   }
+
+   if (bbox.x1 < bbox.x0 ||
+       bbox.y1 < bbox.y0) {
+      if (0) debug_printf("empty bounding box\n");
+      LP_COUNT(nr_culled_tris);
+      return;
+   }
+
+   if (!u_rect_test_intersection(&setup->draw_region, &bbox)) {
+      if (0) debug_printf("offscreen\n");
+      LP_COUNT(nr_culled_tris);
+      return;
+   }
+
+   u_rect_find_intersection(&setup->draw_region, &bbox);
 
-   tri = alloc_triangle(scene,
-                        setup->fs.nr_inputs,
-                        nr_planes,
-                        &tri_bytes);
+   tri = lp_setup_alloc_triangle(scene,
+                                 setup->fs.nr_inputs,
+                                 nr_planes,
+                                 &tri_bytes);
    if (!tri)
       return;
 
 #ifdef DEBUG
-   tri->v[0][0] = v1[0][0];
-   tri->v[1][0] = v2[0][0];
-   tri->v[2][0] = v3[0][0];
-   tri->v[0][1] = v1[0][1];
-   tri->v[1][1] = v2[0][1];
-   tri->v[2][1] = v3[0][1];
+   tri->v[0][0] = v0[0][0];
+   tri->v[1][0] = v1[0][0];
+   tri->v[2][0] = v2[0][0];
+   tri->v[0][1] = v0[0][1];
+   tri->v[1][1] = v1[0][1];
+   tri->v[2][1] = v2[0][1];
 #endif
 
-   /* x/y positions in fixed point */
-   info.x[0] = subpixel_snap(v1[0][0] - setup->pixel_offset);
-   info.x[1] = subpixel_snap(v2[0][0] - setup->pixel_offset);
-   info.x[2] = subpixel_snap(v3[0][0] - setup->pixel_offset);
-   info.y[0] = subpixel_snap(v1[0][1] - setup->pixel_offset);
-   info.y[1] = subpixel_snap(v2[0][1] - setup->pixel_offset);
-   info.y[2] = subpixel_snap(v3[0][1] - setup->pixel_offset);
-
-   tri->plane[0].dcdy = info.x[0] - info.x[1];
-   tri->plane[1].dcdy = info.x[1] - info.x[2];
-   tri->plane[2].dcdy = info.x[2] - info.x[0];
+   tri->plane[0].dcdy = x[0] - x[1];
+   tri->plane[1].dcdy = x[1] - x[2];
+   tri->plane[2].dcdy = x[2] - x[0];
 
-   tri->plane[0].dcdx = info.y[0] - info.y[1];
-   tri->plane[1].dcdx = info.y[1] - info.y[2];
-   tri->plane[2].dcdx = info.y[2] - info.y[0];
+   tri->plane[0].dcdx = y[0] - y[1];
+   tri->plane[1].dcdx = y[1] - y[2];
+   tri->plane[2].dcdx = y[2] - y[0];
 
    area = (tri->plane[0].dcdy * tri->plane[2].dcdx -
            tri->plane[2].dcdy * tri->plane[0].dcdx);
@@ -478,57 +291,29 @@ do_triangle_ccw(struct lp_setup_context *setup,
       return;
    }
 
-   /* Bounding rectangle (in pixels) */
-   {
-      /* Yes this is necessary to accurately calculate bounding boxes
-       * with the two fill-conventions we support.  GL (normally) ends
-       * up needing a bottom-left fill convention, which requires
-       * slightly different rounding.
-       */
-      int adj = (setup->pixel_offset != 0) ? 1 : 0;
-
-      minx = (MIN3(info.x[0], info.x[1], info.x[2]) + (FIXED_ONE-1)) >> FIXED_ORDER;
-      maxx = (MAX3(info.x[0], info.x[1], info.x[2]) + (FIXED_ONE-1)) >> FIXED_ORDER;
-      miny = (MIN3(info.y[0], info.y[1], info.y[2]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER;
-      maxy = (MAX3(info.y[0], info.y[1], info.y[2]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER;
-   }
-
-   if (setup->scissor_test) {
-      minx = MAX2(minx, setup->scissor.current.minx);
-      maxx = MIN2(maxx, setup->scissor.current.maxx);
-      miny = MAX2(miny, setup->scissor.current.miny);
-      maxy = MIN2(maxy, setup->scissor.current.maxy);
-   }
-   else {
-      minx = MAX2(minx, 0);
-      miny = MAX2(miny, 0);
-      maxx = MIN2(maxx, scene->fb.width);
-      maxy = MIN2(maxy, scene->fb.height);
-   }
-
-
-   if (miny >= maxy || minx >= maxx) {
-      lp_scene_putback_data( scene, tri_bytes );
-      LP_COUNT(nr_culled_tris);
-      return;
-   }
 
    /* 
     */
-   info.pixel_offset = setup->pixel_offset;
-   info.v0 = v1;
-   info.v1 = v2;
-   info.v2 = v3;
-   info.dx01 = info.v0[0][0] - info.v1[0][0];
-   info.dx20 = info.v2[0][0] - info.v0[0][0];
-   info.dy01 = info.v0[0][1] - info.v1[0][1];
-   info.dy20 = info.v2[0][1] - info.v0[0][1];
-   info.oneoverarea = 1.0f / (info.dx01 * info.dy20 - info.dx20 * info.dy01);
+   dx01 = v0[0][0] - v1[0][0];
+   dy01 = v0[0][1] - v1[0][1];
+   dx20 = v2[0][0] - v0[0][0];
+   dy20 = v2[0][1] - v0[0][1];
+   oneoverarea = 1.0f / (dx01 * dy20 - dx20 * dy01);
+
+   info.v0 = v0;
+   info.v1 = v1;
+   info.v2 = v2;
    info.frontfacing = frontfacing;
+   info.x0_center = v0[0][0] - setup->pixel_offset;
+   info.y0_center = v0[0][1] - setup->pixel_offset;
+   info.dx01_ooa  = dx01 * oneoverarea;
+   info.dx20_ooa  = dx20 * oneoverarea;
+   info.dy01_ooa  = dy01 * oneoverarea;
+   info.dy20_ooa  = dy20 * oneoverarea;
 
    /* Setup parameter interpolants:
     */
-   setup_tri_coefficients( setup, tri, &info );
+   lp_setup_tri_coef( setup, &tri->inputs, &info );
 
    tri->inputs.facing = frontfacing ? 1.0F : -1.0F;
    tri->inputs.state = setup->fs.stored;
@@ -541,7 +326,7 @@ do_triangle_ccw(struct lp_setup_context *setup,
       /* half-edge constants, will be interated over the whole render
        * target.
        */
-      plane->c = plane->dcdx * info.x[i] - plane->dcdy * info.y[i];
+      plane->c = plane->dcdx * x[i] - plane->dcdy * y[i];
 
       /* correct for top-left vs. bottom-left fill convention.  
        *
@@ -612,29 +397,43 @@ do_triangle_ccw(struct lp_setup_context *setup,
    if (nr_planes == 7) {
       tri->plane[3].dcdx = -1;
       tri->plane[3].dcdy = 0;
-      tri->plane[3].c = 1-minx;
+      tri->plane[3].c = 1-bbox.x0;
       tri->plane[3].ei = 0;
       tri->plane[3].eo = 1;
 
       tri->plane[4].dcdx = 1;
       tri->plane[4].dcdy = 0;
-      tri->plane[4].c = maxx;
+      tri->plane[4].c = bbox.x1+1;
       tri->plane[4].ei = -1;
       tri->plane[4].eo = 0;
 
       tri->plane[5].dcdx = 0;
       tri->plane[5].dcdy = 1;
-      tri->plane[5].c = 1-miny;
+      tri->plane[5].c = 1-bbox.y0;
       tri->plane[5].ei = 0;
       tri->plane[5].eo = 1;
 
       tri->plane[6].dcdx = 0;
       tri->plane[6].dcdy = -1;
-      tri->plane[6].c = maxy;
+      tri->plane[6].c = bbox.y1+1;
       tri->plane[6].ei = -1;
       tri->plane[6].eo = 0;
    }
 
+   lp_setup_bin_triangle( setup, tri, &bbox, nr_planes );
+}
+
+
+void
+lp_setup_bin_triangle( struct lp_setup_context *setup,
+                       struct lp_rast_triangle *tri,
+                       const struct u_rect *bbox,
+                       int nr_planes )
+{
+   struct lp_scene *scene = setup->scene;
+   struct lp_fragment_shader_variant *variant = setup->fs.current.variant;
+   int ix0, ix1, iy0, iy1;
+   int i;
 
    /*
     * All fields of 'tri' are now set.  The remaining code here is
@@ -643,10 +442,30 @@ do_triangle_ccw(struct lp_setup_context *setup,
 
    /* Convert to tile coordinates, and inclusive ranges:
     */
-   ix0 = minx / TILE_SIZE;
-   iy0 = miny / TILE_SIZE;
-   ix1 = (maxx-1) / TILE_SIZE;
-   iy1 = (maxy-1) / TILE_SIZE;
+   if (nr_planes == 3) {
+      int ix0 = bbox->x0 / 16;
+      int iy0 = bbox->y0 / 16;
+      int ix1 = bbox->x1 / 16;
+      int iy1 = bbox->y1 / 16;
+      
+      if (iy0 == iy1 && ix0 == ix1)
+      {
+
+	 /* Triangle is contained in a single 16x16 block:
+	  */
+	 int mask = (ix0 & 3) | ((iy0 & 3) << 4);
+
+	 lp_scene_bin_command( scene, ix0/4, iy0/4,
+			       lp_rast_triangle_3_16,
+			       lp_rast_arg_triangle(tri, mask) );
+	 return;
+      }
+   }
+
+   ix0 = bbox->x0 / TILE_SIZE;
+   iy0 = bbox->y0 / TILE_SIZE;
+   ix1 = bbox->x1 / TILE_SIZE;
+   iy1 = bbox->y1 / TILE_SIZE;
 
    /*
     * Clamp to framebuffer size
@@ -799,9 +618,10 @@ static void triangle_both( struct lp_setup_context *setup,
    const float fy = v1[0][1] - v2[0][1];
 
    /* det = cross(e,f).z */
-   if (ex * fy - ey * fx < 0.0f) 
+   const float det = ex * fy - ey * fx;
+   if (det < 0.0f) 
       triangle_ccw( setup, v0, v1, v2 );
-   else
+   else if (det > 0.0f)
       triangle_cw( setup, v0, v1, v2 );
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c
index 77bec4640b..edd723f65f 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_derived.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c
@@ -74,6 +74,15 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
       vs_index = draw_find_shader_output(llvmpipe->draw,
                                          lpfs->info.input_semantic_name[i],
                                          lpfs->info.input_semantic_index[i]);
+      if (vs_index < 0) {
+         /*
+          * This can happen with sprite coordinates - the vertex
+          * shader doesn't need to provide an output as we generate
+          * them internally.  However, lets keep pretending that there
+          * is something there to not confuse other code.
+          */
+         vs_index = 0;
+      }
 
       /* This can be pre-computed, except for flatshade:
        */
@@ -125,6 +134,17 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
       inputs[i].src_index = vinfo->num_attribs;
       draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index);
    }
+
+   /* Figure out if we need pointsize as well.
+    */
+   vs_index = draw_find_shader_output(llvmpipe->draw,
+                                      TGSI_SEMANTIC_PSIZE, 0);
+
+   if (vs_index > 0) {
+      llvmpipe->psize_slot = vinfo->num_attribs;
+      draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+   }
+
    llvmpipe->num_inputs = lpfs->info.num_inputs;
 
    draw_compute_vertex_size(vinfo);
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index dbca49a2ef..33c1a49efe 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -808,7 +808,7 @@ generate_variant(struct llvmpipe_context *lp,
    variant->list_item_local.base = variant;
    variant->no = shader->variants_created++;
 
-   memcpy(&variant->key, key, sizeof *key);
+   memcpy(&variant->key, key, shader->variant_key_size);
 
    if (gallivm_debug & GALLIVM_DEBUG_IR) {
       debug_printf("llvmpipe: Creating fragment shader #%u variant #%u:\n", 
@@ -840,6 +840,7 @@ llvmpipe_create_fs_state(struct pipe_context *pipe,
                          const struct pipe_shader_state *templ)
 {
    struct lp_fragment_shader *shader;
+   int nr_samplers;
 
    shader = CALLOC_STRUCT(lp_fragment_shader);
    if (!shader)
@@ -854,6 +855,11 @@ llvmpipe_create_fs_state(struct pipe_context *pipe,
    /* we need to keep a local copy of the tokens */
    shader->base.tokens = tgsi_dup_tokens(templ->tokens);
 
+   nr_samplers = shader->info.file_max[TGSI_FILE_SAMPLER] + 1;
+
+   shader->variant_key_size = Offset(struct lp_fragment_shader_variant_key,
+				     sampler[nr_samplers]);
+
    if (LP_DEBUG & DEBUG_TGSI) {
       unsigned attrib;
       debug_printf("llvmpipe: Create fragment shader #%u %p:\n", shader->no, (void *) shader);
@@ -921,7 +927,6 @@ static void
 llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
 {
    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
-   struct pipe_fence_handle *fence = NULL;
    struct lp_fragment_shader *shader = fs;
    struct lp_fs_variant_list_item *li;
 
@@ -934,12 +939,7 @@ llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
     * Flushing alone might not sufficient we need to wait on it too.
     */
 
-   llvmpipe_flush(pipe, 0, &fence);
-
-   if (fence) {
-      pipe->screen->fence_finish(pipe->screen, fence, 0);
-      pipe->screen->fence_reference(pipe->screen, &fence, NULL);
-   }
+   llvmpipe_finish(pipe, __FUNCTION__);
 
    li = first_elem(&shader->variants);
    while(!at_end(&shader->variants, li)) {
@@ -1027,7 +1027,7 @@ make_variant_key(struct llvmpipe_context *lp,
 {
    unsigned i;
 
-   memset(key, 0, sizeof *key);
+   memset(key, 0, shader->variant_key_size);
 
    if (lp->framebuffer.zsbuf) {
       if (lp->depth_stencil->depth.enabled) {
@@ -1097,9 +1097,17 @@ make_variant_key(struct llvmpipe_context *lp,
       }
    }
 
-   for(i = 0; i < PIPE_MAX_SAMPLERS; ++i)
-      if(shader->info.file_mask[TGSI_FILE_SAMPLER] & (1 << i))
-         lp_sampler_static_state(&key->sampler[i], lp->fragment_sampler_views[i], lp->sampler[i]);
+   /* This value will be the same for all the variants of a given shader:
+    */
+   key->nr_samplers = shader->info.file_max[TGSI_FILE_SAMPLER] + 1;
+
+   for(i = 0; i < key->nr_samplers; ++i) {
+      if(shader->info.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) {
+         lp_sampler_static_state(&key->sampler[i],
+				 lp->fragment_sampler_views[i],
+				 lp->sampler[i]);
+      }
+   }
 }
 
 /**
@@ -1118,7 +1126,7 @@ llvmpipe_update_fs(struct llvmpipe_context *lp)
 
    li = first_elem(&shader->variants);
    while(!at_end(&shader->variants, li)) {
-      if(memcmp(&li->base->key, &key, sizeof key) == 0) {
+      if(memcmp(&li->base->key, &key, shader->variant_key_size) == 0) {
          variant = li->base;
          break;
       }
@@ -1134,19 +1142,14 @@ llvmpipe_update_fs(struct llvmpipe_context *lp)
       unsigned i;
       if (lp->nr_fs_variants >= LP_MAX_SHADER_VARIANTS) {
          struct pipe_context *pipe = &lp->pipe;
-         struct pipe_fence_handle *fence = NULL;
 
          /*
           * XXX: we need to flush the context until we have some sort of reference
           * counting in fragment shaders as they may still be binned
           * Flushing alone might not be sufficient we need to wait on it too.
           */
-         llvmpipe_flush(pipe, 0, &fence);
+         llvmpipe_finish(pipe, __FUNCTION__);
 
-         if (fence) {
-            pipe->screen->fence_finish(pipe->screen, fence, 0);
-            pipe->screen->fence_reference(pipe->screen, &fence, NULL);
-         }
          for (i = 0; i < LP_MAX_SHADER_VARIANTS / 4; i++) {
             struct lp_fs_variant_list_item *item = last_elem(&lp->fs_variants_list);
             remove_shader_variant(lp, item->base);
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.h b/src/gallium/drivers/llvmpipe/lp_state_fs.h
index 37900fc544..33c480010d 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.h
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.h
@@ -53,13 +53,10 @@ struct lp_fragment_shader_variant_key
    struct pipe_blend_state blend;
    enum pipe_format zsbuf_format;
    unsigned nr_cbufs:8;
+   unsigned nr_samplers:8;	/* actually derivable from just the shader */
    unsigned flatshade:1;
    unsigned occlusion_count:1;
 
-   struct {
-      ubyte colormask;
-   } cbuf_blend[PIPE_MAX_COLOR_BUFS];
-
    struct lp_sampler_static_state sampler[PIPE_MAX_SAMPLERS];
 };
 
@@ -97,6 +94,7 @@ struct lp_fragment_shader
    struct lp_fs_variant_list_item variants;
 
    /* For debugging/profiling purposes */
+   unsigned variant_key_size;
    unsigned no;
    unsigned variants_created;
    unsigned variants_cached;
diff --git a/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c b/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
index afd3e0b21c..0bad7320f3 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
@@ -73,7 +73,13 @@ llvmpipe_bind_rasterizer_state(struct pipe_context *pipe, void *handle)
                    llvmpipe->rasterizer->gl_rasterization_rules);
       lp_setup_set_flatshade_first( llvmpipe->setup,
                    llvmpipe->rasterizer->flatshade_first);
-   }
+      lp_setup_set_line_state( llvmpipe->setup,
+                   llvmpipe->rasterizer->line_width);
+      lp_setup_set_point_state( llvmpipe->setup,
+                   llvmpipe->rasterizer->point_size,
+                   llvmpipe->rasterizer->point_size_per_vertex,
+                   llvmpipe->rasterizer->sprite_coord_enable);
+       }
 
    llvmpipe->dirty |= LP_NEW_RASTERIZER;
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_state_vertex.c b/src/gallium/drivers/llvmpipe/lp_state_vertex.c
index d86e66b4fb..fb29423dd3 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_vertex.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_vertex.c
@@ -100,7 +100,7 @@ llvmpipe_set_index_buffer(struct pipe_context *pipe,
    else
       memset(&llvmpipe->index_buffer, 0, sizeof(llvmpipe->index_buffer));
 
-   /* TODO make this more like a state */
+   draw_set_index_buffer(llvmpipe->draw, ib);
 }
 
 void
diff --git a/src/gallium/drivers/llvmpipe/lp_surface.c b/src/gallium/drivers/llvmpipe/lp_surface.c
index f761e82850..63ddc669c2 100644
--- a/src/gallium/drivers/llvmpipe/lp_surface.c
+++ b/src/gallium/drivers/llvmpipe/lp_surface.c
@@ -68,14 +68,16 @@ lp_resource_copy(struct pipe_context *pipe,
                            0, /* flush_flags */
                            FALSE, /* read_only */
                            TRUE, /* cpu_access */
-                           FALSE); /* do_not_block */
+                           FALSE,
+                           "blit dst"); /* do_not_block */
 
    llvmpipe_flush_resource(pipe,
                            src, subsrc.face, subsrc.level,
                            0, /* flush_flags */
                            TRUE, /* read_only */
                            TRUE, /* cpu_access */
-                           FALSE); /* do_not_block */
+                           FALSE,
+                           "blit src"); /* do_not_block */
 
    /*
    printf("surface copy from %u to %u: %u,%u to %u,%u %u x %u\n",
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c
index 25112c10a6..5832ea2744 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -67,6 +67,7 @@ resource_is_texture(const struct pipe_resource *resource)
       return FALSE;
    case PIPE_TEXTURE_1D:
    case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_RECT:
    case PIPE_TEXTURE_3D:
    case PIPE_TEXTURE_CUBE:
       return TRUE;
@@ -583,7 +584,8 @@ llvmpipe_get_transfer(struct pipe_context *pipe,
                                    0, /* flush_flags */
                                    read_only,
                                    TRUE, /* cpu_access */
-                                   do_not_block)) {
+                                   do_not_block,
+                                   "transfer dest")) {
          /*
           * It would have blocked, but state tracker requested no to.
           */