i965: Move repeat-instruction-suppression to batchbuffer core

Move the tracking of the last emitted instructions into the core batchbuffer routines and take advantage of the shadow batch copy to avoid extra memory allocations and copies. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
author: Chris Wilson <chris@chris-wilson.co.uk> 2011-02-20 13:23:47 +0000
committer: Chris Wilson <chris@chris-wilson.co.uk> 2011-02-21 12:59:35 +0000
commit: aac120977d1ead319141d48d65c9bba626ec03b8 (patch)
tree: d1ac3a43eb7d784883c6d8076cd3ca5b9bd53909
parent: 8d68a90e225d831a395ba788e425cb717eec1f9a (diff)
9 files changed, 119 insertions, 151 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_cc.c b/src/mesa/drivers/dri/i965/brw_cc.c
index c37376ef0d..b7048ecb39 100644
--- a/src/mesa/drivers/dri/i965/brw_cc.c
+++ b/src/mesa/drivers/dri/i965/brw_cc.c
@@ -233,18 +233,16 @@ const struct brw_tracked_state brw_cc_unit = {
 
 static void upload_blend_constant_color(struct brw_context *brw)
 {
-   struct gl_context *ctx = &brw->intel.ctx;
-   struct brw_blend_constant_color bcc;
-
-   memset(&bcc, 0, sizeof(bcc));
-   bcc.header.opcode = _3DSTATE_BLEND_CONSTANT_COLOR;
-   bcc.header.length = sizeof(bcc)/4-2;
-   bcc.blend_constant_color[0] = ctx->Color.BlendColor[0];
-   bcc.blend_constant_color[1] = ctx->Color.BlendColor[1];
-   bcc.blend_constant_color[2] = ctx->Color.BlendColor[2];
-   bcc.blend_constant_color[3] = ctx->Color.BlendColor[3];
+   struct intel_context *intel = &brw->intel;
+   struct gl_context *ctx = &intel->ctx;
 
-   BRW_CACHED_BATCH_STRUCT(brw, &bcc);
+   BEGIN_BATCH(5);
+   OUT_BATCH(_3DSTATE_BLEND_CONSTANT_COLOR << 16 | (5-2));
+   OUT_BATCH(ctx->Color.BlendColor[0]);
+   OUT_BATCH(ctx->Color.BlendColor[1]);
+   OUT_BATCH(ctx->Color.BlendColor[2]);
+   OUT_BATCH(ctx->Color.BlendColor[3]);
+   CACHED_BATCH();
 }
 
 const struct brw_tracked_state brw_blend_constant_color = {
diff --git a/src/mesa/drivers/dri/i965/brw_curbe.c b/src/mesa/drivers/dri/i965/brw_curbe.c
index 877b22fec1..ae11c487a2 100644
--- a/src/mesa/drivers/dri/i965/brw_curbe.c
+++ b/src/mesa/drivers/dri/i965/brw_curbe.c
@@ -146,22 +146,24 @@ const struct brw_tracked_state brw_curbe_offsets = {
  */
 void brw_upload_cs_urb_state(struct brw_context *brw)
 {
-   struct brw_cs_urb_state cs_urb;
-   memset(&cs_urb, 0, sizeof(cs_urb));
+   struct intel_context *intel = &brw->intel;
 
+   BEGIN_BATCH(2);
    /* It appears that this is the state packet for the CS unit, ie. the
     * urb entries detailed here are housed in the CS range from the
     * URB_FENCE command.
     */
-   cs_urb.header.opcode = CMD_CS_URB_STATE;
-   cs_urb.header.length = sizeof(cs_urb)/4 - 2;
+   OUT_BATCH(CMD_CS_URB_STATE << 16 | (2-2));
 
    /* BRW_NEW_URB_FENCE */
-   cs_urb.bits0.nr_urb_entries = brw->urb.nr_cs_entries;
-   cs_urb.bits0.urb_entry_size = brw->urb.csize - 1;
-
-   assert(brw->urb.nr_cs_entries);
-   BRW_CACHED_BATCH_STRUCT(brw, &cs_urb);
+   if (brw->urb.csize == 0) {
+      OUT_BATCH(0);
+   } else {
+      /* BRW_NEW_URB_FENCE */
+      assert(brw->urb.nr_cs_entries);
+      OUT_BATCH((brw->urb.csize - 1) << 4 | brw->urb.nr_cs_entries);
+   }
+   CACHED_BATCH();
 }
 
 static GLfloat fixed_plane[6][4] = {
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index c5c7409824..c768be23fa 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -301,16 +301,15 @@ const struct brw_tracked_state brw_depthbuffer = {
 
 static void upload_polygon_stipple(struct brw_context *brw)
 {
+   struct intel_context *intel = &brw->intel;
    struct gl_context *ctx = &brw->intel.ctx;
-   struct brw_polygon_stipple bps;
    GLuint i;
 
    if (!ctx->Polygon.StippleFlag)
       return;
 
-   memset(&bps, 0, sizeof(bps));
-   bps.header.opcode = _3DSTATE_POLY_STIPPLE_PATTERN;
-   bps.header.length = sizeof(bps)/4-2;
+   BEGIN_BATCH(33);
+   OUT_BATCH(_3DSTATE_POLY_STIPPLE_PATTERN << 16 | (33 - 2));
 
    /* Polygon stipple is provided in OpenGL order, i.e. bottom
     * row first.  If we're rendering to a window (i.e. the
@@ -321,14 +320,13 @@ static void upload_polygon_stipple(struct brw_context *brw)
     */
    if (ctx->DrawBuffer->Name == 0) {
       for (i = 0; i < 32; i++)
-         bps.stipple[i] = ctx->PolygonStipple[31 - i]; /* invert */
+	  OUT_BATCH(ctx->PolygonStipple[31 - i]); /* invert */
    }
    else {
       for (i = 0; i < 32; i++)
-         bps.stipple[i] = ctx->PolygonStipple[i]; /* don't invert */
+	 OUT_BATCH(ctx->PolygonStipple[i]);
    }
-
-   BRW_CACHED_BATCH_STRUCT(brw, &bps);
+   CACHED_BATCH();
 }
 
 const struct brw_tracked_state brw_polygon_stipple = {
@@ -347,15 +345,14 @@ const struct brw_tracked_state brw_polygon_stipple = {
 
 static void upload_polygon_stipple_offset(struct brw_context *brw)
 {
+   struct intel_context *intel = &brw->intel;
    struct gl_context *ctx = &brw->intel.ctx;
-   struct brw_polygon_stipple_offset bpso;
 
    if (!ctx->Polygon.StippleFlag)
       return;
 
-   memset(&bpso, 0, sizeof(bpso));
-   bpso.header.opcode = _3DSTATE_POLY_STIPPLE_OFFSET;
-   bpso.header.length = sizeof(bpso)/4-2;
+   BEGIN_BATCH(2);
+   OUT_BATCH(_3DSTATE_POLY_STIPPLE_OFFSET << 16 | (2-2));
 
    /* If we're drawing to a system window (ctx->DrawBuffer->Name == 0),
     * we have to invert the Y axis in order to match the OpenGL
@@ -365,16 +362,11 @@ static void upload_polygon_stipple_offset(struct brw_context *brw)
     * system works just fine, and there's no window system to
     * worry about.
     */
-   if (brw->intel.ctx.DrawBuffer->Name == 0) {
-      bpso.bits0.x_offset = 0;
-      bpso.bits0.y_offset = (32 - (ctx->DrawBuffer->Height & 31)) & 31;
-   }
-   else {
-      bpso.bits0.y_offset = 0;
-      bpso.bits0.x_offset = 0;
-   }
-
-   BRW_CACHED_BATCH_STRUCT(brw, &bpso);
+   if (brw->intel.ctx.DrawBuffer->Name == 0)
+      OUT_BATCH((32 - (ctx->DrawBuffer->Height & 31)) & 31);
+   else
+      OUT_BATCH(0);
+   CACHED_BATCH();
 }
 
 #define _NEW_WINDOW_POS 0x40000000
@@ -393,18 +385,17 @@ const struct brw_tracked_state brw_polygon_stipple_offset = {
  */
 static void upload_aa_line_parameters(struct brw_context *brw)
 {
+   struct intel_context *intel = &brw->intel;
    struct gl_context *ctx = &brw->intel.ctx;
-   struct brw_aa_line_parameters balp;
 
    if (!ctx->Line.SmoothFlag || !brw->has_aa_line_parameters)
       return;
 
+   OUT_BATCH(_3DSTATE_AA_LINE_PARAMETERS << 16 | (3 - 2));
    /* use legacy aa line coverage computation */
-   memset(&balp, 0, sizeof(balp));
-   balp.header.opcode = _3DSTATE_AA_LINE_PARAMETERS;
-   balp.header.length = sizeof(balp) / 4 - 2;
-   
-   BRW_CACHED_BATCH_STRUCT(brw, &balp);
+   OUT_BATCH(0);
+   OUT_BATCH(0);
+   CACHED_BATCH();
 }
 
 const struct brw_tracked_state brw_aa_line_parameters = {
@@ -422,28 +413,21 @@ const struct brw_tracked_state brw_aa_line_parameters = {
 
 static void upload_line_stipple(struct brw_context *brw)
 {
+   struct intel_context *intel = &brw->intel;
    struct gl_context *ctx = &brw->intel.ctx;
-   struct brw_line_stipple bls;
    GLfloat tmp;
    GLint tmpi;
 
    if (!ctx->Line.StippleFlag)
       return;
 
-   memset(&bls, 0, sizeof(bls));
-   bls.header.opcode = _3DSTATE_LINE_STIPPLE_PATTERN;
-   bls.header.length = sizeof(bls)/4 - 2;
-
-   bls.bits0.pattern = ctx->Line.StipplePattern;
-   bls.bits1.repeat_count = ctx->Line.StippleFactor;
-
+   BEGIN_BATCH(3);
+   OUT_BATCH(_3DSTATE_LINE_STIPPLE_PATTERN << 16 | (3 - 2));
+   OUT_BATCH(ctx->Line.StipplePattern);
    tmp = 1.0 / (GLfloat) ctx->Line.StippleFactor;
    tmpi = tmp * (1<<13);
-
-
-   bls.bits1.inverse_repeat_count = tmpi;
-
-   BRW_CACHED_BATCH_STRUCT(brw, &bls);
+   OUT_BATCH(tmpi << 16 | ctx->Line.StippleFactor);
+   CACHED_BATCH();
 }
 
 const struct brw_tracked_state brw_line_stipple = {
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 23d9e90c6e..86b0caa4a4 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -166,13 +166,7 @@ void brw_destroy_caches( struct brw_context *brw );
  */
 #define BRW_BATCH_STRUCT(brw, s) intel_batchbuffer_data(&brw->intel, (s), \
 							sizeof(*(s)), false)
-#define BRW_CACHED_BATCH_STRUCT(brw, s) brw_cached_batch_struct( brw, (s), sizeof(*(s)) )
 
-GLboolean brw_cached_batch_struct( struct brw_context *brw,
-				   const void *data,
-				   GLuint sz );
-void brw_destroy_batch_cache( struct brw_context *brw );
-void brw_clear_batch_cache( struct brw_context *brw );
 void *brw_state_batch(struct brw_context *brw,
 		      int size,
 		      int alignment,
diff --git a/src/mesa/drivers/dri/i965/brw_state_batch.c b/src/mesa/drivers/dri/i965/brw_state_batch.c
index f363a922c1..213c7a38d8 100644
--- a/src/mesa/drivers/dri/i965/brw_state_batch.c
+++ b/src/mesa/drivers/dri/i965/brw_state_batch.c
@@ -29,75 +29,10 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
      
-
-
 #include "brw_state.h"
 #include "intel_batchbuffer.h"
 #include "main/imports.h"
 
-
-
-/* A facility similar to the data caching code above, which aims to
- * prevent identical commands being issued repeatedly.
- */
-GLboolean brw_cached_batch_struct( struct brw_context *brw,
-				   const void *data,
-				   GLuint sz )
-{
-   struct brw_cached_batch_item *item = brw->cached_batch_items;
-   struct header *newheader = (struct header *)data;
-
-   if (brw->emit_state_always) {
-      intel_batchbuffer_data(&brw->intel, data, sz, false);
-      return GL_TRUE;
-   }
-
-   while (item) {
-      if (item->header->opcode == newheader->opcode) {
-	 if (item->sz == sz && memcmp(item->header, newheader, sz) == 0)
-	    return GL_FALSE;
-	 if (item->sz != sz) {
-	    free(item->header);
-	    item->header = malloc(sz);
-	    item->sz = sz;
-	 }
-	 goto emit;
-      }
-      item = item->next;
-   }
-
-   assert(!item);
-   item = CALLOC_STRUCT(brw_cached_batch_item);
-   item->header = malloc(sz);
-   item->sz = sz;
-   item->next = brw->cached_batch_items;
-   brw->cached_batch_items = item;
-
- emit:
-   memcpy(item->header, newheader, sz);
-   intel_batchbuffer_data(&brw->intel, data, sz, false);
-   return GL_TRUE;
-}
-
-void brw_clear_batch_cache( struct brw_context *brw )
-{
-   struct brw_cached_batch_item *item = brw->cached_batch_items;
-
-   while (item) {
-      struct brw_cached_batch_item *next = item->next;
-      free((void *)item->header);
-      free(item);
-      item = next;
-   }
-
-   brw->cached_batch_items = NULL;
-}
-
-void brw_destroy_batch_cache( struct brw_context *brw )
-{
-   brw_clear_batch_cache(brw);
-}
-
 /**
  * Allocates a block of space in the batchbuffer for indirect state.
  *
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 08b7ac9079..8bb92fed5d 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -176,7 +176,6 @@ void brw_init_state( struct brw_context *brw )
 void brw_destroy_state( struct brw_context *brw )
 {
    brw_destroy_caches(brw);
-   brw_destroy_batch_cache(brw);
 }
 
 /***********************************************************************
@@ -383,9 +382,6 @@ void brw_validate_state( struct brw_context *brw )
        state->brw == 0)
       return;
 
-   if (brw->state.dirty.brw & BRW_NEW_CONTEXT)
-      brw_clear_batch_cache(brw);
-
    brw->intel.Fallback = GL_FALSE; /* boolean, not bitfield */
 
    /* do prepare stage for all atoms */
diff --git a/src/mesa/drivers/dri/intel/intel_batchbuffer.c b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
index 7d224ae535..805f3a34e8 100644
--- a/src/mesa/drivers/dri/intel/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
@@ -33,6 +33,25 @@
 #include "intel_bufmgr.h"
 #include "intel_buffers.h"
 
+struct cached_batch_item {
+   struct cached_batch_item *next;
+   uint16_t header;
+   uint16_t size;
+};
+
+static void clear_cache( struct intel_context *intel )
+{
+   struct cached_batch_item *item = intel->batch.cached_items;
+
+   while (item) {
+      struct cached_batch_item *next = item->next;
+      free(item);
+      item = next;
+   }
+
+   intel->batch.cached_items = NULL;
+}
+
 void
 intel_batchbuffer_reset(struct intel_context *intel)
 {
@@ -40,6 +59,7 @@ intel_batchbuffer_reset(struct intel_context *intel)
       drm_intel_bo_unreference(intel->batch.bo);
       intel->batch.bo = NULL;
    }
+   clear_cache(intel);
 
    intel->batch.bo = drm_intel_bo_alloc(intel->bufmgr, "batchbuffer",
 					intel->maxBatchSize, 4096);
@@ -53,6 +73,7 @@ void
 intel_batchbuffer_free(struct intel_context *intel)
 {
    drm_intel_bo_unreference(intel->batch.bo);
+   clear_cache(intel);
 }
 
 
@@ -165,7 +186,8 @@ intel_batchbuffer_emit_reloc(struct intel_context *intel,
    ret = drm_intel_bo_emit_reloc(intel->batch.bo, 4*intel->batch.used,
 				 buffer, delta,
 				 read_domains, write_domain);
-   assert (ret == 0);
+   assert(ret == 0);
+   (void)ret;
 
    /*
     * Using the old buffer offset, write in what the right data would be, in case
@@ -191,7 +213,8 @@ intel_batchbuffer_emit_reloc_fenced(struct intel_context *intel,
    ret = drm_intel_bo_emit_reloc_fence(intel->batch.bo, 4*intel->batch.used,
 				       buffer, delta,
 				       read_domains, write_domain);
-   assert (ret == 0);
+   assert(ret == 0);
+   (void)ret;
 
    /*
     * Using the old buffer offset, write in what the right data would
@@ -213,6 +236,47 @@ intel_batchbuffer_data(struct intel_context *intel,
    intel->batch.used += bytes >> 2;
 }
 
+void
+intel_batchbuffer_cached_advance(struct intel_context *intel)
+{
+   struct cached_batch_item **prev = &intel->batch.cached_items, *item;
+   uint32_t sz = (intel->batch.used - intel->batch.emit) * sizeof(uint32_t);
+   uint32_t *start = intel->batch.map + intel->batch.emit;
+   uint16_t op = *start >> 16;
+
+   while (*prev) {
+      uint32_t *old;
+
+      item = *prev;
+      old = intel->batch.map + item->header;
+      if (op == *old >> 16) {
+	 if (item->size == sz && memcmp(old, start, sz) == 0) {
+	    if (prev != &intel->batch.cached_items) {
+	       *prev = item->next;
+	       item->next = intel->batch.cached_items;
+	       intel->batch.cached_items = item;
+	    }
+	    intel->batch.used = intel->batch.emit;
+	    return;
+	 }
+
+	 goto emit;
+      }
+      prev = &item->next;
+   }
+
+   item = malloc(sizeof(struct cached_batch_item));
+   if (item == NULL)
+      return;
+
+   item->next = intel->batch.cached_items;
+   intel->batch.cached_items = item;
+
+emit:
+   item->size = sz;
+   item->header = intel->batch.emit;
+}
+
 /* Emit a pipelined flush to either flush render and texture cache for
  * reading from a FBO-drawn texture, or flush so that frontbuffer
  * render appears on the screen in DRI1.
diff --git a/src/mesa/drivers/dri/intel/intel_batchbuffer.h b/src/mesa/drivers/dri/intel/intel_batchbuffer.h
index 7699715c0e..a0a5c9841c 100644
--- a/src/mesa/drivers/dri/intel/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/intel/intel_batchbuffer.h
@@ -101,9 +101,9 @@ intel_batchbuffer_begin(struct intel_context *intel, int n, bool is_blit)
 {
    intel_batchbuffer_require_space(intel, n * 4, is_blit);
 
+   intel->batch.emit = intel->batch.used;
 #ifdef DEBUG
-   intel->batch.emit.total = n;
-   intel->batch.emit.start_ptr = intel->batch.used;
+   intel->batch.total = n;
 #endif
 }
 
@@ -123,6 +123,8 @@ intel_batchbuffer_advance(struct intel_context *intel)
 #endif
 }
 
+void intel_batchbuffer_cached_advance(struct intel_context *intel);
+
 /* Here are the crusty old macros, to be removed:
  */
 #define BATCH_LOCALS
@@ -141,5 +143,6 @@ intel_batchbuffer_advance(struct intel_context *intel)
 } while (0)
 
 #define ADVANCE_BATCH() intel_batchbuffer_advance(intel);
+#define CACHED_BATCH() intel_batchbuffer_cached_advance(intel);
 
 #endif
diff --git a/src/mesa/drivers/dri/intel/intel_context.h b/src/mesa/drivers/dri/intel/intel_context.h
index bf2a0b4ead..6116764ad6 100644
--- a/src/mesa/drivers/dri/intel/intel_context.h
+++ b/src/mesa/drivers/dri/intel/intel_context.h
@@ -171,22 +171,14 @@ struct intel_context
 
    struct intel_batchbuffer {
       drm_intel_bo *bo;
+      struct cached_batch_item *cached_items;
 
-      uint16_t used;
-      uint16_t reserved_space;
+      uint16_t emit, total;
+      uint16_t used, reserved_space;
       uint32_t map[8192];
 #define BATCH_SZ (8192*sizeof(uint32_t))
 
       uint32_t state_batch_offset;
-
-#ifdef DEBUG
-      /** Tracking of BEGIN_BATCH()/OUT_BATCH()/ADVANCE_BATCH() debugging */
-      struct {
-	 uint16_t total;
-	 uint16_t start_ptr;
-      } emit;
-#endif
-
       bool is_blit;
    } batch;
author	Chris Wilson <chris@chris-wilson.co.uk>	2011-02-20 13:23:47 +0000
committer	Chris Wilson <chris@chris-wilson.co.uk>	2011-02-21 12:59:35 +0000
commit	aac120977d1ead319141d48d65c9bba626ec03b8 (patch)
tree	d1ac3a43eb7d784883c6d8076cd3ca5b9bd53909
parent	8d68a90e225d831a395ba788e425cb717eec1f9a (diff)