intel: use pwrite for batch

It's faster. Not only is the memcpy more efficiently performed in the kernel (making up for the system call overhead), but by not using mmap we remove the greater overhead of tracking the vma of every batch. And it means we can read back from the batch buffer without incurring the cost of a uncached read through the GTT. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
author: Chris Wilson <chris@chris-wilson.co.uk> 2011-02-10 20:25:51 +0000
committer: Chris Wilson <chris@chris-wilson.co.uk> 2011-02-21 12:59:35 +0000
commit: 8d68a90e225d831a395ba788e425cb717eec1f9a (patch)
tree: 1c46eb19125ebbb205625783dd184e0489231d01 /src/mesa/drivers/dri/intel/intel_batchbuffer.c
parent: 3f55683927278e57f3ef8a151d15f4cffdc060dc (diff)
1 files changed, 58 insertions, 108 deletions
diff --git a/src/mesa/drivers/dri/intel/intel_batchbuffer.c b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
index 5564e3ea2c..7d224ae535 100644
--- a/src/mesa/drivers/dri/intel/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
@@ -34,83 +34,61 @@
 #include "intel_buffers.h"
 
 void
-intel_batchbuffer_reset(struct intel_batchbuffer *batch)
+intel_batchbuffer_reset(struct intel_context *intel)
 {
-   struct intel_context *intel = batch->intel;
-
-   if (batch->buf != NULL) {
-      drm_intel_bo_unreference(batch->buf);
-      batch->buf = NULL;
+   if (intel->batch.bo != NULL) {
+      drm_intel_bo_unreference(intel->batch.bo);
+      intel->batch.bo = NULL;
    }
 
-   batch->buf = drm_intel_bo_alloc(intel->bufmgr, "batchbuffer",
-				   intel->maxBatchSize, 4096);
-   drm_intel_gem_bo_map_gtt(batch->buf);
-   batch->map = batch->buf->virtual;
-
-   batch->size = intel->maxBatchSize;
-   batch->ptr = batch->map;
-   batch->reserved_space = BATCH_RESERVED;
-   batch->dirty_state = ~0;
-   batch->state_batch_offset = batch->size;
-}
-
-struct intel_batchbuffer *
-intel_batchbuffer_alloc(struct intel_context *intel)
-{
-   struct intel_batchbuffer *batch = calloc(sizeof(*batch), 1);
-
-   batch->intel = intel;
-   intel_batchbuffer_reset(batch);
+   intel->batch.bo = drm_intel_bo_alloc(intel->bufmgr, "batchbuffer",
+					intel->maxBatchSize, 4096);
 
-   return batch;
+   intel->batch.reserved_space = BATCH_RESERVED;
+   intel->batch.state_batch_offset = intel->batch.bo->size;
+   intel->batch.used = 0;
 }
 
 void
-intel_batchbuffer_free(struct intel_batchbuffer *batch)
+intel_batchbuffer_free(struct intel_context *intel)
 {
-   if (batch->map) {
-      drm_intel_gem_bo_unmap_gtt(batch->buf);
-      batch->map = NULL;
-   }
-   dri_bo_unreference(batch->buf);
-   batch->buf = NULL;
-   free(batch);
+   drm_intel_bo_unreference(intel->batch.bo);
 }
 
 
-
 /* TODO: Push this whole function into bufmgr.
  */
 static void
-do_flush_locked(struct intel_batchbuffer *batch, GLuint used)
+do_flush_locked(struct intel_context *intel)
 {
-   struct intel_context *intel = batch->intel;
+   struct intel_batchbuffer *batch = &intel->batch;
    int ret = 0;
-   int x_off = 0, y_off = 0;
-
-   drm_intel_gem_bo_unmap_gtt(batch->buf);
-
-   batch->ptr = NULL;
 
    if (!intel->intelScreen->no_hw) {
       int ring;
 
-      if (intel->gen < 6 || !intel->batch->is_blit) {
+      if (intel->gen < 6 || !batch->is_blit) {
 	 ring = I915_EXEC_RENDER;
       } else {
 	 ring = I915_EXEC_BLT;
       }
 
-      drm_intel_bo_mrb_exec(batch->buf, used, NULL, 0,
-			    (x_off & 0xffff) | (y_off << 16), ring);
+      ret = drm_intel_bo_subdata(batch->bo, 0, 4*batch->used, batch->map);
+      if (ret == 0 && batch->state_batch_offset != batch->bo->size) {
+	 ret = drm_intel_bo_subdata(batch->bo,
+				    batch->state_batch_offset,
+				    batch->bo->size - batch->state_batch_offset,
+				    (char *)batch->map + batch->state_batch_offset);
+      }
+
+      if (ret == 0)
+	 ret = drm_intel_bo_mrb_exec(batch->bo, 4*batch->used, NULL, 0, 0, ring);
    }
 
    if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
-      drm_intel_bo_map(batch->buf, GL_FALSE);
-      intel_decode(batch->buf->virtual, used / 4, batch->buf->offset,
+      intel_decode(batch->map, batch->used,
+		   batch->bo->offset,
 		   intel->intelScreen->deviceID, GL_TRUE);
-      drm_intel_bo_unmap(batch->buf);
 
       if (intel->vtbl.debug_batch != NULL)
 	 intel->vtbl.debug_batch(intel);
@@ -123,55 +101,33 @@ do_flush_locked(struct intel_batchbuffer *batch, GLuint used)
 }
 
 void
-_intel_batchbuffer_flush(struct intel_batchbuffer *batch, const char *file,
-			 int line)
+_intel_batchbuffer_flush(struct intel_context *intel,
+			 const char *file, int line)
 {
-   struct intel_context *intel = batch->intel;
-   GLuint used = batch->ptr - batch->map;
+   if (intel->batch.used == 0)
+      return;
 
    if (intel->first_post_swapbuffers_batch == NULL) {
-      intel->first_post_swapbuffers_batch = intel->batch->buf;
+      intel->first_post_swapbuffers_batch = intel->batch.bo;
       drm_intel_bo_reference(intel->first_post_swapbuffers_batch);
    }
 
-   if (used == 0)
-      return;
-
    if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
       fprintf(stderr, "%s:%d: Batchbuffer flush with %db used\n", file, line,
-	      used);
+	      4*intel->batch.used);
 
-   batch->reserved_space = 0;
+   intel->batch.reserved_space = 0;
 
    if (intel->always_flush_cache) {
-      intel_batchbuffer_emit_mi_flush(batch);
-      used = batch->ptr - batch->map;
-   }
-
-   /* Round batchbuffer usage to 2 DWORDs. */
-
-   if ((used & 4) == 0) {
-      *(GLuint *) (batch->ptr) = 0; /* noop */
-      batch->ptr += 4;
-      used = batch->ptr - batch->map;
+      intel_batchbuffer_emit_mi_flush(intel);
    }
 
    /* Mark the end of the buffer. */
-   *(GLuint *) (batch->ptr) = MI_BATCH_BUFFER_END;
-   batch->ptr += 4;
-   used = batch->ptr - batch->map;
-   assert (used <= batch->buf->size);
-
-   /* Workaround for recursive batchbuffer flushing: If the window is
-    * moved, we can get into a case where we try to flush during a
-    * flush.  What happens is that when we try to grab the lock for
-    * the first flush, we detect that the window moved which then
-    * causes another flush (from the intel_draw_buffer() call in
-    * intelUpdatePageFlipping()).  To work around this we reset the
-    * batchbuffer tail pointer before trying to get the lock.  This
-    * prevent the nested buffer flush, but a better fix would be to
-    * avoid that in the first place. */
-   batch->ptr = batch->map;
+   intel_batchbuffer_emit_dword(intel, MI_BATCH_BUFFER_END);
+   if (intel->batch.used & 1) {
+      /* Round batchbuffer usage to 2 DWORDs. */
+      intel_batchbuffer_emit_dword(intel, MI_NOOP);
+   }
 
    if (intel->vtbl.finish_batch)
       intel->vtbl.finish_batch(intel);
@@ -181,24 +137,23 @@ _intel_batchbuffer_flush(struct intel_batchbuffer *batch, const char *file,
    /* Check that we didn't just wrap our batchbuffer at a bad time. */
    assert(!intel->no_batch_wrap);
 
-   do_flush_locked(batch, used);
+   do_flush_locked(intel);
 
    if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
       fprintf(stderr, "waiting for idle\n");
-      drm_intel_bo_map(batch->buf, GL_TRUE);
-      drm_intel_bo_unmap(batch->buf);
+      drm_intel_bo_wait_rendering(intel->batch.bo);
    }
 
    /* Reset the buffer:
     */
-   intel_batchbuffer_reset(batch);
+   intel_batchbuffer_reset(intel);
 }
 
 
 /*  This is the only way buffers get added to the validate list.
  */
 GLboolean
-intel_batchbuffer_emit_reloc(struct intel_batchbuffer *batch,
+intel_batchbuffer_emit_reloc(struct intel_context *intel,
                              drm_intel_bo *buffer,
                              uint32_t read_domains, uint32_t write_domain,
 			     uint32_t delta)
@@ -207,58 +162,55 @@ intel_batchbuffer_emit_reloc(struct intel_batchbuffer *batch,
 
    assert(delta < buffer->size);
 
-   if (batch->ptr - batch->map > batch->buf->size)
-    printf ("bad relocation ptr %p map %p offset %d size %lu\n",
-	    batch->ptr, batch->map, batch->ptr - batch->map, batch->buf->size);
-   ret = drm_intel_bo_emit_reloc(batch->buf, batch->ptr - batch->map,
+   ret = drm_intel_bo_emit_reloc(intel->batch.bo, 4*intel->batch.used,
 				 buffer, delta,
 				 read_domains, write_domain);
+   assert (ret == 0);
 
    /*
     * Using the old buffer offset, write in what the right data would be, in case
     * the buffer doesn't move and we can short-circuit the relocation processing
     * in the kernel
     */
-   intel_batchbuffer_emit_dword (batch, buffer->offset + delta);
+   intel_batchbuffer_emit_dword(intel, buffer->offset + delta);
 
    return GL_TRUE;
 }
 
 GLboolean
-intel_batchbuffer_emit_reloc_fenced(struct intel_batchbuffer *batch,
+intel_batchbuffer_emit_reloc_fenced(struct intel_context *intel,
 				    drm_intel_bo *buffer,
-				    uint32_t read_domains, uint32_t write_domain,
+				    uint32_t read_domains,
+				    uint32_t write_domain,
 				    uint32_t delta)
 {
    int ret;
 
    assert(delta < buffer->size);
 
-   if (batch->ptr - batch->map > batch->buf->size)
-    printf ("bad relocation ptr %p map %p offset %d size %lu\n",
-	    batch->ptr, batch->map, batch->ptr - batch->map, batch->buf->size);
-   ret = drm_intel_bo_emit_reloc_fence(batch->buf, batch->ptr - batch->map,
+   ret = drm_intel_bo_emit_reloc_fence(intel->batch.bo, 4*intel->batch.used,
 				       buffer, delta,
 				       read_domains, write_domain);
+   assert (ret == 0);
 
    /*
     * Using the old buffer offset, write in what the right data would
     * be, in case the buffer doesn't move and we can short-circuit the
     * relocation processing in the kernel
     */
-   intel_batchbuffer_emit_dword (batch, buffer->offset + delta);
+   intel_batchbuffer_emit_dword(intel, buffer->offset + delta);
 
    return GL_TRUE;
 }
 
 void
-intel_batchbuffer_data(struct intel_batchbuffer *batch,
+intel_batchbuffer_data(struct intel_context *intel,
                        const void *data, GLuint bytes, bool is_blit)
 {
    assert((bytes & 3) == 0);
-   intel_batchbuffer_require_space(batch, bytes, is_blit);
-   __memcpy(batch->ptr, data, bytes);
-   batch->ptr += bytes;
+   intel_batchbuffer_require_space(intel, bytes, is_blit);
+   __memcpy(intel->batch.map + intel->batch.used, data, bytes);
+   intel->batch.used += bytes >> 2;
 }
 
 /* Emit a pipelined flush to either flush render and texture cache for
@@ -268,12 +220,10 @@ intel_batchbuffer_data(struct intel_batchbuffer *batch,
  * This is also used for the always_flush_cache driconf debug option.
  */
 void
-intel_batchbuffer_emit_mi_flush(struct intel_batchbuffer *batch)
+intel_batchbuffer_emit_mi_flush(struct intel_context *intel)
 {
-   struct intel_context *intel = batch->intel;
-
    if (intel->gen >= 6) {
-      if (intel->batch->is_blit) {
+      if (intel->batch.is_blit) {
 	 BEGIN_BATCH_BLT(4);
 	 OUT_BATCH(MI_FLUSH_DW);
 	 OUT_BATCH(0);
author	Chris Wilson <chris@chris-wilson.co.uk>	2011-02-10 20:25:51 +0000
committer	Chris Wilson <chris@chris-wilson.co.uk>	2011-02-21 12:59:35 +0000
commit	8d68a90e225d831a395ba788e425cb717eec1f9a (patch)
tree	1c46eb19125ebbb205625783dd184e0489231d01 /src/mesa/drivers/dri/intel/intel_batchbuffer.c
parent	3f55683927278e57f3ef8a151d15f4cffdc060dc (diff)