r300g: offload the CS ioctl to another thread

This is a multi-threading optimization which hides the kernel overhead behind a thread. It improves performance in CPU-limited apps by 2-15%. Of course you must have at least 2 cores for it to make any difference. It can be disabled with: export RADEON_THREAD=0
author: Marek Olšák <maraeo@gmail.com> 2011-02-15 05:43:44 +0100
committer: Marek Olšák <maraeo@gmail.com> 2011-02-15 09:17:39 +0100
commit: b9e2cde6006b557a3a23a82384899f4d5a5ac7b8 (patch)
tree: a71d951e991ee3161fa1c2acc15ec58a79056caa /src/gallium/winsys/radeon/drm/radeon_drm_cs.c
parent: 8e0437914bb786d0b05be8f95e4ff37bf5a19f44 (diff)
1 files changed, 172 insertions, 94 deletions
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index 5b2a17c856..b4f5c9f6a8 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -73,6 +73,63 @@
 
 #define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))
 
+static boolean radeon_init_cs_context(struct radeon_cs_context *csc, int fd)
+{
+    csc->fd = fd;
+    csc->nrelocs = 512;
+    csc->relocs_bo = (struct radeon_bo**)
+                     CALLOC(1, csc->nrelocs * sizeof(struct radeon_bo*));
+    if (!csc->relocs_bo) {
+        return FALSE;
+    }
+
+    csc->relocs = (struct drm_radeon_cs_reloc*)
+                  CALLOC(1, csc->nrelocs * sizeof(struct drm_radeon_cs_reloc));
+    if (!csc->relocs) {
+        FREE(csc->relocs_bo);
+        return FALSE;
+    }
+
+    csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
+    csc->chunks[0].length_dw = 0;
+    csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
+    csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
+    csc->chunks[1].length_dw = 0;
+    csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
+
+    csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
+    csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
+
+    csc->cs.num_chunks = 2;
+    csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;
+    return TRUE;
+}
+
+static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
+{
+    unsigned i;
+
+    for (i = 0; i < csc->crelocs; i++) {
+        radeon_bo_unref(csc->relocs_bo[i]);
+        p_atomic_dec(&csc->relocs_bo[i]->num_cs_references);
+        csc->relocs_bo[i] = NULL;
+    }
+
+    csc->crelocs = 0;
+    csc->chunks[0].length_dw = 0;
+    csc->chunks[1].length_dw = 0;
+    csc->used_gart = 0;
+    csc->used_vram = 0;
+    memset(csc->is_handle_added, 0, sizeof(csc->is_handle_added));
+}
+
+static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
+{
+    radeon_cs_context_cleanup(csc);
+    FREE(csc->relocs_bo);
+    FREE(csc->relocs);
+}
+
 static struct r300_winsys_cs *radeon_drm_cs_create(struct r300_winsys_screen *rws)
 {
     struct radeon_drm_winsys *ws = radeon_drm_winsys(rws);
@@ -84,35 +141,29 @@ static struct r300_winsys_cs *radeon_drm_cs_create(struct r300_winsys_screen *rw
     }
 
     cs->ws = ws;
-    cs->nrelocs = 256;
-    cs->relocs_bo = (struct radeon_bo**)
-                     CALLOC(1, cs->nrelocs * sizeof(struct radeon_bo*));
-    if (!cs->relocs_bo) {
+
+    if (!radeon_init_cs_context(&cs->csc1, cs->ws->fd)) {
         FREE(cs);
         return NULL;
     }
-
-    cs->relocs = (struct drm_radeon_cs_reloc*)
-                 CALLOC(1, cs->nrelocs * sizeof(struct drm_radeon_cs_reloc));
-    if (!cs->relocs) {
-        FREE(cs->relocs_bo);
+    if (!radeon_init_cs_context(&cs->csc2, cs->ws->fd)) {
+        radeon_destroy_cs_context(&cs->csc1);
         FREE(cs);
         return NULL;
     }
 
-    cs->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
-    cs->chunks[0].length_dw = 0;
-    cs->chunks[0].chunk_data = (uint64_t)(uintptr_t)cs->base.buf;
-    cs->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
-    cs->chunks[1].length_dw = 0;
-    cs->chunks[1].chunk_data = (uint64_t)(uintptr_t)cs->relocs;
+    /* Set the first command buffer as current. */
+    cs->csc = &cs->csc1;
+    cs->cst = &cs->csc2;
+    cs->base.buf = cs->csc->buf;
+
     p_atomic_inc(&ws->num_cs);
     return &cs->base;
 }
 
 #define OUT_CS(cs, value) (cs)->buf[(cs)->cdw++] = (value)
 
-static inline void update_domains(struct drm_radeon_cs_reloc *reloc,
+static INLINE void update_domains(struct drm_radeon_cs_reloc *reloc,
                                   enum r300_buffer_domain rd,
                                   enum r300_buffer_domain wd,
                                   enum r300_buffer_domain *added_domains)
@@ -131,22 +182,22 @@ static inline void update_domains(struct drm_radeon_cs_reloc *reloc,
     }
 }
 
-int radeon_get_reloc(struct radeon_drm_cs *cs, struct radeon_bo *bo)
+int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo)
 {
     struct drm_radeon_cs_reloc *reloc;
     unsigned i;
-    unsigned hash = bo->handle & (sizeof(cs->is_handle_added)-1);
+    unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);
 
-    if (cs->is_handle_added[hash]) {
-        reloc = cs->relocs_hashlist[hash];
+    if (csc->is_handle_added[hash]) {
+        reloc = csc->relocs_hashlist[hash];
         if (reloc->handle == bo->handle) {
-            return cs->reloc_indices_hashlist[hash];
+            return csc->reloc_indices_hashlist[hash];
         }
 
         /* Hash collision, look for the BO in the list of relocs linearly. */
-        for (i = cs->crelocs; i != 0;) {
+        for (i = csc->crelocs; i != 0;) {
             --i;
-            reloc = &cs->relocs[i];
+            reloc = &csc->relocs[i];
             if (reloc->handle == bo->handle) {
                 /* Put this reloc in the hash list.
                  * This will prevent additional hash collisions if there are
@@ -157,8 +208,8 @@ int radeon_get_reloc(struct radeon_drm_cs *cs, struct radeon_bo *bo)
                  *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
                  * will collide here: ^ and here:   ^,
                  * meaning that we should get very few collisions in the end. */
-                cs->relocs_hashlist[hash] = reloc;
-                cs->reloc_indices_hashlist[hash] = i;
+                csc->relocs_hashlist[hash] = reloc;
+                csc->reloc_indices_hashlist[hash] = i;
                 /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
                 return i;
             }
@@ -168,7 +219,7 @@ int radeon_get_reloc(struct radeon_drm_cs *cs, struct radeon_bo *bo)
     return -1;
 }
 
-static void radeon_add_reloc(struct radeon_drm_cs *cs,
+static void radeon_add_reloc(struct radeon_cs_context *csc,
                              struct radeon_bo *bo,
                              enum r300_buffer_domain rd,
                              enum r300_buffer_domain wd,
@@ -176,24 +227,24 @@ static void radeon_add_reloc(struct radeon_drm_cs *cs,
 {
     struct drm_radeon_cs_reloc *reloc;
     unsigned i;
-    unsigned hash = bo->handle & (sizeof(cs->is_handle_added)-1);
+    unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);
 
-    if (cs->is_handle_added[hash]) {
-        reloc = cs->relocs_hashlist[hash];
+    if (csc->is_handle_added[hash]) {
+        reloc = csc->relocs_hashlist[hash];
         if (reloc->handle == bo->handle) {
             update_domains(reloc, rd, wd, added_domains);
             return;
         }
 
         /* Hash collision, look for the BO in the list of relocs linearly. */
-        for (i = cs->crelocs; i != 0;) {
+        for (i = csc->crelocs; i != 0;) {
             --i;
-            reloc = &cs->relocs[i];
+            reloc = &csc->relocs[i];
             if (reloc->handle == bo->handle) {
                 update_domains(reloc, rd, wd, added_domains);
 
-                cs->relocs_hashlist[hash] = reloc;
-                cs->reloc_indices_hashlist[hash] = i;
+                csc->relocs_hashlist[hash] = reloc;
+                csc->reloc_indices_hashlist[hash] = i;
                 /*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
                 return;
             }
@@ -201,35 +252,35 @@ static void radeon_add_reloc(struct radeon_drm_cs *cs,
     }
 
     /* New relocation, check if the backing array is large enough. */
-    if (cs->crelocs >= cs->nrelocs) {
+    if (csc->crelocs >= csc->nrelocs) {
         uint32_t size;
-        cs->nrelocs += 10;
+        csc->nrelocs += 10;
 
-        size = cs->nrelocs * sizeof(struct radeon_bo*);
-        cs->relocs_bo = (struct radeon_bo**)realloc(cs->relocs_bo, size);
+        size = csc->nrelocs * sizeof(struct radeon_bo*);
+        csc->relocs_bo = (struct radeon_bo**)realloc(csc->relocs_bo, size);
 
-        size = cs->nrelocs * sizeof(struct drm_radeon_cs_reloc);
-        cs->relocs = (struct drm_radeon_cs_reloc*)realloc(cs->relocs, size);
+        size = csc->nrelocs * sizeof(struct drm_radeon_cs_reloc);
+        csc->relocs = (struct drm_radeon_cs_reloc*)realloc(csc->relocs, size);
 
-        cs->chunks[1].chunk_data = (uint64_t)(uintptr_t)cs->relocs;
+        csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
     }
 
     /* Initialize the new relocation. */
     radeon_bo_ref(bo);
     p_atomic_inc(&bo->num_cs_references);
-    cs->relocs_bo[cs->crelocs] = bo;
-    reloc = &cs->relocs[cs->crelocs];
+    csc->relocs_bo[csc->crelocs] = bo;
+    reloc = &csc->relocs[csc->crelocs];
     reloc->handle = bo->handle;
     reloc->read_domains = rd;
     reloc->write_domain = wd;
     reloc->flags = 0;
 
-    cs->is_handle_added[hash] = TRUE;
-    cs->relocs_hashlist[hash] = reloc;
-    cs->reloc_indices_hashlist[hash] = cs->crelocs;
+    csc->is_handle_added[hash] = TRUE;
+    csc->relocs_hashlist[hash] = reloc;
+    csc->reloc_indices_hashlist[hash] = csc->crelocs;
 
-    cs->chunks[1].length_dw += RELOC_DWORDS;
-    cs->crelocs++;
+    csc->chunks[1].length_dw += RELOC_DWORDS;
+    csc->crelocs++;
 
     *added_domains = rd | wd;
 }
@@ -243,23 +294,23 @@ static void radeon_drm_cs_add_reloc(struct r300_winsys_cs *rcs,
     struct radeon_bo *bo = (struct radeon_bo*)buf;
     enum r300_buffer_domain added_domains;
 
-    radeon_add_reloc(cs, bo, rd, wd, &added_domains);
+    radeon_add_reloc(cs->csc, bo, rd, wd, &added_domains);
 
     if (!added_domains)
         return;
 
     if (added_domains & R300_DOMAIN_GTT)
-        cs->used_gart += bo->size;
+        cs->csc->used_gart += bo->size;
     if (added_domains & R300_DOMAIN_VRAM)
-        cs->used_vram += bo->size;
+        cs->csc->used_vram += bo->size;
 }
 
 static boolean radeon_drm_cs_validate(struct r300_winsys_cs *rcs)
 {
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
 
-    return cs->used_gart < cs->ws->gart_size * 0.8 &&
-           cs->used_vram < cs->ws->vram_size * 0.8;
+    return cs->csc->used_gart < cs->ws->gart_size * 0.8 &&
+           cs->csc->used_vram < cs->ws->vram_size * 0.8;
 }
 
 static void radeon_drm_cs_write_reloc(struct r300_winsys_cs *rcs,
@@ -268,7 +319,7 @@ static void radeon_drm_cs_write_reloc(struct r300_winsys_cs *rcs,
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
     struct radeon_bo *bo = (struct radeon_bo*)buf;
 
-    unsigned index = radeon_get_reloc(cs, bo);
+    unsigned index = radeon_get_reloc(cs->csc, bo);
 
     if (index == -1) {
         fprintf(stderr, "r300: Cannot get a relocation in %s.\n", __func__);
@@ -279,63 +330,89 @@ static void radeon_drm_cs_write_reloc(struct r300_winsys_cs *rcs,
     OUT_CS(&cs->base, index * RELOC_DWORDS);
 }
 
-static void radeon_drm_cs_emit(struct r300_winsys_cs *rcs)
+static PIPE_THREAD_ROUTINE(radeon_drm_cs_emit_async, param)
 {
-    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
-    uint64_t chunk_array[2];
+    struct radeon_cs_context *csc = (struct radeon_cs_context*)param;
     unsigned i;
-    int r;
 
-    if (cs->base.cdw) {
-        /* Prepare the arguments. */
-        cs->chunks[0].length_dw = cs->base.cdw;
-
-        chunk_array[0] = (uint64_t)(uintptr_t)&cs->chunks[0];
-        chunk_array[1] = (uint64_t)(uintptr_t)&cs->chunks[1];
-
-        cs->cs.num_chunks = 2;
-        cs->cs.chunks = (uint64_t)(uintptr_t)chunk_array;
-
-        /* Emit. */
-        r = drmCommandWriteRead(cs->ws->fd, DRM_RADEON_CS,
-                                &cs->cs, sizeof(struct drm_radeon_cs));
-        if (r) {
-            if (debug_get_bool_option("RADEON_DUMP_CS", FALSE)) {
-                fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
-                fprintf(stderr, "VENDORID:DEVICEID 0x%04X:0x%04X\n", 0x1002,
-                        cs->ws->pci_id);
-                for (i = 0; i < cs->base.cdw; i++) {
-                    fprintf(stderr, "0x%08X\n", cs->base.buf[i]);
-                }
-            } else {
-                fprintf(stderr, "radeon: The kernel rejected CS, "
-                                "see dmesg for more information.\n");
+    if (drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
+                            &csc->cs, sizeof(struct drm_radeon_cs))) {
+        if (debug_get_bool_option("RADEON_DUMP_CS", FALSE)) {
+            unsigned i;
+
+            fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
+            for (i = 0; i < csc->chunks[0].length_dw; i++) {
+                fprintf(stderr, "0x%08X\n", csc->buf[i]);
             }
+        } else {
+            fprintf(stderr, "radeon: The kernel rejected CS, "
+                    "see dmesg for more information.\n");
         }
     }
 
-    /* Unreference buffers, cleanup. */
-    for (i = 0; i < cs->crelocs; i++) {
-        radeon_bo_unref(cs->relocs_bo[i]);
-        p_atomic_dec(&cs->relocs_bo[i]->num_cs_references);
-        cs->relocs_bo[i] = NULL;
+    for (i = 0; i < csc->crelocs; i++)
+        p_atomic_dec(&csc->relocs_bo[i]->num_active_ioctls);
+    return NULL;
+}
+
+static void radeon_drm_cs_sync_flush(struct r300_winsys_cs *rcs)
+{
+    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
+
+    /* Wait for any pending ioctl to complete. */
+    if (cs->thread) {
+        pipe_thread_wait(cs->thread);
+        cs->thread = 0;
     }
+}
+
+DEBUG_GET_ONCE_BOOL_OPTION(thread, "RADEON_THREAD", TRUE)
+
+static void radeon_drm_cs_emit(struct r300_winsys_cs *rcs)
+{
+    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
+    struct radeon_cs_context *tmp;
+
+    radeon_drm_cs_sync_flush(rcs);
+
+    /* If the CS is not empty, emit it in a newly-spawned thread. */
+    if (cs->base.cdw) {
+        unsigned i, crelocs = cs->csc->crelocs;
+
+        cs->csc->chunks[0].length_dw = cs->base.cdw;
+
+        for (i = 0; i < crelocs; i++)
+            p_atomic_inc(&cs->csc->relocs_bo[i]->num_active_ioctls);
+
+        if (debug_get_option_thread()) {
+            cs->thread = pipe_thread_create(radeon_drm_cs_emit_async, cs->csc);
+            assert(cs->thread);
+        } else {
+            radeon_drm_cs_emit_async(cs->csc);
+        }
+    }
+
+    /* Flip command streams. */
+    tmp = cs->csc;
+    cs->csc = cs->cst;
+    cs->cst = tmp;
+
+    /* Prepare a new CS. */
+    radeon_cs_context_cleanup(cs->csc);
 
+    cs->base.buf = cs->csc->buf;
     cs->base.cdw = 0;
-    cs->crelocs = 0;
-    cs->chunks[0].length_dw = 0;
-    cs->chunks[1].length_dw = 0;
-    cs->used_gart = 0;
-    cs->used_vram = 0;
-    memset(cs->is_handle_added, 0, sizeof(cs->is_handle_added));
 }
 
 static void radeon_drm_cs_destroy(struct r300_winsys_cs *rcs)
 {
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
+    radeon_drm_cs_sync_flush(rcs);
+    radeon_cs_context_cleanup(&cs->csc1);
+    radeon_cs_context_cleanup(&cs->csc2);
     p_atomic_dec(&cs->ws->num_cs);
-    FREE(cs->relocs_bo);
-    FREE(cs->relocs);
+    radeon_destroy_cs_context(&cs->csc1);
+    radeon_destroy_cs_context(&cs->csc2);
     FREE(cs);
 }
 
@@ -364,6 +441,7 @@ void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
     ws->base.cs_validate = radeon_drm_cs_validate;
     ws->base.cs_write_reloc = radeon_drm_cs_write_reloc;
     ws->base.cs_flush = radeon_drm_cs_emit;
+    ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
     ws->base.cs_set_flush = radeon_drm_cs_set_flush;
     ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
 }
author	Marek Olšák <maraeo@gmail.com>	2011-02-15 05:43:44 +0100
committer	Marek Olšák <maraeo@gmail.com>	2011-02-15 09:17:39 +0100
commit	b9e2cde6006b557a3a23a82384899f4d5a5ac7b8 (patch)
tree	a71d951e991ee3161fa1c2acc15ec58a79056caa /src/gallium/winsys/radeon/drm/radeon_drm_cs.c
parent	8e0437914bb786d0b05be8f95e4ff37bf5a19f44 (diff)