summaryrefslogtreecommitdiff
path: root/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
diff options
context:
space:
mode:
authorMarek Olšák <maraeo@gmail.com>2011-02-15 05:43:44 +0100
committerMarek Olšák <maraeo@gmail.com>2011-02-15 09:17:39 +0100
commitb9e2cde6006b557a3a23a82384899f4d5a5ac7b8 (patch)
treea71d951e991ee3161fa1c2acc15ec58a79056caa /src/gallium/winsys/radeon/drm/radeon_drm_cs.c
parent8e0437914bb786d0b05be8f95e4ff37bf5a19f44 (diff)
r300g: offload the CS ioctl to another thread
This is a multi-threading optimization which hides the kernel overhead behind a thread. It improves performance in CPU-limited apps by 2-15%. Of course you must have at least 2 cores for it to make any difference. It can be disabled with: export RADEON_THREAD=0
Diffstat (limited to 'src/gallium/winsys/radeon/drm/radeon_drm_cs.c')
-rw-r--r--src/gallium/winsys/radeon/drm/radeon_drm_cs.c266
1 files changed, 172 insertions, 94 deletions
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index 5b2a17c856..b4f5c9f6a8 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -73,6 +73,63 @@
#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))
+static boolean radeon_init_cs_context(struct radeon_cs_context *csc, int fd)
+{
+ csc->fd = fd;
+ csc->nrelocs = 512;
+ csc->relocs_bo = (struct radeon_bo**)
+ CALLOC(1, csc->nrelocs * sizeof(struct radeon_bo*));
+ if (!csc->relocs_bo) {
+ return FALSE;
+ }
+
+ csc->relocs = (struct drm_radeon_cs_reloc*)
+ CALLOC(1, csc->nrelocs * sizeof(struct drm_radeon_cs_reloc));
+ if (!csc->relocs) {
+ FREE(csc->relocs_bo);
+ return FALSE;
+ }
+
+ csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
+ csc->chunks[0].length_dw = 0;
+ csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
+ csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
+ csc->chunks[1].length_dw = 0;
+ csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
+
+ csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
+ csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
+
+ csc->cs.num_chunks = 2;
+ csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;
+ return TRUE;
+}
+
+static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
+{
+ unsigned i;
+
+ for (i = 0; i < csc->crelocs; i++) {
+ radeon_bo_unref(csc->relocs_bo[i]);
+ p_atomic_dec(&csc->relocs_bo[i]->num_cs_references);
+ csc->relocs_bo[i] = NULL;
+ }
+
+ csc->crelocs = 0;
+ csc->chunks[0].length_dw = 0;
+ csc->chunks[1].length_dw = 0;
+ csc->used_gart = 0;
+ csc->used_vram = 0;
+ memset(csc->is_handle_added, 0, sizeof(csc->is_handle_added));
+}
+
+static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
+{
+ radeon_cs_context_cleanup(csc);
+ FREE(csc->relocs_bo);
+ FREE(csc->relocs);
+}
+
static struct r300_winsys_cs *radeon_drm_cs_create(struct r300_winsys_screen *rws)
{
struct radeon_drm_winsys *ws = radeon_drm_winsys(rws);
@@ -84,35 +141,29 @@ static struct r300_winsys_cs *radeon_drm_cs_create(struct r300_winsys_screen *rw
}
cs->ws = ws;
- cs->nrelocs = 256;
- cs->relocs_bo = (struct radeon_bo**)
- CALLOC(1, cs->nrelocs * sizeof(struct radeon_bo*));
- if (!cs->relocs_bo) {
+
+ if (!radeon_init_cs_context(&cs->csc1, cs->ws->fd)) {
FREE(cs);
return NULL;
}
-
- cs->relocs = (struct drm_radeon_cs_reloc*)
- CALLOC(1, cs->nrelocs * sizeof(struct drm_radeon_cs_reloc));
- if (!cs->relocs) {
- FREE(cs->relocs_bo);
+ if (!radeon_init_cs_context(&cs->csc2, cs->ws->fd)) {
+ radeon_destroy_cs_context(&cs->csc1);
FREE(cs);
return NULL;
}
- cs->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
- cs->chunks[0].length_dw = 0;
- cs->chunks[0].chunk_data = (uint64_t)(uintptr_t)cs->base.buf;
- cs->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
- cs->chunks[1].length_dw = 0;
- cs->chunks[1].chunk_data = (uint64_t)(uintptr_t)cs->relocs;
+ /* Set the first command buffer as current. */
+ cs->csc = &cs->csc1;
+ cs->cst = &cs->csc2;
+ cs->base.buf = cs->csc->buf;
+
p_atomic_inc(&ws->num_cs);
return &cs->base;
}
#define OUT_CS(cs, value) (cs)->buf[(cs)->cdw++] = (value)
-static inline void update_domains(struct drm_radeon_cs_reloc *reloc,
+static INLINE void update_domains(struct drm_radeon_cs_reloc *reloc,
enum r300_buffer_domain rd,
enum r300_buffer_domain wd,
enum r300_buffer_domain *added_domains)
@@ -131,22 +182,22 @@ static inline void update_domains(struct drm_radeon_cs_reloc *reloc,
}
}
-int radeon_get_reloc(struct radeon_drm_cs *cs, struct radeon_bo *bo)
+int radeon_get_reloc(struct radeon_cs_context *csc, struct radeon_bo *bo)
{
struct drm_radeon_cs_reloc *reloc;
unsigned i;
- unsigned hash = bo->handle & (sizeof(cs->is_handle_added)-1);
+ unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);
- if (cs->is_handle_added[hash]) {
- reloc = cs->relocs_hashlist[hash];
+ if (csc->is_handle_added[hash]) {
+ reloc = csc->relocs_hashlist[hash];
if (reloc->handle == bo->handle) {
- return cs->reloc_indices_hashlist[hash];
+ return csc->reloc_indices_hashlist[hash];
}
/* Hash collision, look for the BO in the list of relocs linearly. */
- for (i = cs->crelocs; i != 0;) {
+ for (i = csc->crelocs; i != 0;) {
--i;
- reloc = &cs->relocs[i];
+ reloc = &csc->relocs[i];
if (reloc->handle == bo->handle) {
/* Put this reloc in the hash list.
* This will prevent additional hash collisions if there are
@@ -157,8 +208,8 @@ int radeon_get_reloc(struct radeon_drm_cs *cs, struct radeon_bo *bo)
* AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
* will collide here: ^ and here: ^,
* meaning that we should get very few collisions in the end. */
- cs->relocs_hashlist[hash] = reloc;
- cs->reloc_indices_hashlist[hash] = i;
+ csc->relocs_hashlist[hash] = reloc;
+ csc->reloc_indices_hashlist[hash] = i;
/*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
return i;
}
@@ -168,7 +219,7 @@ int radeon_get_reloc(struct radeon_drm_cs *cs, struct radeon_bo *bo)
return -1;
}
-static void radeon_add_reloc(struct radeon_drm_cs *cs,
+static void radeon_add_reloc(struct radeon_cs_context *csc,
struct radeon_bo *bo,
enum r300_buffer_domain rd,
enum r300_buffer_domain wd,
@@ -176,24 +227,24 @@ static void radeon_add_reloc(struct radeon_drm_cs *cs,
{
struct drm_radeon_cs_reloc *reloc;
unsigned i;
- unsigned hash = bo->handle & (sizeof(cs->is_handle_added)-1);
+ unsigned hash = bo->handle & (sizeof(csc->is_handle_added)-1);
- if (cs->is_handle_added[hash]) {
- reloc = cs->relocs_hashlist[hash];
+ if (csc->is_handle_added[hash]) {
+ reloc = csc->relocs_hashlist[hash];
if (reloc->handle == bo->handle) {
update_domains(reloc, rd, wd, added_domains);
return;
}
/* Hash collision, look for the BO in the list of relocs linearly. */
- for (i = cs->crelocs; i != 0;) {
+ for (i = csc->crelocs; i != 0;) {
--i;
- reloc = &cs->relocs[i];
+ reloc = &csc->relocs[i];
if (reloc->handle == bo->handle) {
update_domains(reloc, rd, wd, added_domains);
- cs->relocs_hashlist[hash] = reloc;
- cs->reloc_indices_hashlist[hash] = i;
+ csc->relocs_hashlist[hash] = reloc;
+ csc->reloc_indices_hashlist[hash] = i;
/*printf("write_reloc collision, hash: %i, handle: %i\n", hash, bo->handle);*/
return;
}
@@ -201,35 +252,35 @@ static void radeon_add_reloc(struct radeon_drm_cs *cs,
}
/* New relocation, check if the backing array is large enough. */
- if (cs->crelocs >= cs->nrelocs) {
+ if (csc->crelocs >= csc->nrelocs) {
uint32_t size;
- cs->nrelocs += 10;
+ csc->nrelocs += 10;
- size = cs->nrelocs * sizeof(struct radeon_bo*);
- cs->relocs_bo = (struct radeon_bo**)realloc(cs->relocs_bo, size);
+ size = csc->nrelocs * sizeof(struct radeon_bo*);
+ csc->relocs_bo = (struct radeon_bo**)realloc(csc->relocs_bo, size);
- size = cs->nrelocs * sizeof(struct drm_radeon_cs_reloc);
- cs->relocs = (struct drm_radeon_cs_reloc*)realloc(cs->relocs, size);
+ size = csc->nrelocs * sizeof(struct drm_radeon_cs_reloc);
+ csc->relocs = (struct drm_radeon_cs_reloc*)realloc(csc->relocs, size);
- cs->chunks[1].chunk_data = (uint64_t)(uintptr_t)cs->relocs;
+ csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
}
/* Initialize the new relocation. */
radeon_bo_ref(bo);
p_atomic_inc(&bo->num_cs_references);
- cs->relocs_bo[cs->crelocs] = bo;
- reloc = &cs->relocs[cs->crelocs];
+ csc->relocs_bo[csc->crelocs] = bo;
+ reloc = &csc->relocs[csc->crelocs];
reloc->handle = bo->handle;
reloc->read_domains = rd;
reloc->write_domain = wd;
reloc->flags = 0;
- cs->is_handle_added[hash] = TRUE;
- cs->relocs_hashlist[hash] = reloc;
- cs->reloc_indices_hashlist[hash] = cs->crelocs;
+ csc->is_handle_added[hash] = TRUE;
+ csc->relocs_hashlist[hash] = reloc;
+ csc->reloc_indices_hashlist[hash] = csc->crelocs;
- cs->chunks[1].length_dw += RELOC_DWORDS;
- cs->crelocs++;
+ csc->chunks[1].length_dw += RELOC_DWORDS;
+ csc->crelocs++;
*added_domains = rd | wd;
}
@@ -243,23 +294,23 @@ static void radeon_drm_cs_add_reloc(struct r300_winsys_cs *rcs,
struct radeon_bo *bo = (struct radeon_bo*)buf;
enum r300_buffer_domain added_domains;
- radeon_add_reloc(cs, bo, rd, wd, &added_domains);
+ radeon_add_reloc(cs->csc, bo, rd, wd, &added_domains);
if (!added_domains)
return;
if (added_domains & R300_DOMAIN_GTT)
- cs->used_gart += bo->size;
+ cs->csc->used_gart += bo->size;
if (added_domains & R300_DOMAIN_VRAM)
- cs->used_vram += bo->size;
+ cs->csc->used_vram += bo->size;
}
static boolean radeon_drm_cs_validate(struct r300_winsys_cs *rcs)
{
struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
- return cs->used_gart < cs->ws->gart_size * 0.8 &&
- cs->used_vram < cs->ws->vram_size * 0.8;
+ return cs->csc->used_gart < cs->ws->gart_size * 0.8 &&
+ cs->csc->used_vram < cs->ws->vram_size * 0.8;
}
static void radeon_drm_cs_write_reloc(struct r300_winsys_cs *rcs,
@@ -268,7 +319,7 @@ static void radeon_drm_cs_write_reloc(struct r300_winsys_cs *rcs,
struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
struct radeon_bo *bo = (struct radeon_bo*)buf;
- unsigned index = radeon_get_reloc(cs, bo);
+ unsigned index = radeon_get_reloc(cs->csc, bo);
if (index == -1) {
fprintf(stderr, "r300: Cannot get a relocation in %s.\n", __func__);
@@ -279,63 +330,89 @@ static void radeon_drm_cs_write_reloc(struct r300_winsys_cs *rcs,
OUT_CS(&cs->base, index * RELOC_DWORDS);
}
-static void radeon_drm_cs_emit(struct r300_winsys_cs *rcs)
+static PIPE_THREAD_ROUTINE(radeon_drm_cs_emit_async, param)
{
- struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
- uint64_t chunk_array[2];
+ struct radeon_cs_context *csc = (struct radeon_cs_context*)param;
unsigned i;
- int r;
- if (cs->base.cdw) {
- /* Prepare the arguments. */
- cs->chunks[0].length_dw = cs->base.cdw;
-
- chunk_array[0] = (uint64_t)(uintptr_t)&cs->chunks[0];
- chunk_array[1] = (uint64_t)(uintptr_t)&cs->chunks[1];
-
- cs->cs.num_chunks = 2;
- cs->cs.chunks = (uint64_t)(uintptr_t)chunk_array;
-
- /* Emit. */
- r = drmCommandWriteRead(cs->ws->fd, DRM_RADEON_CS,
- &cs->cs, sizeof(struct drm_radeon_cs));
- if (r) {
- if (debug_get_bool_option("RADEON_DUMP_CS", FALSE)) {
- fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
- fprintf(stderr, "VENDORID:DEVICEID 0x%04X:0x%04X\n", 0x1002,
- cs->ws->pci_id);
- for (i = 0; i < cs->base.cdw; i++) {
- fprintf(stderr, "0x%08X\n", cs->base.buf[i]);
- }
- } else {
- fprintf(stderr, "radeon: The kernel rejected CS, "
- "see dmesg for more information.\n");
+ if (drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
+ &csc->cs, sizeof(struct drm_radeon_cs))) {
+ if (debug_get_bool_option("RADEON_DUMP_CS", FALSE)) {
+ unsigned i;
+
+ fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
+ for (i = 0; i < csc->chunks[0].length_dw; i++) {
+ fprintf(stderr, "0x%08X\n", csc->buf[i]);
}
+ } else {
+ fprintf(stderr, "radeon: The kernel rejected CS, "
+ "see dmesg for more information.\n");
}
}
- /* Unreference buffers, cleanup. */
- for (i = 0; i < cs->crelocs; i++) {
- radeon_bo_unref(cs->relocs_bo[i]);
- p_atomic_dec(&cs->relocs_bo[i]->num_cs_references);
- cs->relocs_bo[i] = NULL;
+ for (i = 0; i < csc->crelocs; i++)
+ p_atomic_dec(&csc->relocs_bo[i]->num_active_ioctls);
+ return NULL;
+}
+
+static void radeon_drm_cs_sync_flush(struct r300_winsys_cs *rcs)
+{
+ struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
+
+ /* Wait for any pending ioctl to complete. */
+ if (cs->thread) {
+ pipe_thread_wait(cs->thread);
+ cs->thread = 0;
}
+}
+
+DEBUG_GET_ONCE_BOOL_OPTION(thread, "RADEON_THREAD", TRUE)
+
+static void radeon_drm_cs_emit(struct r300_winsys_cs *rcs)
+{
+ struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
+ struct radeon_cs_context *tmp;
+
+ radeon_drm_cs_sync_flush(rcs);
+
+ /* If the CS is not empty, emit it in a newly-spawned thread. */
+ if (cs->base.cdw) {
+ unsigned i, crelocs = cs->csc->crelocs;
+
+ cs->csc->chunks[0].length_dw = cs->base.cdw;
+
+ for (i = 0; i < crelocs; i++)
+ p_atomic_inc(&cs->csc->relocs_bo[i]->num_active_ioctls);
+
+ if (debug_get_option_thread()) {
+ cs->thread = pipe_thread_create(radeon_drm_cs_emit_async, cs->csc);
+ assert(cs->thread);
+ } else {
+ radeon_drm_cs_emit_async(cs->csc);
+ }
+ }
+
+ /* Flip command streams. */
+ tmp = cs->csc;
+ cs->csc = cs->cst;
+ cs->cst = tmp;
+
+ /* Prepare a new CS. */
+ radeon_cs_context_cleanup(cs->csc);
+ cs->base.buf = cs->csc->buf;
cs->base.cdw = 0;
- cs->crelocs = 0;
- cs->chunks[0].length_dw = 0;
- cs->chunks[1].length_dw = 0;
- cs->used_gart = 0;
- cs->used_vram = 0;
- memset(cs->is_handle_added, 0, sizeof(cs->is_handle_added));
}
static void radeon_drm_cs_destroy(struct r300_winsys_cs *rcs)
{
struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
+ radeon_drm_cs_sync_flush(rcs);
+ radeon_cs_context_cleanup(&cs->csc1);
+ radeon_cs_context_cleanup(&cs->csc2);
p_atomic_dec(&cs->ws->num_cs);
- FREE(cs->relocs_bo);
- FREE(cs->relocs);
+ radeon_destroy_cs_context(&cs->csc1);
+ radeon_destroy_cs_context(&cs->csc2);
FREE(cs);
}
@@ -364,6 +441,7 @@ void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
ws->base.cs_validate = radeon_drm_cs_validate;
ws->base.cs_write_reloc = radeon_drm_cs_write_reloc;
ws->base.cs_flush = radeon_drm_cs_emit;
+ ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
ws->base.cs_set_flush = radeon_drm_cs_set_flush;
ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
}