From df8ab3140ce05599e1dc983ac211a30fc845d9b5 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Tue, 19 Feb 2008 11:49:48 +0900
Subject: Bring rtasm from mesa to gallium.

---
 src/gallium/auxiliary/rtasm/Makefile   |   20 +
 src/gallium/auxiliary/rtasm/SConscript |   11 +
 src/gallium/auxiliary/rtasm/execmem.c  |  133 ++++
 src/gallium/auxiliary/rtasm/execmem.h  |   45 ++
 src/gallium/auxiliary/rtasm/mm.c       |  283 ++++++++
 src/gallium/auxiliary/rtasm/mm.h       |   89 +++
 src/gallium/auxiliary/rtasm/x86sse.c   | 1195 ++++++++++++++++++++++++++++++++
 src/gallium/auxiliary/rtasm/x86sse.h   |  256 +++++++
 8 files changed, 2032 insertions(+)
 create mode 100644 src/gallium/auxiliary/rtasm/Makefile
 create mode 100644 src/gallium/auxiliary/rtasm/SConscript
 create mode 100644 src/gallium/auxiliary/rtasm/execmem.c
 create mode 100644 src/gallium/auxiliary/rtasm/execmem.h
 create mode 100644 src/gallium/auxiliary/rtasm/mm.c
 create mode 100644 src/gallium/auxiliary/rtasm/mm.h
 create mode 100644 src/gallium/auxiliary/rtasm/x86sse.c
 create mode 100644 src/gallium/auxiliary/rtasm/x86sse.h

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/Makefile b/src/gallium/auxiliary/rtasm/Makefile
new file mode 100644
index 0000000000..b3b9934e10
--- /dev/null
+++ b/src/gallium/auxiliary/rtasm/Makefile
@@ -0,0 +1,20 @@
+# Builds the rtasm auxiliary library from the C sources listed below.
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = rtasm
+
+DRIVER_SOURCES = \
+	x86sse.c \
+	mm.c \
+	execmem.c
+
+C_SOURCES = \
+	$(DRIVER_SOURCES)
+
+ASM_SOURCES = 
+
+include ../../Makefile.template
+# No recipe is attached to 'symlinks' in this directory.
+symlinks:
+
diff --git a/src/gallium/auxiliary/rtasm/SConscript b/src/gallium/auxiliary/rtasm/SConscript
new file mode 100644
index 0000000000..c5b1551786
--- /dev/null
+++ b/src/gallium/auxiliary/rtasm/SConscript
@@ -0,0 +1,11 @@
+Import('*')
+# Build rtasm as a convenience library from its three C sources.
+rtasm = env.ConvenienceLibrary(
+	target = 'rtasm',
+	source = [
+		'x86sse.c',
+		'mm.c',
+		'execmem.c',
+	])
+# Insert at index 0 so rtasm comes first in the auxiliaries list.
+auxiliaries.insert(0, rtasm)
diff --git a/src/gallium/auxiliary/rtasm/execmem.c b/src/gallium/auxiliary/rtasm/execmem.c
new file mode 100644
index 0000000000..c7c35f7ef2
--- /dev/null
+++ b/src/gallium/auxiliary/rtasm/execmem.c
@@ -0,0 +1,133 @@
+/**************************************************************************
+ *
+ * Copyright (C) 1999-2005  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * \file execmem.c
+ * Functions for allocating executable memory.
+ *
+ * \author Keith Whitwell
+ */
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_thread.h"
+
+#include "execmem.h"
+
+
+#if defined(__linux__)
+
+/*
+ * Allocate a large block of memory which can hold code then dole it out
+ * in pieces by means of the generic memory manager code.
+*/
+
+#include <unistd.h>
+#include <sys/mman.h>
+#include "mm.h"
+
+#define EXEC_HEAP_SIZE (10*1024*1024)
+
+_glthread_DECLARE_STATIC_MUTEX(exec_mutex);
+
+static struct mem_block *exec_heap = NULL;
+static unsigned char *exec_mem = NULL;
+
+
+/* Lazily create the allocator bookkeeping and the executable mapping behind it.
+ * Returns non-zero only when both exist; exec_mem stays NULL if mmap() fails. */
+static int
+init_heap(void)
+{
+   if (!exec_heap)
+      exec_heap = mmInit( 0, EXEC_HEAP_SIZE );
+   if (!exec_mem) {
+      void *map = mmap(NULL, EXEC_HEAP_SIZE, PROT_EXEC | PROT_READ | PROT_WRITE,
+                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+      if (map != MAP_FAILED)   /* mmap() signals failure with MAP_FAILED, not NULL */
+         exec_mem = (unsigned char *) map;
+   }
+   return exec_heap != NULL && exec_mem != NULL;
+}
+
+/* Allocate 'size' bytes (rounded up to a multiple of 32) of executable memory
+ * at a 32-byte aligned address.  Returns NULL, after logging, on failure. */
+void *
+_mesa_exec_malloc(size_t size)
+{
+   struct mem_block *block = NULL;
+   void *addr = NULL;
+
+   _glthread_LOCK_MUTEX(exec_mutex);
+
+   /* mmAllocMem() takes int sizes and a log2 alignment: 5 means 2^5 = 32. */
+   if (init_heap() && size > 0 && size <= EXEC_HEAP_SIZE)
+      block = mmAllocMem( exec_heap, (int)((size + 31) & ~(size_t)31), 5, 0 );
+
+   if (block)
+      addr = exec_mem + block->ofs;
+   else 
+      debug_printf("_mesa_exec_malloc failed\n");
+   _glthread_UNLOCK_MUTEX(exec_mutex);
+   return addr;
+}
+
+ 
+void 
+_mesa_exec_free(void *addr)
+{
+   struct mem_block *found = NULL;
+
+   _glthread_LOCK_MUTEX(exec_mutex);
+
+   /* Blocks are tracked by offset from exec_mem, so look addr up that way. */
+   if (exec_heap)
+      found = mmFindBlock(exec_heap, (unsigned char *)addr - exec_mem);
+   if (found)
+      mmFreeMem(found);   /* addresses with no matching block are ignored */
+   _glthread_UNLOCK_MUTEX(exec_mutex);
+}
+
+
+#else
+
+/*
+ * Fallback: plain heap memory; the parameter is size_t to match execmem.h.
+ */
+
+void *
+_mesa_exec_malloc(size_t size)
+{
+   return _mesa_malloc( size );
+}
+
+ 
+void 
+_mesa_exec_free(void *addr)
+{
+   _mesa_free(addr);   /* fallback build: the memory came from _mesa_malloc() */
+}
+
+
+#endif
diff --git a/src/gallium/auxiliary/rtasm/execmem.h b/src/gallium/auxiliary/rtasm/execmem.h
new file mode 100644
index 0000000000..9fd4569165
--- /dev/null
+++ b/src/gallium/auxiliary/rtasm/execmem.h
@@ -0,0 +1,45 @@
+/**************************************************************************
+ *
+ * Copyright (C) 1999-2005  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * \file execmem.h
+ * Functions for allocating executable memory.
+ *
+ * \author Keith Whitwell
+ */
+
+#ifndef _EXECMEM_H_
+#define _EXECMEM_H_
+
+#include "pipe/p_compiler.h"
+
+
+extern void *
+_mesa_exec_malloc( size_t size );
+
+
+extern void 
+_mesa_exec_free( void *addr );
+
+
+#endif
diff --git a/src/gallium/auxiliary/rtasm/mm.c b/src/gallium/auxiliary/rtasm/mm.c
new file mode 100644
index 0000000000..15f50491da
--- /dev/null
+++ b/src/gallium/auxiliary/rtasm/mm.c
@@ -0,0 +1,283 @@
+/*
+ * GLX Hardware Device Driver common code
+ * Copyright (C) 1999 Wittawat Yamwong
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * WITTAWAT YAMWONG, OR ANY OTHER CONTRIBUTORS BE LIABLE FOR ANY CLAIM, 
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE 
+ * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_util.h"
+#include "pipe/p_debug.h"
+
+#include "mm.h"
+
+
+void
+mmDumpMemInfo(const struct mem_block *heap)
+{
+   const struct mem_block *blk;
+
+   debug_printf("Memory heap %p:\n", (void *)heap);
+   if (!heap) {
+      debug_printf("  heap == 0\n");
+   }
+   else {
+      /* First every block on the main list; 'heap' itself is the end marker. */
+      blk = heap->next;
+      while (blk != heap) {
+	 debug_printf("  Offset:%08x, Size:%08x, %c%c\n",blk->ofs,blk->size,
+		 blk->free ? 'F':'.',
+		 blk->reserved ? 'R':'.');
+	 blk = blk->next;
+      }
+      debug_printf("\nFree list:\n");
+      /* Then only the blocks currently linked on the free list. */
+      for (blk = heap->next_free; blk != heap; blk = blk->next_free)
+	 debug_printf(" FREE Offset:%08x, Size:%08x, %c%c\n",blk->ofs,blk->size,
+		 blk->free ? 'F':'.', blk->reserved ? 'R':'.');
+   }
+   debug_printf("End of memory blocks\n");
+}
+
+struct mem_block *
+mmInit(int ofs, int size)
+{
+   struct mem_block *sentinel;
+   struct mem_block *span;
+
+   /* A heap has to cover at least one unit. */
+   if (size <= 0)
+      return NULL;
+
+   /* The sentinel is the handle handed back to the caller. */
+   sentinel = CALLOC_STRUCT(mem_block);
+   if (!sentinel)
+      return NULL;
+
+   /* A single free block initially spans the whole managed range. */
+   span = CALLOC_STRUCT(mem_block);
+   if (!span) {
+      FREE(sentinel);
+      return NULL;
+   }
+
+   span->ofs = ofs;
+   span->size = size;
+   span->free = 1;
+   span->heap = sentinel;
+
+   /* Both circular lists (all blocks, free blocks) contain just this pair. */
+   span->next = span->prev = sentinel;
+   span->next_free = span->prev_free = sentinel;
+   sentinel->next = sentinel->prev = span;
+   sentinel->next_free = sentinel->prev_free = span;
+
+   return sentinel;
+}
+
+
+static struct mem_block *
+SliceBlock(struct mem_block *p, 
+           int startofs, int size, 
+           int reserved, int alignment)
+{  /* Carve [startofs, startofs+size) out of free block p; 'alignment' is unused here. */
+   struct mem_block *newblock;
+
+   /* break left  [p, newblock, p->next], then p = newblock */
+   if (startofs > p->ofs) {
+      newblock = CALLOC_STRUCT(mem_block);
+      if (!newblock)
+	 return NULL;
+      newblock->ofs = startofs;
+      newblock->size = p->size - (startofs - p->ofs);
+      newblock->free = 1;
+      newblock->heap = p->heap;
+      /* link newblock after p on the list of all blocks */
+      newblock->next = p->next;
+      newblock->prev = p;
+      p->next->prev = newblock;
+      p->next = newblock;
+      /* link newblock after p on the free list */
+      newblock->next_free = p->next_free;
+      newblock->prev_free = p;
+      p->next_free->prev_free = newblock;
+      p->next_free = newblock;
+      /* p keeps only the part in front of startofs and stays free */
+      p->size -= newblock->size;
+      p = newblock;
+   }
+
+   /* break right, also [p, newblock, p->next] */
+   if (size < p->size) {
+      newblock = CALLOC_STRUCT(mem_block);
+      if (!newblock)
+	 return NULL;   /* a left split done above is left in place */
+      newblock->ofs = startofs + size;
+      newblock->size = p->size - size;
+      newblock->free = 1;
+      newblock->heap = p->heap;
+      /* link the remainder after p on the list of all blocks */
+      newblock->next = p->next;
+      newblock->prev = p;
+      p->next->prev = newblock;
+      p->next = newblock;
+      /* link the remainder after p on the free list */
+      newblock->next_free = p->next_free;
+      newblock->prev_free = p;
+      p->next_free->prev_free = newblock;
+      p->next_free = newblock;
+      /* p shrinks to exactly the requested size */
+      p->size = size;
+   }
+
+   /* p = middle block */
+   p->free = 0;
+
+   /* Remove p from the free list: 
+    */
+   p->next_free->prev_free = p->prev_free;
+   p->prev_free->next_free = p->next_free;
+
+   p->next_free = 0;
+   p->prev_free = 0;
+
+   p->reserved = reserved;
+   return p;
+}
+
+
+/* First-fit search of the free list for 'size' units aligned to 2^align2 and
+ * starting no lower than startSearch.  Returns the carved block, or NULL. */
+struct mem_block *
+mmAllocMem(struct mem_block *heap, int size, int align2, int startSearch)
+{
+   struct mem_block *p;
+   int mask;
+   int startofs = 0;
+   int endofs;
+
+   if (!heap || align2 < 0 || size <= 0)
+      return NULL;
+   /* Shift only after align2 is known to be non-negative (a negative count is
+    * undefined).  NOTE(review): counts >= the bit width of int still are. */
+   mask = (1 << align2) - 1;
+
+   for (p = heap->next_free; p != heap; p = p->next_free) {
+      assert(p->free);
+      startofs = (p->ofs + mask) & ~mask;
+      if ( startofs < startSearch ) {
+	 startofs = startSearch;
+      }
+      endofs = startofs+size;
+      if (endofs <= (p->ofs+p->size))
+	 break;
+   }
+   if (p == heap) 
+      return NULL;
+   assert(p->free);
+   return SliceBlock(p,startofs,size,0,mask+1);
+}
+
+
+/* Return the block whose start offset equals 'start', or NULL if none (or no heap). */
+struct mem_block *
+mmFindBlock(struct mem_block *heap, int start)
+{
+   struct mem_block *p;
+   if (!heap)
+      return NULL;
+   for (p = heap->next; p != heap; p = p->next)
+      if (p->ofs == start) 
+	 return p;
+   return NULL;
+}
+
+
+static INLINE int
+Join2Blocks(struct mem_block *p)
+{
+   /* XXX there should be some assertions here */
+   /* Merge p with its successor when both are free; returns 1 if merged, else 0. */
+   /* NOTE: heap->free == 0 */
+
+   if (p->free && p->next->free) {
+      struct mem_block *q = p->next;
+
+      assert(p->ofs + p->size == q->ofs);
+      p->size += q->size;
+      /* unlink q from the list of all blocks */
+      p->next = q->next;
+      q->next->prev = p;
+      /* unlink q from the free list */
+      q->next_free->prev_free = q->prev_free; 
+      q->prev_free->next_free = q->next_free;
+     
+      FREE(q);
+      return 1;
+   }
+   return 0;
+}
+
+int
+mmFreeMem(struct mem_block *b)
+{
+   if (!b)
+      return 0;
+   /* Only an allocated, non-reserved block may be freed. */
+   if (b->free) {
+      debug_printf("block already free\n");
+      return -1;
+   }
+   if (b->reserved) {
+      debug_printf("block is reserved\n");
+      return -1;
+   }
+   /* Mark it free and push it at the head of the heap's free list. */
+   b->free = 1;
+   b->next_free = b->heap->next_free;
+   b->prev_free = b->heap;
+   b->next_free->prev_free = b;
+   b->prev_free->next_free = b;
+   /* Coalesce with the following block, then let the preceding one absorb b. */
+   Join2Blocks(b);
+   if (b->prev != b->heap)
+      Join2Blocks(b->prev);   /* if this merges, b itself has been released */
+
+   return 0;
+}
+
+
+void
+mmDestroy(struct mem_block *heap)
+{
+   struct mem_block *cur, *after;
+
+   if (!heap)
+      return;
+   /* Release every block first; read each link before the node is freed. */
+   cur = heap->next;
+   while (cur != heap) {
+      after = cur->next;
+      FREE(cur);
+      cur = after;
+   }
+   FREE(heap);   /* the sentinel goes last: it terminates the walk above */
+}
diff --git a/src/gallium/auxiliary/rtasm/mm.h b/src/gallium/auxiliary/rtasm/mm.h
new file mode 100644
index 0000000000..f469b18d3e
--- /dev/null
+++ b/src/gallium/auxiliary/rtasm/mm.h
@@ -0,0 +1,89 @@
+/*
+ * GLX Hardware Device Driver common code
+ * Copyright (C) 1999 Wittawat Yamwong
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * KEITH WHITWELL, OR ANY OTHER CONTRIBUTORS BE LIABLE FOR ANY CLAIM, 
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE 
+ * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+/**
+ * Memory manager code.  Primarily used by device drivers to manage texture
+ * heaps, etc.
+ */
+
+
+#ifndef MM_H
+#define MM_H
+
+
+struct mem_block {
+   struct mem_block *next, *prev;            /* circular list of all blocks in the heap */
+   struct mem_block *next_free, *prev_free;  /* circular list of free blocks only */
+   struct mem_block *heap;                   /* sentinel node this block belongs to */
+   int ofs,size;                             /* start offset and length of the range */
+   unsigned int free:1;                      /* set while the block is unallocated */
+   unsigned int reserved:1;                  /* when set, mmFreeMem() refuses to free it */
+};
+
+
+
+/** 
+ * input: ofs = offset of the first managed byte, size = total size in bytes
+ * return: a heap pointer if OK, NULL if error
+ */
+extern struct mem_block *mmInit(int ofs, int size);
+
+/**
+ * Allocate 'size' bytes with 2^align2 bytes alignment,
+ * restricting the search to free memory after 'startSearch'
+ * (e.g. so depth and back buffers can be placed in different 4MB banks
+ * to get better page hits where possible).
+ * input:	size = size of block
+ *       	align2 = 2^align2 bytes alignment
+ *		startSearch = linear offset from start of heap to begin search
+ * return: pointer to the allocated block, 0 if error
+ */
+extern struct mem_block *mmAllocMem(struct mem_block *heap, int size, int align2, 
+                            int startSearch);
+
+/**
+ * Free a block previously returned by mmAllocMem
+ * input: pointer to a block
+ * return: 0 if OK, -1 if error
+ */
+extern int mmFreeMem(struct mem_block *b);
+
+/**
+ * Find the block that starts at the given offset
+ * input: pointer to a heap, start offset
+ * return: pointer to a block
+ */
+extern struct mem_block *mmFindBlock(struct mem_block *heap, int start);
+
+/**
+ * destroy MM
+ */
+extern void mmDestroy(struct mem_block *mmInit);
+
+/**
+ * For debugging purposes.
+ */
+extern void mmDumpMemInfo(const struct mem_block *mmInit);
+
+#endif
diff --git a/src/gallium/auxiliary/rtasm/x86sse.c b/src/gallium/auxiliary/rtasm/x86sse.c
new file mode 100644
index 0000000000..fff6f77a6b
--- /dev/null
+++ b/src/gallium/auxiliary/rtasm/x86sse.c
@@ -0,0 +1,1195 @@
+#if defined(__i386__) || defined(__386__)
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_debug.h"
+
+#include "x86sse.h"
+
+#define DISASSEM 0
+#define X86_TWOB 0x0f
+
+static unsigned char *cptr( void (*label)() )
+{  /* View a function pointer as a byte address (the cast goes through unsigned long). */
+   return (unsigned char *)(unsigned long)label;
+}
+
+
+static void do_realloc( struct x86_function *p )
+{  /* Allocate the code buffer on first use, otherwise double it; csr is rebased. */
+   if (p->size == 0) {
+      p->size = 1024;                          /* initial buffer size in bytes */
+      p->store = _mesa_exec_malloc(p->size);
+      p->csr = p->store;
+   }
+   else {
+      unsigned used = p->csr - p->store;       /* bytes emitted so far */
+      unsigned char *tmp = p->store;
+      p->size *= 2;
+      p->store = _mesa_exec_malloc(p->size);   /* NOTE(review): result is not checked for NULL */
+      memcpy(p->store, tmp, used);
+      p->csr = p->store + used;
+      _mesa_exec_free(tmp);                    /* pointers into the old buffer are now stale */
+   }
+}
+
+/* Claim 'bytes' bytes at the write cursor, growing the buffer once if the
+ * request would run past p->size, and return where the caller should write. */
+static unsigned char *reserve( struct x86_function *p, int bytes )
+{
+   unsigned char *start;
+
+   if (p->csr + bytes - p->store > p->size)
+      do_realloc(p);
+   /* Read csr only now: do_realloc() may have moved the buffer. */
+   start = p->csr;
+   p->csr += bytes;
+   return start;
+}
+
+
+
+static void emit_1b( struct x86_function *p, char b0 )
+{  /* Append one signed byte, e.g. an 8-bit displacement or short jump offset. */
+   char *csr = (char *)reserve(p, 1);
+   *csr = b0;
+}
+
+static void emit_1i( struct x86_function *p, int i0 )
+{  /* Append i0 as sizeof(int) raw bytes; the cursor may be unaligned, so copy bytewise. */
+   unsigned char *icsr = reserve(p, sizeof(i0));
+   memcpy(icsr, &i0, sizeof(i0));
+}
+
+static void emit_1ub( struct x86_function *p, unsigned char b0 )
+{  /* Append a single unsigned byte to the instruction stream. */
+   unsigned char *out = reserve(p, 1);
+   out[0] = b0;
+}
+
+static void emit_2ub( struct x86_function *p, unsigned char b0, unsigned char b1 )
+{  /* Append two unsigned bytes, b0 first, in a single reservation. */
+   unsigned char *out = reserve(p, 2);
+   out[0] = b0;
+   out[1] = b1;
+}
+
+static void emit_3ub( struct x86_function *p, unsigned char b0, unsigned char b1, unsigned char b2 )
+{  /* Append three unsigned bytes, b0 first, in a single reservation. */
+   unsigned char *out = reserve(p, 3);
+   out[0] = b0;
+   out[1] = b1;
+   out[2] = b2;
+}
+
+
+/* Build a modRM byte + possible displacement.  No treatment of SIB
+ * indexing beyond a plain ESP base.  No way to encode an absolute address.
+ */
+static void emit_modrm( struct x86_function *p, 
+			struct x86_reg reg, 
+			struct x86_reg regmem )
+{
+   unsigned char val = 0;
+   
+   assert(reg.mod == mod_REG);
+   
+   val |= regmem.mod << 6;     	/* mod field */
+   val |= reg.idx << 3;		/* reg field */
+   val |= regmem.idx;		/* r/m field */
+   
+   emit_1ub(p, val);
+
+   /* r/m == ESP selects a SIB byte, but only for memory operands; with
+    * mod_REG it just names the register and no SIB byte may follow. */
+   if (regmem.file == file_REG32 &&
+       regmem.idx == reg_SP && regmem.mod != mod_REG) {
+      emit_1ub(p, 0x24);		/* SIB: base = ESP, no index (simplistic!) */
+   }
+
+   switch (regmem.mod) {
+   case mod_REG:
+   case mod_INDIRECT:
+      break;
+   case mod_DISP8:
+      emit_1b(p, regmem.disp);
+      break;
+   case mod_DISP32:
+      emit_1i(p, regmem.disp);
+      break;
+   default:
+      assert(0);
+      break;
+   }
+}
+
+
+static void emit_modrm_noreg( struct x86_function *p,
+			      unsigned op,
+			      struct x86_reg regmem )
+{  /* ModRM whose reg field carries the value 'op' rather than a real register operand. */
+   struct x86_reg dummy = x86_make_reg(file_REG32, op);
+   emit_modrm(p, dummy, regmem);
+}
+
+/* Many x86 instructions come in two opcode flavours, one for a register
+ * destination and one for a memory destination.  Pick the flavour that
+ * matches dst.mod and emit opcode + ModRM with the operands in the
+ * order that flavour needs.
+ */
+static void emit_op_modrm( struct x86_function *p,
+			   unsigned char op_dst_is_reg, 
+			   unsigned char op_dst_is_mem,
+			   struct x86_reg dst,
+			   struct x86_reg src )
+{  
+   if (dst.mod == mod_REG) {
+      emit_1ub(p, op_dst_is_reg);
+      emit_modrm(p, dst, src);
+   }
+   else if (dst.mod == mod_INDIRECT ||
+            dst.mod == mod_DISP32 ||
+            dst.mod == mod_DISP8) {
+      /* Memory destination: the register operand has to be src. */
+      assert(src.mod == mod_REG);
+      emit_1ub(p, op_dst_is_mem);
+      emit_modrm(p, src, dst);
+   }
+   else {
+      /* Any other addressing mode value is unexpected. */
+      assert(0);
+   }
+}
+
+
+
+
+
+
+
+/* Create and manipulate registers and regmem values:
+ */
+struct x86_reg x86_make_reg( enum x86_reg_file file,
+			     enum x86_reg_name idx )
+{
+   struct x86_reg direct;
+
+   /* A direct register operand: register addressing, no displacement. */
+   direct.mod = mod_REG;
+   direct.disp = 0;
+   direct.idx = idx;
+   direct.file = file;
+   return direct;
+}
+
+struct x86_reg x86_make_disp( struct x86_reg reg,
+			      int disp )
+{  /* Turn 'reg' into a memory operand [reg + disp], or add disp to an existing one. */
+   assert(reg.file == file_REG32);
+   if (reg.mod == mod_REG)
+      reg.disp = disp;
+   else
+      reg.disp += disp;
+
+   /* mod=00 with r/m=101b (EBP, index 5) means "disp32, no base", so a
+    * zero displacement off EBP must use the disp8 form instead. */
+   if (reg.disp == 0 && reg.idx != 5)
+      reg.mod = mod_INDIRECT;
+   else if (reg.disp <= 127 && reg.disp >= -128)
+      reg.mod = mod_DISP8;
+   else
+      reg.mod = mod_DISP32;
+   return reg;
+}
+
+struct x86_reg x86_deref( struct x86_reg reg )
+{  /* Memory operand addressed by 'reg' with no added displacement. */
+   return x86_make_disp(reg, 0);
+}
+
+struct x86_reg x86_get_base_reg( struct x86_reg reg )
+{  /* Rebuild the plain register operand (mod_REG, disp 0) from reg's file and index. */
+   return x86_make_reg( reg.file, reg.idx );
+}
+
+unsigned char *x86_get_label( struct x86_function *p )
+{  /* The current write cursor, i.e. where the next emitted byte will go. */
+   return p->csr;
+}
+
+
+
+/***********************************************************************
+ * x86 instructions
+ */
+
+
+void x86_jcc( struct x86_function *p,
+	      enum x86_cc cc,
+	      unsigned char *label )
+{
+   int offset = label - (x86_get_label(p) + 2);   /* measured from the end of the 2-byte short form */
+   
+   if (offset <= 127 && offset >= -128) {
+      emit_1ub(p, 0x70 + cc);                     /* short form: opcode 0x70+cc, then rel8 */
+      emit_1b(p, (char) offset);
+   }
+   else {
+      offset = label - (x86_get_label(p) + 6);    /* the long form occupies 6 bytes */
+      emit_2ub(p, 0x0f, 0x80 + cc);               /* long form: 0F 80+cc, then rel32 */
+      emit_1i(p, offset);
+   }
+}
+
+/* Always use a 32bit offset for forward jumps: a zero rel32 placeholder is
+ * emitted and the address just past it is returned for a later fixup. */
+unsigned char *x86_jcc_forward( struct x86_function *p,
+			  enum x86_cc cc )
+{
+   emit_2ub(p, 0x0f, 0x80 + cc);
+   emit_1i(p, 0);
+   return x86_get_label(p);
+}
+
+unsigned char *x86_jmp_forward( struct x86_function *p)
+{
+   emit_1ub(p, 0xe9);          /* opcode E9 */
+   emit_1i(p, 0);              /* zero rel32 placeholder */
+   return x86_get_label(p);    /* address just past the placeholder */
+}
+
+unsigned char *x86_call_forward( struct x86_function *p)
+{
+   emit_1ub(p, 0xe8);          /* opcode E8 */
+   emit_1i(p, 0);              /* zero rel32 placeholder */
+   return x86_get_label(p);    /* address just past the placeholder */
+}
+
+/* Patch the rel32 field ending at 'fixup' so it targets the current write cursor. */
+void x86_fixup_fwd_jump( struct x86_function *p,
+			 unsigned char *fixup )
+{
+   int rel = x86_get_label(p) - fixup;    /* rel32 counts from the end of the field */
+   memcpy(fixup - 4, &rel, sizeof(rel));  /* the field may sit at an unaligned address */
+}
+
+void x86_jmp( struct x86_function *p, unsigned char *label)
+{
+   emit_1ub(p, 0xe9);                            /* opcode E9, followed by rel32 */
+   emit_1i(p, label - x86_get_label(p) - 4);     /* offset measured from the end of the 4-byte field */
+}
+
+#if 0
+/* This doesn't work once we start reallocating & copying the
+ * generated code on buffer fills, because the call is relative to the
+ * current pc.
+ */
+void x86_call( struct x86_function *p, void (*label)())
+{
+   emit_1ub(p, 0xe8);
+   emit_1i(p, cptr(label) - x86_get_label(p) - 4);
+}
+#else
+void x86_call( struct x86_function *p, struct x86_reg reg)
+{  /* Emit an indirect near call through 'reg': opcode FF with /2 in the ModRM reg field. */
+   emit_1ub(p, 0xff);
+   emit_modrm_noreg(p, 2, reg);   /* the reg field is the opcode extension, not reg.idx */
+}
+#endif
+
+
+/* michal:
+ * Temporary. As I need immediate operands, and don't want to mess with the codegen,
+ * I load the immediate into a general purpose register and use it.
+ */
+void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+   assert(dst.mod == mod_REG);
+   emit_1ub(p, 0xb8 + dst.idx);   /* one-byte opcode 0xb8 + register index */
+   emit_1i(p, imm);               /* followed by the immediate value */
+}
+
+void x86_push( struct x86_function *p,
+	       struct x86_reg reg )
+{
+   assert(reg.mod == mod_REG);
+   emit_1ub(p, 0x50 + reg.idx);   /* one-byte form: 0x50 + register index */
+   p->stack_offset += 4;          /* bookkeeping: stack offset grows by 4 per push */
+}
+
+void x86_pop( struct x86_function *p,
+	      struct x86_reg reg )
+{
+   assert(reg.mod == mod_REG);
+   emit_1ub(p, 0x58 + reg.idx);   /* one-byte form: 0x58 + register index */
+   p->stack_offset -= 4;          /* bookkeeping: undoes the adjustment made by x86_push */
+}
+
+void x86_inc( struct x86_function *p,
+	      struct x86_reg reg )
+{
+   assert(reg.mod == mod_REG);
+   emit_1ub(p, 0x40 + reg.idx);   /* one-byte form: 0x40 + register index */
+}
+
+void x86_dec( struct x86_function *p,
+	      struct x86_reg reg )
+{
+   assert(reg.mod == mod_REG);
+   emit_1ub(p, 0x48 + reg.idx);   /* one-byte form: 0x48 + register index */
+}
+
+void x86_ret( struct x86_function *p )
+{
+   emit_1ub(p, 0xc3);   /* single-byte opcode C3 */
+}
+
+void x86_sahf( struct x86_function *p )
+{
+   emit_1ub(p, 0x9e);   /* single-byte opcode 9E */
+}
+
+void x86_mov( struct x86_function *p,
+	      struct x86_reg dst,
+	      struct x86_reg src )
+{
+   emit_op_modrm( p, 0x8b, 0x89, dst, src );   /* 0x8b when dst is a register, 0x89 when dst is memory */
+}
+
+void x86_xor( struct x86_function *p,
+	      struct x86_reg dst,
+	      struct x86_reg src )
+{
+   emit_op_modrm( p, 0x33, 0x31, dst, src );   /* 0x33 when dst is a register, 0x31 when dst is memory */
+}
+
+void x86_cmp( struct x86_function *p,
+	      struct x86_reg dst,
+	      struct x86_reg src )
+{
+   emit_op_modrm( p, 0x3b, 0x39, dst, src );   /* 0x3b when dst is a register, 0x39 when dst is memory */
+}
+
+void x86_lea( struct x86_function *p,
+	      struct x86_reg dst,
+	      struct x86_reg src )
+{
+   emit_1ub(p, 0x8d);            /* opcode 8D */
+   emit_modrm( p, dst, src );    /* ModRM: reg field = dst, r/m = src */
+}
+
+void x86_test( struct x86_function *p,
+	       struct x86_reg dst,
+	       struct x86_reg src )
+{
+   emit_1ub(p, 0x85);            /* opcode 85 */
+   emit_modrm( p, dst, src );    /* ModRM: reg field = dst (must be mod_REG), r/m = src */
+}
+
+void x86_add( struct x86_function *p,
+	       struct x86_reg dst,
+	       struct x86_reg src )
+{
+   emit_op_modrm(p, 0x03, 0x01, dst, src );   /* 0x03 when dst is a register, 0x01 when dst is memory */
+}
+
+void x86_mul( struct x86_function *p,
+	       struct x86_reg src )
+{  /* Opcode F7 with reg_SP's index in the ModRM reg field, used as the opcode extension. */
+   assert (src.file == file_REG32 && src.mod == mod_REG);
+   emit_op_modrm(p, 0xf7, 0, x86_make_reg (file_REG32, reg_SP), src );   /* the 0 opcode is never selected: dst is mod_REG */
+}
+
+void x86_sub( struct x86_function *p,
+	       struct x86_reg dst,
+	       struct x86_reg src )
+{
+   emit_op_modrm(p, 0x2b, 0x29, dst, src );   /* 0x2b when dst is a register, 0x29 when dst is memory */
+}
+
+void x86_or( struct x86_function *p,
+             struct x86_reg dst,
+             struct x86_reg src )
+{
+   emit_op_modrm( p, 0x0b, 0x09, dst, src );   /* 0x0b when dst is a register, 0x09 when dst is memory */
+}
+
+void x86_and( struct x86_function *p,
+              struct x86_reg dst,
+              struct x86_reg src )
+{
+   emit_op_modrm( p, 0x23, 0x21, dst, src );   /* 0x23 when dst is a register, 0x21 when dst is memory */
+}
+
+
+
+/***********************************************************************
+ * SSE instructions
+ */
+
+
+void sse_movss( struct x86_function *p,
+		struct x86_reg dst,
+		struct x86_reg src )
+{
+   emit_2ub(p, 0xF3, X86_TWOB);                /* F3 prefix, then the 0F escape byte */
+   emit_op_modrm( p, 0x10, 0x11, dst, src );   /* 0x10 when dst is a register, 0x11 when dst is memory */
+}
+
+void sse_movaps( struct x86_function *p,
+		 struct x86_reg dst,
+		 struct x86_reg src )
+{
+   emit_1ub(p, X86_TWOB);                      /* 0F escape byte */
+   emit_op_modrm( p, 0x28, 0x29, dst, src );   /* 0x28 when dst is a register, 0x29 when dst is memory */
+}
+
+void sse_movups( struct x86_function *p,
+		 struct x86_reg dst,
+		 struct x86_reg src )
+{
+   emit_1ub(p, X86_TWOB);                      /* 0F escape byte */
+   emit_op_modrm( p, 0x10, 0x11, dst, src );   /* 0x10 when dst is a register, 0x11 when dst is memory */
+}
+
+void sse_movhps( struct x86_function *p,
+		 struct x86_reg dst,
+		 struct x86_reg src )
+{
+   assert(dst.mod != mod_REG || src.mod != mod_REG);   /* at least one operand must be memory */
+   emit_1ub(p, X86_TWOB);                              /* 0F escape byte */
+   emit_op_modrm( p, 0x16, 0x17, dst, src ); /* cf movlhps */
+}
+
+void sse_movlps( struct x86_function *p,
+		 struct x86_reg dst,
+		 struct x86_reg src )
+{
+   assert(dst.mod != mod_REG || src.mod != mod_REG);   /* at least one operand must be memory */
+   emit_1ub(p, X86_TWOB);                              /* 0F escape byte */
+   emit_op_modrm( p, 0x12, 0x13, dst, src ); /* cf movhlps */
+}
+
+void sse_maxps( struct x86_function *p,
+		struct x86_reg dst,
+		struct x86_reg src )
+{
+   emit_2ub(p, X86_TWOB, 0x5F);   /* opcode bytes 0F 5F */
+   emit_modrm( p, dst, src );     /* ModRM: reg field = dst, r/m = src */
+}
+
+void sse_maxss( struct x86_function *p,
+		struct x86_reg dst,
+		struct x86_reg src )
+{
+   emit_3ub(p, 0xF3, X86_TWOB, 0x5F);   /* opcode bytes F3 0F 5F */
+   emit_modrm( p, dst, src );           /* ModRM: reg field = dst, r/m = src */
+}
+
+void sse_divss( struct x86_function *p,
+		struct x86_reg dst,
+		struct x86_reg src )
+{
+   emit_3ub(p, 0xF3, X86_TWOB, 0x5E);   /* opcode bytes F3 0F 5E */
+   emit_modrm( p, dst, src );           /* ModRM: reg field = dst, r/m = src */
+}
+
+void sse_minps( struct x86_function *p,
+		struct x86_reg dst,
+		struct x86_reg src )
+{
+   emit_2ub(p, X86_TWOB, 0x5D);   /* opcode bytes 0F 5D */
+   emit_modrm( p, dst, src );     /* ModRM: reg field = dst, r/m = src */
+}
+
+void sse_subps( struct x86_function *p,
+		struct x86_reg dst,
+		struct x86_reg src )
+{
+   emit_2ub(p, X86_TWOB, 0x5C);   /* opcode bytes 0F 5C */
+   emit_modrm( p, dst, src );     /* ModRM: reg field = dst, r/m = src */
+}
+
+void sse_mulps( struct x86_function *p,
+		struct x86_reg dst,
+		struct x86_reg src )
+{
+   emit_2ub(p, X86_TWOB, 0x59);   /* opcode bytes 0F 59 */
+   emit_modrm( p, dst, src );     /* ModRM: reg field = dst, r/m = src */
+}
+
+void sse_mulss( struct x86_function *p,
+		struct x86_reg dst,
+		struct x86_reg src )
+{
+   emit_3ub(p, 0xF3, X86_TWOB, 0x59);   /* opcode bytes F3 0F 59 */
+   emit_modrm( p, dst, src );           /* ModRM: reg field = dst, r/m = src */
+}
+
+void sse_addps( struct x86_function *p,
+		struct x86_reg dst,
+		struct x86_reg src )
+{
+   emit_2ub(p, X86_TWOB, 0x58);   /* opcode bytes 0F 58 */
+   emit_modrm( p, dst, src );     /* ModRM: reg field = dst, r/m = src */
+}
+
+void sse_addss( struct x86_function *p,
+		struct x86_reg dst,
+		struct x86_reg src )
+{
+   emit_3ub(p, 0xF3, X86_TWOB, 0x58);   /* opcode bytes F3 0F 58 */
+   emit_modrm( p, dst, src );           /* ModRM: reg field = dst, r/m = src */
+}
+
+void sse_andnps( struct x86_function *p,
+                 struct x86_reg dst,
+                 struct x86_reg src )
+{
+   emit_2ub(p, X86_TWOB, 0x55);   /* opcode bytes 0F 55 */
+   emit_modrm( p, dst, src );     /* ModRM: reg field = dst, r/m = src */
+}
+
+void sse_andps( struct x86_function *p,
+		struct x86_reg dst,
+		struct x86_reg src )
+{
+   emit_2ub(p, X86_TWOB, 0x54);   /* opcode bytes 0F 54 */
+   emit_modrm( p, dst, src );     /* ModRM: reg field = dst, r/m = src */
+}
+
+void sse_rsqrtps( struct x86_function *p,
+                  struct x86_reg dst,
+                  struct x86_reg src )
+{
+   emit_2ub(p, X86_TWOB, 0x52);   /* opcode bytes 0F 52 */
+   emit_modrm( p, dst, src );     /* ModRM: reg field = dst, r/m = src */
+}
+
+void sse_rsqrtss( struct x86_function *p,
+		  struct x86_reg dst,
+		  struct x86_reg src )
+{
+   emit_3ub(p, 0xF3, X86_TWOB, 0x52);   /* opcode bytes F3 0F 52 */
+   emit_modrm( p, dst, src );           /* ModRM: reg field = dst, r/m = src */
+
+}
+
+void sse_movhlps( struct x86_function *p,
+		  struct x86_reg dst,
+		  struct x86_reg src )
+{
+   assert(dst.mod == mod_REG && src.mod == mod_REG);   /* register-to-register only */
+   emit_2ub(p, X86_TWOB, 0x12);                        /* opcode bytes 0F 12 */
+   emit_modrm( p, dst, src );                          /* ModRM: reg field = dst, r/m = src */
+}
+
+void sse_movlhps( struct x86_function *p,
+		  struct x86_reg dst,
+		  struct x86_reg src )
+{
+   assert(dst.mod == mod_REG && src.mod == mod_REG);   /* register-to-register only */
+   emit_2ub(p, X86_TWOB, 0x16);                        /* opcode bytes 0F 16 */
+   emit_modrm( p, dst, src );                          /* ModRM: reg field = dst, r/m = src */
+}
+
+void sse_orps( struct x86_function *p,
+               struct x86_reg dst,
+               struct x86_reg src )
+{
+   emit_2ub(p, X86_TWOB, 0x56);   /* opcode bytes 0F 56 */
+   emit_modrm( p, dst, src );     /* ModRM: reg field = dst, r/m = src */
+}
+
+void sse_xorps( struct x86_function *p,
+                struct x86_reg dst,
+                struct x86_reg src )
+{
+   emit_2ub(p, X86_TWOB, 0x57);   /* opcode bytes 0F 57 */
+   emit_modrm( p, dst, src );     /* ModRM: reg field = dst, r/m = src */
+}
+
+void sse_cvtps2pi( struct x86_function *p,
+		   struct x86_reg dst,
+		   struct x86_reg src )
+{
+   assert(dst.file == file_MMX && 
+	  (src.file == file_XMM || src.mod != mod_REG));
+   /* dst is an MMX register; presumably the flag asks for EMMS later -- confirm. */
+   p->need_emms = 1;
+
+   emit_2ub(p, X86_TWOB, 0x2d);   /* opcode bytes 0F 2D */
+   emit_modrm( p, dst, src );     /* ModRM: reg field = dst, r/m = src */
+}
+
+
+/* Shufps can also be used to implement a reduced swizzle when dest ==
+ * arg0.
+ */
+void sse_shufps( struct x86_function *p,
+		 struct x86_reg dest,
+		 struct x86_reg arg0,
+		 unsigned char shuf) 
+{
+   emit_2ub(p, X86_TWOB, 0xC6);   /* opcode bytes 0F C6 */
+   emit_modrm(p, dest, arg0);     /* ModRM: reg field = dest, r/m = arg0 */
+   emit_1ub(p, shuf);             /* trailing immediate byte */
+}
+
+void sse_cmpps( struct x86_function *p,
+		struct x86_reg dest,
+		struct x86_reg arg0,
+		unsigned char cc) 
+{
+   emit_2ub(p, X86_TWOB, 0xC2);   /* opcode bytes 0F C2 */
+   emit_modrm(p, dest, arg0);     /* ModRM: reg field = dest, r/m = arg0 */
+   emit_1ub(p, cc);               /* trailing immediate byte */
+}
+
+void sse_pmovmskb( struct x86_function *p,
+                   struct x86_reg dest,
+                   struct x86_reg src)
+{
+    emit_3ub(p, 0x66, X86_TWOB, 0xD7);   /* opcode bytes 66 0F D7 */
+    emit_modrm(p, dest, src);            /* ModRM: reg field = dest, r/m = src */
+}
+
+/***********************************************************************
+ * SSE2 instructions
+ */
+
+/**
+ * Perform a reduced swizzle:
+ */
+void sse2_pshufd( struct x86_function *p,
+		  struct x86_reg dest,
+		  struct x86_reg arg0,
+		  unsigned char shuf) 
+{
+   emit_3ub(p, 0x66, X86_TWOB, 0x70);   /* opcode bytes 66 0F 70 */
+   emit_modrm(p, dest, arg0);           /* ModRM: reg field = dest, r/m = arg0 */
+   emit_1ub(p, shuf);                   /* trailing immediate byte */
+}
+
+void sse2_cvttps2dq( struct x86_function *p,
+                     struct x86_reg dst,
+                     struct x86_reg src )
+{
+   emit_3ub( p, 0xF3, X86_TWOB, 0x5B );   /* opcode bytes F3 0F 5B */
+   emit_modrm( p, dst, src );             /* ModRM: reg field = dst, r/m = src */
+}
+
+void sse2_cvtps2dq( struct x86_function *p,
+		    struct x86_reg dst,
+		    struct x86_reg src )
+{
+   emit_3ub(p, 0x66, X86_TWOB, 0x5B);   /* opcode bytes 66 0F 5B */
+   emit_modrm( p, dst, src );           /* ModRM: reg field = dst, r/m = src */
+}
+
+void sse2_packssdw( struct x86_function *p,
+		    struct x86_reg dst,
+		    struct x86_reg src )
+{
+   emit_3ub(p, 0x66, X86_TWOB, 0x6B);	/* PACKSSDW, 0x66-prefixed form: same opcode 0x6B as mmx_packssdw() plus the prefix */
+   emit_modrm( p, dst, src );
+}
+
+void sse2_packsswb( struct x86_function *p,
+		    struct x86_reg dst,
+		    struct x86_reg src )
+{
+   emit_3ub(p, 0x66, X86_TWOB, 0x63);	/* PACKSSWB, 0x66-prefixed form: prefix, escape, opcode 0x63 */
+   emit_modrm( p, dst, src );
+}
+
+void sse2_packuswb( struct x86_function *p,
+		    struct x86_reg dst,
+		    struct x86_reg src )
+{
+   emit_3ub(p, 0x66, X86_TWOB, 0x67);	/* PACKUSWB, 0x66-prefixed form: same opcode 0x67 as mmx_packuswb() plus the prefix */
+   emit_modrm( p, dst, src );
+}
+
+void sse2_rcpps( struct x86_function *p,
+                 struct x86_reg dst,
+                 struct x86_reg src )
+{
+   emit_2ub(p, X86_TWOB, 0x53);   /* RCPPS: escape byte + opcode 0x53, no prefix (an SSE1 encoding despite the sse2_ name) */
+   emit_modrm( p, dst, src );
+}
+
+void sse2_rcpss( struct x86_function *p,
+		struct x86_reg dst,
+		struct x86_reg src )
+{
+   emit_3ub(p, 0xF3, X86_TWOB, 0x53);	/* RCPSS: scalar form of RCPPS, selected by the 0xF3 prefix */
+   emit_modrm( p, dst, src );
+}
+
+void sse2_movd( struct x86_function *p,
+		struct x86_reg dst,
+		struct x86_reg src )
+{
+   emit_2ub(p, 0x66, X86_TWOB);	/* 0x66 prefix + escape byte; the opcode itself is chosen below */
+   emit_op_modrm( p, 0x6e, 0x7e, dst, src );	/* two MOVD opcodes are supplied; presumably emit_op_modrm() picks one by operand direction - confirm at its definition */
+}
+
+
+
+
+/***********************************************************************
+ * x87 instructions
+ */
+void x87_fist( struct x86_function *p, struct x86_reg dst )
+{
+   emit_1ub(p, 0xdb);	/* FIST (store st0 as integer, no pop): opcode byte 0xDB */
+   emit_modrm_noreg(p, 2, dst);	/* 2 is the /digit opcode extension; 0xDB /2 is the 32-bit integer form per the Intel SDM */
+}
+
+void x87_fistp( struct x86_function *p, struct x86_reg dst )
+{
+   emit_1ub(p, 0xdb);	/* FISTP (store st0 as integer, then pop): opcode byte 0xDB */
+   emit_modrm_noreg(p, 3, dst);	/* /3 extension; 0xDB /3 is the 32-bit integer form per the Intel SDM */
+}
+
+void x87_fild( struct x86_function *p, struct x86_reg arg )
+{
+   emit_1ub(p, 0xdf);	/* FILD: NOTE(review) 0xDF /0 loads a 16-bit integer per the Intel SDM, whereas x87_fist/x87_fistp use the 32-bit 0xDB forms - confirm the intended width */
+   emit_modrm_noreg(p, 0, arg);	/* /0 opcode extension */
+}
+
+void x87_fldz( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xee);	/* FLDZ (D9 EE): push +0.0 onto the x87 stack */
+}
+
+
+void x87_fldcw( struct x86_function *p, struct x86_reg arg )
+{
+   assert(arg.file == file_REG32);	/* operand is addressed through a 32-bit general register ... */
+   assert(arg.mod != mod_REG);	/* ... and must be a memory reference, never the bare register */
+   emit_1ub(p, 0xd9);	/* FLDCW (load x87 control word): opcode byte 0xD9 */
+   emit_modrm_noreg(p, 5, arg);	/* /5 opcode extension */
+}
+
+void x87_fld1( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xe8);	/* FLD1 (D9 E8): push +1.0 */
+}
+
+void x87_fldl2e( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xea);	/* FLDL2E (D9 EA): push log2(e) */
+}
+
+void x87_fldln2( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xed);	/* FLDLN2 (D9 ED): push ln(2) */
+}
+
+void x87_fwait( struct x86_function *p )
+{
+   emit_1ub(p, 0x9b);	/* FWAIT/WAIT (9B): check for pending unmasked x87 exceptions */
+}
+
+void x87_fnclex( struct x86_function *p )
+{
+   emit_2ub(p, 0xdb, 0xe2);	/* FNCLEX (DB E2): clear x87 exception flags without a preceding wait */
+}
+
+void x87_fclex( struct x86_function *p )
+{
+   x87_fwait(p);	/* FCLEX is emitted as its two components: the wait byte ... */
+   x87_fnclex(p);	/* ... followed by the no-wait clear */
+}
+
+
+static void x87_arith_op( struct x86_function *p, struct x86_reg dst, struct x86_reg arg,
+			  unsigned char dst0ub0,	/* opcode byte 0 of the "st0 = st0 op st(i)" form */
+			  unsigned char dst0ub1,	/* base of byte 1 for that form; arg.idx is added */
+			  unsigned char arg0ub0,	/* opcode byte 0 of the "st(i) = st(i) op st0" form */
+			  unsigned char arg0ub1,	/* base of byte 1 for that form; dst.idx is added */
+			  unsigned char argmem_noreg)	/* /digit opcode extension used with 0xD8 for a memory operand */
+{
+   assert(dst.file == file_x87);
+
+   if (arg.file == file_x87) {
+      if (dst.idx == 0) 
+	 emit_2ub(p, dst0ub0, dst0ub1+arg.idx);
+      else if (arg.idx == 0) 
+	 emit_2ub(p, arg0ub0, arg0ub1+dst.idx);	/* st(i) is dst here; arg.idx is always 0 in this branch */
+      else
+	 assert(0);	/* register-register x87 arithmetic needs st0 as one operand */
+   }
+   else if (dst.idx == 0) {
+      assert(arg.file == file_REG32);
+      emit_1ub(p, 0xd8);
+      emit_modrm_noreg(p, argmem_noreg, arg);
+   }
+   else
+      assert(0);	/* a memory operand can only be combined into st0 */
+}
+
+void x87_fmul( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
+{
+   x87_arith_op(p, dst, arg, 
+		0xd8, 0xc8,	/* FMUL st0, st(i) */
+		0xdc, 0xc8,	/* FMUL st(i), st0 */
+		1);	/* FMUL m32fp is D8 /1; /4 would encode FSUB (see x87_fsub) */
+}
+
+void x87_fsub( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
+{
+   x87_arith_op(p, dst, arg, 
+		0xd8, 0xe0,	/* FSUB st0, st(i) */
+		0xdc, 0xe8,	/* FSUB st(i), st0 */
+		4);	/* memory form: D8 /4 */
+}
+
+void x87_fsubr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
+{
+   x87_arith_op(p, dst, arg, 
+		0xd8, 0xe8,	/* FSUBR st0, st(i) */
+		0xdc, 0xe0,	/* FSUBR st(i), st0 */
+		5);	/* memory form: D8 /5 */
+}
+
+void x87_fadd( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
+{
+   x87_arith_op(p, dst, arg, 
+		0xd8, 0xc0,	/* FADD st0, st(i) */
+		0xdc, 0xc0,	/* FADD st(i), st0 */
+		0);	/* memory form: D8 /0 */
+}
+
+void x87_fdiv( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
+{
+   x87_arith_op(p, dst, arg, 
+		0xd8, 0xf0,	/* FDIV st0, st(i) */
+		0xdc, 0xf8,	/* FDIV st(i), st0 */
+		6);	/* memory form: D8 /6 */
+}
+
+void x87_fdivr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
+{
+   x87_arith_op(p, dst, arg, 
+		0xd8, 0xf8,	/* FDIVR st0, st(i) */
+		0xdc, 0xf0,	/* FDIVR st(i), st0 */
+		7);	/* memory form: D8 /7 */
+}
+
+void x87_fmulp( struct x86_function *p, struct x86_reg dst )
+{
+   assert(dst.file == file_x87);
+   assert(dst.idx >= 1);	/* destination must be st(1) or deeper, never st0 */
+   emit_2ub(p, 0xde, 0xc8+dst.idx);	/* FMULP st(i), st0: DE C8+i (multiply, then pop) */
+}
+
+void x87_fsubp( struct x86_function *p, struct x86_reg dst )
+{
+   assert(dst.file == file_x87);
+   assert(dst.idx >= 1);	/* destination must be st(1) or deeper, never st0 */
+   emit_2ub(p, 0xde, 0xe8+dst.idx);	/* FSUBP st(i), st0: DE E8+i (subtract, then pop) */
+}
+
+void x87_fsubrp( struct x86_function *p, struct x86_reg dst )
+{
+   assert(dst.file == file_x87);
+   assert(dst.idx >= 1);	/* destination must be st(1) or deeper, never st0 */
+   emit_2ub(p, 0xde, 0xe0+dst.idx);	/* FSUBRP st(i), st0: DE E0+i (reverse subtract, then pop) */
+}
+
+void x87_faddp( struct x86_function *p, struct x86_reg dst )
+{
+   assert(dst.file == file_x87);
+   assert(dst.idx >= 1);	/* destination must be st(1) or deeper, never st0 */
+   emit_2ub(p, 0xde, 0xc0+dst.idx);	/* FADDP st(i), st0: DE C0+i (add, then pop) */
+}
+
+void x87_fdivp( struct x86_function *p, struct x86_reg dst )
+{
+   assert(dst.file == file_x87);
+   assert(dst.idx >= 1);	/* destination must be st(1) or deeper, never st0 */
+   emit_2ub(p, 0xde, 0xf8+dst.idx);	/* FDIVP st(i), st0: DE F8+i (divide, then pop) */
+}
+
+void x87_fdivrp( struct x86_function *p, struct x86_reg dst )
+{
+   assert(dst.file == file_x87);
+   assert(dst.idx >= 1);	/* destination must be st(1) or deeper, never st0 */
+   emit_2ub(p, 0xde, 0xf0+dst.idx);	/* FDIVRP st(i), st0: DE F0+i (reverse divide, then pop) */
+}
+
+void x87_fucom( struct x86_function *p, struct x86_reg arg )
+{
+   assert(arg.file == file_x87);
+   emit_2ub(p, 0xdd, 0xe0+arg.idx);	/* FUCOM st(i): DD E0+i, unordered compare of st0 with st(i) */
+}
+
+void x87_fucomp( struct x86_function *p, struct x86_reg arg )
+{
+   assert(arg.file == file_x87);
+   emit_2ub(p, 0xdd, 0xe8+arg.idx);	/* FUCOMP st(i): DD E8+i, unordered compare then pop */
+}
+
+void x87_fucompp( struct x86_function *p )
+{
+   emit_2ub(p, 0xda, 0xe9);	/* FUCOMPP (DA E9): unordered compare of st0 with st1, then pop twice */
+}
+
+void x87_fxch( struct x86_function *p, struct x86_reg arg )
+{
+   assert(arg.file == file_x87);
+   emit_2ub(p, 0xd9, 0xc8+arg.idx);	/* FXCH st(i): D9 C8+i, exchange st0 with st(i) */
+}
+
+void x87_fabs( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xe1);	/* FABS (D9 E1): st0 = |st0| */
+}
+
+void x87_fchs( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xe0);	/* FCHS (D9 E0): st0 = -st0 */
+}
+
+void x87_fcos( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xff);	/* FCOS (D9 FF): st0 = cos(st0) */
+}
+
+
+void x87_fprndint( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xfc);	/* D9 FC is FRNDINT in the Intel SDM: round st0 to an integer using the current rounding mode */
+}
+
+void x87_fscale( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xfd);	/* FSCALE (D9 FD): scale st0 by 2 raised to the truncated value of st1 */
+}
+
+void x87_fsin( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xfe);	/* FSIN (D9 FE): st0 = sin(st0) */
+}
+
+void x87_fsincos( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xfb);	/* FSINCOS (D9 FB): replaces st0 with its sine, then pushes its cosine */
+}
+
+void x87_fsqrt( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xfa);	/* FSQRT (D9 FA): st0 = sqrt(st0) */
+}
+
+void x87_fxtract( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xf4);	/* FXTRACT (D9 F4): split st0 into exponent and significand (significand ends up on top) */
+}
+
+/* st0 = (2^st0)-1
+ *
+ * Restrictions: -1.0 <= st0 <= 1.0
+ */
+void x87_f2xm1( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xf0);	/* F2XM1: opcode bytes D9 F0 */
+}
+
+/* st1 = st1 * log2(st0);
+ * pop_stack;
+ */
+void x87_fyl2x( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xf1);	/* FYL2X: opcode bytes D9 F1 */
+}
+
+/* st1 = st1 * log2(st0 + 1.0);
+ * pop_stack;
+ *
+ * A fast operation, with restrictions: -.29 < st0 < .29 
+ */
+void x87_fyl2xp1( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xf9);	/* FYL2XP1: opcode bytes D9 F9 */
+}
+
+
+void x87_fld( struct x86_function *p, struct x86_reg arg )
+{
+   if (arg.file == file_x87) 
+      emit_2ub(p, 0xd9, 0xc0 + arg.idx);	/* FLD st(i): D9 C0+i, push a copy of st(i) */
+   else {
+      emit_1ub(p, 0xd9);	/* otherwise a memory operand: D9 /0 (32-bit float load per the Intel SDM) */
+      emit_modrm_noreg(p, 0, arg);
+   }
+}
+
+void x87_fst( struct x86_function *p, struct x86_reg dst )
+{
+   if (dst.file == file_x87) 
+      emit_2ub(p, 0xdd, 0xd0 + dst.idx);	/* FST st(i): DD D0+i, copy st0 into st(i) */
+   else {
+      emit_1ub(p, 0xd9);	/* otherwise a memory operand: D9 /2 (32-bit float store per the Intel SDM) */
+      emit_modrm_noreg(p, 2, dst);
+   }
+}
+
+void x87_fstp( struct x86_function *p, struct x86_reg dst )
+{
+   if (dst.file == file_x87) 
+      emit_2ub(p, 0xdd, 0xd8 + dst.idx);	/* FSTP st(i): DD D8+i, copy st0 into st(i), then pop */
+   else {
+      emit_1ub(p, 0xd9);	/* otherwise a memory operand: D9 /3 (32-bit float store and pop per the Intel SDM) */
+      emit_modrm_noreg(p, 3, dst);
+   }
+}
+
+void x87_fcom( struct x86_function *p, struct x86_reg dst )
+{
+   if (dst.file == file_x87) 
+      emit_2ub(p, 0xd8, 0xd0 + dst.idx);	/* FCOM st(i): D8 D0+i, compare st0 with st(i) */
+   else {
+      emit_1ub(p, 0xd8);	/* otherwise compare st0 with a memory operand: D8 /2 */
+      emit_modrm_noreg(p, 2, dst);
+   }
+}
+
+void x87_fcomp( struct x86_function *p, struct x86_reg dst )
+{
+   if (dst.file == file_x87) 
+      emit_2ub(p, 0xd8, 0xd8 + dst.idx);	/* FCOMP st(i): D8 D8+i, compare st0 with st(i), then pop */
+   else {
+      emit_1ub(p, 0xd8);	/* otherwise compare st0 with a memory operand, then pop: D8 /3 */
+      emit_modrm_noreg(p, 3, dst);
+   }
+}
+
+
+void x87_fnstsw( struct x86_function *p, struct x86_reg dst )
+{
+   assert(dst.file == file_REG32);
+
+   if (dst.idx == reg_AX &&
+       dst.mod == mod_REG) 
+      emit_2ub(p, 0xdf, 0xe0);	/* FNSTSW AX (DF E0): status word straight into AX */
+   else {
+      emit_1ub(p, 0xdd);	/* every other operand takes the DD /7 form, which stores to memory; NOTE(review): a bare register other than AX is not rejected here */
+      emit_modrm_noreg(p, 7, dst);
+   }
+}
+
+
+
+
+/***********************************************************************
+ * MMX instructions
+ */
+
+void mmx_emms( struct x86_function *p )
+{
+   assert(p->need_emms);	/* only legal after an emitter that set need_emms */
+   emit_2ub(p, 0x0f, 0x77);	/* EMMS (0F 77) */
+   p->need_emms = 0;	/* flag cleared until the next emitter sets it again */
+}
+
+void mmx_packssdw( struct x86_function *p,
+		   struct x86_reg dst,
+		   struct x86_reg src )
+{
+   assert(dst.file == file_MMX && 	/* destination must be an MMX register */
+	  (src.file == file_MMX || src.mod != mod_REG));	/* source: MMX register, or any non-register (memory) operand */
+
+   p->need_emms = 1;	/* record MMX use; mmx_emms() asserts on this flag and clears it */
+
+   emit_2ub(p, X86_TWOB, 0x6b);	/* PACKSSDW: escape byte + opcode 0x6B */
+   emit_modrm( p, dst, src );
+}
+
+void mmx_packuswb( struct x86_function *p,
+		   struct x86_reg dst,
+		   struct x86_reg src )
+{
+   assert(dst.file == file_MMX && 	/* destination must be an MMX register */
+	  (src.file == file_MMX || src.mod != mod_REG));	/* source: MMX register, or any non-register (memory) operand */
+
+   p->need_emms = 1;	/* record MMX use; mmx_emms() asserts on this flag and clears it */
+
+   emit_2ub(p, X86_TWOB, 0x67);	/* PACKUSWB: escape byte + opcode 0x67 */
+   emit_modrm( p, dst, src );
+}
+
+void mmx_movd( struct x86_function *p,
+	       struct x86_reg dst,
+	       struct x86_reg src )
+{
+   p->need_emms = 1;	/* record MMX use; note that, unlike the pack emitters, operand files are not asserted here */
+   emit_1ub(p, X86_TWOB);	/* escape byte; the opcode itself is chosen below */
+   emit_op_modrm( p, 0x6e, 0x7e, dst, src );	/* two MOVD opcodes are supplied; presumably emit_op_modrm() picks one by operand direction - confirm at its definition */
+}
+
+void mmx_movq( struct x86_function *p,
+	       struct x86_reg dst,
+	       struct x86_reg src )
+{
+   p->need_emms = 1;	/* record MMX use; mmx_emms() asserts on this flag and clears it */
+   emit_1ub(p, X86_TWOB);	/* escape byte; the opcode itself is chosen below */
+   emit_op_modrm( p, 0x6f, 0x7f, dst, src );	/* two MOVQ opcodes are supplied; presumably emit_op_modrm() picks one by operand direction - confirm at its definition */
+}
+
+
+/***********************************************************************
+ * Helper functions
+ */
+
+
+/* Retrieve a reference to one of the function arguments, taking into
+ * account any push/pop activity:
+ */
+struct x86_reg x86_fn_arg( struct x86_function *p,
+			   unsigned arg )
+{
+   return x86_make_disp(x86_make_reg(file_REG32, reg_SP), 	/* operand is relative to the stack pointer register */
+			p->stack_offset + arg * 4);	/* 4 bytes per slot on top of the tracked stack_offset; NOTE(review): arg == 0 yields displacement stack_offset itself, presumably the return address, so arguments look 1-based - confirm with callers */
+}
+
+
+void x86_init_func( struct x86_function *p )
+{
+   p->size = 0;	/* no code buffer is attached by this initialiser */
+   p->store = p->csr = NULL;
+   p->stack_offset = p->need_emms = 0;	/* x86_fn_arg() reads stack_offset and mmx_emms() asserts need_emms, so neither may start as garbage */
+}
+
+void x86_init_func_size( struct x86_function *p, unsigned code_size )
+{
+   p->store = p->csr = _mesa_exec_malloc(code_size);	/* write cursor starts at the beginning of the buffer */
+   p->size = p->store ? code_size : 0;	/* never advertise capacity when the allocation failed */
+   p->stack_offset = p->need_emms = 0;	/* x86_fn_arg() reads stack_offset and mmx_emms() asserts need_emms, so neither may start as garbage */
+}
+
+void x86_release_func( struct x86_function *p )
+{
+   _mesa_exec_free(p->store);	/* hand the code buffer back to the executable-memory allocator */
+   p->store = NULL;	/* then reset the descriptor so stale pointers cannot be reused */
+   p->csr = NULL;
+   p->size = 0;
+}
+
+
+void (*x86_get_func( struct x86_function *p ))(void)
+{
+   if (DISASSEM && p->store)	/* debug aid: when DISASSEM is set, print the start and current end of the emitted code */
+      _mesa_printf("disassemble %p %p\n", p->store, p->csr);
+   return (void (*)(void)) (unsigned long) p->store;	/* data pointer -> integer -> function pointer; returns NULL-equivalent when no buffer is attached */
+}
+
+#else
+
+void x86sse_dummy( void )	/* empty placeholder compiled in the #else branch, i.e. when the emitter code above is excluded */
+{
+}
+
+#endif
diff --git a/src/gallium/auxiliary/rtasm/x86sse.h b/src/gallium/auxiliary/rtasm/x86sse.h
new file mode 100644
index 0000000000..c2aa416492
--- /dev/null
+++ b/src/gallium/auxiliary/rtasm/x86sse.h
@@ -0,0 +1,256 @@
+
+#ifndef _X86SSE_H_
+#define _X86SSE_H_
+
+#if defined(__i386__) || defined(__386__)
+
+/* It is up to the caller to ensure that instructions issued are
+ * suitable for the host cpu.  There are no checks made in this module
+ * for mmx/sse/sse2 support on the cpu.
+ */
+struct x86_reg {
+   unsigned file:3;		/* register file, an enum x86_reg_file value */
+   unsigned idx:3;		/* register number within that file (0..7) */
+   unsigned mod:2;		/* mod_REG if this is just a register */
+   int      disp:24;		/* only +/- 23bits of offset - should be enough... */
+};
+
+struct x86_function {
+   unsigned size;		/* size of the code buffer, as passed to x86_init_func_size() */
+   unsigned char *store;	/* start of the code buffer */
+   unsigned char *csr;		/* current write position inside the buffer */
+   unsigned stack_offset;	/* added to argument displacements by x86_fn_arg() */
+   int need_emms;		/* set by emitters that use MMX registers, cleared by mmx_emms() */
+   const char *fn;		/* not referenced in the visible code; presumably a debug name */
+};
+
+enum x86_reg_file {
+   file_REG32,		/* 32-bit general purpose registers */
+   file_MMX,		/* MMX registers */
+   file_XMM,		/* SSE XMM registers */
+   file_x87		/* x87 floating point stack, st(0)..st(7) */
+};
+
+/* Values for mod field of modr/m byte
+ */
+enum x86_reg_mod {
+   mod_INDIRECT,		/* 0: memory operand, no displacement */
+   mod_DISP8,			/* 1: memory operand with 8-bit displacement */
+   mod_DISP32,			/* 2: memory operand with 32-bit displacement */
+   mod_REG			/* 3: the register itself */
+};
+
+enum x86_reg_name {
+   reg_AX,		/* 0 - enumerator order follows the hardware register numbering */
+   reg_CX,		/* 1 */
+   reg_DX,		/* 2 */
+   reg_BX,		/* 3 */
+   reg_SP,		/* 4 - stack pointer, used as the base by x86_fn_arg() */
+   reg_BP,		/* 5 */
+   reg_SI,		/* 6 */
+   reg_DI		/* 7 */
+};
+
+
+enum x86_cc {
+   cc_O,			/* overflow (enumerator values 0..5 follow the x86 condition-code numbering) */
+   cc_NO,			/* not overflow */
+   cc_NAE,			/* not above or equal / carry */
+   cc_AE,			/* above or equal / not carry */
+   cc_E,			/* equal / zero */
+   cc_NE			/* not equal / not zero */
+};
+
+enum sse_cc {
+   cc_Equal,			/* 0 - values follow the CMPPS immediate predicate numbering in the Intel SDM */
+   cc_LessThan,			/* 1 */
+   cc_LessThanEqual,		/* 2 */
+   cc_Unordered,		/* 3 */
+   cc_NotEqual,			/* 4 */
+   cc_NotLessThan,		/* 5 */
+   cc_NotLessThanEqual,		/* 6 */
+   cc_Ordered			/* 7 */
+};
+
+#define cc_Z  cc_E
+#define cc_NZ cc_NE
+
+/* Begin/end/retrieve function creation:
+ */
+
+
+void x86_init_func( struct x86_function *p );
+void x86_init_func_size( struct x86_function *p, unsigned code_size );
+void x86_release_func( struct x86_function *p );
+void (*x86_get_func( struct x86_function *p ))( void );
+
+
+
+/* Create and manipulate registers and regmem values:
+ */
+struct x86_reg x86_make_reg( enum x86_reg_file file,
+			     enum x86_reg_name idx );
+
+struct x86_reg x86_make_disp( struct x86_reg reg,
+			      int disp );
+
+struct x86_reg x86_deref( struct x86_reg reg );
+
+struct x86_reg x86_get_base_reg( struct x86_reg reg );
+
+
+/* Labels, jumps and fixup:
+ */
+unsigned char *x86_get_label( struct x86_function *p );
+
+void x86_jcc( struct x86_function *p,
+	      enum x86_cc cc,
+	      unsigned char *label );
+
+unsigned char *x86_jcc_forward( struct x86_function *p,
+			  enum x86_cc cc );
+
+unsigned char *x86_jmp_forward( struct x86_function *p);
+
+unsigned char *x86_call_forward( struct x86_function *p);
+
+void x86_fixup_fwd_jump( struct x86_function *p,
+			 unsigned char *fixup );
+
+void x86_jmp( struct x86_function *p, unsigned char *label );
+
+/* void x86_call( struct x86_function *p, void (*label)() ); */
+void x86_call( struct x86_function *p, struct x86_reg reg);
+
+/* michal:
+ * Temporary. As I need immediate operands, and don't want to mess with the codegen,
+ * I load the immediate into a general purpose register and use it.
+ */
+void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm );
+
+
+/* Macros for sse_shufps() and sse2_pshufd(): four 2-bit lane selectors packed into one byte, _x in bits 0-1.
+ */
+#define SHUF(_x,_y,_z,_w)       (((_x)<<0) | ((_y)<<2) | ((_z)<<4) | ((_w)<<6))
+#define SHUF_NOOP               SHUF(0,1,2,3)	/* identity selector; was RSW(), which this header never defines */
+#define GET_SHUF(swz, idx)      (((swz) >> ((idx)*2)) & 0x3)	/* extract selector number idx (0..3) */
+
+void mmx_emms( struct x86_function *p );
+void mmx_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void mmx_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void mmx_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void mmx_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+
+void sse2_cvtps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_cvttps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_packsswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_pshufd( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0,
+                  unsigned char shuf );
+void sse2_rcpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_rcpss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+
+void sse_addps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_addss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_cvtps2pi( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_divss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_andnps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_andps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_cmpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src,
+                unsigned char cc );
+void sse_maxps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_maxss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_minps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_movaps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_movhlps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_movhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_movlhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_movlps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_movss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_movups( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_mulps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_mulss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_orps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_xorps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_subps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_rsqrtps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_rsqrtss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_shufps( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0,
+                 unsigned char shuf );
+void sse_pmovmskb( struct x86_function *p, struct x86_reg dest, struct x86_reg src );
+
+void x86_add( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_and( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_cmp( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_dec( struct x86_function *p, struct x86_reg reg );
+void x86_inc( struct x86_function *p, struct x86_reg reg );
+void x86_lea( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_mov( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_mul( struct x86_function *p, struct x86_reg src );
+void x86_or( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_pop( struct x86_function *p, struct x86_reg reg );
+void x86_push( struct x86_function *p, struct x86_reg reg );
+void x86_ret( struct x86_function *p );
+void x86_sub( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_test( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_xor( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_sahf( struct x86_function *p );
+
+void x87_f2xm1( struct x86_function *p );
+void x87_fabs( struct x86_function *p );
+void x87_fadd( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
+void x87_faddp( struct x86_function *p, struct x86_reg dst );
+void x87_fchs( struct x86_function *p );
+void x87_fclex( struct x86_function *p );
+void x87_fcom( struct x86_function *p, struct x86_reg dst );
+void x87_fcomp( struct x86_function *p, struct x86_reg dst );
+void x87_fcos( struct x86_function *p );
+void x87_fdiv( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
+void x87_fdivp( struct x86_function *p, struct x86_reg dst );
+void x87_fdivr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
+void x87_fdivrp( struct x86_function *p, struct x86_reg dst );
+void x87_fild( struct x86_function *p, struct x86_reg arg );
+void x87_fist( struct x86_function *p, struct x86_reg dst );
+void x87_fistp( struct x86_function *p, struct x86_reg dst );
+void x87_fld( struct x86_function *p, struct x86_reg arg );
+void x87_fld1( struct x86_function *p );
+void x87_fldcw( struct x86_function *p, struct x86_reg arg );
+void x87_fldl2e( struct x86_function *p );
+void x87_fldln2( struct x86_function *p );
+void x87_fldz( struct x86_function *p );
+void x87_fmul( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
+void x87_fmulp( struct x86_function *p, struct x86_reg dst );
+void x87_fnclex( struct x86_function *p );
+void x87_fprndint( struct x86_function *p );
+void x87_fscale( struct x86_function *p );
+void x87_fsin( struct x86_function *p );
+void x87_fsincos( struct x86_function *p );
+void x87_fsqrt( struct x86_function *p );
+void x87_fst( struct x86_function *p, struct x86_reg dst );
+void x87_fstp( struct x86_function *p, struct x86_reg dst );
+void x87_fsub( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
+void x87_fsubp( struct x86_function *p, struct x86_reg dst );
+void x87_fsubr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
+void x87_fsubrp( struct x86_function *p, struct x86_reg dst );
+void x87_fxch( struct x86_function *p, struct x86_reg dst );
+void x87_fxtract( struct x86_function *p );
+void x87_fyl2x( struct x86_function *p );
+void x87_fyl2xp1( struct x86_function *p );
+void x87_fwait( struct x86_function *p );
+void x87_fnstsw( struct x86_function *p, struct x86_reg dst );
+void x87_fucompp( struct x86_function *p );
+void x87_fucomp( struct x86_function *p, struct x86_reg arg );
+void x87_fucom( struct x86_function *p, struct x86_reg arg );
+
+
+
+/* Retrieve a reference to one of the function arguments, taking into
+ * account any push/pop activity.  Note - doesn't track explicit
+ * manipulation of ESP by other instructions.
+ */
+struct x86_reg x86_fn_arg( struct x86_function *p, unsigned arg );
+
+#endif
+#endif
-- 
cgit v1.2.3


From 39ea0308425ad04618061129c63c22ac0efb0692 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Tue, 19 Feb 2008 12:00:48 +0900
Subject: Rename rtasm files.

---
 src/gallium/auxiliary/rtasm/Makefile        |    4 +-
 src/gallium/auxiliary/rtasm/SConscript      |    4 +-
 src/gallium/auxiliary/rtasm/execmem.c       |  133 ---
 src/gallium/auxiliary/rtasm/execmem.h       |   45 -
 src/gallium/auxiliary/rtasm/rtasm_execmem.c |  134 +++
 src/gallium/auxiliary/rtasm/rtasm_execmem.h |   45 +
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c  | 1196 +++++++++++++++++++++++++++
 src/gallium/auxiliary/rtasm/rtasm_x86sse.h  |  256 ++++++
 src/gallium/auxiliary/rtasm/x86sse.c        | 1195 --------------------------
 src/gallium/auxiliary/rtasm/x86sse.h        |  256 ------
 10 files changed, 1635 insertions(+), 1633 deletions(-)
 delete mode 100644 src/gallium/auxiliary/rtasm/execmem.c
 delete mode 100644 src/gallium/auxiliary/rtasm/execmem.h
 create mode 100644 src/gallium/auxiliary/rtasm/rtasm_execmem.c
 create mode 100644 src/gallium/auxiliary/rtasm/rtasm_execmem.h
 create mode 100644 src/gallium/auxiliary/rtasm/rtasm_x86sse.c
 create mode 100644 src/gallium/auxiliary/rtasm/rtasm_x86sse.h
 delete mode 100644 src/gallium/auxiliary/rtasm/x86sse.c
 delete mode 100644 src/gallium/auxiliary/rtasm/x86sse.h

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/Makefile b/src/gallium/auxiliary/rtasm/Makefile
index b3b9934e10..7c8ac60794 100644
--- a/src/gallium/auxiliary/rtasm/Makefile
+++ b/src/gallium/auxiliary/rtasm/Makefile
@@ -5,9 +5,9 @@ include $(TOP)/configs/current
 LIBNAME = rtasm
 
 DRIVER_SOURCES = \
-	x86sse.c \
-	mm.c \
-	execmem.c
+	rtasm_execmem.c \
+	rtasm_x86sse.c \
+	mm.c
 
 C_SOURCES = \
 	$(DRIVER_SOURCES)
diff --git a/src/gallium/auxiliary/rtasm/SConscript b/src/gallium/auxiliary/rtasm/SConscript
index c5b1551786..de8456e0ca 100644
--- a/src/gallium/auxiliary/rtasm/SConscript
+++ b/src/gallium/auxiliary/rtasm/SConscript
@@ -3,9 +3,9 @@ Import('*')
 rtasm = env.ConvenienceLibrary(
 	target = 'rtasm',
 	source = [
-		'x86sse.c',
+		'rtasm_execmem.c',
+		'rtasm_x86sse.c',
 		'mm.c',
-		'execmem.c',
 	])
 
 auxiliaries.insert(0, rtasm)
diff --git a/src/gallium/auxiliary/rtasm/execmem.c b/src/gallium/auxiliary/rtasm/execmem.c
deleted file mode 100644
index c7c35f7ef2..0000000000
--- a/src/gallium/auxiliary/rtasm/execmem.c
+++ /dev/null
@@ -1,133 +0,0 @@
-/**************************************************************************
- *
- * Copyright (C) 1999-2005  Brian Paul   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-/**
- * \file exemem.c
- * Functions for allocating executable memory.
- *
- * \author Keith Whitwell
- */
-
-
-#include "pipe/p_compiler.h"
-#include "pipe/p_thread.h"
-
-#include "execmem.h"
-
-
-#if defined(__linux__)
-
-/*
- * Allocate a large block of memory which can hold code then dole it out
- * in pieces by means of the generic memory manager code.
-*/
-
-#include <unistd.h>
-#include <sys/mman.h>
-#include "mm.h"
-
-#define EXEC_HEAP_SIZE (10*1024*1024)
-
-_glthread_DECLARE_STATIC_MUTEX(exec_mutex);
-
-static struct mem_block *exec_heap = NULL;
-static unsigned char *exec_mem = NULL;
-
-
-static void
-init_heap(void)
-{
-   if (!exec_heap)
-      exec_heap = mmInit( 0, EXEC_HEAP_SIZE );
-   
-   if (!exec_mem)
-      exec_mem = (unsigned char *) mmap(0, EXEC_HEAP_SIZE, 
-					PROT_EXEC | PROT_READ | PROT_WRITE, 
-					MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-}
-
-
-void *
-_mesa_exec_malloc(size_t size)
-{
-   struct mem_block *block = NULL;
-   void *addr = NULL;
-
-   _glthread_LOCK_MUTEX(exec_mutex);
-
-   init_heap();
-
-   if (exec_heap) {
-      size = (size + 31) & ~31;
-      block = mmAllocMem( exec_heap, size, 32, 0 );
-   }
-
-   if (block)
-      addr = exec_mem + block->ofs;
-   else 
-      debug_printf("_mesa_exec_malloc failed\n");
-   
-   _glthread_UNLOCK_MUTEX(exec_mutex);
-   
-   return addr;
-}
-
- 
-void 
-_mesa_exec_free(void *addr)
-{
-   _glthread_LOCK_MUTEX(exec_mutex);
-
-   if (exec_heap) {
-      struct mem_block *block = mmFindBlock(exec_heap, (unsigned char *)addr - exec_mem);
-   
-      if (block)
-	 mmFreeMem(block);
-   }
-
-   _glthread_UNLOCK_MUTEX(exec_mutex);
-}
-
-
-#else
-
-/*
- * Just use regular memory.
- */
-
-void *
-_mesa_exec_malloc(GLuint size)
-{
-   return _mesa_malloc( size );
-}
-
- 
-void 
-_mesa_exec_free(void *addr)
-{
-   _mesa_free(addr);
-}
-
-
-#endif
diff --git a/src/gallium/auxiliary/rtasm/execmem.h b/src/gallium/auxiliary/rtasm/execmem.h
deleted file mode 100644
index 9fd4569165..0000000000
--- a/src/gallium/auxiliary/rtasm/execmem.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/**************************************************************************
- *
- * Copyright (C) 1999-2005  Brian Paul   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * \file exemem.c
- * Functions for allocating executable memory.
- *
- * \author Keith Whitwell
- */
-
-#ifndef _EXECMEM_H_
-#define _EXECMEM_H_
-
-#include "pipe/p_compiler.h"
-
-
-extern void *
-_mesa_exec_malloc( size_t size );
-
-
-extern void 
-_mesa_exec_free( void *addr );
-
-
-#endif
diff --git a/src/gallium/auxiliary/rtasm/rtasm_execmem.c b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
new file mode 100644
index 0000000000..cb13db2498
--- /dev/null
+++ b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
@@ -0,0 +1,134 @@
+/**************************************************************************
+ *
+ * Copyright (C) 1999-2005  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * \file rtasm_execmem.c
+ * Functions for allocating executable memory.
+ *
+ * \author Keith Whitwell
+ */
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_debug.h"
+#include "pipe/p_thread.h"
+
+#include "rtasm_execmem.h"
+
+
+#if defined(__linux__)
+
+/*
+ * Allocate a large block of memory which can hold code then dole it out
+ * in pieces by means of the generic memory manager code.
+*/
+
+#include <unistd.h>
+#include <sys/mman.h>
+#include "mm.h"
+
+#define EXEC_HEAP_SIZE (10*1024*1024)
+
+_glthread_DECLARE_STATIC_MUTEX(exec_mutex);
+
+static struct mem_block *exec_heap = NULL;
+static unsigned char *exec_mem = NULL;
+
+
+static void
+init_heap(void)
+{
+   if (!exec_mem) {   /* map the executable arena once; mmap() reports failure as MAP_FAILED, not NULL */
+      void *map = mmap(NULL, EXEC_HEAP_SIZE, PROT_EXEC | PROT_READ | PROT_WRITE,
+                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+      exec_mem = (map == MAP_FAILED) ? NULL : (unsigned char *) map;   /* stay NULL so a later call retries */
+   }
+   if (exec_mem && !exec_heap)   /* no allocator without memory behind it: rtasm_exec_malloc() then fails cleanly */
+      exec_heap = mmInit( 0, EXEC_HEAP_SIZE );
+}
+
+
+void *
+rtasm_exec_malloc(size_t size)
+{
+   struct mem_block *block = NULL;
+   void *addr = NULL;
+
+   _glthread_LOCK_MUTEX(exec_mutex);	/* heap state is shared; every exit path below unlocks */
+
+   init_heap();	/* lazily creates the arena and its allocator on first use */
+
+   if (exec_heap) {
+      size = (size + 31) & ~31;	/* round the request up to a multiple of 32 bytes */
+      block = mmAllocMem( exec_heap, size, 5, 0 );	/* third argument is log2 of the alignment: 5 -> 32 bytes (32 would mean 2^32) */
+   }
+
+   if (block)
+      addr = exec_mem + block->ofs;	/* the allocator deals in offsets; rebase onto the mapping */
+   else 
+      debug_printf("rtasm_exec_malloc failed\n");
+   
+   _glthread_UNLOCK_MUTEX(exec_mutex);
+   
+   return addr;	/* NULL when no block could be allocated */
+}
+
+ 
+void 
+rtasm_exec_free(void *addr)
+{
+   _glthread_LOCK_MUTEX(exec_mutex);	/* heap state is shared with rtasm_exec_malloc() */
+
+   if (exec_heap) {
+      struct mem_block *block = mmFindBlock(exec_heap, (unsigned char *)addr - exec_mem);	/* convert the pointer back into an offset inside the arena */
+   
+      if (block)	/* addresses the allocator does not know are silently ignored */
+	 mmFreeMem(block);
+   }
+
+   _glthread_UNLOCK_MUTEX(exec_mutex);
+}
+
+
+#else
+
+/*
+ * Just use regular memory.
+ */
+
+void *
+rtasm_exec_malloc(size_t size)	/* size_t, matching the prototype in rtasm_execmem.h */
+{
+   return MALLOC( size );	/* fallback for non-Linux builds: ordinary heap memory, no executable mapping */
+}
+
+ 
+void 
+rtasm_exec_free(void *addr)
+{
+   FREE(addr);	/* counterpart of the MALLOC() used by the fallback rtasm_exec_malloc() */
+}
+
+
+#endif
diff --git a/src/gallium/auxiliary/rtasm/rtasm_execmem.h b/src/gallium/auxiliary/rtasm/rtasm_execmem.h
new file mode 100644
index 0000000000..155c6d34e0
--- /dev/null
+++ b/src/gallium/auxiliary/rtasm/rtasm_execmem.h
@@ -0,0 +1,45 @@
+/**************************************************************************
+ *
+ * Copyright (C) 1999-2005  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * \file rtasm_execmem.h
+ * Functions for allocating executable memory.
+ *
+ * \author Keith Whitwell
+ */
+
+#ifndef _RTASM_EXECMEM_H_
+#define _RTASM_EXECMEM_H_
+
+#include "pipe/p_compiler.h"
+
+
+/* Allocate size bytes of memory intended to hold generated code. */
+extern void *
+rtasm_exec_malloc( size_t size );
+
+
+/* Release memory previously returned by rtasm_exec_malloc(). */
+extern void 
+rtasm_exec_free( void *addr );
+
+
+#endif
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
new file mode 100644
index 0000000000..3c885a9fff
--- /dev/null
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -0,0 +1,1196 @@
+#if defined(__i386__) || defined(__386__)
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_debug.h"
+
+#include "rtasm_execmem.h"
+#include "rtasm_x86sse.h"
+
+/* When non-zero, x86_get_func() prints the bounds of the generated code. */
+#define DISASSEM 0
+/* 0x0f: first byte of the two-byte opcodes emitted in this file. */
+#define X86_TWOB 0x0f
+
+/* Convert a function pointer to a byte pointer, going through
+ * unsigned long.  Only referenced by the disabled pc-relative x86_call().
+ */
+static unsigned char *cptr( void (*label)() )
+{
+   return (unsigned char *)(unsigned long)label;
+}
+
+
+/* Create the code buffer of p (1024 bytes) when it has none, otherwise
+ * double it: the bytes emitted so far are copied into the new buffer,
+ * the cursor is re-based onto it and the old buffer is released.
+ * The result of rtasm_exec_malloc() is used without a NULL check.
+ */
+static void do_realloc( struct x86_function *p )
+{
+   unsigned char *old_store;
+   unsigned bytes_used;
+
+   if (p->size == 0) {
+      /* First allocation: the cursor starts at the beginning. */
+      p->size = 1024;
+      p->store = rtasm_exec_malloc(p->size);
+      p->csr = p->store;
+      return;
+   }
+
+   old_store = p->store;
+   bytes_used = p->csr - old_store;
+
+   p->size *= 2;
+   p->store = rtasm_exec_malloc(p->size);
+   memcpy(p->store, old_store, bytes_used);
+   p->csr = p->store + bytes_used;
+   rtasm_exec_free(old_store);
+}
+
+/* Emit bytes to the instruction stream:
+ *
+ * Make room for 'bytes' more bytes and return a pointer to the first of
+ * them; the cursor is advanced past the reservation.
+ */
+static unsigned char *reserve( struct x86_function *p, int bytes )
+{
+   unsigned char *csr;
+
+   /* Keep growing until the request fits: a single doubling is not
+    * enough when x86_init_func_size() created a very small buffer.
+    */
+   while (p->csr + bytes - p->store > p->size)
+      do_realloc(p);
+
+   csr = p->csr;
+   p->csr += bytes;
+   return csr;
+}
+
+
+
+/* Append one signed byte (used for 8-bit displacements and offsets). */
+static void emit_1b( struct x86_function *p, char b0 )
+{
+   *(char *)reserve(p, 1) = b0;
+}
+
+/* Append one int in host byte order (32-bit immediates/displacements). */
+static void emit_1i( struct x86_function *p, int i0 )
+{
+   *(int *)reserve(p, sizeof(int)) = i0;
+}
+
+/* Append one unsigned byte. */
+static void emit_1ub( struct x86_function *p, unsigned char b0 )
+{
+   unsigned char *out = reserve(p, 1);
+   out[0] = b0;
+}
+
+/* Append two unsigned bytes, b0 first, in a single reservation. */
+static void emit_2ub( struct x86_function *p, unsigned char b0, unsigned char b1 )
+{
+   unsigned char *out = reserve(p, 2);
+   out[0] = b0;
+   out[1] = b1;
+}
+
+/* Append three unsigned bytes, b0 first, in a single reservation. */
+static void emit_3ub( struct x86_function *p, unsigned char b0, unsigned char b1, unsigned char b2 )
+{
+   unsigned char *out = reserve(p, 3);
+   out[0] = b0;
+   out[1] = b1;
+   out[2] = b2;
+}
+
+
+/* Build a modRM byte + possible displacement.  No treatment of SIB
+ * indexing beyond the fixed [ESP] form.  BZZT - no way to encode an
+ * absolute address.
+ *
+ * reg    - supplies the 3-bit reg field; must be a plain register
+ *          (mod_REG).
+ * regmem - supplies the mod and r/m fields, plus the displacement that
+ *          follows when mod is mod_DISP8 or mod_DISP32.
+ */
+static void emit_modrm( struct x86_function *p,
+			struct x86_reg reg,
+			struct x86_reg regmem )
+{
+   unsigned char val = 0;
+
+   assert(reg.mod == mod_REG);
+
+   val |= regmem.mod << 6;     	/* mod field */
+   val |= reg.idx << 3;		/* reg field */
+   val |= regmem.idx;		/* r/m field */
+
+   emit_1ub(p, val);
+
+   /* Oh-oh we've stumbled into the SIB thing.  r/m == 4 only selects a
+    * SIB byte for memory operands; with mod_REG it names ESP itself and
+    * an extra byte would corrupt the instruction stream.
+    */
+   if (regmem.file == file_REG32 &&
+       regmem.idx == reg_SP &&
+       regmem.mod != mod_REG) {
+      emit_1ub(p, 0x24);		/* simplistic! */
+   }
+
+   switch (regmem.mod) {
+   case mod_REG:
+   case mod_INDIRECT:
+      break;
+   case mod_DISP8:
+      emit_1b(p, regmem.disp);
+      break;
+   case mod_DISP32:
+      emit_1i(p, regmem.disp);
+      break;
+   default:
+      assert(0);
+      break;
+   }
+}
+
+
+/* Emit a modRM byte whose reg field carries the opcode extension 'op'
+ * rather than a register operand.
+ */
+static void emit_modrm_noreg( struct x86_function *p, unsigned op, struct x86_reg regmem )
+{
+   emit_modrm(p, x86_make_reg(file_REG32, op), regmem);
+}
+
+/* Pick between the two opcodes of an instruction depending on whether
+ * the destination is a register or a memory reference, then emit the
+ * opcode followed by the matching modRM byte.
+ *
+ * op_dst_is_reg is used when dst is a plain register (src may be either
+ * kind); op_dst_is_mem is used when dst is a memory reference, in which
+ * case src must be a plain register.
+ */
+static void emit_op_modrm( struct x86_function *p,
+			   unsigned char op_dst_is_reg,
+			   unsigned char op_dst_is_mem,
+			   struct x86_reg dst,
+			   struct x86_reg src )
+{
+   if (dst.mod == mod_REG) {
+      emit_1ub(p, op_dst_is_reg);
+      emit_modrm(p, dst, src);
+   }
+   else if (dst.mod == mod_INDIRECT ||
+            dst.mod == mod_DISP32 ||
+            dst.mod == mod_DISP8) {
+      assert(src.mod == mod_REG);
+      emit_1ub(p, op_dst_is_mem);
+      emit_modrm(p, src, dst);
+   }
+   else {
+      assert(0);
+   }
+}
+
+
+
+
+
+
+
+/* Create and manipulate registers and regmem values:
+ */
+
+/* Describe register 'idx' of register file 'file' as a direct operand
+ * (mod_REG, zero displacement).
+ */
+struct x86_reg x86_make_reg( enum x86_reg_file file, enum x86_reg_name idx )
+{
+   struct x86_reg r;
+
+   r.disp = 0;
+   r.mod = mod_REG;
+   r.idx = idx;
+   r.file = file;
+
+   return r;
+}
+
+/* Turn a 32-bit register, or an existing memory reference based on one,
+ * into a memory reference with displacement 'disp'.
+ *
+ * For a plain register 'disp' becomes the displacement; for an existing
+ * memory reference it is added to the current one.  The shortest
+ * encoding that can hold the result is selected.
+ */
+struct x86_reg x86_make_disp( struct x86_reg reg,
+			      int disp )
+{
+   assert(reg.file == file_REG32);
+
+   if (reg.mod == mod_REG)
+      reg.disp = disp;
+   else
+      reg.disp += disp;
+
+   /* mod 00 with r/m == 5 (EBP) does not mean [EBP]: it selects a bare
+    * 32-bit absolute address.  Encode [EBP] with an explicit zero disp8.
+    */
+   if (reg.disp == 0 && reg.idx != reg_BP)
+      reg.mod = mod_INDIRECT;
+   else if (reg.disp <= 127 && reg.disp >= -128)
+      reg.mod = mod_DISP8;
+   else
+      reg.mod = mod_DISP32;
+
+   return reg;
+}
+
+/* Memory reference through reg: x86_make_disp() with an offset of 0. */
+struct x86_reg x86_deref( struct x86_reg reg )
+{
+   return x86_make_disp(reg, 0);
+}
+
+/* Strip mod/displacement: the plain register with reg's file and index. */
+struct x86_reg x86_get_base_reg( struct x86_reg reg )
+{
+   return x86_make_reg( reg.file, reg.idx );
+}
+
+/* A label is the current write cursor, i.e. the address at which the
+ * next byte will be emitted.
+ */
+unsigned char *x86_get_label( struct x86_function *p )
+{
+   return p->csr;
+}
+
+
+
+/***********************************************************************
+ * x86 instructions
+ */
+
+
+/* Conditional jump to an already known label.
+ *
+ * The 2-byte form (0x70+cc, rel8) is used when the target is within
+ * -128..127 bytes of the end of that instruction; otherwise the 6-byte
+ * form (0x0f, 0x80+cc, rel32) is emitted.
+ */
+void x86_jcc( struct x86_function *p, enum x86_cc cc, unsigned char *label )
+{
+   unsigned char *here = x86_get_label(p);
+   int offset = label - (here + 2);
+
+   if (offset >= -128 && offset <= 127) {
+      emit_1ub(p, 0x70 + cc);
+      emit_1b(p, (char) offset);
+      return;
+   }
+
+   /* Long form: the displacement is relative to the end of 6 bytes. */
+   offset = label - (here + 6);
+   emit_2ub(p, 0x0f, 0x80 + cc);
+   emit_1i(p, offset);
+}
+
+/* Always use a 32bit offset for forward jumps:
+ *
+ * Emits 0x0f, 0x80+cc and a zero rel32 placeholder.  The returned
+ * label (the address just past the placeholder) is what
+ * x86_fixup_fwd_jump() expects.
+ */
+unsigned char *x86_jcc_forward( struct x86_function *p,
+			  enum x86_cc cc )
+{
+   emit_2ub(p, 0x0f, 0x80 + cc);
+   emit_1i(p, 0);
+   return x86_get_label(p);
+}
+
+/* Unconditional forward jump: 0xe9 plus a zero rel32 placeholder.
+ * Returns the label just past the placeholder for x86_fixup_fwd_jump().
+ */
+unsigned char *x86_jmp_forward( struct x86_function *p)
+{
+   emit_1ub(p, 0xe9);
+   emit_1i(p, 0);
+   return x86_get_label(p);
+}
+
+/* Forward call: 0xe8 plus a zero rel32 placeholder.
+ * Returns the label just past the placeholder for x86_fixup_fwd_jump().
+ */
+unsigned char *x86_call_forward( struct x86_function *p)
+{
+   emit_1ub(p, 0xe8);
+   emit_1i(p, 0);
+   return x86_get_label(p);
+}
+
+/* Fixup offset from forward jump:
+ *
+ * 'fixup' is a label returned by one of the *_forward() emitters.  The
+ * 4 bytes just before it are overwritten with the distance from
+ * 'fixup' to the current cursor, making the jump land here.
+ */
+void x86_fixup_fwd_jump( struct x86_function *p,
+			 unsigned char *fixup )
+{
+   *(int *)(fixup - 4) = x86_get_label(p) - fixup;
+}
+
+/* Unconditional jump to a known label: 0xe9 + rel32.  The displacement
+ * is computed after the opcode byte is emitted, so the cursor points at
+ * the rel32 field and the instruction ends 4 bytes later.
+ */
+void x86_jmp( struct x86_function *p, unsigned char *label)
+{
+   emit_1ub(p, 0xe9);
+   emit_1i(p, label - x86_get_label(p) - 4);
+}
+
+/* A pc-relative call (0xe8 + rel32) doesn't work once we start
+ * reallocating & copying the generated code on buffer fills, because
+ * the displacement is relative to the current pc.  Emit an indirect
+ * call through a register or memory operand instead: FF /2.
+ */
+void x86_call( struct x86_function *p, struct x86_reg reg)
+{
+   emit_1ub(p, 0xff);
+   /* The reg field of the modRM byte is the opcode extension and must
+    * be 2 for CALL; it is not the operand's register number.
+    */
+   emit_modrm_noreg(p, 2, reg);
+}
+
+
+/* michal:
+ * Temporary. As I need immediate operands, and dont want to mess with the codegen,
+ * I load the immediate into general purpose register and use it.
+ *
+ * Emits 0xb8+idx followed by the 32-bit immediate; dst must be a plain
+ * register.
+ */
+void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+   assert(dst.mod == mod_REG);
+   emit_1ub(p, 0xb8 + dst.idx);
+   emit_1i(p, imm);
+}
+
+/* PUSH r32 (0x50+idx).  Records 4 more bytes of stack use so that
+ * x86_fn_arg() keeps addressing the right slots.
+ */
+void x86_push( struct x86_function *p, struct x86_reg reg )
+{
+   assert(reg.mod == mod_REG);
+
+   emit_1ub(p, 0x50 + reg.idx);
+   p->stack_offset += 4;
+}
+
+/* POP r32 (0x58+idx).  Undoes the stack bookkeeping of one x86_push(). */
+void x86_pop( struct x86_function *p, struct x86_reg reg )
+{
+   assert(reg.mod == mod_REG);
+
+   emit_1ub(p, 0x58 + reg.idx);
+   p->stack_offset -= 4;
+}
+
+/* INC r32, one-byte form (0x40+idx); reg must be a plain register. */
+void x86_inc( struct x86_function *p, struct x86_reg reg )
+{
+   assert(reg.mod == mod_REG);
+
+   emit_1ub(p, 0x40 + reg.idx);
+}
+
+/* DEC r32, one-byte form (0x48+idx); reg must be a plain register. */
+void x86_dec( struct x86_function *p, struct x86_reg reg )
+{
+   assert(reg.mod == mod_REG);
+
+   emit_1ub(p, 0x48 + reg.idx);
+}
+
+/* RET (near return): single byte 0xc3. */
+void x86_ret( struct x86_function *p )
+{
+   emit_1ub(p, 0xc3);
+}
+
+/* SAHF: single byte 0x9e. */
+void x86_sahf( struct x86_function *p )
+{
+   emit_1ub(p, 0x9e);
+}
+
+/* MOV: 0x8b when dst is a register, 0x89 when dst is memory. */
+void x86_mov( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_op_modrm(p, 0x8b, 0x89, dst, src);
+}
+
+/* XOR: 0x33 when dst is a register, 0x31 when dst is memory. */
+void x86_xor( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_op_modrm(p, 0x33, 0x31, dst, src);
+}
+
+/* CMP: 0x3b when dst is a register, 0x39 when dst is memory. */
+void x86_cmp( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_op_modrm(p, 0x3b, 0x39, dst, src);
+}
+
+/* LEA (0x8d): dst goes in the modRM reg field, src is the r/m operand
+ * whose address is taken.
+ */
+void x86_lea( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_1ub(p, 0x8d);
+   emit_modrm(p, dst, src);
+}
+
+/* TEST (0x85): dst goes in the modRM reg field, src is the r/m operand. */
+void x86_test( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_1ub(p, 0x85);
+   emit_modrm(p, dst, src);
+}
+
+/* ADD: 0x03 when dst is a register, 0x01 when dst is memory. */
+void x86_add( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_op_modrm(p, 0x03, 0x01, dst, src);
+}
+
+/* MUL r32: 0xf7 with opcode extension /4 in the modRM reg field.
+ * src must be a plain 32-bit register.
+ */
+void x86_mul( struct x86_function *p, struct x86_reg src )
+{
+   assert (src.file == file_REG32 && src.mod == mod_REG);
+
+   emit_1ub(p, 0xf7);
+   emit_modrm_noreg(p, 4, src);
+}
+
+/* SUB: 0x2b when dst is a register, 0x29 when dst is memory. */
+void x86_sub( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_op_modrm(p, 0x2b, 0x29, dst, src);
+}
+
+/* OR: 0x0b when dst is a register, 0x09 when dst is memory. */
+void x86_or( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_op_modrm(p, 0x0b, 0x09, dst, src);
+}
+
+/* AND: 0x23 when dst is a register, 0x21 when dst is memory. */
+void x86_and( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_op_modrm(p, 0x23, 0x21, dst, src);
+}
+
+
+
+/***********************************************************************
+ * SSE instructions
+ */
+
+
+/* MOVSS: F3 0F 10 when dst is a register, F3 0F 11 when dst is memory. */
+void sse_movss( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_2ub(p, 0xF3, X86_TWOB);
+   emit_op_modrm(p, 0x10, 0x11, dst, src);
+}
+
+/* MOVAPS: 0F 28 when dst is a register, 0F 29 when dst is memory. */
+void sse_movaps( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_1ub(p, X86_TWOB);
+   emit_op_modrm(p, 0x28, 0x29, dst, src);
+}
+
+/* MOVUPS: 0F 10 when dst is a register, 0F 11 when dst is memory. */
+void sse_movups( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_1ub(p, X86_TWOB);
+   emit_op_modrm(p, 0x10, 0x11, dst, src);
+}
+
+/* MOVHPS: 0F 16 (load) / 0F 17 (store).  At least one operand must be a
+ * memory reference; the register-register encoding of 0F 16 is
+ * movlhps.
+ */
+void sse_movhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   assert(dst.mod != mod_REG || src.mod != mod_REG);
+
+   emit_1ub(p, X86_TWOB);
+   emit_op_modrm(p, 0x16, 0x17, dst, src); /* cf movlhps */
+}
+
+/* MOVLPS: 0F 12 (load) / 0F 13 (store).  At least one operand must be a
+ * memory reference; the register-register encoding of 0F 12 is
+ * movhlps.
+ */
+void sse_movlps( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   assert(dst.mod != mod_REG || src.mod != mod_REG);
+
+   emit_1ub(p, X86_TWOB);
+   emit_op_modrm(p, 0x12, 0x13, dst, src); /* cf movhlps */
+}
+
+/* MAXPS: 0F 5F /r, dst in the reg field, src as the r/m operand. */
+void sse_maxps( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_2ub(p, X86_TWOB, 0x5F);
+   emit_modrm(p, dst, src);
+}
+
+/* MAXSS: F3 0F 5F /r, dst in the reg field, src as the r/m operand. */
+void sse_maxss( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_3ub(p, 0xF3, X86_TWOB, 0x5F);
+   emit_modrm(p, dst, src);
+}
+
+/* DIVSS: F3 0F 5E /r, dst in the reg field, src as the r/m operand. */
+void sse_divss( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_3ub(p, 0xF3, X86_TWOB, 0x5E);
+   emit_modrm(p, dst, src);
+}
+
+/* MINPS: 0F 5D /r, dst in the reg field, src as the r/m operand. */
+void sse_minps( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_2ub(p, X86_TWOB, 0x5D);
+   emit_modrm(p, dst, src);
+}
+
+/* SUBPS: 0F 5C /r, dst in the reg field, src as the r/m operand. */
+void sse_subps( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_2ub(p, X86_TWOB, 0x5C);
+   emit_modrm(p, dst, src);
+}
+
+/* MULPS: 0F 59 /r, dst in the reg field, src as the r/m operand. */
+void sse_mulps( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_2ub(p, X86_TWOB, 0x59);
+   emit_modrm(p, dst, src);
+}
+
+/* MULSS: F3 0F 59 /r, dst in the reg field, src as the r/m operand. */
+void sse_mulss( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_3ub(p, 0xF3, X86_TWOB, 0x59);
+   emit_modrm(p, dst, src);
+}
+
+/* ADDPS: 0F 58 /r, dst in the reg field, src as the r/m operand. */
+void sse_addps( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_2ub(p, X86_TWOB, 0x58);
+   emit_modrm(p, dst, src);
+}
+
+/* ADDSS: F3 0F 58 /r, dst in the reg field, src as the r/m operand. */
+void sse_addss( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_3ub(p, 0xF3, X86_TWOB, 0x58);
+   emit_modrm(p, dst, src);
+}
+
+/* ANDNPS: 0F 55 /r, dst in the reg field, src as the r/m operand. */
+void sse_andnps( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_2ub(p, X86_TWOB, 0x55);
+   emit_modrm(p, dst, src);
+}
+
+/* ANDPS: 0F 54 /r, dst in the reg field, src as the r/m operand. */
+void sse_andps( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_2ub(p, X86_TWOB, 0x54);
+   emit_modrm(p, dst, src);
+}
+
+/* RSQRTPS: 0F 52 /r, dst in the reg field, src as the r/m operand. */
+void sse_rsqrtps( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_2ub(p, X86_TWOB, 0x52);
+   emit_modrm(p, dst, src);
+}
+
+/* RSQRTSS: F3 0F 52 /r, dst in the reg field, src as the r/m operand. */
+void sse_rsqrtss( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_3ub(p, 0xF3, X86_TWOB, 0x52);
+   emit_modrm(p, dst, src);
+}
+
+/* MOVHLPS: 0F 12 /r with both operands plain registers. */
+void sse_movhlps( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   assert(dst.mod == mod_REG && src.mod == mod_REG);
+
+   emit_2ub(p, X86_TWOB, 0x12);
+   emit_modrm(p, dst, src);
+}
+
+/* MOVLHPS: 0F 16 /r with both operands plain registers. */
+void sse_movlhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   assert(dst.mod == mod_REG && src.mod == mod_REG);
+
+   emit_2ub(p, X86_TWOB, 0x16);
+   emit_modrm(p, dst, src);
+}
+
+/* ORPS: 0F 56 /r, dst in the reg field, src as the r/m operand. */
+void sse_orps( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_2ub(p, X86_TWOB, 0x56);
+   emit_modrm(p, dst, src);
+}
+
+/* XORPS: 0F 57 /r, dst in the reg field, src as the r/m operand. */
+void sse_xorps( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_2ub(p, X86_TWOB, 0x57);
+   emit_modrm(p, dst, src);
+}
+
+/* CVTPS2PI: 0F 2D /r.  dst must be an MMX register; src is an XMM
+ * register or a memory reference.  Writing an MMX register means the
+ * function needs an emms before it finishes, which is recorded in p.
+ */
+void sse_cvtps2pi( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   assert(dst.file == file_MMX && 
+	  (src.file == file_XMM || src.mod != mod_REG));
+
+   p->need_emms = 1;
+
+   emit_2ub(p, X86_TWOB, 0x2d);
+   emit_modrm(p, dst, src);
+}
+
+
+/* SHUFPS: 0F C6 /r followed by the selector byte 'shuf' (see SHUF()).
+ * Shufps can also be used to implement a reduced swizzle when dest ==
+ * arg0.
+ */
+void sse_shufps( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0,
+                 unsigned char shuf )
+{
+   emit_2ub(p, X86_TWOB, 0xC6);
+   emit_modrm(p, dest, arg0);
+   emit_1ub(p, shuf);
+}
+
+/* CMPPS: 0F C2 /r followed by the comparison predicate byte 'cc'
+ * (one of the enum sse_cc values).
+ */
+void sse_cmpps( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0,
+                unsigned char cc )
+{
+   emit_2ub(p, X86_TWOB, 0xC2);
+   emit_modrm(p, dest, arg0);
+   emit_1ub(p, cc);
+}
+
+/* PMOVMSKB: 66 0F D7 /r, dest in the reg field, src as the r/m operand. */
+void sse_pmovmskb( struct x86_function *p, struct x86_reg dest, struct x86_reg src )
+{
+   emit_3ub(p, 0x66, X86_TWOB, 0xD7);
+   emit_modrm(p, dest, src);
+}
+
+/***********************************************************************
+ * SSE2 instructions
+ */
+
+/**
+ * Perform a reduced swizzle:
+ * PSHUFD, 66 0F 70 /r followed by the selector byte 'shuf' (see SHUF()).
+ */
+void sse2_pshufd( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0,
+                  unsigned char shuf )
+{
+   emit_3ub(p, 0x66, X86_TWOB, 0x70);
+   emit_modrm(p, dest, arg0);
+   emit_1ub(p, shuf);
+}
+
+/* CVTTPS2DQ: F3 0F 5B /r, dst in the reg field, src as the r/m operand. */
+void sse2_cvttps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_3ub(p, 0xF3, X86_TWOB, 0x5B);
+   emit_modrm(p, dst, src);
+}
+
+/* CVTPS2DQ: 66 0F 5B /r, dst in the reg field, src as the r/m operand. */
+void sse2_cvtps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_3ub(p, 0x66, X86_TWOB, 0x5B);
+   emit_modrm(p, dst, src);
+}
+
+/* PACKSSDW (xmm): 66 0F 6B /r, dst in the reg field, src as r/m. */
+void sse2_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_3ub(p, 0x66, X86_TWOB, 0x6B);
+   emit_modrm(p, dst, src);
+}
+
+/* PACKSSWB (xmm): 66 0F 63 /r, dst in the reg field, src as r/m. */
+void sse2_packsswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_3ub(p, 0x66, X86_TWOB, 0x63);
+   emit_modrm(p, dst, src);
+}
+
+/* PACKUSWB (xmm): 66 0F 67 /r, dst in the reg field, src as r/m. */
+void sse2_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_3ub(p, 0x66, X86_TWOB, 0x67);
+   emit_modrm(p, dst, src);
+}
+
+/* RCPPS: 0F 53 /r, dst in the reg field, src as the r/m operand. */
+void sse2_rcpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_2ub(p, X86_TWOB, 0x53);
+   emit_modrm(p, dst, src);
+}
+
+/* RCPSS: F3 0F 53 /r, dst in the reg field, src as the r/m operand. */
+void sse2_rcpss( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_3ub(p, 0xF3, X86_TWOB, 0x53);
+   emit_modrm(p, dst, src);
+}
+
+/* MOVD (xmm): 66 0F 6E when dst is a register, 66 0F 7E when dst is
+ * memory.
+ */
+void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   emit_2ub(p, 0x66, X86_TWOB);
+   emit_op_modrm(p, 0x6e, 0x7e, dst, src);
+}
+
+
+
+
+/***********************************************************************
+ * x87 instructions
+ */
+/* FIST: DB /2, storing st(0) as an integer to dst. */
+void x87_fist( struct x86_function *p, struct x86_reg dst )
+{
+   emit_1ub(p, 0xdb);
+   emit_modrm_noreg(p, 2, dst);
+}
+
+/* FISTP: DB /3, as x87_fist() but also pops the x87 stack. */
+void x87_fistp( struct x86_function *p, struct x86_reg dst )
+{
+   emit_1ub(p, 0xdb);
+   emit_modrm_noreg(p, 3, dst);
+}
+
+/* FILD: emits DF /0.
+ * NOTE(review): DF /0 is the 16-bit integer form of FILD, while
+ * x87_fist()/x87_fistp() here use the DB (32-bit) opcodes - confirm the
+ * intended operand size against the callers.
+ */
+void x87_fild( struct x86_function *p, struct x86_reg arg )
+{
+   emit_1ub(p, 0xdf);
+   emit_modrm_noreg(p, 0, arg);
+}
+
+/* FLDZ (push +0.0): D9 EE. */
+void x87_fldz( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xee);
+}
+
+
+/* FLDCW: D9 /5.  The control word is loaded from memory, so arg must be
+ * a memory reference based on a 32-bit register.
+ */
+void x87_fldcw( struct x86_function *p, struct x86_reg arg )
+{
+   assert(arg.file == file_REG32);
+   assert(arg.mod != mod_REG);
+   emit_1ub(p, 0xd9);
+   emit_modrm_noreg(p, 5, arg);
+}
+
+/* FLD1 (push +1.0): D9 E8. */
+void x87_fld1( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xe8);
+}
+
+/* FLDL2E (push log2(e)): D9 EA. */
+void x87_fldl2e( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xea);
+}
+
+/* FLDLN2 (push ln(2)): D9 ED. */
+void x87_fldln2( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xed);
+}
+
+/* FWAIT: single byte 9B. */
+void x87_fwait( struct x86_function *p )
+{
+   emit_1ub(p, 0x9b);
+}
+
+/* FNCLEX (clear exceptions without a preceding wait): DB E2. */
+void x87_fnclex( struct x86_function *p )
+{
+   emit_2ub(p, 0xdb, 0xe2);
+}
+
+/* FCLEX: an FWAIT followed by FNCLEX. */
+void x87_fclex( struct x86_function *p )
+{
+   x87_fwait(p);
+   x87_fnclex(p);
+}
+
+
+/* Shared encoder for the two-operand x87 arithmetic instructions.
+ *
+ * dst must be an x87 stack register.  Three operand shapes are
+ * supported:
+ *   dst == st(0), arg == st(i):  dst0ub0, dst0ub1 + i
+ *   dst == st(i), arg == st(0):  arg0ub0, arg0ub1 + i
+ *   dst == st(0), arg in memory: 0xd8 with opcode extension
+ *                                argmem_noreg in the modRM reg field
+ * Any other combination asserts.
+ */
+static void x87_arith_op( struct x86_function *p, struct x86_reg dst, struct x86_reg arg,
+			  unsigned char dst0ub0,
+			  unsigned char dst0ub1,
+			  unsigned char arg0ub0,
+			  unsigned char arg0ub1,
+			  unsigned char argmem_noreg)
+{
+   assert(dst.file == file_x87);
+
+   if (arg.file == file_x87) {
+      if (dst.idx == 0)
+	 emit_2ub(p, dst0ub0, dst0ub1+arg.idx);
+      else if (arg.idx == 0)
+	 /* The stack slot encoded here is the destination st(i);
+	  * arg.idx is always 0 in this branch.
+	  */
+	 emit_2ub(p, arg0ub0, arg0ub1+dst.idx);
+      else
+	 assert(0);
+   }
+   else if (dst.idx == 0) {
+      assert(arg.file == file_REG32);
+      emit_1ub(p, 0xd8);
+      emit_modrm_noreg(p, argmem_noreg, arg);
+   }
+   else
+      assert(0);
+}
+
+/* FMUL: D8 C8+i (st(0) *= st(i)), DC C8+i (st(i) *= st(0)), or D8 /1
+ * for a memory operand.  Extension /4 would encode FSUB instead.
+ */
+void x87_fmul( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
+{
+   x87_arith_op(p, dst, arg,
+		0xd8, 0xc8,
+		0xdc, 0xc8,
+		1);
+}
+
+/* FSUB: D8 E0+i with dst st(0), DC E8+i with arg st(0), D8 /4 for a
+ * memory operand.
+ */
+void x87_fsub( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
+{
+   x87_arith_op(p, dst, arg, 
+		0xd8, 0xe0,
+		0xdc, 0xe8,
+		4);
+}
+
+/* FSUBR: D8 E8+i with dst st(0), DC E0+i with arg st(0), D8 /5 for a
+ * memory operand.
+ */
+void x87_fsubr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
+{
+   x87_arith_op(p, dst, arg, 
+		0xd8, 0xe8,
+		0xdc, 0xe0,
+		5);
+}
+
+/* FADD: D8 C0+i with dst st(0), DC C0+i with arg st(0), D8 /0 for a
+ * memory operand.
+ */
+void x87_fadd( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
+{
+   x87_arith_op(p, dst, arg, 
+		0xd8, 0xc0,
+		0xdc, 0xc0,
+		0);
+}
+
+/* FDIV: D8 F0+i with dst st(0), DC F8+i with arg st(0), D8 /6 for a
+ * memory operand.
+ */
+void x87_fdiv( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
+{
+   x87_arith_op(p, dst, arg, 
+		0xd8, 0xf0,
+		0xdc, 0xf8,
+		6);
+}
+
+/* FDIVR: D8 F8+i with dst st(0), DC F0+i with arg st(0), D8 /7 for a
+ * memory operand.
+ */
+void x87_fdivr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
+{
+   x87_arith_op(p, dst, arg, 
+		0xd8, 0xf8,
+		0xdc, 0xf0,
+		7);
+}
+
+/* FMULP st(i), st(0): DE C8+i.  dst must be st(1) or deeper. */
+void x87_fmulp( struct x86_function *p, struct x86_reg dst )
+{
+   assert(dst.file == file_x87);
+   assert(dst.idx >= 1);
+   emit_2ub(p, 0xde, 0xc8+dst.idx);
+}
+
+/* FSUBP st(i), st(0): DE E8+i.  dst must be st(1) or deeper. */
+void x87_fsubp( struct x86_function *p, struct x86_reg dst )
+{
+   assert(dst.file == file_x87);
+   assert(dst.idx >= 1);
+   emit_2ub(p, 0xde, 0xe8+dst.idx);
+}
+
+/* FSUBRP st(i), st(0): DE E0+i.  dst must be st(1) or deeper. */
+void x87_fsubrp( struct x86_function *p, struct x86_reg dst )
+{
+   assert(dst.file == file_x87);
+   assert(dst.idx >= 1);
+   emit_2ub(p, 0xde, 0xe0+dst.idx);
+}
+
+/* FADDP st(i), st(0): DE C0+i.  dst must be st(1) or deeper. */
+void x87_faddp( struct x86_function *p, struct x86_reg dst )
+{
+   assert(dst.file == file_x87);
+   assert(dst.idx >= 1);
+   emit_2ub(p, 0xde, 0xc0+dst.idx);
+}
+
+/* FDIVP st(i), st(0): DE F8+i.  dst must be st(1) or deeper. */
+void x87_fdivp( struct x86_function *p, struct x86_reg dst )
+{
+   assert(dst.file == file_x87);
+   assert(dst.idx >= 1);
+   emit_2ub(p, 0xde, 0xf8+dst.idx);
+}
+
+/* FDIVRP st(i), st(0): DE F0+i.  dst must be st(1) or deeper. */
+void x87_fdivrp( struct x86_function *p, struct x86_reg dst )
+{
+   assert(dst.file == file_x87);
+   assert(dst.idx >= 1);
+   emit_2ub(p, 0xde, 0xf0+dst.idx);
+}
+
+/* FUCOM st(i): DD E0+i (unordered compare of st(0) with st(i)). */
+void x87_fucom( struct x86_function *p, struct x86_reg arg )
+{
+   assert(arg.file == file_x87);
+   emit_2ub(p, 0xdd, 0xe0+arg.idx);
+}
+
+/* FUCOMP st(i): DD E8+i (as FUCOM, then pop). */
+void x87_fucomp( struct x86_function *p, struct x86_reg arg )
+{
+   assert(arg.file == file_x87);
+   emit_2ub(p, 0xdd, 0xe8+arg.idx);
+}
+
+/* FUCOMPP: DA E9 (compare st(0) with st(1), pop both). */
+void x87_fucompp( struct x86_function *p )
+{
+   emit_2ub(p, 0xda, 0xe9);
+}
+
+/* FXCH st(i): D9 C8+i (exchange st(0) with st(i)). */
+void x87_fxch( struct x86_function *p, struct x86_reg arg )
+{
+   assert(arg.file == file_x87);
+   emit_2ub(p, 0xd9, 0xc8+arg.idx);
+}
+
+/* FABS: D9 E1. */
+void x87_fabs( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xe1);
+}
+
+/* FCHS (negate st(0)): D9 E0. */
+void x87_fchs( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xe0);
+}
+
+/* FCOS: D9 FF. */
+void x87_fcos( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xff);
+}
+
+
+/* FRNDINT (round st(0) to an integer): D9 FC. */
+void x87_fprndint( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xfc);
+}
+
+/* FSCALE: D9 FD. */
+void x87_fscale( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xfd);
+}
+
+/* FSIN: D9 FE. */
+void x87_fsin( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xfe);
+}
+
+/* FSINCOS: D9 FB. */
+void x87_fsincos( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xfb);
+}
+
+/* FSQRT: D9 FA. */
+void x87_fsqrt( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xfa);
+}
+
+/* FXTRACT (split st(0) into exponent and significand): D9 F4. */
+void x87_fxtract( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xf4);
+}
+
+/* F2XM1 (D9 F0):
+ * st0 = (2^st0)-1
+ *
+ * Restrictions: -1.0 <= st0 <= 1.0
+ */
+void x87_f2xm1( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xf0);
+}
+
+/* FYL2X (D9 F1):
+ * st1 = st1 * log2(st0);
+ * pop_stack;
+ */
+void x87_fyl2x( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xf1);
+}
+
+/* FYL2XP1 (D9 F9):
+ * st1 = st1 * log2(st0 + 1.0);
+ * pop_stack;
+ *
+ * A fast operation, with restrictions: -.29 < st0 < .29 
+ */
+void x87_fyl2xp1( struct x86_function *p )
+{
+   emit_2ub(p, 0xd9, 0xf9);
+}
+
+
+/* FLD: D9 C0+i for an x87 stack register, D9 /0 for anything else
+ * (a memory operand).
+ */
+void x87_fld( struct x86_function *p, struct x86_reg arg )
+{
+   if (arg.file != file_x87) {
+      emit_1ub(p, 0xd9);
+      emit_modrm_noreg(p, 0, arg);
+      return;
+   }
+
+   emit_2ub(p, 0xd9, 0xc0 + arg.idx);
+}
+
+/* FST: DD D0+i for an x87 stack register, D9 /2 for anything else
+ * (a memory operand).
+ */
+void x87_fst( struct x86_function *p, struct x86_reg dst )
+{
+   if (dst.file != file_x87) {
+      emit_1ub(p, 0xd9);
+      emit_modrm_noreg(p, 2, dst);
+      return;
+   }
+
+   emit_2ub(p, 0xdd, 0xd0 + dst.idx);
+}
+
+/* FSTP: DD D8+i for an x87 stack register, D9 /3 for anything else
+ * (a memory operand).
+ */
+void x87_fstp( struct x86_function *p, struct x86_reg dst )
+{
+   if (dst.file != file_x87) {
+      emit_1ub(p, 0xd9);
+      emit_modrm_noreg(p, 3, dst);
+      return;
+   }
+
+   emit_2ub(p, 0xdd, 0xd8 + dst.idx);
+}
+
+/* FCOM: D8 D0+i for an x87 stack register, D8 /2 for anything else
+ * (a memory operand).
+ */
+void x87_fcom( struct x86_function *p, struct x86_reg dst )
+{
+   if (dst.file != file_x87) {
+      emit_1ub(p, 0xd8);
+      emit_modrm_noreg(p, 2, dst);
+      return;
+   }
+
+   emit_2ub(p, 0xd8, 0xd0 + dst.idx);
+}
+
+/* FCOMP: D8 D8+i for an x87 stack register, D8 /3 for anything else
+ * (a memory operand).
+ */
+void x87_fcomp( struct x86_function *p, struct x86_reg dst )
+{
+   if (dst.file != file_x87) {
+      emit_1ub(p, 0xd8);
+      emit_modrm_noreg(p, 3, dst);
+      return;
+   }
+
+   emit_2ub(p, 0xd8, 0xd8 + dst.idx);
+}
+
+
+/* FNSTSW: DF E0 when the destination is the AX register itself,
+ * DD /7 for every other destination.
+ */
+void x87_fnstsw( struct x86_function *p, struct x86_reg dst )
+{
+   assert(dst.file == file_REG32);
+
+   if (dst.idx == reg_AX && dst.mod == mod_REG) {
+      emit_2ub(p, 0xdf, 0xe0);
+      return;
+   }
+
+   emit_1ub(p, 0xdd);
+   emit_modrm_noreg(p, 7, dst);
+}
+
+
+
+
+/***********************************************************************
+ * MMX instructions
+ */
+
+/* EMMS: 0F 77.  May only be emitted after an instruction that set
+ * need_emms; the flag is cleared again afterwards.
+ */
+void mmx_emms( struct x86_function *p )
+{
+   assert(p->need_emms);
+   emit_2ub(p, 0x0f, 0x77);
+   p->need_emms = 0;
+}
+
+/* PACKSSDW (mmx): 0F 6B /r.  dst must be an MMX register; src is an MMX
+ * register or a memory reference.  Flags the function as needing emms.
+ */
+void mmx_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   assert(dst.file == file_MMX && 
+	  (src.file == file_MMX || src.mod != mod_REG));
+
+   p->need_emms = 1;
+
+   emit_2ub(p, X86_TWOB, 0x6b);
+   emit_modrm(p, dst, src);
+}
+
+/* PACKUSWB (mmx): 0F 67 /r.  dst must be an MMX register; src is an MMX
+ * register or a memory reference.  Flags the function as needing emms.
+ */
+void mmx_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   assert(dst.file == file_MMX && 
+	  (src.file == file_MMX || src.mod != mod_REG));
+
+   p->need_emms = 1;
+
+   emit_2ub(p, X86_TWOB, 0x67);
+   emit_modrm(p, dst, src);
+}
+
+/* MOVD (mmx): 0F 6E when dst is a register, 0F 7E when dst is memory.
+ * Flags the function as needing emms.
+ */
+void mmx_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   p->need_emms = 1;
+
+   emit_1ub(p, X86_TWOB);
+   emit_op_modrm(p, 0x6e, 0x7e, dst, src);
+}
+
+/* MOVQ (mmx): 0F 6F when dst is a register, 0F 7F when dst is memory.
+ * Flags the function as needing emms.
+ */
+void mmx_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   p->need_emms = 1;
+
+   emit_1ub(p, X86_TWOB);
+   emit_op_modrm(p, 0x6f, 0x7f, dst, src);
+}
+
+
+/***********************************************************************
+ * Helper functions
+ */
+
+
+/* Retrieve a reference to one of the function arguments, taking into
+ * account any push/pop activity:
+ *
+ * The result is a memory reference off the stack pointer at
+ * stack_offset + arg * 4, where stack_offset is maintained by
+ * x86_push()/x86_pop().
+ */
+struct x86_reg x86_fn_arg( struct x86_function *p,
+			   unsigned arg )
+{
+   return x86_make_disp(x86_make_reg(file_REG32, reg_SP), 
+			p->stack_offset + arg * 4);	/* ??? */
+}
+
+
+/* Reset p to an empty function without a code buffer; the buffer is
+ * allocated when the first byte is emitted.
+ */
+void x86_init_func( struct x86_function *p )
+{
+   p->size = 0;
+   p->store = NULL;
+   p->csr = p->store;
+   /* x86_fn_arg() reads stack_offset and mmx_emms() asserts on
+    * need_emms, so neither may start out with whatever happened to be
+    * in the caller's storage.
+    */
+   p->stack_offset = 0;
+   p->need_emms = 0;
+}
+
+/* Like x86_init_func(), but allocate a code buffer of code_size bytes
+ * up front.
+ */
+void x86_init_func_size( struct x86_function *p, unsigned code_size )
+{
+   p->size = code_size;
+   p->store = rtasm_exec_malloc(code_size);
+   /* If the allocation failed, report an empty buffer so that the first
+    * emit goes through the initial-allocation path instead of writing
+    * through a NULL store that claims to have room.
+    */
+   if (!p->store)
+      p->size = 0;
+   p->csr = p->store;
+   /* See x86_fn_arg() and mmx_emms(): both read these fields. */
+   p->stack_offset = 0;
+   p->need_emms = 0;
+}
+
+/* Give the code buffer back to the executable heap and leave p empty. */
+void x86_release_func( struct x86_function *p )
+{
+   rtasm_exec_free(p->store);
+   p->store = NULL;
+   p->csr = NULL;
+   p->size = 0;
+}
+
+
+/* Return the start of the generated code as a callable function
+ * pointer.  With DISASSEM enabled the buffer bounds are printed first.
+ */
+void (*x86_get_func( struct x86_function *p ))(void)
+{
+   if (DISASSEM && p->store)
+      debug_printf("disassemble %p %p\n", p->store, p->csr);
+   return (void (*)(void)) (unsigned long) p->store;
+}
+
+#else
+
+/* The only definition in this file when not building for i386. */
+void x86sse_dummy( void )
+{
+}
+
+#endif
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
new file mode 100644
index 0000000000..c2aa416492
--- /dev/null
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -0,0 +1,256 @@
+
+#ifndef _X86SSE_H_
+#define _X86SSE_H_
+
+#if defined(__i386__) || defined(__386__)
+
+/* It is up to the caller to ensure that instructions issued are
+ * suitable for the host cpu.  There are no checks made in this module
+ * for mmx/sse/sse2 support on the cpu.
+ */
+
+/* An instruction operand: either a plain register or a memory reference
+ * formed from a base register plus displacement.  Packed into 32 bits.
+ */
+struct x86_reg {
+   unsigned file:3;		/* enum x86_reg_file */
+   unsigned idx:3;		/* register number within the file */
+   unsigned mod:2;		/* mod_REG if this is just a register */
+   int      disp:24;		/* only +/- 23bits of offset - should be enough... */
+};
+
+/* State of one function being generated. */
+struct x86_function {
+   unsigned size;		/* capacity of store in bytes, 0 if none */
+   unsigned char *store;	/* start of the code buffer */
+   unsigned char *csr;		/* write cursor: where the next byte goes */
+   unsigned stack_offset;	/* bytes pushed so far, see x86_fn_arg() */
+   int need_emms;		/* set by MMX-register writers, cleared by mmx_emms() */
+   const char *fn;
+};
+
+/* Register files an x86_reg can belong to. */
+enum x86_reg_file {
+   file_REG32,
+   file_MMX,
+   file_XMM,
+   file_x87
+};
+
+/* Values for mod field of modr/m byte
+ *
+ * The enumerator values are written into the byte unchanged, so their
+ * order must not be altered.
+ */
+enum x86_reg_mod {
+   mod_INDIRECT,
+   mod_DISP8,
+   mod_DISP32,
+   mod_REG
+};
+
+/* Register numbers as they are encoded in instructions; the order must
+ * not be altered.
+ */
+enum x86_reg_name {
+   reg_AX,
+   reg_CX,
+   reg_DX,
+   reg_BX,
+   reg_SP,
+   reg_BP,
+   reg_SI,
+   reg_DI
+};
+
+
+/* Condition codes for x86_jcc()/x86_jcc_forward().  The value is added
+ * to the base jump opcode, so the order must not be altered.
+ */
+enum x86_cc {
+   cc_O,			/* overflow */
+   cc_NO,			/* not overflow */
+   cc_NAE,			/* not above or equal / carry */
+   cc_AE,			/* above or equal / not carry */
+   cc_E,			/* equal / zero */
+   cc_NE			/* not equal / not zero */
+};
+
+/* Comparison predicates passed as the 'cc' byte of sse_cmpps(). */
+enum sse_cc {
+   cc_Equal,
+   cc_LessThan,
+   cc_LessThanEqual,
+   cc_Unordered,
+   cc_NotEqual,
+   cc_NotLessThan,
+   cc_NotLessThanEqual,
+   cc_Ordered
+};
+
+/* Zero / not-zero spellings of the equal / not-equal conditions. */
+#define cc_Z  cc_E
+#define cc_NZ cc_NE
+
+/* Begin/end/retrieve function creation:
+ */
+
+
+/* x86_init_func() starts with no buffer, x86_init_func_size() with one
+ * of code_size bytes; x86_release_func() frees the buffer and
+ * x86_get_func() returns the generated code as a function pointer.
+ */
+void x86_init_func( struct x86_function *p );
+void x86_init_func_size( struct x86_function *p, unsigned code_size );
+void x86_release_func( struct x86_function *p );
+void (*x86_get_func( struct x86_function *p ))( void );
+
+
+
+/* Create and manipulate registers and regmem values:
+ *
+ * x86_make_reg()     - plain register operand
+ * x86_make_disp()    - memory reference at reg + disp
+ * x86_deref()        - memory reference at reg + 0
+ * x86_get_base_reg() - plain register underlying an operand
+ */
+struct x86_reg x86_make_reg( enum x86_reg_file file,
+			     enum x86_reg_name idx );
+
+struct x86_reg x86_make_disp( struct x86_reg reg,
+			      int disp );
+
+struct x86_reg x86_deref( struct x86_reg reg );
+
+struct x86_reg x86_get_base_reg( struct x86_reg reg );
+
+
+/* Labels, jumps and fixup:
+ *
+ * A label is an address inside the code buffer.  The *_forward()
+ * emitters return a label that is later handed to x86_fixup_fwd_jump()
+ * once the target position has been reached.
+ */
+unsigned char *x86_get_label( struct x86_function *p );
+
+void x86_jcc( struct x86_function *p,
+	      enum x86_cc cc,
+	      unsigned char *label );
+
+unsigned char *x86_jcc_forward( struct x86_function *p,
+			  enum x86_cc cc );
+
+unsigned char *x86_jmp_forward( struct x86_function *p);
+
+unsigned char *x86_call_forward( struct x86_function *p);
+
+void x86_fixup_fwd_jump( struct x86_function *p,
+			 unsigned char *fixup );
+
+void x86_jmp( struct x86_function *p, unsigned char *label );
+
+/* Indirect call through the given operand. */
+/* void x86_call( struct x86_function *p, void (*label)() ); */
+void x86_call( struct x86_function *p, struct x86_reg reg);
+
+/* michal:
+ * Temporary. As I need immediate operands, and dont want to mess with the codegen,
+ * I load the immediate into general purpose register and use it.
+ *
+ * Loads the 32-bit immediate imm into the plain register dst.
+ */
+void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm );
+
+
+/* Macro for sse_shufps() and sse2_pshufd():
+ *
+ * SHUF() packs four 2-bit selectors into the immediate byte, _x in the
+ * lowest pair.  SHUF_NOOP is the identity selection and GET_SHUF()
+ * extracts selector number idx again.
+ */
+#define SHUF(_x,_y,_z,_w)       (((_x)<<0) | ((_y)<<2) | ((_z)<<4) | ((_w)<<6))
+#define SHUF_NOOP               SHUF(0,1,2,3)
+#define GET_SHUF(swz, idx)      (((swz) >> ((idx)*2)) & 0x3)
+
+/* MMX instructions.  All except mmx_emms() mark the function as needing
+ * an emms; mmx_emms() emits it.
+ */
+void mmx_emms( struct x86_function *p );
+void mmx_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void mmx_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void mmx_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void mmx_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+
+/* Emitters carrying the sse2_ prefix; dst/dest is the destination
+ * operand, src/arg0 the source.
+ */
+void sse2_cvtps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_cvttps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_packsswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_pshufd( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0,
+                  unsigned char shuf );
+void sse2_rcpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_rcpss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+
+/* Emitters carrying the sse_ prefix; dst/dest is the destination
+ * operand, src/arg0 the source.  sse_cmpps() takes an enum sse_cc
+ * value as cc, sse_shufps() a SHUF() selector as shuf.
+ */
+void sse_addps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_addss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_cvtps2pi( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_divss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_andnps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_andps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_cmpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src,
+                unsigned char cc );
+void sse_maxps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_maxss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_minps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_movaps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_movhlps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_movhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_movlhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_movlps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_movss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_movups( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_mulps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_mulss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_orps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_xorps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_subps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_rsqrtps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_rsqrtss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_shufps( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0,
+                 unsigned char shuf );
+void sse_pmovmskb( struct x86_function *p, struct x86_reg dest, struct x86_reg src );
+
+void x86_add( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_and( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_cmp( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_dec( struct x86_function *p, struct x86_reg reg );
+void x86_inc( struct x86_function *p, struct x86_reg reg );
+void x86_lea( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_mov( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_mul( struct x86_function *p, struct x86_reg src );
+void x86_or( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_pop( struct x86_function *p, struct x86_reg reg );
+void x86_push( struct x86_function *p, struct x86_reg reg );
+void x86_ret( struct x86_function *p );
+void x86_sub( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_test( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_xor( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_sahf( struct x86_function *p );
+
+void x87_f2xm1( struct x86_function *p );
+void x87_fabs( struct x86_function *p );
+void x87_fadd( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
+void x87_faddp( struct x86_function *p, struct x86_reg dst );
+void x87_fchs( struct x86_function *p );
+void x87_fclex( struct x86_function *p );
+void x87_fcom( struct x86_function *p, struct x86_reg dst );
+void x87_fcomp( struct x86_function *p, struct x86_reg dst );
+void x87_fcos( struct x86_function *p );
+void x87_fdiv( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
+void x87_fdivp( struct x86_function *p, struct x86_reg dst );
+void x87_fdivr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
+void x87_fdivrp( struct x86_function *p, struct x86_reg dst );
+void x87_fild( struct x86_function *p, struct x86_reg arg );
+void x87_fist( struct x86_function *p, struct x86_reg dst );
+void x87_fistp( struct x86_function *p, struct x86_reg dst );
+void x87_fld( struct x86_function *p, struct x86_reg arg );
+void x87_fld1( struct x86_function *p );
+void x87_fldcw( struct x86_function *p, struct x86_reg arg );
+void x87_fldl2e( struct x86_function *p );
+void x87_fldln2( struct x86_function *p );
+void x87_fldz( struct x86_function *p );
+void x87_fmul( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
+void x87_fmulp( struct x86_function *p, struct x86_reg dst );
+void x87_fnclex( struct x86_function *p );
+void x87_fprndint( struct x86_function *p );
+void x87_fscale( struct x86_function *p );
+void x87_fsin( struct x86_function *p );
+void x87_fsincos( struct x86_function *p );
+void x87_fsqrt( struct x86_function *p );
+void x87_fst( struct x86_function *p, struct x86_reg dst );
+void x87_fstp( struct x86_function *p, struct x86_reg dst );
+void x87_fsub( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
+void x87_fsubp( struct x86_function *p, struct x86_reg dst );
+void x87_fsubr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
+void x87_fsubrp( struct x86_function *p, struct x86_reg dst );
+void x87_fxch( struct x86_function *p, struct x86_reg dst );
+void x87_fxtract( struct x86_function *p );
+void x87_fyl2x( struct x86_function *p );
+void x87_fyl2xp1( struct x86_function *p );
+void x87_fwait( struct x86_function *p );
+void x87_fnstsw( struct x86_function *p, struct x86_reg dst );
+void x87_fucompp( struct x86_function *p );
+void x87_fucomp( struct x86_function *p, struct x86_reg arg );
+void x87_fucom( struct x86_function *p, struct x86_reg arg );
+
+
+
+/* Retrieve a reference to one of the function arguments, taking into
+ * account any push/pop activity.  Note - doesn't track explicit
+ * manipulation of ESP by other instructions.
+ */
+struct x86_reg x86_fn_arg( struct x86_function *p, unsigned arg );
+
+#endif
+#endif
diff --git a/src/gallium/auxiliary/rtasm/x86sse.c b/src/gallium/auxiliary/rtasm/x86sse.c
deleted file mode 100644
index fff6f77a6b..0000000000
--- a/src/gallium/auxiliary/rtasm/x86sse.c
+++ /dev/null
@@ -1,1195 +0,0 @@
-#if defined(__i386__) || defined(__386__)
-
-#include "pipe/p_compiler.h"
-#include "pipe/p_debug.h"
-
-#include "x86sse.h"
-
-#define DISASSEM 0
-#define X86_TWOB 0x0f
-
-static unsigned char *cptr( void (*label)() )
-{
-   return (unsigned char *)(unsigned long)label;
-}
-
-
-static void do_realloc( struct x86_function *p )
-{
-   if (p->size == 0) {
-      p->size = 1024;
-      p->store = _mesa_exec_malloc(p->size);
-      p->csr = p->store;
-   }
-   else {
-      unsigned used = p->csr - p->store;
-      unsigned char *tmp = p->store;
-      p->size *= 2;
-      p->store = _mesa_exec_malloc(p->size);
-      memcpy(p->store, tmp, used);
-      p->csr = p->store + used;
-      _mesa_exec_free(tmp);
-   }
-}
-
-/* Emit bytes to the instruction stream:
- */
-static unsigned char *reserve( struct x86_function *p, int bytes )
-{
-   if (p->csr + bytes - p->store > p->size)
-      do_realloc(p);
-
-   {
-      unsigned char *csr = p->csr;
-      p->csr += bytes;
-      return csr;
-   }
-}
-
-
-
-static void emit_1b( struct x86_function *p, char b0 )
-{
-   char *csr = (char *)reserve(p, 1);
-   *csr = b0;
-}
-
-static void emit_1i( struct x86_function *p, int i0 )
-{
-   int *icsr = (int *)reserve(p, sizeof(i0));
-   *icsr = i0;
-}
-
-static void emit_1ub( struct x86_function *p, unsigned char b0 )
-{
-   unsigned char *csr = reserve(p, 1);
-   *csr++ = b0;
-}
-
-static void emit_2ub( struct x86_function *p, unsigned char b0, unsigned char b1 )
-{
-   unsigned char *csr = reserve(p, 2);
-   *csr++ = b0;
-   *csr++ = b1;
-}
-
-static void emit_3ub( struct x86_function *p, unsigned char b0, unsigned char b1, unsigned char b2 )
-{
-   unsigned char *csr = reserve(p, 3);
-   *csr++ = b0;
-   *csr++ = b1;
-   *csr++ = b2;
-}
-
-
-/* Build a modRM byte + possible displacement.  No treatment of SIB
- * indexing.  BZZT - no way to encode an absolute address.
- */
-static void emit_modrm( struct x86_function *p, 
-			struct x86_reg reg, 
-			struct x86_reg regmem )
-{
-   unsigned char val = 0;
-   
-   assert(reg.mod == mod_REG);
-   
-   val |= regmem.mod << 6;     	/* mod field */
-   val |= reg.idx << 3;		/* reg field */
-   val |= regmem.idx;		/* r/m field */
-   
-   emit_1ub(p, val);
-
-   /* Oh-oh we've stumbled into the SIB thing.
-    */
-   if (regmem.file == file_REG32 &&
-       regmem.idx == reg_SP) {
-      emit_1ub(p, 0x24);		/* simplistic! */
-   }
-
-   switch (regmem.mod) {
-   case mod_REG:
-   case mod_INDIRECT:
-      break;
-   case mod_DISP8:
-      emit_1b(p, regmem.disp);
-      break;
-   case mod_DISP32:
-      emit_1i(p, regmem.disp);
-      break;
-   default:
-      assert(0);
-      break;
-   }
-}
-
-
-static void emit_modrm_noreg( struct x86_function *p,
-			      unsigned op,
-			      struct x86_reg regmem )
-{
-   struct x86_reg dummy = x86_make_reg(file_REG32, op);
-   emit_modrm(p, dummy, regmem);
-}
-
-/* Many x86 instructions have two opcodes to cope with the situations
- * where the destination is a register or memory reference
- * respectively.  This function selects the correct opcode based on
- * the arguments presented.
- */
-static void emit_op_modrm( struct x86_function *p,
-			   unsigned char op_dst_is_reg, 
-			   unsigned char op_dst_is_mem,
-			   struct x86_reg dst,
-			   struct x86_reg src )
-{  
-   switch (dst.mod) {
-   case mod_REG:
-      emit_1ub(p, op_dst_is_reg);
-      emit_modrm(p, dst, src);
-      break;
-   case mod_INDIRECT:
-   case mod_DISP32:
-   case mod_DISP8:
-      assert(src.mod == mod_REG);
-      emit_1ub(p, op_dst_is_mem);
-      emit_modrm(p, src, dst);
-      break;
-   default:
-      assert(0);
-      break;
-   }
-}
-
-
-
-
-
-
-
-/* Create and manipulate registers and regmem values:
- */
-struct x86_reg x86_make_reg( enum x86_reg_file file,
-			     enum x86_reg_name idx )
-{
-   struct x86_reg reg;
-
-   reg.file = file;
-   reg.idx = idx;
-   reg.mod = mod_REG;
-   reg.disp = 0;
-
-   return reg;
-}
-
-struct x86_reg x86_make_disp( struct x86_reg reg,
-			      int disp )
-{
-   assert(reg.file == file_REG32);
-
-   if (reg.mod == mod_REG)
-      reg.disp = disp;
-   else
-      reg.disp += disp;
-
-   if (reg.disp == 0)
-      reg.mod = mod_INDIRECT;
-   else if (reg.disp <= 127 && reg.disp >= -128)
-      reg.mod = mod_DISP8;
-   else
-      reg.mod = mod_DISP32;
-
-   return reg;
-}
-
-struct x86_reg x86_deref( struct x86_reg reg )
-{
-   return x86_make_disp(reg, 0);
-}
-
-struct x86_reg x86_get_base_reg( struct x86_reg reg )
-{
-   return x86_make_reg( reg.file, reg.idx );
-}
-
-unsigned char *x86_get_label( struct x86_function *p )
-{
-   return p->csr;
-}
-
-
-
-/***********************************************************************
- * x86 instructions
- */
-
-
-void x86_jcc( struct x86_function *p,
-	      enum x86_cc cc,
-	      unsigned char *label )
-{
-   int offset = label - (x86_get_label(p) + 2);
-   
-   if (offset <= 127 && offset >= -128) {
-      emit_1ub(p, 0x70 + cc);
-      emit_1b(p, (char) offset);
-   }
-   else {
-      offset = label - (x86_get_label(p) + 6);
-      emit_2ub(p, 0x0f, 0x80 + cc);
-      emit_1i(p, offset);
-   }
-}
-
-/* Always use a 32bit offset for forward jumps:
- */
-unsigned char *x86_jcc_forward( struct x86_function *p,
-			  enum x86_cc cc )
-{
-   emit_2ub(p, 0x0f, 0x80 + cc);
-   emit_1i(p, 0);
-   return x86_get_label(p);
-}
-
-unsigned char *x86_jmp_forward( struct x86_function *p)
-{
-   emit_1ub(p, 0xe9);
-   emit_1i(p, 0);
-   return x86_get_label(p);
-}
-
-unsigned char *x86_call_forward( struct x86_function *p)
-{
-   emit_1ub(p, 0xe8);
-   emit_1i(p, 0);
-   return x86_get_label(p);
-}
-
-/* Fixup offset from forward jump:
- */
-void x86_fixup_fwd_jump( struct x86_function *p,
-			 unsigned char *fixup )
-{
-   *(int *)(fixup - 4) = x86_get_label(p) - fixup;
-}
-
-void x86_jmp( struct x86_function *p, unsigned char *label)
-{
-   emit_1ub(p, 0xe9);
-   emit_1i(p, label - x86_get_label(p) - 4);
-}
-
-#if 0
-/* This doesn't work once we start reallocating & copying the
- * generated code on buffer fills, because the call is relative to the
- * current pc.
- */
-void x86_call( struct x86_function *p, void (*label)())
-{
-   emit_1ub(p, 0xe8);
-   emit_1i(p, cptr(label) - x86_get_label(p) - 4);
-}
-#else
-void x86_call( struct x86_function *p, struct x86_reg reg)
-{
-   emit_1ub(p, 0xff);
-   emit_modrm(p, reg, reg);
-}
-#endif
-
-
-/* michal:
- * Temporary. As I need immediate operands, and dont want to mess with the codegen,
- * I load the immediate into general purpose register and use it.
- */
-void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm )
-{
-   assert(dst.mod == mod_REG);
-   emit_1ub(p, 0xb8 + dst.idx);
-   emit_1i(p, imm);
-}
-
-void x86_push( struct x86_function *p,
-	       struct x86_reg reg )
-{
-   assert(reg.mod == mod_REG);
-   emit_1ub(p, 0x50 + reg.idx);
-   p->stack_offset += 4;
-}
-
-void x86_pop( struct x86_function *p,
-	      struct x86_reg reg )
-{
-   assert(reg.mod == mod_REG);
-   emit_1ub(p, 0x58 + reg.idx);
-   p->stack_offset -= 4;
-}
-
-void x86_inc( struct x86_function *p,
-	      struct x86_reg reg )
-{
-   assert(reg.mod == mod_REG);
-   emit_1ub(p, 0x40 + reg.idx);
-}
-
-void x86_dec( struct x86_function *p,
-	      struct x86_reg reg )
-{
-   assert(reg.mod == mod_REG);
-   emit_1ub(p, 0x48 + reg.idx);
-}
-
-void x86_ret( struct x86_function *p )
-{
-   emit_1ub(p, 0xc3);
-}
-
-void x86_sahf( struct x86_function *p )
-{
-   emit_1ub(p, 0x9e);
-}
-
-void x86_mov( struct x86_function *p,
-	      struct x86_reg dst,
-	      struct x86_reg src )
-{
-   emit_op_modrm( p, 0x8b, 0x89, dst, src );
-}
-
-void x86_xor( struct x86_function *p,
-	      struct x86_reg dst,
-	      struct x86_reg src )
-{
-   emit_op_modrm( p, 0x33, 0x31, dst, src );
-}
-
-void x86_cmp( struct x86_function *p,
-	      struct x86_reg dst,
-	      struct x86_reg src )
-{
-   emit_op_modrm( p, 0x3b, 0x39, dst, src );
-}
-
-void x86_lea( struct x86_function *p,
-	      struct x86_reg dst,
-	      struct x86_reg src )
-{
-   emit_1ub(p, 0x8d);
-   emit_modrm( p, dst, src );
-}
-
-void x86_test( struct x86_function *p,
-	       struct x86_reg dst,
-	       struct x86_reg src )
-{
-   emit_1ub(p, 0x85);
-   emit_modrm( p, dst, src );
-}
-
-void x86_add( struct x86_function *p,
-	       struct x86_reg dst,
-	       struct x86_reg src )
-{
-   emit_op_modrm(p, 0x03, 0x01, dst, src );
-}
-
-void x86_mul( struct x86_function *p,
-	       struct x86_reg src )
-{
-   assert (src.file == file_REG32 && src.mod == mod_REG);
-   emit_op_modrm(p, 0xf7, 0, x86_make_reg (file_REG32, reg_SP), src );
-}
-
-void x86_sub( struct x86_function *p,
-	       struct x86_reg dst,
-	       struct x86_reg src )
-{
-   emit_op_modrm(p, 0x2b, 0x29, dst, src );
-}
-
-void x86_or( struct x86_function *p,
-             struct x86_reg dst,
-             struct x86_reg src )
-{
-   emit_op_modrm( p, 0x0b, 0x09, dst, src );
-}
-
-void x86_and( struct x86_function *p,
-              struct x86_reg dst,
-              struct x86_reg src )
-{
-   emit_op_modrm( p, 0x23, 0x21, dst, src );
-}
-
-
-
-/***********************************************************************
- * SSE instructions
- */
-
-
-void sse_movss( struct x86_function *p,
-		struct x86_reg dst,
-		struct x86_reg src )
-{
-   emit_2ub(p, 0xF3, X86_TWOB);
-   emit_op_modrm( p, 0x10, 0x11, dst, src );
-}
-
-void sse_movaps( struct x86_function *p,
-		 struct x86_reg dst,
-		 struct x86_reg src )
-{
-   emit_1ub(p, X86_TWOB);
-   emit_op_modrm( p, 0x28, 0x29, dst, src );
-}
-
-void sse_movups( struct x86_function *p,
-		 struct x86_reg dst,
-		 struct x86_reg src )
-{
-   emit_1ub(p, X86_TWOB);
-   emit_op_modrm( p, 0x10, 0x11, dst, src );
-}
-
-void sse_movhps( struct x86_function *p,
-		 struct x86_reg dst,
-		 struct x86_reg src )
-{
-   assert(dst.mod != mod_REG || src.mod != mod_REG);
-   emit_1ub(p, X86_TWOB);
-   emit_op_modrm( p, 0x16, 0x17, dst, src ); /* cf movlhps */
-}
-
-void sse_movlps( struct x86_function *p,
-		 struct x86_reg dst,
-		 struct x86_reg src )
-{
-   assert(dst.mod != mod_REG || src.mod != mod_REG);
-   emit_1ub(p, X86_TWOB);
-   emit_op_modrm( p, 0x12, 0x13, dst, src ); /* cf movhlps */
-}
-
-void sse_maxps( struct x86_function *p,
-		struct x86_reg dst,
-		struct x86_reg src )
-{
-   emit_2ub(p, X86_TWOB, 0x5F);
-   emit_modrm( p, dst, src );
-}
-
-void sse_maxss( struct x86_function *p,
-		struct x86_reg dst,
-		struct x86_reg src )
-{
-   emit_3ub(p, 0xF3, X86_TWOB, 0x5F);
-   emit_modrm( p, dst, src );
-}
-
-void sse_divss( struct x86_function *p,
-		struct x86_reg dst,
-		struct x86_reg src )
-{
-   emit_3ub(p, 0xF3, X86_TWOB, 0x5E);
-   emit_modrm( p, dst, src );
-}
-
-void sse_minps( struct x86_function *p,
-		struct x86_reg dst,
-		struct x86_reg src )
-{
-   emit_2ub(p, X86_TWOB, 0x5D);
-   emit_modrm( p, dst, src );
-}
-
-void sse_subps( struct x86_function *p,
-		struct x86_reg dst,
-		struct x86_reg src )
-{
-   emit_2ub(p, X86_TWOB, 0x5C);
-   emit_modrm( p, dst, src );
-}
-
-void sse_mulps( struct x86_function *p,
-		struct x86_reg dst,
-		struct x86_reg src )
-{
-   emit_2ub(p, X86_TWOB, 0x59);
-   emit_modrm( p, dst, src );
-}
-
-void sse_mulss( struct x86_function *p,
-		struct x86_reg dst,
-		struct x86_reg src )
-{
-   emit_3ub(p, 0xF3, X86_TWOB, 0x59);
-   emit_modrm( p, dst, src );
-}
-
-void sse_addps( struct x86_function *p,
-		struct x86_reg dst,
-		struct x86_reg src )
-{
-   emit_2ub(p, X86_TWOB, 0x58);
-   emit_modrm( p, dst, src );
-}
-
-void sse_addss( struct x86_function *p,
-		struct x86_reg dst,
-		struct x86_reg src )
-{
-   emit_3ub(p, 0xF3, X86_TWOB, 0x58);
-   emit_modrm( p, dst, src );
-}
-
-void sse_andnps( struct x86_function *p,
-                 struct x86_reg dst,
-                 struct x86_reg src )
-{
-   emit_2ub(p, X86_TWOB, 0x55);
-   emit_modrm( p, dst, src );
-}
-
-void sse_andps( struct x86_function *p,
-		struct x86_reg dst,
-		struct x86_reg src )
-{
-   emit_2ub(p, X86_TWOB, 0x54);
-   emit_modrm( p, dst, src );
-}
-
-void sse_rsqrtps( struct x86_function *p,
-                  struct x86_reg dst,
-                  struct x86_reg src )
-{
-   emit_2ub(p, X86_TWOB, 0x52);
-   emit_modrm( p, dst, src );
-}
-
-void sse_rsqrtss( struct x86_function *p,
-		  struct x86_reg dst,
-		  struct x86_reg src )
-{
-   emit_3ub(p, 0xF3, X86_TWOB, 0x52);
-   emit_modrm( p, dst, src );
-
-}
-
-void sse_movhlps( struct x86_function *p,
-		  struct x86_reg dst,
-		  struct x86_reg src )
-{
-   assert(dst.mod == mod_REG && src.mod == mod_REG);
-   emit_2ub(p, X86_TWOB, 0x12);
-   emit_modrm( p, dst, src );
-}
-
-void sse_movlhps( struct x86_function *p,
-		  struct x86_reg dst,
-		  struct x86_reg src )
-{
-   assert(dst.mod == mod_REG && src.mod == mod_REG);
-   emit_2ub(p, X86_TWOB, 0x16);
-   emit_modrm( p, dst, src );
-}
-
-void sse_orps( struct x86_function *p,
-               struct x86_reg dst,
-               struct x86_reg src )
-{
-   emit_2ub(p, X86_TWOB, 0x56);
-   emit_modrm( p, dst, src );
-}
-
-void sse_xorps( struct x86_function *p,
-                struct x86_reg dst,
-                struct x86_reg src )
-{
-   emit_2ub(p, X86_TWOB, 0x57);
-   emit_modrm( p, dst, src );
-}
-
-void sse_cvtps2pi( struct x86_function *p,
-		   struct x86_reg dst,
-		   struct x86_reg src )
-{
-   assert(dst.file == file_MMX && 
-	  (src.file == file_XMM || src.mod != mod_REG));
-
-   p->need_emms = 1;
-
-   emit_2ub(p, X86_TWOB, 0x2d);
-   emit_modrm( p, dst, src );
-}
-
-
-/* Shufps can also be used to implement a reduced swizzle when dest ==
- * arg0.
- */
-void sse_shufps( struct x86_function *p,
-		 struct x86_reg dest,
-		 struct x86_reg arg0,
-		 unsigned char shuf) 
-{
-   emit_2ub(p, X86_TWOB, 0xC6);
-   emit_modrm(p, dest, arg0);
-   emit_1ub(p, shuf); 
-}
-
-void sse_cmpps( struct x86_function *p,
-		struct x86_reg dest,
-		struct x86_reg arg0,
-		unsigned char cc) 
-{
-   emit_2ub(p, X86_TWOB, 0xC2);
-   emit_modrm(p, dest, arg0);
-   emit_1ub(p, cc); 
-}
-
-void sse_pmovmskb( struct x86_function *p,
-                   struct x86_reg dest,
-                   struct x86_reg src)
-{
-    emit_3ub(p, 0x66, X86_TWOB, 0xD7);
-    emit_modrm(p, dest, src);
-}
-
-/***********************************************************************
- * SSE2 instructions
- */
-
-/**
- * Perform a reduced swizzle:
- */
-void sse2_pshufd( struct x86_function *p,
-		  struct x86_reg dest,
-		  struct x86_reg arg0,
-		  unsigned char shuf) 
-{
-   emit_3ub(p, 0x66, X86_TWOB, 0x70);
-   emit_modrm(p, dest, arg0);
-   emit_1ub(p, shuf); 
-}
-
-void sse2_cvttps2dq( struct x86_function *p,
-                     struct x86_reg dst,
-                     struct x86_reg src )
-{
-   emit_3ub( p, 0xF3, X86_TWOB, 0x5B );
-   emit_modrm( p, dst, src );
-}
-
-void sse2_cvtps2dq( struct x86_function *p,
-		    struct x86_reg dst,
-		    struct x86_reg src )
-{
-   emit_3ub(p, 0x66, X86_TWOB, 0x5B);
-   emit_modrm( p, dst, src );
-}
-
-void sse2_packssdw( struct x86_function *p,
-		    struct x86_reg dst,
-		    struct x86_reg src )
-{
-   emit_3ub(p, 0x66, X86_TWOB, 0x6B);
-   emit_modrm( p, dst, src );
-}
-
-void sse2_packsswb( struct x86_function *p,
-		    struct x86_reg dst,
-		    struct x86_reg src )
-{
-   emit_3ub(p, 0x66, X86_TWOB, 0x63);
-   emit_modrm( p, dst, src );
-}
-
-void sse2_packuswb( struct x86_function *p,
-		    struct x86_reg dst,
-		    struct x86_reg src )
-{
-   emit_3ub(p, 0x66, X86_TWOB, 0x67);
-   emit_modrm( p, dst, src );
-}
-
-void sse2_rcpps( struct x86_function *p,
-                 struct x86_reg dst,
-                 struct x86_reg src )
-{
-   emit_2ub(p, X86_TWOB, 0x53);
-   emit_modrm( p, dst, src );
-}
-
-void sse2_rcpss( struct x86_function *p,
-		struct x86_reg dst,
-		struct x86_reg src )
-{
-   emit_3ub(p, 0xF3, X86_TWOB, 0x53);
-   emit_modrm( p, dst, src );
-}
-
-void sse2_movd( struct x86_function *p,
-		struct x86_reg dst,
-		struct x86_reg src )
-{
-   emit_2ub(p, 0x66, X86_TWOB);
-   emit_op_modrm( p, 0x6e, 0x7e, dst, src );
-}
-
-
-
-
-/***********************************************************************
- * x87 instructions
- */
-void x87_fist( struct x86_function *p, struct x86_reg dst )
-{
-   emit_1ub(p, 0xdb);
-   emit_modrm_noreg(p, 2, dst);
-}
-
-void x87_fistp( struct x86_function *p, struct x86_reg dst )
-{
-   emit_1ub(p, 0xdb);
-   emit_modrm_noreg(p, 3, dst);
-}
-
-void x87_fild( struct x86_function *p, struct x86_reg arg )
-{
-   emit_1ub(p, 0xdf);
-   emit_modrm_noreg(p, 0, arg);
-}
-
-void x87_fldz( struct x86_function *p )
-{
-   emit_2ub(p, 0xd9, 0xee);
-}
-
-
-void x87_fldcw( struct x86_function *p, struct x86_reg arg )
-{
-   assert(arg.file == file_REG32);
-   assert(arg.mod != mod_REG);
-   emit_1ub(p, 0xd9);
-   emit_modrm_noreg(p, 5, arg);
-}
-
-void x87_fld1( struct x86_function *p )
-{
-   emit_2ub(p, 0xd9, 0xe8);
-}
-
-void x87_fldl2e( struct x86_function *p )
-{
-   emit_2ub(p, 0xd9, 0xea);
-}
-
-void x87_fldln2( struct x86_function *p )
-{
-   emit_2ub(p, 0xd9, 0xed);
-}
-
-void x87_fwait( struct x86_function *p )
-{
-   emit_1ub(p, 0x9b);
-}
-
-void x87_fnclex( struct x86_function *p )
-{
-   emit_2ub(p, 0xdb, 0xe2);
-}
-
-void x87_fclex( struct x86_function *p )
-{
-   x87_fwait(p);
-   x87_fnclex(p);
-}
-
-
-static void x87_arith_op( struct x86_function *p, struct x86_reg dst, struct x86_reg arg,
-			  unsigned char dst0ub0,
-			  unsigned char dst0ub1,
-			  unsigned char arg0ub0,
-			  unsigned char arg0ub1,
-			  unsigned char argmem_noreg)
-{
-   assert(dst.file == file_x87);
-
-   if (arg.file == file_x87) {
-      if (dst.idx == 0) 
-	 emit_2ub(p, dst0ub0, dst0ub1+arg.idx);
-      else if (arg.idx == 0) 
-	 emit_2ub(p, arg0ub0, arg0ub1+arg.idx);
-      else
-	 assert(0);
-   }
-   else if (dst.idx == 0) {
-      assert(arg.file == file_REG32);
-      emit_1ub(p, 0xd8);
-      emit_modrm_noreg(p, argmem_noreg, arg);
-   }
-   else
-      assert(0);
-}
-
-void x87_fmul( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
-{
-   x87_arith_op(p, dst, arg, 
-		0xd8, 0xc8,
-		0xdc, 0xc8,
-		4);
-}
-
-void x87_fsub( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
-{
-   x87_arith_op(p, dst, arg, 
-		0xd8, 0xe0,
-		0xdc, 0xe8,
-		4);
-}
-
-void x87_fsubr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
-{
-   x87_arith_op(p, dst, arg, 
-		0xd8, 0xe8,
-		0xdc, 0xe0,
-		5);
-}
-
-void x87_fadd( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
-{
-   x87_arith_op(p, dst, arg, 
-		0xd8, 0xc0,
-		0xdc, 0xc0,
-		0);
-}
-
-void x87_fdiv( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
-{
-   x87_arith_op(p, dst, arg, 
-		0xd8, 0xf0,
-		0xdc, 0xf8,
-		6);
-}
-
-void x87_fdivr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
-{
-   x87_arith_op(p, dst, arg, 
-		0xd8, 0xf8,
-		0xdc, 0xf0,
-		7);
-}
-
-void x87_fmulp( struct x86_function *p, struct x86_reg dst )
-{
-   assert(dst.file == file_x87);
-   assert(dst.idx >= 1);
-   emit_2ub(p, 0xde, 0xc8+dst.idx);
-}
-
-void x87_fsubp( struct x86_function *p, struct x86_reg dst )
-{
-   assert(dst.file == file_x87);
-   assert(dst.idx >= 1);
-   emit_2ub(p, 0xde, 0xe8+dst.idx);
-}
-
-void x87_fsubrp( struct x86_function *p, struct x86_reg dst )
-{
-   assert(dst.file == file_x87);
-   assert(dst.idx >= 1);
-   emit_2ub(p, 0xde, 0xe0+dst.idx);
-}
-
-void x87_faddp( struct x86_function *p, struct x86_reg dst )
-{
-   assert(dst.file == file_x87);
-   assert(dst.idx >= 1);
-   emit_2ub(p, 0xde, 0xc0+dst.idx);
-}
-
-void x87_fdivp( struct x86_function *p, struct x86_reg dst )
-{
-   assert(dst.file == file_x87);
-   assert(dst.idx >= 1);
-   emit_2ub(p, 0xde, 0xf8+dst.idx);
-}
-
-void x87_fdivrp( struct x86_function *p, struct x86_reg dst )
-{
-   assert(dst.file == file_x87);
-   assert(dst.idx >= 1);
-   emit_2ub(p, 0xde, 0xf0+dst.idx);
-}
-
-void x87_fucom( struct x86_function *p, struct x86_reg arg )
-{
-   assert(arg.file == file_x87);
-   emit_2ub(p, 0xdd, 0xe0+arg.idx);
-}
-
-void x87_fucomp( struct x86_function *p, struct x86_reg arg )
-{
-   assert(arg.file == file_x87);
-   emit_2ub(p, 0xdd, 0xe8+arg.idx);
-}
-
-void x87_fucompp( struct x86_function *p )
-{
-   emit_2ub(p, 0xda, 0xe9);
-}
-
-void x87_fxch( struct x86_function *p, struct x86_reg arg )
-{
-   assert(arg.file == file_x87);
-   emit_2ub(p, 0xd9, 0xc8+arg.idx);
-}
-
-void x87_fabs( struct x86_function *p )
-{
-   emit_2ub(p, 0xd9, 0xe1);
-}
-
-void x87_fchs( struct x86_function *p )
-{
-   emit_2ub(p, 0xd9, 0xe0);
-}
-
-void x87_fcos( struct x86_function *p )
-{
-   emit_2ub(p, 0xd9, 0xff);
-}
-
-
-void x87_fprndint( struct x86_function *p )
-{
-   emit_2ub(p, 0xd9, 0xfc);
-}
-
-void x87_fscale( struct x86_function *p )
-{
-   emit_2ub(p, 0xd9, 0xfd);
-}
-
-void x87_fsin( struct x86_function *p )
-{
-   emit_2ub(p, 0xd9, 0xfe);
-}
-
-void x87_fsincos( struct x86_function *p )
-{
-   emit_2ub(p, 0xd9, 0xfb);
-}
-
-void x87_fsqrt( struct x86_function *p )
-{
-   emit_2ub(p, 0xd9, 0xfa);
-}
-
-void x87_fxtract( struct x86_function *p )
-{
-   emit_2ub(p, 0xd9, 0xf4);
-}
-
-/* st0 = (2^st0)-1
- *
- * Restrictions: -1.0 <= st0 <= 1.0
- */
-void x87_f2xm1( struct x86_function *p )
-{
-   emit_2ub(p, 0xd9, 0xf0);
-}
-
-/* st1 = st1 * log2(st0);
- * pop_stack;
- */
-void x87_fyl2x( struct x86_function *p )
-{
-   emit_2ub(p, 0xd9, 0xf1);
-}
-
-/* st1 = st1 * log2(st0 + 1.0);
- * pop_stack;
- *
- * A fast operation, with restrictions: -.29 < st0 < .29 
- */
-void x87_fyl2xp1( struct x86_function *p )
-{
-   emit_2ub(p, 0xd9, 0xf9);
-}
-
-
-void x87_fld( struct x86_function *p, struct x86_reg arg )
-{
-   if (arg.file == file_x87) 
-      emit_2ub(p, 0xd9, 0xc0 + arg.idx);
-   else {
-      emit_1ub(p, 0xd9);
-      emit_modrm_noreg(p, 0, arg);
-   }
-}
-
-void x87_fst( struct x86_function *p, struct x86_reg dst )
-{
-   if (dst.file == file_x87) 
-      emit_2ub(p, 0xdd, 0xd0 + dst.idx);
-   else {
-      emit_1ub(p, 0xd9);
-      emit_modrm_noreg(p, 2, dst);
-   }
-}
-
-void x87_fstp( struct x86_function *p, struct x86_reg dst )
-{
-   if (dst.file == file_x87) 
-      emit_2ub(p, 0xdd, 0xd8 + dst.idx);
-   else {
-      emit_1ub(p, 0xd9);
-      emit_modrm_noreg(p, 3, dst);
-   }
-}
-
-void x87_fcom( struct x86_function *p, struct x86_reg dst )
-{
-   if (dst.file == file_x87) 
-      emit_2ub(p, 0xd8, 0xd0 + dst.idx);
-   else {
-      emit_1ub(p, 0xd8);
-      emit_modrm_noreg(p, 2, dst);
-   }
-}
-
-void x87_fcomp( struct x86_function *p, struct x86_reg dst )
-{
-   if (dst.file == file_x87) 
-      emit_2ub(p, 0xd8, 0xd8 + dst.idx);
-   else {
-      emit_1ub(p, 0xd8);
-      emit_modrm_noreg(p, 3, dst);
-   }
-}
-
-
-void x87_fnstsw( struct x86_function *p, struct x86_reg dst )
-{
-   assert(dst.file == file_REG32);
-
-   if (dst.idx == reg_AX &&
-       dst.mod == mod_REG) 
-      emit_2ub(p, 0xdf, 0xe0);
-   else {
-      emit_1ub(p, 0xdd);
-      emit_modrm_noreg(p, 7, dst);
-   }
-}
-
-
-
-
-/***********************************************************************
- * MMX instructions
- */
-
-void mmx_emms( struct x86_function *p )
-{
-   assert(p->need_emms);
-   emit_2ub(p, 0x0f, 0x77);
-   p->need_emms = 0;
-}
-
-void mmx_packssdw( struct x86_function *p,
-		   struct x86_reg dst,
-		   struct x86_reg src )
-{
-   assert(dst.file == file_MMX && 
-	  (src.file == file_MMX || src.mod != mod_REG));
-
-   p->need_emms = 1;
-
-   emit_2ub(p, X86_TWOB, 0x6b);
-   emit_modrm( p, dst, src );
-}
-
-void mmx_packuswb( struct x86_function *p,
-		   struct x86_reg dst,
-		   struct x86_reg src )
-{
-   assert(dst.file == file_MMX && 
-	  (src.file == file_MMX || src.mod != mod_REG));
-
-   p->need_emms = 1;
-
-   emit_2ub(p, X86_TWOB, 0x67);
-   emit_modrm( p, dst, src );
-}
-
-void mmx_movd( struct x86_function *p,
-	       struct x86_reg dst,
-	       struct x86_reg src )
-{
-   p->need_emms = 1;
-   emit_1ub(p, X86_TWOB);
-   emit_op_modrm( p, 0x6e, 0x7e, dst, src );
-}
-
-void mmx_movq( struct x86_function *p,
-	       struct x86_reg dst,
-	       struct x86_reg src )
-{
-   p->need_emms = 1;
-   emit_1ub(p, X86_TWOB);
-   emit_op_modrm( p, 0x6f, 0x7f, dst, src );
-}
-
-
-/***********************************************************************
- * Helper functions
- */
-
-
-/* Retreive a reference to one of the function arguments, taking into
- * account any push/pop activity:
- */
-struct x86_reg x86_fn_arg( struct x86_function *p,
-			   unsigned arg )
-{
-   return x86_make_disp(x86_make_reg(file_REG32, reg_SP), 
-			p->stack_offset + arg * 4);	/* ??? */
-}
-
-
-void x86_init_func( struct x86_function *p )
-{
-   p->size = 0;
-   p->store = NULL;
-   p->csr = p->store;
-}
-
-void x86_init_func_size( struct x86_function *p, unsigned code_size )
-{
-   p->size = code_size;
-   p->store = _mesa_exec_malloc(code_size);
-   p->csr = p->store;
-}
-
-void x86_release_func( struct x86_function *p )
-{
-   _mesa_exec_free(p->store);
-   p->store = NULL;
-   p->csr = NULL;
-   p->size = 0;
-}
-
-
-void (*x86_get_func( struct x86_function *p ))(void)
-{
-   if (DISASSEM && p->store)
-      _mesa_printf("disassemble %p %p\n", p->store, p->csr);
-   return (void (*)(void)) (unsigned long) p->store;
-}
-
-#else
-
-void x86sse_dummy( void )
-{
-}
-
-#endif
diff --git a/src/gallium/auxiliary/rtasm/x86sse.h b/src/gallium/auxiliary/rtasm/x86sse.h
deleted file mode 100644
index c2aa416492..0000000000
--- a/src/gallium/auxiliary/rtasm/x86sse.h
+++ /dev/null
@@ -1,256 +0,0 @@
-
-#ifndef _X86SSE_H_
-#define _X86SSE_H_
-
-#if defined(__i386__) || defined(__386__)
-
-/* It is up to the caller to ensure that instructions issued are
- * suitable for the host cpu.  There are no checks made in this module
- * for mmx/sse/sse2 support on the cpu.
- */
-struct x86_reg {
-   unsigned file:3;
-   unsigned idx:3;
-   unsigned mod:2;		/* mod_REG if this is just a register */
-   int      disp:24;		/* only +/- 23bits of offset - should be enough... */
-};
-
-struct x86_function {
-   unsigned size;
-   unsigned char *store;
-   unsigned char *csr;
-   unsigned stack_offset;
-   int need_emms;
-   const char *fn;
-};
-
-enum x86_reg_file {
-   file_REG32,
-   file_MMX,
-   file_XMM,
-   file_x87
-};
-
-/* Values for mod field of modr/m byte
- */
-enum x86_reg_mod {
-   mod_INDIRECT,
-   mod_DISP8,
-   mod_DISP32,
-   mod_REG
-};
-
-enum x86_reg_name {
-   reg_AX,
-   reg_CX,
-   reg_DX,
-   reg_BX,
-   reg_SP,
-   reg_BP,
-   reg_SI,
-   reg_DI
-};
-
-
-enum x86_cc {
-   cc_O,			/* overflow */
-   cc_NO,			/* not overflow */
-   cc_NAE,			/* not above or equal / carry */
-   cc_AE,			/* above or equal / not carry */
-   cc_E,			/* equal / zero */
-   cc_NE			/* not equal / not zero */
-};
-
-enum sse_cc {
-   cc_Equal,
-   cc_LessThan,
-   cc_LessThanEqual,
-   cc_Unordered,
-   cc_NotEqual,
-   cc_NotLessThan,
-   cc_NotLessThanEqual,
-   cc_Ordered
-};
-
-#define cc_Z  cc_E
-#define cc_NZ cc_NE
-
-/* Begin/end/retreive function creation:
- */
-
-
-void x86_init_func( struct x86_function *p );
-void x86_init_func_size( struct x86_function *p, unsigned code_size );
-void x86_release_func( struct x86_function *p );
-void (*x86_get_func( struct x86_function *p ))( void );
-
-
-
-/* Create and manipulate registers and regmem values:
- */
-struct x86_reg x86_make_reg( enum x86_reg_file file,
-			     enum x86_reg_name idx );
-
-struct x86_reg x86_make_disp( struct x86_reg reg,
-			      int disp );
-
-struct x86_reg x86_deref( struct x86_reg reg );
-
-struct x86_reg x86_get_base_reg( struct x86_reg reg );
-
-
-/* Labels, jumps and fixup:
- */
-unsigned char *x86_get_label( struct x86_function *p );
-
-void x86_jcc( struct x86_function *p,
-	      enum x86_cc cc,
-	      unsigned char *label );
-
-unsigned char *x86_jcc_forward( struct x86_function *p,
-			  enum x86_cc cc );
-
-unsigned char *x86_jmp_forward( struct x86_function *p);
-
-unsigned char *x86_call_forward( struct x86_function *p);
-
-void x86_fixup_fwd_jump( struct x86_function *p,
-			 unsigned char *fixup );
-
-void x86_jmp( struct x86_function *p, unsigned char *label );
-
-/* void x86_call( struct x86_function *p, void (*label)() ); */
-void x86_call( struct x86_function *p, struct x86_reg reg);
-
-/* michal:
- * Temporary. As I need immediate operands, and dont want to mess with the codegen,
- * I load the immediate into general purpose register and use it.
- */
-void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm );
-
-
-/* Macro for sse_shufps() and sse2_pshufd():
- */
-#define SHUF(_x,_y,_z,_w)       (((_x)<<0) | ((_y)<<2) | ((_z)<<4) | ((_w)<<6))
-#define SHUF_NOOP               RSW(0,1,2,3)
-#define GET_SHUF(swz, idx)      (((swz) >> ((idx)*2)) & 0x3)
-
-void mmx_emms( struct x86_function *p );
-void mmx_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void mmx_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void mmx_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void mmx_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-
-void sse2_cvtps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse2_cvttps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse2_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse2_packsswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse2_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse2_pshufd( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0,
-                  unsigned char shuf );
-void sse2_rcpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse2_rcpss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-
-void sse_addps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_addss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_cvtps2pi( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_divss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_andnps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_andps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_cmpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src,
-                unsigned char cc );
-void sse_maxps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_maxss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_minps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_movaps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_movhlps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_movhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_movlhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_movlps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_movss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_movups( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_mulps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_mulss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_orps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_xorps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_subps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_rsqrtps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_rsqrtss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void sse_shufps( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0,
-                 unsigned char shuf );
-void sse_pmovmskb( struct x86_function *p, struct x86_reg dest, struct x86_reg src );
-
-void x86_add( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void x86_and( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void x86_cmp( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void x86_dec( struct x86_function *p, struct x86_reg reg );
-void x86_inc( struct x86_function *p, struct x86_reg reg );
-void x86_lea( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void x86_mov( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void x86_mul( struct x86_function *p, struct x86_reg src );
-void x86_or( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void x86_pop( struct x86_function *p, struct x86_reg reg );
-void x86_push( struct x86_function *p, struct x86_reg reg );
-void x86_ret( struct x86_function *p );
-void x86_sub( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void x86_test( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void x86_xor( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-void x86_sahf( struct x86_function *p );
-
-void x87_f2xm1( struct x86_function *p );
-void x87_fabs( struct x86_function *p );
-void x87_fadd( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
-void x87_faddp( struct x86_function *p, struct x86_reg dst );
-void x87_fchs( struct x86_function *p );
-void x87_fclex( struct x86_function *p );
-void x87_fcom( struct x86_function *p, struct x86_reg dst );
-void x87_fcomp( struct x86_function *p, struct x86_reg dst );
-void x87_fcos( struct x86_function *p );
-void x87_fdiv( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
-void x87_fdivp( struct x86_function *p, struct x86_reg dst );
-void x87_fdivr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
-void x87_fdivrp( struct x86_function *p, struct x86_reg dst );
-void x87_fild( struct x86_function *p, struct x86_reg arg );
-void x87_fist( struct x86_function *p, struct x86_reg dst );
-void x87_fistp( struct x86_function *p, struct x86_reg dst );
-void x87_fld( struct x86_function *p, struct x86_reg arg );
-void x87_fld1( struct x86_function *p );
-void x87_fldcw( struct x86_function *p, struct x86_reg arg );
-void x87_fldl2e( struct x86_function *p );
-void x87_fldln2( struct x86_function *p );
-void x87_fldz( struct x86_function *p );
-void x87_fmul( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
-void x87_fmulp( struct x86_function *p, struct x86_reg dst );
-void x87_fnclex( struct x86_function *p );
-void x87_fprndint( struct x86_function *p );
-void x87_fscale( struct x86_function *p );
-void x87_fsin( struct x86_function *p );
-void x87_fsincos( struct x86_function *p );
-void x87_fsqrt( struct x86_function *p );
-void x87_fst( struct x86_function *p, struct x86_reg dst );
-void x87_fstp( struct x86_function *p, struct x86_reg dst );
-void x87_fsub( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
-void x87_fsubp( struct x86_function *p, struct x86_reg dst );
-void x87_fsubr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
-void x87_fsubrp( struct x86_function *p, struct x86_reg dst );
-void x87_fxch( struct x86_function *p, struct x86_reg dst );
-void x87_fxtract( struct x86_function *p );
-void x87_fyl2x( struct x86_function *p );
-void x87_fyl2xp1( struct x86_function *p );
-void x87_fwait( struct x86_function *p );
-void x87_fnstsw( struct x86_function *p, struct x86_reg dst );
-void x87_fucompp( struct x86_function *p );
-void x87_fucomp( struct x86_function *p, struct x86_reg arg );
-void x87_fucom( struct x86_function *p, struct x86_reg arg );
-
-
-
-/* Retreive a reference to one of the function arguments, taking into
- * account any push/pop activity.  Note - doesn't track explict
- * manipulation of ESP by other instructions.
- */
-struct x86_reg x86_fn_arg( struct x86_function *p, unsigned arg );
-
-#endif
-#endif
-- 
cgit v1.2.3


From d2f6c9ab10656f6ecda131a6785a60565026d249 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Tue, 19 Feb 2008 12:05:32 +0900
Subject: Add copyright headers to all rtasm source files.

---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c | 23 +++++++++++++++++++++++
 src/gallium/auxiliary/rtasm/rtasm_x86sse.h | 28 +++++++++++++++++++++++++---
 2 files changed, 48 insertions(+), 3 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index 3c885a9fff..b332192a62 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -1,3 +1,26 @@
+/**************************************************************************
+ *
+ * Copyright (C) 1999-2005  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
 #if defined(__i386__) || defined(__386__)
 
 #include "pipe/p_compiler.h"
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index c2aa416492..e4576001bf 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -1,6 +1,28 @@
-
-#ifndef _X86SSE_H_
-#define _X86SSE_H_
+/**************************************************************************
+ *
+ * Copyright (C) 1999-2005  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef _RTASM_X86SSE_H_
+#define _RTASM_X86SSE_H_
 
 #if defined(__i386__) || defined(__386__)
 
-- 
cgit v1.2.3


From 17158c2f00f5bee29ec8239367fd5498f22e4a91 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Tue, 19 Feb 2008 12:24:42 +0900
Subject: Move mm.c code into util module.

Use the u_ prefix to distinguish the C source files that support gallium
interfaces from those that really have no relation to gallium itself.
---
 src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c | 302 +-----------------------
 src/gallium/auxiliary/rtasm/Makefile            |   5 +-
 src/gallium/auxiliary/rtasm/SConscript          |   3 +-
 src/gallium/auxiliary/rtasm/mm.c                | 283 ----------------------
 src/gallium/auxiliary/rtasm/mm.h                |  89 -------
 src/gallium/auxiliary/rtasm/rtasm_execmem.c     |   2 +-
 src/gallium/auxiliary/util/Makefile             |   3 +-
 src/gallium/auxiliary/util/SConscript           |   1 +
 src/gallium/auxiliary/util/u_mm.c               | 283 ++++++++++++++++++++++
 src/gallium/auxiliary/util/u_mm.h               |  91 +++++++
 10 files changed, 382 insertions(+), 680 deletions(-)
 delete mode 100644 src/gallium/auxiliary/rtasm/mm.c
 delete mode 100644 src/gallium/auxiliary/rtasm/mm.h
 create mode 100644 src/gallium/auxiliary/util/u_mm.c
 create mode 100644 src/gallium/auxiliary/util/u_mm.h

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
index 969aab51b5..983a105347 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
@@ -1,7 +1,6 @@
 /**************************************************************************
  *
  * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas.
- * Copyright 1999 Wittawat Yamwong
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -40,6 +39,7 @@
 #include "pipe/p_debug.h"
 #include "pipe/p_thread.h"
 #include "pipe/p_util.h"
+#include "util/u_mm.h"
 #include "pb_buffer.h"
 #include "pb_bufmgr.h"
 
@@ -50,306 +50,6 @@
 #define SUPER(__derived) (&(__derived)->base)
 
 
-struct mem_block 
-{
-   struct mem_block *next, *prev;
-   struct mem_block *next_free, *prev_free;
-   struct mem_block *heap;
-   int ofs, size;
-   unsigned int free:1;
-   unsigned int reserved:1;
-};
-
-
-#ifdef DEBUG
-/**
- * For debugging purposes.
- */
-static void
-mmDumpMemInfo(const struct mem_block *heap)
-{
-   debug_printf("Memory heap %p:\n", (void *)heap);
-   if (heap == 0) {
-      debug_printf("  heap == 0\n");
-   } else {
-      const struct mem_block *p;
-
-      for(p = heap->next; p != heap; p = p->next) {
-	 debug_printf("  Offset:%08x, Size:%08x, %c%c\n",p->ofs,p->size,
-		 p->free ? 'F':'.',
-		 p->reserved ? 'R':'.');
-      }
-
-      debug_printf("\nFree list:\n");
-
-      for(p = heap->next_free; p != heap; p = p->next_free) {
-	 debug_printf(" FREE Offset:%08x, Size:%08x, %c%c\n",p->ofs,p->size,
-		 p->free ? 'F':'.',
-		 p->reserved ? 'R':'.');
-      }
-
-   }
-   debug_printf("End of memory blocks\n");
-}
-#endif
-
-
-/** 
- * input: total size in bytes
- * return: a heap pointer if OK, NULL if error
- */
-static struct mem_block *
-mmInit(int ofs, int size)
-{
-   struct mem_block *heap, *block;
-  
-   if (size <= 0) 
-      return NULL;
-
-   heap = CALLOC_STRUCT(mem_block);
-   if (!heap) 
-      return NULL;
-   
-   block = CALLOC_STRUCT(mem_block);
-   if (!block) {
-      FREE(heap);
-      return NULL;
-   }
-
-   heap->next = block;
-   heap->prev = block;
-   heap->next_free = block;
-   heap->prev_free = block;
-
-   block->heap = heap;
-   block->next = heap;
-   block->prev = heap;
-   block->next_free = heap;
-   block->prev_free = heap;
-
-   block->ofs = ofs;
-   block->size = size;
-   block->free = 1;
-
-   return heap;
-}
-
-
-static struct mem_block *
-SliceBlock(struct mem_block *p, 
-           int startofs, int size, 
-           int reserved, int alignment)
-{
-   struct mem_block *newblock;
-
-   /* break left  [p, newblock, p->next], then p = newblock */
-   if (startofs > p->ofs) {
-      newblock = CALLOC_STRUCT(mem_block);
-      if (!newblock)
-	 return NULL;
-      newblock->ofs = startofs;
-      newblock->size = p->size - (startofs - p->ofs);
-      newblock->free = 1;
-      newblock->heap = p->heap;
-
-      newblock->next = p->next;
-      newblock->prev = p;
-      p->next->prev = newblock;
-      p->next = newblock;
-
-      newblock->next_free = p->next_free;
-      newblock->prev_free = p;
-      p->next_free->prev_free = newblock;
-      p->next_free = newblock;
-
-      p->size -= newblock->size;
-      p = newblock;
-   }
-
-   /* break right, also [p, newblock, p->next] */
-   if (size < p->size) {
-      newblock = CALLOC_STRUCT(mem_block);
-      if (!newblock)
-	 return NULL;
-      newblock->ofs = startofs + size;
-      newblock->size = p->size - size;
-      newblock->free = 1;
-      newblock->heap = p->heap;
-
-      newblock->next = p->next;
-      newblock->prev = p;
-      p->next->prev = newblock;
-      p->next = newblock;
-
-      newblock->next_free = p->next_free;
-      newblock->prev_free = p;
-      p->next_free->prev_free = newblock;
-      p->next_free = newblock;
-	 
-      p->size = size;
-   }
-
-   /* p = middle block */
-   p->free = 0;
-
-   /* Remove p from the free list: 
-    */
-   p->next_free->prev_free = p->prev_free;
-   p->prev_free->next_free = p->next_free;
-
-   p->next_free = 0;
-   p->prev_free = 0;
-
-   p->reserved = reserved;
-   return p;
-}
-
-
-/**
- * Allocate 'size' bytes with 2^align2 bytes alignment,
- * restrict the search to free memory after 'startSearch'
- * depth and back buffers should be in different 4mb banks
- * to get better page hits if possible
- * input:	size = size of block
- *       	align2 = 2^align2 bytes alignment
- *		startSearch = linear offset from start of heap to begin search
- * return: pointer to the allocated block, 0 if error
- */
-static struct mem_block *
-mmAllocMem(struct mem_block *heap, int size, int align2, int startSearch)
-{
-   struct mem_block *p;
-   const int mask = (1 << align2)-1;
-   int startofs = 0;
-   int endofs;
-
-   if (!heap || align2 < 0 || size <= 0)
-      return NULL;
-
-   for (p = heap->next_free; p != heap; p = p->next_free) {
-      assert(p->free);
-
-      startofs = (p->ofs + mask) & ~mask;
-      if ( startofs < startSearch ) {
-	 startofs = startSearch;
-      }
-      endofs = startofs+size;
-      if (endofs <= (p->ofs+p->size))
-	 break;
-   }
-
-   if (p == heap) 
-      return NULL;
-
-   assert(p->free);
-   p = SliceBlock(p,startofs,size,0,mask+1);
-
-   return p;
-}
-
-
-#if 0
-/**
- * Free block starts at offset
- * input: pointer to a heap, start offset
- * return: pointer to a block
- */
-static struct mem_block *
-mmFindBlock(struct mem_block *heap, int start)
-{
-   struct mem_block *p;
-
-   for (p = heap->next; p != heap; p = p->next) {
-      if (p->ofs == start) 
-	 return p;
-   }
-
-   return NULL;
-}
-#endif
-
-
-static INLINE int
-Join2Blocks(struct mem_block *p)
-{
-   /* XXX there should be some assertions here */
-
-   /* NOTE: heap->free == 0 */
-
-   if (p->free && p->next->free) {
-      struct mem_block *q = p->next;
-
-      assert(p->ofs + p->size == q->ofs);
-      p->size += q->size;
-
-      p->next = q->next;
-      q->next->prev = p;
-
-      q->next_free->prev_free = q->prev_free; 
-      q->prev_free->next_free = q->next_free;
-     
-      FREE(q);
-      return 1;
-   }
-   return 0;
-}
-
-
-/**
- * Free block starts at offset
- * input: pointer to a block
- * return: 0 if OK, -1 if error
- */
-static int
-mmFreeMem(struct mem_block *b)
-{
-   if (!b)
-      return 0;
-
-   if (b->free) {
-      debug_printf("block already free\n");
-      return -1;
-   }
-   if (b->reserved) {
-      debug_printf("block is reserved\n");
-      return -1;
-   }
-
-   b->free = 1;
-   b->next_free = b->heap->next_free;
-   b->prev_free = b->heap;
-   b->next_free->prev_free = b;
-   b->prev_free->next_free = b;
-
-   Join2Blocks(b);
-   if (b->prev != b->heap)
-      Join2Blocks(b->prev);
-
-   return 0;
-}
-
-
-/**
- * destroy MM
- */
-static void
-mmDestroy(struct mem_block *heap)
-{
-   struct mem_block *p;
-
-   if (!heap)
-      return;
-
-   for (p = heap->next; p != heap; ) {
-      struct mem_block *next = p->next;
-      FREE(p);
-      p = next;
-   }
-
-   FREE(heap);
-}
-
-
 struct mm_pb_manager
 {
    struct pb_manager base;
diff --git a/src/gallium/auxiliary/rtasm/Makefile b/src/gallium/auxiliary/rtasm/Makefile
index 7c8ac60794..edfae2a204 100644
--- a/src/gallium/auxiliary/rtasm/Makefile
+++ b/src/gallium/auxiliary/rtasm/Makefile
@@ -5,9 +5,8 @@ include $(TOP)/configs/current
 LIBNAME = rtasm
 
 DRIVER_SOURCES = \
-	execmem.c \
-	x86sse.c \
-	mm.c
+	rtasm_execmem.c \
+	rtasm_x86sse.c
 
 C_SOURCES = \
 	$(DRIVER_SOURCES)
diff --git a/src/gallium/auxiliary/rtasm/SConscript b/src/gallium/auxiliary/rtasm/SConscript
index de8456e0ca..6eca1fe4c0 100644
--- a/src/gallium/auxiliary/rtasm/SConscript
+++ b/src/gallium/auxiliary/rtasm/SConscript
@@ -4,8 +4,7 @@ rtasm = env.ConvenienceLibrary(
 	target = 'rtasm',
 	source = [
 		'rtasm_execmem.c',
-		'rtasm_x86sse.c',
-		'mm.c',
+		'rtasm_x86sse.c'
 	])
 
 auxiliaries.insert(0, rtasm)
diff --git a/src/gallium/auxiliary/rtasm/mm.c b/src/gallium/auxiliary/rtasm/mm.c
deleted file mode 100644
index 15f50491da..0000000000
--- a/src/gallium/auxiliary/rtasm/mm.c
+++ /dev/null
@@ -1,283 +0,0 @@
-/*
- * GLX Hardware Device Driver common code
- * Copyright (C) 1999 Wittawat Yamwong
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * WITTAWAT YAMWONG, OR ANY OTHER CONTRIBUTORS BE LIABLE FOR ANY CLAIM, 
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE 
- * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- */
-
-
-#include "pipe/p_compiler.h"
-#include "pipe/p_util.h"
-#include "pipe/p_debug.h"
-
-#include "mm.h"
-
-
-void
-mmDumpMemInfo(const struct mem_block *heap)
-{
-   debug_printf("Memory heap %p:\n", (void *)heap);
-   if (heap == 0) {
-      debug_printf("  heap == 0\n");
-   } else {
-      const struct mem_block *p;
-
-      for(p = heap->next; p != heap; p = p->next) {
-	 debug_printf("  Offset:%08x, Size:%08x, %c%c\n",p->ofs,p->size,
-		 p->free ? 'F':'.',
-		 p->reserved ? 'R':'.');
-      }
-
-      debug_printf("\nFree list:\n");
-
-      for(p = heap->next_free; p != heap; p = p->next_free) {
-	 debug_printf(" FREE Offset:%08x, Size:%08x, %c%c\n",p->ofs,p->size,
-		 p->free ? 'F':'.',
-		 p->reserved ? 'R':'.');
-      }
-
-   }
-   debug_printf("End of memory blocks\n");
-}
-
-struct mem_block *
-mmInit(int ofs, int size)
-{
-   struct mem_block *heap, *block;
-  
-   if (size <= 0) 
-      return NULL;
-
-   heap = CALLOC_STRUCT(mem_block);
-   if (!heap) 
-      return NULL;
-   
-   block = CALLOC_STRUCT(mem_block);
-   if (!block) {
-      FREE(heap);
-      return NULL;
-   }
-
-   heap->next = block;
-   heap->prev = block;
-   heap->next_free = block;
-   heap->prev_free = block;
-
-   block->heap = heap;
-   block->next = heap;
-   block->prev = heap;
-   block->next_free = heap;
-   block->prev_free = heap;
-
-   block->ofs = ofs;
-   block->size = size;
-   block->free = 1;
-
-   return heap;
-}
-
-
-static struct mem_block *
-SliceBlock(struct mem_block *p, 
-           int startofs, int size, 
-           int reserved, int alignment)
-{
-   struct mem_block *newblock;
-
-   /* break left  [p, newblock, p->next], then p = newblock */
-   if (startofs > p->ofs) {
-      newblock = CALLOC_STRUCT(mem_block);
-      if (!newblock)
-	 return NULL;
-      newblock->ofs = startofs;
-      newblock->size = p->size - (startofs - p->ofs);
-      newblock->free = 1;
-      newblock->heap = p->heap;
-
-      newblock->next = p->next;
-      newblock->prev = p;
-      p->next->prev = newblock;
-      p->next = newblock;
-
-      newblock->next_free = p->next_free;
-      newblock->prev_free = p;
-      p->next_free->prev_free = newblock;
-      p->next_free = newblock;
-
-      p->size -= newblock->size;
-      p = newblock;
-   }
-
-   /* break right, also [p, newblock, p->next] */
-   if (size < p->size) {
-      newblock = CALLOC_STRUCT(mem_block);
-      if (!newblock)
-	 return NULL;
-      newblock->ofs = startofs + size;
-      newblock->size = p->size - size;
-      newblock->free = 1;
-      newblock->heap = p->heap;
-
-      newblock->next = p->next;
-      newblock->prev = p;
-      p->next->prev = newblock;
-      p->next = newblock;
-
-      newblock->next_free = p->next_free;
-      newblock->prev_free = p;
-      p->next_free->prev_free = newblock;
-      p->next_free = newblock;
-	 
-      p->size = size;
-   }
-
-   /* p = middle block */
-   p->free = 0;
-
-   /* Remove p from the free list: 
-    */
-   p->next_free->prev_free = p->prev_free;
-   p->prev_free->next_free = p->next_free;
-
-   p->next_free = 0;
-   p->prev_free = 0;
-
-   p->reserved = reserved;
-   return p;
-}
-
-
-struct mem_block *
-mmAllocMem(struct mem_block *heap, int size, int align2, int startSearch)
-{
-   struct mem_block *p;
-   const int mask = (1 << align2)-1;
-   int startofs = 0;
-   int endofs;
-
-   if (!heap || align2 < 0 || size <= 0)
-      return NULL;
-
-   for (p = heap->next_free; p != heap; p = p->next_free) {
-      assert(p->free);
-
-      startofs = (p->ofs + mask) & ~mask;
-      if ( startofs < startSearch ) {
-	 startofs = startSearch;
-      }
-      endofs = startofs+size;
-      if (endofs <= (p->ofs+p->size))
-	 break;
-   }
-
-   if (p == heap) 
-      return NULL;
-
-   assert(p->free);
-   p = SliceBlock(p,startofs,size,0,mask+1);
-
-   return p;
-}
-
-
-struct mem_block *
-mmFindBlock(struct mem_block *heap, int start)
-{
-   struct mem_block *p;
-
-   for (p = heap->next; p != heap; p = p->next) {
-      if (p->ofs == start) 
-	 return p;
-   }
-
-   return NULL;
-}
-
-
-static INLINE int
-Join2Blocks(struct mem_block *p)
-{
-   /* XXX there should be some assertions here */
-
-   /* NOTE: heap->free == 0 */
-
-   if (p->free && p->next->free) {
-      struct mem_block *q = p->next;
-
-      assert(p->ofs + p->size == q->ofs);
-      p->size += q->size;
-
-      p->next = q->next;
-      q->next->prev = p;
-
-      q->next_free->prev_free = q->prev_free; 
-      q->prev_free->next_free = q->next_free;
-     
-      FREE(q);
-      return 1;
-   }
-   return 0;
-}
-
-int
-mmFreeMem(struct mem_block *b)
-{
-   if (!b)
-      return 0;
-
-   if (b->free) {
-      debug_printf("block already free\n");
-      return -1;
-   }
-   if (b->reserved) {
-      debug_printf("block is reserved\n");
-      return -1;
-   }
-
-   b->free = 1;
-   b->next_free = b->heap->next_free;
-   b->prev_free = b->heap;
-   b->next_free->prev_free = b;
-   b->prev_free->next_free = b;
-
-   Join2Blocks(b);
-   if (b->prev != b->heap)
-      Join2Blocks(b->prev);
-
-   return 0;
-}
-
-
-void
-mmDestroy(struct mem_block *heap)
-{
-   struct mem_block *p;
-
-   if (!heap)
-      return;
-
-   for (p = heap->next; p != heap; ) {
-      struct mem_block *next = p->next;
-      FREE(p);
-      p = next;
-   }
-
-   FREE(heap);
-}
diff --git a/src/gallium/auxiliary/rtasm/mm.h b/src/gallium/auxiliary/rtasm/mm.h
deleted file mode 100644
index f469b18d3e..0000000000
--- a/src/gallium/auxiliary/rtasm/mm.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * GLX Hardware Device Driver common code
- * Copyright (C) 1999 Wittawat Yamwong
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * KEITH WHITWELL, OR ANY OTHER CONTRIBUTORS BE LIABLE FOR ANY CLAIM, 
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE 
- * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-
-/**
- * Memory manager code.  Primarily used by device drivers to manage texture
- * heaps, etc.
- */
-
-
-#ifndef MM_H
-#define MM_H
-
-
-struct mem_block {
-   struct mem_block *next, *prev;
-   struct mem_block *next_free, *prev_free;
-   struct mem_block *heap;
-   int ofs,size;
-   unsigned int free:1;
-   unsigned int reserved:1;
-};
-
-
-
-/** 
- * input: total size in bytes
- * return: a heap pointer if OK, NULL if error
- */
-extern struct mem_block *mmInit(int ofs, int size);
-
-/**
- * Allocate 'size' bytes with 2^align2 bytes alignment,
- * restrict the search to free memory after 'startSearch'
- * depth and back buffers should be in different 4mb banks
- * to get better page hits if possible
- * input:	size = size of block
- *       	align2 = 2^align2 bytes alignment
- *		startSearch = linear offset from start of heap to begin search
- * return: pointer to the allocated block, 0 if error
- */
-extern struct mem_block *mmAllocMem(struct mem_block *heap, int size, int align2, 
-                            int startSearch);
-
-/**
- * Free block starts at offset
- * input: pointer to a block
- * return: 0 if OK, -1 if error
- */
-extern int mmFreeMem(struct mem_block *b);
-
-/**
- * Free block starts at offset
- * input: pointer to a heap, start offset
- * return: pointer to a block
- */
-extern struct mem_block *mmFindBlock(struct mem_block *heap, int start);
-
-/**
- * destroy MM
- */
-extern void mmDestroy(struct mem_block *mmInit);
-
-/**
- * For debuging purpose.
- */
-extern void mmDumpMemInfo(const struct mem_block *mmInit);
-
-#endif
diff --git a/src/gallium/auxiliary/rtasm/rtasm_execmem.c b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
index cb13db2498..9c78fa5626 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_execmem.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
@@ -46,7 +46,7 @@
 
 #include <unistd.h>
 #include <sys/mman.h>
-#include "mm.h"
+#include "util/u_mm.h"
 
 #define EXEC_HEAP_SIZE (10*1024*1024)
 
diff --git a/src/gallium/auxiliary/util/Makefile b/src/gallium/auxiliary/util/Makefile
index b8cb148c4f..7cc2aa44f9 100644
--- a/src/gallium/auxiliary/util/Makefile
+++ b/src/gallium/auxiliary/util/Makefile
@@ -7,7 +7,8 @@ LIBNAME = util
 DRIVER_SOURCES = \
 	p_debug.c \
 	p_tile.c \
-	p_util.c
+	p_util.c \
+	u_mm.c
 
 C_SOURCES = \
 	$(DRIVER_SOURCES)
diff --git a/src/gallium/auxiliary/util/SConscript b/src/gallium/auxiliary/util/SConscript
index b126cf44d6..4717941434 100644
--- a/src/gallium/auxiliary/util/SConscript
+++ b/src/gallium/auxiliary/util/SConscript
@@ -6,6 +6,7 @@ util = env.ConvenienceLibrary(
 		'p_debug.c',
 		'p_tile.c',
 		'p_util.c',
+		'u_mm.c',
 	])
 
 auxiliaries.insert(0, util)
diff --git a/src/gallium/auxiliary/util/u_mm.c b/src/gallium/auxiliary/util/u_mm.c
new file mode 100644
index 0000000000..b49ae074e0
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_mm.c
@@ -0,0 +1,283 @@
+/**************************************************************************
+ *
+ * Copyright (C) 1999 Wittawat Yamwong
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * WITTAWAT YAMWONG, OR ANY OTHER CONTRIBUTORS BE LIABLE FOR ANY CLAIM, 
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE 
+ * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_util.h"
+#include "pipe/p_debug.h"
+
+#include "util/u_mm.h"
+
+/** Debug aid: print every block of 'heap' via debug_printf(), then every block on its free list. */
+void
+mmDumpMemInfo(const struct mem_block *heap)
+{
+   debug_printf("Memory heap %p:\n", (void *)heap);
+   if (heap == 0) {
+      debug_printf("  heap == 0\n");
+   } else {
+      const struct mem_block *p;
+      /* All blocks in list order; reaching the heap node again ends the walk. */
+      for(p = heap->next; p != heap; p = p->next) {
+	 debug_printf("  Offset:%08x, Size:%08x, %c%c\n",p->ofs,p->size,
+		 p->free ? 'F':'.',
+		 p->reserved ? 'R':'.');
+      }
+
+      debug_printf("\nFree list:\n");
+      /* Same walk, following the next_free links only. */
+      for(p = heap->next_free; p != heap; p = p->next_free) {
+	 debug_printf(" FREE Offset:%08x, Size:%08x, %c%c\n",p->ofs,p->size,
+		 p->free ? 'F':'.',
+		 p->reserved ? 'R':'.');
+      }
+
+   }
+   debug_printf("End of memory blocks\n");
+}
+/** Create a heap managing the range [ofs, ofs+size); returns the heap (list-head) node, or NULL if size <= 0 or a node allocation fails. */
+struct mem_block *
+mmInit(int ofs, int size)
+{
+   struct mem_block *heap, *block;
+  
+   if (size <= 0) 
+      return NULL;
+   /* The heap node only heads the two lists; no offset/size is stored in it here. */
+   heap = CALLOC_STRUCT(mem_block);
+   if (!heap) 
+      return NULL;
+   
+   block = CALLOC_STRUCT(mem_block);
+   if (!block) {
+      FREE(heap);
+      return NULL;
+   }
+   /* Both circular lists (all blocks, free blocks) start out as heap <-> block. */
+   heap->next = block;
+   heap->prev = block;
+   heap->next_free = block;
+   heap->prev_free = block;
+
+   block->heap = heap;
+   block->next = heap;
+   block->prev = heap;
+   block->next_free = heap;
+   block->prev_free = heap;
+   /* The single initial block spans the whole range and is free. */
+   block->ofs = ofs;
+   block->size = size;
+   block->free = 1;
+
+   return heap;
+}
+
+/** Carve [startofs, startofs+size) out of free block 'p', leaving space before/after it as separate free blocks; returns the carved block, or NULL if a node allocation fails. */
+static struct mem_block *
+SliceBlock(struct mem_block *p, 
+           int startofs, int size, 
+           int reserved, int alignment)  /* 'alignment' is not referenced in the body */
+{
+   struct mem_block *newblock;
+
+   /* Space before startofs: p keeps the leading part, newblock starts at startofs; continue with p = newblock. */
+   if (startofs > p->ofs) {
+      newblock = CALLOC_STRUCT(mem_block);
+      if (!newblock)
+	 return NULL;
+      newblock->ofs = startofs;
+      newblock->size = p->size - (startofs - p->ofs);
+      newblock->free = 1;
+      newblock->heap = p->heap;
+      /* link newblock right after p in the list of all blocks */
+      newblock->next = p->next;
+      newblock->prev = p;
+      p->next->prev = newblock;
+      p->next = newblock;
+      /* ...and right after p in the free list */
+      newblock->next_free = p->next_free;
+      newblock->prev_free = p;
+      p->next_free->prev_free = newblock;
+      p->next_free = newblock;
+
+      p->size -= newblock->size;
+      p = newblock;
+   }
+
+   /* Space after the request: newblock takes the free tail and p shrinks to 'size'. */
+   if (size < p->size) {
+      newblock = CALLOC_STRUCT(mem_block);
+      if (!newblock)
+	 return NULL;  /* NOTE(review): a left split made above is not undone on this path */
+      newblock->ofs = startofs + size;
+      newblock->size = p->size - size;
+      newblock->free = 1;
+      newblock->heap = p->heap;
+      /* link newblock right after p in the list of all blocks */
+      newblock->next = p->next;
+      newblock->prev = p;
+      p->next->prev = newblock;
+      p->next = newblock;
+      /* ...and right after p in the free list */
+      newblock->next_free = p->next_free;
+      newblock->prev_free = p;
+      p->next_free->prev_free = newblock;
+      p->next_free = newblock;
+	 
+      p->size = size;
+   }
+
+   /* p is now the block that gets handed out; mark it in use. */
+   p->free = 0;
+
+   /* Unlink p from the free list; its own free-list links are cleared below.
+    */
+   p->next_free->prev_free = p->prev_free;
+   p->prev_free->next_free = p->next_free;
+
+   p->next_free = 0;
+   p->prev_free = 0;
+
+   p->reserved = reserved;
+   return p;
+}
+
+/** Allocate 'size' from 'heap' at a 2^align2-aligned offset, using the first block on the free list that fits; returns the block, or NULL on bad arguments, no fit, or SliceBlock() failure. */
+struct mem_block *
+mmAllocMem(struct mem_block *heap, int size, int align2, int startSearch)
+{
+   struct mem_block *p;
+   const int mask = (1 << align2)-1;  /* NOTE(review): computed before align2 is validated below */
+   int startofs = 0;
+   int endofs;
+
+   if (!heap || align2 < 0 || size <= 0)
+      return NULL;
+
+   for (p = heap->next_free; p != heap; p = p->next_free) {
+      assert(p->free);
+      /* Round the block start up to the alignment, then apply the startSearch lower bound. */
+      startofs = (p->ofs + mask) & ~mask;
+      if ( startofs < startSearch ) {
+	 startofs = startSearch;  /* NOTE(review): startofs is not re-aligned after this override */
+      }
+      endofs = startofs+size;
+      if (endofs <= (p->ofs+p->size))
+	 break;
+   }
+   /* The loop came back round to the heap node: no free block fits. */
+   if (p == heap) 
+      return NULL;
+
+   assert(p->free);
+   p = SliceBlock(p,startofs,size,0,mask+1);
+
+   return p;
+}
+
+/** Return the block (free or in use) whose ofs equals 'start', or NULL if there is none; 'heap' is dereferenced without a NULL check. */
+struct mem_block *
+mmFindBlock(struct mem_block *heap, int start)
+{
+   struct mem_block *p;
+   /* Linear scan over the list of all blocks. */
+   for (p = heap->next; p != heap; p = p->next) {
+      if (p->ofs == start) 
+	 return p;
+   }
+
+   return NULL;
+}
+
+/** If 'p' and p->next are both free, absorb p->next into 'p' and release its node; returns 1 if a merge happened, otherwise 0. */
+static INLINE int
+Join2Blocks(struct mem_block *p)
+{
+   /* XXX there should be some assertions here */
+
+   /* NOTE: heap->free == 0, so the test below cannot merge a block with the list head */
+
+   if (p->free && p->next->free) {
+      struct mem_block *q = p->next;
+
+      assert(p->ofs + p->size == q->ofs);
+      p->size += q->size;
+      /* unlink q from the list of all blocks */
+      p->next = q->next;
+      q->next->prev = p;
+      /* unlink q from the free list */
+      q->next_free->prev_free = q->prev_free; 
+      q->prev_free->next_free = q->next_free;
+     
+      FREE(q);
+      return 1;
+   }
+   return 0;
+}
+/** Put block 'b' back on its heap's free list and merge it with free neighbours; returns 0 on success or when b is NULL, -1 if b is already free or reserved. */
+int
+mmFreeMem(struct mem_block *b)
+{
+   if (!b)
+      return 0;
+
+   if (b->free) {
+      debug_printf("block already free\n");
+      return -1;
+   }
+   if (b->reserved) {
+      debug_printf("block is reserved\n");
+      return -1;
+   }
+   /* Insert b at the front of the free list, directly after the heap node. */
+   b->free = 1;
+   b->next_free = b->heap->next_free;
+   b->prev_free = b->heap;
+   b->next_free->prev_free = b;
+   b->prev_free->next_free = b;
+   /* Absorb the following block first, then let the preceding block absorb b. */
+   Join2Blocks(b);
+   if (b->prev != b->heap)
+      Join2Blocks(b->prev);  /* if this merges, b's node is freed inside Join2Blocks() */
+
+   return 0;
+}
+
+/** Free every block node of 'heap' and then the heap node itself; a NULL heap is ignored. */
+void
+mmDestroy(struct mem_block *heap)
+{
+   struct mem_block *p;
+
+   if (!heap)
+      return;
+
+   for (p = heap->next; p != heap; ) {
+      struct mem_block *next = p->next;  /* read the successor before p is freed */
+      FREE(p);
+      p = next;
+   }
+
+   FREE(heap);
+}
diff --git a/src/gallium/auxiliary/util/u_mm.h b/src/gallium/auxiliary/util/u_mm.h
new file mode 100644
index 0000000000..b226b101cb
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_mm.h
@@ -0,0 +1,91 @@
+/**************************************************************************
+ *
+ * Copyright (C) 1999 Wittawat Yamwong
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * KEITH WHITWELL, OR ANY OTHER CONTRIBUTORS BE LIABLE FOR ANY CLAIM, 
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE 
+ * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Memory manager code.  Primarily used by device drivers to manage texture
+ * heaps, etc.
+ */
+
+
+#ifndef _U_MM_H_
+#define _U_MM_H_
+
+
+struct mem_block {
+   struct mem_block *next, *prev;            /* links in the circular list of all blocks */
+   struct mem_block *next_free, *prev_free;  /* links in the circular list of free blocks */
+   struct mem_block *heap;                   /* list-head node of the heap this block belongs to */
+   int ofs,size;                             /* start offset and length of the block's range */
+   unsigned int free:1;                      /* 1 while the block is free, 0 once allocated */
+   unsigned int reserved:1;                  /* reserved blocks are refused by mmFreeMem() */
+};
+
+
+
+/** 
+ * input: ofs = start offset of the heap, size = total size in bytes
+ * return: a heap pointer if OK, NULL if error
+ */
+extern struct mem_block *mmInit(int ofs, int size);
+
+/**
+ * Allocate 'size' bytes with 2^align2 bytes alignment,
+ * restrict the search to free memory after 'startSearch'.
+ * Depth and back buffers should be in different 4 MB banks
+ * to get better page hits if possible.
+ * input:	size = size of block
+ *       	align2 = 2^align2 bytes alignment
+ *		startSearch = linear offset from start of heap to begin search
+ * return: pointer to the allocated block, 0 if error
+ */
+extern struct mem_block *mmAllocMem(struct mem_block *heap, int size, int align2, 
+                            int startSearch);
+
+/**
+ * Free the given block and merge it with adjacent free blocks
+ * input: pointer to a block
+ * return: 0 if OK, -1 if error
+ */
+extern int mmFreeMem(struct mem_block *b);
+
+/**
+ * Find the block that starts at the given offset
+ * input: pointer to a heap, start offset
+ * return: pointer to the block, NULL if no block starts there
+ */
+extern struct mem_block *mmFindBlock(struct mem_block *heap, int start);
+
+/**
+ * destroy MM
+ */
+extern void mmDestroy(struct mem_block *mmInit);
+
+/**
+ * For debugging purposes.
+ */
+extern void mmDumpMemInfo(const struct mem_block *mmInit);
+
+#endif
-- 
cgit v1.2.3


From 90b2beb661f630966788a6e909dc759c99e38973 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Tue, 19 Feb 2008 13:27:13 +0900
Subject: Simplify makefile boilerplate code.

Don't define ASM_SOURCES variable globally -- reserve that variable to be defined
locally by makefiles, together with C_SOURCES and CPP_SOURCES.
---
 configs/beos                              |  4 ++--
 configs/default                           |  2 +-
 configs/freebsd-dri                       |  2 +-
 configs/freebsd-dri-amd64                 |  4 ++--
 configs/freebsd-dri-x86                   |  4 ++--
 configs/linux-directfb                    |  4 ++--
 configs/linux-dri                         |  2 +-
 configs/linux-dri-ppc                     |  2 +-
 configs/linux-dri-x86                     |  4 ++--
 configs/linux-dri-x86-64                  |  4 ++--
 configs/linux-dri-xcb                     |  2 +-
 configs/linux-icc                         |  4 ++--
 configs/linux-icc-static                  |  4 ++--
 configs/linux-indirect                    |  2 +-
 configs/linux-solo                        |  2 +-
 configs/linux-solo-x86                    |  4 ++--
 configs/linux-sparc                       |  4 ++--
 configs/linux-x86                         |  4 ++--
 configs/linux-x86-64                      |  4 ++--
 configs/linux-x86-glide                   |  4 ++--
 configs/sunos5-gcc                        |  4 ++--
 src/gallium/auxiliary/cso_cache/Makefile  |  7 +------
 src/gallium/auxiliary/draw/Makefile       |  7 +------
 src/gallium/auxiliary/pipebuffer/Makefile |  8 +-------
 src/gallium/auxiliary/rtasm/Makefile      |  8 +-------
 src/gallium/auxiliary/tgsi/Makefile       |  8 +-------
 src/gallium/auxiliary/util/Makefile       |  8 +-------
 src/gallium/drivers/failover/Makefile     |  9 +--------
 src/gallium/drivers/i915simple/Makefile   |  9 +--------
 src/gallium/drivers/i965simple/Makefile   | 25 +++++++------------------
 src/gallium/drivers/softpipe/Makefile     |  9 +--------
 src/gallium/winsys/dri/Makefile.template  |  5 +++--
 src/glx/x11/Makefile                      |  6 +++---
 src/mesa/sources                          |  6 +++---
 34 files changed, 60 insertions(+), 125 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/configs/beos b/configs/beos
index 2b74af739d..c6e972789a 100644
--- a/configs/beos
+++ b/configs/beos
@@ -26,8 +26,8 @@ ifeq ($(CPU), x86)
 		-DUSE_3DNOW_ASM \
 		-DUSE_SSE_ASM
 	
-	ASM_SOURCES = $(X86_SOURCES)
-	ASM_API = $(X86_API)
+	MESA_ASM_SOURCES = $(X86_SOURCES)
+	GLAPI_ASM_SOURCES = $(X86_API)
 
 	CC = gcc
 	CXX = g++
diff --git a/configs/default b/configs/default
index c9be5ec3e3..48ddd29282 100644
--- a/configs/default
+++ b/configs/default
@@ -51,7 +51,7 @@ OSMESA_LIB_NAME = lib$(OSMESA_LIB).so
 
 
 # Optional assembly language optimization files for libGL
-ASM_SOURCES = 
+MESA_ASM_SOURCES = 
 
 # GLw widget sources (Append "GLwMDrawA.c" here and add -lXm to GLW_LIB_DEPS in
 # order to build the Motif widget too)
diff --git a/configs/freebsd-dri b/configs/freebsd-dri
index 67d253b869..6fc1abbc80 100644
--- a/configs/freebsd-dri
+++ b/configs/freebsd-dri
@@ -22,7 +22,7 @@ CFLAGS = $(WARN_FLAGS) $(OPT_FLAGS) $(PIC_FLAGS) -Wmissing-prototypes -std=c99 -
 
 CXXFLAGS = $(WARN_FLAGS) $(OPT_FLAGS) $(PIC_FLAGS) $(DEFINES) -Wall -ansi -pedantic $(ASM_FLAGS) $(X11_INCLUDES) 
 
-ASM_SOURCES = 
+MESA_ASM_SOURCES = 
 
 # Library/program dependencies
 LIBDRM_CFLAGS = `pkg-config --cflags libdrm`
diff --git a/configs/freebsd-dri-amd64 b/configs/freebsd-dri-amd64
index 39341b9701..bb6c361398 100644
--- a/configs/freebsd-dri-amd64
+++ b/configs/freebsd-dri-amd64
@@ -6,5 +6,5 @@ include $(TOP)/configs/freebsd-dri
 CONFIG_NAME = freebsd-dri-x86-64
 
 ASM_FLAGS = -DUSE_X86_64_ASM
-ASM_SOURCES = $(X86-64_SOURCES)
-ASM_API = $(X86-64_API)
+MESA_ASM_SOURCES = $(X86-64_SOURCES)
+GLAPI_ASM_SOURCES = $(X86-64_API)
diff --git a/configs/freebsd-dri-x86 b/configs/freebsd-dri-x86
index af0d27ff47..9475437fc5 100644
--- a/configs/freebsd-dri-x86
+++ b/configs/freebsd-dri-x86
@@ -9,5 +9,5 @@ CONFIG_NAME = freebsd-dri-x86
 PIC_FLAGS = 
 
 ASM_FLAGS = -DUSE_X86_ASM -DUSE_MMX_ASM -DUSE_3DNOW_ASM -DUSE_SSE_ASM
-ASM_SOURCES = $(X86_SOURCES)
-ASM_API = $(X86_API)
+MESA_ASM_SOURCES = $(X86_SOURCES)
+GLAPI_ASM_SOURCES = $(X86_API)
diff --git a/configs/linux-directfb b/configs/linux-directfb
index dff27f7850..2ed94fe275 100644
--- a/configs/linux-directfb
+++ b/configs/linux-directfb
@@ -17,8 +17,8 @@ HAVE_X86 = $(shell uname -m | grep 'i[3-6]86' >/dev/null && echo yes)
 ifeq ($(HAVE_X86), yes)
      CFLAGS   += -DUSE_X86_ASM -DUSE_MMX_ASM -DUSE_3DNOW_ASM -DUSE_SSE_ASM
      CXXFLAGS += -DUSE_X86_ASM -DUSE_MMX_ASM -DUSE_3DNOW_ASM -DUSE_SSE_ASM
-     ASM_SOURCES = $(X86_SOURCES)
-     ASM_API = $(X86_API)
+     MESA_ASM_SOURCES = $(X86_SOURCES)
+     GLAPI_ASM_SOURCES = $(X86_API)
 endif
 
 # Directories
diff --git a/configs/linux-dri b/configs/linux-dri
index c45b600013..67e60cbd4c 100644
--- a/configs/linux-dri
+++ b/configs/linux-dri
@@ -33,7 +33,7 @@ CFLAGS = -Wall -Wmissing-prototypes -std=c99 -ffast-math \
 CXXFLAGS = -Wall $(OPT_FLAGS) $(PIC_FLAGS) $(ARCH_FLAGS) $(DEFINES)
 
 
-ASM_SOURCES = 
+MESA_ASM_SOURCES = 
 
 # Library/program dependencies
 EXTRA_LIB_PATH=-L/usr/X11R6/lib
diff --git a/configs/linux-dri-ppc b/configs/linux-dri-ppc
index fb87688065..a3a3ca83cb 100644
--- a/configs/linux-dri-ppc
+++ b/configs/linux-dri-ppc
@@ -9,7 +9,7 @@ OPT_FLAGS = -Os -mcpu=603
 PIC_FLAGS = -fPIC
 
 ASM_FLAGS = -DUSE_PPC_ASM -DUSE_VMX_ASM
-ASM_SOURCES = $(PPC_SOURCES)
+MESA_ASM_SOURCES = $(PPC_SOURCES)
 
 # Build only the drivers for cards that exist on PowerPC.  At some point MGA
 # will be added, but not yet.
diff --git a/configs/linux-dri-x86 b/configs/linux-dri-x86
index b196004e58..ec8242dd68 100644
--- a/configs/linux-dri-x86
+++ b/configs/linux-dri-x86
@@ -12,6 +12,6 @@ PIC_FLAGS =
 ARCH_FLAGS = -m32
 
 ASM_FLAGS = -DUSE_X86_ASM -DUSE_MMX_ASM -DUSE_3DNOW_ASM -DUSE_SSE_ASM
-ASM_SOURCES = $(X86_SOURCES)
-ASM_API = $(X86_API)
+MESA_ASM_SOURCES = $(X86_SOURCES)
+GLAPI_ASM_SOURCES = $(X86_API)
 
diff --git a/configs/linux-dri-x86-64 b/configs/linux-dri-x86-64
index 821ab3e336..bb56de375a 100644
--- a/configs/linux-dri-x86-64
+++ b/configs/linux-dri-x86-64
@@ -8,8 +8,8 @@ CONFIG_NAME = linux-dri-x86-64
 ARCH_FLAGS = -m64
 
 ASM_FLAGS = -DUSE_X86_64_ASM
-ASM_SOURCES = $(X86-64_SOURCES)
-ASM_API = $(X86-64_API)
+MESA_ASM_SOURCES = $(X86-64_SOURCES)
+GLAPI_ASM_SOURCES = $(X86-64_API)
 
 LIB_DIR = lib64
 
diff --git a/configs/linux-dri-xcb b/configs/linux-dri-xcb
index ea4bdf1864..fbf9b9b268 100644
--- a/configs/linux-dri-xcb
+++ b/configs/linux-dri-xcb
@@ -33,7 +33,7 @@ CFLAGS = -Wall -Wmissing-prototypes $(OPT_FLAGS) $(PIC_FLAGS) $(ARCH_FLAGS) \
 CXXFLAGS = -Wall $(OPT_FLAGS) $(PIC_FLAGS) $(ARCH_FLAGS) $(DEFINES)
 
 
-ASM_SOURCES = 
+MESA_ASM_SOURCES = 
 
 # Library/program dependencies
 EXTRA_LIB_PATH=$(shell pkg-config --libs-only-L x11)
diff --git a/configs/linux-icc b/configs/linux-icc
index 978a45af70..d90a1dab3d 100644
--- a/configs/linux-icc
+++ b/configs/linux-icc
@@ -16,7 +16,7 @@ GL_LIB_DEPS = -L/usr/X11R6/lib -lX11 -lXext -lm -lpthread
 GLUT_LIB_DEPS = -L$(TOP)/$(LIB_DIR) -l$(GLU_LIB) -l$(GL_LIB) -L/usr/X11R6/lib -lX11 -lXmu -lXt -lXi -lm
 APP_LIB_DEPS = -L$(TOP)/$(LIB_DIR) -l$(GLUT_LIB) -l$(GLU_LIB) -l$(GL_LIB) -lm
 
-ASM_SOURCES = $(X86_SOURCES)
-ASM_API = $(X86_API)
+MESA_ASM_SOURCES = $(X86_SOURCES)
+GLAPI_ASM_SOURCES = $(X86_API)
 
 
diff --git a/configs/linux-icc-static b/configs/linux-icc-static
index 0c957568c2..384db3bfe4 100644
--- a/configs/linux-icc-static
+++ b/configs/linux-icc-static
@@ -23,5 +23,5 @@ GL_LIB_DEPS =
 GLUT_LIB_DEPS =
 APP_LIB_DEPS = -L$(TOP)/$(LIB_DIR) -l$(GLUT_LIB) -l$(GLU_LIB) -l$(GL_LIB) -L/usr/X11R6/lib -lX11 -lXmu -lXt -lXi -lm -lpthread -lcxa -lunwind
 
-ASM_SOURCES = $(X86_SOURCES)
-ASM_API = $(X86_API)
+MESA_ASM_SOURCES = $(X86_SOURCES)
+GLAPI_ASM_SOURCES = $(X86_API)
diff --git a/configs/linux-indirect b/configs/linux-indirect
index bd33345ed7..0c4805ea87 100644
--- a/configs/linux-indirect
+++ b/configs/linux-indirect
@@ -34,7 +34,7 @@ CFLAGS   = $(WARN_FLAGS) $(OPT_FLAGS) $(PIC_FLAGS) $(ARCH_FLAGS) $(DEFINES) \
 CXXFLAGS = $(WARN_FLAGS) $(OPT_FLAGS) $(PIC_FLAGS) $(ARCH_FLAGS) $(DEFINES)
 
 
-ASM_SOURCES = 
+MESA_ASM_SOURCES = 
 
 # Library/program dependencies
 EXTRA_LIB_PATH=-L/usr/X11R6/lib
diff --git a/configs/linux-solo b/configs/linux-solo
index d49b972228..3145e12775 100644
--- a/configs/linux-solo
+++ b/configs/linux-solo
@@ -33,7 +33,7 @@ CFLAGS   = $(WARN_FLAGS) $(OPT_FLAGS) $(PIC_FLAGS) $(ARCH_FLAGS) $(DEFINES) \
 CXXFLAGS = $(WARN_FLAGS) $(OPT_FLAGS) $(PIC_FLAGS) $(ARCH_FLAGS) $(DEFINES)
 
 
-ASM_SOURCES = 
+MESA_ASM_SOURCES = 
 
 # Library/program dependencies
 DRI_LIB_DEPS = -lm -lpthread -lexpat -ldl -L$(TOP)/$(LIB_DIR) $(PCIACCESS_LIB)
diff --git a/configs/linux-solo-x86 b/configs/linux-solo-x86
index 13cab37658..5f5aa09c82 100644
--- a/configs/linux-solo-x86
+++ b/configs/linux-solo-x86
@@ -9,5 +9,5 @@ CONFIG_NAME = linux-solo-x86
 PIC_FLAGS = 
 
 ASM_FLAGS = -DUSE_X86_ASM -DUSE_MMX_ASM -DUSE_3DNOW_ASM -DUSE_SSE_ASM
-ASM_SOURCES = $(X86_SOURCES)
-ASM_API = $(X86_API)
+MESA_ASM_SOURCES = $(X86_SOURCES)
+GLAPI_ASM_SOURCES = $(X86_API)
diff --git a/configs/linux-sparc b/configs/linux-sparc
index 9925afc19b..346d438c28 100644
--- a/configs/linux-sparc
+++ b/configs/linux-sparc
@@ -5,5 +5,5 @@ include $(TOP)/configs/linux
 CONFIG_NAME = linux-sparc
 
 #ASM_FLAGS = -DUSE_SPARC_ASM
-#ASM_SOURCES = $(SPARC_SOURCES)
-#ASM_API = $(SPARC_API)
+#MESA_ASM_SOURCES = $(SPARC_SOURCES)
+#GLAPI_ASM_SOURCES = $(SPARC_API)
diff --git a/configs/linux-x86 b/configs/linux-x86
index 18fa06101d..a4cf4e8d62 100644
--- a/configs/linux-x86
+++ b/configs/linux-x86
@@ -5,5 +5,5 @@ include $(TOP)/configs/linux
 CONFIG_NAME = linux-x86
 
 ASM_FLAGS = -DUSE_X86_ASM -DUSE_MMX_ASM -DUSE_3DNOW_ASM -DUSE_SSE_ASM
-ASM_SOURCES = $(X86_SOURCES)
-ASM_API = $(X86_API)
+MESA_ASM_SOURCES = $(X86_SOURCES)
+GLAPI_ASM_SOURCES = $(X86_API)
diff --git a/configs/linux-x86-64 b/configs/linux-x86-64
index 67c0391836..c2441e09d0 100644
--- a/configs/linux-x86-64
+++ b/configs/linux-x86-64
@@ -6,8 +6,8 @@ CONFIG_NAME = linux-x86-64
 
 ARCH_FLAGS = -m64
 
-ASM_SOURCES = $(X86-64_SOURCES)
-ASM_API = $(X86-64_API)
+MESA_ASM_SOURCES = $(X86-64_SOURCES)
+GLAPI_ASM_SOURCES = $(X86-64_API)
 ASM_FLAGS = -DUSE_X86_64_ASM
 
 LIB_DIR = lib64
diff --git a/configs/linux-x86-glide b/configs/linux-x86-glide
index f2f8aeea60..b963fbdc66 100644
--- a/configs/linux-x86-glide
+++ b/configs/linux-x86-glide
@@ -15,8 +15,8 @@ CXXFLAGS = -Wall -O3 -ansi -pedantic -fPIC -D_POSIX_SOURCE -D_POSIX_C_SOURCE=199
 GLUT_CFLAGS = -fexceptions
 
 
-ASM_SOURCES = $(X86_SOURCES)
-ASM_API = $(X86_API)
+MESA_ASM_SOURCES = $(X86_SOURCES)
+GLAPI_ASM_SOURCES = $(X86_API)
 
 # Library/program dependencies
 GL_LIB_DEPS = -L/usr/X11R6/lib -lX11 -lXext -L/usr/local/glide/lib -lglide3x -lm -lpthread
diff --git a/configs/sunos5-gcc b/configs/sunos5-gcc
index 77b293c545..3fa13d0496 100644
--- a/configs/sunos5-gcc
+++ b/configs/sunos5-gcc
@@ -16,8 +16,8 @@ ARCH_FLAGS ?=
 
 DEFINES = -D_REENTRANT -DUSE_XSHM
 
-ASM_SOURCES = $(SPARC_SOURCES)
-ASM_API = $(SPARC_API)
+MESA_ASM_SOURCES = $(SPARC_SOURCES)
+GLAPI_ASM_SOURCES = $(SPARC_API)
 ASM_FLAGS = -DUSE_SPARC_ASM
 
 CFLAGS   = $(WARN_FLAGS) $(OPT_FLAGS) $(PIC_FLAGS) $(ARCH_FLAGS) $(DEFINES) \
diff --git a/src/gallium/auxiliary/cso_cache/Makefile b/src/gallium/auxiliary/cso_cache/Makefile
index 8248b097fd..3e49266163 100644
--- a/src/gallium/auxiliary/cso_cache/Makefile
+++ b/src/gallium/auxiliary/cso_cache/Makefile
@@ -3,15 +3,10 @@ include $(TOP)/configs/current
 
 LIBNAME = cso_cache
 
-DRIVER_SOURCES = \
+C_SOURCES = \
 	cso_cache.c \
 	cso_hash.c
 
-C_SOURCES = \
-	$(DRIVER_SOURCES)
-
-ASM_SOURCES = 
-
 include ../../Makefile.template
 
 symlinks:
diff --git a/src/gallium/auxiliary/draw/Makefile b/src/gallium/auxiliary/draw/Makefile
index c8000cbe9c..1ee9eca0ca 100644
--- a/src/gallium/auxiliary/draw/Makefile
+++ b/src/gallium/auxiliary/draw/Makefile
@@ -3,7 +3,7 @@ include $(TOP)/configs/current
 
 LIBNAME = draw
 
-DRIVER_SOURCES = \
+C_SOURCES = \
 	draw_aaline.c \
 	draw_clip.c \
 	draw_vs_exec.c \
@@ -29,11 +29,6 @@ DRIVER_SOURCES = \
 	draw_vf_sse.c \
 	draw_wide_prims.c
 
-C_SOURCES = \
-	$(DRIVER_SOURCES)
-
-ASM_SOURCES = 
-
 include ../../Makefile.template
 
 symlinks:
diff --git a/src/gallium/auxiliary/pipebuffer/Makefile b/src/gallium/auxiliary/pipebuffer/Makefile
index 588629e870..a9fa518c67 100644
--- a/src/gallium/auxiliary/pipebuffer/Makefile
+++ b/src/gallium/auxiliary/pipebuffer/Makefile
@@ -1,10 +1,9 @@
-
 TOP = ../../../..
 include $(TOP)/configs/current
 
 LIBNAME = pipebuffer
 
-DRIVER_SOURCES = \
+C_SOURCES = \
 	pb_buffer_fenced.c \
 	pb_buffer_malloc.c \
 	pb_bufmgr_fenced.c \
@@ -12,11 +11,6 @@ DRIVER_SOURCES = \
 	pb_bufmgr_pool.c \
 	pb_winsys.c
 
-C_SOURCES = \
-	$(DRIVER_SOURCES)
-
-ASM_SOURCES = 
-
 include ../../Makefile.template
 
 symlinks:
diff --git a/src/gallium/auxiliary/rtasm/Makefile b/src/gallium/auxiliary/rtasm/Makefile
index edfae2a204..bc339d2aa6 100644
--- a/src/gallium/auxiliary/rtasm/Makefile
+++ b/src/gallium/auxiliary/rtasm/Makefile
@@ -1,18 +1,12 @@
-
 TOP = ../../../..
 include $(TOP)/configs/current
 
 LIBNAME = rtasm
 
-DRIVER_SOURCES = \
+C_SOURCES = \
 	rtasm_execmem.c \
 	rtasm_x86sse.c
 
-C_SOURCES = \
-	$(DRIVER_SOURCES)
-
-ASM_SOURCES = 
-
 include ../../Makefile.template
 
 symlinks:
diff --git a/src/gallium/auxiliary/tgsi/Makefile b/src/gallium/auxiliary/tgsi/Makefile
index 8bb62b2a0a..71f64b747c 100644
--- a/src/gallium/auxiliary/tgsi/Makefile
+++ b/src/gallium/auxiliary/tgsi/Makefile
@@ -1,10 +1,9 @@
-
 TOP = ../../../..
 include $(TOP)/configs/current
 
 LIBNAME = tgsi
 
-DRIVER_SOURCES = \
+C_SOURCES = \
 	exec/tgsi_exec.c \
 	exec/tgsi_sse2.c \
 	util/tgsi_build.c \
@@ -13,11 +12,6 @@ DRIVER_SOURCES = \
 	util/tgsi_transform.c \
 	util/tgsi_util.c
 
-C_SOURCES = \
-	$(DRIVER_SOURCES)
-
-ASM_SOURCES = 
-
 include ../../Makefile.template
 
 symlinks:
diff --git a/src/gallium/auxiliary/util/Makefile b/src/gallium/auxiliary/util/Makefile
index 7cc2aa44f9..906a46d6b4 100644
--- a/src/gallium/auxiliary/util/Makefile
+++ b/src/gallium/auxiliary/util/Makefile
@@ -1,20 +1,14 @@
-
 TOP = ../../../..
 include $(TOP)/configs/current
 
 LIBNAME = util
 
-DRIVER_SOURCES = \
+C_SOURCES = \
 	p_debug.c \
 	p_tile.c \
 	p_util.c \
 	u_mm.c
 
-C_SOURCES = \
-	$(DRIVER_SOURCES)
-
-ASM_SOURCES = 
-
 include ../../Makefile.template
 
 symlinks:
diff --git a/src/gallium/drivers/failover/Makefile b/src/gallium/drivers/failover/Makefile
index 14389bd055..f08b8df07a 100644
--- a/src/gallium/drivers/failover/Makefile
+++ b/src/gallium/drivers/failover/Makefile
@@ -1,20 +1,13 @@
-
 TOP = ../../../..
 include $(TOP)/configs/current
 
 LIBNAME = failover
 
-DRIVER_SOURCES = \
+C_SOURCES = \
 	fo_state.c \
 	fo_state_emit.c \
 	fo_context.c 
 
-C_SOURCES = \
-	$(COMMON_SOURCES) \
-	$(DRIVER_SOURCES)
-
-ASM_SOURCES = 
-
 include ../../Makefile.template
 
 symlinks:
diff --git a/src/gallium/drivers/i915simple/Makefile b/src/gallium/drivers/i915simple/Makefile
index ee22ba86f9..2a75f5d57c 100644
--- a/src/gallium/drivers/i915simple/Makefile
+++ b/src/gallium/drivers/i915simple/Makefile
@@ -1,10 +1,9 @@
-
 TOP = ../../../..
 include $(TOP)/configs/current
 
 LIBNAME = i915simple
 
-DRIVER_SOURCES = \
+C_SOURCES = \
 	i915_blit.c \
 	i915_clear.c \
 	i915_flush.c \
@@ -26,12 +25,6 @@ DRIVER_SOURCES = \
 	i915_fpc_translate.c \
 	i915_surface.c 
 
-C_SOURCES = \
-	$(COMMON_SOURCES) \
-	$(DRIVER_SOURCES)
-
-ASM_SOURCES = 
-
 include ../../Makefile.template
 
 symlinks:
diff --git a/src/gallium/drivers/i965simple/Makefile b/src/gallium/drivers/i965simple/Makefile
index 1dec1f9749..cc8580836c 100644
--- a/src/gallium/drivers/i965simple/Makefile
+++ b/src/gallium/drivers/i965simple/Makefile
@@ -1,14 +1,13 @@
-
 TOP = ../../../..
 include $(TOP)/configs/current
 
 LIBNAME = i965simple
 
-DRIVER_SOURCES = \
-        brw_blit.c \
-        brw_flush.c \
-        brw_strings.c \
-        brw_surface.c \
+C_SOURCES = \
+	brw_blit.c \
+	brw_flush.c \
+	brw_strings.c \
+	brw_surface.c \
 	brw_cc.c \
 	brw_clip.c \
 	brw_clip_line.c \
@@ -31,8 +30,8 @@ DRIVER_SOURCES = \
 	brw_sf.c \
 	brw_sf_emit.c \
 	brw_sf_state.c \
-        brw_shader_info.c \
-        brw_state.c \
+	brw_shader_info.c \
+	brw_state.c \
 	brw_state_batch.c \
 	brw_state_cache.c \
 	brw_state_pool.c \
@@ -51,16 +50,6 @@ DRIVER_SOURCES = \
 	brw_wm_state.c \
 	brw_wm_surface_state.c
 
-C_SOURCES = \
-	$(COMMON_SOURCES) \
-	$(COMMON_BM_SOURCES) \
-	$(MINIGLX_SOURCES) \
-	$(DRIVER_SOURCES)
-
-ASM_SOURCES =
-
-DRIVER_DEFINES = -I.
-
 include ../../Makefile.template
 
 symlinks:
diff --git a/src/gallium/drivers/softpipe/Makefile b/src/gallium/drivers/softpipe/Makefile
index 5479daf8ea..539ffb77f5 100644
--- a/src/gallium/drivers/softpipe/Makefile
+++ b/src/gallium/drivers/softpipe/Makefile
@@ -1,10 +1,9 @@
-
 TOP = ../../../..
 include $(TOP)/configs/current
 
 LIBNAME = softpipe
 
-DRIVER_SOURCES = \
+C_SOURCES = \
 	sp_fs_exec.c \
 	sp_fs_sse.c \
 	sp_fs_llvm.c \
@@ -41,12 +40,6 @@ DRIVER_SOURCES = \
 	sp_tile_cache.c \
 	sp_surface.c 
 
-C_SOURCES = \
-	$(COMMON_SOURCES) \
-	$(DRIVER_SOURCES)
-
-ASM_SOURCES = 
-
 include ../../Makefile.template
 
 symlinks:
diff --git a/src/gallium/winsys/dri/Makefile.template b/src/gallium/winsys/dri/Makefile.template
index 65a93bd53e..3bc1fdd4d4 100644
--- a/src/gallium/winsys/dri/Makefile.template
+++ b/src/gallium/winsys/dri/Makefile.template
@@ -25,8 +25,9 @@ WINOBJ=
 WINLIB=
 INCLUDES = $(SHARED_INCLUDES) $(EXPAT_INCLUDES)
 
-OBJECTS = $(C_SOURCES:.c=.o) \
-	  $(ASM_SOURCES:.S=.o) 
+OBJECTS = \
+	$(C_SOURCES:.c=.o) \
+	$(ASM_SOURCES:.S=.o) 
 
 else
 # miniglx
diff --git a/src/glx/x11/Makefile b/src/glx/x11/Makefile
index 5f74fcff06..b404727f08 100644
--- a/src/glx/x11/Makefile
+++ b/src/glx/x11/Makefile
@@ -35,7 +35,7 @@ SOURCES = \
 
 include $(TOP)/src/mesa/sources
 
-MESA_ASM_API = $(addprefix $(TOP)/src/mesa/, $(ASM_API))
+MESA_GLAPI_ASM_SOURCES = $(addprefix $(TOP)/src/mesa/, $(GLAPI_ASM_SOURCES))
 MESA_GLAPI_SOURCES = $(addprefix $(TOP)/src/mesa/, $(GLAPI_SOURCES))
 MESA_GLAPI_OBJECTS = $(addprefix $(TOP)/src/mesa/, $(GLAPI_OBJECTS))
 
@@ -70,11 +70,11 @@ $(TOP)/$(LIB_DIR)/$(GL_LIB_NAME):  $(OBJECTS) Makefile
 		-install $(TOP)/$(LIB_DIR) $(GL_LIB_DEPS) $(OBJECTS)
 
 
-depend: $(SOURCES) $(MESA_GLAPI_SOURCES) $(MESA_ASM_API) Makefile
+depend: $(SOURCES) $(MESA_GLAPI_SOURCES) $(MESA_GLAPI_ASM_SOURCES) Makefile
 	rm -f depend
 	touch depend
 	$(MKDEP) $(MKDEP_OPTIONS) $(INCLUDES) $(SOURCES) \
-		$(MESA_GLAPI_SOURCES) $(MESA_ASM_API) 
+		$(MESA_GLAPI_SOURCES) $(MESA_GLAPI_ASM_SOURCES) 
 
 
 # Emacs tags
diff --git a/src/mesa/sources b/src/mesa/sources
index 0d185fd5f3..9e56694893 100644
--- a/src/mesa/sources
+++ b/src/mesa/sources
@@ -320,7 +320,7 @@ FBDEV_DRIVER_SOURCES =			\
 ALL_SOURCES = \
 	$(GLAPI_SOURCES)	\
 	$(SOLO_SOURCES)		\
-	$(ASM_SOURCES)		\
+	$(MESA_ASM_SOURCES)		\
 	$(COMMON_DRIVER_SOURCES)\
 	$(X11_DRIVER_SOURCES)	\
 	$(FBDEV_DRIVER_SOURCES) \
@@ -353,11 +353,11 @@ CORE_SOURCES = \
 
 SOLO_OBJECTS = \
 	$(SOLO_SOURCES:.c=.o) \
-	$(ASM_SOURCES:.S=.o)
+	$(MESA_ASM_SOURCES:.S=.o)
 
 GLAPI_OBJECTS = \
 	$(GLAPI_SOURCES:.c=.o) \
-	$(ASM_API:.S=.o)
+	$(GLAPI_ASM_SOURCES:.S=.o)
 
 CORE_OBJECTS = $(SOLO_OBJECTS) $(GLAPI_OBJECTS)
 
-- 
cgit v1.2.3


From b0eef0dc2557febea7d425fee1f9c2da382898a6 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Tue, 19 Feb 2008 13:41:38 +0900
Subject: Add run-time cpu capabilities detection stubs.

---
 src/gallium/auxiliary/draw/draw_vf_sse.c |  6 ++--
 src/gallium/auxiliary/rtasm/Makefile     |  1 +
 src/gallium/auxiliary/rtasm/SConscript   |  1 +
 src/gallium/auxiliary/rtasm/rtasm_cpu.c  | 50 ++++++++++++++++++++++++++++++++
 src/gallium/auxiliary/rtasm/rtasm_cpu.h  | 42 +++++++++++++++++++++++++++
 5 files changed, 97 insertions(+), 3 deletions(-)
 create mode 100644 src/gallium/auxiliary/rtasm/rtasm_cpu.c
 create mode 100644 src/gallium/auxiliary/rtasm/rtasm_cpu.h

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/draw/draw_vf_sse.c b/src/gallium/auxiliary/draw/draw_vf_sse.c
index 1e889deeea..6076f9849d 100644
--- a/src/gallium/auxiliary/draw/draw_vf_sse.c
+++ b/src/gallium/auxiliary/draw/draw_vf_sse.c
@@ -35,8 +35,8 @@
 
 #if defined(USE_SSE_ASM)
 
+#include "rtasm/rtasm_cpu.h"
 #include "rtasm/rtasm_x86sse.h"
-#include "x86/common_x86_asm.h"
 
 
 #define X    0
@@ -576,7 +576,7 @@ void draw_vf_generate_sse_emit( struct draw_vertex_fetch *vf )
 {
    struct x86_program p;   
 
-   if (!cpu_has_xmm) {
+   if (!rtasm_cpu_has_sse()) {
       vf->codegen_emit = NULL;
       return;
    }
@@ -586,7 +586,7 @@ void draw_vf_generate_sse_emit( struct draw_vertex_fetch *vf )
    p.vf = vf;
    p.inputs_safe = 0;		/* for now */
    p.outputs_safe = 1;		/* for now */
-   p.have_sse2 = cpu_has_xmm2;
+   p.have_sse2 = rtasm_cpu_has_sse2();
    p.identity = x86_make_reg(file_XMM, 6);
    p.chan0 = x86_make_reg(file_XMM, 7);
 
diff --git a/src/gallium/auxiliary/rtasm/Makefile b/src/gallium/auxiliary/rtasm/Makefile
index bc339d2aa6..9b972f8f86 100644
--- a/src/gallium/auxiliary/rtasm/Makefile
+++ b/src/gallium/auxiliary/rtasm/Makefile
@@ -4,6 +4,7 @@ include $(TOP)/configs/current
 LIBNAME = rtasm
 
 C_SOURCES = \
+	rtasm_cpu.c \
 	rtasm_execmem.c \
 	rtasm_x86sse.c
 
diff --git a/src/gallium/auxiliary/rtasm/SConscript b/src/gallium/auxiliary/rtasm/SConscript
index 6eca1fe4c0..ac41a4f212 100644
--- a/src/gallium/auxiliary/rtasm/SConscript
+++ b/src/gallium/auxiliary/rtasm/SConscript
@@ -3,6 +3,7 @@ Import('*')
 rtasm = env.ConvenienceLibrary(
 	target = 'rtasm',
 	source = [
+		'rtasm_cpu.c',
 		'rtasm_execmem.c',
 		'rtasm_x86sse.c'
 	])
diff --git a/src/gallium/auxiliary/rtasm/rtasm_cpu.c b/src/gallium/auxiliary/rtasm/rtasm_cpu.c
new file mode 100644
index 0000000000..eb3359750b
--- /dev/null
+++ b/src/gallium/auxiliary/rtasm/rtasm_cpu.c
@@ -0,0 +1,50 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "rtasm_cpu.h"
+
+
+int rtasm_cpu_has_sse(void)
+{
+   /* FIXME: stub, not real detection: returns 1 whenever __i386__ or __386__ is defined, else 0 */
+#if defined(__i386__) || defined(__386__)
+   return 1;
+#else
+   return 0;
+#endif
+}
+
+int rtasm_cpu_has_sse2(void) 
+{
+   /* FIXME: stub, not real detection: returns 1 whenever __i386__ or __386__ is defined, else 0 */
+#if defined(__i386__) || defined(__386__)
+   return 1;
+#else
+   return 0;
+#endif
+}
diff --git a/src/gallium/auxiliary/rtasm/rtasm_cpu.h b/src/gallium/auxiliary/rtasm/rtasm_cpu.h
new file mode 100644
index 0000000000..ebc71634fd
--- /dev/null
+++ b/src/gallium/auxiliary/rtasm/rtasm_cpu.h
@@ -0,0 +1,42 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Runtime detection of CPU capabilities.
+ */
+
+#ifndef _RTASM_CPU_H_
+#define _RTASM_CPU_H_
+
+
+int rtasm_cpu_has_sse(void);
+
+int rtasm_cpu_has_sse2(void);
+
+
+#endif /* _RTASM_CPU_H_ */
-- 
cgit v1.2.3


From 5d78212d752e021555356bbb9cc5993ad6d9e847 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Tue, 19 Feb 2008 14:00:16 +0900
Subject: Bring in ppc spe rtasm into gallium's rtasm module.

Moving files since these are not being used outside gallium.
---
 src/gallium/auxiliary/rtasm/Makefile             |   3 +-
 src/gallium/auxiliary/rtasm/SConscript           |   3 +-
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c      | 386 +++++++++++++++++++++++
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h      | 314 ++++++++++++++++++
 src/gallium/drivers/cell/ppu/cell_context.h      |   2 +-
 src/gallium/drivers/cell/ppu/cell_vertex_fetch.c |   2 +-
 src/mesa/ppc/rtasm/spe_asm.c                     | 385 ----------------------
 src/mesa/ppc/rtasm/spe_asm.h                     | 314 ------------------
 src/mesa/sources                                 |   1 -
 9 files changed, 706 insertions(+), 704 deletions(-)
 create mode 100644 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
 create mode 100644 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
 delete mode 100644 src/mesa/ppc/rtasm/spe_asm.c
 delete mode 100644 src/mesa/ppc/rtasm/spe_asm.h

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/Makefile b/src/gallium/auxiliary/rtasm/Makefile
index 9b972f8f86..39b8a4dbd7 100644
--- a/src/gallium/auxiliary/rtasm/Makefile
+++ b/src/gallium/auxiliary/rtasm/Makefile
@@ -6,7 +6,8 @@ LIBNAME = rtasm
 C_SOURCES = \
 	rtasm_cpu.c \
 	rtasm_execmem.c \
-	rtasm_x86sse.c
+	rtasm_x86sse.c \
+	rtasm_ppc_spe.c
 
 include ../../Makefile.template
 
diff --git a/src/gallium/auxiliary/rtasm/SConscript b/src/gallium/auxiliary/rtasm/SConscript
index ac41a4f212..8ea25922aa 100644
--- a/src/gallium/auxiliary/rtasm/SConscript
+++ b/src/gallium/auxiliary/rtasm/SConscript
@@ -5,7 +5,8 @@ rtasm = env.ConvenienceLibrary(
 	source = [
 		'rtasm_cpu.c',
 		'rtasm_execmem.c',
-		'rtasm_x86sse.c'
+		'rtasm_x86sse.c',
+		'rtasm_ppc_spe.c',
 	])
 
 auxiliaries.insert(0, rtasm)
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
new file mode 100644
index 0000000000..95a2d6fcbb
--- /dev/null
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -0,0 +1,386 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file
+ * Run-time assembly generation interface for Cell B.E. SPEs.
+ *
+ * \author Ian Romanick <idr@us.ibm.com>
+ */
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_util.h"
+#include "rtasm_ppc_spe.h"
+
+#ifdef GALLIUM_CELL
+/**
+ * SPE instruction types
+ *
+ * Seven instruction encodings used on the Cell's SPEs are defined here.  Each
+ * of the following unions encodes one type.
+ *
+ * \bug
+ * If, at some point, we start generating SPE code from a little-endian host
+ * these unions will not work.
+ */
+/*@{*/
+/**
+ * Encode one output register with two input registers
+ */
+union spe_inst_RR {
+    uint32_t bits;
+    struct {
+	unsigned op:11;
+	unsigned rB:7;
+	unsigned rA:7;
+	unsigned rT:7;
+    } inst;
+};
+
+
+/**
+ * Encode one output register with three input registers
+ */
+union spe_inst_RRR {
+    uint32_t bits;
+    struct {
+	unsigned op:4;
+	unsigned rT:7;
+	unsigned rB:7;
+	unsigned rA:7;
+	unsigned rC:7;
+    } inst;
+};
+
+
+/**
+ * Encode one output register with one input reg. and a 7-bit signed immed
+ */
+union spe_inst_RI7 {
+    uint32_t bits;
+    struct {
+	unsigned op:11;
+	unsigned i7:7;
+	unsigned rA:7;
+	unsigned rT:7;
+    } inst;
+};
+
+
+/**
+ * Encode one output register with one input reg. and an 8-bit signed immed
+ */
+union spe_inst_RI8 {
+    uint32_t bits;
+    struct {
+	unsigned op:10;
+	unsigned i8:8;
+	unsigned rA:7;
+	unsigned rT:7;
+    } inst;
+};
+
+
+/**
+ * Encode one output register with one input reg. and a 10-bit signed immed
+ */
+union spe_inst_RI10 {
+    uint32_t bits;
+    struct {
+	unsigned op:8;
+	unsigned i10:10;
+	unsigned rA:7;
+	unsigned rT:7;
+    } inst;
+};
+
+
+/**
+ * Encode one output register with a 16-bit signed immediate
+ */
+union spe_inst_RI16 {
+    uint32_t bits;
+    struct {
+	unsigned op:9;
+	unsigned i16:16;
+	unsigned rT:7;
+    } inst;
+};
+
+
+/**
+ * Encode one output register with an 18-bit signed immediate
+ */
+union spe_inst_RI18 {
+    uint32_t bits;
+    struct {
+	unsigned op:7;
+	unsigned i18:18;
+	unsigned rT:7;
+    } inst;
+};
+/*@}*/
+
+
+static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
+		    unsigned rA, unsigned rB)
+{   /* Pack op/rB/rA/rT into one RR-form word and append it at p->csr. */
+    union spe_inst_RR inst;
+    inst.inst.op = op;
+    inst.inst.rB = rB;
+    inst.inst.rA = rA;
+    inst.inst.rT = rT;
+    *p->csr = inst.bits;   /* no check against the size given to spe_init_func() */
+    p->csr++;              /* cursor now points at the next free word */
+}
+
+
+static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
+		    unsigned rA, unsigned rB, unsigned rC)
+{   /* Pack op/rT/rB/rA/rC into one RRR-form word and append it at p->csr. */
+    union spe_inst_RRR inst;
+    inst.inst.op = op;
+    inst.inst.rT = rT;
+    inst.inst.rB = rB;
+    inst.inst.rA = rA;
+    inst.inst.rC = rC;
+    *p->csr = inst.bits;   /* no check against the size given to spe_init_func() */
+    p->csr++;              /* cursor now points at the next free word */
+}
+
+
+static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
+		     unsigned rA, int imm)
+{   /* Pack op/imm/rA/rT into one RI7-form word and append it at p->csr. */
+    union spe_inst_RI7 inst;
+    inst.inst.op = op;
+    inst.inst.i7 = imm;    /* only the low 7 bits of imm fit in the field */
+    inst.inst.rA = rA;
+    inst.inst.rT = rT;
+    *p->csr = inst.bits;   /* no check against the size given to spe_init_func() */
+    p->csr++;              /* cursor now points at the next free word */
+}
+
+
+
+static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
+		     unsigned rA, int imm)
+{   /* Pack op/imm/rA/rT into one RI8-form word and append it at p->csr. */
+    union spe_inst_RI8 inst;
+    inst.inst.op = op;
+    inst.inst.i8 = imm;    /* only the low 8 bits of imm fit in the field */
+    inst.inst.rA = rA;
+    inst.inst.rT = rT;
+    *p->csr = inst.bits;   /* no check against the size given to spe_init_func() */
+    p->csr++;              /* cursor now points at the next free word */
+}
+
+
+
+static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
+		      unsigned rA, int imm)
+{   /* Pack op/imm/rA/rT into one RI10-form word and append it at p->csr. */
+    union spe_inst_RI10 inst;
+    inst.inst.op = op;
+    inst.inst.i10 = imm;   /* only the low 10 bits of imm fit in the field */
+    inst.inst.rA = rA;
+    inst.inst.rT = rT;
+    *p->csr = inst.bits;   /* no check against the size given to spe_init_func() */
+    p->csr++;              /* cursor now points at the next free word */
+}
+
+
+static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
+		      int imm)
+{   /* Pack op/imm/rT into one RI16-form word and append it at p->csr. */
+    union spe_inst_RI16 inst;
+    inst.inst.op = op;
+    inst.inst.i16 = imm;   /* only the low 16 bits of imm fit in the field */
+    inst.inst.rT = rT;
+    *p->csr = inst.bits;   /* no check against the size given to spe_init_func() */
+    p->csr++;              /* cursor now points at the next free word */
+}
+
+
+static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
+		      int imm)
+{   /* Pack op/imm/rT into one RI18-form word and append it at p->csr. */
+    union spe_inst_RI18 inst;
+    inst.inst.op = op;
+    inst.inst.i18 = imm;   /* only the low 18 bits of imm fit in the field */
+    inst.inst.rT = rT;
+    *p->csr = inst.bits;   /* no check against the size given to spe_init_func() */
+    p->csr++;              /* cursor now points at the next free word */
+}
+
+
+
+
+#define EMIT_(_name, _op) \
+void _name (struct spe_function *p, unsigned rT) \
+{ \
+    emit_RR(p, _op, rT, 0, 0); \
+}
+
+#define EMIT_R(_name, _op) \
+void _name (struct spe_function *p, unsigned rT, unsigned rA) \
+{ \
+    emit_RR(p, _op, rT, rA, 0); \
+}
+
+#define EMIT_RR(_name, _op) \
+void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB) \
+{ \
+    emit_RR(p, _op, rT, rA, rB); \
+}
+
+#define EMIT_RRR(_name, _op) \
+void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB, unsigned rC) \
+{ \
+    emit_RRR(p, _op, rT, rA, rB, rC); \
+}
+
+#define EMIT_RI7(_name, _op) \
+void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
+{ \
+    emit_RI7(p, _op, rT, rA, imm); \
+}
+
+#define EMIT_RI8(_name, _op) \
+void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
+{ /* imm is the scale; encoded bias is 173 for cflts (0x1d8) / cfltu (0x1d9), 155 for csflt / cuflt */ \
+    emit_RI8(p, _op, rT, rA, (((_op) == 0x1d8 || (_op) == 0x1d9) ? 173 : 155) - imm); \
+}
+
+#define EMIT_RI10(_name, _op) \
+void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
+{ \
+    emit_RI10(p, _op, rT, rA, imm); \
+}
+
+#define EMIT_RI16(_name, _op) \
+void _name (struct spe_function *p, unsigned rT, int imm) \
+{ \
+    emit_RI16(p, _op, rT, imm); \
+}
+
+#define EMIT_RI18(_name, _op) \
+void _name (struct spe_function *p, unsigned rT, int imm) \
+{ \
+    emit_RI18(p, _op, rT, imm); \
+}
+
+#define EMIT_I16(_name, _op) \
+void _name (struct spe_function *p, int imm) \
+{ \
+    emit_RI16(p, _op, 0, imm); \
+}
+
+#include "rtasm_ppc_spe.h"
+
+
+/* Point p->store and p->csr at a fresh align_malloc(code_size, 16) buffer.
+ * The align_malloc() result is not checked here before it is stored. */
+void spe_init_func(struct spe_function *p, unsigned code_size)
+{
+    p->store = align_malloc(code_size, 16);
+    p->csr = p->store;
+}
+
+
+void spe_release_func(struct spe_function *p)
+{   /* Hand p->store back to align_free() and clear both pointers. */
+    align_free(p->store);
+    p->store = NULL;
+    p->csr = NULL;
+}
+
+
+void spe_bi(struct spe_function *p, unsigned rA, int d, int e)
+{
+    emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4));
+}
+
+void spe_iret(struct spe_function *p, unsigned rA, int d, int e)
+{
+    emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4));
+}
+
+void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA, int d,
+		int e)
+{
+    emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4));
+}
+
+void spe_bisl(struct spe_function *p, unsigned rT, unsigned rA, int d,
+		int e)
+{
+    emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4));
+}
+
+void spe_biz(struct spe_function *p, unsigned rT, unsigned rA, int d,
+		int e)
+{
+    emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4));
+}
+
+void spe_binz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
+{
+    emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4));
+}
+
+void spe_bihz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
+{
+    emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4));
+}
+
+void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
+{
+    emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4));
+}
+
+
+/* Hint-for-branch instructions
+ */
+#if 0
+hbr;
+hbra;
+hbrr;
+#endif
+
+
+/* Control instructions
+ */
+#if 0
+stop;
+EMIT_RR  (spe_stopd, 0x140);
+EMIT_    (spe_lnop,  0x001);
+EMIT_    (spe_nop,   0x201);
+sync;
+EMIT_    (spe_dsync, 0x003);
+EMIT_R   (spe_mfspr, 0x00c);
+EMIT_R   (spe_mtspr, 0x10c);
+#endif
+
+#endif /* GALLIUM_CELL */
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
new file mode 100644
index 0000000000..10ce44b3a0
--- /dev/null
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -0,0 +1,314 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file
+ * Run-time assembly generation interface for Cell B.E. SPEs.
+ *
+ * \author Ian Romanick <idr@us.ibm.com>
+ */
+
+#ifndef RTASM_PPC_SPE_H
+#define RTASM_PPC_SPE_H
+
+struct spe_function {
+    /**
+     * store: buffer set up by spe_init_func(); csr: next word the emitters write.
+     */
+    uint32_t *store;
+    uint32_t *csr;
+    const char *fn;    /* not touched by spe_init_func()/spe_release_func() */
+};
+
+extern void spe_init_func(struct spe_function *p, unsigned code_size);
+extern void spe_release_func(struct spe_function *p);
+
+#endif /* RTASM_PPC_SPE_H */
+
+#ifndef EMIT_ /* includer supplied no EMIT_* macros: the instruction list below expands to prototypes */
+#define EMIT_(_name, _op) \
+    extern void _name (struct spe_function *p, unsigned rT)
+#define EMIT_R(_name, _op) \
+    extern void _name (struct spe_function *p, unsigned rT, unsigned rA)
+#define EMIT_RR(_name, _op) \
+    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
+			   unsigned rB)
+#define EMIT_RRR(_name, _op) \
+    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
+			   unsigned rB, unsigned rC)
+#define EMIT_RI7(_name, _op) \
+    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
+			   int imm)
+#define EMIT_RI8(_name, _op) \
+    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
+			   int imm)
+#define EMIT_RI10(_name, _op) \
+    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
+			   int imm)
+#define EMIT_RI16(_name, _op) \
+    extern void _name (struct spe_function *p, unsigned rT, int imm)
+#define EMIT_RI18(_name, _op) \
+    extern void _name (struct spe_function *p, unsigned rT, int imm)
+#define EMIT_I16(_name, _op) \
+    extern void _name (struct spe_function *p, int imm)
+#define UNDEF_EMIT_MACROS /* makes the end of this header #undef the macros defined here */
+#endif /* EMIT_ */
+
+
+/* Memory load / store instructions
+ */
+EMIT_RI10(spe_lqd,  0x034);
+EMIT_RR  (spe_lqx,  0x1c4);
+EMIT_RI16(spe_lqa,  0x061);
+EMIT_RI16(spe_lqr,  0x067);
+EMIT_RI10(spe_stqd, 0x024);
+EMIT_RR  (spe_stqx, 0x144);
+EMIT_RI16(spe_stqa, 0x041);
+EMIT_RI16(spe_stqr, 0x047);
+EMIT_RI7 (spe_cbd,  0x1f4);
+EMIT_RR  (spe_cbx,  0x1d4);
+EMIT_RI7 (spe_chd,  0x1f5);
+EMIT_RI7 (spe_chx,  0x1d5);
+EMIT_RI7 (spe_cwd,  0x1f6);
+EMIT_RI7 (spe_cwx,  0x1d6);
+EMIT_RI7 (spe_cdd,  0x1f7);
+EMIT_RI7 (spe_cdx,  0x1d7);
+
+
+/* Constant formation instructions
+ */
+EMIT_RI16(spe_ilh,   0x083);
+EMIT_RI16(spe_ilhu,  0x082);
+EMIT_RI16(spe_il,    0x081);
+EMIT_RI18(spe_ila,   0x021);
+EMIT_RI16(spe_iohl,  0x0c1);
+EMIT_RI16(spe_fsmbi, 0x0c5);
+
+
+/* Integer and logical instructions
+ */
+EMIT_RR  (spe_ah,      0x0c8);
+EMIT_RI10(spe_ahi,     0x01d);
+EMIT_RR  (spe_a,       0x0c0);
+EMIT_RI10(spe_ai,      0x01c);
+EMIT_RR  (spe_sfh,     0x048);
+EMIT_RI10(spe_sfhi,    0x00d);
+EMIT_RR  (spe_sf,      0x040);
+EMIT_RI10(spe_sfi,     0x00c);
+EMIT_RR  (spe_addx,    0x340);
+EMIT_RR  (spe_cg,      0x0c2);
+EMIT_RR  (spe_cgx,     0x342);
+EMIT_RR  (spe_sfx,     0x341);
+EMIT_RR  (spe_bg,      0x042);
+EMIT_RR  (spe_bgx,     0x343);
+EMIT_RR  (spe_mpy,     0x3c4);
+EMIT_RR  (spe_mpyu,    0x3cc);
+EMIT_RI10(spe_mpyi,    0x074);
+EMIT_RI10(spe_mpyui,   0x075);
+EMIT_RRR (spe_mpya,    0x00c);
+EMIT_RR  (spe_mpyh,    0x3c5);
+EMIT_RR  (spe_mpys,    0x3c7);
+EMIT_RR  (spe_mpyhh,   0x3c6);
+EMIT_RR  (spe_mpyhha,  0x346);
+EMIT_RR  (spe_mpyhhu,  0x3ce);
+EMIT_RR  (spe_mpyhhau, 0x34e);
+EMIT_R   (spe_clz,     0x2a5);
+EMIT_R   (spe_cntb,    0x2b4);
+EMIT_R   (spe_fsmb,    0x1b6);
+EMIT_R   (spe_fsmh,    0x1b5);
+EMIT_R   (spe_fsm,     0x1b4);
+EMIT_R   (spe_gbb,     0x1b2);
+EMIT_R   (spe_gbh,     0x1b1);
+EMIT_R   (spe_gb,      0x1b0);
+EMIT_RR  (spe_avgb,    0x0d3);
+EMIT_RR  (spe_absdb,   0x053);
+EMIT_RR  (spe_sumb,    0x253);
+EMIT_R   (spe_xsbh,    0x2b6);
+EMIT_R   (spe_xshw,    0x2ae);
+EMIT_R   (spe_xswd,    0x2a6);
+EMIT_RR  (spe_and,     0x0c1);
+EMIT_RR  (spe_andc,    0x2c1);
+EMIT_RI10(spe_andbi,   0x016);
+EMIT_RI10(spe_andhi,   0x015);
+EMIT_RI10(spe_andi,    0x014);
+EMIT_RR  (spe_or,      0x041);
+EMIT_RR  (spe_orc,     0x2c9);
+EMIT_RI10(spe_orbi,    0x006);
+EMIT_RI10(spe_orhi,    0x005);
+EMIT_RI10(spe_ori,     0x004);
+EMIT_R   (spe_orx,     0x1f0);
+EMIT_RR  (spe_xor,     0x241);
+EMIT_RI10(spe_xorbi,   0x026);
+EMIT_RI10(spe_xorhi,   0x025);
+EMIT_RI10(spe_xori,    0x024);
+EMIT_RR  (spe_nand,    0x0c9);
+EMIT_RR  (spe_nor,     0x049);
+EMIT_RR  (spe_eqv,     0x249);
+EMIT_RRR (spe_selb,    0x008);
+EMIT_RRR (spe_shufb,   0x00b);
+
+
+/* Shift and rotate instructions
+ */
+EMIT_RR  (spe_shlh,      0x05f);
+EMIT_RI7 (spe_shlhi,     0x07f);
+EMIT_RR  (spe_shl,       0x05b);
+EMIT_RI7 (spe_shli,      0x07b);
+EMIT_RR  (spe_shlqbi,    0x1db);
+EMIT_RI7 (spe_shlqbii,   0x1fb);
+EMIT_RR  (spe_shlqby,    0x1df);
+EMIT_RI7 (spe_shlqbyi,   0x1ff);
+EMIT_RR  (spe_shlqbybi,  0x1cf);
+EMIT_RR  (spe_roth,      0x05c);
+EMIT_RI7 (spe_rothi,     0x07c);
+EMIT_RR  (spe_rot,       0x058);
+EMIT_RI7 (spe_roti,      0x078);
+EMIT_RR  (spe_rotqby,    0x1dc);
+EMIT_RI7 (spe_rotqbyi,   0x1fc);
+EMIT_RR  (spe_rotqbybi,  0x1cc);
+EMIT_RR  (spe_rotqbi,    0x1d8);
+EMIT_RI7 (spe_rotqbii,   0x1f8);
+EMIT_RR  (spe_rothm,     0x05d);
+EMIT_RI7 (spe_rothmi,    0x07d);
+EMIT_RR  (spe_rotm,      0x059);
+EMIT_RI7 (spe_rotmi,     0x079);
+EMIT_RR  (spe_rotqmby,   0x1dd);
+EMIT_RI7 (spe_rotqmbyi,  0x1fd);
+EMIT_RR  (spe_rotqmbybi, 0x1cd);
+EMIT_RR  (spe_rotqmbi,   0x1c9);
+EMIT_RI7 (spe_rotqmbii,  0x1f9);
+EMIT_RR  (spe_rotmah,    0x05e);
+EMIT_RI7 (spe_rotmahi,   0x07e);
+EMIT_RR  (spe_rotma,     0x05a);
+EMIT_RI7 (spe_rotmai,    0x07a);
+
+
+/* Compare, branch, and halt instructions
+ */
+EMIT_RR  (spe_heq,       0x3d8);
+EMIT_RI10(spe_heqi,      0x07f);
+EMIT_RR  (spe_hgt,       0x258);
+EMIT_RI10(spe_hgti,      0x04f);
+EMIT_RR  (spe_hlgt,      0x2d8);
+EMIT_RI10(spe_hlgti,     0x05f);
+EMIT_RR  (spe_ceqb,      0x3d0);
+EMIT_RI10(spe_ceqbi,     0x07e);
+EMIT_RR  (spe_ceqh,      0x3c8);
+EMIT_RI10(spe_ceqhi,     0x07d);
+EMIT_RR  (spe_ceq,       0x3c0);
+EMIT_RI10(spe_ceqi,      0x07c);
+EMIT_RR  (spe_cgtb,      0x250);
+EMIT_RI10(spe_cgtbi,     0x04e);
+EMIT_RR  (spe_cgth,      0x248);
+EMIT_RI10(spe_cgthi,     0x04d);
+EMIT_RR  (spe_cgt,       0x240);
+EMIT_RI10(spe_cgti,      0x04c);
+EMIT_RR  (spe_clgtb,     0x2d0);
+EMIT_RI10(spe_clgtbi,    0x05e);
+EMIT_RR  (spe_clgth,     0x2c8);
+EMIT_RI10(spe_clgthi,    0x05d);
+EMIT_RR  (spe_clgt,      0x2c0);
+EMIT_RI10(spe_clgti,     0x05c);
+EMIT_I16 (spe_br,        0x064);
+EMIT_I16 (spe_bra,       0x060);
+EMIT_RI16(spe_brsl,      0x066);
+EMIT_RI16(spe_brasl,     0x062);
+EMIT_RI16(spe_brnz,      0x042);
+EMIT_RI16(spe_brz,       0x040);
+EMIT_RI16(spe_brhnz,     0x046);
+EMIT_RI16(spe_brhz,      0x044);
+
+extern void spe_bi(struct spe_function *p, unsigned rA, int d, int e);
+extern void spe_iret(struct spe_function *p, unsigned rA, int d, int e);
+extern void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA,
+    int d, int e);
+extern void spe_bisl(struct spe_function *p, unsigned rT, unsigned rA,
+    int d, int e);
+extern void spe_biz(struct spe_function *p, unsigned rT, unsigned rA,
+    int d, int e);
+extern void spe_binz(struct spe_function *p, unsigned rT, unsigned rA,
+    int d, int e);
+extern void spe_bihz(struct spe_function *p, unsigned rT, unsigned rA,
+    int d, int e);
+extern void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA,
+    int d, int e);
+
+
+/* Floating-point instructions
+ */
+EMIT_RR  (spe_fa,         0x2c4);
+EMIT_RR  (spe_dfa,        0x2cc);
+EMIT_RR  (spe_fs,         0x2c5);
+EMIT_RR  (spe_dfs,        0x2cd);
+EMIT_RR  (spe_fm,         0x2c6);
+EMIT_RR  (spe_dfm,        0x2ce);
+EMIT_RRR (spe_fma,        0x00e);
+EMIT_RR  (spe_dfma,       0x35c);
+EMIT_RRR (spe_fnms,       0x00d);
+EMIT_RR  (spe_dfnms,      0x35e);
+EMIT_RRR (spe_fms,        0x00f);
+EMIT_RR  (spe_dfms,       0x35d);
+EMIT_RR  (spe_dfnma,      0x35f);
+EMIT_R   (spe_frest,      0x1b8);
+EMIT_R   (spe_frsqest,    0x1b9);
+EMIT_RR  (spe_fi,         0x3d4);
+EMIT_RI8 (spe_csflt,      0x1da);
+EMIT_RI8 (spe_cflts,      0x1d8);
+EMIT_RI8 (spe_cuflt,      0x1db);
+EMIT_RI8 (spe_cfltu,      0x1d9);
+EMIT_R   (spe_frds,       0x3b9);
+EMIT_R   (spe_fesd,       0x3b8);
+EMIT_RR  (spe_dfceq,      0x3c3);
+EMIT_RR  (spe_dfcmeq,     0x3cb);
+EMIT_RR  (spe_dfcgt,      0x2c3);
+EMIT_RR  (spe_dfcmgt,     0x2cb);
+EMIT_RI7 (spe_dftsv,      0x3bf);
+EMIT_RR  (spe_fceq,       0x3c2);
+EMIT_RR  (spe_fcmeq,      0x3ca);
+EMIT_RR  (spe_fcgt,       0x2c2);
+EMIT_RR  (spe_fcmgt,      0x2ca);
+EMIT_R   (spe_fscrwr,     0x3ba);
+EMIT_    (spe_fscrrd,     0x398);
+
+
+/* Channel instructions
+ */
+EMIT_R   (spe_rdch,       0x00d);
+EMIT_R   (spe_rdchcnt,    0x00f);
+EMIT_R   (spe_wrch,       0x10d);
+
+
+#ifdef UNDEF_EMIT_MACROS
+#undef EMIT_
+#undef EMIT_R
+#undef EMIT_RR
+#undef EMIT_RRR
+#undef EMIT_RI7
+#undef EMIT_RI8
+#undef EMIT_RI10
+#undef EMIT_RI16
+#undef EMIT_RI18
+#undef EMIT_I16
+#undef UNDEF_EMIT_MACROS
+#endif /* EMIT_ */
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 91f8e542a2..3b687bb868 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -36,7 +36,7 @@
 #include "draw/draw_vbuf.h"
 #include "cell_winsys.h"
 #include "cell/common.h"
-#include "ppc/rtasm/spe_asm.h"
+#include "rtasm/rtasm_ppc_spe.h"
 
 
 struct cell_vbuf_render;
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
index f10689a959..9cf74bab47 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
@@ -31,7 +31,7 @@
 #include "../auxiliary/draw/draw_private.h"
 
 #include "cell_context.h"
-#include "ppc/rtasm/spe_asm.h"
+#include "rtasm/rtasm_ppc_spe.h"
 
 typedef uint64_t register_mask;
 
diff --git a/src/mesa/ppc/rtasm/spe_asm.c b/src/mesa/ppc/rtasm/spe_asm.c
deleted file mode 100644
index 1037637250..0000000000
--- a/src/mesa/ppc/rtasm/spe_asm.c
+++ /dev/null
@@ -1,385 +0,0 @@
-/*
- * (C) Copyright IBM Corporation 2008
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
- * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * \file spe_asm.c
- * Real-time assembly generation interface for Cell B.E. SPEs.
- *
- * \author Ian Romanick <idr@us.ibm.com>
- */
-#ifdef GALLIUM_CELL
-#include <inttypes.h>
-#include <imports.h>
-#include "spe_asm.h"
-
-/**
- * SPE instruction types
- *
- * There are 6 primary instruction encodings used on the Cell's SPEs.  Each of
- * the following unions encodes one type.
- *
- * \bug
- * If, at some point, we start generating SPE code from a little-endian host
- * these unions will not work.
- */
-/*@{*/
-/**
- * Encode one output register with two input registers
- */
-union spe_inst_RR {
-    uint32_t bits;
-    struct {
-	unsigned op:11;
-	unsigned rB:7;
-	unsigned rA:7;
-	unsigned rT:7;
-    } inst;
-};
-
-
-/**
- * Encode one output register with three input registers
- */
-union spe_inst_RRR {
-    uint32_t bits;
-    struct {
-	unsigned op:4;
-	unsigned rT:7;
-	unsigned rB:7;
-	unsigned rA:7;
-	unsigned rC:7;
-    } inst;
-};
-
-
-/**
- * Encode one output register with one input reg. and a 7-bit signed immed
- */
-union spe_inst_RI7 {
-    uint32_t bits;
-    struct {
-	unsigned op:11;
-	unsigned i7:7;
-	unsigned rA:7;
-	unsigned rT:7;
-    } inst;
-};
-
-
-/**
- * Encode one output register with one input reg. and an 8-bit signed immed
- */
-union spe_inst_RI8 {
-    uint32_t bits;
-    struct {
-	unsigned op:10;
-	unsigned i8:8;
-	unsigned rA:7;
-	unsigned rT:7;
-    } inst;
-};
-
-
-/**
- * Encode one output register with one input reg. and a 10-bit signed immed
- */
-union spe_inst_RI10 {
-    uint32_t bits;
-    struct {
-	unsigned op:8;
-	unsigned i10:10;
-	unsigned rA:7;
-	unsigned rT:7;
-    } inst;
-};
-
-
-/**
- * Encode one output register with a 16-bit signed immediate
- */
-union spe_inst_RI16 {
-    uint32_t bits;
-    struct {
-	unsigned op:9;
-	unsigned i16:16;
-	unsigned rT:7;
-    } inst;
-};
-
-
-/**
- * Encode one output register with a 18-bit signed immediate
- */
-union spe_inst_RI18 {
-    uint32_t bits;
-    struct {
-	unsigned op:7;
-	unsigned i18:18;
-	unsigned rT:7;
-    } inst;
-};
-/*@}*/
-
-
-static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
-		    unsigned rA, unsigned rB)
-{
-    union spe_inst_RR inst;
-    inst.inst.op = op;
-    inst.inst.rB = rB;
-    inst.inst.rA = rA;
-    inst.inst.rT = rT;
-    *p->csr = inst.bits;
-    p->csr++;
-}
-
-
-static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
-		    unsigned rA, unsigned rB, unsigned rC)
-{
-    union spe_inst_RRR inst;
-    inst.inst.op = op;
-    inst.inst.rT = rT;
-    inst.inst.rB = rB;
-    inst.inst.rA = rA;
-    inst.inst.rC = rC;
-    *p->csr = inst.bits;
-    p->csr++;
-}
-
-
-static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
-		     unsigned rA, int imm)
-{
-    union spe_inst_RI7 inst;
-    inst.inst.op = op;
-    inst.inst.i7 = imm;
-    inst.inst.rA = rA;
-    inst.inst.rT = rT;
-    *p->csr = inst.bits;
-    p->csr++;
-}
-
-
-
-static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
-		     unsigned rA, int imm)
-{
-    union spe_inst_RI8 inst;
-    inst.inst.op = op;
-    inst.inst.i8 = imm;
-    inst.inst.rA = rA;
-    inst.inst.rT = rT;
-    *p->csr = inst.bits;
-    p->csr++;
-}
-
-
-
-static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
-		      unsigned rA, int imm)
-{
-    union spe_inst_RI10 inst;
-    inst.inst.op = op;
-    inst.inst.i10 = imm;
-    inst.inst.rA = rA;
-    inst.inst.rT = rT;
-    *p->csr = inst.bits;
-    p->csr++;
-}
-
-
-static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
-		      int imm)
-{
-    union spe_inst_RI16 inst;
-    inst.inst.op = op;
-    inst.inst.i16 = imm;
-    inst.inst.rT = rT;
-    *p->csr = inst.bits;
-    p->csr++;
-}
-
-
-static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
-		      int imm)
-{
-    union spe_inst_RI18 inst;
-    inst.inst.op = op;
-    inst.inst.i18 = imm;
-    inst.inst.rT = rT;
-    *p->csr = inst.bits;
-    p->csr++;
-}
-
-
-
-
-#define EMIT_(_name, _op) \
-void _name (struct spe_function *p, unsigned rT) \
-{ \
-    emit_RR(p, _op, rT, 0, 0); \
-}
-
-#define EMIT_R(_name, _op) \
-void _name (struct spe_function *p, unsigned rT, unsigned rA) \
-{ \
-    emit_RR(p, _op, rT, rA, 0); \
-}
-
-#define EMIT_RR(_name, _op) \
-void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB) \
-{ \
-    emit_RR(p, _op, rT, rA, rB); \
-}
-
-#define EMIT_RRR(_name, _op) \
-void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB, unsigned rC) \
-{ \
-    emit_RRR(p, _op, rT, rA, rB, rC); \
-}
-
-#define EMIT_RI7(_name, _op) \
-void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
-{ \
-    emit_RI7(p, _op, rT, rA, imm); \
-}
-
-#define EMIT_RI8(_name, _op) \
-void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
-{ \
-    emit_RI8(p, _op, rT, rA, 155 - imm); \
-}
-
-#define EMIT_RI10(_name, _op) \
-void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
-{ \
-    emit_RI10(p, _op, rT, rA, imm); \
-}
-
-#define EMIT_RI16(_name, _op) \
-void _name (struct spe_function *p, unsigned rT, int imm) \
-{ \
-    emit_RI16(p, _op, rT, imm); \
-}
-
-#define EMIT_RI18(_name, _op) \
-void _name (struct spe_function *p, unsigned rT, int imm) \
-{ \
-    emit_RI18(p, _op, rT, imm); \
-}
-
-#define EMIT_I16(_name, _op) \
-void _name (struct spe_function *p, int imm) \
-{ \
-    emit_RI16(p, _op, 0, imm); \
-}
-
-#include "spe_asm.h"
-
-
-/*
- */
-void spe_init_func(struct spe_function *p, unsigned code_size)
-{
-    p->store = _mesa_align_malloc(code_size, 16);
-    p->csr = p->store;
-}
-
-
-void spe_release_func(struct spe_function *p)
-{
-    _mesa_align_free(p->store);
-    p->store = NULL;
-    p->csr = NULL;
-}
-
-
-void spe_bi(struct spe_function *p, unsigned rA, int d, int e)
-{
-    emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4));
-}
-
-void spe_iret(struct spe_function *p, unsigned rA, int d, int e)
-{
-    emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4));
-}
-
-void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA, int d,
-		int e)
-{
-    emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4));
-}
-
-void spe_bisl(struct spe_function *p, unsigned rT, unsigned rA, int d,
-		int e)
-{
-    emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4));
-}
-
-void spe_biz(struct spe_function *p, unsigned rT, unsigned rA, int d,
-		int e)
-{
-    emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4));
-}
-
-void spe_binz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
-{
-    emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4));
-}
-
-void spe_bihz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
-{
-    emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4));
-}
-
-void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
-{
-    emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4));
-}
-
-
-/* Hint-for-branch instructions
- */
-#if 0
-hbr;
-hbra;
-hbrr;
-#endif
-
-
-/* Control instructions
- */
-#if 0
-stop;
-EMIT_RR  (spe_stopd, 0x140);
-EMIT_    (spe_lnop,  0x001);
-EMIT_    (spe_nop,   0x201);
-sync;
-EMIT_    (spe_dsync, 0x003);
-EMIT_R   (spe_mfspr, 0x00c);
-EMIT_R   (spe_mtspr, 0x10c);
-#endif
-
-#endif /* GALLIUM_CELL */
diff --git a/src/mesa/ppc/rtasm/spe_asm.h b/src/mesa/ppc/rtasm/spe_asm.h
deleted file mode 100644
index 6d69ae655d..0000000000
--- a/src/mesa/ppc/rtasm/spe_asm.h
+++ /dev/null
@@ -1,314 +0,0 @@
-/*
- * (C) Copyright IBM Corporation 2008
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
- * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * \file spe_asm.h
- * Real-time assembly generation interface for Cell B.E. SPEs.
- *
- * \author Ian Romanick <idr@us.ibm.com>
- */
-
-#ifndef SPE_ASM_H
-#define SPE_ASM_H
-
-struct spe_function {
-    /**
-     *
-     */
-    uint32_t *store;
-    uint32_t *csr;
-    const char *fn;
-};
-
-extern void spe_init_func(struct spe_function *p, unsigned code_size);
-extern void spe_release_func(struct spe_function *p);
-
-#endif /* SPE_ASM_H */
-
-#ifndef EMIT_
-#define EMIT_(name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT)
-#define EMIT_R(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, unsigned rA)
-#define EMIT_RR(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-			   unsigned rB)
-#define EMIT_RRR(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-			   unsigned rB, unsigned rC)
-#define EMIT_RI7(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-			   int imm)
-#define EMIT_RI8(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-			   int imm)
-#define EMIT_RI10(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-			   int imm)
-#define EMIT_RI16(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, int imm)
-#define EMIT_RI18(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, int imm)
-#define EMIT_I16(_name, _op) \
-    extern void _name (struct spe_function *p, int imm)
-#define UNDEF_EMIT_MACROS
-#endif /* EMIT_ */
-
-
-/* Memory load / store instructions
- */
-EMIT_RI10(spe_lqd,  0x034);
-EMIT_RR  (spe_lqx,  0x1c4);
-EMIT_RI16(spe_lqa,  0x061);
-EMIT_RI16(spe_lqr,  0x067);
-EMIT_RI10(spe_stqd, 0x024);
-EMIT_RR  (spe_stqx, 0x144);
-EMIT_RI16(spe_stqa, 0x041);
-EMIT_RI16(spe_stqr, 0x047);
-EMIT_RI7 (spe_cbd,  0x1f4);
-EMIT_RR  (spe_cbx,  0x1d4);
-EMIT_RI7 (spe_chd,  0x1f5);
-EMIT_RI7 (spe_chx,  0x1d5);
-EMIT_RI7 (spe_cwd,  0x1f6);
-EMIT_RI7 (spe_cwx,  0x1d6);
-EMIT_RI7 (spe_cdd,  0x1f7);
-EMIT_RI7 (spe_cdx,  0x1d7);
-
-
-/* Constant formation instructions
- */
-EMIT_RI16(spe_ilh,   0x083);
-EMIT_RI16(spe_ilhu,  0x082);
-EMIT_RI16(spe_il,    0x081);
-EMIT_RI18(spe_ila,   0x021);
-EMIT_RI16(spe_iohl,  0x0c1);
-EMIT_RI16(spe_fsmbi, 0x0c5);
-
-
-/* Integer and logical instructions
- */
-EMIT_RR  (spe_ah,      0x0c8);
-EMIT_RI10(spe_ahi,     0x01d);
-EMIT_RR  (spe_a,       0x0c0);
-EMIT_RI10(spe_ai,      0x01c);
-EMIT_RR  (spe_sfh,     0x048);
-EMIT_RI10(spe_sfhi,    0x00d);
-EMIT_RR  (spe_sf,      0x040);
-EMIT_RI10(spe_sfi,     0x00c);
-EMIT_RR  (spe_addx,    0x340);
-EMIT_RR  (spe_cg,      0x0c2);
-EMIT_RR  (spe_cgx,     0x342);
-EMIT_RR  (spe_sfx,     0x341);
-EMIT_RR  (spe_bg,      0x042);
-EMIT_RR  (spe_bgx,     0x343);
-EMIT_RR  (spe_mpy,     0x3c4);
-EMIT_RR  (spe_mpyu,    0x3cc);
-EMIT_RI10(spe_mpyi,    0x074);
-EMIT_RI10(spe_mpyui,   0x075);
-EMIT_RRR (spe_mpya,    0x00c);
-EMIT_RR  (spe_mpyh,    0x3c5);
-EMIT_RR  (spe_mpys,    0x3c7);
-EMIT_RR  (spe_mpyhh,   0x3c6);
-EMIT_RR  (spe_mpyhha,  0x346);
-EMIT_RR  (spe_mpyhhu,  0x3ce);
-EMIT_RR  (spe_mpyhhau, 0x34e);
-EMIT_R   (spe_clz,     0x2a5);
-EMIT_R   (spe_cntb,    0x2b4);
-EMIT_R   (spe_fsmb,    0x1b6);
-EMIT_R   (spe_fsmh,    0x1b5);
-EMIT_R   (spe_fsm,     0x1b4);
-EMIT_R   (spe_gbb,     0x1b2);
-EMIT_R   (spe_gbh,     0x1b1);
-EMIT_R   (spe_gb,      0x1b0);
-EMIT_RR  (spe_avgb,    0x0d3);
-EMIT_RR  (spe_absdb,   0x053);
-EMIT_RR  (spe_sumb,    0x253);
-EMIT_R   (spe_xsbh,    0x2b6);
-EMIT_R   (spe_xshw,    0x2ae);
-EMIT_R   (spe_xswd,    0x2a6);
-EMIT_RR  (spe_and,     0x0c1);
-EMIT_RR  (spe_andc,    0x2c1);
-EMIT_RI10(spe_andbi,   0x016);
-EMIT_RI10(spe_andhi,   0x015);
-EMIT_RI10(spe_andi,    0x014);
-EMIT_RR  (spe_or,      0x041);
-EMIT_RR  (spe_orc,     0x2c9);
-EMIT_RI10(spe_orbi,    0x006);
-EMIT_RI10(spe_orhi,    0x005);
-EMIT_RI10(spe_ori,     0x004);
-EMIT_R   (spe_orx,     0x1f0);
-EMIT_RR  (spe_xor,     0x241);
-EMIT_RI10(spe_xorbi,   0x026);
-EMIT_RI10(spe_xorhi,   0x025);
-EMIT_RI10(spe_xori,    0x024);
-EMIT_RR  (spe_nand,    0x0c9);
-EMIT_RR  (spe_nor,     0x049);
-EMIT_RR  (spe_eqv,     0x249);
-EMIT_RRR (spe_selb,    0x008);
-EMIT_RRR (spe_shufb,   0x00b);
-
-
-/* Shift and rotate instructions
- */
-EMIT_RR  (spe_shlh,      0x05f);
-EMIT_RI7 (spe_shlhi,     0x07f);
-EMIT_RR  (spe_shl,       0x05b);
-EMIT_RI7 (spe_shli,      0x07b);
-EMIT_RR  (spe_shlqbi,    0x1db);
-EMIT_RI7 (spe_shlqbii,   0x1fb);
-EMIT_RR  (spe_shlqby,    0x1df);
-EMIT_RI7 (spe_shlqbyi,   0x1ff);
-EMIT_RR  (spe_shlqbybi,  0x1cf);
-EMIT_RR  (spe_roth,      0x05c);
-EMIT_RI7 (spe_rothi,     0x07c);
-EMIT_RR  (spe_rot,       0x058);
-EMIT_RI7 (spe_roti,      0x078);
-EMIT_RR  (spe_rotqby,    0x1dc);
-EMIT_RI7 (spe_rotqbyi,   0x1fc);
-EMIT_RR  (spe_rotqbybi,  0x1cc);
-EMIT_RR  (spe_rotqbi,    0x1d8);
-EMIT_RI7 (spe_rotqbii,   0x1f8);
-EMIT_RR  (spe_rothm,     0x05d);
-EMIT_RI7 (spe_rothmi,    0x07d);
-EMIT_RR  (spe_rotm,      0x059);
-EMIT_RI7 (spe_rotmi,     0x079);
-EMIT_RR  (spe_rotqmby,   0x1dd);
-EMIT_RI7 (spe_rotqmbyi,  0x1fd);
-EMIT_RR  (spe_rotqmbybi, 0x1cd);
-EMIT_RR  (spe_rotqmbi,   0x1c9);
-EMIT_RI7 (spe_rotqmbii,  0x1f9);
-EMIT_RR  (spe_rotmah,    0x05e);
-EMIT_RI7 (spe_rotmahi,   0x07e);
-EMIT_RR  (spe_rotma,     0x05a);
-EMIT_RI7 (spe_rotmai,    0x07a);
-
-
-/* Compare, branch, and halt instructions
- */
-EMIT_RR  (spe_heq,       0x3d8);
-EMIT_RI10(spe_heqi,      0x07f);
-EMIT_RR  (spe_hgt,       0x258);
-EMIT_RI10(spe_hgti,      0x04f);
-EMIT_RR  (spe_hlgt,      0x2d8);
-EMIT_RI10(spe_hlgti,     0x05f);
-EMIT_RR  (spe_ceqb,      0x3d0);
-EMIT_RI10(spe_ceqbi,     0x07e);
-EMIT_RR  (spe_ceqh,      0x3c8);
-EMIT_RI10(spe_ceqhi,     0x07d);
-EMIT_RR  (spe_ceq,       0x3c0);
-EMIT_RI10(spe_ceqi,      0x07c);
-EMIT_RR  (spe_cgtb,      0x250);
-EMIT_RI10(spe_cgtbi,     0x04e);
-EMIT_RR  (spe_cgth,      0x248);
-EMIT_RI10(spe_cgthi,     0x04d);
-EMIT_RR  (spe_cgt,       0x240);
-EMIT_RI10(spe_cgti,      0x04c);
-EMIT_RR  (spe_clgtb,     0x2d0);
-EMIT_RI10(spe_clgtbi,    0x05e);
-EMIT_RR  (spe_clgth,     0x2c8);
-EMIT_RI10(spe_clgthi,    0x05d);
-EMIT_RR  (spe_clgt,      0x2c0);
-EMIT_RI10(spe_clgti,     0x05c);
-EMIT_I16 (spe_br,        0x064);
-EMIT_I16 (spe_bra,       0x060);
-EMIT_RI16(spe_brsl,      0x066);
-EMIT_RI16(spe_brasl,     0x062);
-EMIT_RI16(spe_brnz,      0x042);
-EMIT_RI16(spe_brz,       0x040);
-EMIT_RI16(spe_brhnz,     0x046);
-EMIT_RI16(spe_brhz,      0x044);
-
-extern void spe_bi(struct spe_function *p, unsigned rA, int d, int e);
-extern void spe_iret(struct spe_function *p, unsigned rA, int d, int e);
-extern void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA,
-    int d, int e);
-extern void spe_bisl(struct spe_function *p, unsigned rT, unsigned rA,
-    int d, int e);
-extern void spe_biz(struct spe_function *p, unsigned rT, unsigned rA,
-    int d, int e);
-extern void spe_binz(struct spe_function *p, unsigned rT, unsigned rA,
-    int d, int e);
-extern void spe_bihz(struct spe_function *p, unsigned rT, unsigned rA,
-    int d, int e);
-extern void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA,
-    int d, int e);
-
-
-/* Floating-point instructions
- */
-EMIT_RR  (spe_fa,         0x2c4);
-EMIT_RR  (spe_dfa,        0x2cc);
-EMIT_RR  (spe_fs,         0x2c5);
-EMIT_RR  (spe_dfs,        0x2cd);
-EMIT_RR  (spe_fm,         0x2c6);
-EMIT_RR  (spe_dfm,        0x2ce);
-EMIT_RRR (spe_fma,        0x00e);
-EMIT_RR  (spe_dfma,       0x35c);
-EMIT_RRR (spe_fnms,       0x00d);
-EMIT_RR  (spe_dfnms,      0x35e);
-EMIT_RRR (spe_fms,        0x00f);
-EMIT_RR  (spe_dfms,       0x35d);
-EMIT_RR  (spe_dfnma,      0x35f);
-EMIT_R   (spe_frest,      0x1b8);
-EMIT_R   (spe_frsqest,    0x1b9);
-EMIT_RR  (spe_fi,         0x3d4);
-EMIT_RI8 (spe_csflt,      0x1da);
-EMIT_RI8 (spe_cflts,      0x1d8);
-EMIT_RI8 (spe_cuflt,      0x1db);
-EMIT_RI8 (spe_cfltu,      0x1d9);
-EMIT_R   (spe_frds,       0x3b9);
-EMIT_R   (spe_fesd,       0x3b8);
-EMIT_RR  (spe_dfceq,      0x3c3);
-EMIT_RR  (spe_dfcmeq,     0x3cb);
-EMIT_RR  (spe_dfcgt,      0x2c3);
-EMIT_RR  (spe_dfcmgt,     0x2cb);
-EMIT_RI7 (spe_dftsv,      0x3bf);
-EMIT_RR  (spe_fceq,       0x3c2);
-EMIT_RR  (spe_fcmeq,      0x3ca);
-EMIT_RR  (spe_fcgt,       0x2c2);
-EMIT_RR  (spe_fcmgt,      0x2ca);
-EMIT_R   (spe_fscrwr,     0x3ba);
-EMIT_    (spe_fscrrd,     0x398);
-
-
-/* Channel instructions
- */
-EMIT_R   (spe_rdch,       0x00d);
-EMIT_R   (spe_rdchcnt,    0x00f);
-EMIT_R   (spe_wrch,       0x10d);
-
-
-#ifdef UNDEF_EMIT_MACROS
-#undef EMIT_
-#undef EMIT_R
-#undef EMIT_RR
-#undef EMIT_RRR
-#undef EMIT_RI7
-#undef EMIT_RI8
-#undef EMIT_RI10
-#undef EMIT_RI16
-#undef EMIT_RI18
-#undef EMIT_I16
-#undef UNDEF_EMIT_MACROS
-#endif /* EMIT_ */
diff --git a/src/mesa/sources b/src/mesa/sources
index 9e56694893..f0bf7b31fb 100644
--- a/src/mesa/sources
+++ b/src/mesa/sources
@@ -246,7 +246,6 @@ ASM_C_SOURCES =	\
 	x86/rtasm/x86sse.c \
 	sparc/sparc.c \
 	ppc/common_ppc.c \
-	ppc/rtasm/spe_asm.c \
 	x86-64/x86-64.c
 
 X86_SOURCES =			\
-- 
cgit v1.2.3


From 5480a6bc13a555f99a89fc801cfe153182697dda Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Tue, 19 Feb 2008 18:57:25 +0900
Subject: Fix Windows build.

---
 src/gallium/auxiliary/rtasm/rtasm_execmem.c | 3 ++-
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c  | 2 +-
 src/gallium/auxiliary/rtasm/rtasm_x86sse.h  | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_execmem.c b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
index 9c78fa5626..300c1c2d9d 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_execmem.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
@@ -33,6 +33,7 @@
 #include "pipe/p_compiler.h"
 #include "pipe/p_debug.h"
 #include "pipe/p_thread.h"
+#include "pipe/p_util.h"
 
 #include "rtasm_execmem.h"
 
@@ -118,7 +119,7 @@ rtasm_exec_free(void *addr)
  */
 
 void *
-rtasm_exec_malloc(GLuint size)
+rtasm_exec_malloc(size_t size)
 {
    return MALLOC( size );
 }
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index b332192a62..dcbf76f600 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -21,7 +21,7 @@
  *
  **************************************************************************/
 
-#if defined(__i386__) || defined(__386__)
+#if defined(__i386__) || defined(__386__) || defined(i386)
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_debug.h"
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index e4576001bf..606b41eb35 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -24,7 +24,7 @@
 #ifndef _RTASM_X86SSE_H_
 #define _RTASM_X86SSE_H_
 
-#if defined(__i386__) || defined(__386__)
+#if defined(__i386__) || defined(__386__) || defined(i386)
 
 /* It is up to the caller to ensure that instructions issued are
  * suitable for the host cpu.  There are no checks made in this module
-- 
cgit v1.2.3


From 57060bc1fa82e4e93d2affafecd98219be2f991f Mon Sep 17 00:00:00 2001
From: Michal Krol <michal@tungstengraphics.com>
Date: Wed, 20 Feb 2008 22:10:27 +0100
Subject: gallium: Silence compiler warnings on Windows.

---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index dcbf76f600..4d33950e99 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -25,6 +25,7 @@
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_debug.h"
+#include "pipe/p_pointer.h"
 
 #include "rtasm_execmem.h"
 #include "rtasm_x86sse.h"
@@ -34,7 +35,7 @@
 
 static unsigned char *cptr( void (*label)() )
 {
-   return (unsigned char *)(unsigned long)label;
+   return (unsigned char *) label;
 }
 
 
@@ -46,7 +47,7 @@ static void do_realloc( struct x86_function *p )
       p->csr = p->store;
    }
    else {
-      unsigned used = p->csr - p->store;
+      uintptr_t used = pointer_to_uintptr( p->csr ) - pointer_to_uintptr( p->store );
       unsigned char *tmp = p->store;
       p->size *= 2;
       p->store = rtasm_exec_malloc(p->size);
@@ -60,7 +61,7 @@ static void do_realloc( struct x86_function *p )
  */
 static unsigned char *reserve( struct x86_function *p, int bytes )
 {
-   if (p->csr + bytes - p->store > p->size)
+   if (p->csr + bytes - p->store > (int) p->size)
       do_realloc(p);
 
    {
@@ -135,7 +136,7 @@ static void emit_modrm( struct x86_function *p,
    case mod_INDIRECT:
       break;
    case mod_DISP8:
-      emit_1b(p, regmem.disp);
+      emit_1b(p, (char) regmem.disp);
       break;
    case mod_DISP32:
       emit_1i(p, regmem.disp);
@@ -251,14 +252,14 @@ void x86_jcc( struct x86_function *p,
 	      enum x86_cc cc,
 	      unsigned char *label )
 {
-   int offset = label - (x86_get_label(p) + 2);
+   intptr_t offset = pointer_to_intptr( label ) - (pointer_to_intptr( x86_get_label(p) ) + 2);
    
    if (offset <= 127 && offset >= -128) {
       emit_1ub(p, 0x70 + cc);
       emit_1b(p, (char) offset);
    }
    else {
-      offset = label - (x86_get_label(p) + 6);
+      offset = pointer_to_intptr( label ) - (pointer_to_intptr( x86_get_label(p) ) + 6);
       emit_2ub(p, 0x0f, 0x80 + cc);
       emit_1i(p, offset);
    }
@@ -293,13 +294,13 @@ unsigned char *x86_call_forward( struct x86_function *p)
 void x86_fixup_fwd_jump( struct x86_function *p,
 			 unsigned char *fixup )
 {
-   *(int *)(fixup - 4) = x86_get_label(p) - fixup;
+   *(int *)(fixup - 4) = pointer_to_intptr( x86_get_label(p) ) - pointer_to_intptr( fixup );
 }
 
 void x86_jmp( struct x86_function *p, unsigned char *label)
 {
    emit_1ub(p, 0xe9);
-   emit_1i(p, label - x86_get_label(p) - 4);
+   emit_1i(p, pointer_to_intptr( label ) - pointer_to_intptr( x86_get_label(p) ) - 4);
 }
 
 #if 0
@@ -1207,7 +1208,7 @@ void (*x86_get_func( struct x86_function *p ))(void)
 {
    if (DISASSEM && p->store)
       debug_printf("disassemble %p %p\n", p->store, p->csr);
-   return (void (*)(void)) (unsigned long) p->store;
+   return (void (*)(void)) p->store;
 }
 
 #else
-- 
cgit v1.2.3


From b1525662b330ca8b4cdd930775f3642bfec3b58f Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Mon, 10 Mar 2008 16:28:54 -0700
Subject: Move SPE register allocator to rtasm code

Move the register allocator to a common location.  There is more code
on the way that will make use of this interface.
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c      |  47 +++++++++++
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h      |  16 ++++
 src/gallium/drivers/cell/ppu/cell_vertex_fetch.c | 101 +++++++----------------
 3 files changed, 92 insertions(+), 72 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 95a2d6fcbb..a996218ce7 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -306,6 +306,11 @@ void spe_init_func(struct spe_function *p, unsigned code_size)
 {
     p->store = align_malloc(code_size, 16);
     p->csr = p->store;
+    
+    /* Conservatively treat R0 - R2 and R80 - R127 as non-volatile.
+     */
+    p->regs[0] = ~7;
+    p->regs[1] = (1U << (80 - 64)) - 1;
 }
 
 
@@ -317,6 +322,48 @@ void spe_release_func(struct spe_function *p)
 }
 
 
+int spe_allocate_available_register(struct spe_function *p)
+{
+   unsigned i;
+   for (i = 0; i < 128; i++) {
+      const uint64_t mask = (1ULL << (i % 128));
+      const unsigned idx = i / 128;
+
+      if ((p->regs[idx] & mask) != 0) {
+         p->regs[idx] &= ~mask;
+         return i;
+      }
+   }
+
+   return -1;
+}
+
+
+int spe_allocate_register(struct spe_function *p, int reg)
+{
+   const unsigned idx = reg / 128;
+   const unsigned bit = reg % 128;
+
+   assert((p->regs[idx] & (1ULL << bit)) != 0);
+
+   p->regs[idx] &= ~(1ULL << bit);
+   return reg;
+}
+
+
+void spe_release_register(struct spe_function *p, int reg)
+{
+   const unsigned idx = reg / 128;
+   const unsigned bit = reg % 128;
+
+   assert((p->regs[idx] & (1ULL << bit)) == 0);
+
+   p->regs[idx] |= (1ULL << bit);
+}
+
+
+
+
 void spe_bi(struct spe_function *p, unsigned rA, int d, int e)
 {
     emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4));
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 10ce44b3a0..5a1eb1ed8d 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -39,11 +39,27 @@ struct spe_function {
     uint32_t *store;
     uint32_t *csr;
     const char *fn;
+
+    /**
+     * Mask of used / unused registers
+     *
+     * Each set bit corresponds to an available register.  Each cleared bit
+     * corresponds to an allocated register.
+     *
+     * \sa
+     * spe_allocate_register, spe_allocate_available_register,
+     * spe_release_register
+     */
+    uint64_t regs[2];
 };
 
 extern void spe_init_func(struct spe_function *p, unsigned code_size);
 extern void spe_release_func(struct spe_function *p);
 
+extern int spe_allocate_available_register(struct spe_function *p);
+extern int spe_allocate_register(struct spe_function *p, int reg);
+extern void spe_release_register(struct spe_function *p, int reg);
+
 #endif /* RTASM_PPC_SPE_H */
 
 #ifndef EMIT_
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
index 9cf74bab47..4828a8023b 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
@@ -33,46 +33,11 @@
 #include "cell_context.h"
 #include "rtasm/rtasm_ppc_spe.h"
 
-typedef uint64_t register_mask;
-
-int allocate_available_register(register_mask *m)
-{
-   unsigned i;
-   for (i = 0; i < 64; i++) {
-      const uint64_t mask = (1ULL << i);
-
-      if ((m[0] & mask) != 0) {
-	 m[0] &= ~mask;
-	 return i;
-      }
-   }
-
-   return -1;
-}
-
-
-int allocate_register(register_mask *m, unsigned reg)
-{
-   assert((m[0] & (1ULL << reg)) != 0);
-
-   m[0] &= ~(1ULL << reg);
-   return reg;
-}
-
-
-void release_register(register_mask *m, unsigned reg)
-{
-   assert((m[0] & (1ULL << reg)) == 0);
-
-   m[0] |= (1ULL << reg);
-}
-
 
 /**
  * Emit a 4x4 matrix transpose operation
  *
  * \param p         Function that the transpose operation is to be appended to
- * \param m         Live register mask
  * \param row0      Register containing row 0 of the source matrix
  * \param row1      Register containing row 1 of the source matrix
  * \param row2      Register containing row 2 of the source matrix
@@ -91,15 +56,15 @@ void release_register(register_mask *m, unsigned reg)
  * This function requires that four temporary are available on entry.
  */
 static void
-emit_matrix_transpose(struct spe_function *p, register_mask *m,
+emit_matrix_transpose(struct spe_function *p,
 		      unsigned row0, unsigned row1, unsigned row2,
 		      unsigned row3, unsigned dest_ptr,
 		      unsigned shuf_ptr, unsigned count)
 {
-   int shuf_hi = allocate_available_register(m);
-   int shuf_lo = allocate_available_register(m);
-   int t1 = allocate_available_register(m);
-   int t2 = allocate_available_register(m);
+   int shuf_hi = spe_allocate_available_register(p);
+   int shuf_lo = spe_allocate_available_register(p);
+   int t1 = spe_allocate_available_register(p);
+   int t2 = spe_allocate_available_register(p);
    int t3;
    int t4;
    int col0;
@@ -169,19 +134,19 @@ emit_matrix_transpose(struct spe_function *p, register_mask *m,
 
    /* Release all of the temporary registers used.
     */
-   release_register(m, col0);
-   release_register(m, col1);
-   release_register(m, col2);
-   release_register(m, col3);
-   release_register(m, shuf_hi);
-   release_register(m, shuf_lo);
-   release_register(m, t2);
-   release_register(m, t4);
+   spe_release_register(p, col0);
+   spe_release_register(p, col1);
+   spe_release_register(p, col2);
+   spe_release_register(p, col3);
+   spe_release_register(p, shuf_hi);
+   spe_release_register(p, shuf_lo);
+   spe_release_register(p, t2);
+   spe_release_register(p, t4);
 }
 
 
 static void
-emit_fetch(struct spe_function *p, register_mask *m,
+emit_fetch(struct spe_function *p,
 	   unsigned in_ptr, unsigned *offset,
 	   unsigned out_ptr, unsigned shuf_ptr,
 	   enum pipe_format format)
@@ -191,11 +156,11 @@ emit_fetch(struct spe_function *p, register_mask *m,
    const unsigned type = pf_type(format);
    const unsigned bytes = pf_size_x(format);
 
-   int v0 = allocate_available_register(m);
-   int v1 = allocate_available_register(m);
-   int v2 = allocate_available_register(m);
-   int v3 = allocate_available_register(m);
-   int tmp = allocate_available_register(m);
+   int v0 = spe_allocate_available_register(p);
+   int v1 = spe_allocate_available_register(p);
+   int v2 = spe_allocate_available_register(p);
+   int v3 = spe_allocate_available_register(p);
+   int tmp = spe_allocate_available_register(p);
    int float_zero = -1;
    int float_one = -1;
    float scale_signed = 0.0;
@@ -260,19 +225,19 @@ emit_fetch(struct spe_function *p, register_mask *m,
 
 
    if (count < 4) {
-      float_one = allocate_available_register(m);
+      float_one = spe_allocate_available_register(p);
       spe_il(p, float_one, 1);
       spe_cuflt(p, float_one, float_one, 0);
       
       if (count < 3) {
-	 float_zero = allocate_available_register(m);
+	 float_zero = spe_allocate_available_register(p);
 	 spe_il(p, float_zero, 0);
       }
    }
 
-   release_register(m, tmp);
+   spe_release_register(p, tmp);
 
-   emit_matrix_transpose(p, m, v0, v1, v2, v3, out_ptr, shuf_ptr, count);
+   emit_matrix_transpose(p, v0, v1, v2, v3, out_ptr, shuf_ptr, count);
 
    switch (count) {
    case 1:
@@ -284,11 +249,11 @@ emit_fetch(struct spe_function *p, register_mask *m,
    }
 
    if (float_zero != -1) {
-      release_register(m, float_zero);
+      spe_release_register(p, float_zero);
    }
 
    if (float_one != -1) {
-      release_register(m, float_one);
+      spe_release_register(p, float_one);
    }
 }
 
@@ -297,7 +262,6 @@ void cell_update_vertex_fetch(struct draw_context *draw)
 {
    struct cell_context *const cell =
        (struct cell_context *) draw->driver_private;
-   register_mask m = ~0;
    struct spe_function *p = &cell->attrib_fetch;
    unsigned function_index[PIPE_ATTRIB_MAX];
    unsigned unique_attr_formats;
@@ -338,18 +302,11 @@ void cell_update_vertex_fetch(struct draw_context *draw)
    spe_init_func(p, 136 * unique_attr_formats);
 
 
-   /* Registers 0, 1, and 2 are reserved by the ABI.
-    */
-   allocate_register(&m, 0);
-   allocate_register(&m, 1);
-   allocate_register(&m, 2);
-
-
    /* Allocate registers for the function's input parameters.
     */
-   out_ptr = allocate_register(&m, 3);
-   in_ptr = allocate_register(&m, 4);
-   shuf_ptr = allocate_register(&m, 5);
+   out_ptr = spe_allocate_register(p, 3);
+   in_ptr = spe_allocate_register(p, 4);
+   shuf_ptr = spe_allocate_register(p, 5);
 
 
    /* Generate code for the individual attribute fetch functions.
@@ -362,7 +319,7 @@ void cell_update_vertex_fetch(struct draw_context *draw)
 						     - (void *) p->store);
 
 	 offset = 0;
-	 emit_fetch(p, & m, in_ptr, &offset, out_ptr, shuf_ptr,
+	 emit_fetch(p, in_ptr, &offset, out_ptr, shuf_ptr,
 		    draw->vertex_element[i].src_format);
 	 spe_bi(p, 0, 0, 0);
 
-- 
cgit v1.2.3


From 0c715de39fa8337a2753dacd77ed280000416c1a Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Mon, 17 Mar 2008 15:37:09 -0700
Subject: cell: Fix simple register allocator

There are 64 bits in a uint64_t, not 128.  Duh.
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index a996218ce7..842d713f84 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -326,8 +326,8 @@ int spe_allocate_available_register(struct spe_function *p)
 {
    unsigned i;
    for (i = 0; i < 128; i++) {
-      const uint64_t mask = (1ULL << (i % 128));
-      const unsigned idx = i / 128;
+      const uint64_t mask = (1ULL << (i % 64));
+      const unsigned idx = i / 64;
 
       if ((p->regs[idx] & mask) != 0) {
          p->regs[idx] &= ~mask;
@@ -341,8 +341,8 @@ int spe_allocate_available_register(struct spe_function *p)
 
 int spe_allocate_register(struct spe_function *p, int reg)
 {
-   const unsigned idx = reg / 128;
-   const unsigned bit = reg % 128;
+   const unsigned idx = reg / 64;
+   const unsigned bit = reg % 64;
 
    assert((p->regs[idx] & (1ULL << bit)) != 0);
 
@@ -353,8 +353,8 @@ int spe_allocate_register(struct spe_function *p, int reg)
 
 void spe_release_register(struct spe_function *p, int reg)
 {
-   const unsigned idx = reg / 128;
-   const unsigned bit = reg % 128;
+   const unsigned idx = reg / 64;
+   const unsigned bit = reg % 64;
 
    assert((p->regs[idx] & (1ULL << bit)) == 0);
 
-- 
cgit v1.2.3


From 9f106a8683ec89b873f0237fbb6930a63b89dfa0 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Mon, 17 Mar 2008 16:07:54 -0700
Subject: cell: Don't free NULL code pointers

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 842d713f84..24be65bff9 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -316,7 +316,9 @@ void spe_init_func(struct spe_function *p, unsigned code_size)
 
 void spe_release_func(struct spe_function *p)
 {
-    align_free(p->store);
+    if (p->store != NULL) {
+        align_free(p->store);
+    }
     p->store = NULL;
     p->csr = NULL;
 }
-- 
cgit v1.2.3


From 84d8030735844785c3c97679db2bc1892a9c8c70 Mon Sep 17 00:00:00 2001
From: Ian Romanick <idr@us.ibm.com>
Date: Mon, 24 Mar 2008 12:15:59 -0700
Subject: cell: Float convert-to and convert-from instructions use different
 shift bias

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c |  4 ++--
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 24be65bff9..7f6bf577b2 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -267,10 +267,10 @@ void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
     emit_RI7(p, _op, rT, rA, imm); \
 }
 
-#define EMIT_RI8(_name, _op) \
+#define EMIT_RI8(_name, _op, bias) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
 { \
-    emit_RI8(p, _op, rT, rA, 155 - imm); \
+    emit_RI8(p, _op, rT, rA, bias - imm); \
 }
 
 #define EMIT_RI10(_name, _op) \
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 5a1eb1ed8d..1cacc717b1 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -76,7 +76,7 @@ extern void spe_release_register(struct spe_function *p, int reg);
 #define EMIT_RI7(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
 			   int imm)
-#define EMIT_RI8(_name, _op) \
+#define EMIT_RI8(_name, _op, bias) \
     extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
 			   int imm)
 #define EMIT_RI10(_name, _op) \
@@ -289,10 +289,10 @@ EMIT_RR  (spe_dfnma,      0x35f);
 EMIT_R   (spe_frest,      0x1b8);
 EMIT_R   (spe_frsqest,    0x1b9);
 EMIT_RR  (spe_fi,         0x3d4);
-EMIT_RI8 (spe_csflt,      0x1da);
-EMIT_RI8 (spe_cflts,      0x1d8);
-EMIT_RI8 (spe_cuflt,      0x1db);
-EMIT_RI8 (spe_cfltu,      0x1d9);
+EMIT_RI8 (spe_csflt,      0x1da, 155);
+EMIT_RI8 (spe_cflts,      0x1d8, 173);
+EMIT_RI8 (spe_cuflt,      0x1db, 155);
+EMIT_RI8 (spe_cfltu,      0x1d9, 173);
 EMIT_R   (spe_frds,       0x3b9);
 EMIT_R   (spe_fesd,       0x3b8);
 EMIT_RR  (spe_dfceq,      0x3c3);
-- 
cgit v1.2.3


From 4d184cc33131b440f9aafbcdd2d657050411db49 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Fri, 11 Apr 2008 13:20:52 -0600
Subject: gallium: fix broken x86_call()

---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index 4d33950e99..aea8b28e58 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -317,7 +317,7 @@ void x86_call( struct x86_function *p, void (*label)())
 void x86_call( struct x86_function *p, struct x86_reg reg)
 {
    emit_1ub(p, 0xff);
-   emit_modrm(p, reg, reg);
+   emit_modrm_noreg(p, 2, reg);
 }
 #endif
 
-- 
cgit v1.2.3


From 5b97c762ed9882dd922f48c2fbf13b14ad86a96e Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Fri, 18 Apr 2008 17:32:39 +0100
Subject: rtasm: add a couple more insns, clean up x86_mul

---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c | 34 ++++++++++++++++++++++++------
 src/gallium/auxiliary/rtasm/rtasm_x86sse.h |  2 ++
 2 files changed, 29 insertions(+), 7 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index aea8b28e58..5c25fa155d 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -33,11 +33,6 @@
 #define DISASSEM 0
 #define X86_TWOB 0x0f
 
-static unsigned char *cptr( void (*label)() )
-{
-   return (unsigned char *) label;
-}
-
 
 static void do_realloc( struct x86_function *p )
 {
@@ -304,6 +299,11 @@ void x86_jmp( struct x86_function *p, unsigned char *label)
 }
 
 #if 0
+static unsigned char *cptr( void (*label)() )
+{
+   return (unsigned char *) label;
+}
+
 /* This doesn't work once we start reallocating & copying the
  * generated code on buffer fills, because the call is relative to the
  * current pc.
@@ -417,11 +417,14 @@ void x86_add( struct x86_function *p,
    emit_op_modrm(p, 0x03, 0x01, dst, src );
 }
 
+/* Calculate EAX * src, results in EDX:EAX.
+ */
 void x86_mul( struct x86_function *p,
 	       struct x86_reg src )
 {
-   assert (src.file == file_REG32 && src.mod == mod_REG);
-   emit_op_modrm(p, 0xf7, 0, x86_make_reg (file_REG32, reg_SP), src );
+//   assert (src.file == file_REG32 && src.mod == mod_REG);
+   emit_1ub(p, 0xf7);
+   emit_modrm_noreg(p, 4, src );
 }
 
 void x86_sub( struct x86_function *p,
@@ -646,6 +649,14 @@ void sse_cvtps2pi( struct x86_function *p,
    emit_modrm( p, dst, src );
 }
 
+void sse2_cvtdq2ps( struct x86_function *p,
+		   struct x86_reg dst,
+		   struct x86_reg src )
+{
+   emit_2ub(p, X86_TWOB, 0x5b);
+   emit_modrm( p, dst, src );
+}
+
 
 /* Shufps can also be used to implement a reduced swizzle when dest ==
  * arg0.
@@ -735,6 +746,15 @@ void sse2_packuswb( struct x86_function *p,
    emit_modrm( p, dst, src );
 }
 
+void sse2_punpcklbw( struct x86_function *p,
+		    struct x86_reg dst,
+		    struct x86_reg src )
+{
+   emit_3ub(p, 0x66, X86_TWOB, 0x60);
+   emit_modrm( p, dst, src );
+}
+
+
 void sse2_rcpps( struct x86_function *p,
                  struct x86_reg dst,
                  struct x86_reg src )
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index 606b41eb35..dfde661f46 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -165,6 +165,7 @@ void mmx_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg sr
 
 void sse2_cvtps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_cvttps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_cvtdq2ps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_packsswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
@@ -202,6 +203,7 @@ void sse_rsqrtss( struct x86_function *p, struct x86_reg dst, struct x86_reg src
 void sse_shufps( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0,
                  unsigned char shuf );
 void sse_pmovmskb( struct x86_function *p, struct x86_reg dest, struct x86_reg src );
+void sse2_punpcklbw( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 
 void x86_add( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_and( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-- 
cgit v1.2.3


From 363f7abf2000c1cf5993ae8f83ba81b2054bf6e0 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Fri, 18 Apr 2008 18:30:41 +0100
Subject: rtasm: add x86_imul

---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c | 11 ++++++++++-
 src/gallium/auxiliary/rtasm/rtasm_x86sse.h |  1 +
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index 5c25fa155d..7f8cc23d15 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -422,11 +422,20 @@ void x86_add( struct x86_function *p,
 void x86_mul( struct x86_function *p,
 	       struct x86_reg src )
 {
-//   assert (src.file == file_REG32 && src.mod == mod_REG);
    emit_1ub(p, 0xf7);
    emit_modrm_noreg(p, 4, src );
 }
 
+
+void x86_imul( struct x86_function *p,
+	       struct x86_reg dst,
+	       struct x86_reg src )
+{
+   emit_2ub(p, X86_TWOB, 0xAF);
+   emit_modrm(p, dst, src);
+}
+
+
 void x86_sub( struct x86_function *p,
 	       struct x86_reg dst,
 	       struct x86_reg src )
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index dfde661f46..5e99ceea70 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -213,6 +213,7 @@ void x86_inc( struct x86_function *p, struct x86_reg reg );
 void x86_lea( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_mov( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_mul( struct x86_function *p, struct x86_reg src );
+void x86_imul( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_or( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_pop( struct x86_function *p, struct x86_reg reg );
 void x86_push( struct x86_function *p, struct x86_reg reg );
-- 
cgit v1.2.3


From af523a5bd7828fd554669cf83f18992af967a075 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Sat, 19 Apr 2008 18:25:33 +0100
Subject: rtasm: include yet another i386 define variant

---
 src/gallium/auxiliary/rtasm/rtasm_cpu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_cpu.c b/src/gallium/auxiliary/rtasm/rtasm_cpu.c
index eb3359750b..d577ff5b42 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_cpu.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_cpu.c
@@ -32,7 +32,7 @@
 int rtasm_cpu_has_sse(void)
 {
    /* FIXME: actually detect this at run-time */
-#if defined(__i386__) || defined(__386__)
+#if defined(__i386__) || defined(__386__) || defined(i386)
    return 1;
 #else
    return 0;
@@ -42,7 +42,7 @@ int rtasm_cpu_has_sse(void)
 int rtasm_cpu_has_sse2(void) 
 {
    /* FIXME: actually detect this at run-time */
-#if defined(__i386__) || defined(__386__)
+#if defined(__i386__) || defined(__386__) || defined(i386)
    return 1;
 #else
    return 0;
-- 
cgit v1.2.3


From 40e0439db448a7d93ddb18faac7f14b47b1343c0 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Mon, 21 Apr 2008 13:02:59 +0900
Subject: gallium: Centralize SSE usage logic.

---
 src/gallium/auxiliary/draw/draw_context.c | 12 ------------
 src/gallium/auxiliary/draw/draw_context.h |  2 --
 src/gallium/auxiliary/draw/draw_private.h |  2 --
 src/gallium/auxiliary/draw/draw_vs_sse.c  |  3 ++-
 src/gallium/auxiliary/rtasm/rtasm_cpu.c   | 10 ++++++++--
 5 files changed, 10 insertions(+), 19 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index 4988d67faa..b4dbdccd61 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -45,12 +45,6 @@ struct draw_context *draw_create( void )
    if (draw == NULL)
       goto fail;
 
-#if defined(__i386__) || defined(__386__)
-   draw->use_sse = GETENV( "GALLIUM_NOSSE" ) == NULL;
-#else
-   draw->use_sse = FALSE;
-#endif
-
    ASSIGN_4V( draw->plane[0], -1,  0,  0, 1 );
    ASSIGN_4V( draw->plane[1],  1,  0,  0, 1 );
    ASSIGN_4V( draw->plane[2],  0, -1,  0, 1 );
@@ -320,12 +314,6 @@ draw_num_vs_outputs(struct draw_context *draw)
 
 
-boolean draw_use_sse(struct draw_context *draw)
-{
-   return (boolean) draw->use_sse;
-}
-
-
 void draw_set_render( struct draw_context *draw, 
 		      struct vbuf_render *render )
 {
diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h
index a0ac980c89..68e2efb865 100644
--- a/src/gallium/auxiliary/draw/draw_context.h
+++ b/src/gallium/auxiliary/draw/draw_context.h
@@ -73,8 +73,6 @@ void draw_enable_line_stipple(struct draw_context *draw, boolean enable);
 void draw_enable_point_sprites(struct draw_context *draw, boolean enable);
 
 
-boolean draw_use_sse(struct draw_context *draw);
-
 void
 draw_install_aaline_stage(struct draw_context *draw, struct pipe_context *pipe);
 
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index 27f61c2f40..da973e868b 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -192,8 +192,6 @@ struct draw_context
    float plane[12][4];
    unsigned nr_planes;
 
-   boolean use_sse;
-
    /* If a prim stage introduces new vertex attributes, they'll be stored here
     */
    struct {
diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c
index 8e2d381f14..b1e9f67114 100644
--- a/src/gallium/auxiliary/draw/draw_vs_sse.c
+++ b/src/gallium/auxiliary/draw/draw_vs_sse.c
@@ -41,6 +41,7 @@
 #include "draw_private.h"
 #include "draw_context.h"
 
+#include "rtasm/rtasm_cpu.h"
 #include "rtasm/rtasm_x86sse.h"
 #include "tgsi/exec/tgsi_sse2.h"
 #include "tgsi/util/tgsi_parse.h"
@@ -155,7 +156,7 @@ draw_create_vs_sse(struct draw_context *draw,
    struct draw_sse_vertex_shader *vs;
    uint nt = tgsi_num_tokens(templ->tokens);
 
-   if (!draw->use_sse) 
+   if (!rtasm_cpu_has_sse2())
       return NULL;
 
    vs = CALLOC_STRUCT( draw_sse_vertex_shader );
diff --git a/src/gallium/auxiliary/rtasm/rtasm_cpu.c b/src/gallium/auxiliary/rtasm/rtasm_cpu.c
index d577ff5b42..175245a9f6 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_cpu.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_cpu.c
@@ -26,14 +26,20 @@
  **************************************************************************/
 
 
+#include "pipe/p_debug.h"
 #include "rtasm_cpu.h"
 
 
+static boolean rtasm_sse_enabled(void)
+{
+   return !debug_get_bool_option("GALLIUM_NOSSE", FALSE);
+}
+
 int rtasm_cpu_has_sse(void)
 {
    /* FIXME: actually detect this at run-time */
 #if defined(__i386__) || defined(__386__) || defined(i386)
-   return 1;
+   return rtasm_sse_enabled();
 #else
    return 0;
 #endif
@@ -43,7 +49,7 @@ int rtasm_cpu_has_sse2(void)
 {
    /* FIXME: actually detect this at run-time */
 #if defined(__i386__) || defined(__386__) || defined(i386)
-   return 1;
+   return rtasm_sse_enabled();
 #else
    return 0;
 #endif
-- 
cgit v1.2.3


From b6c9d2ef2cfadbbe3e7aa94f21fd0da36d089952 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Mon, 21 Apr 2008 12:37:41 +0100
Subject: rtasm: add dump facility for x86 (from tgsi_sse2.c)

---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c | 267 ++++++++++++++++++++++++++---
 1 file changed, 243 insertions(+), 24 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index 7f8cc23d15..f2c08c96a6 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -34,6 +34,116 @@
 #define X86_TWOB 0x0f
 
 
+#define DUMP_SSE  0
+
+#if DUMP_SSE
+
+static void
+_print_reg(
+   struct x86_reg reg )
+{
+   if (reg.mod != mod_REG) 
+      debug_printf( "[" );
+      
+   switch( reg.file ) {
+   case file_REG32:
+      switch( reg.idx ) {
+      case reg_AX: debug_printf( "EAX" ); break;
+      case reg_CX: debug_printf( "ECX" ); break;
+      case reg_DX: debug_printf( "EDX" ); break;
+      case reg_BX: debug_printf( "EBX" ); break;
+      case reg_SP: debug_printf( "ESP" ); break;
+      case reg_BP: debug_printf( "EBP" ); break;
+      case reg_SI: debug_printf( "ESI" ); break;
+      case reg_DI: debug_printf( "EDI" ); break;
+      }
+      break;
+   case file_MMX:
+      debug_printf( "MMX%u", reg.idx );
+      break;
+   case file_XMM:
+      debug_printf( "XMM%u", reg.idx );
+      break;
+   case file_x87:
+      debug_printf( "fp%u", reg.idx );
+      break;
+   }
+
+   if (reg.mod == mod_DISP8 ||
+       reg.mod == mod_DISP32)
+      debug_printf("+%d", reg.disp);
+
+   if (reg.mod != mod_REG) 
+      debug_printf( "]" );
+}
+
+static void
+_fill(
+   const char  *op )
+{
+   unsigned count = 10 - strlen( op );
+
+   while( count-- ) {
+      debug_printf( " " );
+   }
+}
+
+#define DUMP_START() debug_printf( "\nsse-dump start ----------------" )
+#define DUMP_END() debug_printf( "\nsse-dump end ----------------\n" )
+#define DUMP( OP ) debug_printf( "\n%s", OP )
+
+#define DUMP_I( OP, I ) do {                    \
+   debug_printf( "\n%s", OP );                  \
+   _fill( OP );                                 \
+   debug_printf( "%u", I );                     \
+} while( 0 )
+
+#define DUMP_R( OP, R0 ) do {                   \
+   debug_printf( "\n%s", OP );                  \
+   _fill( OP );                                 \
+   _print_reg( R0 );                            \
+} while( 0 )
+
+#define DUMP_RR( OP, R0, R1 ) do {              \
+   debug_printf( "\n%s", OP );                  \
+   _fill( OP );                                 \
+   _print_reg( R0 );                            \
+   debug_printf( ", " );                        \
+   _print_reg( R1 );                            \
+} while( 0 )
+
+#define DUMP_RI( OP, R0, I ) do {               \
+   debug_printf( "\n%s", OP );                  \
+   _fill( OP );                                 \
+   _print_reg( R0 );                            \
+   debug_printf( ", " );                        \
+   debug_printf( "%u", I );                     \
+} while( 0 )
+
+#define DUMP_RRI( OP, R0, R1, I ) do {          \
+   debug_printf( "\n%s", OP );                  \
+   _fill( OP );                                 \
+   _print_reg( R0 );                            \
+   debug_printf( ", " );                        \
+   _print_reg( R1 );                            \
+   debug_printf( ", " );                        \
+   debug_printf( "%u", I );                     \
+} while( 0 )
+
+#else
+
+#define DUMP_START()
+#define DUMP_END()
+#define DUMP( OP )
+#define DUMP_I( OP, I )
+#define DUMP_R( OP, R0 )
+#define DUMP_RR( OP, R0, R1 )
+#define DUMP_RI( OP, R0, I )
+#define DUMP_RRI( OP, R0, R1, I )
+
+#endif
+
+
 static void do_realloc( struct x86_function *p )
 {
    if (p->size == 0) {
@@ -272,6 +382,7 @@ unsigned char *x86_jcc_forward( struct x86_function *p,
 
 unsigned char *x86_jmp_forward( struct x86_function *p)
 {
+   DUMP( __FUNCTION__ );
    emit_1ub(p, 0xe9);
    emit_1i(p, 0);
    return x86_get_label(p);
@@ -279,6 +390,8 @@ unsigned char *x86_jmp_forward( struct x86_function *p)
 
 unsigned char *x86_call_forward( struct x86_function *p)
 {
+   DUMP( __FUNCTION__ );
+
    emit_1ub(p, 0xe8);
    emit_1i(p, 0);
    return x86_get_label(p);
@@ -294,6 +407,7 @@ void x86_fixup_fwd_jump( struct x86_function *p,
 
 void x86_jmp( struct x86_function *p, unsigned char *label)
 {
+   DUMP_I( __FUNCTION__, label );
    emit_1ub(p, 0xe9);
    emit_1i(p, pointer_to_intptr( label ) - pointer_to_intptr( x86_get_label(p) ) - 4);
 }
@@ -310,12 +424,14 @@ static unsigned char *cptr( void (*label)() )
  */
 void x86_call( struct x86_function *p, void (*label)())
 {
+   DUMP_I( __FUNCTION__, label );
    emit_1ub(p, 0xe8);
    emit_1i(p, cptr(label) - x86_get_label(p) - 4);
 }
 #else
 void x86_call( struct x86_function *p, struct x86_reg reg)
 {
+   DUMP_R( __FUNCTION__, reg );
    emit_1ub(p, 0xff);
    emit_modrm_noreg(p, 2, reg);
 }
@@ -328,6 +444,7 @@ void x86_call( struct x86_function *p, struct x86_reg reg)
  */
 void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm )
 {
+   DUMP_RI( __FUNCTION__, dst, imm );
    assert(dst.mod == mod_REG);
    emit_1ub(p, 0xb8 + dst.idx);
    emit_1i(p, imm);
@@ -336,6 +453,7 @@ void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm )
 void x86_push( struct x86_function *p,
 	       struct x86_reg reg )
 {
+   DUMP_R( __FUNCTION__, reg );
    assert(reg.mod == mod_REG);
    emit_1ub(p, 0x50 + reg.idx);
    p->stack_offset += 4;
@@ -344,6 +462,7 @@ void x86_push( struct x86_function *p,
 void x86_pop( struct x86_function *p,
 	      struct x86_reg reg )
 {
+   DUMP_R( __FUNCTION__, reg );
    assert(reg.mod == mod_REG);
    emit_1ub(p, 0x58 + reg.idx);
    p->stack_offset -= 4;
@@ -352,6 +471,7 @@ void x86_pop( struct x86_function *p,
 void x86_inc( struct x86_function *p,
 	      struct x86_reg reg )
 {
+   DUMP_R( __FUNCTION__, reg );
    assert(reg.mod == mod_REG);
    emit_1ub(p, 0x40 + reg.idx);
 }
@@ -359,17 +479,20 @@ void x86_inc( struct x86_function *p,
 void x86_dec( struct x86_function *p,
 	      struct x86_reg reg )
 {
+   DUMP_R( __FUNCTION__, reg );
    assert(reg.mod == mod_REG);
    emit_1ub(p, 0x48 + reg.idx);
 }
 
 void x86_ret( struct x86_function *p )
 {
+   DUMP( __FUNCTION__ );
    emit_1ub(p, 0xc3);
 }
 
 void x86_sahf( struct x86_function *p )
 {
+   DUMP( __FUNCTION__ );
    emit_1ub(p, 0x9e);
 }
 
@@ -377,6 +500,7 @@ void x86_mov( struct x86_function *p,
 	      struct x86_reg dst,
 	      struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_op_modrm( p, 0x8b, 0x89, dst, src );
 }
 
@@ -384,6 +508,7 @@ void x86_xor( struct x86_function *p,
 	      struct x86_reg dst,
 	      struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_op_modrm( p, 0x33, 0x31, dst, src );
 }
 
@@ -391,6 +516,7 @@ void x86_cmp( struct x86_function *p,
 	      struct x86_reg dst,
 	      struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_op_modrm( p, 0x3b, 0x39, dst, src );
 }
 
@@ -398,6 +524,7 @@ void x86_lea( struct x86_function *p,
 	      struct x86_reg dst,
 	      struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_1ub(p, 0x8d);
    emit_modrm( p, dst, src );
 }
@@ -406,6 +533,7 @@ void x86_test( struct x86_function *p,
 	       struct x86_reg dst,
 	       struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_1ub(p, 0x85);
    emit_modrm( p, dst, src );
 }
@@ -414,6 +542,7 @@ void x86_add( struct x86_function *p,
 	       struct x86_reg dst,
 	       struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_op_modrm(p, 0x03, 0x01, dst, src );
 }
 
@@ -422,6 +551,7 @@ void x86_add( struct x86_function *p,
 void x86_mul( struct x86_function *p,
 	       struct x86_reg src )
 {
+   DUMP_R( __FUNCTION__,  src );
    emit_1ub(p, 0xf7);
    emit_modrm_noreg(p, 4, src );
 }
@@ -431,6 +561,7 @@ void x86_imul( struct x86_function *p,
 	       struct x86_reg dst,
 	       struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_2ub(p, X86_TWOB, 0xAF);
    emit_modrm(p, dst, src);
 }
@@ -440,6 +571,7 @@ void x86_sub( struct x86_function *p,
 	       struct x86_reg dst,
 	       struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_op_modrm(p, 0x2b, 0x29, dst, src );
 }
 
@@ -447,6 +579,7 @@ void x86_or( struct x86_function *p,
              struct x86_reg dst,
              struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_op_modrm( p, 0x0b, 0x09, dst, src );
 }
 
@@ -454,6 +587,7 @@ void x86_and( struct x86_function *p,
               struct x86_reg dst,
               struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_op_modrm( p, 0x23, 0x21, dst, src );
 }
 
@@ -468,6 +602,7 @@ void sse_movss( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_2ub(p, 0xF3, X86_TWOB);
    emit_op_modrm( p, 0x10, 0x11, dst, src );
 }
@@ -476,6 +611,7 @@ void sse_movaps( struct x86_function *p,
 		 struct x86_reg dst,
 		 struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_1ub(p, X86_TWOB);
    emit_op_modrm( p, 0x28, 0x29, dst, src );
 }
@@ -484,6 +620,7 @@ void sse_movups( struct x86_function *p,
 		 struct x86_reg dst,
 		 struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_1ub(p, X86_TWOB);
    emit_op_modrm( p, 0x10, 0x11, dst, src );
 }
@@ -492,6 +629,7 @@ void sse_movhps( struct x86_function *p,
 		 struct x86_reg dst,
 		 struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    assert(dst.mod != mod_REG || src.mod != mod_REG);
    emit_1ub(p, X86_TWOB);
    emit_op_modrm( p, 0x16, 0x17, dst, src ); /* cf movlhps */
@@ -501,6 +639,7 @@ void sse_movlps( struct x86_function *p,
 		 struct x86_reg dst,
 		 struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    assert(dst.mod != mod_REG || src.mod != mod_REG);
    emit_1ub(p, X86_TWOB);
    emit_op_modrm( p, 0x12, 0x13, dst, src ); /* cf movhlps */
@@ -510,6 +649,7 @@ void sse_maxps( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_2ub(p, X86_TWOB, 0x5F);
    emit_modrm( p, dst, src );
 }
@@ -518,6 +658,7 @@ void sse_maxss( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_3ub(p, 0xF3, X86_TWOB, 0x5F);
    emit_modrm( p, dst, src );
 }
@@ -526,6 +667,7 @@ void sse_divss( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_3ub(p, 0xF3, X86_TWOB, 0x5E);
    emit_modrm( p, dst, src );
 }
@@ -534,6 +676,7 @@ void sse_minps( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_2ub(p, X86_TWOB, 0x5D);
    emit_modrm( p, dst, src );
 }
@@ -542,6 +685,7 @@ void sse_subps( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_2ub(p, X86_TWOB, 0x5C);
    emit_modrm( p, dst, src );
 }
@@ -550,6 +694,7 @@ void sse_mulps( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_2ub(p, X86_TWOB, 0x59);
    emit_modrm( p, dst, src );
 }
@@ -558,6 +703,7 @@ void sse_mulss( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_3ub(p, 0xF3, X86_TWOB, 0x59);
    emit_modrm( p, dst, src );
 }
@@ -566,6 +712,7 @@ void sse_addps( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_2ub(p, X86_TWOB, 0x58);
    emit_modrm( p, dst, src );
 }
@@ -574,6 +721,7 @@ void sse_addss( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_3ub(p, 0xF3, X86_TWOB, 0x58);
    emit_modrm( p, dst, src );
 }
@@ -582,6 +730,7 @@ void sse_andnps( struct x86_function *p,
                  struct x86_reg dst,
                  struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_2ub(p, X86_TWOB, 0x55);
    emit_modrm( p, dst, src );
 }
@@ -590,6 +739,7 @@ void sse_andps( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_2ub(p, X86_TWOB, 0x54);
    emit_modrm( p, dst, src );
 }
@@ -598,6 +748,7 @@ void sse_rsqrtps( struct x86_function *p,
                   struct x86_reg dst,
                   struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_2ub(p, X86_TWOB, 0x52);
    emit_modrm( p, dst, src );
 }
@@ -606,6 +757,7 @@ void sse_rsqrtss( struct x86_function *p,
 		  struct x86_reg dst,
 		  struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_3ub(p, 0xF3, X86_TWOB, 0x52);
    emit_modrm( p, dst, src );
 
@@ -615,6 +767,7 @@ void sse_movhlps( struct x86_function *p,
 		  struct x86_reg dst,
 		  struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    assert(dst.mod == mod_REG && src.mod == mod_REG);
    emit_2ub(p, X86_TWOB, 0x12);
    emit_modrm( p, dst, src );
@@ -624,6 +777,7 @@ void sse_movlhps( struct x86_function *p,
 		  struct x86_reg dst,
 		  struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    assert(dst.mod == mod_REG && src.mod == mod_REG);
    emit_2ub(p, X86_TWOB, 0x16);
    emit_modrm( p, dst, src );
@@ -633,6 +787,7 @@ void sse_orps( struct x86_function *p,
                struct x86_reg dst,
                struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_2ub(p, X86_TWOB, 0x56);
    emit_modrm( p, dst, src );
 }
@@ -641,6 +796,7 @@ void sse_xorps( struct x86_function *p,
                 struct x86_reg dst,
                 struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_2ub(p, X86_TWOB, 0x57);
    emit_modrm( p, dst, src );
 }
@@ -649,6 +805,7 @@ void sse_cvtps2pi( struct x86_function *p,
 		   struct x86_reg dst,
 		   struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    assert(dst.file == file_MMX && 
 	  (src.file == file_XMM || src.mod != mod_REG));
 
@@ -662,6 +819,7 @@ void sse2_cvtdq2ps( struct x86_function *p,
 		   struct x86_reg dst,
 		   struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_2ub(p, X86_TWOB, 0x5b);
    emit_modrm( p, dst, src );
 }
@@ -671,31 +829,34 @@ void sse2_cvtdq2ps( struct x86_function *p,
  * arg0.
  */
 void sse_shufps( struct x86_function *p,
-		 struct x86_reg dest,
-		 struct x86_reg arg0,
+		 struct x86_reg dst,
+		 struct x86_reg src,
 		 unsigned char shuf) 
 {
+   DUMP_RRI( __FUNCTION__, dst, src, shuf );
    emit_2ub(p, X86_TWOB, 0xC6);
-   emit_modrm(p, dest, arg0);
+   emit_modrm(p, dst, src);
    emit_1ub(p, shuf); 
 }
 
 void sse_cmpps( struct x86_function *p,
-		struct x86_reg dest,
-		struct x86_reg arg0,
+		struct x86_reg dst,
+		struct x86_reg src,
 		unsigned char cc) 
 {
+   DUMP_RRI( "CMPPS", dst, src, cc );
    emit_2ub(p, X86_TWOB, 0xC2);
-   emit_modrm(p, dest, arg0);
+   emit_modrm(p, dst, src);
    emit_1ub(p, cc); 
 }
 
 void sse_pmovmskb( struct x86_function *p,
-                   struct x86_reg dest,
+                   struct x86_reg dst,
                    struct x86_reg src)
 {
-    emit_3ub(p, 0x66, X86_TWOB, 0xD7);
-    emit_modrm(p, dest, src);
+   DUMP_RR( __FUNCTION__, dst, src );
+   emit_3ub(p, 0x66, X86_TWOB, 0xD7);
+   emit_modrm(p, dst, src);
 }
 
 /***********************************************************************
@@ -706,12 +867,13 @@ void sse_pmovmskb( struct x86_function *p,
  * Perform a reduced swizzle:
  */
 void sse2_pshufd( struct x86_function *p,
-		  struct x86_reg dest,
-		  struct x86_reg arg0,
+		  struct x86_reg dst,
+		  struct x86_reg src,
 		  unsigned char shuf) 
 {
+   DUMP_RRI( __FUNCTION__, dst, src, shuf );
    emit_3ub(p, 0x66, X86_TWOB, 0x70);
-   emit_modrm(p, dest, arg0);
+   emit_modrm(p, dst, src);
    emit_1ub(p, shuf); 
 }
 
@@ -719,6 +881,7 @@ void sse2_cvttps2dq( struct x86_function *p,
                      struct x86_reg dst,
                      struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_3ub( p, 0xF3, X86_TWOB, 0x5B );
    emit_modrm( p, dst, src );
 }
@@ -727,6 +890,7 @@ void sse2_cvtps2dq( struct x86_function *p,
 		    struct x86_reg dst,
 		    struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_3ub(p, 0x66, X86_TWOB, 0x5B);
    emit_modrm( p, dst, src );
 }
@@ -735,6 +899,7 @@ void sse2_packssdw( struct x86_function *p,
 		    struct x86_reg dst,
 		    struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_3ub(p, 0x66, X86_TWOB, 0x6B);
    emit_modrm( p, dst, src );
 }
@@ -743,6 +908,7 @@ void sse2_packsswb( struct x86_function *p,
 		    struct x86_reg dst,
 		    struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_3ub(p, 0x66, X86_TWOB, 0x63);
    emit_modrm( p, dst, src );
 }
@@ -751,6 +917,7 @@ void sse2_packuswb( struct x86_function *p,
 		    struct x86_reg dst,
 		    struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_3ub(p, 0x66, X86_TWOB, 0x67);
    emit_modrm( p, dst, src );
 }
@@ -759,6 +926,7 @@ void sse2_punpcklbw( struct x86_function *p,
 		    struct x86_reg dst,
 		    struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_3ub(p, 0x66, X86_TWOB, 0x60);
    emit_modrm( p, dst, src );
 }
@@ -768,6 +936,7 @@ void sse2_rcpps( struct x86_function *p,
                  struct x86_reg dst,
                  struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_2ub(p, X86_TWOB, 0x53);
    emit_modrm( p, dst, src );
 }
@@ -776,6 +945,7 @@ void sse2_rcpss( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_3ub(p, 0xF3, X86_TWOB, 0x53);
    emit_modrm( p, dst, src );
 }
@@ -784,6 +954,7 @@ void sse2_movd( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    emit_2ub(p, 0x66, X86_TWOB);
    emit_op_modrm( p, 0x6e, 0x7e, dst, src );
 }
@@ -796,30 +967,35 @@ void sse2_movd( struct x86_function *p,
  */
 void x87_fist( struct x86_function *p, struct x86_reg dst )
 {
+   DUMP_R( __FUNCTION__, dst );
    emit_1ub(p, 0xdb);
    emit_modrm_noreg(p, 2, dst);
 }
 
 void x87_fistp( struct x86_function *p, struct x86_reg dst )
 {
+   DUMP_R( __FUNCTION__, dst );
    emit_1ub(p, 0xdb);
    emit_modrm_noreg(p, 3, dst);
 }
 
 void x87_fild( struct x86_function *p, struct x86_reg arg )
 {
+   DUMP_R( __FUNCTION__, arg );
    emit_1ub(p, 0xdf);
    emit_modrm_noreg(p, 0, arg);
 }
 
 void x87_fldz( struct x86_function *p )
 {
+   DUMP( __FUNCTION__ );
    emit_2ub(p, 0xd9, 0xee);
 }
 
 
 void x87_fldcw( struct x86_function *p, struct x86_reg arg )
 {
+   DUMP_R( __FUNCTION__, arg );
    assert(arg.file == file_REG32);
    assert(arg.mod != mod_REG);
    emit_1ub(p, 0xd9);
@@ -828,26 +1004,31 @@ void x87_fldcw( struct x86_function *p, struct x86_reg arg )
 
 void x87_fld1( struct x86_function *p )
 {
+   DUMP( __FUNCTION__ );
    emit_2ub(p, 0xd9, 0xe8);
 }
 
 void x87_fldl2e( struct x86_function *p )
 {
+   DUMP( __FUNCTION__ );
    emit_2ub(p, 0xd9, 0xea);
 }
 
 void x87_fldln2( struct x86_function *p )
 {
+   DUMP( __FUNCTION__ );
    emit_2ub(p, 0xd9, 0xed);
 }
 
 void x87_fwait( struct x86_function *p )
 {
+   DUMP( __FUNCTION__ );
    emit_1ub(p, 0x9b);
 }
 
 void x87_fnclex( struct x86_function *p )
 {
+   DUMP( __FUNCTION__ );
    emit_2ub(p, 0xdb, 0xe2);
 }
 
@@ -884,49 +1065,55 @@ static void x87_arith_op( struct x86_function *p, struct x86_reg dst, struct x86
       assert(0);
 }
 
-void x87_fmul( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
+void x87_fmul( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
 {
-   x87_arith_op(p, dst, arg, 
+   DUMP_RR( __FUNCTION__, dst, src );
+   x87_arith_op(p, dst, src, 
 		0xd8, 0xc8,
 		0xdc, 0xc8,
 		4);
 }
 
-void x87_fsub( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
+void x87_fsub( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
 {
-   x87_arith_op(p, dst, arg, 
+   DUMP_RR( __FUNCTION__, dst, src );
+   x87_arith_op(p, dst, src, 
 		0xd8, 0xe0,
 		0xdc, 0xe8,
 		4);
 }
 
-void x87_fsubr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
+void x87_fsubr( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
 {
-   x87_arith_op(p, dst, arg, 
+   DUMP_RR( __FUNCTION__, dst, src );
+   x87_arith_op(p, dst, src, 
 		0xd8, 0xe8,
 		0xdc, 0xe0,
 		5);
 }
 
-void x87_fadd( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
+void x87_fadd( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
 {
-   x87_arith_op(p, dst, arg, 
+   DUMP_RR( __FUNCTION__, dst, src );
+   x87_arith_op(p, dst, src, 
 		0xd8, 0xc0,
 		0xdc, 0xc0,
 		0);
 }
 
-void x87_fdiv( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
+void x87_fdiv( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
 {
-   x87_arith_op(p, dst, arg, 
+   DUMP_RR( __FUNCTION__, dst, src );
+   x87_arith_op(p, dst, src, 
 		0xd8, 0xf0,
 		0xdc, 0xf8,
 		6);
 }
 
-void x87_fdivr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
+void x87_fdivr( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
 {
-   x87_arith_op(p, dst, arg, 
+   DUMP_RR( __FUNCTION__, dst, src );
+   x87_arith_op(p, dst, src, 
 		0xd8, 0xf8,
 		0xdc, 0xf0,
 		7);
@@ -934,6 +1121,7 @@ void x87_fdivr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
 
 void x87_fmulp( struct x86_function *p, struct x86_reg dst )
 {
+   DUMP_R( __FUNCTION__, dst );
    assert(dst.file == file_x87);
    assert(dst.idx >= 1);
    emit_2ub(p, 0xde, 0xc8+dst.idx);
@@ -941,6 +1129,7 @@ void x87_fmulp( struct x86_function *p, struct x86_reg dst )
 
 void x87_fsubp( struct x86_function *p, struct x86_reg dst )
 {
+   DUMP_R( __FUNCTION__, dst );
    assert(dst.file == file_x87);
    assert(dst.idx >= 1);
    emit_2ub(p, 0xde, 0xe8+dst.idx);
@@ -948,6 +1137,7 @@ void x87_fsubp( struct x86_function *p, struct x86_reg dst )
 
 void x87_fsubrp( struct x86_function *p, struct x86_reg dst )
 {
+   DUMP_R( __FUNCTION__, dst );
    assert(dst.file == file_x87);
    assert(dst.idx >= 1);
    emit_2ub(p, 0xde, 0xe0+dst.idx);
@@ -955,6 +1145,7 @@ void x87_fsubrp( struct x86_function *p, struct x86_reg dst )
 
 void x87_faddp( struct x86_function *p, struct x86_reg dst )
 {
+   DUMP_R( __FUNCTION__, dst );
    assert(dst.file == file_x87);
    assert(dst.idx >= 1);
    emit_2ub(p, 0xde, 0xc0+dst.idx);
@@ -962,6 +1153,7 @@ void x87_faddp( struct x86_function *p, struct x86_reg dst )
 
 void x87_fdivp( struct x86_function *p, struct x86_reg dst )
 {
+   DUMP_R( __FUNCTION__, dst );
    assert(dst.file == file_x87);
    assert(dst.idx >= 1);
    emit_2ub(p, 0xde, 0xf8+dst.idx);
@@ -969,6 +1161,7 @@ void x87_fdivp( struct x86_function *p, struct x86_reg dst )
 
 void x87_fdivrp( struct x86_function *p, struct x86_reg dst )
 {
+   DUMP_R( __FUNCTION__, dst );
    assert(dst.file == file_x87);
    assert(dst.idx >= 1);
    emit_2ub(p, 0xde, 0xf0+dst.idx);
@@ -976,70 +1169,83 @@ void x87_fdivrp( struct x86_function *p, struct x86_reg dst )
 
 void x87_fucom( struct x86_function *p, struct x86_reg arg )
 {
+   DUMP_R( __FUNCTION__, arg );
    assert(arg.file == file_x87);
    emit_2ub(p, 0xdd, 0xe0+arg.idx);
 }
 
 void x87_fucomp( struct x86_function *p, struct x86_reg arg )
 {
+   DUMP_R( __FUNCTION__, arg );
    assert(arg.file == file_x87);
    emit_2ub(p, 0xdd, 0xe8+arg.idx);
 }
 
 void x87_fucompp( struct x86_function *p )
 {
+   DUMP( __FUNCTION__ );
    emit_2ub(p, 0xda, 0xe9);
 }
 
 void x87_fxch( struct x86_function *p, struct x86_reg arg )
 {
+   DUMP_R( __FUNCTION__, arg );
    assert(arg.file == file_x87);
    emit_2ub(p, 0xd9, 0xc8+arg.idx);
 }
 
 void x87_fabs( struct x86_function *p )
 {
+   DUMP( __FUNCTION__ );
    emit_2ub(p, 0xd9, 0xe1);
 }
 
 void x87_fchs( struct x86_function *p )
 {
+   DUMP( __FUNCTION__ );
    emit_2ub(p, 0xd9, 0xe0);
 }
 
 void x87_fcos( struct x86_function *p )
 {
+   DUMP( __FUNCTION__ );
    emit_2ub(p, 0xd9, 0xff);
 }
 
 
 void x87_fprndint( struct x86_function *p )
 {
+   DUMP( __FUNCTION__ );
    emit_2ub(p, 0xd9, 0xfc);
 }
 
 void x87_fscale( struct x86_function *p )
 {
+   DUMP( __FUNCTION__ );
    emit_2ub(p, 0xd9, 0xfd);
 }
 
 void x87_fsin( struct x86_function *p )
 {
+   DUMP( __FUNCTION__ );
    emit_2ub(p, 0xd9, 0xfe);
 }
 
 void x87_fsincos( struct x86_function *p )
 {
+   DUMP( __FUNCTION__ );
    emit_2ub(p, 0xd9, 0xfb);
 }
 
 void x87_fsqrt( struct x86_function *p )
 {
+   DUMP( __FUNCTION__ );
    emit_2ub(p, 0xd9, 0xfa);
 }
 
 void x87_fxtract( struct x86_function *p )
 {
+   DUMP( __FUNCTION__ );
    emit_2ub(p, 0xd9, 0xf4);
 }
 
@@ -1049,6 +1255,7 @@ void x87_fxtract( struct x86_function *p )
  */
 void x87_f2xm1( struct x86_function *p )
 {
+   DUMP( __FUNCTION__ );
    emit_2ub(p, 0xd9, 0xf0);
 }
 
@@ -1057,6 +1264,7 @@ void x87_f2xm1( struct x86_function *p )
  */
 void x87_fyl2x( struct x86_function *p )
 {
+   DUMP( __FUNCTION__ );
    emit_2ub(p, 0xd9, 0xf1);
 }
 
@@ -1067,12 +1275,14 @@ void x87_fyl2x( struct x86_function *p )
  */
 void x87_fyl2xp1( struct x86_function *p )
 {
+   DUMP( __FUNCTION__ );
    emit_2ub(p, 0xd9, 0xf9);
 }
 
 
 void x87_fld( struct x86_function *p, struct x86_reg arg )
 {
+   DUMP_R( __FUNCTION__, arg );
    if (arg.file == file_x87) 
       emit_2ub(p, 0xd9, 0xc0 + arg.idx);
    else {
@@ -1083,6 +1293,7 @@ void x87_fld( struct x86_function *p, struct x86_reg arg )
 
 void x87_fst( struct x86_function *p, struct x86_reg dst )
 {
+   DUMP_R( __FUNCTION__, dst );
    if (dst.file == file_x87) 
       emit_2ub(p, 0xdd, 0xd0 + dst.idx);
    else {
@@ -1093,6 +1304,7 @@ void x87_fst( struct x86_function *p, struct x86_reg dst )
 
 void x87_fstp( struct x86_function *p, struct x86_reg dst )
 {
+   DUMP_R( __FUNCTION__, dst );
    if (dst.file == file_x87) 
       emit_2ub(p, 0xdd, 0xd8 + dst.idx);
    else {
@@ -1103,6 +1315,7 @@ void x87_fstp( struct x86_function *p, struct x86_reg dst )
 
 void x87_fcom( struct x86_function *p, struct x86_reg dst )
 {
+   DUMP_R( __FUNCTION__, dst );
    if (dst.file == file_x87) 
       emit_2ub(p, 0xd8, 0xd0 + dst.idx);
    else {
@@ -1113,6 +1326,7 @@ void x87_fcom( struct x86_function *p, struct x86_reg dst )
 
 void x87_fcomp( struct x86_function *p, struct x86_reg dst )
 {
+   DUMP_R( __FUNCTION__, dst );
    if (dst.file == file_x87) 
       emit_2ub(p, 0xd8, 0xd8 + dst.idx);
    else {
@@ -1124,6 +1338,7 @@ void x87_fcomp( struct x86_function *p, struct x86_reg dst )
 
 void x87_fnstsw( struct x86_function *p, struct x86_reg dst )
 {
+   DUMP_R( __FUNCTION__, dst );
    assert(dst.file == file_REG32);
 
    if (dst.idx == reg_AX &&
@@ -1153,6 +1368,7 @@ void mmx_packssdw( struct x86_function *p,
 		   struct x86_reg dst,
 		   struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    assert(dst.file == file_MMX && 
 	  (src.file == file_MMX || src.mod != mod_REG));
 
@@ -1166,6 +1382,7 @@ void mmx_packuswb( struct x86_function *p,
 		   struct x86_reg dst,
 		   struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    assert(dst.file == file_MMX && 
 	  (src.file == file_MMX || src.mod != mod_REG));
 
@@ -1179,6 +1396,7 @@ void mmx_movd( struct x86_function *p,
 	       struct x86_reg dst,
 	       struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    p->need_emms = 1;
    emit_1ub(p, X86_TWOB);
    emit_op_modrm( p, 0x6e, 0x7e, dst, src );
@@ -1188,6 +1406,7 @@ void mmx_movq( struct x86_function *p,
 	       struct x86_reg dst,
 	       struct x86_reg src )
 {
+   DUMP_RR( __FUNCTION__, dst, src );
    p->need_emms = 1;
    emit_1ub(p, X86_TWOB);
    emit_op_modrm( p, 0x6f, 0x7f, dst, src );
-- 
cgit v1.2.3


From b17e123a8f20239e8e1fc6816ccf115d9ec57471 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Mon, 21 Apr 2008 19:09:38 +0100
Subject: rtasm: propagate errors in x86 emit

---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c | 34 +++++++++++++++++++++++++-----
 src/gallium/auxiliary/rtasm/rtasm_x86sse.h |  1 +
 2 files changed, 30 insertions(+), 5 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index f2c08c96a6..c2fe0e40f5 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -146,7 +146,10 @@ _fill(
 
 static void do_realloc( struct x86_function *p )
 {
-   if (p->size == 0) {
+   if (p->store == p->error_overflow) {
+      p->csr = p->store;
+   }
+   else if (p->size == 0) {
       p->size = 1024;
       p->store = rtasm_exec_malloc(p->size);
       p->csr = p->store;
@@ -156,10 +159,22 @@ static void do_realloc( struct x86_function *p )
       unsigned char *tmp = p->store;
       p->size *= 2;
       p->store = rtasm_exec_malloc(p->size);
-      memcpy(p->store, tmp, used);
-      p->csr = p->store + used;
+
+      if (p->store) {
+         memcpy(p->store, tmp, used);
+         p->csr = p->store + used;
+      }
+      else {
+         p->csr = p->store;
+      }
+
       rtasm_exec_free(tmp);
    }
+
+   if (p->store == NULL) {
+      p->store = p->csr = p->error_overflow;
+      p->size = 4;
+   }
 }
 
 /* Emit bytes to the instruction stream:
@@ -1440,12 +1455,17 @@ void x86_init_func_size( struct x86_function *p, unsigned code_size )
 {
    p->size = code_size;
    p->store = rtasm_exec_malloc(code_size);
+   if (p->store == NULL) {
+      p->store = p->error_overflow;
+   }
    p->csr = p->store;
 }
 
 void x86_release_func( struct x86_function *p )
 {
-   rtasm_exec_free(p->store);
+   if (p->store && p->store != p->error_overflow)
+      rtasm_exec_free(p->store);
+
    p->store = NULL;
    p->csr = NULL;
    p->size = 0;
@@ -1456,7 +1476,11 @@ void (*x86_get_func( struct x86_function *p ))(void)
 {
    if (DISASSEM && p->store)
       debug_printf("disassemble %p %p\n", p->store, p->csr);
-   return (void (*)(void)) p->store;
+
+   if (p->store == p->error_overflow)
+      return (void (*)(void)) NULL;
+   else
+      return (void (*)(void)) p->store;
 }
 
 #else
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index 5e99ceea70..695a1cef4e 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -43,6 +43,7 @@ struct x86_function {
    unsigned char *csr;
    unsigned stack_offset;
    int need_emms;
+   unsigned char error_overflow[4];
    const char *fn;
 };
 
-- 
cgit v1.2.3


From 73c2711bb186692b866720058a09f5eb05950213 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Mon, 21 Apr 2008 19:43:53 +0100
Subject: rtasm: clean up debug dumping a little

---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c | 286 ++++++++++++++---------------
 1 file changed, 140 insertions(+), 146 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index c2fe0e40f5..10796c540d 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -77,69 +77,60 @@ _print_reg(
       debug_printf( "]" );
 }
 
-static void
-_fill(
-   const char  *op )
-{
-   unsigned count = 10 - strlen( op );
 
-   while( count-- ) {
-      debug_printf( " " );
-   }
-}
+#define DUMP_START() debug_printf( "\n" )
+#define DUMP_END() debug_printf( "\n" )
 
-#define DUMP_START() debug_printf( "\nsse-dump start ----------------" )
-#define DUMP_END() debug_printf( "\nsse-dump end ----------------\n" )
-#define DUMP( OP ) debug_printf( "\n%s", OP )
+#define DUMP() do {                             \
+   const char *foo = __FUNCTION__;              \
+   while (*foo && *foo != '_')                  \
+      foo++;                                    \
+   if  (*foo)                                   \
+      foo++;                                    \
+   debug_printf( "\n% 15s ", foo );             \
+} while (0)
 
-#define DUMP_I( OP, I ) do {                    \
-   debug_printf( "\n%s", OP );                  \
-   _fill( OP );                                 \
+#define DUMP_I( I ) do {                        \
+   DUMP();                                      \
    debug_printf( "%u", I );                     \
 } while( 0 )
 
-#define DUMP_R( OP, R0 ) do {                   \
-   debug_printf( "\n%s", OP );                  \
-   _fill( OP );                                 \
+#define DUMP_R( R0 ) do {                       \
+   DUMP();                                      \
    _print_reg( R0 );                            \
 } while( 0 )
 
-#define DUMP_RR( OP, R0, R1 ) do {              \
-   debug_printf( "\n%s", OP );                  \
-   _fill( OP );                                 \
+#define DUMP_RR( R0, R1 ) do {                  \
+   DUMP();                                      \
    _print_reg( R0 );                            \
    debug_printf( ", " );                        \
    _print_reg( R1 );                            \
 } while( 0 )
 
-#define DUMP_RI( OP, R0, I ) do {               \
-   debug_printf( "\n%s", OP );                  \
-   _fill( OP );                                 \
+#define DUMP_RI( R0, I ) do {                   \
+   DUMP();                                      \
    _print_reg( R0 );                            \
-   debug_printf( ", " );                        \
-   debug_printf( "%u", I );                     \
+   debug_printf( ", %u", I );                   \
 } while( 0 )
 
-#define DUMP_RRI( OP, R0, R1, I ) do {          \
-   debug_printf( "\n%s", OP );                  \
-   _fill( OP );                                 \
+#define DUMP_RRI( R0, R1, I ) do {              \
+   DUMP();                                      \
    _print_reg( R0 );                            \
    debug_printf( ", " );                        \
    _print_reg( R1 );                            \
-   debug_printf( ", " );                        \
-   debug_printf( "%u", I );                     \
+   debug_printf( ", %u", I );                   \
 } while( 0 )
 
 #else
 
 #define DUMP_START()
 #define DUMP_END()
-#define DUMP( OP )
-#define DUMP_I( OP, I )
-#define DUMP_R( OP, R0 )
-#define DUMP_RR( OP, R0, R1 )
-#define DUMP_RI( OP, R0, I )
-#define DUMP_RRI( OP, R0, R1, I )
+#define DUMP( )
+#define DUMP_I( I )
+#define DUMP_R( R0 )
+#define DUMP_RR( R0, R1 )
+#define DUMP_RI( R0, I )
+#define DUMP_RRI( R0, R1, I )
 
 #endif
 
@@ -173,7 +164,7 @@ static void do_realloc( struct x86_function *p )
 
    if (p->store == NULL) {
       p->store = p->csr = p->error_overflow;
-      p->size = 4;
+      p->size = sizeof(p->error_overflow);
    }
 }
 
@@ -397,7 +388,7 @@ unsigned char *x86_jcc_forward( struct x86_function *p,
 
 unsigned char *x86_jmp_forward( struct x86_function *p)
 {
-   DUMP( __FUNCTION__ );
+   DUMP();
    emit_1ub(p, 0xe9);
    emit_1i(p, 0);
    return x86_get_label(p);
@@ -405,7 +396,7 @@ unsigned char *x86_jmp_forward( struct x86_function *p)
 
 unsigned char *x86_call_forward( struct x86_function *p)
 {
-   DUMP( __FUNCTION__ );
+   DUMP();
 
    emit_1ub(p, 0xe8);
    emit_1i(p, 0);
@@ -422,7 +413,7 @@ void x86_fixup_fwd_jump( struct x86_function *p,
 
 void x86_jmp( struct x86_function *p, unsigned char *label)
 {
-   DUMP_I( __FUNCTION__, label );
+   DUMP_I( label );
    emit_1ub(p, 0xe9);
    emit_1i(p, pointer_to_intptr( label ) - pointer_to_intptr( x86_get_label(p) ) - 4);
 }
@@ -439,14 +430,14 @@ static unsigned char *cptr( void (*label)() )
  */
 void x86_call( struct x86_function *p, void (*label)())
 {
-   DUMP_I( __FUNCTION__, label );
+   DUMP_I( label );
    emit_1ub(p, 0xe8);
    emit_1i(p, cptr(label) - x86_get_label(p) - 4);
 }
 #else
 void x86_call( struct x86_function *p, struct x86_reg reg)
 {
-   DUMP_R( __FUNCTION__, reg );
+   DUMP_R( reg );
    emit_1ub(p, 0xff);
    emit_modrm_noreg(p, 2, reg);
 }
@@ -459,7 +450,7 @@ void x86_call( struct x86_function *p, struct x86_reg reg)
  */
 void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm )
 {
-   DUMP_RI( __FUNCTION__, dst, imm );
+   DUMP_RI( dst, imm );
    assert(dst.mod == mod_REG);
    emit_1ub(p, 0xb8 + dst.idx);
    emit_1i(p, imm);
@@ -468,7 +459,7 @@ void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm )
 void x86_push( struct x86_function *p,
 	       struct x86_reg reg )
 {
-   DUMP_R( __FUNCTION__, reg );
+   DUMP_R( reg );
    assert(reg.mod == mod_REG);
    emit_1ub(p, 0x50 + reg.idx);
    p->stack_offset += 4;
@@ -477,7 +468,7 @@ void x86_push( struct x86_function *p,
 void x86_pop( struct x86_function *p,
 	      struct x86_reg reg )
 {
-   DUMP_R( __FUNCTION__, reg );
+   DUMP_R( reg );
    assert(reg.mod == mod_REG);
    emit_1ub(p, 0x58 + reg.idx);
    p->stack_offset -= 4;
@@ -486,7 +477,7 @@ void x86_pop( struct x86_function *p,
 void x86_inc( struct x86_function *p,
 	      struct x86_reg reg )
 {
-   DUMP_R( __FUNCTION__, reg );
+   DUMP_R( reg );
    assert(reg.mod == mod_REG);
    emit_1ub(p, 0x40 + reg.idx);
 }
@@ -494,20 +485,20 @@ void x86_inc( struct x86_function *p,
 void x86_dec( struct x86_function *p,
 	      struct x86_reg reg )
 {
-   DUMP_R( __FUNCTION__, reg );
+   DUMP_R( reg );
    assert(reg.mod == mod_REG);
    emit_1ub(p, 0x48 + reg.idx);
 }
 
 void x86_ret( struct x86_function *p )
 {
-   DUMP( __FUNCTION__ );
+   DUMP();
    emit_1ub(p, 0xc3);
 }
 
 void x86_sahf( struct x86_function *p )
 {
-   DUMP( __FUNCTION__ );
+   DUMP();
    emit_1ub(p, 0x9e);
 }
 
@@ -515,7 +506,7 @@ void x86_mov( struct x86_function *p,
 	      struct x86_reg dst,
 	      struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_op_modrm( p, 0x8b, 0x89, dst, src );
 }
 
@@ -523,7 +514,7 @@ void x86_xor( struct x86_function *p,
 	      struct x86_reg dst,
 	      struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_op_modrm( p, 0x33, 0x31, dst, src );
 }
 
@@ -531,7 +522,7 @@ void x86_cmp( struct x86_function *p,
 	      struct x86_reg dst,
 	      struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_op_modrm( p, 0x3b, 0x39, dst, src );
 }
 
@@ -539,7 +530,7 @@ void x86_lea( struct x86_function *p,
 	      struct x86_reg dst,
 	      struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_1ub(p, 0x8d);
    emit_modrm( p, dst, src );
 }
@@ -548,7 +539,7 @@ void x86_test( struct x86_function *p,
 	       struct x86_reg dst,
 	       struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_1ub(p, 0x85);
    emit_modrm( p, dst, src );
 }
@@ -557,7 +548,7 @@ void x86_add( struct x86_function *p,
 	       struct x86_reg dst,
 	       struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_op_modrm(p, 0x03, 0x01, dst, src );
 }
 
@@ -566,7 +557,7 @@ void x86_add( struct x86_function *p,
 void x86_mul( struct x86_function *p,
 	       struct x86_reg src )
 {
-   DUMP_R( __FUNCTION__,  src );
+   DUMP_R(  src );
    emit_1ub(p, 0xf7);
    emit_modrm_noreg(p, 4, src );
 }
@@ -576,7 +567,7 @@ void x86_imul( struct x86_function *p,
 	       struct x86_reg dst,
 	       struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_2ub(p, X86_TWOB, 0xAF);
    emit_modrm(p, dst, src);
 }
@@ -586,7 +577,7 @@ void x86_sub( struct x86_function *p,
 	       struct x86_reg dst,
 	       struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_op_modrm(p, 0x2b, 0x29, dst, src );
 }
 
@@ -594,7 +585,7 @@ void x86_or( struct x86_function *p,
              struct x86_reg dst,
              struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_op_modrm( p, 0x0b, 0x09, dst, src );
 }
 
@@ -602,7 +593,7 @@ void x86_and( struct x86_function *p,
               struct x86_reg dst,
               struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_op_modrm( p, 0x23, 0x21, dst, src );
 }
 
@@ -617,7 +608,7 @@ void sse_movss( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_2ub(p, 0xF3, X86_TWOB);
    emit_op_modrm( p, 0x10, 0x11, dst, src );
 }
@@ -626,7 +617,7 @@ void sse_movaps( struct x86_function *p,
 		 struct x86_reg dst,
 		 struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_1ub(p, X86_TWOB);
    emit_op_modrm( p, 0x28, 0x29, dst, src );
 }
@@ -635,7 +626,7 @@ void sse_movups( struct x86_function *p,
 		 struct x86_reg dst,
 		 struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_1ub(p, X86_TWOB);
    emit_op_modrm( p, 0x10, 0x11, dst, src );
 }
@@ -644,7 +635,7 @@ void sse_movhps( struct x86_function *p,
 		 struct x86_reg dst,
 		 struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    assert(dst.mod != mod_REG || src.mod != mod_REG);
    emit_1ub(p, X86_TWOB);
    emit_op_modrm( p, 0x16, 0x17, dst, src ); /* cf movlhps */
@@ -654,7 +645,7 @@ void sse_movlps( struct x86_function *p,
 		 struct x86_reg dst,
 		 struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    assert(dst.mod != mod_REG || src.mod != mod_REG);
    emit_1ub(p, X86_TWOB);
    emit_op_modrm( p, 0x12, 0x13, dst, src ); /* cf movhlps */
@@ -664,7 +655,7 @@ void sse_maxps( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_2ub(p, X86_TWOB, 0x5F);
    emit_modrm( p, dst, src );
 }
@@ -673,7 +664,7 @@ void sse_maxss( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_3ub(p, 0xF3, X86_TWOB, 0x5F);
    emit_modrm( p, dst, src );
 }
@@ -682,7 +673,7 @@ void sse_divss( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_3ub(p, 0xF3, X86_TWOB, 0x5E);
    emit_modrm( p, dst, src );
 }
@@ -691,7 +682,7 @@ void sse_minps( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_2ub(p, X86_TWOB, 0x5D);
    emit_modrm( p, dst, src );
 }
@@ -700,7 +691,7 @@ void sse_subps( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_2ub(p, X86_TWOB, 0x5C);
    emit_modrm( p, dst, src );
 }
@@ -709,7 +700,7 @@ void sse_mulps( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_2ub(p, X86_TWOB, 0x59);
    emit_modrm( p, dst, src );
 }
@@ -718,7 +709,7 @@ void sse_mulss( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_3ub(p, 0xF3, X86_TWOB, 0x59);
    emit_modrm( p, dst, src );
 }
@@ -727,7 +718,7 @@ void sse_addps( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_2ub(p, X86_TWOB, 0x58);
    emit_modrm( p, dst, src );
 }
@@ -736,7 +727,7 @@ void sse_addss( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_3ub(p, 0xF3, X86_TWOB, 0x58);
    emit_modrm( p, dst, src );
 }
@@ -745,7 +736,7 @@ void sse_andnps( struct x86_function *p,
                  struct x86_reg dst,
                  struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_2ub(p, X86_TWOB, 0x55);
    emit_modrm( p, dst, src );
 }
@@ -754,7 +745,7 @@ void sse_andps( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_2ub(p, X86_TWOB, 0x54);
    emit_modrm( p, dst, src );
 }
@@ -763,7 +754,7 @@ void sse_rsqrtps( struct x86_function *p,
                   struct x86_reg dst,
                   struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_2ub(p, X86_TWOB, 0x52);
    emit_modrm( p, dst, src );
 }
@@ -772,7 +763,7 @@ void sse_rsqrtss( struct x86_function *p,
 		  struct x86_reg dst,
 		  struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_3ub(p, 0xF3, X86_TWOB, 0x52);
    emit_modrm( p, dst, src );
 
@@ -782,7 +773,7 @@ void sse_movhlps( struct x86_function *p,
 		  struct x86_reg dst,
 		  struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    assert(dst.mod == mod_REG && src.mod == mod_REG);
    emit_2ub(p, X86_TWOB, 0x12);
    emit_modrm( p, dst, src );
@@ -792,7 +783,7 @@ void sse_movlhps( struct x86_function *p,
 		  struct x86_reg dst,
 		  struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    assert(dst.mod == mod_REG && src.mod == mod_REG);
    emit_2ub(p, X86_TWOB, 0x16);
    emit_modrm( p, dst, src );
@@ -802,7 +793,7 @@ void sse_orps( struct x86_function *p,
                struct x86_reg dst,
                struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_2ub(p, X86_TWOB, 0x56);
    emit_modrm( p, dst, src );
 }
@@ -811,7 +802,7 @@ void sse_xorps( struct x86_function *p,
                 struct x86_reg dst,
                 struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_2ub(p, X86_TWOB, 0x57);
    emit_modrm( p, dst, src );
 }
@@ -820,7 +811,7 @@ void sse_cvtps2pi( struct x86_function *p,
 		   struct x86_reg dst,
 		   struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    assert(dst.file == file_MMX && 
 	  (src.file == file_XMM || src.mod != mod_REG));
 
@@ -834,7 +825,7 @@ void sse2_cvtdq2ps( struct x86_function *p,
 		   struct x86_reg dst,
 		   struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_2ub(p, X86_TWOB, 0x5b);
    emit_modrm( p, dst, src );
 }
@@ -848,7 +839,7 @@ void sse_shufps( struct x86_function *p,
 		 struct x86_reg src,
 		 unsigned char shuf) 
 {
-   DUMP_RRI( __FUNCTION__, dst, src, shuf );
+   DUMP_RRI( dst, src, shuf );
    emit_2ub(p, X86_TWOB, 0xC6);
    emit_modrm(p, dst, src);
    emit_1ub(p, shuf); 
@@ -859,7 +850,7 @@ void sse_cmpps( struct x86_function *p,
 		struct x86_reg src,
 		unsigned char cc) 
 {
-   DUMP_RRI( "CMPPS", dst, src, cc );
+   DUMP_RRI( dst, src, cc );
    emit_2ub(p, X86_TWOB, 0xC2);
    emit_modrm(p, dst, src);
    emit_1ub(p, cc); 
@@ -869,7 +860,7 @@ void sse_pmovmskb( struct x86_function *p,
                    struct x86_reg dst,
                    struct x86_reg src)
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_3ub(p, 0x66, X86_TWOB, 0xD7);
    emit_modrm(p, dst, src);
 }
@@ -886,7 +877,7 @@ void sse2_pshufd( struct x86_function *p,
 		  struct x86_reg src,
 		  unsigned char shuf) 
 {
-   DUMP_RRI( __FUNCTION__, dst, src, shuf );
+   DUMP_RRI( dst, src, shuf );
    emit_3ub(p, 0x66, X86_TWOB, 0x70);
    emit_modrm(p, dst, src);
    emit_1ub(p, shuf); 
@@ -896,7 +887,7 @@ void sse2_cvttps2dq( struct x86_function *p,
                      struct x86_reg dst,
                      struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_3ub( p, 0xF3, X86_TWOB, 0x5B );
    emit_modrm( p, dst, src );
 }
@@ -905,7 +896,7 @@ void sse2_cvtps2dq( struct x86_function *p,
 		    struct x86_reg dst,
 		    struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_3ub(p, 0x66, X86_TWOB, 0x5B);
    emit_modrm( p, dst, src );
 }
@@ -914,7 +905,7 @@ void sse2_packssdw( struct x86_function *p,
 		    struct x86_reg dst,
 		    struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_3ub(p, 0x66, X86_TWOB, 0x6B);
    emit_modrm( p, dst, src );
 }
@@ -923,7 +914,7 @@ void sse2_packsswb( struct x86_function *p,
 		    struct x86_reg dst,
 		    struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_3ub(p, 0x66, X86_TWOB, 0x63);
    emit_modrm( p, dst, src );
 }
@@ -932,7 +923,7 @@ void sse2_packuswb( struct x86_function *p,
 		    struct x86_reg dst,
 		    struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_3ub(p, 0x66, X86_TWOB, 0x67);
    emit_modrm( p, dst, src );
 }
@@ -941,7 +932,7 @@ void sse2_punpcklbw( struct x86_function *p,
 		    struct x86_reg dst,
 		    struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_3ub(p, 0x66, X86_TWOB, 0x60);
    emit_modrm( p, dst, src );
 }
@@ -951,7 +942,7 @@ void sse2_rcpps( struct x86_function *p,
                  struct x86_reg dst,
                  struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_2ub(p, X86_TWOB, 0x53);
    emit_modrm( p, dst, src );
 }
@@ -960,7 +951,7 @@ void sse2_rcpss( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_3ub(p, 0xF3, X86_TWOB, 0x53);
    emit_modrm( p, dst, src );
 }
@@ -969,7 +960,7 @@ void sse2_movd( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    emit_2ub(p, 0x66, X86_TWOB);
    emit_op_modrm( p, 0x6e, 0x7e, dst, src );
 }
@@ -982,35 +973,35 @@ void sse2_movd( struct x86_function *p,
  */
 void x87_fist( struct x86_function *p, struct x86_reg dst )
 {
-   DUMP_R( __FUNCTION__, dst );
+   DUMP_R( dst );
    emit_1ub(p, 0xdb);
    emit_modrm_noreg(p, 2, dst);
 }
 
 void x87_fistp( struct x86_function *p, struct x86_reg dst )
 {
-   DUMP_R( __FUNCTION__, dst );
+   DUMP_R( dst );
    emit_1ub(p, 0xdb);
    emit_modrm_noreg(p, 3, dst);
 }
 
 void x87_fild( struct x86_function *p, struct x86_reg arg )
 {
-   DUMP_R( __FUNCTION__, arg );
+   DUMP_R( arg );
    emit_1ub(p, 0xdf);
    emit_modrm_noreg(p, 0, arg);
 }
 
 void x87_fldz( struct x86_function *p )
 {
-   DUMP( __FUNCTION__ );
+   DUMP();
    emit_2ub(p, 0xd9, 0xee);
 }
 
 
 void x87_fldcw( struct x86_function *p, struct x86_reg arg )
 {
-   DUMP_R( __FUNCTION__, arg );
+   DUMP_R( arg );
    assert(arg.file == file_REG32);
    assert(arg.mod != mod_REG);
    emit_1ub(p, 0xd9);
@@ -1019,31 +1010,31 @@ void x87_fldcw( struct x86_function *p, struct x86_reg arg )
 
 void x87_fld1( struct x86_function *p )
 {
-   DUMP( __FUNCTION__ );
+   DUMP();
    emit_2ub(p, 0xd9, 0xe8);
 }
 
 void x87_fldl2e( struct x86_function *p )
 {
-   DUMP( __FUNCTION__ );
+   DUMP();
    emit_2ub(p, 0xd9, 0xea);
 }
 
 void x87_fldln2( struct x86_function *p )
 {
-   DUMP( __FUNCTION__ );
+   DUMP();
    emit_2ub(p, 0xd9, 0xed);
 }
 
 void x87_fwait( struct x86_function *p )
 {
-   DUMP( __FUNCTION__ );
+   DUMP();
    emit_1ub(p, 0x9b);
 }
 
 void x87_fnclex( struct x86_function *p )
 {
-   DUMP( __FUNCTION__ );
+   DUMP();
    emit_2ub(p, 0xdb, 0xe2);
 }
 
@@ -1082,7 +1073,7 @@ static void x87_arith_op( struct x86_function *p, struct x86_reg dst, struct x86
 
 void x87_fmul( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    x87_arith_op(p, dst, src, 
 		0xd8, 0xc8,
 		0xdc, 0xc8,
@@ -1091,7 +1082,7 @@ void x87_fmul( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
 
 void x87_fsub( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    x87_arith_op(p, dst, src, 
 		0xd8, 0xe0,
 		0xdc, 0xe8,
@@ -1100,7 +1091,7 @@ void x87_fsub( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
 
 void x87_fsubr( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    x87_arith_op(p, dst, src, 
 		0xd8, 0xe8,
 		0xdc, 0xe0,
@@ -1109,7 +1100,7 @@ void x87_fsubr( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
 
 void x87_fadd( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    x87_arith_op(p, dst, src, 
 		0xd8, 0xc0,
 		0xdc, 0xc0,
@@ -1118,7 +1109,7 @@ void x87_fadd( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
 
 void x87_fdiv( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    x87_arith_op(p, dst, src, 
 		0xd8, 0xf0,
 		0xdc, 0xf8,
@@ -1127,7 +1118,7 @@ void x87_fdiv( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
 
 void x87_fdivr( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    x87_arith_op(p, dst, src, 
 		0xd8, 0xf8,
 		0xdc, 0xf0,
@@ -1136,7 +1127,7 @@ void x87_fdivr( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
 
 void x87_fmulp( struct x86_function *p, struct x86_reg dst )
 {
-   DUMP_R( __FUNCTION__, dst );
+   DUMP_R( dst );
    assert(dst.file == file_x87);
    assert(dst.idx >= 1);
    emit_2ub(p, 0xde, 0xc8+dst.idx);
@@ -1144,7 +1135,7 @@ void x87_fmulp( struct x86_function *p, struct x86_reg dst )
 
 void x87_fsubp( struct x86_function *p, struct x86_reg dst )
 {
-   DUMP_R( __FUNCTION__, dst );
+   DUMP_R( dst );
    assert(dst.file == file_x87);
    assert(dst.idx >= 1);
    emit_2ub(p, 0xde, 0xe8+dst.idx);
@@ -1152,7 +1143,7 @@ void x87_fsubp( struct x86_function *p, struct x86_reg dst )
 
 void x87_fsubrp( struct x86_function *p, struct x86_reg dst )
 {
-   DUMP_R( __FUNCTION__, dst );
+   DUMP_R( dst );
    assert(dst.file == file_x87);
    assert(dst.idx >= 1);
    emit_2ub(p, 0xde, 0xe0+dst.idx);
@@ -1160,7 +1151,7 @@ void x87_fsubrp( struct x86_function *p, struct x86_reg dst )
 
 void x87_faddp( struct x86_function *p, struct x86_reg dst )
 {
-   DUMP_R( __FUNCTION__, dst );
+   DUMP_R( dst );
    assert(dst.file == file_x87);
    assert(dst.idx >= 1);
    emit_2ub(p, 0xde, 0xc0+dst.idx);
@@ -1168,7 +1159,7 @@ void x87_faddp( struct x86_function *p, struct x86_reg dst )
 
 void x87_fdivp( struct x86_function *p, struct x86_reg dst )
 {
-   DUMP_R( __FUNCTION__, dst );
+   DUMP_R( dst );
    assert(dst.file == file_x87);
    assert(dst.idx >= 1);
    emit_2ub(p, 0xde, 0xf8+dst.idx);
@@ -1176,7 +1167,7 @@ void x87_fdivp( struct x86_function *p, struct x86_reg dst )
 
 void x87_fdivrp( struct x86_function *p, struct x86_reg dst )
 {
-   DUMP_R( __FUNCTION__, dst );
+   DUMP_R( dst );
    assert(dst.file == file_x87);
    assert(dst.idx >= 1);
    emit_2ub(p, 0xde, 0xf0+dst.idx);
@@ -1184,83 +1175,83 @@ void x87_fdivrp( struct x86_function *p, struct x86_reg dst )
 
 void x87_fucom( struct x86_function *p, struct x86_reg arg )
 {
-   DUMP_R( __FUNCTION__, arg );
+   DUMP_R( arg );
    assert(arg.file == file_x87);
    emit_2ub(p, 0xdd, 0xe0+arg.idx);
 }
 
 void x87_fucomp( struct x86_function *p, struct x86_reg arg )
 {
-   DUMP_R( __FUNCTION__, arg );
+   DUMP_R( arg );
    assert(arg.file == file_x87);
    emit_2ub(p, 0xdd, 0xe8+arg.idx);
 }
 
 void x87_fucompp( struct x86_function *p )
 {
-   DUMP( __FUNCTION__ );
+   DUMP();
    emit_2ub(p, 0xda, 0xe9);
 }
 
 void x87_fxch( struct x86_function *p, struct x86_reg arg )
 {
-   DUMP_R( __FUNCTION__, arg );
+   DUMP_R( arg );
    assert(arg.file == file_x87);
    emit_2ub(p, 0xd9, 0xc8+arg.idx);
 }
 
 void x87_fabs( struct x86_function *p )
 {
-   DUMP( __FUNCTION__ );
+   DUMP();
    emit_2ub(p, 0xd9, 0xe1);
 }
 
 void x87_fchs( struct x86_function *p )
 {
-   DUMP( __FUNCTION__ );
+   DUMP();
    emit_2ub(p, 0xd9, 0xe0);
 }
 
 void x87_fcos( struct x86_function *p )
 {
-   DUMP( __FUNCTION__ );
+   DUMP();
    emit_2ub(p, 0xd9, 0xff);
 }
 
 
 void x87_fprndint( struct x86_function *p )
 {
-   DUMP( __FUNCTION__ );
+   DUMP();
    emit_2ub(p, 0xd9, 0xfc);
 }
 
 void x87_fscale( struct x86_function *p )
 {
-   DUMP( __FUNCTION__ );
+   DUMP();
    emit_2ub(p, 0xd9, 0xfd);
 }
 
 void x87_fsin( struct x86_function *p )
 {
-   DUMP( __FUNCTION__ );
+   DUMP();
    emit_2ub(p, 0xd9, 0xfe);
 }
 
 void x87_fsincos( struct x86_function *p )
 {
-   DUMP( __FUNCTION__ );
+   DUMP();
    emit_2ub(p, 0xd9, 0xfb);
 }
 
 void x87_fsqrt( struct x86_function *p )
 {
-   DUMP( __FUNCTION__ );
+   DUMP();
    emit_2ub(p, 0xd9, 0xfa);
 }
 
 void x87_fxtract( struct x86_function *p )
 {
-   DUMP( __FUNCTION__ );
+   DUMP();
    emit_2ub(p, 0xd9, 0xf4);
 }
 
@@ -1270,7 +1261,7 @@ void x87_fxtract( struct x86_function *p )
  */
 void x87_f2xm1( struct x86_function *p )
 {
-   DUMP( __FUNCTION__ );
+   DUMP();
    emit_2ub(p, 0xd9, 0xf0);
 }
 
@@ -1279,7 +1270,7 @@ void x87_f2xm1( struct x86_function *p )
  */
 void x87_fyl2x( struct x86_function *p )
 {
-   DUMP( __FUNCTION__ );
+   DUMP();
    emit_2ub(p, 0xd9, 0xf1);
 }
 
@@ -1290,14 +1281,14 @@ void x87_fyl2x( struct x86_function *p )
  */
 void x87_fyl2xp1( struct x86_function *p )
 {
-   DUMP( __FUNCTION__ );
+   DUMP();
    emit_2ub(p, 0xd9, 0xf9);
 }
 
 
 void x87_fld( struct x86_function *p, struct x86_reg arg )
 {
-   DUMP_R( __FUNCTION__, arg );
+   DUMP_R( arg );
    if (arg.file == file_x87) 
       emit_2ub(p, 0xd9, 0xc0 + arg.idx);
    else {
@@ -1308,7 +1299,7 @@ void x87_fld( struct x86_function *p, struct x86_reg arg )
 
 void x87_fst( struct x86_function *p, struct x86_reg dst )
 {
-   DUMP_R( __FUNCTION__, dst );
+   DUMP_R( dst );
    if (dst.file == file_x87) 
       emit_2ub(p, 0xdd, 0xd0 + dst.idx);
    else {
@@ -1319,7 +1310,7 @@ void x87_fst( struct x86_function *p, struct x86_reg dst )
 
 void x87_fstp( struct x86_function *p, struct x86_reg dst )
 {
-   DUMP_R( __FUNCTION__, dst );
+   DUMP_R( dst );
    if (dst.file == file_x87) 
       emit_2ub(p, 0xdd, 0xd8 + dst.idx);
    else {
@@ -1330,7 +1321,7 @@ void x87_fstp( struct x86_function *p, struct x86_reg dst )
 
 void x87_fcom( struct x86_function *p, struct x86_reg dst )
 {
-   DUMP_R( __FUNCTION__, dst );
+   DUMP_R( dst );
    if (dst.file == file_x87) 
       emit_2ub(p, 0xd8, 0xd0 + dst.idx);
    else {
@@ -1341,7 +1332,7 @@ void x87_fcom( struct x86_function *p, struct x86_reg dst )
 
 void x87_fcomp( struct x86_function *p, struct x86_reg dst )
 {
-   DUMP_R( __FUNCTION__, dst );
+   DUMP_R( dst );
    if (dst.file == file_x87) 
       emit_2ub(p, 0xd8, 0xd8 + dst.idx);
    else {
@@ -1353,7 +1344,7 @@ void x87_fcomp( struct x86_function *p, struct x86_reg dst )
 
 void x87_fnstsw( struct x86_function *p, struct x86_reg dst )
 {
-   DUMP_R( __FUNCTION__, dst );
+   DUMP_R( dst );
    assert(dst.file == file_REG32);
 
    if (dst.idx == reg_AX &&
@@ -1383,7 +1374,7 @@ void mmx_packssdw( struct x86_function *p,
 		   struct x86_reg dst,
 		   struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    assert(dst.file == file_MMX && 
 	  (src.file == file_MMX || src.mod != mod_REG));
 
@@ -1397,7 +1388,7 @@ void mmx_packuswb( struct x86_function *p,
 		   struct x86_reg dst,
 		   struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    assert(dst.file == file_MMX && 
 	  (src.file == file_MMX || src.mod != mod_REG));
 
@@ -1411,7 +1402,7 @@ void mmx_movd( struct x86_function *p,
 	       struct x86_reg dst,
 	       struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    p->need_emms = 1;
    emit_1ub(p, X86_TWOB);
    emit_op_modrm( p, 0x6e, 0x7e, dst, src );
@@ -1421,7 +1412,7 @@ void mmx_movq( struct x86_function *p,
 	       struct x86_reg dst,
 	       struct x86_reg src )
 {
-   DUMP_RR( __FUNCTION__, dst, src );
+   DUMP_RR( dst, src );
    p->need_emms = 1;
    emit_1ub(p, X86_TWOB);
    emit_op_modrm( p, 0x6f, 0x7f, dst, src );
@@ -1449,6 +1440,7 @@ void x86_init_func( struct x86_function *p )
    p->size = 0;
    p->store = NULL;
    p->csr = p->store;
+   DUMP_START();
 }
 
 void x86_init_func_size( struct x86_function *p, unsigned code_size )
@@ -1459,6 +1451,7 @@ void x86_init_func_size( struct x86_function *p, unsigned code_size )
       p->store = p->error_overflow;
    }
    p->csr = p->store;
+   DUMP_START();
 }
 
 void x86_release_func( struct x86_function *p )
@@ -1474,6 +1467,7 @@ void x86_release_func( struct x86_function *p )
 
 void (*x86_get_func( struct x86_function *p ))(void)
 {
+   DUMP_END();
    if (DISASSEM && p->store)
       debug_printf("disassemble %p %p\n", p->store, p->csr);
 
-- 
cgit v1.2.3


From 73706deef59c35472d2410411403f30c9603f22f Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Mon, 21 Apr 2008 19:48:08 +0100
Subject: rtasm: quieten sse_enabled debug

---
 src/gallium/auxiliary/rtasm/rtasm_cpu.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_cpu.c b/src/gallium/auxiliary/rtasm/rtasm_cpu.c
index 175245a9f6..f01e12faa0 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_cpu.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_cpu.c
@@ -32,7 +32,16 @@
 
 static boolean rtasm_sse_enabled(void)
 {
-   return !debug_get_bool_option("GALLIUM_NOSSE", FALSE);
+   static boolean firsttime = 1;
+   static boolean enabled;
+   
+   /* This gets called quite often at the moment:
+    */
+   if (firsttime) {
+      enabled =  !debug_get_bool_option("GALLIUM_NOSSE", FALSE);
+      firsttime = FALSE;
+   }
+   return enabled;
 }
 
 int rtasm_cpu_has_sse(void)
-- 
cgit v1.2.3


From a945420ae6f96f0d7024f97e37ffd31329865a84 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Mon, 21 Apr 2008 19:48:21 +0100
Subject: rtasm: debug some missing funcs

---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index 10796c540d..3cd45d7dd9 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -364,6 +364,7 @@ void x86_jcc( struct x86_function *p,
 	      unsigned char *label )
 {
    intptr_t offset = pointer_to_intptr( label ) - (pointer_to_intptr( x86_get_label(p) ) + 2);
+   DUMP_I(cc);
    
    if (offset <= 127 && offset >= -128) {
       emit_1ub(p, 0x70 + cc);
@@ -381,6 +382,7 @@ void x86_jcc( struct x86_function *p,
 unsigned char *x86_jcc_forward( struct x86_function *p,
 			  enum x86_cc cc )
 {
+   DUMP_I(cc);
    emit_2ub(p, 0x0f, 0x80 + cc);
    emit_1i(p, 0);
    return x86_get_label(p);
@@ -1365,6 +1367,7 @@ void x87_fnstsw( struct x86_function *p, struct x86_reg dst )
 
 void mmx_emms( struct x86_function *p )
 {
+   DUMP();
    assert(p->need_emms);
    emit_2ub(p, 0x0f, 0x77);
    p->need_emms = 0;
-- 
cgit v1.2.3


From e3c415995706d2dda7c34a227e2e24d0745763ec Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Sun, 27 Apr 2008 21:09:45 +0900
Subject: rtasm: Implement x86_retw.

---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c | 6 ++++++
 src/gallium/auxiliary/rtasm/rtasm_x86sse.h | 1 +
 2 files changed, 7 insertions(+)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index 3cd45d7dd9..e6cbe9967f 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -498,6 +498,12 @@ void x86_ret( struct x86_function *p )
    emit_1ub(p, 0xc3);
 }
 
+void x86_retw( struct x86_function *p, unsigned short imm )
+{
+   DUMP();
+   emit_3ub(p, 0xc2, imm & 0xff, (imm >> 8) & 0xff);
+}
+
 void x86_sahf( struct x86_function *p )
 {
    DUMP();
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index 695a1cef4e..1962b07bc5 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -219,6 +219,7 @@ void x86_or( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_pop( struct x86_function *p, struct x86_reg reg );
 void x86_push( struct x86_function *p, struct x86_reg reg );
 void x86_ret( struct x86_function *p );
+void x86_retw( struct x86_function *p, unsigned short imm );
 void x86_sub( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_test( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_xor( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-- 
cgit v1.2.3


From 58d3dff0d3115ddd5397b7f77b5bcf4f9ca616b6 Mon Sep 17 00:00:00 2001
From: Michal Krol <michal@ubuntu-vbox.(none)>
Date: Mon, 28 Apr 2008 18:50:27 +0200
Subject: gallium: Generate SSE code to swizzle and unswizzle vs inputs and
 outputs.

Change SSE_SWIZZLES #define to 0 to disable it.
---
 .../auxiliary/draw/draw_pt_fetch_shade_pipeline.c  |   3 +-
 src/gallium/auxiliary/draw/draw_vs_sse.c           |  52 ++++++--
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c         |  14 ++
 src/gallium/auxiliary/rtasm/rtasm_x86sse.h         |   2 +
 src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c        | 142 ++++++++++++++++++++-
 src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h        |   4 +-
 src/gallium/drivers/softpipe/sp_fs_sse.c           |   2 +-
 7 files changed, 204 insertions(+), 15 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
index f0763dad8d..4ec20493c4 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
@@ -109,9 +109,10 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle,
    struct draw_context *draw = fpme->draw;
    struct draw_vertex_shader *shader = draw->vertex_shader;
    unsigned opt = fpme->opt;
+   unsigned alloc_count = align_int( fetch_count, 4 );
 
    struct vertex_header *pipeline_verts = 
-      (struct vertex_header *)MALLOC(fpme->vertex_size * fetch_count);
+      (struct vertex_header *)MALLOC(fpme->vertex_size * alloc_count);
 
    if (!pipeline_verts) {
       /* Not much we can do here - just skip the rendering.
diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c
index b1e9f67114..07f85bc448 100644
--- a/src/gallium/auxiliary/draw/draw_vs_sse.c
+++ b/src/gallium/auxiliary/draw/draw_vs_sse.c
@@ -47,14 +47,29 @@
 #include "tgsi/util/tgsi_parse.h"
 
 #define SSE_MAX_VERTICES 4
+#define SSE_SWIZZLES 1
 
+#if SSE_SWIZZLES
+typedef void (XSTDCALL *codegen_function) (
+   const struct tgsi_exec_vector *input,
+   struct tgsi_exec_vector *output,
+   float (*constant)[4],
+   struct tgsi_exec_vector *temporary,
+   float (*immediates)[4],
+   const float (*aos_input)[4],
+   uint num_inputs,
+   uint input_stride,
+   float (*aos_output)[4],
+   uint num_outputs,
+   uint output_stride );
+#else
 typedef void (XSTDCALL *codegen_function) (
    const struct tgsi_exec_vector *input,
    struct tgsi_exec_vector *output,
    float (*constant)[4],
    struct tgsi_exec_vector *temporary,
    float (*immediates)[4] );
-
+#endif
 
 struct draw_sse_vertex_shader {
    struct draw_vertex_shader base;
@@ -91,12 +106,31 @@ vs_sse_run_linear( struct draw_vertex_shader *base,
 {
    struct draw_sse_vertex_shader *shader = (struct draw_sse_vertex_shader *)base;
    struct tgsi_exec_machine *machine = shader->machine;
-   unsigned int i, j;
-   unsigned slot;
+   unsigned int i;
 
    for (i = 0; i < count; i += MAX_TGSI_VERTICES) {
       unsigned int max_vertices = MIN2(MAX_TGSI_VERTICES, count - i);
 
+#if SSE_SWIZZLES
+      /* run compiled shader
+       */
+      shader->func(machine->Inputs,
+		   machine->Outputs,
+		   (float (*)[4])constants,
+		   machine->Temps,
+		   shader->immediates,
+                   input,
+                   base->info.num_inputs,
+                   input_stride,
+                   output,
+                   base->info.num_outputs,
+                   output_stride );
+
+      input = (const float (*)[4])((const char *)input + input_stride * max_vertices);
+      output = (float (*)[4])((char *)output + output_stride * max_vertices);
+#else
+      unsigned int j, slot;
+
       /* Swizzle inputs.  
        */
       for (j = 0; j < max_vertices; j++) {
@@ -105,10 +139,10 @@ vs_sse_run_linear( struct draw_vertex_shader *base,
             machine->Inputs[slot].xyzw[1].f[j] = input[slot][1];
             machine->Inputs[slot].xyzw[2].f[j] = input[slot][2];
             machine->Inputs[slot].xyzw[3].f[j] = input[slot][3];
-         }
+         } 
 
 	 input = (const float (*)[4])((const char *)input + input_stride);
-      } 
+      }
 
       /* run compiled shader
        */
@@ -118,7 +152,6 @@ vs_sse_run_linear( struct draw_vertex_shader *base,
 		   machine->Temps,
 		   shader->immediates);
 
-
       /* Unswizzle all output results.  
        */
       for (j = 0; j < max_vertices; j++) {
@@ -127,10 +160,11 @@ vs_sse_run_linear( struct draw_vertex_shader *base,
             output[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
             output[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
             output[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
-         }
+         } 
 
 	 output = (float (*)[4])((char *)output + output_stride);
-      } 
+      }
+#endif
    }
 }
 
@@ -176,7 +210,7 @@ draw_create_vs_sse(struct draw_context *draw,
    x86_init_func( &vs->sse2_program );
 
    if (!tgsi_emit_sse2( (struct tgsi_token *) vs->base.state.tokens,
-			&vs->sse2_program, vs->immediates )) 
+			&vs->sse2_program, vs->immediates, SSE_SWIZZLES )) 
       goto fail;
       
    vs->func = (codegen_function) x86_get_func( &vs->sse2_program );
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index e6cbe9967f..d7e2230557 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -853,6 +853,20 @@ void sse_shufps( struct x86_function *p,
    emit_1ub(p, shuf); 
 }
 
+void sse_unpckhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_2ub( p, X86_TWOB, 0x15 );
+   emit_modrm( p, dst, src );
+}
+
+void sse_unpcklps( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_2ub( p, X86_TWOB, 0x14 );
+   emit_modrm( p, dst, src );
+}
+
 void sse_cmpps( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src,
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index 1962b07bc5..ad79b1facf 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -203,6 +203,8 @@ void sse_rsqrtps( struct x86_function *p, struct x86_reg dst, struct x86_reg src
 void sse_rsqrtss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_shufps( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0,
                  unsigned char shuf );
+void sse_unpckhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse_unpcklps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_pmovmskb( struct x86_function *p, struct x86_reg dest, struct x86_reg src );
 void sse2_punpcklbw( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
index 9061e00b63..86ca16c246 100755
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
@@ -1788,7 +1788,6 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_RET:
-   case TGSI_OPCODE_END:
 #ifdef WIN32
       emit_retw( func, 16 );
 #else
@@ -1796,6 +1795,9 @@ emit_instruction(
 #endif
       break;
 
+   case TGSI_OPCODE_END:
+      break;
+
    case TGSI_OPCODE_SSG:
       return 0;
       break;
@@ -2027,6 +2029,127 @@ emit_declaration(
    }
 }
 
+static void aos_to_soa( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
+{
+   struct x86_reg soa_input;
+   struct x86_reg aos_input;
+   struct x86_reg num_inputs;
+   struct x86_reg temp;
+   unsigned char *inner_loop;
+
+   soa_input = x86_make_reg( file_REG32, reg_AX );
+   aos_input = x86_make_reg( file_REG32, reg_BX );
+   num_inputs = x86_make_reg( file_REG32, reg_CX );
+   temp = x86_make_reg( file_REG32, reg_DX );
+
+   /* Save EBX */
+   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
+
+   x86_mov( func, soa_input, get_argument( soa + 1 ) );
+   x86_mov( func, aos_input, get_argument( aos + 1 ) );
+   x86_mov( func, num_inputs, get_argument( num + 1 ) );
+
+   inner_loop = x86_get_label( func );
+
+   x86_mov( func, temp, get_argument( stride + 1 ) );
+   x86_push( func, aos_input );
+   sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
+   sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
+   x86_add( func, aos_input, temp );
+   sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
+   sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
+   x86_add( func, aos_input, temp );
+   sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
+   sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
+   x86_add( func, aos_input, temp );
+   sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
+   sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
+   x86_pop( func, aos_input );
+
+   sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
+   sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
+   sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
+   sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
+   sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
+   sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
+
+   sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
+   sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
+   sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
+   sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
+
+   /* Advance to next input */
+   x86_mov_reg_imm( func, temp, 16 );
+   x86_add( func, aos_input, temp );
+   x86_mov_reg_imm( func, temp, 64 );
+   x86_add( func, soa_input, temp );
+   x86_dec( func, num_inputs );
+   x86_jcc( func, cc_NE, inner_loop );
+
+   /* Restore EBX */
+   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
+}
+
+static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride )
+{
+   struct x86_reg soa_output;
+   struct x86_reg aos_output;
+   struct x86_reg num_outputs;
+   struct x86_reg temp;
+   unsigned char *inner_loop;
+
+   soa_output = x86_make_reg( file_REG32, reg_AX );
+   aos_output = x86_make_reg( file_REG32, reg_BX );
+   num_outputs = x86_make_reg( file_REG32, reg_CX );
+   temp = x86_make_reg( file_REG32, reg_DX );
+
+   /* Save EBX */
+   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
+
+   x86_mov( func, soa_output, get_argument( soa + 1 ) );
+   x86_mov( func, aos_output, get_argument( aos + 1 ) );
+   x86_mov( func, num_outputs, get_argument( num + 1 ) );
+
+   inner_loop = x86_get_label( func );
+
+   sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
+   sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
+   sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
+   sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
+
+   sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
+   sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
+   sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
+   sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
+   sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
+   sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
+
+   x86_mov( func, temp, get_argument( stride + 1 ) );
+   x86_push( func, aos_output );
+   sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
+   sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
+   x86_add( func, aos_output, temp );
+   sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
+   sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
+   x86_add( func, aos_output, temp );
+   sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
+   sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
+   x86_add( func, aos_output, temp );
+   sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
+   sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
+   x86_pop( func, aos_output );
+
+   /* Advance to next output */
+   x86_mov_reg_imm( func, temp, 16 );
+   x86_add( func, aos_output, temp );
+   x86_mov_reg_imm( func, temp, 64 );
+   x86_add( func, soa_output, temp );
+   x86_dec( func, num_outputs );
+   x86_jcc( func, cc_NE, inner_loop );
+
+   /* Restore EBX */
+   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
+}
 
 /**
  * Translate a TGSI vertex/fragment shader to SSE2 code.
@@ -2048,7 +2171,8 @@ unsigned
 tgsi_emit_sse2(
    const struct tgsi_token *tokens,
    struct x86_function *func,
-   float (*immediates)[4])
+   float (*immediates)[4],
+   boolean do_swizzles )
 {
    struct tgsi_parse_context parse;
    boolean instruction_phase = FALSE;
@@ -2089,6 +2213,9 @@ tgsi_emit_sse2(
    else {
       assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
 
+      if (do_swizzles)
+         aos_to_soa( func, 5, 0, 6, 7 );
+
       x86_mov(
          func,
          get_input_base(),
@@ -2176,6 +2303,17 @@ tgsi_emit_sse2(
       }
    }
 
+   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
+      if (do_swizzles)
+         soa_to_aos( func, 8, 1, 9, 10 );
+   }
+
+#ifdef WIN32
+   emit_retw( func, 16 );
+#else
+   emit_ret( func );
+#endif
+
    tgsi_parse_free( &parse );
 
    return ok;
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h
index 063287dc5e..e66d115283 100755
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h
@@ -12,8 +12,8 @@ unsigned
 tgsi_emit_sse2(
    const struct tgsi_token *tokens,
    struct x86_function *function,
-   float (*immediates)[4]
- );
+   float (*immediates)[4],
+   boolean do_swizzles );
 
 #if defined __cplusplus
 }
diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c
index f857d26143..4d569e1e22 100644
--- a/src/gallium/drivers/softpipe/sp_fs_sse.c
+++ b/src/gallium/drivers/softpipe/sp_fs_sse.c
@@ -133,7 +133,7 @@ softpipe_create_fs_sse(struct softpipe_context *softpipe,
    x86_init_func( &shader->sse2_program );
    
    if (!tgsi_emit_sse2( templ->tokens, &shader->sse2_program,
-                        shader->immediates)) {
+                        shader->immediates, FALSE )) {
       FREE(shader);
       return NULL;
    }
-- 
cgit v1.2.3


From 727257f32002544658219d2e0163993c1cbc5644 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Thu, 1 May 2008 15:31:17 +0100
Subject: rtasm: assert stack is fully popped in return

---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index d7e2230557..40f6f973d6 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -495,6 +495,7 @@ void x86_dec( struct x86_function *p,
 void x86_ret( struct x86_function *p )
 {
    DUMP();
+   assert(p->stack_offset == 0);
    emit_1ub(p, 0xc3);
 }
 
-- 
cgit v1.2.3


From fb3623b235f5caa9d76e656b1e5eda797c7c73eb Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Thu, 1 May 2008 20:41:03 +0100
Subject: rtasm: fix labels after (not so) recent change to allow dynamic fn
 growth

Using char * for labels doesn't work if you realloc the function
during assembly and free the old storage...
---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c      | 49 +++++++++----------------
 src/gallium/auxiliary/rtasm/rtasm_x86sse.h      | 14 +++----
 src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c     |  4 +-
 src/gallium/auxiliary/translate/translate_sse.c |  2 +-
 4 files changed, 28 insertions(+), 41 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index 40f6f973d6..e69251f072 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -347,9 +347,9 @@ struct x86_reg x86_get_base_reg( struct x86_reg reg )
    return x86_make_reg( reg.file, reg.idx );
 }
 
-unsigned char *x86_get_label( struct x86_function *p )
+int x86_get_label( struct x86_function *p )
 {
-   return p->csr;
+   return p->csr - p->store;
 }
 
 
@@ -361,17 +361,22 @@ unsigned char *x86_get_label( struct x86_function *p )
 
 void x86_jcc( struct x86_function *p,
 	      enum x86_cc cc,
-	      unsigned char *label )
+	      int label )
 {
-   intptr_t offset = pointer_to_intptr( label ) - (pointer_to_intptr( x86_get_label(p) ) + 2);
+   int offset = label - (x86_get_label(p) + 2);
    DUMP_I(cc);
    
+   if (offset < 0) {
+      int amt = p->csr - p->store;
+      assert(amt > -offset);
+   }
+
    if (offset <= 127 && offset >= -128) {
       emit_1ub(p, 0x70 + cc);
       emit_1b(p, (char) offset);
    }
    else {
-      offset = pointer_to_intptr( label ) - (pointer_to_intptr( x86_get_label(p) ) + 6);
+      offset = label - (x86_get_label(p) + 6);
       emit_2ub(p, 0x0f, 0x80 + cc);
       emit_1i(p, offset);
    }
@@ -379,8 +384,8 @@ void x86_jcc( struct x86_function *p,
 
 /* Always use a 32bit offset for forward jumps:
  */
-unsigned char *x86_jcc_forward( struct x86_function *p,
-			  enum x86_cc cc )
+int x86_jcc_forward( struct x86_function *p,
+                     enum x86_cc cc )
 {
    DUMP_I(cc);
    emit_2ub(p, 0x0f, 0x80 + cc);
@@ -388,7 +393,7 @@ unsigned char *x86_jcc_forward( struct x86_function *p,
    return x86_get_label(p);
 }
 
-unsigned char *x86_jmp_forward( struct x86_function *p)
+int x86_jmp_forward( struct x86_function *p)
 {
    DUMP();
    emit_1ub(p, 0xe9);
@@ -396,7 +401,7 @@ unsigned char *x86_jmp_forward( struct x86_function *p)
    return x86_get_label(p);
 }
 
-unsigned char *x86_call_forward( struct x86_function *p)
+int x86_call_forward( struct x86_function *p)
 {
    DUMP();
 
@@ -408,42 +413,24 @@ unsigned char *x86_call_forward( struct x86_function *p)
 /* Fixup offset from forward jump:
  */
 void x86_fixup_fwd_jump( struct x86_function *p,
-			 unsigned char *fixup )
+			 int fixup )
 {
-   *(int *)(fixup - 4) = pointer_to_intptr( x86_get_label(p) ) - pointer_to_intptr( fixup );
+   *(int *)(p->store + fixup - 4) = x86_get_label(p) - fixup;
 }
 
-void x86_jmp( struct x86_function *p, unsigned char *label)
+void x86_jmp( struct x86_function *p, int label)
 {
    DUMP_I( label );
    emit_1ub(p, 0xe9);
-   emit_1i(p, pointer_to_intptr( label ) - pointer_to_intptr( x86_get_label(p) ) - 4);
-}
-
-#if 0
-static unsigned char *cptr( void (*label)() )
-{
-   return (unsigned char *) label;
+   emit_1i(p, label - x86_get_label(p) - 4);
 }
 
-/* This doesn't work once we start reallocating & copying the
- * generated code on buffer fills, because the call is relative to the
- * current pc.
- */
-void x86_call( struct x86_function *p, void (*label)())
-{
-   DUMP_I( label );
-   emit_1ub(p, 0xe8);
-   emit_1i(p, cptr(label) - x86_get_label(p) - 4);
-}
-#else
 void x86_call( struct x86_function *p, struct x86_reg reg)
 {
    DUMP_R( reg );
    emit_1ub(p, 0xff);
    emit_modrm_noreg(p, 2, reg);
 }
-#endif
 
 
 /* michal:
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index ad79b1facf..eacaeeaf6f 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -124,23 +124,23 @@ struct x86_reg x86_get_base_reg( struct x86_reg reg );
 
 /* Labels, jumps and fixup:
  */
-unsigned char *x86_get_label( struct x86_function *p );
+int x86_get_label( struct x86_function *p );
 
 void x86_jcc( struct x86_function *p,
 	      enum x86_cc cc,
-	      unsigned char *label );
+	      int label );
 
-unsigned char *x86_jcc_forward( struct x86_function *p,
+int x86_jcc_forward( struct x86_function *p,
 			  enum x86_cc cc );
 
-unsigned char *x86_jmp_forward( struct x86_function *p);
+int x86_jmp_forward( struct x86_function *p);
 
-unsigned char *x86_call_forward( struct x86_function *p);
+int x86_call_forward( struct x86_function *p);
 
 void x86_fixup_fwd_jump( struct x86_function *p,
-			 unsigned char *fixup );
+			 int fixup );
 
-void x86_jmp( struct x86_function *p, unsigned char *label );
+void x86_jmp( struct x86_function *p, int label );
 
 /* void x86_call( struct x86_function *p, void (*label)() ); */
 void x86_call( struct x86_function *p, struct x86_reg reg);
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
index 45453c34ce..07db3292b4 100755
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
@@ -2021,7 +2021,7 @@ static void aos_to_soa( struct x86_function *func, uint aos, uint soa, uint num,
    struct x86_reg aos_input;
    struct x86_reg num_inputs;
    struct x86_reg temp;
-   unsigned char *inner_loop;
+   int inner_loop;
 
    soa_input = x86_make_reg( file_REG32, reg_AX );
    aos_input = get_temp_base(); /* BX or SI */
@@ -2083,7 +2083,7 @@ static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num,
    struct x86_reg aos_output;
    struct x86_reg num_outputs;
    struct x86_reg temp;
-   unsigned char *inner_loop;
+   int inner_loop;
 
    soa_output = x86_make_reg( file_REG32, reg_AX );
    aos_output = get_temp_base(); /* BX or SI */
diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c
index f590d48b78..a54ac5a82f 100644
--- a/src/gallium/auxiliary/translate/translate_sse.c
+++ b/src/gallium/auxiliary/translate/translate_sse.c
@@ -404,7 +404,7 @@ static boolean build_vertex_emit( struct translate_sse *p,
    struct x86_reg srcEAX       = x86_make_reg(file_REG32, reg_CX);
    struct x86_reg countEBP     = x86_make_reg(file_REG32, reg_BP);
    struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
-   uint8_t *fixup, *label;
+   int fixup, label;
    unsigned j;
 
    p->func = func;
-- 
cgit v1.2.3


From 2c89b75e36fd35d5a003107d1d2f97b537321f95 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Thu, 1 May 2008 20:44:41 +0100
Subject: rtasm: learn another version of push

---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index e69251f072..4e036d9032 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -449,8 +449,15 @@ void x86_push( struct x86_function *p,
 	       struct x86_reg reg )
 {
    DUMP_R( reg );
-   assert(reg.mod == mod_REG);
-   emit_1ub(p, 0x50 + reg.idx);
+   if (reg.mod == mod_REG)
+      emit_1ub(p, 0x50 + reg.idx);
+   else 
+   {
+      emit_1ub(p, 0xff);
+      emit_modrm_noreg(p, 6, reg);
+   }
+
+
    p->stack_offset += 4;
 }
 
-- 
cgit v1.2.3


From 9232f0c023af060b12f77dee5e8b6a533c48e146 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Mon, 19 May 2008 16:28:53 +0100
Subject: rtasm: remove unused struct member

---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index eacaeeaf6f..baa10b7d4a 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -44,7 +44,6 @@ struct x86_function {
    unsigned stack_offset;
    int need_emms;
    unsigned char error_overflow[4];
-   const char *fn;
 };
 
 enum x86_reg_file {
-- 
cgit v1.2.3


From d3e64caef6f8654af1a84825803e517ab8221c68 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Wed, 21 May 2008 08:28:16 +0100
Subject: rtasm: export debug reg print function

---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c | 20 +++++++++-----------
 src/gallium/auxiliary/rtasm/rtasm_x86sse.h |  3 +++
 2 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index 4e036d9032..68ac91ed13 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -36,11 +36,8 @@
 
 #define DUMP_SSE  0
 
-#if DUMP_SSE
 
-static void
-_print_reg(
-   struct x86_reg reg )
+void x86_print_reg( struct x86_reg reg )
 {
    if (reg.mod != mod_REG) 
       debug_printf( "[" );
@@ -77,6 +74,7 @@ _print_reg(
       debug_printf( "]" );
 }
 
+#if DUMP_SSE
 
 #define DUMP_START() debug_printf( "\n" )
 #define DUMP_END() debug_printf( "\n" )
@@ -87,7 +85,7 @@ _print_reg(
       foo++;                                    \
    if  (*foo)                                   \
       foo++;                                    \
-   debug_printf( "\n% 15s ", foo );             \
+   debug_printf( "\n% 4x% 15s ", p->csr - p->store, foo );             \
 } while (0)
 
 #define DUMP_I( I ) do {                        \
@@ -97,27 +95,27 @@ _print_reg(
 
 #define DUMP_R( R0 ) do {                       \
    DUMP();                                      \
-   _print_reg( R0 );                            \
+   x86_print_reg( R0 );                            \
 } while( 0 )
 
 #define DUMP_RR( R0, R1 ) do {                  \
    DUMP();                                      \
-   _print_reg( R0 );                            \
+   x86_print_reg( R0 );                            \
    debug_printf( ", " );                        \
-   _print_reg( R1 );                            \
+   x86_print_reg( R1 );                            \
 } while( 0 )
 
 #define DUMP_RI( R0, I ) do {                   \
    DUMP();                                      \
-   _print_reg( R0 );                            \
+   x86_print_reg( R0 );                            \
    debug_printf( ", %u", I );                   \
 } while( 0 )
 
 #define DUMP_RRI( R0, R1, I ) do {              \
    DUMP();                                      \
-   _print_reg( R0 );                            \
+   x86_print_reg( R0 );                            \
    debug_printf( ", " );                        \
-   _print_reg( R1 );                            \
+   x86_print_reg( R1 );                            \
    debug_printf( ", %u", I );                   \
 } while( 0 )
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index baa10b7d4a..1e02c6e73b 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -106,6 +106,9 @@ void x86_init_func_size( struct x86_function *p, unsigned code_size );
 void x86_release_func( struct x86_function *p );
 void (*x86_get_func( struct x86_function *p ))( void );
 
+/* Debugging:
+ */
+void x86_print_reg( struct x86_reg reg );
 
 
 /* Create and manipulate registers and regmem values:
-- 
cgit v1.2.3


From 030af06691bc5bc82ca141a576da7a2edffe9d1c Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Wed, 21 May 2008 20:14:55 +0100
Subject: rtasm: add x87 instructions and debug-check for x87 stack usage

---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c | 120 +++++++++++++++++++++++++++++
 src/gallium/auxiliary/rtasm/rtasm_x86sse.h |  20 ++++-
 2 files changed, 138 insertions(+), 2 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index 68ac91ed13..a2e8af343b 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -986,6 +986,26 @@ void sse2_movd( struct x86_function *p,
 /***********************************************************************
  * x87 instructions
  */
+static void note_x87_pop( struct x86_function *p )
+{
+   p->x87_stack--;
+   assert(p->x87_stack >= 0);
+   debug_printf("\nstack: %d\n", p->x87_stack);
+}
+
+static void note_x87_push( struct x86_function *p )
+{
+   p->x87_stack++;
+   assert(p->x87_stack <= 7);
+   debug_printf("\nstack: %d\n", p->x87_stack);
+}
+
+void x87_assert_stack_empty( struct x86_function *p )
+{
+   assert (p->x87_stack == 0);
+}
+
+
 void x87_fist( struct x86_function *p, struct x86_reg dst )
 {
    DUMP_R( dst );
@@ -998,6 +1018,7 @@ void x87_fistp( struct x86_function *p, struct x86_reg dst )
    DUMP_R( dst );
    emit_1ub(p, 0xdb);
    emit_modrm_noreg(p, 3, dst);
+   note_x87_pop(p);
 }
 
 void x87_fild( struct x86_function *p, struct x86_reg arg )
@@ -1005,12 +1026,14 @@ void x87_fild( struct x86_function *p, struct x86_reg arg )
    DUMP_R( arg );
    emit_1ub(p, 0xdf);
    emit_modrm_noreg(p, 0, arg);
+   note_x87_push(p);
 }
 
 void x87_fldz( struct x86_function *p )
 {
    DUMP();
    emit_2ub(p, 0xd9, 0xee);
+   note_x87_push(p);
 }
 
 
@@ -1027,18 +1050,21 @@ void x87_fld1( struct x86_function *p )
 {
    DUMP();
    emit_2ub(p, 0xd9, 0xe8);
+   note_x87_push(p);
 }
 
 void x87_fldl2e( struct x86_function *p )
 {
    DUMP();
    emit_2ub(p, 0xd9, 0xea);
+   note_x87_push(p);
 }
 
 void x87_fldln2( struct x86_function *p )
 {
    DUMP();
    emit_2ub(p, 0xd9, 0xed);
+   note_x87_push(p);
 }
 
 void x87_fwait( struct x86_function *p )
@@ -1059,6 +1085,49 @@ void x87_fclex( struct x86_function *p )
    x87_fnclex(p);
 }
 
+void x87_fcmovb( struct x86_function *p, struct x86_reg arg )
+{
+   DUMP_R( arg );
+   assert(arg.file == file_x87);      /* source must be an x87 stack register */
+   emit_2ub(p, 0xda, 0xc0+arg.idx);   /* DA C0+i: FCMOVB ST(0), ST(i) -- move if below (CF=1) */
+}
+
+void x87_fcmove( struct x86_function *p, struct x86_reg arg )
+{
+   DUMP_R( arg );
+   assert(arg.file == file_x87);      /* source must be an x87 stack register */
+   emit_2ub(p, 0xda, 0xc8+arg.idx);   /* DA C8+i: FCMOVE ST(0), ST(i) -- move if equal (ZF=1) */
+}
+
+void x87_fcmovbe( struct x86_function *p, struct x86_reg arg )
+{
+   DUMP_R( arg );
+   assert(arg.file == file_x87);      /* source must be an x87 stack register */
+   emit_2ub(p, 0xda, 0xd0+arg.idx);   /* DA D0+i: FCMOVBE ST(0), ST(i) -- move if below or equal (CF=1 or ZF=1) */
+}
+
+void x87_fcmovnb( struct x86_function *p, struct x86_reg arg )
+{
+   DUMP_R( arg );
+   assert(arg.file == file_x87);      /* source must be an x87 stack register */
+   emit_2ub(p, 0xdb, 0xc0+arg.idx);   /* DB C0+i: FCMOVNB ST(0), ST(i) -- move if not below (CF=0) */
+}
+
+void x87_fcmovne( struct x86_function *p, struct x86_reg arg )
+{
+   DUMP_R( arg );
+   assert(arg.file == file_x87);      /* source must be an x87 stack register */
+   emit_2ub(p, 0xdb, 0xc8+arg.idx);   /* DB C8+i: FCMOVNE ST(0), ST(i) -- move if not equal (ZF=0) */
+}
+
+void x87_fcmovnbe( struct x86_function *p, struct x86_reg arg )
+{
+   DUMP_R( arg );
+   assert(arg.file == file_x87);      /* source must be an x87 stack register */
+   emit_2ub(p, 0xdb, 0xd0+arg.idx);   /* DB D0+i: FCMOVNBE ST(0), ST(i) -- move if above (CF=0 and ZF=0) */
+}
+
+
 
 static void x87_arith_op( struct x86_function *p, struct x86_reg dst, struct x86_reg arg,
 			  unsigned char dst0ub0,
@@ -1146,6 +1215,7 @@ void x87_fmulp( struct x86_function *p, struct x86_reg dst )
    assert(dst.file == file_x87);
    assert(dst.idx >= 1);
    emit_2ub(p, 0xde, 0xc8+dst.idx);
+   note_x87_pop(p);
 }
 
 void x87_fsubp( struct x86_function *p, struct x86_reg dst )
@@ -1154,6 +1224,7 @@ void x87_fsubp( struct x86_function *p, struct x86_reg dst )
    assert(dst.file == file_x87);
    assert(dst.idx >= 1);
    emit_2ub(p, 0xde, 0xe8+dst.idx);
+   note_x87_pop(p);
 }
 
 void x87_fsubrp( struct x86_function *p, struct x86_reg dst )
@@ -1162,6 +1233,7 @@ void x87_fsubrp( struct x86_function *p, struct x86_reg dst )
    assert(dst.file == file_x87);
    assert(dst.idx >= 1);
    emit_2ub(p, 0xde, 0xe0+dst.idx);
+   note_x87_pop(p);
 }
 
 void x87_faddp( struct x86_function *p, struct x86_reg dst )
@@ -1170,6 +1242,7 @@ void x87_faddp( struct x86_function *p, struct x86_reg dst )
    assert(dst.file == file_x87);
    assert(dst.idx >= 1);
    emit_2ub(p, 0xde, 0xc0+dst.idx);
+   note_x87_pop(p);
 }
 
 void x87_fdivp( struct x86_function *p, struct x86_reg dst )
@@ -1178,6 +1251,7 @@ void x87_fdivp( struct x86_function *p, struct x86_reg dst )
    assert(dst.file == file_x87);
    assert(dst.idx >= 1);
    emit_2ub(p, 0xde, 0xf8+dst.idx);
+   note_x87_pop(p);
 }
 
 void x87_fdivrp( struct x86_function *p, struct x86_reg dst )
@@ -1186,6 +1260,13 @@ void x87_fdivrp( struct x86_function *p, struct x86_reg dst )
    assert(dst.file == file_x87);
    assert(dst.idx >= 1);
    emit_2ub(p, 0xde, 0xf0+dst.idx);
+   note_x87_pop(p);
+}
+
+void x87_ftst( struct x86_function *p )
+{
+   DUMP();
+   emit_2ub(p, 0xd9, 0xe4);
 }
 
 void x87_fucom( struct x86_function *p, struct x86_reg arg )
@@ -1200,12 +1281,15 @@ void x87_fucomp( struct x86_function *p, struct x86_reg arg )
    DUMP_R( arg );
    assert(arg.file == file_x87);
    emit_2ub(p, 0xdd, 0xe8+arg.idx);
+   note_x87_pop(p);
 }
 
 void x87_fucompp( struct x86_function *p )
 {
    DUMP();
    emit_2ub(p, 0xda, 0xe9);
+   note_x87_pop(p);             /* pop twice */
+   note_x87_pop(p);             /* pop twice */
 }
 
 void x87_fxch( struct x86_function *p, struct x86_reg arg )
@@ -1287,6 +1371,7 @@ void x87_fyl2x( struct x86_function *p )
 {
    DUMP();
    emit_2ub(p, 0xd9, 0xf1);
+   note_x87_pop(p);
 }
 
 /* st1 = st1 * log2(st0 + 1.0);
@@ -1298,6 +1383,7 @@ void x87_fyl2xp1( struct x86_function *p )
 {
    DUMP();
    emit_2ub(p, 0xd9, 0xf9);
+   note_x87_pop(p);
 }
 
 
@@ -1310,6 +1396,7 @@ void x87_fld( struct x86_function *p, struct x86_reg arg )
       emit_1ub(p, 0xd9);
       emit_modrm_noreg(p, 0, arg);
    }
+   note_x87_push(p);
 }
 
 void x87_fst( struct x86_function *p, struct x86_reg dst )
@@ -1332,8 +1419,15 @@ void x87_fstp( struct x86_function *p, struct x86_reg dst )
       emit_1ub(p, 0xd9);
       emit_modrm_noreg(p, 3, dst);
    }
+   note_x87_pop(p);
+}
+
+void x87_fpop( struct x86_function *p )
+{
+   x87_fstp( p, x86_make_reg( file_x87, 0 ));
 }
 
+
 void x87_fcom( struct x86_function *p, struct x86_reg dst )
 {
    DUMP_R( dst );
@@ -1345,6 +1439,7 @@ void x87_fcom( struct x86_function *p, struct x86_reg dst )
    }
 }
 
+
 void x87_fcomp( struct x86_function *p, struct x86_reg dst )
 {
    DUMP_R( dst );
@@ -1354,6 +1449,20 @@ void x87_fcomp( struct x86_function *p, struct x86_reg dst )
       emit_1ub(p, 0xd8);
       emit_modrm_noreg(p, 3, dst);
    }
+   note_x87_pop(p);
+}
+
+void x87_fcomi( struct x86_function *p, struct x86_reg arg )
+{
+   DUMP_R( arg );
+   emit_2ub(p, 0xdb, 0xf0+arg.idx);   /* DB F0+i: FCOMI ST(0), ST(i) -- result goes to EFLAGS; nothing is popped */
+}
+
+void x87_fcomip( struct x86_function *p, struct x86_reg arg )
+{
+   DUMP_R( arg );
+   emit_2ub(p, 0xdf, 0xf0+arg.idx);   /* DF F0+i: FCOMIP ST(0), ST(i); DB F0+i would be the non-popping FCOMI */
+   note_x87_pop(p);                   /* the instruction pops ST(0), so the tracked depth drops by one */
 }
 
 
@@ -1372,6 +1481,17 @@ void x87_fnstsw( struct x86_function *p, struct x86_reg dst )
 }
 
 
+void x87_fnstcw( struct x86_function *p, struct x86_reg dst )
+{
+   DUMP_R( dst );
+   assert(dst.file == file_REG32);   /* NOTE(review): the instruction only takes an m16 operand, so dst presumably must be a memory reference -- confirm */
+
+   emit_1ub(p, 0x9b);           /* WAIT prefix: with it the sequence is FSTCW (9B D9 /7); bare FNSTCW is just D9 /7 */
+   emit_1ub(p, 0xd9);
+   emit_modrm_noreg(p, 7, dst); /* /7 selects the control-word store */
+}
+
+
 
 
 /***********************************************************************
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index 1e02c6e73b..9f7e31e055 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -41,8 +41,11 @@ struct x86_function {
    unsigned size;
    unsigned char *store;
    unsigned char *csr;
-   unsigned stack_offset;
-   int need_emms;
+
+   unsigned stack_offset:16;
+   unsigned need_emms:8;
+   int x87_stack:8;
+
    unsigned char error_overflow[4];
 };
 
@@ -229,13 +232,23 @@ void x86_test( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_xor( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_sahf( struct x86_function *p );
 
+void x87_assert_stack_empty( struct x86_function *p );
+
 void x87_f2xm1( struct x86_function *p );
 void x87_fabs( struct x86_function *p );
 void x87_fadd( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
 void x87_faddp( struct x86_function *p, struct x86_reg dst );
 void x87_fchs( struct x86_function *p );
 void x87_fclex( struct x86_function *p );
+void x87_fcmovb( struct x86_function *p, struct x86_reg src );
+void x87_fcmovbe( struct x86_function *p, struct x86_reg src );
+void x87_fcmove( struct x86_function *p, struct x86_reg src );
+void x87_fcmovnb( struct x86_function *p, struct x86_reg src );
+void x87_fcmovnbe( struct x86_function *p, struct x86_reg src );
+void x87_fcmovne( struct x86_function *p, struct x86_reg src );
 void x87_fcom( struct x86_function *p, struct x86_reg dst );
+void x87_fcomi( struct x86_function *p, struct x86_reg dst );
+void x87_fcomip( struct x86_function *p, struct x86_reg dst );
 void x87_fcomp( struct x86_function *p, struct x86_reg dst );
 void x87_fcos( struct x86_function *p );
 void x87_fdiv( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
@@ -255,6 +268,7 @@ void x87_fmul( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
 void x87_fmulp( struct x86_function *p, struct x86_reg dst );
 void x87_fnclex( struct x86_function *p );
 void x87_fprndint( struct x86_function *p );
+void x87_fpop( struct x86_function *p );
 void x87_fscale( struct x86_function *p );
 void x87_fsin( struct x86_function *p );
 void x87_fsincos( struct x86_function *p );
@@ -265,11 +279,13 @@ void x87_fsub( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
 void x87_fsubp( struct x86_function *p, struct x86_reg dst );
 void x87_fsubr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
 void x87_fsubrp( struct x86_function *p, struct x86_reg dst );
+void x87_ftst( struct x86_function *p );
 void x87_fxch( struct x86_function *p, struct x86_reg dst );
 void x87_fxtract( struct x86_function *p );
 void x87_fyl2x( struct x86_function *p );
 void x87_fyl2xp1( struct x86_function *p );
 void x87_fwait( struct x86_function *p );
+void x87_fnstcw( struct x86_function *p, struct x86_reg dst );
 void x87_fnstsw( struct x86_function *p, struct x86_reg dst );
 void x87_fucompp( struct x86_function *p );
 void x87_fucomp( struct x86_function *p, struct x86_reg arg );
-- 
cgit v1.2.3


From 6f407b072453eb2bb7077a952257a099db4da025 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Wed, 21 May 2008 20:50:36 +0100
Subject: rtasm: remove debug

---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index a2e8af343b..d78676b8f3 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -990,14 +990,12 @@ static void note_x87_pop( struct x86_function *p )
 {
    p->x87_stack--;
    assert(p->x87_stack >= 0);
-   debug_printf("\nstack: %d\n", p->x87_stack);
 }
 
 static void note_x87_push( struct x86_function *p )
 {
    p->x87_stack++;
    assert(p->x87_stack <= 7);
-   debug_printf("\nstack: %d\n", p->x87_stack);
 }
 
 void x87_assert_stack_empty( struct x86_function *p )
-- 
cgit v1.2.3


From 6b3723ee8d084a1abbc971b21c58f7c1e66949a7 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Sat, 24 May 2008 13:22:15 +0100
Subject: rtasm: add some helpers for calling out from generated code

---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c | 44 ++++++++++++++++++++++++++++--
 src/gallium/auxiliary/rtasm/rtasm_x86sse.h |  6 ++++
 2 files changed, 47 insertions(+), 3 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index d78676b8f3..2415b0156b 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -218,6 +218,8 @@ static void emit_3ub( struct x86_function *p, unsigned char b0, unsigned char b1
 
 /* Build a modRM byte + possible displacement.  No treatment of SIB
  * indexing.  BZZT - no way to encode an absolute address.
+ *
+ * This is the "/r" field in the x86 manuals...
  */
 static void emit_modrm( struct x86_function *p, 
 			struct x86_reg reg, 
@@ -256,7 +258,8 @@ static void emit_modrm( struct x86_function *p,
    }
 }
 
-
+/* Emits the "/0".."/7" specialized versions of the modrm ("/r") bytes.
+ */
 static void emit_modrm_noreg( struct x86_function *p,
 			      unsigned op,
 			      struct x86_reg regmem )
@@ -365,8 +368,7 @@ void x86_jcc( struct x86_function *p,
    DUMP_I(cc);
    
    if (offset < 0) {
-      int amt = p->csr - p->store;
-      assert(amt > -offset);
+      assert(p->csr - p->store > -offset);
    }
 
    if (offset <= 127 && offset >= -128) {
@@ -443,6 +445,16 @@ void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm )
    emit_1i(p, imm);
 }
 
+void x86_add_reg_imm8( struct x86_function *p, struct x86_reg dst, ubyte imm )
+{
+   DUMP_RI( dst, imm );
+   assert(dst.mod == mod_REG);
+   emit_1ub(p, 0x83);           /* 83 /0 ib: ADD r/m32, imm8; 0x80 is ADD r/m8 and would hit an 8-bit sub-register */
+   emit_modrm_noreg(p, 0, dst); /* /0 selects ADD within the immediate group */
+   emit_1ub(p, imm);            /* the CPU sign-extends imm8 to 32 bits, so imm >= 0x80 adds a negative value */
+}
+
+
 void x86_push( struct x86_function *p,
 	       struct x86_reg reg )
 {
@@ -459,6 +471,17 @@ void x86_push( struct x86_function *p,
    p->stack_offset += 4;
 }
 
+void x86_push_imm32( struct x86_function *p,
+                     int imm32 )
+{
+   DUMP_I( imm32 );
+   emit_1ub(p, 0x68);           /* 68 id: PUSH imm32 */
+   emit_1i(p,  imm32);
+
+   p->stack_offset += 4;        /* account for the 4 bytes just pushed, same bookkeeping as x86_push() */
+}
+
+
 void x86_pop( struct x86_function *p,
 	      struct x86_reg reg )
 {
@@ -1558,6 +1581,21 @@ void mmx_movq( struct x86_function *p,
  */
 
 
+void x86_cdecl_caller_push_regs( struct x86_function *p )
+{
+   x86_push(p, x86_make_reg(file_REG32, reg_AX));   /* EAX, ECX, EDX: the registers a cdecl callee may clobber */
+   x86_push(p, x86_make_reg(file_REG32, reg_CX));
+   x86_push(p, x86_make_reg(file_REG32, reg_DX));   /* pushed last, so x86_cdecl_caller_pop_regs() pops it first */
+}
+
+void x86_cdecl_caller_pop_regs( struct x86_function *p )
+{
+   x86_pop(p, x86_make_reg(file_REG32, reg_DX));    /* reverse of the push order in x86_cdecl_caller_push_regs() */
+   x86_pop(p, x86_make_reg(file_REG32, reg_CX));
+   x86_pop(p, x86_make_reg(file_REG32, reg_AX));    /* note: this overwrites any return value left in EAX */
+}
+
+
 /* Retreive a reference to one of the function arguments, taking into
  * account any push/pop activity:
  */
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index 9f7e31e055..63e812fac9 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -155,6 +155,7 @@ void x86_call( struct x86_function *p, struct x86_reg reg);
  * I load the immediate into general purpose register and use it.
  */
 void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_add_reg_imm8( struct x86_function *p, struct x86_reg dst, ubyte imm );
 
 
 /* Macro for sse_shufps() and sse2_pshufd():
@@ -225,6 +226,7 @@ void x86_imul( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_or( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_pop( struct x86_function *p, struct x86_reg reg );
 void x86_push( struct x86_function *p, struct x86_reg reg );
+void x86_push_imm32( struct x86_function *p, int imm );
 void x86_ret( struct x86_function *p );
 void x86_retw( struct x86_function *p, unsigned short imm );
 void x86_sub( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
@@ -232,6 +234,10 @@ void x86_test( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_xor( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_sahf( struct x86_function *p );
 
+
+void x86_cdecl_caller_push_regs( struct x86_function *p );
+void x86_cdecl_caller_pop_regs( struct x86_function *p );
+
 void x87_assert_stack_empty( struct x86_function *p );
 
 void x87_f2xm1( struct x86_function *p );
-- 
cgit v1.2.3


From 55d29a8d48663982a1aeea414f69a5896b97d1ea Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Wed, 28 May 2008 16:12:14 +0900
Subject: gallium: Windows CE portability fixes.

---
 src/gallium/auxiliary/draw/draw_pt_elts.c          |   8 +-
 src/gallium/auxiliary/draw/draw_pt_varray.c        |   4 +-
 src/gallium/auxiliary/draw/draw_vs_sse.c           |   4 +-
 src/gallium/auxiliary/rtasm/rtasm_cpu.c            |   4 +-
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c         |   4 +-
 src/gallium/auxiliary/rtasm/rtasm_x86sse.h         |   4 +-
 src/gallium/auxiliary/tgsi/util/tgsi_util.c        |   2 +-
 src/gallium/auxiliary/translate/translate.c        |   3 +-
 .../auxiliary/translate/translate_generic.c        | 236 ++++++++++-----------
 src/gallium/auxiliary/translate/translate_sse.c    |   5 +-
 src/gallium/auxiliary/util/u_time.h                |   2 +-
 src/gallium/include/pipe/p_compiler.h              |  54 +++--
 src/gallium/include/pipe/p_config.h                |   8 +-
 src/gallium/include/pipe/p_debug.h                 |  11 +-
 14 files changed, 192 insertions(+), 157 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/draw/draw_pt_elts.c b/src/gallium/auxiliary/draw/draw_pt_elts.c
index 2094c081ed..b7780fb507 100644
--- a/src/gallium/auxiliary/draw/draw_pt_elts.c
+++ b/src/gallium/auxiliary/draw/draw_pt_elts.c
@@ -60,10 +60,10 @@ static unsigned elt_vert( const void *elts, unsigned idx )
 pt_elt_func draw_pt_elt_func( struct draw_context *draw )
 {
    switch (draw->pt.user.eltSize) {
-   case 0: return elt_vert;
-   case 1: return elt_ubyte;
-   case 2: return elt_ushort; 
-   case 4: return elt_uint;
+   case 0: return &elt_vert;
+   case 1: return &elt_ubyte;
+   case 2: return &elt_ushort; 
+   case 4: return &elt_uint;
    default: return NULL;
    }
 }     
diff --git a/src/gallium/auxiliary/draw/draw_pt_varray.c b/src/gallium/auxiliary/draw/draw_pt_varray.c
index 355093f945..c7c66b34d4 100644
--- a/src/gallium/auxiliary/draw/draw_pt_varray.c
+++ b/src/gallium/auxiliary/draw/draw_pt_varray.c
@@ -147,8 +147,8 @@ static INLINE void varray_ef_quad( struct varray_frontend *varray,
                                    unsigned i2,
                                    unsigned i3 )
 {
-   const unsigned omitEdge1 = DRAW_PIPE_EDGE_FLAG_0 | DRAW_PIPE_EDGE_FLAG_2;
-   const unsigned omitEdge2 = DRAW_PIPE_EDGE_FLAG_0 | DRAW_PIPE_EDGE_FLAG_1;
+   const ushort omitEdge1 = DRAW_PIPE_EDGE_FLAG_0 | DRAW_PIPE_EDGE_FLAG_2;
+   const ushort omitEdge2 = DRAW_PIPE_EDGE_FLAG_0 | DRAW_PIPE_EDGE_FLAG_1;
 
    varray_triangle_flags( varray,
                           DRAW_PIPE_RESET_STIPPLE | omitEdge1,
diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c
index e3f4e67472..c88bc137ee 100644
--- a/src/gallium/auxiliary/draw/draw_vs_sse.c
+++ b/src/gallium/auxiliary/draw/draw_vs_sse.c
@@ -31,9 +31,11 @@
   *   Brian Paul
   */
 
+#include "pipe/p_config.h"
+
 #include "draw_vs.h"
 
-#if defined(__i386__) || defined(__386__)
+#if defined(PIPE_ARCH_X86)
 
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
diff --git a/src/gallium/auxiliary/rtasm/rtasm_cpu.c b/src/gallium/auxiliary/rtasm/rtasm_cpu.c
index f01e12faa0..5499018b21 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_cpu.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_cpu.c
@@ -47,7 +47,7 @@ static boolean rtasm_sse_enabled(void)
 int rtasm_cpu_has_sse(void)
 {
    /* FIXME: actually detect this at run-time */
-#if defined(__i386__) || defined(__386__) || defined(i386)
+#if defined(PIPE_ARCH_X86)
    return rtasm_sse_enabled();
 #else
    return 0;
@@ -57,7 +57,7 @@ int rtasm_cpu_has_sse(void)
 int rtasm_cpu_has_sse2(void) 
 {
    /* FIXME: actually detect this at run-time */
-#if defined(__i386__) || defined(__386__) || defined(i386)
+#if defined(PIPE_ARCH_X86)
    return rtasm_sse_enabled();
 #else
    return 0;
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index 4e036d9032..6cd88ebca3 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -21,7 +21,9 @@
  *
  **************************************************************************/
 
-#if defined(__i386__) || defined(__386__) || defined(i386)
+#include "pipe/p_config.h"
+
+#if defined(PIPE_ARCH_X86)
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_debug.h"
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index eacaeeaf6f..a5afa16395 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -24,7 +24,9 @@
 #ifndef _RTASM_X86SSE_H_
 #define _RTASM_X86SSE_H_
 
-#if defined(__i386__) || defined(__386__) || defined(i386)
+#include "pipe/p_config.h"
+
+#if defined(PIPE_ARCH_X86)
 
 /* It is up to the caller to ensure that instructions issued are
  * suitable for the host cpu.  There are no checks made in this module
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_util.c b/src/gallium/auxiliary/tgsi/util/tgsi_util.c
index 4cdd89182a..56a50d3b21 100644
--- a/src/gallium/auxiliary/tgsi/util/tgsi_util.c
+++ b/src/gallium/auxiliary/tgsi/util/tgsi_util.c
@@ -8,7 +8,7 @@
 union pointer_hack
 {
    void *pointer;
-   unsigned long long uint64;
+   uint64_t uint64;
 };
 
 void *
diff --git a/src/gallium/auxiliary/translate/translate.c b/src/gallium/auxiliary/translate/translate.c
index b04bc6eefd..b93fbf9033 100644
--- a/src/gallium/auxiliary/translate/translate.c
+++ b/src/gallium/auxiliary/translate/translate.c
@@ -30,6 +30,7 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
+#include "pipe/p_config.h"
 #include "pipe/p_util.h"
 #include "pipe/p_state.h"
 #include "translate.h"
@@ -38,7 +39,7 @@ struct translate *translate_create( const struct translate_key *key )
 {
    struct translate *translate = NULL;
 
-#if defined(__i386__) || defined(__386__) || defined(i386)
+#if defined(PIPE_ARCH_X86)
    translate = translate_sse2_create( key );
    if (translate)
       return translate;
diff --git a/src/gallium/auxiliary/translate/translate_generic.c b/src/gallium/auxiliary/translate/translate_generic.c
index 402780ee53..8f3b470333 100644
--- a/src/gallium/auxiliary/translate/translate_generic.c
+++ b/src/gallium/auxiliary/translate/translate_generic.c
@@ -255,140 +255,140 @@ static fetch_func get_fetch_func( enum pipe_format format )
 {
    switch (format) {
    case PIPE_FORMAT_R64_FLOAT:
-      return fetch_R64_FLOAT;
+      return &fetch_R64_FLOAT;
    case PIPE_FORMAT_R64G64_FLOAT:
-      return fetch_R64G64_FLOAT;
+      return &fetch_R64G64_FLOAT;
    case PIPE_FORMAT_R64G64B64_FLOAT:
-      return fetch_R64G64B64_FLOAT;
+      return &fetch_R64G64B64_FLOAT;
    case PIPE_FORMAT_R64G64B64A64_FLOAT:
-      return fetch_R64G64B64A64_FLOAT;
+      return &fetch_R64G64B64A64_FLOAT;
 
    case PIPE_FORMAT_R32_FLOAT:
-      return fetch_R32_FLOAT;
+      return &fetch_R32_FLOAT;
    case PIPE_FORMAT_R32G32_FLOAT:
-      return fetch_R32G32_FLOAT;
+      return &fetch_R32G32_FLOAT;
    case PIPE_FORMAT_R32G32B32_FLOAT:
-      return fetch_R32G32B32_FLOAT;
+      return &fetch_R32G32B32_FLOAT;
    case PIPE_FORMAT_R32G32B32A32_FLOAT:
-      return fetch_R32G32B32A32_FLOAT;
+      return &fetch_R32G32B32A32_FLOAT;
 
    case PIPE_FORMAT_R32_UNORM:
-      return fetch_R32_UNORM;
+      return &fetch_R32_UNORM;
    case PIPE_FORMAT_R32G32_UNORM:
-      return fetch_R32G32_UNORM;
+      return &fetch_R32G32_UNORM;
    case PIPE_FORMAT_R32G32B32_UNORM:
-      return fetch_R32G32B32_UNORM;
+      return &fetch_R32G32B32_UNORM;
    case PIPE_FORMAT_R32G32B32A32_UNORM:
-      return fetch_R32G32B32A32_UNORM;
+      return &fetch_R32G32B32A32_UNORM;
 
    case PIPE_FORMAT_R32_USCALED:
-      return fetch_R32_USCALED;
+      return &fetch_R32_USCALED;
    case PIPE_FORMAT_R32G32_USCALED:
-      return fetch_R32G32_USCALED;
+      return &fetch_R32G32_USCALED;
    case PIPE_FORMAT_R32G32B32_USCALED:
-      return fetch_R32G32B32_USCALED;
+      return &fetch_R32G32B32_USCALED;
    case PIPE_FORMAT_R32G32B32A32_USCALED:
-      return fetch_R32G32B32A32_USCALED;
+      return &fetch_R32G32B32A32_USCALED;
 
    case PIPE_FORMAT_R32_SNORM:
-      return fetch_R32_SNORM;
+      return &fetch_R32_SNORM;
    case PIPE_FORMAT_R32G32_SNORM:
-      return fetch_R32G32_SNORM;
+      return &fetch_R32G32_SNORM;
    case PIPE_FORMAT_R32G32B32_SNORM:
-      return fetch_R32G32B32_SNORM;
+      return &fetch_R32G32B32_SNORM;
    case PIPE_FORMAT_R32G32B32A32_SNORM:
-      return fetch_R32G32B32A32_SNORM;
+      return &fetch_R32G32B32A32_SNORM;
 
    case PIPE_FORMAT_R32_SSCALED:
-      return fetch_R32_SSCALED;
+      return &fetch_R32_SSCALED;
    case PIPE_FORMAT_R32G32_SSCALED:
-      return fetch_R32G32_SSCALED;
+      return &fetch_R32G32_SSCALED;
    case PIPE_FORMAT_R32G32B32_SSCALED:
-      return fetch_R32G32B32_SSCALED;
+      return &fetch_R32G32B32_SSCALED;
    case PIPE_FORMAT_R32G32B32A32_SSCALED:
-      return fetch_R32G32B32A32_SSCALED;
+      return &fetch_R32G32B32A32_SSCALED;
 
    case PIPE_FORMAT_R16_UNORM:
-      return fetch_R16_UNORM;
+      return &fetch_R16_UNORM;
    case PIPE_FORMAT_R16G16_UNORM:
-      return fetch_R16G16_UNORM;
+      return &fetch_R16G16_UNORM;
    case PIPE_FORMAT_R16G16B16_UNORM:
-      return fetch_R16G16B16_UNORM;
+      return &fetch_R16G16B16_UNORM;
    case PIPE_FORMAT_R16G16B16A16_UNORM:
-      return fetch_R16G16B16A16_UNORM;
+      return &fetch_R16G16B16A16_UNORM;
 
    case PIPE_FORMAT_R16_USCALED:
-      return fetch_R16_USCALED;
+      return &fetch_R16_USCALED;
    case PIPE_FORMAT_R16G16_USCALED:
-      return fetch_R16G16_USCALED;
+      return &fetch_R16G16_USCALED;
    case PIPE_FORMAT_R16G16B16_USCALED:
-      return fetch_R16G16B16_USCALED;
+      return &fetch_R16G16B16_USCALED;
    case PIPE_FORMAT_R16G16B16A16_USCALED:
-      return fetch_R16G16B16A16_USCALED;
+      return &fetch_R16G16B16A16_USCALED;
 
    case PIPE_FORMAT_R16_SNORM:
-      return fetch_R16_SNORM;
+      return &fetch_R16_SNORM;
    case PIPE_FORMAT_R16G16_SNORM:
-      return fetch_R16G16_SNORM;
+      return &fetch_R16G16_SNORM;
    case PIPE_FORMAT_R16G16B16_SNORM:
-      return fetch_R16G16B16_SNORM;
+      return &fetch_R16G16B16_SNORM;
    case PIPE_FORMAT_R16G16B16A16_SNORM:
-      return fetch_R16G16B16A16_SNORM;
+      return &fetch_R16G16B16A16_SNORM;
 
    case PIPE_FORMAT_R16_SSCALED:
-      return fetch_R16_SSCALED;
+      return &fetch_R16_SSCALED;
    case PIPE_FORMAT_R16G16_SSCALED:
-      return fetch_R16G16_SSCALED;
+      return &fetch_R16G16_SSCALED;
    case PIPE_FORMAT_R16G16B16_SSCALED:
-      return fetch_R16G16B16_SSCALED;
+      return &fetch_R16G16B16_SSCALED;
    case PIPE_FORMAT_R16G16B16A16_SSCALED:
-      return fetch_R16G16B16A16_SSCALED;
+      return &fetch_R16G16B16A16_SSCALED;
 
    case PIPE_FORMAT_R8_UNORM:
-      return fetch_R8_UNORM;
+      return &fetch_R8_UNORM;
    case PIPE_FORMAT_R8G8_UNORM:
-      return fetch_R8G8_UNORM;
+      return &fetch_R8G8_UNORM;
    case PIPE_FORMAT_R8G8B8_UNORM:
-      return fetch_R8G8B8_UNORM;
+      return &fetch_R8G8B8_UNORM;
    case PIPE_FORMAT_R8G8B8A8_UNORM:
-      return fetch_R8G8B8A8_UNORM;
+      return &fetch_R8G8B8A8_UNORM;
 
    case PIPE_FORMAT_R8_USCALED:
-      return fetch_R8_USCALED;
+      return &fetch_R8_USCALED;
    case PIPE_FORMAT_R8G8_USCALED:
-      return fetch_R8G8_USCALED;
+      return &fetch_R8G8_USCALED;
    case PIPE_FORMAT_R8G8B8_USCALED:
-      return fetch_R8G8B8_USCALED;
+      return &fetch_R8G8B8_USCALED;
    case PIPE_FORMAT_R8G8B8A8_USCALED:
-      return fetch_R8G8B8A8_USCALED;
+      return &fetch_R8G8B8A8_USCALED;
 
    case PIPE_FORMAT_R8_SNORM:
-      return fetch_R8_SNORM;
+      return &fetch_R8_SNORM;
    case PIPE_FORMAT_R8G8_SNORM:
-      return fetch_R8G8_SNORM;
+      return &fetch_R8G8_SNORM;
    case PIPE_FORMAT_R8G8B8_SNORM:
-      return fetch_R8G8B8_SNORM;
+      return &fetch_R8G8B8_SNORM;
    case PIPE_FORMAT_R8G8B8A8_SNORM:
-      return fetch_R8G8B8A8_SNORM;
+      return &fetch_R8G8B8A8_SNORM;
 
    case PIPE_FORMAT_R8_SSCALED:
-      return fetch_R8_SSCALED;
+      return &fetch_R8_SSCALED;
    case PIPE_FORMAT_R8G8_SSCALED:
-      return fetch_R8G8_SSCALED;
+      return &fetch_R8G8_SSCALED;
    case PIPE_FORMAT_R8G8B8_SSCALED:
-      return fetch_R8G8B8_SSCALED;
+      return &fetch_R8G8B8_SSCALED;
    case PIPE_FORMAT_R8G8B8A8_SSCALED:
-      return fetch_R8G8B8A8_SSCALED;
+      return &fetch_R8G8B8A8_SSCALED;
 
    case PIPE_FORMAT_A8R8G8B8_UNORM:
-      return fetch_A8R8G8B8_UNORM;
+      return &fetch_A8R8G8B8_UNORM;
 
    case PIPE_FORMAT_B8G8R8A8_UNORM:
-      return fetch_B8G8R8A8_UNORM;
+      return &fetch_B8G8R8A8_UNORM;
 
    default:
       assert(0); 
-      return fetch_NULL;
+      return &fetch_NULL;
    }
 }
 
@@ -399,140 +399,140 @@ static emit_func get_emit_func( enum pipe_format format )
 {
    switch (format) {
    case PIPE_FORMAT_R64_FLOAT:
-      return emit_R64_FLOAT;
+      return &emit_R64_FLOAT;
    case PIPE_FORMAT_R64G64_FLOAT:
-      return emit_R64G64_FLOAT;
+      return &emit_R64G64_FLOAT;
    case PIPE_FORMAT_R64G64B64_FLOAT:
-      return emit_R64G64B64_FLOAT;
+      return &emit_R64G64B64_FLOAT;
    case PIPE_FORMAT_R64G64B64A64_FLOAT:
-      return emit_R64G64B64A64_FLOAT;
+      return &emit_R64G64B64A64_FLOAT;
 
    case PIPE_FORMAT_R32_FLOAT:
-      return emit_R32_FLOAT;
+      return &emit_R32_FLOAT;
    case PIPE_FORMAT_R32G32_FLOAT:
-      return emit_R32G32_FLOAT;
+      return &emit_R32G32_FLOAT;
    case PIPE_FORMAT_R32G32B32_FLOAT:
-      return emit_R32G32B32_FLOAT;
+      return &emit_R32G32B32_FLOAT;
    case PIPE_FORMAT_R32G32B32A32_FLOAT:
-      return emit_R32G32B32A32_FLOAT;
+      return &emit_R32G32B32A32_FLOAT;
 
    case PIPE_FORMAT_R32_UNORM:
-      return emit_R32_UNORM;
+      return &emit_R32_UNORM;
    case PIPE_FORMAT_R32G32_UNORM:
-      return emit_R32G32_UNORM;
+      return &emit_R32G32_UNORM;
    case PIPE_FORMAT_R32G32B32_UNORM:
-      return emit_R32G32B32_UNORM;
+      return &emit_R32G32B32_UNORM;
    case PIPE_FORMAT_R32G32B32A32_UNORM:
-      return emit_R32G32B32A32_UNORM;
+      return &emit_R32G32B32A32_UNORM;
 
    case PIPE_FORMAT_R32_USCALED:
-      return emit_R32_USCALED;
+      return &emit_R32_USCALED;
    case PIPE_FORMAT_R32G32_USCALED:
-      return emit_R32G32_USCALED;
+      return &emit_R32G32_USCALED;
    case PIPE_FORMAT_R32G32B32_USCALED:
-      return emit_R32G32B32_USCALED;
+      return &emit_R32G32B32_USCALED;
    case PIPE_FORMAT_R32G32B32A32_USCALED:
-      return emit_R32G32B32A32_USCALED;
+      return &emit_R32G32B32A32_USCALED;
 
    case PIPE_FORMAT_R32_SNORM:
-      return emit_R32_SNORM;
+      return &emit_R32_SNORM;
    case PIPE_FORMAT_R32G32_SNORM:
-      return emit_R32G32_SNORM;
+      return &emit_R32G32_SNORM;
    case PIPE_FORMAT_R32G32B32_SNORM:
-      return emit_R32G32B32_SNORM;
+      return &emit_R32G32B32_SNORM;
    case PIPE_FORMAT_R32G32B32A32_SNORM:
-      return emit_R32G32B32A32_SNORM;
+      return &emit_R32G32B32A32_SNORM;
 
    case PIPE_FORMAT_R32_SSCALED:
-      return emit_R32_SSCALED;
+      return &emit_R32_SSCALED;
    case PIPE_FORMAT_R32G32_SSCALED:
-      return emit_R32G32_SSCALED;
+      return &emit_R32G32_SSCALED;
    case PIPE_FORMAT_R32G32B32_SSCALED:
-      return emit_R32G32B32_SSCALED;
+      return &emit_R32G32B32_SSCALED;
    case PIPE_FORMAT_R32G32B32A32_SSCALED:
-      return emit_R32G32B32A32_SSCALED;
+      return &emit_R32G32B32A32_SSCALED;
 
    case PIPE_FORMAT_R16_UNORM:
-      return emit_R16_UNORM;
+      return &emit_R16_UNORM;
    case PIPE_FORMAT_R16G16_UNORM:
-      return emit_R16G16_UNORM;
+      return &emit_R16G16_UNORM;
    case PIPE_FORMAT_R16G16B16_UNORM:
-      return emit_R16G16B16_UNORM;
+      return &emit_R16G16B16_UNORM;
    case PIPE_FORMAT_R16G16B16A16_UNORM:
-      return emit_R16G16B16A16_UNORM;
+      return &emit_R16G16B16A16_UNORM;
 
    case PIPE_FORMAT_R16_USCALED:
-      return emit_R16_USCALED;
+      return &emit_R16_USCALED;
    case PIPE_FORMAT_R16G16_USCALED:
-      return emit_R16G16_USCALED;
+      return &emit_R16G16_USCALED;
    case PIPE_FORMAT_R16G16B16_USCALED:
-      return emit_R16G16B16_USCALED;
+      return &emit_R16G16B16_USCALED;
    case PIPE_FORMAT_R16G16B16A16_USCALED:
-      return emit_R16G16B16A16_USCALED;
+      return &emit_R16G16B16A16_USCALED;
 
    case PIPE_FORMAT_R16_SNORM:
-      return emit_R16_SNORM;
+      return &emit_R16_SNORM;
    case PIPE_FORMAT_R16G16_SNORM:
-      return emit_R16G16_SNORM;
+      return &emit_R16G16_SNORM;
    case PIPE_FORMAT_R16G16B16_SNORM:
-      return emit_R16G16B16_SNORM;
+      return &emit_R16G16B16_SNORM;
    case PIPE_FORMAT_R16G16B16A16_SNORM:
-      return emit_R16G16B16A16_SNORM;
+      return &emit_R16G16B16A16_SNORM;
 
    case PIPE_FORMAT_R16_SSCALED:
-      return emit_R16_SSCALED;
+      return &emit_R16_SSCALED;
    case PIPE_FORMAT_R16G16_SSCALED:
-      return emit_R16G16_SSCALED;
+      return &emit_R16G16_SSCALED;
    case PIPE_FORMAT_R16G16B16_SSCALED:
-      return emit_R16G16B16_SSCALED;
+      return &emit_R16G16B16_SSCALED;
    case PIPE_FORMAT_R16G16B16A16_SSCALED:
-      return emit_R16G16B16A16_SSCALED;
+      return &emit_R16G16B16A16_SSCALED;
 
    case PIPE_FORMAT_R8_UNORM:
-      return emit_R8_UNORM;
+      return &emit_R8_UNORM;
    case PIPE_FORMAT_R8G8_UNORM:
-      return emit_R8G8_UNORM;
+      return &emit_R8G8_UNORM;
    case PIPE_FORMAT_R8G8B8_UNORM:
-      return emit_R8G8B8_UNORM;
+      return &emit_R8G8B8_UNORM;
    case PIPE_FORMAT_R8G8B8A8_UNORM:
-      return emit_R8G8B8A8_UNORM;
+      return &emit_R8G8B8A8_UNORM;
 
    case PIPE_FORMAT_R8_USCALED:
-      return emit_R8_USCALED;
+      return &emit_R8_USCALED;
    case PIPE_FORMAT_R8G8_USCALED:
-      return emit_R8G8_USCALED;
+      return &emit_R8G8_USCALED;
    case PIPE_FORMAT_R8G8B8_USCALED:
-      return emit_R8G8B8_USCALED;
+      return &emit_R8G8B8_USCALED;
    case PIPE_FORMAT_R8G8B8A8_USCALED:
-      return emit_R8G8B8A8_USCALED;
+      return &emit_R8G8B8A8_USCALED;
 
    case PIPE_FORMAT_R8_SNORM:
-      return emit_R8_SNORM;
+      return &emit_R8_SNORM;
    case PIPE_FORMAT_R8G8_SNORM:
-      return emit_R8G8_SNORM;
+      return &emit_R8G8_SNORM;
    case PIPE_FORMAT_R8G8B8_SNORM:
-      return emit_R8G8B8_SNORM;
+      return &emit_R8G8B8_SNORM;
    case PIPE_FORMAT_R8G8B8A8_SNORM:
-      return emit_R8G8B8A8_SNORM;
+      return &emit_R8G8B8A8_SNORM;
 
    case PIPE_FORMAT_R8_SSCALED:
-      return emit_R8_SSCALED;
+      return &emit_R8_SSCALED;
    case PIPE_FORMAT_R8G8_SSCALED:
-      return emit_R8G8_SSCALED;
+      return &emit_R8G8_SSCALED;
    case PIPE_FORMAT_R8G8B8_SSCALED:
-      return emit_R8G8B8_SSCALED;
+      return &emit_R8G8B8_SSCALED;
    case PIPE_FORMAT_R8G8B8A8_SSCALED:
-      return emit_R8G8B8A8_SSCALED;
+      return &emit_R8G8B8A8_SSCALED;
 
    case PIPE_FORMAT_A8R8G8B8_UNORM:
-      return emit_A8R8G8B8_UNORM;
+      return &emit_A8R8G8B8_UNORM;
 
    case PIPE_FORMAT_B8G8R8A8_UNORM:
-      return emit_B8G8R8A8_UNORM;
+      return &emit_B8G8R8A8_UNORM;
 
    default:
       assert(0); 
-      return emit_NULL;
+      return &emit_NULL;
    }
 }
 
diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c
index a54ac5a82f..634b05b8a9 100644
--- a/src/gallium/auxiliary/translate/translate_sse.c
+++ b/src/gallium/auxiliary/translate/translate_sse.c
@@ -26,6 +26,7 @@
  */
 
 
+#include "pipe/p_config.h"
 #include "pipe/p_compiler.h"
 #include "pipe/p_util.h"
 #include "util/u_simple_list.h"
@@ -33,7 +34,7 @@
 #include "translate.h"
 
 
-#if defined(__i386__) || defined(__386__) || defined(i386)
+#if defined(PIPE_ARCH_X86)
 
 #include "rtasm/rtasm_cpu.h"
 #include "rtasm/rtasm_x86sse.h"
@@ -617,7 +618,7 @@ struct translate *translate_sse2_create( const struct translate_key *key )
 
 #else
 
-void translate_create_sse( const struct translate_key *key )
+struct translate *translate_sse2_create( const struct translate_key *key )
 {
    return NULL;
 }
diff --git a/src/gallium/auxiliary/util/u_time.h b/src/gallium/auxiliary/util/u_time.h
index 48ec7a4a96..f9963ce0e2 100644
--- a/src/gallium/auxiliary/util/u_time.h
+++ b/src/gallium/auxiliary/util/u_time.h
@@ -61,7 +61,7 @@ struct util_time
 #if defined(PIPE_OS_LINUX)
    struct timeval tv;
 #else
-   long long counter;
+   int64_t counter;
 #endif
 };
    
diff --git a/src/gallium/include/pipe/p_compiler.h b/src/gallium/include/pipe/p_compiler.h
index a4b772bc4f..96b21d998d 100644
--- a/src/gallium/include/pipe/p_compiler.h
+++ b/src/gallium/include/pipe/p_compiler.h
@@ -52,39 +52,55 @@
 #endif /* __MSC__ */
 
 
-typedef unsigned int       uint;
-typedef unsigned char      ubyte;
-typedef unsigned char      boolean;
-typedef unsigned short     ushort;
-typedef unsigned long long uint64;
-
-
 #if defined(__MSC__)
 
-typedef char               int8_t;
-typedef unsigned char      uint8_t;
-typedef short              int16_t;
-typedef unsigned short     uint16_t;
-typedef long               int32_t;
-typedef unsigned long      uint32_t;
-typedef long long          int64_t;
-typedef unsigned long long uint64_t;
+typedef __int8             int8_t;
+typedef unsigned __int8    uint8_t;
+typedef __int16            int16_t;
+typedef unsigned __int16   uint16_t;
+typedef __int32            int32_t;
+typedef unsigned __int32   uint32_t;
+typedef __int64            int64_t;
+typedef unsigned __int64   uint64_t;
 
 #if defined(_WIN64)
 typedef __int64            intptr_t;
 typedef unsigned __int64   uintptr_t;
 #else
-typedef int                intptr_t;
-typedef unsigned int       uintptr_t;
+typedef __int32            intptr_t;
+typedef unsigned __int32   uintptr_t;
 #endif
 
+#ifndef __cplusplus
+#define false   0
+#define true    1
+#define bool    _Bool
+typedef int     _Bool;
+#define __bool_true_false_are_defined   1
+#endif /* !__cplusplus */
+
 #else
 #include <stdint.h>
+#include <stdbool.h>
 #endif
 
 
-#define TRUE  1
-#define FALSE 0
+typedef unsigned int       uint;
+typedef unsigned char      ubyte;
+typedef unsigned short     ushort;
+typedef uint64_t           uint64;
+
+#if 0
+#define boolean bool
+#else
+typedef unsigned char boolean;
+#endif
+#ifndef TRUE
+#define TRUE  true
+#endif
+#ifndef FALSE
+#define FALSE false
+#endif
 
 
 /* Function inlining */
diff --git a/src/gallium/include/pipe/p_config.h b/src/gallium/include/pipe/p_config.h
index 6ba211a1fc..d2d2ae1617 100644
--- a/src/gallium/include/pipe/p_config.h
+++ b/src/gallium/include/pipe/p_config.h
@@ -35,6 +35,10 @@
  * this file is auto-generated by an autoconf-like tool at some point, as some 
  * things cannot be determined by existing defines alone. 
  * 
+ * See also:
+ * - http://gcc.gnu.org/onlinedocs/cpp/Common-Predefined-Macros.html
+ * - echo | gcc -dM -E - | sort
+ * - http://msdn.microsoft.com/en-us/library/b0084kay.aspx
  * @author José Fonseca <jrfonseca@tungstengraphics.com>
  */
 
@@ -63,11 +67,11 @@
  * Processor architecture
  */
 
-#if defined(_X86_) || defined(__i386__) || defined(__386__) || defined(i386)
+#if defined(__i386__) /* gcc */ || defined(_M_IX86) /* msvc */ || defined(_X86_) || defined(__386__) || defined(i386)
 #define PIPE_ARCH_X86
 #endif
 
-#if 0 /* FIXME */
+#if defined(__x86_64__) /* gcc */ || defined(_M_X64) /* msvc */ || defined(_M_AMD64) /* msvc */
 #define PIPE_ARCH_X86_64
 #endif
 
diff --git a/src/gallium/include/pipe/p_debug.h b/src/gallium/include/pipe/p_debug.h
index 0af635be57..05eca75201 100644
--- a/src/gallium/include/pipe/p_debug.h
+++ b/src/gallium/include/pipe/p_debug.h
@@ -59,6 +59,13 @@ extern "C" {
 #endif
 #endif
 
+   
+/* MSVC before VC7 does not have the __FUNCTION__ macro */
+#if defined(_MSC_VER) && _MSC_VER < 1300
+#define __FUNCTION__ "???"
+#endif
+
+
 void _debug_vprintf(const char *format, va_list ap);
    
 
@@ -127,8 +134,8 @@ void _debug_break(void);
 #ifdef DEBUG
 #if (defined(__i386__) || defined(__386__)) && defined(__GNUC__)
 #define debug_break() __asm("int3")
-#elif (defined(__i386__) || defined(__386__)) && defined(__MSC__)
-#define debug_break()  _asm {int 3}
+#elif defined(_M_IX86) && defined(_MSC_VER)
+#define debug_break()  do { _asm {int 3} } while(0)
 #else
 #define debug_break() _debug_break()
 #endif
-- 
cgit v1.2.3


From 648da5158e5f418bf859aee6aa4532b6899b0d94 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Wed, 28 May 2008 16:36:45 +0100
Subject: rtasm: special case for [ebp]

---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index 2415b0156b..672d2ff554 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -328,7 +328,7 @@ struct x86_reg x86_make_disp( struct x86_reg reg,
    else
       reg.disp += disp;
 
-   if (reg.disp == 0)
+   if (reg.disp == 0 && reg.idx != reg_BP)
       reg.mod = mod_INDIRECT;
    else if (reg.disp <= 127 && reg.disp >= -128)
       reg.mod = mod_DISP8;
-- 
cgit v1.2.3


From aa1a39d1a742c1bb346ba14814d6bf7b44e646cb Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Mon, 2 Jun 2008 20:46:05 +0900
Subject: rtasm: Use enum sse_cc in sse_cmpps.

---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c | 2 +-
 src/gallium/auxiliary/rtasm/rtasm_x86sse.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index 664a69a537..f4ca282dd9 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -888,7 +888,7 @@ void sse_unpcklps( struct x86_function *p, struct x86_reg dst, struct x86_reg sr
 void sse_cmpps( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src,
-		unsigned char cc) 
+		enum sse_cc cc) 
 {
    DUMP_RRI( dst, src, cc );
    emit_2ub(p, X86_TWOB, 0xC2);
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index bd76e1729c..af94577aab 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -191,7 +191,7 @@ void sse_divss( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
 void sse_andnps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_andps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_cmpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src,
-                unsigned char cc );
+                enum sse_cc cc );
 void sse_maxps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_maxss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_minps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-- 
cgit v1.2.3


From 4f25420bdd834e81a3e22733304efc5261c2998a Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Sun, 24 Aug 2008 17:48:55 -0600
Subject: gallium: refactor/replace p_util.h with util/u_memory.h and
 util/u_math.h

Also, rename p_tile.[ch] to u_tile.[ch]
---
 src/gallium/README.portability                     |    4 +-
 src/gallium/auxiliary/cso_cache/cso_cache.c        |    3 +-
 src/gallium/auxiliary/cso_cache/cso_context.c      |    2 +-
 src/gallium/auxiliary/cso_cache/cso_hash.c         |    2 +-
 src/gallium/auxiliary/draw/draw_context.c          |    3 +-
 src/gallium/auxiliary/draw/draw_pipe.c             |    1 -
 src/gallium/auxiliary/draw/draw_pipe_aaline.c      |    3 +-
 src/gallium/auxiliary/draw/draw_pipe_aapoint.c     |    4 +-
 src/gallium/auxiliary/draw/draw_pipe_clip.c        |    4 +-
 src/gallium/auxiliary/draw/draw_pipe_cull.c        |    2 +-
 src/gallium/auxiliary/draw/draw_pipe_flatshade.c   |    4 +-
 src/gallium/auxiliary/draw/draw_pipe_offset.c      |    3 +-
 src/gallium/auxiliary/draw/draw_pipe_pstipple.c    |    4 +-
 src/gallium/auxiliary/draw/draw_pipe_stipple.c     |    6 +-
 src/gallium/auxiliary/draw/draw_pipe_twoside.c     |    3 +-
 src/gallium/auxiliary/draw/draw_pipe_unfilled.c    |    2 +-
 src/gallium/auxiliary/draw/draw_pipe_util.c        |    2 +-
 src/gallium/auxiliary/draw/draw_pipe_validate.c    |    2 +-
 src/gallium/auxiliary/draw/draw_pipe_vbuf.c        |    3 +-
 src/gallium/auxiliary/draw/draw_pipe_wide_line.c   |    3 +-
 src/gallium/auxiliary/draw/draw_pipe_wide_point.c  |    3 +-
 src/gallium/auxiliary/draw/draw_pt.c               |    1 -
 src/gallium/auxiliary/draw/draw_pt_emit.c          |    2 +-
 src/gallium/auxiliary/draw/draw_pt_fetch.c         |    2 +-
 src/gallium/auxiliary/draw/draw_pt_fetch_emit.c    |    2 +-
 .../auxiliary/draw/draw_pt_fetch_shade_emit.c      |    3 +-
 .../auxiliary/draw/draw_pt_fetch_shade_pipeline.c  |    3 +-
 src/gallium/auxiliary/draw/draw_pt_post_vs.c       |    2 +-
 src/gallium/auxiliary/draw/draw_pt_util.c          |    1 -
 src/gallium/auxiliary/draw/draw_pt_varray.c        |    4 +-
 src/gallium/auxiliary/draw/draw_pt_vcache.c        |    2 +-
 src/gallium/auxiliary/draw/draw_vbuf.h             |    2 -
 src/gallium/auxiliary/draw/draw_vs.c               |    6 +-
 src/gallium/auxiliary/draw/draw_vs_aos.c           |    4 +-
 src/gallium/auxiliary/draw/draw_vs_aos_io.c        |    2 +-
 src/gallium/auxiliary/draw/draw_vs_aos_machine.c   |    3 +-
 src/gallium/auxiliary/draw/draw_vs_exec.c          |    3 +-
 src/gallium/auxiliary/draw/draw_vs_llvm.c          |    1 -
 src/gallium/auxiliary/draw/draw_vs_sse.c           |    3 +-
 src/gallium/auxiliary/draw/draw_vs_varient.c       |    3 +-
 src/gallium/auxiliary/gallivm/gallivm_cpu.cpp      |    3 +-
 src/gallium/auxiliary/gallivm/instructions.cpp     |    2 +-
 src/gallium/auxiliary/gallivm/instructionssoa.cpp  |    2 +-
 .../auxiliary/pipebuffer/pb_buffer_fenced.c        |    2 +-
 .../auxiliary/pipebuffer/pb_buffer_malloc.c        |    2 +-
 src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c   |    2 +-
 src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c |    2 +-
 src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c |    2 +-
 .../auxiliary/pipebuffer/pb_bufmgr_fenced.c        |    2 +-
 src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c    |    2 +-
 src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c  |    2 +-
 src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c  |    2 +-
 src/gallium/auxiliary/pipebuffer/pb_validate.c     |    2 +-
 src/gallium/auxiliary/pipebuffer/pb_winsys.c       |    2 +-
 src/gallium/auxiliary/rtasm/rtasm_execmem.c        |    2 +-
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c        |    2 +-
 src/gallium/auxiliary/sct/sct.c                    |    2 +-
 src/gallium/auxiliary/tgsi/tgsi_build.c            |    1 -
 src/gallium/auxiliary/tgsi/tgsi_build.h            |    4 +
 src/gallium/auxiliary/tgsi/tgsi_dump_c.c           |    1 -
 src/gallium/auxiliary/tgsi/tgsi_exec.c             |    2 +-
 src/gallium/auxiliary/tgsi/tgsi_parse.c            |    2 +-
 src/gallium/auxiliary/tgsi/tgsi_scan.c             |    6 +-
 src/gallium/auxiliary/tgsi/tgsi_sse2.c             |    2 +-
 src/gallium/auxiliary/tgsi/tgsi_transform.c        |    1 +
 src/gallium/auxiliary/tgsi/tgsi_transform.h        |    1 -
 src/gallium/auxiliary/tgsi/tgsi_util.c             |    1 -
 src/gallium/auxiliary/translate/translate.c        |    1 -
 src/gallium/auxiliary/translate/translate_cache.c  |    2 +-
 .../auxiliary/translate/translate_generic.c        |    2 +-
 src/gallium/auxiliary/translate/translate_sse.c    |    2 +-
 src/gallium/auxiliary/util/Makefile                |    2 +-
 src/gallium/auxiliary/util/SConscript              |    2 +-
 src/gallium/auxiliary/util/p_debug.c               |    1 -
 src/gallium/auxiliary/util/u_blit.c                |    5 +-
 src/gallium/auxiliary/util/u_gen_mipmap.c          |    2 +-
 src/gallium/auxiliary/util/u_handle_table.c        |    4 +-
 src/gallium/auxiliary/util/u_hash_table.c          |    5 +-
 src/gallium/auxiliary/util/u_math.h                |  240 +++-
 src/gallium/auxiliary/util/u_memory.h              |  222 ++++
 src/gallium/auxiliary/util/u_mm.c                  |    2 +-
 src/gallium/auxiliary/util/u_pack_color.h          |   36 +-
 src/gallium/auxiliary/util/u_pointer.h             |  107 ++
 src/gallium/auxiliary/util/u_rect.c                |    1 -
 src/gallium/auxiliary/util/u_simple_shaders.c      |    2 +-
 src/gallium/auxiliary/util/u_tile.c                | 1169 ++++++++++++++++++++
 src/gallium/auxiliary/util/u_tile.h                |  101 ++
 src/gallium/drivers/cell/common.h                  |    1 -
 src/gallium/drivers/cell/ppu/cell_clear.c          |    2 +-
 src/gallium/drivers/cell/ppu/cell_context.c        |    2 +-
 src/gallium/drivers/cell/ppu/cell_pipe_state.c     |    2 +-
 src/gallium/drivers/cell/ppu/cell_render.c         |    2 +-
 src/gallium/drivers/cell/ppu/cell_screen.c         |    2 +-
 src/gallium/drivers/cell/ppu/cell_state_derived.c  |    2 +-
 src/gallium/drivers/cell/ppu/cell_state_emit.c     |    2 +-
 src/gallium/drivers/cell/ppu/cell_state_shader.c   |    2 +-
 src/gallium/drivers/cell/ppu/cell_surface.c        |    2 +-
 src/gallium/drivers/cell/ppu/cell_texture.c        |    2 +-
 src/gallium/drivers/cell/ppu/cell_winsys.c         |    2 +-
 src/gallium/drivers/cell/spu/spu_exec.c            |    1 -
 src/gallium/drivers/cell/spu/spu_tri.c             |    1 -
 src/gallium/drivers/cell/spu/spu_util.c            |    1 -
 src/gallium/drivers/cell/spu/spu_vertex_fetch.c    |    1 -
 src/gallium/drivers/cell/spu/spu_vertex_shader.c   |    1 -
 src/gallium/drivers/failover/fo_context.c          |    2 +-
 src/gallium/drivers/i915simple/i915_context.c      |    2 +-
 src/gallium/drivers/i915simple/i915_debug_fp.c     |    2 +-
 src/gallium/drivers/i915simple/i915_fpc.h          |    1 -
 .../drivers/i915simple/i915_fpc_translate.c        |    2 +
 src/gallium/drivers/i915simple/i915_prim_emit.c    |    4 +-
 src/gallium/drivers/i915simple/i915_prim_vbuf.c    |    3 +-
 src/gallium/drivers/i915simple/i915_screen.c       |    2 +-
 src/gallium/drivers/i915simple/i915_state.c        |    3 +-
 .../drivers/i915simple/i915_state_derived.c        |    2 +-
 .../drivers/i915simple/i915_state_dynamic.c        |    4 +-
 .../drivers/i915simple/i915_state_immediate.c      |    2 +-
 .../drivers/i915simple/i915_state_sampler.c        |    2 +-
 src/gallium/drivers/i915simple/i915_surface.c      |    3 +-
 src/gallium/drivers/i915simple/i915_texture.c      |    3 +-
 src/gallium/drivers/i965simple/brw_cc.c            |    6 +-
 src/gallium/drivers/i965simple/brw_clip_state.c    |    3 +-
 src/gallium/drivers/i965simple/brw_context.c       |    2 +-
 src/gallium/drivers/i965simple/brw_curbe.c         |    3 +-
 src/gallium/drivers/i965simple/brw_draw_upload.c   |    1 +
 src/gallium/drivers/i965simple/brw_gs_state.c      |    3 +-
 src/gallium/drivers/i965simple/brw_screen.c        |    2 +-
 src/gallium/drivers/i965simple/brw_sf_state.c      |    5 +-
 src/gallium/drivers/i965simple/brw_shader_info.c   |    2 +-
 src/gallium/drivers/i965simple/brw_state.c         |    2 +-
 src/gallium/drivers/i965simple/brw_state_batch.c   |    2 +-
 src/gallium/drivers/i965simple/brw_state_cache.c   |    2 +-
 src/gallium/drivers/i965simple/brw_state_pool.c    |    3 +-
 src/gallium/drivers/i965simple/brw_state_upload.c  |    2 +-
 src/gallium/drivers/i965simple/brw_surface.c       |    3 +-
 src/gallium/drivers/i965simple/brw_tex_layout.c    |    8 +-
 src/gallium/drivers/i965simple/brw_vs_state.c      |    3 +-
 src/gallium/drivers/i965simple/brw_wm.c            |    2 +-
 src/gallium/drivers/i965simple/brw_wm_decl.c       |    3 +-
 src/gallium/drivers/i965simple/brw_wm_glsl.c       |    3 +-
 .../drivers/i965simple/brw_wm_sampler_state.c      |    3 +-
 src/gallium/drivers/i965simple/brw_wm_state.c      |    3 +-
 src/gallium/drivers/softpipe/sp_context.c          |    2 +-
 src/gallium/drivers/softpipe/sp_fs_exec.c          |    2 +-
 src/gallium/drivers/softpipe/sp_fs_llvm.c          |    2 +-
 src/gallium/drivers/softpipe/sp_fs_sse.c           |    2 +-
 src/gallium/drivers/softpipe/sp_prim_setup.c       |    2 +-
 src/gallium/drivers/softpipe/sp_prim_vbuf.c        |    1 +
 src/gallium/drivers/softpipe/sp_quad_alpha_test.c  |    2 +-
 src/gallium/drivers/softpipe/sp_quad_blend.c       |   29 +-
 src/gallium/drivers/softpipe/sp_quad_bufloop.c     |    2 +-
 src/gallium/drivers/softpipe/sp_quad_colormask.c   |    3 +-
 src/gallium/drivers/softpipe/sp_quad_coverage.c    |    2 +-
 src/gallium/drivers/softpipe/sp_quad_depth_test.c  |    2 +-
 src/gallium/drivers/softpipe/sp_quad_earlyz.c      |    2 +-
 src/gallium/drivers/softpipe/sp_quad_fs.c          |    3 +-
 src/gallium/drivers/softpipe/sp_quad_occlusion.c   |    2 +-
 src/gallium/drivers/softpipe/sp_quad_output.c      |    2 +-
 src/gallium/drivers/softpipe/sp_quad_stencil.c     |    2 +-
 src/gallium/drivers/softpipe/sp_quad_stipple.c     |    2 +-
 src/gallium/drivers/softpipe/sp_query.c            |    2 +-
 src/gallium/drivers/softpipe/sp_screen.c           |    2 +-
 src/gallium/drivers/softpipe/sp_setup.c            |    2 +-
 src/gallium/drivers/softpipe/sp_state_blend.c      |    2 +-
 src/gallium/drivers/softpipe/sp_state_derived.c    |    3 +-
 src/gallium/drivers/softpipe/sp_state_fs.c         |    2 +-
 src/gallium/drivers/softpipe/sp_state_rasterizer.c |    2 +-
 src/gallium/drivers/softpipe/sp_state_sampler.c    |    2 +-
 src/gallium/drivers/softpipe/sp_surface.c          |    3 +-
 src/gallium/drivers/softpipe/sp_tex_sample.c       |    2 +-
 src/gallium/drivers/softpipe/sp_texture.c          |    3 +-
 src/gallium/drivers/softpipe/sp_tile_cache.c       |    4 +-
 src/gallium/drivers/trace/tr_context.c             |    2 +-
 src/gallium/drivers/trace/tr_dump.c                |    2 +
 src/gallium/drivers/trace/tr_dump.h                |    1 -
 src/gallium/drivers/trace/tr_screen.c              |    2 +-
 src/gallium/drivers/trace/tr_state.c               |    1 +
 src/gallium/drivers/trace/tr_stream_stdc.c         |    2 +-
 src/gallium/drivers/trace/tr_stream_wd.c           |    2 +-
 src/gallium/drivers/trace/tr_texture.c             |    2 +-
 src/gallium/drivers/trace/tr_winsys.c              |    3 +-
 src/gallium/include/pipe/p_util.h                  |  460 --------
 src/gallium/state_trackers/python/gallium.i        |    2 +-
 src/gallium/state_trackers/python/st_device.c      |    3 +-
 src/gallium/state_trackers/python/st_sample.c      |    5 +-
 .../state_trackers/python/st_softpipe_winsys.c     |    3 +-
 .../winsys/drm/intel/common/intel_be_device.c      |    2 +-
 .../winsys/drm/intel/dri/intel_winsys_softpipe.c   |    2 +-
 src/gallium/winsys/egl_xlib/egl_xlib.c             |    2 +-
 src/gallium/winsys/egl_xlib/sw_winsys.c            |    3 +-
 src/gallium/winsys/gdi/wmesa.c                     |    2 +-
 src/gallium/winsys/xlib/brw_aub.c                  |    1 -
 src/gallium/winsys/xlib/xm_winsys.c                |    3 +-
 src/gallium/winsys/xlib/xm_winsys_aub.c            |    2 +-
 src/mesa/state_tracker/acc2.c                      |  319 ++++++
 src/mesa/state_tracker/st_cb_accum.c               |    2 +-
 src/mesa/state_tracker/st_cb_bitmap.c              |    2 +-
 src/mesa/state_tracker/st_cb_drawpixels.c          |    2 +-
 src/mesa/state_tracker/st_cb_readpixels.c          |    2 +-
 src/mesa/state_tracker/st_cb_texture.c             |    2 +-
 src/mesa/state_tracker/st_program.c                |    2 +-
 src/mesa/state_tracker/st_texture.c                |    1 -
 201 files changed, 2453 insertions(+), 686 deletions(-)
 create mode 100644 src/gallium/auxiliary/util/u_memory.h
 create mode 100644 src/gallium/auxiliary/util/u_pointer.h
 create mode 100644 src/gallium/auxiliary/util/u_tile.c
 create mode 100644 src/gallium/auxiliary/util/u_tile.h
 delete mode 100644 src/gallium/include/pipe/p_util.h
 create mode 100644 src/mesa/state_tracker/acc2.c

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/README.portability b/src/gallium/README.portability
index d5d5987a7f..adecf4bb79 100644
--- a/src/gallium/README.portability
+++ b/src/gallium/README.portability
@@ -35,8 +35,8 @@ not available in Windows Kernel Mode. Use the appropriate p_*.h include.
 
 * Use MALLOC, CALLOC, FREE instead of the malloc, calloc, free functions.
 
-* Use align_pointer() function defined in p_util.h for aligning pointers in a
-portable way.
+* Use align_pointer() function defined in u_memory.h for aligning pointers
+ in a portable way.
 
 == Debugging ==
 
diff --git a/src/gallium/auxiliary/cso_cache/cso_cache.c b/src/gallium/auxiliary/cso_cache/cso_cache.c
index 36dc46ff80..6b1754ea00 100644
--- a/src/gallium/auxiliary/cso_cache/cso_cache.c
+++ b/src/gallium/auxiliary/cso_cache/cso_cache.c
@@ -28,9 +28,10 @@
 /* Authors:  Zack Rusin <zack@tungstengraphics.com>
  */
 
-#include "pipe/p_util.h"
 #include "pipe/p_debug.h"
 
+#include "util/u_memory.h"
+
 #include "cso_cache.h"
 #include "cso_hash.h"
 
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c
index 86e4d46a20..f22ba40824 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.c
+++ b/src/gallium/auxiliary/cso_cache/cso_context.c
@@ -36,7 +36,7 @@
   */
 
 #include "pipe/p_state.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_inlines.h"
 #include "tgsi/tgsi_parse.h"
 
diff --git a/src/gallium/auxiliary/cso_cache/cso_hash.c b/src/gallium/auxiliary/cso_cache/cso_hash.c
index 0646efd952..7f0044c5a7 100644
--- a/src/gallium/auxiliary/cso_cache/cso_hash.c
+++ b/src/gallium/auxiliary/cso_cache/cso_hash.c
@@ -31,7 +31,7 @@
   */
 
 #include "pipe/p_debug.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 #include "cso_hash.h"
 
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index 2f263cf06a..1c26cb31a3 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -31,7 +31,8 @@
   */
 
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
 #include "draw_context.h"
 #include "draw_vbuf.h"
 #include "draw_vs.h"
diff --git a/src/gallium/auxiliary/draw/draw_pipe.c b/src/gallium/auxiliary/draw/draw_pipe.c
index 1db43876ef..3cde9d36d3 100644
--- a/src/gallium/auxiliary/draw/draw_pipe.c
+++ b/src/gallium/auxiliary/draw/draw_pipe.c
@@ -30,7 +30,6 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "pipe/p_util.h"
 #include "draw/draw_private.h"
 #include "draw/draw_pipe.h"
 
diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
index 991304b2c8..20841bb5d6 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
@@ -32,11 +32,12 @@
  */
 
 
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 
 #include "tgsi/tgsi_transform.h"
 #include "tgsi/tgsi_dump.h"
diff --git a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
index c7f4349cb3..2c1cacbdb4 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
@@ -38,7 +38,6 @@
  */
 
 
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
@@ -47,6 +46,9 @@
 #include "tgsi/tgsi_transform.h"
 #include "tgsi/tgsi_dump.h"
 
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
 #include "draw_context.h"
 #include "draw_vs.h"
 #include "draw_pipe.h"
diff --git a/src/gallium/auxiliary/draw/draw_pipe_clip.c b/src/gallium/auxiliary/draw/draw_pipe_clip.c
index fa10f8efca..3265dcd154 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_clip.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c
@@ -32,7 +32,9 @@
  */
 
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+
 #include "pipe/p_shader_tokens.h"
 
 #include "draw_vs.h"
diff --git a/src/gallium/auxiliary/draw/draw_pipe_cull.c b/src/gallium/auxiliary/draw/draw_pipe_cull.c
index d0d22a38e0..053be5f050 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_cull.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_cull.c
@@ -33,7 +33,7 @@
  */
 
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_defines.h"
 #include "draw_pipe.h"
 
diff --git a/src/gallium/auxiliary/draw/draw_pipe_flatshade.c b/src/gallium/auxiliary/draw/draw_pipe_flatshade.c
index 4741b22d02..43d1fecc4d 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_flatshade.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_flatshade.c
@@ -28,7 +28,9 @@
 /* Authors:  Keith Whitwell <keith@tungstengraphics.com>
  */
 
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
 #include "pipe/p_shader_tokens.h"
 #include "draw_vs.h"
 #include "draw_pipe.h"
diff --git a/src/gallium/auxiliary/draw/draw_pipe_offset.c b/src/gallium/auxiliary/draw/draw_pipe_offset.c
index 2f5865741c..1fea5e6dcb 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_offset.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_offset.c
@@ -32,7 +32,8 @@
  * \author  Brian Paul
  */
 
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "draw_pipe.h"
 
 
diff --git a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
index e97136fa1f..b764d9c518 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
@@ -34,12 +34,14 @@
  */
 
 
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
 
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
 #include "tgsi/tgsi_transform.h"
 #include "tgsi/tgsi_dump.h"
 
diff --git a/src/gallium/auxiliary/draw/draw_pipe_stipple.c b/src/gallium/auxiliary/draw/draw_pipe_stipple.c
index bf0db18a68..b65e2aa102 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_stipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_stipple.c
@@ -36,10 +36,12 @@
  */
 
 
-#include "pipe/p_util.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
-#include "draw_pipe.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "draw/draw_pipe.h"
 
 
 /** Subclass of draw_stage */
diff --git a/src/gallium/auxiliary/draw/draw_pipe_twoside.c b/src/gallium/auxiliary/draw/draw_pipe_twoside.c
index 3ac825f565..c329d92339 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_twoside.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_twoside.c
@@ -28,7 +28,8 @@
 /* Authors:  Keith Whitwell <keith@tungstengraphics.com>
  */
 
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
 #include "draw_vs.h"
diff --git a/src/gallium/auxiliary/draw/draw_pipe_unfilled.c b/src/gallium/auxiliary/draw/draw_pipe_unfilled.c
index 8f97fdedaa..68835fd1a5 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_unfilled.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_unfilled.c
@@ -33,7 +33,7 @@
 /* Authors:  Keith Whitwell <keith@tungstengraphics.com>
  */
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_defines.h"
 #include "draw_private.h"
 #include "draw_pipe.h"
diff --git a/src/gallium/auxiliary/draw/draw_pipe_util.c b/src/gallium/auxiliary/draw/draw_pipe_util.c
index 04438f4dd0..e22e5fed0c 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_util.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_util.c
@@ -30,7 +30,7 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "draw/draw_private.h"
 #include "draw/draw_pipe.h"
 
diff --git a/src/gallium/auxiliary/draw/draw_pipe_validate.c b/src/gallium/auxiliary/draw/draw_pipe_validate.c
index 6be1d369c3..f34c68728e 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_validate.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_validate.c
@@ -28,7 +28,7 @@
 /* Authors:  Keith Whitwell <keith@tungstengraphics.com>
  */
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_defines.h"
 #include "draw_private.h"
 #include "draw_pipe.h"
diff --git a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
index a6fde77a0e..c0cf4269db 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
@@ -35,7 +35,8 @@
 
 
 #include "pipe/p_debug.h"
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 
 #include "draw_vbuf.h"
 #include "draw_private.h"
diff --git a/src/gallium/auxiliary/draw/draw_pipe_wide_line.c b/src/gallium/auxiliary/draw/draw_pipe_wide_line.c
index 48ec2f1239..184e363594 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_wide_line.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_wide_line.c
@@ -28,9 +28,10 @@
 /* Authors:  Keith Whitwell <keith@tungstengraphics.com>
  */
 
-#include "pipe/p_util.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "draw_private.h"
 #include "draw_pipe.h"
 
diff --git a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
index 54590984c6..4f1326053d 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
@@ -28,7 +28,8 @@
 /* Authors:  Keith Whitwell <keith@tungstengraphics.com>
  */
 
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
 #include "draw_vs.h"
diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c
index 85a75525c8..669c11c993 100644
--- a/src/gallium/auxiliary/draw/draw_pt.c
+++ b/src/gallium/auxiliary/draw/draw_pt.c
@@ -30,7 +30,6 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "pipe/p_util.h"
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
 #include "draw/draw_pt.h"
diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c b/src/gallium/auxiliary/draw/draw_pt_emit.c
index 40f05cb9e0..d4eca80588 100644
--- a/src/gallium/auxiliary/draw/draw_pt_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_emit.c
@@ -25,7 +25,7 @@
  *
  **************************************************************************/
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
 #include "draw/draw_vbuf.h"
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch.c b/src/gallium/auxiliary/draw/draw_pt_fetch.c
index 07f4c99164..6377f896fb 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch.c
@@ -25,7 +25,7 @@
  *
  **************************************************************************/
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
 #include "draw/draw_vbuf.h"
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
index 4a1f3b0953..0684c93d10 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
@@ -30,7 +30,7 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
 #include "draw/draw_vbuf.h"
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
index fdf9b6fe6a..87094f3092 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
@@ -31,7 +31,8 @@
   */
 
 
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
 #include "draw/draw_vbuf.h"
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
index be3535ed9e..f617aac9f7 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
@@ -25,7 +25,8 @@
  *
  **************************************************************************/
 
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "draw/draw_context.h"
 #include "draw/draw_vbuf.h"
 #include "draw/draw_vertex.h"
diff --git a/src/gallium/auxiliary/draw/draw_pt_post_vs.c b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
index af6306b1c6..96dc706b99 100644
--- a/src/gallium/auxiliary/draw/draw_pt_post_vs.c
+++ b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
@@ -25,7 +25,7 @@
  *
  **************************************************************************/
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_context.h"
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
diff --git a/src/gallium/auxiliary/draw/draw_pt_util.c b/src/gallium/auxiliary/draw/draw_pt_util.c
index 32c8a9632c..3bc7939c55 100644
--- a/src/gallium/auxiliary/draw/draw_pt_util.c
+++ b/src/gallium/auxiliary/draw/draw_pt_util.c
@@ -30,7 +30,6 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "pipe/p_util.h"
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
 #include "draw/draw_pt.h"
diff --git a/src/gallium/auxiliary/draw/draw_pt_varray.c b/src/gallium/auxiliary/draw/draw_pt_varray.c
index 46e722a154..c15afe65f1 100644
--- a/src/gallium/auxiliary/draw/draw_pt_varray.c
+++ b/src/gallium/auxiliary/draw/draw_pt_varray.c
@@ -25,7 +25,9 @@
  *
  **************************************************************************/
 
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
 #include "draw/draw_pt.h"
diff --git a/src/gallium/auxiliary/draw/draw_pt_vcache.c b/src/gallium/auxiliary/draw/draw_pt_vcache.c
index cda2987c9e..b8b5de729d 100644
--- a/src/gallium/auxiliary/draw/draw_pt_vcache.c
+++ b/src/gallium/auxiliary/draw/draw_pt_vcache.c
@@ -30,7 +30,7 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
 #include "draw/draw_pt.h"
diff --git a/src/gallium/auxiliary/draw/draw_vbuf.h b/src/gallium/auxiliary/draw/draw_vbuf.h
index e90f37872a..62247ccd9f 100644
--- a/src/gallium/auxiliary/draw/draw_vbuf.h
+++ b/src/gallium/auxiliary/draw/draw_vbuf.h
@@ -37,8 +37,6 @@
 #define DRAW_VBUF_H_
 
 
-#include "pipe/p_util.h"
-
 
 struct draw_context;
 struct vertex_info;
diff --git a/src/gallium/auxiliary/draw/draw_vs.c b/src/gallium/auxiliary/draw/draw_vs.c
index f798b20492..34adbd49b0 100644
--- a/src/gallium/auxiliary/draw/draw_vs.c
+++ b/src/gallium/auxiliary/draw/draw_vs.c
@@ -31,11 +31,15 @@
   *   Brian Paul
   */
 
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
 #include "pipe/p_shader_tokens.h"
+
 #include "draw_private.h"
 #include "draw_context.h"
 #include "draw_vs.h"
+
 #include "translate/translate.h"
 #include "translate/translate_cache.h"
 
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c
index 41bdd012d5..760fcb389f 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.c
@@ -29,9 +29,9 @@
  */
 
 
-#include "pipe/p_util.h"
-#include "pipe/p_shader_tokens.h"
+#include "util/u_memory.h"
 #include "util/u_math.h"
+#include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
 #include "tgsi/tgsi_exec.h"
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
index eda677cc62..ab3c5b94a5 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos_io.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_machine.c b/src/gallium/auxiliary/draw/draw_vs_aos_machine.c
index e029b7b4bb..b358bd2df4 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos_machine.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos_machine.c
@@ -29,8 +29,9 @@
 #include "pipe/p_config.h"
 
 
-#include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
 #include "tgsi/tgsi_exec.h"
diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c
index e26903d8cc..44563803f9 100644
--- a/src/gallium/auxiliary/draw/draw_vs_exec.c
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@@ -31,7 +31,8 @@
   *   Brian Paul
   */
 
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "pipe/p_shader_tokens.h"
 
 #include "draw_private.h"
diff --git a/src/gallium/auxiliary/draw/draw_vs_llvm.c b/src/gallium/auxiliary/draw/draw_vs_llvm.c
index fc03473b91..2ce30b9a02 100644
--- a/src/gallium/auxiliary/draw/draw_vs_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_vs_llvm.c
@@ -32,7 +32,6 @@
   *   Brian Paul
   */
 
-#include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #include "draw_private.h"
 #include "draw_context.h"
diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c
index 61f0c084c3..0efabd9de8 100644
--- a/src/gallium/auxiliary/draw/draw_vs_sse.c
+++ b/src/gallium/auxiliary/draw/draw_vs_sse.c
@@ -31,13 +31,14 @@
   *   Brian Paul
   */
 
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "pipe/p_config.h"
 
 #include "draw_vs.h"
 
 #if defined(PIPE_ARCH_X86)
 
-#include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 
 #include "draw_private.h"
diff --git a/src/gallium/auxiliary/draw/draw_vs_varient.c b/src/gallium/auxiliary/draw/draw_vs_varient.c
index 994ce3e889..4daf05dae7 100644
--- a/src/gallium/auxiliary/draw/draw_vs_varient.c
+++ b/src/gallium/auxiliary/draw/draw_vs_varient.c
@@ -30,7 +30,8 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
 #include "draw/draw_vbuf.h"
diff --git a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
index cf5b978837..e64bfb1c6c 100644
--- a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
+++ b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
@@ -41,11 +41,12 @@
 
 #include "pipe/p_context.h"
 #include "pipe/p_shader_tokens.h"
-#include "pipe/p_util.h"
 
 #include "tgsi/tgsi_exec.h"
 #include "tgsi/tgsi_dump.h"
 
+#include "util/u_memory.h"
+
 #include <llvm/Module.h>
 #include <llvm/CallingConv.h>
 #include <llvm/Constants.h>
diff --git a/src/gallium/auxiliary/gallivm/instructions.cpp b/src/gallium/auxiliary/gallivm/instructions.cpp
index 035224e8f3..a82dc30306 100644
--- a/src/gallium/auxiliary/gallivm/instructions.cpp
+++ b/src/gallium/auxiliary/gallivm/instructions.cpp
@@ -35,7 +35,7 @@
 
 #include "storage.h"
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 #include <llvm/CallingConv.h>
 #include <llvm/Constants.h>
diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.cpp b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
index 76049ade7c..efddc04e81 100644
--- a/src/gallium/auxiliary/gallivm/instructionssoa.cpp
+++ b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
@@ -29,7 +29,7 @@
 #include "storagesoa.h"
 
 #include "pipe/p_shader_tokens.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 #include <llvm/CallingConv.h>
 #include <llvm/Constants.h>
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
index ce41418a0f..8ae052e875 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
@@ -45,7 +45,7 @@
 #include "pipe/p_debug.h"
 #include "pipe/p_winsys.h"
 #include "pipe/p_thread.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "util/u_double_list.h"
 
 #include "pb_buffer.h"
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c
index e90d2e5623..20fc87b39d 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c
@@ -35,7 +35,7 @@
 
 
 #include "pipe/p_debug.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pb_buffer.h"
 #include "pb_bufmgr.h"
 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c
index 0d2d6c0c1b..2afaeafa1a 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c
@@ -35,7 +35,7 @@
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_debug.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 #include "pb_buffer.h"
 #include "pb_bufmgr.h"
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
index bed4bec4fe..b914c2d0fe 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
@@ -38,7 +38,7 @@
 #include "pipe/p_debug.h"
 #include "pipe/p_winsys.h"
 #include "pipe/p_thread.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "util/u_double_list.h"
 #include "util/u_time.h"
 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
index d02e3500ff..5e518370d0 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
@@ -37,7 +37,7 @@
 #include "pipe/p_debug.h"
 #include "pipe/p_winsys.h"
 #include "pipe/p_thread.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "util/u_double_list.h"
 #include "util/u_time.h"
 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
index 05efd8ce41..8fc63ce648 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
@@ -35,7 +35,7 @@
 
 
 #include "pipe/p_debug.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 #include "pb_buffer.h"
 #include "pb_buffer_fenced.h"
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
index c51e582611..b40eb6cc90 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
@@ -36,7 +36,7 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_debug.h"
 #include "pipe/p_thread.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "util/u_double_list.h"
 #include "util/u_mm.h"
 #include "pb_buffer.h"
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
index 95af08929a..93d2cc9635 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
@@ -39,7 +39,7 @@
 #include "pipe/p_debug.h"
 #include "pipe/p_thread.h"
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "util/u_double_list.h"
 
 #include "pb_buffer.h"
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
index 598d9ce310..af307e265a 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
@@ -39,7 +39,7 @@
 #include "pipe/p_debug.h"
 #include "pipe/p_thread.h"
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "util/u_double_list.h"
 #include "util/u_time.h"
 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_validate.c b/src/gallium/auxiliary/pipebuffer/pb_validate.c
index 362fd896f3..1e54fc39d4 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_validate.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_validate.c
@@ -35,7 +35,7 @@
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_error.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_debug.h"
 
 #include "pb_buffer.h"
diff --git a/src/gallium/auxiliary/pipebuffer/pb_winsys.c b/src/gallium/auxiliary/pipebuffer/pb_winsys.c
index 978944091f..28d137dbc4 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_winsys.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_winsys.c
@@ -35,7 +35,7 @@
 
 
 #include "pipe/p_winsys.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 #include "pb_buffer.h"
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_execmem.c b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
index 300c1c2d9d..dfa5c35ab6 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_execmem.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
@@ -33,7 +33,7 @@
 #include "pipe/p_compiler.h"
 #include "pipe/p_debug.h"
 #include "pipe/p_thread.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 #include "rtasm_execmem.h"
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 7f6bf577b2..285ddc0e3f 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -30,7 +30,7 @@
  */
 
 #include "pipe/p_compiler.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "rtasm_ppc_spe.h"
 
 #ifdef GALLIUM_CELL
diff --git a/src/gallium/auxiliary/sct/sct.c b/src/gallium/auxiliary/sct/sct.c
index 5e4126e014..49bb7ea92e 100644
--- a/src/gallium/auxiliary/sct/sct.c
+++ b/src/gallium/auxiliary/sct/sct.c
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_state.h"
 #include "pipe/p_inlines.h"
 #include "sct.h"
diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c
index 050b448fe7..74614d3688 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.c
@@ -26,7 +26,6 @@
  **************************************************************************/
 
 #include "pipe/p_debug.h"
-#include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi_build.h"
 #include "tgsi_parse.h"
diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.h b/src/gallium/auxiliary/tgsi/tgsi_build.h
index 6ae7f324f8..7d6234746a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.h
@@ -28,6 +28,10 @@
 #ifndef TGSI_BUILD_H
 #define TGSI_BUILD_H
 
+
+struct tgsi_token;
+
+
 #if defined __cplusplus
 extern "C" {
 #endif
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump_c.c b/src/gallium/auxiliary/tgsi/tgsi_dump_c.c
index 1025866a25..be25cb45a0 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump_c.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump_c.c
@@ -26,7 +26,6 @@
  **************************************************************************/
 
 #include "pipe/p_debug.h"
-#include "pipe/p_util.h"
 #include "util/u_string.h"
 #include "tgsi_dump_c.h"
 #include "tgsi_build.h"
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index e28b56c842..fb573fe1f0 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -52,11 +52,11 @@
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_state.h"
-#include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
 #include "tgsi_exec.h"
+#include "util/u_memory.h"
 #include "util/u_math.h"
 
 #define FAST_MATH 1
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c
index d16f0cdcad..3757486ba9 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c
@@ -26,10 +26,10 @@
  **************************************************************************/
 
 #include "pipe/p_debug.h"
-#include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi_parse.h"
 #include "tgsi_build.h"
+#include "util/u_memory.h"
 
 void
 tgsi_full_token_init(
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index 59bcf10b53..be4870a498 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -33,11 +33,11 @@
  */
 
 
-#include "tgsi_scan.h"
-#include "tgsi/tgsi_parse.h"
+#include "util/u_math.h"
 #include "tgsi/tgsi_build.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_scan.h"
 
-#include "pipe/p_util.h"
 
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index 00ed4da450..626724ad4e 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -25,7 +25,7 @@
  * 
  **************************************************************************/
 
-#include "pipe/p_util.h"
+#include "pipe/p_debug.h"
 #include "pipe/p_shader_tokens.h"
 #include "util/u_math.h"
 #include "tgsi/tgsi_parse.h"
diff --git a/src/gallium/auxiliary/tgsi/tgsi_transform.c b/src/gallium/auxiliary/tgsi/tgsi_transform.c
index 357f77b05a..ea87da31e5 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_transform.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_transform.c
@@ -31,6 +31,7 @@
  * Authors:  Brian Paul
  */
 
+#include "pipe/p_debug.h"
 
 #include "tgsi_transform.h"
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_transform.h b/src/gallium/auxiliary/tgsi/tgsi_transform.h
index 3da0b38271..a121adbaef 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_transform.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_transform.h
@@ -29,7 +29,6 @@
 #define TGSI_TRANSFORM_H
 
 
-#include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_build.h"
diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.c b/src/gallium/auxiliary/tgsi/tgsi_util.c
index 09486e649e..50101a9bb0 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_util.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_util.c
@@ -26,7 +26,6 @@
  **************************************************************************/
 
 #include "pipe/p_debug.h"
-#include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi_parse.h"
 #include "tgsi_build.h"
diff --git a/src/gallium/auxiliary/translate/translate.c b/src/gallium/auxiliary/translate/translate.c
index b93fbf9033..7678903f75 100644
--- a/src/gallium/auxiliary/translate/translate.c
+++ b/src/gallium/auxiliary/translate/translate.c
@@ -31,7 +31,6 @@
   */
 
 #include "pipe/p_config.h"
-#include "pipe/p_util.h"
 #include "pipe/p_state.h"
 #include "translate.h"
 
diff --git a/src/gallium/auxiliary/translate/translate_cache.c b/src/gallium/auxiliary/translate/translate_cache.c
index 115dc9287e..d8069a149c 100644
--- a/src/gallium/auxiliary/translate/translate_cache.c
+++ b/src/gallium/auxiliary/translate/translate_cache.c
@@ -25,7 +25,7 @@
  *
  **************************************************************************/
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_state.h"
 #include "translate.h"
 #include "translate_cache.h"
diff --git a/src/gallium/auxiliary/translate/translate_generic.c b/src/gallium/auxiliary/translate/translate_generic.c
index 4c8179ffa8..4d336f47ea 100644
--- a/src/gallium/auxiliary/translate/translate_generic.c
+++ b/src/gallium/auxiliary/translate/translate_generic.c
@@ -30,7 +30,7 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_state.h"
 #include "translate.h"
 
diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c
index 18a212ac1c..7955186e16 100644
--- a/src/gallium/auxiliary/translate/translate_sse.c
+++ b/src/gallium/auxiliary/translate/translate_sse.c
@@ -28,7 +28,7 @@
 
 #include "pipe/p_config.h"
 #include "pipe/p_compiler.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "util/u_simple_list.h"
 
 #include "translate.h"
diff --git a/src/gallium/auxiliary/util/Makefile b/src/gallium/auxiliary/util/Makefile
index 6eebf6d29b..6e5fd26c05 100644
--- a/src/gallium/auxiliary/util/Makefile
+++ b/src/gallium/auxiliary/util/Makefile
@@ -5,7 +5,6 @@ LIBNAME = util
 
 C_SOURCES = \
 	p_debug.c \
-	p_tile.c \
 	u_blit.c \
 	u_draw_quad.c \
 	u_gen_mipmap.c \
@@ -16,6 +15,7 @@ C_SOURCES = \
 	u_rect.c \
 	u_simple_shaders.c \
 	u_snprintf.c \
+	u_tile.c \
 	u_time.c
 
 include ../../Makefile.template
diff --git a/src/gallium/auxiliary/util/SConscript b/src/gallium/auxiliary/util/SConscript
index 94382fe1f9..ce3fad7068 100644
--- a/src/gallium/auxiliary/util/SConscript
+++ b/src/gallium/auxiliary/util/SConscript
@@ -6,7 +6,6 @@ util = env.ConvenienceLibrary(
 		'p_debug.c',
 		'p_debug_mem.c',
 		'p_debug_prof.c',
-		'p_tile.c',
 		'u_blit.c',
 		'u_draw_quad.c',
 		'u_gen_mipmap.c',
@@ -17,6 +16,7 @@ util = env.ConvenienceLibrary(
 		'u_rect.c',
 		'u_simple_shaders.c',
 		'u_snprintf.c',
+		'u_tile.c',
 		'u_time.c',
 	])
 
diff --git a/src/gallium/auxiliary/util/p_debug.c b/src/gallium/auxiliary/util/p_debug.c
index 2c2f2f8931..7d1dba5a24 100644
--- a/src/gallium/auxiliary/util/p_debug.c
+++ b/src/gallium/auxiliary/util/p_debug.c
@@ -51,7 +51,6 @@
 #endif
 
 #include "pipe/p_compiler.h" 
-#include "pipe/p_util.h" 
 #include "pipe/p_debug.h" 
 #include "pipe/p_format.h" 
 #include "pipe/p_state.h" 
diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c
index ae087df4cf..05399f9885 100644
--- a/src/gallium/auxiliary/util/u_blit.c
+++ b/src/gallium/auxiliary/util/u_blit.c
@@ -37,12 +37,13 @@
 #include "pipe/p_debug.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "pipe/p_util.h"
 #include "pipe/p_winsys.h"
 #include "pipe/p_shader_tokens.h"
 
-#include "util/u_draw_quad.h"
 #include "util/u_blit.h"
+#include "util/u_draw_quad.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "util/u_simple_shaders.h"
 
 #include "cso_cache/cso_context.h"
diff --git a/src/gallium/auxiliary/util/u_gen_mipmap.c b/src/gallium/auxiliary/util/u_gen_mipmap.c
index 8713ff5d58..c1e2c19f87 100644
--- a/src/gallium/auxiliary/util/u_gen_mipmap.c
+++ b/src/gallium/auxiliary/util/u_gen_mipmap.c
@@ -37,10 +37,10 @@
 #include "pipe/p_debug.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "pipe/p_util.h"
 #include "pipe/p_winsys.h"
 #include "pipe/p_shader_tokens.h"
 
+#include "util/u_memory.h"
 #include "util/u_draw_quad.h"
 #include "util/u_gen_mipmap.h"
 #include "util/u_simple_shaders.h"
diff --git a/src/gallium/auxiliary/util/u_handle_table.c b/src/gallium/auxiliary/util/u_handle_table.c
index 2176a00959..2c40011923 100644
--- a/src/gallium/auxiliary/util/u_handle_table.c
+++ b/src/gallium/auxiliary/util/u_handle_table.c
@@ -35,9 +35,9 @@
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_debug.h"
-#include "pipe/p_util.h"
 
-#include "u_handle_table.h"
+#include "util/u_memory.h"
+#include "util/u_handle_table.h"
 
 
 #define HANDLE_TABLE_INITIAL_SIZE 16  
diff --git a/src/gallium/auxiliary/util/u_hash_table.c b/src/gallium/auxiliary/util/u_hash_table.c
index dd5eca7fca..0bc8de9632 100644
--- a/src/gallium/auxiliary/util/u_hash_table.c
+++ b/src/gallium/auxiliary/util/u_hash_table.c
@@ -40,10 +40,11 @@
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_debug.h"
-#include "pipe/p_util.h"
 
 #include "cso_cache/cso_hash.h"
-#include "u_hash_table.h"
+
+#include "util/u_memory.h"
+#include "util/u_hash_table.h"
 
 
 struct hash_table
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index a541d30a5d..9b4ca39371 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -40,8 +40,6 @@
 
 
 #include "pipe/p_compiler.h"
-#include "pipe/p_util.h"
-#include "util/u_math.h"
 
 
 #ifdef __cplusplus
@@ -49,6 +47,132 @@ extern "C" {
 #endif
 
 
+#if defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
+__inline double ceil(double val)
+{
+   double ceil_val;
+   /* Replacement ceil(): (long) val truncates toward zero, so val must fit in a long. */
+   if((val - (long) val) == 0) {
+      ceil_val = val;   /* already integral */
+   }
+   else {
+      if(val > 0) {
+         ceil_val = (long) val + 1;   /* positive fraction: one above the truncation */
+      }
+      else {
+         ceil_val = (long) val;   /* negative fraction: truncation already rounds up */
+      }
+   }
+
+   return ceil_val;
+}
+
+#ifndef PIPE_SUBSYSTEM_WINDOWS_CE
+__inline double floor(double val)
+{
+   double floor_val;
+   /* Replacement floor(): (long) val truncates toward zero, so val must fit in a long. */
+   if((val - (long) val) == 0) {
+      floor_val = val;   /* already integral */
+   }
+   else {
+      if(val > 0) {
+         floor_val = (long) val;   /* positive fraction: truncation already rounds down */
+      }
+      else {
+         floor_val = (long) val - 1;   /* negative fraction: one below the truncation */
+      }
+   }
+
+   return floor_val;
+}
+#endif
+
+#pragma function(pow)
+__inline double __cdecl pow(double val, double exponent)
+{
+   /* XXX: unimplemented stub -- trips assert(0) and otherwise returns 0 */
+   assert(0);
+   return 0;
+}
+
+#pragma function(log)
+__inline double __cdecl log(double val)
+{
+   /* XXX: unimplemented stub -- trips assert(0) and otherwise returns 0 */
+   assert(0);
+   return 0;
+}
+
+#pragma function(atan2)
+__inline double __cdecl atan2(double val, double val2)
+{
+   /* XXX: unimplemented stub with the standard two-argument shape; asserts, returns 0 */
+   assert(0);
+   return 0;
+}
+#else
+#include <math.h>
+#include <stdarg.h>
+#endif
+
+
+#if defined(_MSC_VER) 
+#if _MSC_VER < 1400 && !defined(__cplusplus) || defined(PIPE_SUBSYSTEM_WINDOWS_CE)
+/* Single-precision math wrappers: each one calls the double routine and narrows the result. */
+static INLINE float cosf( float f ) 
+{
+   return (float) cos( (double) f );
+}
+
+static INLINE float sinf( float f ) 
+{
+   return (float) sin( (double) f );
+}
+
+static INLINE float ceilf( float f ) 
+{
+   return (float) ceil( (double) f );
+}
+
+static INLINE float floorf( float f ) 
+{
+   return (float) floor( (double) f );
+}
+
+static INLINE float powf( float f, float g ) 
+{
+   return (float) pow( (double) f, (double) g );
+}
+
+static INLINE float sqrtf( float f ) 
+{
+   return (float) sqrt( (double) f );
+}
+
+static INLINE float fabsf( float f ) 
+{
+   return (float) fabs( (double) f );
+}
+
+static INLINE float logf( float f ) 
+{
+   return (float) log( (double) f );
+}
+
+#else
+/* Work-around an extra semi-colon in VS 2005 logf definition */
+#ifdef logf
+#undef logf
+#define logf(x) ((float)log((double)(x)))
+#endif /* logf */
+#endif
+#endif /* _MSC_VER */
+
+
+
+
+
 #define POW2_TABLE_SIZE 256
 #define POW2_TABLE_SCALE ((float) (POW2_TABLE_SIZE-1))
 extern float pow2_table[POW2_TABLE_SIZE];
@@ -59,6 +183,11 @@ extern void
 util_init_math(void);
 
 
+union fi {        /* overlays a float with integers (assumes int and float share a size) */
+   float f;
+   int i;         /* the float's storage read as a signed int */
+   unsigned ui;   /* the float's storage read as an unsigned int */
+};
 
 
 /**
@@ -195,6 +324,113 @@ util_iround(float f)
 
 
+#if defined(PIPE_CC_MSVC) && defined(PIPE_ARCH_X86)
+/**
+ * Find first bit set in word (MSVC x86 inline-assembly version).
+ * Least significant bit is 1.  Return 0 if no bits set.
+ */
+static INLINE
+unsigned ffs( unsigned u )
+{
+   unsigned i;
+   /* zero is answered up front, before the bsf sequence runs */
+   if( u == 0 ) {
+      return 0;
+   }
+
+   __asm bsf eax, [u]
+   __asm inc eax
+   __asm mov [i], eax
+
+   return i;
+}
+#endif
+
+
+/**
+ * Reinterpret the bits of a float as an unsigned integer.
+ */
+static INLINE unsigned
+fui( float f )
+{
+   union fi bits;
+   bits.f = f;
+   return bits.ui;
+}
+
+/** Convert a ubyte in [0,255] to a float in [0,1]. */
+static INLINE float
+ubyte_to_float(ubyte ub)
+{
+   const float scale = 1.0f / 255.0f;
+   return scale * (float) ub;
+}
+
+
+/**
+ * Convert float in [0,1] to ubyte in [0,255] with clamping.
+ */
+static INLINE ubyte
+float_to_ubyte(float f)
+{
+   const int ieee_0996 = 0x3f7f0000;   /* bit pattern of 255/256 (0.996 or so) */
+   union fi tmp;
+   /* The clamping tests compare the float's raw bits as a signed int (union fi). */
+   tmp.f = f;
+   if (tmp.i < 0) {
+      return (ubyte) 0;
+   }
+   else if (tmp.i >= ieee_0996) {
+      return (ubyte) 255;
+   }
+   else {
+      tmp.f = tmp.f * (255.0f/256.0f) + 32768.0f;   /* assuming IEEE-754 singles, this leaves round(f*255) in the low 8 bits */
+      return (ubyte) tmp.i;
+   }
+}
+
+
+/* Note: CLAMP, MIN2 and MAX2 evaluate their arguments more than once; avoid side effects. */
+#define CLAMP( X, MIN, MAX )  ( (X)<(MIN) ? (MIN) : ((X)>(MAX) ? (MAX) : (X)) )
+
+#define MIN2( A, B )   ( (A)<(B) ? (A) : (B) )
+#define MAX2( A, B )   ( (A)>(B) ? (A) : (B) )
+
+/** Round value up to a multiple of alignment; the mask trick needs a power-of-two alignment. */
+static INLINE int
+align(int value, int alignment)
+{
+   return (value + alignment - 1) & ~(alignment - 1);
+}
+
+/* Copy elements 0..3 of SRC into DST; each argument is evaluated four times. */
+#ifndef COPY_4V
+#define COPY_4V( DST, SRC )         \
+do {                                \
+   (DST)[0] = (SRC)[0];             \
+   (DST)[1] = (SRC)[1];             \
+   (DST)[2] = (SRC)[2];             \
+   (DST)[3] = (SRC)[3];             \
+} while (0)
+#endif
+
+/* COPY_4FV is simply an alias for COPY_4V. */
+#ifndef COPY_4FV
+#define COPY_4FV( DST, SRC )  COPY_4V(DST, SRC)
+#endif
+
+/* Store V0..V3 into elements 0..3 of DST; DST is evaluated four times. */
+#ifndef ASSIGN_4V
+#define ASSIGN_4V( DST, V0, V1, V2, V3 ) \
+do {                                     \
+   (DST)[0] = (V0);                      \
+   (DST)[1] = (V1);                      \
+   (DST)[2] = (V2);                      \
+   (DST)[3] = (V3);                      \
+} while (0)
+#endif
+
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gallium/auxiliary/util/u_memory.h b/src/gallium/auxiliary/util/u_memory.h
new file mode 100644
index 0000000000..148a5cb997
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_memory.h
@@ -0,0 +1,222 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+/**
+ * Memory functions
+ */
+
+
+#ifndef U_MEMORY_H
+#define U_MEMORY_H
+
+
+#include "util/u_pointer.h"
+
+
+/* Define ENOMEM for Windows CE.  The test below also holds when _WIN32_WCE is undefined (it then reads as 0). */
+#if (_WIN32_WCE < 600)
+#ifndef ENOMEM
+#define ENOMEM 12
+#endif
+#endif
+
+
+
+#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) && defined(DEBUG) 
+
+/* memory debugging: every allocator macro forwards to a debug_* routine with the call site's file, line and function */
+
+#include "p_debug.h"
+
+#define MALLOC( _size ) \
+   debug_malloc( __FILE__, __LINE__, __FUNCTION__, _size )
+#define CALLOC( _count, _size ) \
+   debug_calloc(__FILE__, __LINE__, __FUNCTION__, _count, _size )
+#define FREE( _ptr ) \
+   debug_free( __FILE__, __LINE__, __FUNCTION__,  _ptr )
+#define REALLOC( _ptr, _old_size, _size ) \
+   debug_realloc( __FILE__, __LINE__, __FUNCTION__,  _ptr, _old_size, _size )
+
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
+/* Only MALLOC and _FREE are defined here; CALLOC, FREE and REALLOC come from the #ifndef fallbacks. */
+void * __stdcall
+EngAllocMem(
+    unsigned long Flags,
+    unsigned long MemSize,
+    unsigned long Tag );
+
+void __stdcall
+EngFreeMem(
+    void *Mem );
+
+#define MALLOC( _size ) EngAllocMem( 0, _size, 'D3AG' )
+#define _FREE( _ptr ) EngFreeMem( _ptr )
+
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
+/* Only MALLOC and _FREE are defined here; CALLOC, FREE and REALLOC come from the #ifndef fallbacks. */
+void *
+ExAllocatePool(
+    unsigned long PoolType, 
+    size_t NumberOfBytes);
+
+void 
+ExFreePool(void *P);
+
+#define MALLOC(_size) ExAllocatePool(0, _size)
+#define _FREE(_ptr) ExFreePool(_ptr)
+
+#else
+/* Default: map straight onto the C library allocator; REALLOC ignores OLDSIZE here. */
+#define MALLOC( SIZE )  malloc( SIZE )
+#define CALLOC( COUNT, SIZE )   calloc( COUNT, SIZE )
+#define FREE( PTR )  free( PTR )
+#define REALLOC( OLDPTR, OLDSIZE, NEWSIZE )  realloc( OLDPTR, NEWSIZE )
+
+#endif
+
+/* Fallback CALLOC: zero-filled MALLOC; returns NULL if count * size would wrap around. */
+#ifndef CALLOC
+static INLINE void *
+CALLOC( unsigned count, unsigned size )
+{
+   void *ptr = (size && count > ~0u / size) ? NULL : MALLOC( count * size );
+   if( ptr ) {
+      memset( ptr, 0, count * size );
+   }
+   return ptr;
+}
+#endif /* !CALLOC */
+
+#ifndef FREE /* fallback: forward only non-NULL pointers to _FREE */
+static INLINE void
+FREE( void *ptr )
+{
+   if( !ptr )
+      return;
+   _FREE( ptr );
+}
+#endif /* !FREE */
+
+#ifndef REALLOC
+static INLINE void *
+REALLOC( void *old_ptr, unsigned old_size, unsigned new_size )
+{
+   void *new_ptr = NULL;
+   /* Fallback: allocate a new block, copy min(old_size, new_size) bytes, then free the old one. */
+   if (new_size != 0) {
+      unsigned copy_size = old_size < new_size ? old_size : new_size;
+      new_ptr = MALLOC( new_size );
+      if (new_ptr && old_ptr && copy_size) {
+         memcpy( new_ptr, old_ptr, copy_size );
+      }
+   }
+   /* old_ptr is released even when MALLOC failed, unlike realloc(). */
+   FREE( old_ptr );
+   return new_ptr;
+}
+#endif /* !REALLOC */
+
+/* Allocate one struct T via MALLOC or CALLOC; pass the struct tag without the struct keyword. */
+#define MALLOC_STRUCT(T)   (struct T *) MALLOC(sizeof(struct T))
+
+#define CALLOC_STRUCT(T)   (struct T *) CALLOC(1, sizeof(struct T))
+
+
+/**
+ * Return memory on given byte alignment, or NULL on failure; free it with align_free().
+ */
+static INLINE void *
+align_malloc(size_t bytes, uint alignment)
+{
+#if defined(HAVE_POSIX_MEMALIGN)
+   void *mem;
+   alignment = (alignment + (uint)sizeof(void*) - 1) & ~((uint)sizeof(void*) - 1);   /* round up to a multiple of sizeof(void*) */
+   if(posix_memalign(& mem, alignment, bytes) != 0)
+      return NULL;
+   return mem;
+#else
+   char *ptr, *buf;
+
+   assert( alignment > 0 );
+   /* Over-allocate: room for the alignment slack plus one pointer-sized slot. */
+   ptr = (char *) MALLOC(bytes + alignment + sizeof(void *));
+   if (!ptr)
+      return NULL;
+   /* The address MALLOC returned is stored in the slot just below buf, where align_free() reads it back. */
+   buf = (char *) align_pointer( ptr + sizeof(void *), alignment );
+   *(char **)(buf - sizeof(void *)) = ptr;
+
+   return buf;
+#endif /* defined(HAVE_POSIX_MEMALIGN) */
+}
+
+/**
+ * Free memory returned by align_malloc(); the fallback path ignores a NULL ptr.
+ */
+static INLINE void
+align_free(void *ptr)
+{
+#if defined(HAVE_POSIX_MEMALIGN)
+   FREE(ptr);
+#else
+   if (ptr) {   /* NULL has no stashed address below it to read */
+      FREE(*(void **) ((char *) ptr - sizeof(void *)));
+   }
+#endif /* defined(HAVE_POSIX_MEMALIGN) */
+}
+
+
+/**
+ * Return a freshly MALLOC'ed copy of size bytes from src, or NULL if MALLOC fails.
+ */
+static INLINE void *
+mem_dup(const void *src, uint size)
+{
+   void *copy = MALLOC(size);
+   if (!copy)
+      return NULL;
+   return memcpy(copy, src, size);
+}
+
+
+/**
+ * Number of elements in an array (x must be a true array, not a pointer).
+ */
+#ifndef Elements
+#define Elements(x) (sizeof(x)/sizeof((x)[0]))
+#endif
+
+
+/**
+ * Offset of a field in a struct, in bytes (converted via size_t so 64-bit pointers narrow cleanly).
+ */
+#define Offset(TYPE, MEMBER) ((unsigned)(size_t)&(((TYPE *)NULL)->MEMBER))
+
+
+
+#endif /* U_MEMORY_H */
diff --git a/src/gallium/auxiliary/util/u_mm.c b/src/gallium/auxiliary/util/u_mm.c
index b49ae074e0..0f51dd5977 100644
--- a/src/gallium/auxiliary/util/u_mm.c
+++ b/src/gallium/auxiliary/util/u_mm.c
@@ -24,9 +24,9 @@
 
 
 #include "pipe/p_compiler.h"
-#include "pipe/p_util.h"
 #include "pipe/p_debug.h"
 
+#include "util/u_memory.h"
 #include "util/u_mm.h"
 
 
diff --git a/src/gallium/auxiliary/util/u_pack_color.h b/src/gallium/auxiliary/util/u_pack_color.h
index 06abb34d5a..39e4ae9d07 100644
--- a/src/gallium/auxiliary/util/u_pack_color.h
+++ b/src/gallium/auxiliary/util/u_pack_color.h
@@ -37,6 +37,7 @@
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_format.h"
+#include "util/u_math.h"
 
 
 /**
@@ -150,10 +151,10 @@ util_pack_color(const float rgba[4], enum pipe_format format, void *dest)
 
    if (pf_size_x(format) <= 8) {
       /* format uses 8-bit components or less */
-      UNCLAMPED_FLOAT_TO_UBYTE(r, rgba[0]);
-      UNCLAMPED_FLOAT_TO_UBYTE(g, rgba[1]);
-      UNCLAMPED_FLOAT_TO_UBYTE(b, rgba[2]);
-      UNCLAMPED_FLOAT_TO_UBYTE(a, rgba[3]);
+      r = float_to_ubyte(rgba[0]);
+      g = float_to_ubyte(rgba[1]);
+      b = float_to_ubyte(rgba[2]);
+      a = float_to_ubyte(rgba[3]);
    }
 
    switch (format) {
@@ -286,4 +287,31 @@ util_pack_z(enum pipe_format format, double z)
 }
 
 
+/**
+ * Pack 4 ubytes into a 4-byte word; b0 lands in the lowest byte, b3 in the highest.
+ */
+static INLINE unsigned
+pack_ub4(ubyte b0, ubyte b1, ubyte b2, ubyte b3)
+{
+   unsigned word = (unsigned) b0;
+   word |= (unsigned) b1 << 8;
+   word |= (unsigned) b2 << 16;
+   return word | ((unsigned) b3 << 24);
+}
+
+
+/**
+ * Convert 4 floats to ubytes with float_to_ubyte() and pack them with pack_ub4().
+ */
+static INLINE unsigned
+pack_ui32_float4(float a, float b, float c, float d)
+{
+   const ubyte ub_a = float_to_ubyte(a);
+   const ubyte ub_b = float_to_ubyte(b);
+   const ubyte ub_c = float_to_ubyte(c);
+   return pack_ub4( ub_a, ub_b, ub_c, float_to_ubyte(d) );
+}
+
+
+
 #endif /* U_PACK_COLOR_H */
diff --git a/src/gallium/auxiliary/util/u_pointer.h b/src/gallium/auxiliary/util/u_pointer.h
new file mode 100644
index 0000000000..e1af9f11cb
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_pointer.h
@@ -0,0 +1,107 @@
+/**************************************************************************
+ * 
+ * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef U_POINTER_H
+#define U_POINTER_H
+
+#include "pipe/p_compiler.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE intptr_t
+pointer_to_intptr( const void *p )
+{
+   union {
+      const void *p;
+      intptr_t i;
+   } pi;
+   pi.p = p;
+   return pi.i;
+}
+
+static INLINE void *
+intptr_to_pointer( intptr_t i )
+{
+   union {
+      void *p;
+      intptr_t i;
+   } pi;
+   pi.i = i;
+   return pi.p;
+}
+
+static INLINE uintptr_t
+pointer_to_uintptr( const void *ptr )
+{
+   union {
+      const void *p;
+      uintptr_t u;
+   } pu;
+   pu.p = ptr;
+   return pu.u;
+}
+
+static INLINE void *
+uintptr_to_pointer( uintptr_t u )
+{
+   union {
+      void *p;
+      uintptr_t u;
+   } pu;
+   pu.u = u;
+   return pu.p;
+}
+
+/**
+ * Return a pointer aligned to next multiple of N bytes.
+ */
+static INLINE void *
+align_pointer( const void *unaligned, uintptr_t alignment )
+{
+   uintptr_t aligned = (pointer_to_uintptr( unaligned ) + alignment - 1) & ~(alignment - 1);
+   return uintptr_to_pointer( aligned );
+}
+
+
+/**
+ * Return a pointer aligned to next multiple of 16 bytes.
+ */
+static INLINE void *
+align16( void *unaligned )
+{
+   return align_pointer( unaligned, 16 );
+}
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* U_POINTER_H */
diff --git a/src/gallium/auxiliary/util/u_rect.c b/src/gallium/auxiliary/util/u_rect.c
index 94e447b9d5..b31ab5415f 100644
--- a/src/gallium/auxiliary/util/u_rect.c
+++ b/src/gallium/auxiliary/util/u_rect.c
@@ -31,7 +31,6 @@
 
 
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
 #include "pipe/p_format.h"
 #include "util/u_rect.h"
 
diff --git a/src/gallium/auxiliary/util/u_simple_shaders.c b/src/gallium/auxiliary/util/u_simple_shaders.c
index c34fb6ee33..f06d13c2c4 100644
--- a/src/gallium/auxiliary/util/u_simple_shaders.c
+++ b/src/gallium/auxiliary/util/u_simple_shaders.c
@@ -37,10 +37,10 @@
 #include "pipe/p_debug.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "pipe/p_util.h"
 #include "pipe/p_winsys.h"
 #include "pipe/p_shader_tokens.h"
 
+#include "util/u_memory.h"
 #include "util/u_simple_shaders.h"
 
 #include "tgsi/tgsi_build.h"
diff --git a/src/gallium/auxiliary/util/u_tile.c b/src/gallium/auxiliary/util/u_tile.c
new file mode 100644
index 0000000000..853c503f4f
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_tile.c
@@ -0,0 +1,1169 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * RGBA/float tile get/put functions.
+ * Usable both by drivers and state trackers.
+ * Surfaces should already be in a mapped state.
+ */
+
+
+#include "pipe/p_defines.h"
+#include "pipe/p_inlines.h"
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_rect.h"
+#include "util/u_tile.h"
+
+
+/**
+ * Move raw block of pixels from surface to user memory.
+ * This should be usable by any hw driver that has mappable surfaces.
+ */
+void
+pipe_get_tile_raw(struct pipe_surface *ps,
+                  uint x, uint y, uint w, uint h,
+                  void *dst, int dst_stride)
+{
+   const void *src;
+
+   if (dst_stride == 0)
+      dst_stride = pf_get_nblocksx(&ps->block, w) * ps->block.size;
+
+   if (pipe_clip_tile(x, y, &w, &h, ps))
+      return;
+
+   src = pipe_surface_map(ps, PIPE_BUFFER_USAGE_CPU_READ);
+   assert(src);
+   if(!src)
+      return;
+
+   pipe_copy_rect(dst, &ps->block, dst_stride, 0, 0, w, h, src, ps->stride, x, y);
+
+   pipe_surface_unmap(ps);
+}
+
+
+/**
+ * Move raw block of pixels from user memory to surface.
+ * This should be usable by any hw driver that has mappable surfaces.
+ */
+void
+pipe_put_tile_raw(struct pipe_surface *ps,
+                  uint x, uint y, uint w, uint h,
+                  const void *src, int src_stride)
+{
+   void *dst;
+
+   if (src_stride == 0)
+      src_stride = pf_get_nblocksx(&ps->block, w) * ps->block.size;
+
+   if (pipe_clip_tile(x, y, &w, &h, ps))
+      return;
+
+   dst = pipe_surface_map(ps, PIPE_BUFFER_USAGE_CPU_WRITE);
+   assert(dst);
+   if(!dst)
+      return;
+
+   pipe_copy_rect(dst, &ps->block, ps->stride, x, y, w, h, src, src_stride, 0, 0);
+
+   pipe_surface_unmap(ps);
+}
+
+
+
+
+/** Convert short in [-32768,32767] to float in [-1.0,1.0] */
+#define SHORT_TO_FLOAT(S)   ((2.0F * (S) + 1.0F) * (1.0F/65535.0F))
+
+#define UNCLAMPED_FLOAT_TO_SHORT(us, f)  \
+   us = ( (short) ( CLAMP((f), -1.0, 1.0) * 32767.0F) )
+
+
+
+/*** PIPE_FORMAT_A8R8G8B8_UNORM ***/
+
+static void
+a8r8g8b8_get_tile_rgba(const unsigned *src,
+                       unsigned w, unsigned h,
+                       float *p,
+                       unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         const unsigned pixel = *src++;
+         pRow[0] = ubyte_to_float((pixel >> 16) & 0xff);
+         pRow[1] = ubyte_to_float((pixel >>  8) & 0xff);
+         pRow[2] = ubyte_to_float((pixel >>  0) & 0xff);
+         pRow[3] = ubyte_to_float((pixel >> 24) & 0xff);
+      }
+      p += dst_stride;
+   }
+}
+
+
+static void
+a8r8g8b8_put_tile_rgba(unsigned *dst,
+                       unsigned w, unsigned h,
+                       const float *p,
+                       unsigned src_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      const float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         unsigned r, g, b, a;
+         r = float_to_ubyte(pRow[0]);
+         g = float_to_ubyte(pRow[1]);
+         b = float_to_ubyte(pRow[2]);
+         a = float_to_ubyte(pRow[3]);
+         *dst++ = (a << 24) | (r << 16) | (g << 8) | b;
+      }
+      p += src_stride;
+   }
+}
+
+
+/*** PIPE_FORMAT_X8R8G8B8_UNORM ***/
+
+static void
+x8r8g8b8_get_tile_rgba(const unsigned *src,
+                       unsigned w, unsigned h,
+                       float *p,
+                       unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         const unsigned pixel = *src++;
+         pRow[0] = ubyte_to_float((pixel >> 16) & 0xff);
+         pRow[1] = ubyte_to_float((pixel >>  8) & 0xff);
+         pRow[2] = ubyte_to_float((pixel >>  0) & 0xff);
+         pRow[3] = ubyte_to_float(0xff);
+      }
+      p += dst_stride;
+   }
+}
+
+
+/* Pack float RGBA rows into X8R8G8B8 words; the X byte is always 0xff. */
+static void
+x8r8g8b8_put_tile_rgba(unsigned *dst,
+                       unsigned w, unsigned h,
+                       const float *p,
+                       unsigned src_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      const float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         const unsigned r = float_to_ubyte(pRow[0]);
+         const unsigned g = float_to_ubyte(pRow[1]);
+         const unsigned b = float_to_ubyte(pRow[2]);
+         *dst++ = (0xffu << 24) | (r << 16) | (g << 8) | b; /* 0xffu: no signed overflow */
+      }
+      p += src_stride;   /* src_stride is counted in floats */
+   }
+}
+
+
+/*** PIPE_FORMAT_B8G8R8A8_UNORM ***/
+
+static void
+b8g8r8a8_get_tile_rgba(const unsigned *src,
+                       unsigned w, unsigned h,
+                       float *p,
+                       unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         const unsigned pixel = *src++;
+         pRow[0] = ubyte_to_float((pixel >>  8) & 0xff);
+         pRow[1] = ubyte_to_float((pixel >> 16) & 0xff);
+         pRow[2] = ubyte_to_float((pixel >> 24) & 0xff);
+         pRow[3] = ubyte_to_float((pixel >>  0) & 0xff);
+      }
+      p += dst_stride;
+   }
+}
+
+
+static void
+b8g8r8a8_put_tile_rgba(unsigned *dst,
+                       unsigned w, unsigned h,
+                       const float *p,
+                       unsigned src_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      const float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         unsigned r, g, b, a;
+         r = float_to_ubyte(pRow[0]);
+         g = float_to_ubyte(pRow[1]);
+         b = float_to_ubyte(pRow[2]);
+         a = float_to_ubyte(pRow[3]);
+         *dst++ = (b << 24) | (g << 16) | (r << 8) | a;
+      }
+      p += src_stride;
+   }
+}
+
+
+/*** PIPE_FORMAT_A1R5G5B5_UNORM ***/
+
+static void
+a1r5g5b5_get_tile_rgba(const ushort *src,
+                       unsigned w, unsigned h,
+                       float *p,
+                       unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         const ushort pixel = *src++;
+         pRow[0] = ((pixel >> 10) & 0x1f) * (1.0f / 31.0f);
+         pRow[1] = ((pixel >>  5) & 0x1f) * (1.0f / 31.0f);
+         pRow[2] = ((pixel      ) & 0x1f) * (1.0f / 31.0f);
+         pRow[3] = ((pixel >> 15)       ) * 1.0f;
+      }
+      p += dst_stride;
+   }
+}
+
+
+static void
+a1r5g5b5_put_tile_rgba(ushort *dst,
+                       unsigned w, unsigned h,
+                       const float *p,
+                       unsigned src_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      const float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         unsigned r, g, b, a;
+         r = float_to_ubyte(pRow[0]);
+         g = float_to_ubyte(pRow[1]);
+         b = float_to_ubyte(pRow[2]);
+         a = float_to_ubyte(pRow[3]);
+         r = r >> 3;  /* 5 bits */
+         g = g >> 3;  /* 5 bits */
+         b = b >> 3;  /* 5 bits */
+         a = a >> 7;  /* 1 bit */
+         *dst++ = (a << 15) | (r << 10) | (g << 5) | b;
+      }
+      p += src_stride;
+   }
+}
+
+
+/*** PIPE_FORMAT_A4R4G4B4_UNORM ***/
+
+static void
+a4r4g4b4_get_tile_rgba(const ushort *src,
+                       unsigned w, unsigned h,
+                       float *p,
+                       unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         const ushort pixel = *src++;
+         pRow[0] = ((pixel >>  8) & 0xf) * (1.0f / 15.0f);
+         pRow[1] = ((pixel >>  4) & 0xf) * (1.0f / 15.0f);
+         pRow[2] = ((pixel      ) & 0xf) * (1.0f / 15.0f);
+         pRow[3] = ((pixel >> 12)      ) * (1.0f / 15.0f);
+      }
+      p += dst_stride;
+   }
+}
+
+
+/**
+ * Pack float RGBA rows into A4R4G4B4 (A 15..12, R 11..8, G 7..4, B 3..0,
+ * the layout a4r4g4b4_get_tile_rgba reads).  src_stride is in floats.
+ */
+static void
+a4r4g4b4_put_tile_rgba(ushort *dst,
+                       unsigned w, unsigned h,
+                       const float *p,
+                       unsigned src_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      const float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         /* keep the top 4 bits of each 8-bit channel */
+         const unsigned r = float_to_ubyte(pRow[0]) >> 4;
+         const unsigned g = float_to_ubyte(pRow[1]) >> 4;
+         const unsigned b = float_to_ubyte(pRow[2]) >> 4;
+         const unsigned a = float_to_ubyte(pRow[3]) >> 4;
+         *dst++ = (a << 12) | (r << 8) | (g << 4) | b;
+      }
+      p += src_stride;
+   }
+}
+
+
+/*** PIPE_FORMAT_R5G6B5_UNORM ***/
+
+static void
+r5g6b5_get_tile_rgba(const ushort *src,
+                     unsigned w, unsigned h,
+                     float *p,
+                     unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         const ushort pixel = *src++;
+         pRow[0] = ((pixel >> 11) & 0x1f) * (1.0f / 31.0f);
+         pRow[1] = ((pixel >>  5) & 0x3f) * (1.0f / 63.0f);
+         pRow[2] = ((pixel      ) & 0x1f) * (1.0f / 31.0f);
+         pRow[3] = 1.0f;
+      }
+      p += dst_stride;
+   }
+}
+
+
+static void
+r5g6b5_put_tile_rgba(ushort *dst,
+                     unsigned w, unsigned h,
+                     const float *p,
+                     unsigned src_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      const float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         uint r = (uint) (CLAMP(pRow[0], 0.0, 1.0) * 31.0);
+         uint g = (uint) (CLAMP(pRow[1], 0.0, 1.0) * 63.0);
+         uint b = (uint) (CLAMP(pRow[2], 0.0, 1.0) * 31.0);
+         *dst++ = (r << 11) | (g << 5) | (b);
+      }
+      p += src_stride;
+   }
+}
+
+
+
+/*** PIPE_FORMAT_Z16_UNORM ***/
+
+/**
+ * Return each Z value as four floats in [0,1].
+ */
+static void
+z16_get_tile_rgba(const ushort *src,
+                  unsigned w, unsigned h,
+                  float *p,
+                  unsigned dst_stride)
+{
+   const float scale = 1.0f / 65535.0f;
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         pRow[0] =
+         pRow[1] =
+         pRow[2] =
+         pRow[3] = *src++ * scale;
+      }
+      p += dst_stride;
+   }
+}
+
+
+
+
+/*** PIPE_FORMAT_L8_UNORM ***/
+
+static void
+l8_get_tile_rgba(const ubyte *src,
+                 unsigned w, unsigned h,
+                 float *p,
+                 unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, src++, pRow += 4) {
+         pRow[0] =
+         pRow[1] =
+         pRow[2] = ubyte_to_float(*src);
+         pRow[3] = 1.0;
+      }
+      p += dst_stride;
+   }
+}
+
+
+static void
+l8_put_tile_rgba(ubyte *dst,
+                 unsigned w, unsigned h,
+                 const float *p,
+                 unsigned src_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      const float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         unsigned r;
+         r = float_to_ubyte(pRow[0]);
+         *dst++ = r;
+      }
+      p += src_stride;
+   }
+}
+
+
+
+/*** PIPE_FORMAT_A8_UNORM ***/
+
+static void
+a8_get_tile_rgba(const ubyte *src,
+                 unsigned w, unsigned h,
+                 float *p,
+                 unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, src++, pRow += 4) {
+         pRow[0] =
+         pRow[1] =
+         pRow[2] = 0.0;
+         pRow[3] = ubyte_to_float(*src);
+      }
+      p += dst_stride;
+   }
+}
+
+
+static void
+a8_put_tile_rgba(ubyte *dst,
+                 unsigned w, unsigned h,
+                 const float *p,
+                 unsigned src_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      const float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         unsigned a;
+         a = float_to_ubyte(pRow[3]);
+         *dst++ = a;
+      }
+      p += src_stride;
+   }
+}
+
+
+
+/*** PIPE_FORMAT_R16_SNORM ***/
+
+static void
+r16_get_tile_rgba(const short *src,
+                  unsigned w, unsigned h,
+                  float *p,
+                  unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, src++, pRow += 4) {
+         pRow[0] = SHORT_TO_FLOAT(src[0]);
+         pRow[1] =
+         pRow[2] = 0.0;
+         pRow[3] = 1.0;
+      }
+      p += dst_stride;
+   }
+}
+
+
+static void
+r16_put_tile_rgba(short *dst,
+                  unsigned w, unsigned h,
+                  const float *p,
+                  unsigned src_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      const float *pRow = p;
+      for (j = 0; j < w; j++, dst++, pRow += 4) {
+         UNCLAMPED_FLOAT_TO_SHORT(dst[0], pRow[0]);
+      }
+      p += src_stride;
+   }
+}
+
+
+/*** PIPE_FORMAT_R16G16B16A16_SNORM ***/
+
+static void
+r16g16b16a16_get_tile_rgba(const short *src,
+                           unsigned w, unsigned h,
+                           float *p,
+                           unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, src += 4, pRow += 4) {
+         pRow[0] = SHORT_TO_FLOAT(src[0]);
+         pRow[1] = SHORT_TO_FLOAT(src[1]);
+         pRow[2] = SHORT_TO_FLOAT(src[2]);
+         pRow[3] = SHORT_TO_FLOAT(src[3]);
+      }
+      p += dst_stride;
+   }
+}
+
+
+static void
+r16g16b16a16_put_tile_rgba(short *dst,
+                           unsigned w, unsigned h,
+                           const float *p,
+                           unsigned src_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      const float *pRow = p;
+      for (j = 0; j < w; j++, dst += 4, pRow += 4) {
+         UNCLAMPED_FLOAT_TO_SHORT(dst[0], pRow[0]);
+         UNCLAMPED_FLOAT_TO_SHORT(dst[1], pRow[1]);
+         UNCLAMPED_FLOAT_TO_SHORT(dst[2], pRow[2]);
+         UNCLAMPED_FLOAT_TO_SHORT(dst[3], pRow[3]);
+      }
+      p += src_stride;
+   }
+}
+
+
+
+/*** PIPE_FORMAT_I8_UNORM ***/
+
+static void
+i8_get_tile_rgba(const ubyte *src,
+                 unsigned w, unsigned h,
+                 float *p,
+                 unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, src++, pRow += 4) {
+         pRow[0] =
+         pRow[1] =
+         pRow[2] =
+         pRow[3] = ubyte_to_float(*src);
+      }
+      p += dst_stride;
+   }
+}
+
+
+static void
+i8_put_tile_rgba(ubyte *dst,
+                 unsigned w, unsigned h,
+                 const float *p,
+                 unsigned src_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      const float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         unsigned r;
+         r = float_to_ubyte(pRow[0]);
+         *dst++ = r;
+      }
+      p += src_stride;
+   }
+}
+
+
+/*** PIPE_FORMAT_A8L8_UNORM ***/
+
+/* Unpack A8L8 pixels to float RGBA: L (low byte) is replicated to R,G,B,
+ * A (high byte) goes to alpha.  dst_stride is counted in floats. */
+static void
+a8l8_get_tile_rgba(const ushort *src,
+                   unsigned w, unsigned h,
+                   float *p,
+                   unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         const ushort pixel = *src++;   /* must not shadow the 'p' parameter */
+         pRow[0] = pRow[1] = pRow[2] = ubyte_to_float(pixel & 0xff);
+         pRow[3] = ubyte_to_float(pixel >> 8);
+      }
+      p += dst_stride;
+   }
+}
+
+
+static void
+a8l8_put_tile_rgba(ushort *dst,
+                   unsigned w, unsigned h,
+                   const float *p,
+                   unsigned src_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      const float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         unsigned r, a;
+         r = float_to_ubyte(pRow[0]);
+         a = float_to_ubyte(pRow[3]);
+         *dst++ = (a << 8) | r;
+      }
+      p += src_stride;
+   }
+}
+
+
+
+
+/*** PIPE_FORMAT_Z32_UNORM ***/
+
+/**
+ * Return each Z value as four floats in [0,1].
+ */
+static void
+z32_get_tile_rgba(const unsigned *src,
+                  unsigned w, unsigned h,
+                  float *p,
+                  unsigned dst_stride)
+{
+   const double scale = 1.0 / (double) 0xffffffff;
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         pRow[0] =
+         pRow[1] =
+         pRow[2] =
+         pRow[3] = (float) (*src++ * scale);
+      }
+      p += dst_stride;
+   }
+}
+
+
+/*** PIPE_FORMAT_S8Z24_UNORM ***/
+
+/**
+ * Return Z component as four float in [0,1].  Stencil part ignored.
+ */
+static void
+s8z24_get_tile_rgba(const unsigned *src,
+                    unsigned w, unsigned h,
+                    float *p,
+                    unsigned dst_stride)
+{
+   const double scale = 1.0 / ((1 << 24) - 1);
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         pRow[0] =
+         pRow[1] =
+         pRow[2] =
+         pRow[3] = (float) (scale * (*src++ & 0xffffff));
+      }
+      p += dst_stride;
+   }
+}
+
+
+/*** PIPE_FORMAT_Z24S8_UNORM ***/
+
+/**
+ * Return Z component as four float in [0,1].  Stencil part ignored.
+ */
+static void
+z24s8_get_tile_rgba(const unsigned *src,
+                    unsigned w, unsigned h,
+                    float *p,
+                    unsigned dst_stride)
+{
+   const double scale = 1.0 / ((1 << 24) - 1);
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         pRow[0] =
+         pRow[1] =
+         pRow[2] =
+         pRow[3] = (float) (scale * (*src++ >> 8));
+      }
+      p += dst_stride;
+   }
+}
+
+
+/*** PIPE_FORMAT_YCBCR / PIPE_FORMAT_YCBCR_REV ***/
+
+/**
+ * Convert YCbCr (or YCrCb) to RGBA.
+ */
+static void
+ycbcr_get_tile_rgba(const ushort *src,
+                    unsigned w, unsigned h,
+                    float *p,
+                    unsigned dst_stride,
+                    boolean rev)
+{
+   const float scale = 1.0f / 255.0f;
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      /* do two texels at a time */
+      for (j = 0; j < (w & ~1); j += 2, src += 2) {
+         const ushort t0 = src[0];
+         const ushort t1 = src[1];
+         const ubyte y0 = (t0 >> 8) & 0xff;  /* luminance */
+         const ubyte y1 = (t1 >> 8) & 0xff;  /* luminance */
+         ubyte cb, cr;
+         float r, g, b;
+
+         if (rev) {
+            cb = t1 & 0xff;         /* chroma U */
+            cr = t0 & 0xff;         /* chroma V */
+         }
+         else {
+            cb = t0 & 0xff;         /* chroma U */
+            cr = t1 & 0xff;         /* chroma V */
+         }
+
+         /* even pixel: y0,cr,cb */
+         r = 1.164f * (y0-16) + 1.596f * (cr-128);
+         g = 1.164f * (y0-16) - 0.813f * (cr-128) - 0.391f * (cb-128);
+         b = 1.164f * (y0-16) + 2.018f * (cb-128);
+         pRow[0] = r * scale;
+         pRow[1] = g * scale;
+         pRow[2] = b * scale;
+         pRow[3] = 1.0f;
+         pRow += 4;
+
+         /* odd pixel: use y1,cr,cb */
+         r = 1.164f * (y1-16) + 1.596f * (cr-128);
+         g = 1.164f * (y1-16) - 0.813f * (cr-128) - 0.391f * (cb-128);
+         b = 1.164f * (y1-16) + 2.018f * (cb-128);
+         pRow[0] = r * scale;
+         pRow[1] = g * scale;
+         pRow[2] = b * scale;
+         pRow[3] = 1.0f;
+         pRow += 4;
+
+      }
+      /* do the last texel */
+      if (w & 1) {
+         const ushort t0 = src[0];
+         const ushort t1 = src[1];
+         const ubyte y0 = (t0 >> 8) & 0xff;  /* luminance */
+         ubyte cb, cr;
+         float r, g, b;
+
+         if (rev) {
+            cb = t1 & 0xff;         /* chroma U */
+            cr = t0 & 0xff;         /* chroma V */
+         }
+         else {
+            cb = t0 & 0xff;         /* chroma U */
+            cr = t1 & 0xff;         /* chroma V */
+         }
+
+         /* last (unpaired) pixel: y0,cr,cb */
+         r = 1.164f * (y0-16) + 1.596f * (cr-128);
+         g = 1.164f * (y0-16) - 0.813f * (cr-128) - 0.391f * (cb-128);
+         b = 1.164f * (y0-16) + 2.018f * (cb-128);
+         pRow[0] = r * scale;
+         pRow[1] = g * scale;
+         pRow[2] = b * scale;
+         pRow[3] = 1.0f;
+         pRow += 4;
+      }
+      p += dst_stride;
+   }
+}
+
+
+void
+pipe_tile_raw_to_rgba(enum pipe_format format,
+                      void *src,
+                      uint w, uint h,
+                      float *dst, unsigned dst_stride)
+{
+   switch (format) {
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      a8r8g8b8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_X8R8G8B8_UNORM:
+      x8r8g8b8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      b8g8r8a8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_A1R5G5B5_UNORM:
+      a1r5g5b5_get_tile_rgba((ushort *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_A4R4G4B4_UNORM:
+      a4r4g4b4_get_tile_rgba((ushort *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_R5G6B5_UNORM:
+      r5g6b5_get_tile_rgba((ushort *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_L8_UNORM:
+      l8_get_tile_rgba((ubyte *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_A8_UNORM:
+      a8_get_tile_rgba((ubyte *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_I8_UNORM:
+      i8_get_tile_rgba((ubyte *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_A8L8_UNORM:
+      a8l8_get_tile_rgba((ushort *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_R16_SNORM:
+      r16_get_tile_rgba((short *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+      r16g16b16a16_get_tile_rgba((short *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_Z16_UNORM:
+      z16_get_tile_rgba((ushort *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_Z32_UNORM:
+      z32_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM:
+      s8z24_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_Z24S8_UNORM:
+      z24s8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_YCBCR:
+      ycbcr_get_tile_rgba((ushort *) src, w, h, dst, dst_stride, FALSE);
+      break;
+   case PIPE_FORMAT_YCBCR_REV:
+      ycbcr_get_tile_rgba((ushort *) src, w, h, dst, dst_stride, TRUE);
+      break;
+   default:
+      assert(0);
+   }
+}
+
+
+void
+pipe_get_tile_rgba(struct pipe_surface *ps,
+                   uint x, uint y, uint w, uint h,
+                   float *p)
+{
+   unsigned dst_stride = w * 4;
+   void *packed;
+
+   if (pipe_clip_tile(x, y, &w, &h, ps))
+      return;
+
+   packed = MALLOC(pf_get_nblocks(&ps->block, w, h) * ps->block.size);
+
+   if (!packed)
+      return;
+
+   if(ps->format == PIPE_FORMAT_YCBCR || ps->format == PIPE_FORMAT_YCBCR_REV)
+      assert((x & 1) == 0);
+
+   pipe_get_tile_raw(ps, x, y, w, h, packed, 0);
+
+   pipe_tile_raw_to_rgba(ps->format, packed, w, h, p, dst_stride);
+
+   FREE(packed);
+}
+
+
+/**
+ * Convert a tile of float RGBA values to the surface's format and store it.
+ * p holds w*4 floats per row (w as passed in, before clipping).
+ * Formats without a packer (depth formats, R8G8B8A8) leave the surface untouched.
+ */
+void
+pipe_put_tile_rgba(struct pipe_surface *ps,
+                   uint x, uint y, uint w, uint h,
+                   const float *p)
+{
+   unsigned src_stride = w * 4;
+   void *packed;
+
+   if (pipe_clip_tile(x, y, &w, &h, ps))
+      return;
+
+   packed = MALLOC(pf_get_nblocks(&ps->block, w, h) * ps->block.size);
+
+   if (!packed)
+      return;
+
+   switch (ps->format) {
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      a8r8g8b8_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);
+      break;
+   case PIPE_FORMAT_X8R8G8B8_UNORM:
+      x8r8g8b8_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);
+      break;
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      b8g8r8a8_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);
+      break;
+   case PIPE_FORMAT_A1R5G5B5_UNORM:
+      a1r5g5b5_put_tile_rgba((ushort *) packed, w, h, p, src_stride);
+      break;
+   case PIPE_FORMAT_R5G6B5_UNORM:
+      r5g6b5_put_tile_rgba((ushort *) packed, w, h, p, src_stride);
+      break;
+   case PIPE_FORMAT_A4R4G4B4_UNORM:
+      a4r4g4b4_put_tile_rgba((ushort *) packed, w, h, p, src_stride);
+      break;
+   case PIPE_FORMAT_L8_UNORM:
+      l8_put_tile_rgba((ubyte *) packed, w, h, p, src_stride);
+      break;
+   case PIPE_FORMAT_A8_UNORM:
+      a8_put_tile_rgba((ubyte *) packed, w, h, p, src_stride);
+      break;
+   case PIPE_FORMAT_I8_UNORM:
+      i8_put_tile_rgba((ubyte *) packed, w, h, p, src_stride);
+      break;
+   case PIPE_FORMAT_A8L8_UNORM:
+      a8l8_put_tile_rgba((ushort *) packed, w, h, p, src_stride);
+      break;
+   case PIPE_FORMAT_R16_SNORM:
+      r16_put_tile_rgba((short *) packed, w, h, p, src_stride);
+      break;
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+      r16g16b16a16_put_tile_rgba((short *) packed, w, h, p, src_stride);
+      break;
+   case PIPE_FORMAT_Z16_UNORM:
+   case PIPE_FORMAT_Z32_UNORM:
+   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM:
+   case PIPE_FORMAT_Z24S8_UNORM:
+      /* no rgba->Z packer exists: don't store the uninitialized buffer */
+      FREE(packed);
+      return;
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+   default:
+      assert(0);
+      FREE(packed);
+      return;
+   }
+
+   pipe_put_tile_raw(ps, x, y, w, h, packed, 0);
+
+   FREE(packed);
+}
+
+
+/**
+ * Get a block of Z values, expanded to 32-bit range; z has w uints per row (w before clipping).
+ */
+void
+pipe_get_tile_z(struct pipe_surface *ps,
+                uint x, uint y, uint w, uint h,
+                uint *z)
+{
+   const uint dstStride = w;   /* captured before pipe_clip_tile() may shrink w */
+   ubyte *map;
+   uint *pDest = z;
+   uint i, j;
+
+   if (pipe_clip_tile(x, y, &w, &h, ps))
+      return;
+
+   map = (ubyte *)pipe_surface_map(ps, PIPE_BUFFER_USAGE_CPU_READ);
+   if (!map) {
+      assert(0);
+      return;
+   }
+
+   switch (ps->format) {
+   case PIPE_FORMAT_Z32_UNORM:
+      {
+         const uint *pSrc
+            = (const uint *)(map  + y * ps->stride + x*4);
+         for (i = 0; i < h; i++) {
+            memcpy(pDest, pSrc, 4 * w);   /* already 32-bit: copy the row as is */
+            pDest += dstStride;
+            pSrc += ps->stride/4;         /* ps->stride is applied as a byte stride */
+         }
+      }
+      break;
+   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM:
+      {
+         const uint *pSrc
+            = (const uint *)(map + y * ps->stride + x*4);
+         for (i = 0; i < h; i++) {
+            for (j = 0; j < w; j++) {
+               /* 24-bit Z to 32-bit Z: fill the low byte with Z's top 8 bits */
+               pDest[j] = (pSrc[j] << 8) | ((pSrc[j] >> 16) & 0xff);
+            }
+            pDest += dstStride;
+            pSrc += ps->stride/4;
+         }
+      }
+      break;
+   case PIPE_FORMAT_Z16_UNORM:
+      {
+         const ushort *pSrc
+            = (const ushort *)(map + y * ps->stride + x*2);
+         for (i = 0; i < h; i++) {
+            for (j = 0; j < w; j++) {
+               /* 16-bit Z to 32-bit Z; cast first so the shift is unsigned */
+               pDest[j] = ((uint) pSrc[j] << 16) | pSrc[j];
+            }
+            pDest += dstStride;
+            pSrc += ps->stride/2;
+         }
+      }
+      break;
+   default:
+      assert(0);
+   }
+
+   pipe_surface_unmap(ps);
+}
+
+
+void
+pipe_put_tile_z(struct pipe_surface *ps,
+                uint x, uint y, uint w, uint h,
+                const uint *zSrc)
+{
+   const uint srcStride = w;
+   const uint *pSrc = zSrc;
+   ubyte *map;
+   uint i, j;
+
+   if (pipe_clip_tile(x, y, &w, &h, ps))
+      return;
+
+   map = (ubyte *)pipe_surface_map(ps, PIPE_BUFFER_USAGE_CPU_WRITE);
+   if (!map) {
+      assert(0);
+      return;
+   }
+
+   switch (ps->format) {
+   case PIPE_FORMAT_Z32_UNORM:
+      {
+         uint *pDest = (uint *) (map + y * ps->stride + x*4);
+         for (i = 0; i < h; i++) {
+            memcpy(pDest, pSrc, 4 * w);
+            pDest += ps->stride/4;
+            pSrc += srcStride;
+         }
+      }
+      break;
+   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM:
+      {
+         uint *pDest = (uint *) (map + y * ps->stride + x*4);
+         for (i = 0; i < h; i++) {
+            for (j = 0; j < w; j++) {
+               /* convert 32-bit Z to 24-bit Z (0 stencil) */
+               pDest[j] = pSrc[j] >> 8;
+            }
+            pDest += ps->stride/4;
+            pSrc += srcStride;
+         }
+      }
+      break;
+   case PIPE_FORMAT_Z16_UNORM:
+      {
+         ushort *pDest = (ushort *) (map + y * ps->stride + x*2);
+         for (i = 0; i < h; i++) {
+            for (j = 0; j < w; j++) {
+               /* convert 32-bit Z to 16-bit Z */
+               pDest[j] = pSrc[j] >> 16;
+            }
+            pDest += ps->stride/2;
+            pSrc += srcStride;
+         }
+      }
+      break;
+   default:
+      assert(0);
+   }
+
+   pipe_surface_unmap(ps);
+}
+
+
diff --git a/src/gallium/auxiliary/util/u_tile.h b/src/gallium/auxiliary/util/u_tile.h
new file mode 100644
index 0000000000..a8ac805308
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_tile.h
@@ -0,0 +1,101 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef P_TILE_H
+#define P_TILE_H
+
+#include "pipe/p_compiler.h"
+
+struct pipe_surface;
+
+
+/**
+ * Clip tile against surface dims, shrinking *w and *h in place as needed.
+ * \return TRUE if tile is totally clipped, FALSE otherwise
+ */
+static INLINE boolean
+pipe_clip_tile(uint x, uint y, uint *w, uint *h, const struct pipe_surface *ps)
+{
+   if (x >= ps->width || y >= ps->height)
+      return TRUE;
+   /* x < width and y < height here, so the subtractions cannot wrap;
+    * comparing sizes this way avoids uint overflow in x + *w / y + *h. */
+   if (*w > ps->width - x)
+      *w = ps->width - x;
+   if (*h > ps->height - y)
+      *h = ps->height - y;
+   return FALSE;
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void
+pipe_get_tile_raw(struct pipe_surface *ps,
+                  uint x, uint y, uint w, uint h,
+                  void *p, int dst_stride);
+
+void
+pipe_put_tile_raw(struct pipe_surface *ps,
+                  uint x, uint y, uint w, uint h,
+                  const void *p, int src_stride);
+
+
+void
+pipe_get_tile_rgba(struct pipe_surface *ps,
+                   uint x, uint y, uint w, uint h,
+                   float *p);
+
+void
+pipe_put_tile_rgba(struct pipe_surface *ps,
+                   uint x, uint y, uint w, uint h,
+                   const float *p);
+
+
+void
+pipe_get_tile_z(struct pipe_surface *ps,
+                uint x, uint y, uint w, uint h,
+                uint *z);
+
+void
+pipe_put_tile_z(struct pipe_surface *ps,
+                uint x, uint y, uint w, uint h,
+                const uint *z);
+
+void
+pipe_tile_raw_to_rgba(enum pipe_format format,
+                      void *src,
+                      uint w, uint h,
+                      float *dst, unsigned dst_stride);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index f430e88b9c..6bace0bb11 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -34,7 +34,6 @@
 #define CELL_COMMON_H
 
 #include "pipe/p_compiler.h"
-#include "pipe/p_util.h"
 #include "pipe/p_format.h"
 #include "pipe/p_state.h"
 
diff --git a/src/gallium/drivers/cell/ppu/cell_clear.c b/src/gallium/drivers/cell/ppu/cell_clear.c
index 3ffe09add6..cee0917b63 100644
--- a/src/gallium/drivers/cell/ppu/cell_clear.c
+++ b/src/gallium/drivers/cell/ppu/cell_clear.c
@@ -34,7 +34,7 @@
 #include <assert.h>
 #include <stdint.h>
 #include "pipe/p_inlines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "cell/common.h"
 #include "cell_clear.h"
 #include "cell_context.h"
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index 12eb5aa254..5af95a3c10 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -35,7 +35,7 @@
 
 #include "pipe/p_defines.h"
 #include "pipe/p_format.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_winsys.h"
 #include "pipe/p_screen.h"
 
diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
index 67b87f16d7..971d65d09e 100644
--- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
@@ -30,7 +30,7 @@
  *  Brian Paul
  */
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_inlines.h"
 #include "draw/draw_context.h"
 #include "cell_context.h"
diff --git a/src/gallium/drivers/cell/ppu/cell_render.c b/src/gallium/drivers/cell/ppu/cell_render.c
index b663b37622..dd25ae880e 100644
--- a/src/gallium/drivers/cell/ppu/cell_render.c
+++ b/src/gallium/drivers/cell/ppu/cell_render.c
@@ -33,7 +33,7 @@
 #include "cell_context.h"
 #include "cell_render.h"
 #include "cell_spu.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "draw/draw_private.h"
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_screen.c b/src/gallium/drivers/cell/ppu/cell_screen.c
index 2bf441a0c5..139b3719b6 100644
--- a/src/gallium/drivers/cell/ppu/cell_screen.c
+++ b/src/gallium/drivers/cell/ppu/cell_screen.c
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_winsys.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_screen.h"
diff --git a/src/gallium/drivers/cell/ppu/cell_state_derived.c b/src/gallium/drivers/cell/ppu/cell_state_derived.c
index 5480534ad9..8ab938a02a 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_derived.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_derived.c
@@ -25,7 +25,7 @@
  * 
  **************************************************************************/
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_shader_tokens.h"
 #include "draw/draw_context.h"
 #include "draw/draw_vertex.h"
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 9cae67f091..3646a0ee4f 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -25,7 +25,7 @@
  * 
  **************************************************************************/
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "cell_context.h"
 #include "cell_state.h"
 #include "cell_state_emit.h"
diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c
index f5707f2bb8..cd96b317fa 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
 #include "draw/draw_context.h"
diff --git a/src/gallium/drivers/cell/ppu/cell_surface.c b/src/gallium/drivers/cell/ppu/cell_surface.c
index 01ffa31c2c..2d31ad89a6 100644
--- a/src/gallium/drivers/cell/ppu/cell_surface.c
+++ b/src/gallium/drivers/cell/ppu/cell_surface.c
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
 #include "util/p_tile.h"
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index 533b64227d..1add81373d 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -33,7 +33,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_winsys.h"
 
 #include "cell_context.h"
diff --git a/src/gallium/drivers/cell/ppu/cell_winsys.c b/src/gallium/drivers/cell/ppu/cell_winsys.c
index ebabce3c8f..d570bbd2f9 100644
--- a/src/gallium/drivers/cell/ppu/cell_winsys.c
+++ b/src/gallium/drivers/cell/ppu/cell_winsys.c
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "cell_winsys.h"
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c
index 42e5022f30..89c61136a4 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.c
+++ b/src/gallium/drivers/cell/spu/spu_exec.c
@@ -63,7 +63,6 @@
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_state.h"
-#include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index ab4ff8160a..8944ef171e 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -32,7 +32,6 @@
 #include <transpose_matrix4x4.h>
 #include "pipe/p_compiler.h"
 #include "pipe/p_format.h"
-#include "pipe/p_util.h"
 #include "spu_colorpack.h"
 #include "spu_main.h"
 #include "spu_texture.h"
diff --git a/src/gallium/drivers/cell/spu/spu_util.c b/src/gallium/drivers/cell/spu/spu_util.c
index 74ab2bbd1f..dbcf4b0eb9 100644
--- a/src/gallium/drivers/cell/spu/spu_util.c
+++ b/src/gallium/drivers/cell/spu/spu_util.c
@@ -1,4 +1,3 @@
-#include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_parse.h"
 //#include "tgsi_build.h"
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
index 219fd90cc0..26f2363749 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
+++ b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
@@ -32,7 +32,6 @@
   *   Ian Romanick <idr@us.ibm.com>
   */
 
-#include "pipe/p_util.h"
 #include "pipe/p_state.h"
 #include "pipe/p_shader_tokens.h"
 #include "spu_exec.h"
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.c b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
index 3119a78c06..a1e81975e6 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_shader.c
+++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
@@ -34,7 +34,6 @@
 
 #include <spu_mfcio.h>
 
-#include "pipe/p_util.h"
 #include "pipe/p_state.h"
 #include "pipe/p_shader_tokens.h"
 #include "spu_vertex_shader.h"
diff --git a/src/gallium/drivers/failover/fo_context.c b/src/gallium/drivers/failover/fo_context.c
index 014a3e31d5..10c4ffc209 100644
--- a/src/gallium/drivers/failover/fo_context.c
+++ b/src/gallium/drivers/failover/fo_context.c
@@ -28,7 +28,7 @@
 
 #include "pipe/p_defines.h"
 #include "pipe/p_winsys.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_context.h"
 
 #include "fo_context.h"
diff --git a/src/gallium/drivers/i915simple/i915_context.c b/src/gallium/drivers/i915simple/i915_context.c
index e2bf5ab678..c6776716a2 100644
--- a/src/gallium/drivers/i915simple/i915_context.c
+++ b/src/gallium/drivers/i915simple/i915_context.c
@@ -35,7 +35,7 @@
 #include "draw/draw_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_winsys.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_screen.h"
 
 
diff --git a/src/gallium/drivers/i915simple/i915_debug_fp.c b/src/gallium/drivers/i915simple/i915_debug_fp.c
index c024a051a5..48be3e1472 100644
--- a/src/gallium/drivers/i915simple/i915_debug_fp.c
+++ b/src/gallium/drivers/i915simple/i915_debug_fp.c
@@ -29,7 +29,7 @@
 #include "i915_reg.h"
 #include "i915_debug.h"
 #include "pipe/p_winsys.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 
 static void
diff --git a/src/gallium/drivers/i915simple/i915_fpc.h b/src/gallium/drivers/i915simple/i915_fpc.h
index 80a9576304..2f0f99d046 100644
--- a/src/gallium/drivers/i915simple/i915_fpc.h
+++ b/src/gallium/drivers/i915simple/i915_fpc.h
@@ -29,7 +29,6 @@
 #ifndef I915_FPC_H
 #define I915_FPC_H
 
-#include "pipe/p_util.h"
 
 #include "i915_context.h"
 #include "i915_reg.h"
diff --git a/src/gallium/drivers/i915simple/i915_fpc_translate.c b/src/gallium/drivers/i915simple/i915_fpc_translate.c
index 64432982c4..34b4a846c1 100644
--- a/src/gallium/drivers/i915simple/i915_fpc_translate.c
+++ b/src/gallium/drivers/i915simple/i915_fpc_translate.c
@@ -33,6 +33,8 @@
 #include "i915_fpc.h"
 
 #include "pipe/p_shader_tokens.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "util/u_string.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_dump.h"
diff --git a/src/gallium/drivers/i915simple/i915_prim_emit.c b/src/gallium/drivers/i915simple/i915_prim_emit.c
index 9ffa460138..d194c2fb15 100644
--- a/src/gallium/drivers/i915simple/i915_prim_emit.c
+++ b/src/gallium/drivers/i915simple/i915_prim_emit.c
@@ -27,7 +27,9 @@
 
 
 #include "draw/draw_pipe.h"
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_pack_color.h"
 
 #include "i915_context.h"
 #include "i915_winsys.h"
diff --git a/src/gallium/drivers/i915simple/i915_prim_vbuf.c b/src/gallium/drivers/i915simple/i915_prim_vbuf.c
index aef3682bbf..e4ece55098 100644
--- a/src/gallium/drivers/i915simple/i915_prim_vbuf.c
+++ b/src/gallium/drivers/i915simple/i915_prim_vbuf.c
@@ -41,9 +41,10 @@
 #include "draw/draw_context.h"
 #include "draw/draw_vbuf.h"
 #include "pipe/p_debug.h"
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 
 #include "i915_context.h"
 #include "i915_reg.h"
diff --git a/src/gallium/drivers/i915simple/i915_screen.c b/src/gallium/drivers/i915simple/i915_screen.c
index 0afa17bed8..e9e40c3f0b 100644
--- a/src/gallium/drivers/i915simple/i915_screen.c
+++ b/src/gallium/drivers/i915simple/i915_screen.c
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_winsys.h"
 #include "util/u_string.h"
 
diff --git a/src/gallium/drivers/i915simple/i915_state.c b/src/gallium/drivers/i915simple/i915_state.c
index e8521b385e..d2487d8277 100644
--- a/src/gallium/drivers/i915simple/i915_state.c
+++ b/src/gallium/drivers/i915simple/i915_state.c
@@ -31,8 +31,9 @@
 
 #include "draw/draw_context.h"
 #include "pipe/p_winsys.h"
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "tgsi/tgsi_parse.h"
 
 #include "i915_context.h"
diff --git a/src/gallium/drivers/i915simple/i915_state_derived.c b/src/gallium/drivers/i915simple/i915_state_derived.c
index 4daccec6e0..488615067c 100644
--- a/src/gallium/drivers/i915simple/i915_state_derived.c
+++ b/src/gallium/drivers/i915simple/i915_state_derived.c
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_shader_tokens.h"
 #include "draw/draw_context.h"
 #include "draw/draw_vertex.h"
diff --git a/src/gallium/drivers/i915simple/i915_state_dynamic.c b/src/gallium/drivers/i915simple/i915_state_dynamic.c
index 8cfbdddd19..86126a5a15 100644
--- a/src/gallium/drivers/i915simple/i915_state_dynamic.c
+++ b/src/gallium/drivers/i915simple/i915_state_dynamic.c
@@ -30,7 +30,9 @@
 #include "i915_context.h"
 #include "i915_reg.h"
 #include "i915_state.h"
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_pack_color.h"
 
 #define FILE_DEBUG_FLAG DEBUG_STATE
 
diff --git a/src/gallium/drivers/i915simple/i915_state_immediate.c b/src/gallium/drivers/i915simple/i915_state_immediate.c
index 2501f2d7cb..8c16bb4e27 100644
--- a/src/gallium/drivers/i915simple/i915_state_immediate.c
+++ b/src/gallium/drivers/i915simple/i915_state_immediate.c
@@ -33,7 +33,7 @@
 #include "i915_context.h"
 #include "i915_state.h"
 #include "i915_reg.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 
 /* All state expressable with the LOAD_STATE_IMMEDIATE_1 packet.
diff --git a/src/gallium/drivers/i915simple/i915_state_sampler.c b/src/gallium/drivers/i915simple/i915_state_sampler.c
index 7868f21ca6..c09c10601b 100644
--- a/src/gallium/drivers/i915simple/i915_state_sampler.c
+++ b/src/gallium/drivers/i915simple/i915_state_sampler.c
@@ -27,7 +27,7 @@
 
 #include "pipe/p_context.h"
 #include "pipe/p_state.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 #include "i915_state_inlines.h"
 #include "i915_context.h"
diff --git a/src/gallium/drivers/i915simple/i915_surface.c b/src/gallium/drivers/i915simple/i915_surface.c
index 17b5125e56..62f1926644 100644
--- a/src/gallium/drivers/i915simple/i915_surface.c
+++ b/src/gallium/drivers/i915simple/i915_surface.c
@@ -30,10 +30,9 @@
 #include "i915_state.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
-#include "util/p_tile.h"
+#include "util/u_tile.h"
 #include "util/u_rect.h"
 
 
diff --git a/src/gallium/drivers/i915simple/i915_texture.c b/src/gallium/drivers/i915simple/i915_texture.c
index ca0fb8761b..32344da4d5 100644
--- a/src/gallium/drivers/i915simple/i915_texture.c
+++ b/src/gallium/drivers/i915simple/i915_texture.c
@@ -34,8 +34,9 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "pipe/p_util.h"
 #include "pipe/p_winsys.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 
 #include "i915_context.h"
 #include "i915_texture.h"
diff --git a/src/gallium/drivers/i965simple/brw_cc.c b/src/gallium/drivers/i965simple/brw_cc.c
index 337e4f95f6..79d4150383 100644
--- a/src/gallium/drivers/i965simple/brw_cc.c
+++ b/src/gallium/drivers/i965simple/brw_cc.c
@@ -29,7 +29,8 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 
 #include "brw_context.h"
 #include "brw_state.h"
@@ -232,8 +233,7 @@ static void upload_cc_unit( struct brw_context *brw )
       cc.cc3.alpha_test_func = 
 	 brw_translate_compare_func(brw->attribs.DepthStencil->alpha.func);
 
-      UNCLAMPED_FLOAT_TO_UBYTE(cc.cc7.alpha_ref.ub[0], 
-			       brw->attribs.DepthStencil->alpha.ref);
+      cc.cc7.alpha_ref.ub[0] = float_to_ubyte(brw->attribs.DepthStencil->alpha.ref);
 
       cc.cc3.alpha_test_format = BRW_ALPHATEST_FORMAT_UNORM8;
    }
diff --git a/src/gallium/drivers/i965simple/brw_clip_state.c b/src/gallium/drivers/i965simple/brw_clip_state.c
index ea5c05a279..8e78dd51be 100644
--- a/src/gallium/drivers/i965simple/brw_clip_state.c
+++ b/src/gallium/drivers/i965simple/brw_clip_state.c
@@ -32,7 +32,8 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 
 
 static void upload_clip_unit( struct brw_context *brw )
diff --git a/src/gallium/drivers/i965simple/brw_context.c b/src/gallium/drivers/i965simple/brw_context.c
index 8326f7b9c4..96920df008 100644
--- a/src/gallium/drivers/i965simple/brw_context.c
+++ b/src/gallium/drivers/i965simple/brw_context.c
@@ -39,7 +39,7 @@
 
 #include "pipe/p_winsys.h"
 #include "pipe/p_context.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_screen.h"
 
 
diff --git a/src/gallium/drivers/i965simple/brw_curbe.c b/src/gallium/drivers/i965simple/brw_curbe.c
index 52bbd525c1..824ee7fd6d 100644
--- a/src/gallium/drivers/i965simple/brw_curbe.c
+++ b/src/gallium/drivers/i965simple/brw_curbe.c
@@ -39,7 +39,8 @@
 #include "brw_wm.h"
 #include "pipe/p_state.h"
 #include "pipe/p_winsys.h"
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 
 #define FILE_DEBUG_FLAG DEBUG_FALLBACKS
 
diff --git a/src/gallium/drivers/i965simple/brw_draw_upload.c b/src/gallium/drivers/i965simple/brw_draw_upload.c
index 9c0c78c236..7c20ea52af 100644
--- a/src/gallium/drivers/i965simple/brw_draw_upload.c
+++ b/src/gallium/drivers/i965simple/brw_draw_upload.c
@@ -33,6 +33,7 @@
 #include "brw_context.h"
 #include "brw_state.h"
 
+
 struct brw_array_state {
    union header_union header;
 
diff --git a/src/gallium/drivers/i965simple/brw_gs_state.c b/src/gallium/drivers/i965simple/brw_gs_state.c
index 3932e9e939..5b8016b2e9 100644
--- a/src/gallium/drivers/i965simple/brw_gs_state.c
+++ b/src/gallium/drivers/i965simple/brw_gs_state.c
@@ -34,7 +34,8 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 
 
diff --git a/src/gallium/drivers/i965simple/brw_screen.c b/src/gallium/drivers/i965simple/brw_screen.c
index fadfbf94ab..ab7cd624b2 100644
--- a/src/gallium/drivers/i965simple/brw_screen.c
+++ b/src/gallium/drivers/i965simple/brw_screen.c
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_winsys.h"
 #include "util/u_string.h"
 
diff --git a/src/gallium/drivers/i965simple/brw_sf_state.c b/src/gallium/drivers/i965simple/brw_sf_state.c
index 9acd3ea61b..2a5de61c21 100644
--- a/src/gallium/drivers/i965simple/brw_sf_state.c
+++ b/src/gallium/drivers/i965simple/brw_sf_state.c
@@ -30,11 +30,12 @@
   */
 
 
-
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
 
 static void upload_sf_vp(struct brw_context *brw)
 {
diff --git a/src/gallium/drivers/i965simple/brw_shader_info.c b/src/gallium/drivers/i965simple/brw_shader_info.c
index 30f37a99d4..86d877d7ef 100644
--- a/src/gallium/drivers/i965simple/brw_shader_info.c
+++ b/src/gallium/drivers/i965simple/brw_shader_info.c
@@ -1,7 +1,7 @@
 
 #include "brw_context.h"
 #include "brw_state.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_parse.h"
 
diff --git a/src/gallium/drivers/i965simple/brw_state.c b/src/gallium/drivers/i965simple/brw_state.c
index 27ca32843d..af46cb546f 100644
--- a/src/gallium/drivers/i965simple/brw_state.c
+++ b/src/gallium/drivers/i965simple/brw_state.c
@@ -31,7 +31,7 @@
 
 
 #include "pipe/p_winsys.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_dump.h"
diff --git a/src/gallium/drivers/i965simple/brw_state_batch.c b/src/gallium/drivers/i965simple/brw_state_batch.c
index 35db76b594..43a1c89fc4 100644
--- a/src/gallium/drivers/i965simple/brw_state_batch.c
+++ b/src/gallium/drivers/i965simple/brw_state_batch.c
@@ -32,7 +32,7 @@
 #include "brw_state.h"
 #include "brw_winsys.h"
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 /* A facility similar to the data caching code above, which aims to
  * prevent identical commands being issued repeatedly.
diff --git a/src/gallium/drivers/i965simple/brw_state_cache.c b/src/gallium/drivers/i965simple/brw_state_cache.c
index b3a5124461..094248fa69 100644
--- a/src/gallium/drivers/i965simple/brw_state_cache.c
+++ b/src/gallium/drivers/i965simple/brw_state_cache.c
@@ -38,7 +38,7 @@
 #include "brw_sf.h"
 #include "brw_gs.h"
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 
diff --git a/src/gallium/drivers/i965simple/brw_state_pool.c b/src/gallium/drivers/i965simple/brw_state_pool.c
index f3174bfe0a..78d4c0e411 100644
--- a/src/gallium/drivers/i965simple/brw_state_pool.c
+++ b/src/gallium/drivers/i965simple/brw_state_pool.c
@@ -43,7 +43,8 @@
  */
 
 #include "pipe/p_winsys.h"
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "pipe/p_inlines.h"
 #include "brw_context.h"
 #include "brw_state.h"
diff --git a/src/gallium/drivers/i965simple/brw_state_upload.c b/src/gallium/drivers/i965simple/brw_state_upload.c
index e727601e1e..bac9161b5f 100644
--- a/src/gallium/drivers/i965simple/brw_state_upload.c
+++ b/src/gallium/drivers/i965simple/brw_state_upload.c
@@ -33,7 +33,7 @@
 #include "brw_context.h"
 #include "brw_state.h"
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 /* This is used to initialize brw->state.atoms[].  We could use this
  * list directly except for a single atom, brw_constant_buffer, which
diff --git a/src/gallium/drivers/i965simple/brw_surface.c b/src/gallium/drivers/i965simple/brw_surface.c
index 69da252285..b89756c47b 100644
--- a/src/gallium/drivers/i965simple/brw_surface.c
+++ b/src/gallium/drivers/i965simple/brw_surface.c
@@ -29,10 +29,9 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
-#include "util/p_tile.h"
+#include "util/u_tile.h"
 #include "util/u_rect.h"
 
 
diff --git a/src/gallium/drivers/i965simple/brw_tex_layout.c b/src/gallium/drivers/i965simple/brw_tex_layout.c
index 9b6cf81723..05eda9d1f2 100644
--- a/src/gallium/drivers/i965simple/brw_tex_layout.c
+++ b/src/gallium/drivers/i965simple/brw_tex_layout.c
@@ -33,16 +33,16 @@
 /* Code to layout images in a mipmap tree for i965.
  */
 
-#include "brw_tex_layout.h"
-
 #include "pipe/p_state.h"
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
-
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "brw_context.h"
+#include "brw_tex_layout.h"
+
 
 #define FILE_DEBUG_FLAG DEBUG_TEXTURE
 
diff --git a/src/gallium/drivers/i965simple/brw_vs_state.c b/src/gallium/drivers/i965simple/brw_vs_state.c
index c73469929c..1eaff87892 100644
--- a/src/gallium/drivers/i965simple/brw_vs_state.c
+++ b/src/gallium/drivers/i965simple/brw_vs_state.c
@@ -34,7 +34,8 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 
 static void upload_vs_unit( struct brw_context *brw )
 {
diff --git a/src/gallium/drivers/i965simple/brw_wm.c b/src/gallium/drivers/i965simple/brw_wm.c
index 7fc5f59a98..8de565b96c 100644
--- a/src/gallium/drivers/i965simple/brw_wm.c
+++ b/src/gallium/drivers/i965simple/brw_wm.c
@@ -35,7 +35,7 @@
 #include "brw_wm.h"
 #include "brw_eu.h"
 #include "brw_state.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 
diff --git a/src/gallium/drivers/i965simple/brw_wm_decl.c b/src/gallium/drivers/i965simple/brw_wm_decl.c
index e6f1a44817..d50e66f613 100644
--- a/src/gallium/drivers/i965simple/brw_wm_decl.c
+++ b/src/gallium/drivers/i965simple/brw_wm_decl.c
@@ -2,7 +2,8 @@
 #include "brw_context.h"
 #include "brw_eu.h"
 #include "brw_wm.h"
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_parse.h"
 
diff --git a/src/gallium/drivers/i965simple/brw_wm_glsl.c b/src/gallium/drivers/i965simple/brw_wm_glsl.c
index 6a4a5aef09..ab6410aa60 100644
--- a/src/gallium/drivers/i965simple/brw_wm_glsl.c
+++ b/src/gallium/drivers/i965simple/brw_wm_glsl.c
@@ -2,7 +2,8 @@
 #include "brw_context.h"
 #include "brw_eu.h"
 #include "brw_wm.h"
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi/tgsi_parse.h"
 
diff --git a/src/gallium/drivers/i965simple/brw_wm_sampler_state.c b/src/gallium/drivers/i965simple/brw_wm_sampler_state.c
index b9eaee56ee..52b2909a65 100644
--- a/src/gallium/drivers/i965simple/brw_wm_sampler_state.c
+++ b/src/gallium/drivers/i965simple/brw_wm_sampler_state.c
@@ -34,7 +34,8 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 
 
 #define COMPAREFUNC_ALWAYS		0
diff --git a/src/gallium/drivers/i965simple/brw_wm_state.c b/src/gallium/drivers/i965simple/brw_wm_state.c
index f3aa36b07f..37a9bf919c 100644
--- a/src/gallium/drivers/i965simple/brw_wm_state.c
+++ b/src/gallium/drivers/i965simple/brw_wm_state.c
@@ -34,7 +34,8 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 #include "brw_wm.h"
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 
 /***********************************************************************
  * WM unit - fragment programs and rasterization
diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c
index 9b1313bc83..dda90f760a 100644
--- a/src/gallium/drivers/softpipe/sp_context.c
+++ b/src/gallium/drivers/softpipe/sp_context.c
@@ -32,8 +32,8 @@
 #include "draw/draw_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "pipe/p_util.h"
 #include "util/u_math.h"
+#include "util/u_memory.h"
 #include "sp_clear.h"
 #include "sp_context.h"
 #include "sp_flush.h"
diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c
index cc171bbc39..d0456731be 100644
--- a/src/gallium/drivers/softpipe/sp_fs_exec.c
+++ b/src/gallium/drivers/softpipe/sp_fs_exec.c
@@ -34,7 +34,7 @@
 
 #include "pipe/p_state.h"
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_inlines.h"
 #include "tgsi/tgsi_exec.h"
 #include "tgsi/tgsi_parse.h"
diff --git a/src/gallium/drivers/softpipe/sp_fs_llvm.c b/src/gallium/drivers/softpipe/sp_fs_llvm.c
index 20226da78c..34adac5226 100644
--- a/src/gallium/drivers/softpipe/sp_fs_llvm.c
+++ b/src/gallium/drivers/softpipe/sp_fs_llvm.c
@@ -36,7 +36,7 @@
 
 #include "pipe/p_state.h"
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_inlines.h"
 #include "tgsi/tgsi_sse2.h"
 
diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c
index 8b7da7c747..35653a8e48 100644
--- a/src/gallium/drivers/softpipe/sp_fs_sse.c
+++ b/src/gallium/drivers/softpipe/sp_fs_sse.c
@@ -34,7 +34,7 @@
 
 #include "pipe/p_state.h"
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_inlines.h"
 #include "tgsi/tgsi_exec.h"
 #include "tgsi/tgsi_sse2.h"
diff --git a/src/gallium/drivers/softpipe/sp_prim_setup.c b/src/gallium/drivers/softpipe/sp_prim_setup.c
index 941ab62e00..038ff04d4f 100644
--- a/src/gallium/drivers/softpipe/sp_prim_setup.c
+++ b/src/gallium/drivers/softpipe/sp_prim_setup.c
@@ -41,7 +41,7 @@
 #include "sp_prim_setup.h"
 #include "draw/draw_pipe.h"
 #include "draw/draw_vertex.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 /**
  * Triangle setup info (derived from draw_stage).
diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.c b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
index e9fae951e0..425e13cd28 100644
--- a/src/gallium/drivers/softpipe/sp_prim_vbuf.c
+++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
@@ -43,6 +43,7 @@
 #include "sp_setup.h"
 #include "draw/draw_context.h"
 #include "draw/draw_vbuf.h"
+#include "util/u_memory.h"
 
 
 #define SP_MAX_VBUF_INDEXES 1024
diff --git a/src/gallium/drivers/softpipe/sp_quad_alpha_test.c b/src/gallium/drivers/softpipe/sp_quad_alpha_test.c
index 7a42b08ef5..7d3580fb4f 100644
--- a/src/gallium/drivers/softpipe/sp_quad_alpha_test.c
+++ b/src/gallium/drivers/softpipe/sp_quad_alpha_test.c
@@ -7,7 +7,7 @@
 #include "sp_headers.h"
 #include "sp_quad.h"
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 
 static void
diff --git a/src/gallium/drivers/softpipe/sp_quad_blend.c b/src/gallium/drivers/softpipe/sp_quad_blend.c
index 74c6bff84a..a834accb86 100644
--- a/src/gallium/drivers/softpipe/sp_quad_blend.c
+++ b/src/gallium/drivers/softpipe/sp_quad_blend.c
@@ -31,7 +31,8 @@
  */
 
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "sp_context.h"
 #include "sp_headers.h"
 #include "sp_surface.h"
@@ -128,15 +129,15 @@ logicop_quad(struct quad_stage *qs, struct quad_header *quad)
 
       /* convert to ubyte */
       for (j = 0; j < 4; j++) { /* loop over R,G,B,A channels */
-         UNCLAMPED_FLOAT_TO_UBYTE(dst[j][0], dest[j][0]); /* P0 */
-         UNCLAMPED_FLOAT_TO_UBYTE(dst[j][1], dest[j][1]); /* P1 */
-         UNCLAMPED_FLOAT_TO_UBYTE(dst[j][2], dest[j][2]); /* P2 */
-         UNCLAMPED_FLOAT_TO_UBYTE(dst[j][3], dest[j][3]); /* P3 */
-
-         UNCLAMPED_FLOAT_TO_UBYTE(src[j][0], quadColor[j][0]); /* P0 */
-         UNCLAMPED_FLOAT_TO_UBYTE(src[j][1], quadColor[j][1]); /* P1 */
-         UNCLAMPED_FLOAT_TO_UBYTE(src[j][2], quadColor[j][2]); /* P2 */
-         UNCLAMPED_FLOAT_TO_UBYTE(src[j][3], quadColor[j][3]); /* P3 */
+         dst[j][0] = float_to_ubyte(dest[j][0]); /* P0 */
+         dst[j][1] = float_to_ubyte(dest[j][1]); /* P1 */
+         dst[j][2] = float_to_ubyte(dest[j][2]); /* P2 */
+         dst[j][3] = float_to_ubyte(dest[j][3]); /* P3 */
+
+         src[j][0] = float_to_ubyte(quadColor[j][0]); /* P0 */
+         src[j][1] = float_to_ubyte(quadColor[j][1]); /* P1 */
+         src[j][2] = float_to_ubyte(quadColor[j][2]); /* P2 */
+         src[j][3] = float_to_ubyte(quadColor[j][3]); /* P3 */
       }
 
       switch (softpipe->blend->logicop_func) {
@@ -209,10 +210,10 @@ logicop_quad(struct quad_stage *qs, struct quad_header *quad)
       }
 
       for (j = 0; j < 4; j++) {
-         quadColor[j][0] = UBYTE_TO_FLOAT(res[j][0]);
-         quadColor[j][1] = UBYTE_TO_FLOAT(res[j][1]);
-         quadColor[j][2] = UBYTE_TO_FLOAT(res[j][2]);
-         quadColor[j][3] = UBYTE_TO_FLOAT(res[j][3]);
+         quadColor[j][0] = ubyte_to_float(res[j][0]);
+         quadColor[j][1] = ubyte_to_float(res[j][1]);
+         quadColor[j][2] = ubyte_to_float(res[j][2]);
+         quadColor[j][3] = ubyte_to_float(res[j][3]);
       }
    }
 
diff --git a/src/gallium/drivers/softpipe/sp_quad_bufloop.c b/src/gallium/drivers/softpipe/sp_quad_bufloop.c
index b3db428ef1..92e9af09c1 100644
--- a/src/gallium/drivers/softpipe/sp_quad_bufloop.c
+++ b/src/gallium/drivers/softpipe/sp_quad_bufloop.c
@@ -1,5 +1,5 @@
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "sp_context.h"
 #include "sp_headers.h"
 #include "sp_surface.h"
diff --git a/src/gallium/drivers/softpipe/sp_quad_colormask.c b/src/gallium/drivers/softpipe/sp_quad_colormask.c
index 7fe080990b..f72f31db97 100644
--- a/src/gallium/drivers/softpipe/sp_quad_colormask.c
+++ b/src/gallium/drivers/softpipe/sp_quad_colormask.c
@@ -31,7 +31,8 @@
  */
 
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "sp_context.h"
 #include "sp_headers.h"
 #include "sp_surface.h"
diff --git a/src/gallium/drivers/softpipe/sp_quad_coverage.c b/src/gallium/drivers/softpipe/sp_quad_coverage.c
index dd5ebb2296..ad907ec25f 100644
--- a/src/gallium/drivers/softpipe/sp_quad_coverage.c
+++ b/src/gallium/drivers/softpipe/sp_quad_coverage.c
@@ -33,7 +33,7 @@
 
 
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "sp_context.h"
 #include "sp_headers.h"
 #include "sp_quad.h"
diff --git a/src/gallium/drivers/softpipe/sp_quad_depth_test.c b/src/gallium/drivers/softpipe/sp_quad_depth_test.c
index 0c82692c6e..227cb2014e 100644
--- a/src/gallium/drivers/softpipe/sp_quad_depth_test.c
+++ b/src/gallium/drivers/softpipe/sp_quad_depth_test.c
@@ -30,7 +30,7 @@
  */
 
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "sp_context.h"
 #include "sp_headers.h"
 #include "sp_surface.h"
diff --git a/src/gallium/drivers/softpipe/sp_quad_earlyz.c b/src/gallium/drivers/softpipe/sp_quad_earlyz.c
index 22ea99049f..5a66a86699 100644
--- a/src/gallium/drivers/softpipe/sp_quad_earlyz.c
+++ b/src/gallium/drivers/softpipe/sp_quad_earlyz.c
@@ -30,7 +30,7 @@
  */
 
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "sp_headers.h"
 #include "sp_quad.h"
 
diff --git a/src/gallium/drivers/softpipe/sp_quad_fs.c b/src/gallium/drivers/softpipe/sp_quad_fs.c
index 8c88c192f8..5499ba5361 100644
--- a/src/gallium/drivers/softpipe/sp_quad_fs.c
+++ b/src/gallium/drivers/softpipe/sp_quad_fs.c
@@ -35,7 +35,8 @@
  * all the enabled attributes run contiguously.
  */
 
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
 
diff --git a/src/gallium/drivers/softpipe/sp_quad_occlusion.c b/src/gallium/drivers/softpipe/sp_quad_occlusion.c
index 54254df1f1..db13e73ae3 100644
--- a/src/gallium/drivers/softpipe/sp_quad_occlusion.c
+++ b/src/gallium/drivers/softpipe/sp_quad_occlusion.c
@@ -33,7 +33,7 @@
 
 
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "sp_context.h"
 #include "sp_headers.h"
 #include "sp_surface.h"
diff --git a/src/gallium/drivers/softpipe/sp_quad_output.c b/src/gallium/drivers/softpipe/sp_quad_output.c
index 40083138a4..b64646a449 100644
--- a/src/gallium/drivers/softpipe/sp_quad_output.c
+++ b/src/gallium/drivers/softpipe/sp_quad_output.c
@@ -25,7 +25,7 @@
  * 
  **************************************************************************/
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "sp_context.h"
 #include "sp_headers.h"
 #include "sp_surface.h"
diff --git a/src/gallium/drivers/softpipe/sp_quad_stencil.c b/src/gallium/drivers/softpipe/sp_quad_stencil.c
index b4c7e942fa..ce9562e07c 100644
--- a/src/gallium/drivers/softpipe/sp_quad_stencil.c
+++ b/src/gallium/drivers/softpipe/sp_quad_stencil.c
@@ -10,7 +10,7 @@
 #include "sp_tile_cache.h"
 #include "sp_quad.h"
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 
 /** Only 8-bit stencil supported */
diff --git a/src/gallium/drivers/softpipe/sp_quad_stipple.c b/src/gallium/drivers/softpipe/sp_quad_stipple.c
index f1e9b80e09..a39ecc2e9d 100644
--- a/src/gallium/drivers/softpipe/sp_quad_stipple.c
+++ b/src/gallium/drivers/softpipe/sp_quad_stipple.c
@@ -7,7 +7,7 @@
 #include "sp_headers.h"
 #include "sp_quad.h"
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 
 /**
diff --git a/src/gallium/drivers/softpipe/sp_query.c b/src/gallium/drivers/softpipe/sp_query.c
index adf9ccf64c..2106ee1d23 100644
--- a/src/gallium/drivers/softpipe/sp_query.c
+++ b/src/gallium/drivers/softpipe/sp_query.c
@@ -32,7 +32,7 @@
 #include "draw/draw_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "sp_context.h"
 #include "sp_query.h"
 
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index f6b3d7ac24..9644dbd168 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_winsys.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_screen.h"
diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c
index c8c55fa6e8..87336ab6e3 100644
--- a/src/gallium/drivers/softpipe/sp_setup.c
+++ b/src/gallium/drivers/softpipe/sp_setup.c
@@ -42,9 +42,9 @@
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
 #include "draw/draw_vertex.h"
-#include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #include "util/u_math.h"
+#include "util/u_memory.h"
 
 
 #define DEBUG_VERTS 0
diff --git a/src/gallium/drivers/softpipe/sp_state_blend.c b/src/gallium/drivers/softpipe/sp_state_blend.c
index 2d40d6bd8f..384fe559af 100644
--- a/src/gallium/drivers/softpipe/sp_state_blend.c
+++ b/src/gallium/drivers/softpipe/sp_state_blend.c
@@ -28,7 +28,7 @@
 /* Authors:  Keith Whitwell <keith@tungstengraphics.com>
  */
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "sp_context.h"
 #include "sp_state.h"
 
diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c
index f10a1fa471..6b6a4c3ff3 100644
--- a/src/gallium/drivers/softpipe/sp_state_derived.c
+++ b/src/gallium/drivers/softpipe/sp_state_derived.c
@@ -25,7 +25,8 @@
  * 
  **************************************************************************/
 
-#include "pipe/p_util.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "pipe/p_shader_tokens.h"
 #include "draw/draw_context.h"
 #include "draw/draw_vertex.h"
diff --git a/src/gallium/drivers/softpipe/sp_state_fs.c b/src/gallium/drivers/softpipe/sp_state_fs.c
index 76fe6bfef9..1be461b3a4 100644
--- a/src/gallium/drivers/softpipe/sp_state_fs.c
+++ b/src/gallium/drivers/softpipe/sp_state_fs.c
@@ -30,7 +30,7 @@
 #include "sp_fs.h"
 
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
 #include "pipe/p_shader_tokens.h"
diff --git a/src/gallium/drivers/softpipe/sp_state_rasterizer.c b/src/gallium/drivers/softpipe/sp_state_rasterizer.c
index 98e04352db..87b7219683 100644
--- a/src/gallium/drivers/softpipe/sp_state_rasterizer.c
+++ b/src/gallium/drivers/softpipe/sp_state_rasterizer.c
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "sp_context.h"
 #include "sp_state.h"
 #include "draw/draw_context.h"
diff --git a/src/gallium/drivers/softpipe/sp_state_sampler.c b/src/gallium/drivers/softpipe/sp_state_sampler.c
index 033288a0aa..99a28c0d7e 100644
--- a/src/gallium/drivers/softpipe/sp_state_sampler.c
+++ b/src/gallium/drivers/softpipe/sp_state_sampler.c
@@ -29,7 +29,7 @@
  *  Brian Paul
  */
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_inlines.h"
 
 #include "draw/draw_context.h"
diff --git a/src/gallium/drivers/softpipe/sp_surface.c b/src/gallium/drivers/softpipe/sp_surface.c
index bfbae234f1..389aceb27c 100644
--- a/src/gallium/drivers/softpipe/sp_surface.c
+++ b/src/gallium/drivers/softpipe/sp_surface.c
@@ -26,10 +26,9 @@
  **************************************************************************/
 
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
-#include "util/p_tile.h"
+#include "util/u_tile.h"
 #include "util/u_rect.h"
 #include "sp_context.h"
 #include "sp_surface.h"
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c
index 58a95d13e1..49250ec084 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -39,9 +39,9 @@
 #include "sp_tile_cache.h"
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
 #include "tgsi/tgsi_exec.h"
 #include "util/u_math.h"
+#include "util/u_memory.h"
 
 
 /*
diff --git a/src/gallium/drivers/softpipe/sp_texture.c b/src/gallium/drivers/softpipe/sp_texture.c
index f775591352..3a737d6f72 100644
--- a/src/gallium/drivers/softpipe/sp_texture.c
+++ b/src/gallium/drivers/softpipe/sp_texture.c
@@ -33,8 +33,9 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "pipe/p_util.h"
 #include "pipe/p_winsys.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 
 #include "sp_context.h"
 #include "sp_state.h"
diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.c b/src/gallium/drivers/softpipe/sp_tile_cache.c
index 57c12ffe33..b50c984513 100644
--- a/src/gallium/drivers/softpipe/sp_tile_cache.c
+++ b/src/gallium/drivers/softpipe/sp_tile_cache.c
@@ -32,9 +32,9 @@
  *    Brian Paul
  */
 
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
-#include "util/p_tile.h"
+#include "util/u_memory.h"
+#include "util/u_tile.h"
 #include "sp_context.h"
 #include "sp_surface.h"
 #include "sp_texture.h"
diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c
index f16359e8ad..1dd7719379 100644
--- a/src/gallium/drivers/trace/tr_context.c
+++ b/src/gallium/drivers/trace/tr_context.c
@@ -25,7 +25,7 @@
  *
  **************************************************************************/
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_screen.h"
 
 #include "tr_dump.h"
diff --git a/src/gallium/drivers/trace/tr_dump.c b/src/gallium/drivers/trace/tr_dump.c
index 1613a626df..48032c1617 100644
--- a/src/gallium/drivers/trace/tr_dump.c
+++ b/src/gallium/drivers/trace/tr_dump.c
@@ -45,6 +45,8 @@
 #endif
 
 #include "pipe/p_compiler.h"
+#include "pipe/p_debug.h"
+#include "util/u_memory.h"
 #include "util/u_string.h"
 
 #include "tr_stream.h"
diff --git a/src/gallium/drivers/trace/tr_dump.h b/src/gallium/drivers/trace/tr_dump.h
index 6ddc8fc15c..76a53731b3 100644
--- a/src/gallium/drivers/trace/tr_dump.h
+++ b/src/gallium/drivers/trace/tr_dump.h
@@ -35,7 +35,6 @@
 
 
 #include "pipe/p_compiler.h"
-#include "pipe/p_util.h"
 
 
 boolean trace_dump_trace_begin(void);
diff --git a/src/gallium/drivers/trace/tr_screen.c b/src/gallium/drivers/trace/tr_screen.c
index a6467ec35f..8789f86b1a 100644
--- a/src/gallium/drivers/trace/tr_screen.c
+++ b/src/gallium/drivers/trace/tr_screen.c
@@ -25,7 +25,7 @@
  *
  **************************************************************************/
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 #include "tr_dump.h"
 #include "tr_state.h"
diff --git a/src/gallium/drivers/trace/tr_state.c b/src/gallium/drivers/trace/tr_state.c
index 30ab5a8fdc..986d939e0c 100644
--- a/src/gallium/drivers/trace/tr_state.c
+++ b/src/gallium/drivers/trace/tr_state.c
@@ -27,6 +27,7 @@
 
 
 #include "pipe/p_compiler.h"
+#include "util/u_memory.h"
 #include "tgsi/tgsi_dump.h"
 
 #include "tr_dump.h"
diff --git a/src/gallium/drivers/trace/tr_stream_stdc.c b/src/gallium/drivers/trace/tr_stream_stdc.c
index 4c77e1c995..4c19ec0b24 100644
--- a/src/gallium/drivers/trace/tr_stream_stdc.c
+++ b/src/gallium/drivers/trace/tr_stream_stdc.c
@@ -36,7 +36,7 @@
 
 #include <stdio.h>
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 
 #include "tr_stream.h"
 
diff --git a/src/gallium/drivers/trace/tr_stream_wd.c b/src/gallium/drivers/trace/tr_stream_wd.c
index b3b65f0971..704eb15bd7 100644
--- a/src/gallium/drivers/trace/tr_stream_wd.c
+++ b/src/gallium/drivers/trace/tr_stream_wd.c
@@ -37,7 +37,7 @@
 #include <windows.h>
 #include <winddi.h>
 
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "util/u_string.h"
 
 #include "tr_stream.h"
diff --git a/src/gallium/drivers/trace/tr_texture.c b/src/gallium/drivers/trace/tr_texture.c
index 99ba74d366..440a78704a 100644
--- a/src/gallium/drivers/trace/tr_texture.c
+++ b/src/gallium/drivers/trace/tr_texture.c
@@ -25,9 +25,9 @@
  *
  **************************************************************************/
 
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "util/u_hash_table.h"
+#include "util/u_memory.h"
 
 #include "tr_screen.h"
 #include "tr_texture.h"
diff --git a/src/gallium/drivers/trace/tr_winsys.c b/src/gallium/drivers/trace/tr_winsys.c
index 2c7a6f893b..177835854e 100644
--- a/src/gallium/drivers/trace/tr_winsys.c
+++ b/src/gallium/drivers/trace/tr_winsys.c
@@ -25,8 +25,7 @@
  *
  **************************************************************************/
 
-#include "pipe/p_util.h"
-#include "pipe/p_state.h"
+#include "util/u_memory.h"
 #include "util/u_hash_table.h"
 
 #include "tr_dump.h"
diff --git a/src/gallium/include/pipe/p_util.h b/src/gallium/include/pipe/p_util.h
deleted file mode 100644
index 4a3fca5962..0000000000
--- a/src/gallium/include/pipe/p_util.h
+++ /dev/null
@@ -1,460 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#ifndef P_UTIL_H
-#define P_UTIL_H
-
-#include "p_config.h"
-#include "p_compiler.h"
-#include "p_debug.h"
-#include "p_pointer.h"
-
-#if defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
-__inline double ceil(double val)
-{
-	double ceil_val;
-
-	if((val - (long) val) == 0) {
-		ceil_val = val;
-	} else {
-		if(val > 0) {
-			ceil_val = (long) val + 1;
-		} else {
-			ceil_val = (long) val;
-		}
-	}
-
-	return ceil_val;
-}
-
-#ifndef PIPE_SUBSYSTEM_WINDOWS_CE
-__inline double floor(double val)
-{
-	double floor_val;
-
-	if((val - (long) val) == 0) {
-		floor_val = val;
-	} else {
-		if(val > 0) {
-			floor_val = (long) val;
-		} else {
-			floor_val = (long) val - 1;
-		}
-	}
-
-	return floor_val;
-}
-#endif
-
-#pragma function(pow)
-__inline double __cdecl pow(double val, double exponent)
-{
-	/* XXX */
-	assert(0);
-	return 0;
-}
-
-#pragma function(log)
-__inline double __cdecl log(double val)
-{
-	/* XXX */
-	assert(0);
-	return 0;
-}
-
-#pragma function(atan2)
-__inline double __cdecl atan2(double val)
-{
-	/* XXX */
-	assert(0);
-	return 0;
-}
-#else
-#include <math.h>
-#include <stdarg.h>
-#endif
-
- /* Define ENOMEM for WINCE */ 
-#if (_WIN32_WCE < 600)
-#ifndef ENOMEM
-#define ENOMEM 12
-#endif
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-
-#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) && defined(DEBUG) 
-
-/* memory debugging */
-
-#include "p_debug.h"
-
-#define MALLOC( _size ) \
-   debug_malloc( __FILE__, __LINE__, __FUNCTION__, _size )
-#define CALLOC( _count, _size ) \
-   debug_calloc(__FILE__, __LINE__, __FUNCTION__, _count, _size )
-#define FREE( _ptr ) \
-   debug_free( __FILE__, __LINE__, __FUNCTION__,  _ptr )
-#define REALLOC( _ptr, _old_size, _size ) \
-   debug_realloc( __FILE__, __LINE__, __FUNCTION__,  _ptr, _old_size, _size )
-
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
-
-void * __stdcall
-EngAllocMem(
-    unsigned long Flags,
-    unsigned long MemSize,
-    unsigned long Tag );
-
-void __stdcall
-EngFreeMem(
-    void *Mem );
-
-#define MALLOC( _size ) EngAllocMem( 0, _size, 'D3AG' )
-#define _FREE( _ptr ) EngFreeMem( _ptr )
-
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
-
-void *
-ExAllocatePool(
-    unsigned long PoolType, 
-    size_t NumberOfBytes);
-
-void 
-ExFreePool(void *P);
-
-#define MALLOC(_size) ExAllocatePool(0, _size)
-#define _FREE(_ptr) ExFreePool(_ptr)
-
-#else
-
-#define MALLOC( SIZE )  malloc( SIZE )
-#define CALLOC( COUNT, SIZE )   calloc( COUNT, SIZE )
-#define FREE( PTR )  free( PTR )
-#define REALLOC( OLDPTR, OLDSIZE, NEWSIZE )  realloc( OLDPTR, NEWSIZE )
-
-#endif
-
-
-#ifndef CALLOC
-static INLINE void *
-CALLOC( unsigned count, unsigned size )
-{
-   void *ptr = MALLOC( count * size );
-   if( ptr ) {
-      memset( ptr, 0, count * size );
-   }
-   return ptr;
-}
-#endif /* !CALLOC */
-
-#ifndef FREE
-static INLINE void
-FREE( void *ptr )
-{
-   if( ptr ) {
-      _FREE( ptr );
-   }
-}
-#endif /* !FREE */
-
-#ifndef REALLOC
-static INLINE void *
-REALLOC( void *old_ptr, unsigned old_size, unsigned new_size )
-{
-   void *new_ptr = NULL;
-
-   if (new_size != 0) {
-      unsigned copy_size = old_size < new_size ? old_size : new_size;
-      new_ptr = MALLOC( new_size );
-      if (new_ptr && old_ptr && copy_size) {
-         memcpy( new_ptr, old_ptr, copy_size );
-      }
-   }
-
-   FREE( old_ptr );
-   return new_ptr;
-}
-#endif /* !REALLOC */
-
-
-#define MALLOC_STRUCT(T)   (struct T *) MALLOC(sizeof(struct T))
-
-#define CALLOC_STRUCT(T)   (struct T *) CALLOC(1, sizeof(struct T))
-
-
-/**
- * Return memory on given byte alignment
- */
-static INLINE void *
-align_malloc(size_t bytes, uint alignment)
-{
-#if defined(HAVE_POSIX_MEMALIGN)
-   void *mem;
-   alignment = (alignment + (uint)sizeof(void*) - 1) & ~((uint)sizeof(void*) - 1);
-   if(posix_memalign(& mem, alignment, bytes) != 0)
-      return NULL;
-   return mem;
-#else
-   char *ptr, *buf;
-
-   assert( alignment > 0 );
-
-   ptr = (char *) MALLOC(bytes + alignment + sizeof(void *));
-   if (!ptr)
-      return NULL;
-
-   buf = (char *) align_pointer( ptr + sizeof(void *), alignment );
-   *(char **)(buf - sizeof(void *)) = ptr;
-
-   return buf;
-#endif /* defined(HAVE_POSIX_MEMALIGN) */
-}
-
-/**
- * Free memory returned by align_malloc().
- */
-static INLINE void
-align_free(void *ptr)
-{
-#if defined(HAVE_POSIX_MEMALIGN)
-   FREE(ptr);
-#else
-   void **cubbyHole = (void **) ((char *) ptr - sizeof(void *));
-   void *realAddr = *cubbyHole;
-   FREE(realAddr);
-#endif /* defined(HAVE_POSIX_MEMALIGN) */
-}
-
-
-
-/**
- * Duplicate a block of memory.
- */
-static INLINE void *
-mem_dup(const void *src, uint size)
-{
-   void *dup = MALLOC(size);
-   if (dup)
-      memcpy(dup, src, size);
-   return dup;
-}
-
-
-
-#define CLAMP( X, MIN, MAX )  ( (X)<(MIN) ? (MIN) : ((X)>(MAX) ? (MAX) : (X)) )
-#define MIN2( A, B )   ( (A)<(B) ? (A) : (B) )
-#define MAX2( A, B )   ( (A)>(B) ? (A) : (B) )
-
-#ifndef Elements
-#define Elements(x) (sizeof(x)/sizeof((x)[0]))
-#endif
-#define Offset(TYPE, MEMBER) ((unsigned)&(((TYPE *)NULL)->MEMBER))
-
-/**
- * Return a pointer aligned to next multiple of 16 bytes.
- */
-static INLINE void *
-align16( void *unaligned )
-{
-   return align_pointer( unaligned, 16 );
-}
-
-
-static INLINE int align(int value, int alignment)
-{
-   return (value + alignment - 1) & ~(alignment - 1);
-}
-
-
-
-
-#if defined(PIPE_CC_MSVC) && defined(PIPE_ARCH_X86)
-static INLINE unsigned ffs( unsigned u )
-{
-   unsigned i;
-
-   if( u == 0 ) {
-      return 0;
-   }
-
-   __asm bsf eax, [u]
-   __asm inc eax
-   __asm mov [i], eax
-
-   return i;
-}
-#endif
-
-union fi {
-   float f;
-   int i;
-   unsigned ui;
-};
-
-#define UBYTE_TO_FLOAT( ub ) ((float)(ub) / 255.0F)
-
-#define IEEE_0996 0x3f7f0000	/* 0.996 or so */
-
-/* This function/macro is sensitive to precision.  Test very carefully
- * if you change it!
- */
-#define UNCLAMPED_FLOAT_TO_UBYTE(UB, F)					\
-        do {								\
-           union fi __tmp;						\
-           __tmp.f = (F);						\
-           if (__tmp.i < 0)						\
-              UB = (ubyte) 0;						\
-           else if (__tmp.i >= IEEE_0996)				\
-              UB = (ubyte) 255;					\
-           else {							\
-              __tmp.f = __tmp.f * (255.0f/256.0f) + 32768.0f;		\
-              UB = (ubyte) __tmp.i;					\
-           }								\
-        } while (0)
-
-
-
-static INLINE unsigned pack_ub4( unsigned char b0,
-				 unsigned char b1,
-				 unsigned char b2,
-				 unsigned char b3 )
-{
-   return ((((unsigned int)b0) << 0) |
-	   (((unsigned int)b1) << 8) |
-	   (((unsigned int)b2) << 16) |
-	   (((unsigned int)b3) << 24));
-}
-
-static INLINE unsigned fui( float f )
-{
-   union fi fi;
-   fi.f = f;
-   return fi.ui;
-}
-
-static INLINE unsigned char float_to_ubyte( float f )
-{
-   unsigned char ub;
-   UNCLAMPED_FLOAT_TO_UBYTE(ub, f);
-   return ub;
-}
-
-static INLINE unsigned pack_ui32_float4( float a,
-					 float b, 
-					 float c, 
-					 float d )
-{
-   return pack_ub4( float_to_ubyte(a),
-		    float_to_ubyte(b),
-		    float_to_ubyte(c),
-		    float_to_ubyte(d) );
-}
-
-#define COPY_4V( DST, SRC )         \
-do {                                \
-   (DST)[0] = (SRC)[0];             \
-   (DST)[1] = (SRC)[1];             \
-   (DST)[2] = (SRC)[2];             \
-   (DST)[3] = (SRC)[3];             \
-} while (0)
-
-
-#define COPY_4FV( DST, SRC )  COPY_4V(DST, SRC)
-
-
-#define ASSIGN_4V( DST, V0, V1, V2, V3 ) \
-do {                                     \
-   (DST)[0] = (V0);                      \
-   (DST)[1] = (V1);                      \
-   (DST)[2] = (V2);                      \
-   (DST)[3] = (V3);                      \
-} while (0)
-
-
-
-#if defined(_MSC_VER) 
-#if _MSC_VER < 1400 && !defined(__cplusplus) || defined(PIPE_SUBSYSTEM_WINDOWS_CE)
- 
-static INLINE float cosf( float f ) 
-{
-   return (float) cos( (double) f );
-}
-
-static INLINE float sinf( float f ) 
-{
-   return (float) sin( (double) f );
-}
-
-static INLINE float ceilf( float f ) 
-{
-   return (float) ceil( (double) f );
-}
-
-static INLINE float floorf( float f ) 
-{
-   return (float) floor( (double) f );
-}
-
-static INLINE float powf( float f, float g ) 
-{
-   return (float) pow( (double) f, (double) g );
-}
-
-static INLINE float sqrtf( float f ) 
-{
-   return (float) sqrt( (double) f );
-}
-
-static INLINE float fabsf( float f ) 
-{
-   return (float) fabs( (double) f );
-}
-
-static INLINE float logf( float f ) 
-{
-   return (float) log( (double) f );
-}
-
-#else
-/* Work-around an extra semi-colon in VS 2005 logf definition */
-#ifdef logf
-#undef logf
-#define logf(x) ((float)log((double)(x)))
-#endif /* logf */
-#endif
-#endif /* _MSC_VER */
-
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/src/gallium/state_trackers/python/gallium.i b/src/gallium/state_trackers/python/gallium.i
index 641b19e940..a67372c623 100644
--- a/src/gallium/state_trackers/python/gallium.i
+++ b/src/gallium/state_trackers/python/gallium.i
@@ -42,7 +42,7 @@
 #include "pipe/p_screen.h"
 #include "pipe/p_context.h"
 #include "pipe/p_inlines.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_shader_tokens.h" 
 #include "cso_cache/cso_context.h"
 #include "util/u_draw_quad.h" 
diff --git a/src/gallium/state_trackers/python/st_device.c b/src/gallium/state_trackers/python/st_device.c
index a1889539dc..f71d85dd9b 100644
--- a/src/gallium/state_trackers/python/st_device.c
+++ b/src/gallium/state_trackers/python/st_device.c
@@ -26,12 +26,13 @@
  **************************************************************************/
 
 
-#include "pipe/p_util.h"
 #include "pipe/p_winsys.h"
 #include "pipe/p_context.h"
 #include "pipe/p_shader_tokens.h"
 #include "pipe/p_inlines.h"
 #include "cso_cache/cso_context.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "util/u_simple_shaders.h"
 #include "trace/tr_screen.h"
 #include "trace/tr_context.h"
diff --git a/src/gallium/state_trackers/python/st_sample.c b/src/gallium/state_trackers/python/st_sample.c
index b47c7be293..7765df3c4a 100644
--- a/src/gallium/state_trackers/python/st_sample.c
+++ b/src/gallium/state_trackers/python/st_sample.c
@@ -29,9 +29,10 @@
 #include "pipe/p_compiler.h"
 #include "pipe/p_format.h"
 #include "pipe/p_state.h"
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
-#include "util/p_tile.h"
+#include "util/u_tile.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 
 #include "st_sample.h"
 
diff --git a/src/gallium/state_trackers/python/st_softpipe_winsys.c b/src/gallium/state_trackers/python/st_softpipe_winsys.c
index 6ea3c9a5cf..2d4f5434b3 100644
--- a/src/gallium/state_trackers/python/st_softpipe_winsys.c
+++ b/src/gallium/state_trackers/python/st_softpipe_winsys.c
@@ -39,8 +39,9 @@
 #include "pipe/p_winsys.h"
 #include "pipe/p_format.h"
 #include "pipe/p_context.h"
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "softpipe/sp_winsys.h"
 #include "st_winsys.h"
 
diff --git a/src/gallium/winsys/drm/intel/common/intel_be_device.c b/src/gallium/winsys/drm/intel/common/intel_be_device.c
index 8db0329615..019ee5cbd2 100644
--- a/src/gallium/winsys/drm/intel/common/intel_be_device.c
+++ b/src/gallium/winsys/drm/intel/common/intel_be_device.c
@@ -13,8 +13,8 @@
 #include "pipe/p_winsys.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_state.h"
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
+#include "util/u_memory.h"
 
 #include "i915simple/i915_screen.h"
 
diff --git a/src/gallium/winsys/drm/intel/dri/intel_winsys_softpipe.c b/src/gallium/winsys/drm/intel/dri/intel_winsys_softpipe.c
index 0d98d16cf1..20920a2052 100644
--- a/src/gallium/winsys/drm/intel/dri/intel_winsys_softpipe.c
+++ b/src/gallium/winsys/drm/intel/dri/intel_winsys_softpipe.c
@@ -32,8 +32,8 @@
 #include "intel_context.h"
 #include "intel_winsys_softpipe.h"
 #include "pipe/p_defines.h"
-#include "pipe/p_util.h"
 #include "pipe/p_format.h"
+#include "util/u_memory.h"
 #include "softpipe/sp_winsys.h"
 
 
diff --git a/src/gallium/winsys/egl_xlib/egl_xlib.c b/src/gallium/winsys/egl_xlib/egl_xlib.c
index 829732eea8..e9f821d276 100644
--- a/src/gallium/winsys/egl_xlib/egl_xlib.c
+++ b/src/gallium/winsys/egl_xlib/egl_xlib.c
@@ -38,8 +38,8 @@
 #include "pipe/p_compiler.h"
 #include "pipe/p_format.h"
 #include "pipe/p_state.h"
-#include "pipe/p_util.h"
 #include "pipe/p_winsys.h"
+#include "util/u_memory.h"
 #include "softpipe/sp_winsys.h"
 
 #include "eglconfig.h"
diff --git a/src/gallium/winsys/egl_xlib/sw_winsys.c b/src/gallium/winsys/egl_xlib/sw_winsys.c
index f4199e6f89..ae81d7f801 100644
--- a/src/gallium/winsys/egl_xlib/sw_winsys.c
+++ b/src/gallium/winsys/egl_xlib/sw_winsys.c
@@ -37,8 +37,9 @@
 
 #include "pipe/p_winsys.h"
 #include "pipe/p_state.h"
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 
 #include "sw_winsys.h"
 
diff --git a/src/gallium/winsys/gdi/wmesa.c b/src/gallium/winsys/gdi/wmesa.c
index ff52ceb8c4..730fb1b541 100644
--- a/src/gallium/winsys/gdi/wmesa.c
+++ b/src/gallium/winsys/gdi/wmesa.c
@@ -12,8 +12,8 @@
 #include "pipe/p_winsys.h"
 #include "pipe/p_format.h"
 #include "pipe/p_context.h"
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
+#include "util/u_memory.h"
 #include "softpipe/sp_winsys.h"
 #include "glapi/glapi.h"
 #include "colors.h"
diff --git a/src/gallium/winsys/xlib/brw_aub.c b/src/gallium/winsys/xlib/brw_aub.c
index 6e814ce5d1..f319802962 100644
--- a/src/gallium/winsys/xlib/brw_aub.c
+++ b/src/gallium/winsys/xlib/brw_aub.c
@@ -34,7 +34,6 @@
 #include "brw_aub.h"
 #include "pipe/p_context.h"
 #include "pipe/p_state.h"
-#include "pipe/p_util.h"
 #include "pipe/p_debug.h"
 
 
diff --git a/src/gallium/winsys/xlib/xm_winsys.c b/src/gallium/winsys/xlib/xm_winsys.c
index 4b4dc56e84..68ead7f528 100644
--- a/src/gallium/winsys/xlib/xm_winsys.c
+++ b/src/gallium/winsys/xlib/xm_winsys.c
@@ -42,8 +42,9 @@
 #include "pipe/p_winsys.h"
 #include "pipe/p_format.h"
 #include "pipe/p_context.h"
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
 #include "softpipe/sp_winsys.h"
 
 #ifdef GALLIUM_CELL
diff --git a/src/gallium/winsys/xlib/xm_winsys_aub.c b/src/gallium/winsys/xlib/xm_winsys_aub.c
index 7fc9debdd5..3439367636 100644
--- a/src/gallium/winsys/xlib/xm_winsys_aub.c
+++ b/src/gallium/winsys/xlib/xm_winsys_aub.c
@@ -37,7 +37,7 @@
 #include "xmesaP.h"
 
 #include "pipe/p_winsys.h"
-#include "pipe/p_util.h"
+#include "util/u_memory.h"
 #include "pipe/p_inlines.h"
 #include "i965simple/brw_winsys.h"
 #include "i965simple/brw_screen.h"
diff --git a/src/mesa/state_tracker/acc2.c b/src/mesa/state_tracker/acc2.c
new file mode 100644
index 0000000000..fa5de2b764
--- /dev/null
+++ b/src/mesa/state_tracker/acc2.c
@@ -0,0 +1,319 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Brian Paul
+  */
+
+#include "main/imports.h"
+#include "main/image.h"
+#include "main/macros.h"
+
+#include "st_context.h"
+#include "st_cb_accum.h"
+#include "st_cb_fbo.h"
+#include "st_draw.h"
+#include "st_format.h"
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_inlines.h"
+#include "util/p_tile.h"
+
+
+#define UNCLAMPED_FLOAT_TO_SHORT(us, f)  \
+   us = ( (short) ( CLAMP((f), -1.0, 1.0) * 32767.0F) ) /* assigns to 'us'; 'f' is presumably limited to [-1, 1] by CLAMP before scaling */
+
+
+/**
+ * For hardware that supports deep color buffers, we could accelerate
+ * most/all the accum operations with blending/texturing.
+ * For now, just use the get/put_tile() functions and do things in software.
+ */
+
+
+/** Wrapper around pipe_get_tile_rgba() for the accum surface: the surface's
+ * format/cpp are temporarily set to R16G16B16A16_SNORM / 8 around the call. */
+static void
+acc_get_tile_rgba(struct pipe_context *pipe, struct pipe_surface *acc_ps,
+                  uint x, uint y, uint w, uint h, float *p)
+{
+   const enum pipe_format saved_format = acc_ps->format;
+   const int saved_cpp = acc_ps->cpp;
+
+   acc_ps->cpp = 8;
+   acc_ps->format = PIPE_FORMAT_R16G16B16A16_SNORM;
+   pipe_get_tile_rgba(pipe, acc_ps, x, y, w, h, p);
+   acc_ps->cpp = saved_cpp;         /* put the original description back */
+   acc_ps->format = saved_format;
+}
+
+
+/** Wrapper around pipe_put_tile_rgba() for the accum surface: the surface's
+ * format/cpp are temporarily set to R16G16B16A16_SNORM / 8 around the call. */
+static void
+acc_put_tile_rgba(struct pipe_context *pipe, struct pipe_surface *acc_ps,
+                  uint x, uint y, uint w, uint h, const float *p)
+{
+   enum pipe_format saved_format = acc_ps->format;
+   const int saved_cpp = acc_ps->cpp;
+
+   acc_ps->cpp = 8;
+   acc_ps->format = PIPE_FORMAT_R16G16B16A16_SNORM;
+   pipe_put_tile_rgba(pipe, acc_ps, x, y, w, h, p);
+   acc_ps->cpp = saved_cpp;         /* undo the temporary override */
+   acc_ps->format = saved_format;
+}
+
+
+
+/**
+ * Clear the accumulation buffer to ctx->Accum.ClearColor.
+ *
+ * Only the rectangle [_Xmin, _Xmax) x [_Ymin, _Ymax) of ctx->DrawBuffer is
+ * written.  The surface is mapped and the texels are stored directly, so
+ * no temporary float tile is needed (and none is allocated).
+ *
+ * Only PIPE_FORMAT_R16G16B16A16_SNORM is handled; any other renderbuffer
+ * format is reported through _mesa_problem() and the surface contents are
+ * left as they were.
+ *
+ * NOTE(review): the return value of pipe_surface_map() is used without a
+ * NULL check; confirm whether mapping can fail here.
+ *
+ * \param ctx  GL context supplying the clear color and draw-buffer bounds
+ * \param rb   accumulation renderbuffer; passed through st_renderbuffer()
+ *             to reach its pipe surface and format
+ */
+void
+st_clear_accum_buffer(GLcontext *ctx, struct gl_renderbuffer *rb)
+{
+   struct st_renderbuffer *acc_strb = st_renderbuffer(rb);
+   struct pipe_surface *acc_ps = acc_strb->surface;
+   const GLint xpos = ctx->DrawBuffer->_Xmin;
+   const GLint ypos = ctx->DrawBuffer->_Ymin;
+   const GLint width = ctx->DrawBuffer->_Xmax - xpos;
+   const GLint height = ctx->DrawBuffer->_Ymax - ypos;
+   GLvoid *map;
+
+   map = pipe_surface_map(acc_ps);
+   switch (acc_strb->format) {
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+      {
+         /* convert the clear color once, outside the pixel loops */
+         const GLshort r = FLOAT_TO_SHORT(ctx->Accum.ClearColor[0]);
+         const GLshort g = FLOAT_TO_SHORT(ctx->Accum.ClearColor[1]);
+         const GLshort b = FLOAT_TO_SHORT(ctx->Accum.ClearColor[2]);
+         const GLshort a = FLOAT_TO_SHORT(ctx->Accum.ClearColor[3]);
+         int i, j;
+         for (i = 0; i < height; i++) {
+            /* first texel of row ypos+i; pitch is treated as a pixel count */
+            GLshort *dst = ((GLshort *) map
+                            + ((ypos + i) * acc_ps->pitch + xpos) * 4);
+            for (j = 0; j < width; j++) {
+               dst[0] = r;
+               dst[1] = g;
+               dst[2] = b;
+               dst[3] = a;
+               dst += 4;
+            }
+         }
+      }
+      break;
+   default:
+      _mesa_problem(ctx, "unexpected format in st_clear_accum_buffer()");
+   }
+
+   pipe_surface_unmap(acc_ps);
+}
+
+
+/**
+ * For ADD/MULT: acc = acc * scale + bias on every channel of the rectangle.
+ * Does nothing if the temporary float tile cannot be allocated.
+ * NOTE(review): reads/writes acc_ps with pipe_*_tile_rgba(), not the acc_*
+ * wrappers the other accum helpers use -- confirm that is intended.
+ */
+static void
+accum_mad(struct pipe_context *pipe, GLfloat scale, GLfloat bias,
+          GLint xpos, GLint ypos, GLint width, GLint height,
+          struct pipe_surface *acc_ps)
+{
+   GLfloat *accBuf = (GLfloat *) malloc(width * height * 4 * sizeof(GLfloat));
+   GLint i;
+   if (!accBuf)
+      return; /* out of memory: leave the accum buffer unchanged */
+   pipe_get_tile_rgba(pipe, acc_ps, xpos, ypos, width, height, accBuf);
+   for (i = 0; i < 4 * width * height; i++)
+      accBuf[i] = accBuf[i] * scale + bias;
+   pipe_put_tile_rgba(pipe, acc_ps, xpos, ypos, width, height, accBuf);
+   free(accBuf);
+}
+
+
+/**
+ * For ACCUM: acc = acc + color * value on every channel of the rectangle.
+ * No-op if either temporary float tile cannot be allocated.
+ */
+static void
+accum_accum(struct pipe_context *pipe, GLfloat value,
+            GLint xpos, GLint ypos, GLint width, GLint height,
+            struct pipe_surface *acc_ps,
+            struct pipe_surface *color_ps)
+{
+   GLfloat *colorBuf = (GLfloat *) malloc(width * height * 4 * sizeof(GLfloat));
+   GLfloat *accBuf = (GLfloat *) malloc(width * height * 4 * sizeof(GLfloat));
+   GLint i;
+
+   if (colorBuf && accBuf) {
+      pipe_get_tile_rgba(pipe, color_ps, xpos, ypos, width, height, colorBuf);
+      acc_get_tile_rgba(pipe, acc_ps, xpos, ypos, width, height, accBuf);
+      for (i = 0; i < 4 * width * height; i++)
+         accBuf[i] = accBuf[i] + colorBuf[i] * value;
+      acc_put_tile_rgba(pipe, acc_ps, xpos, ypos, width, height, accBuf);
+   }
+   free(colorBuf); /* free(NULL) is a no-op, so one exit path suffices */
+   free(accBuf);
+}
+
+
+/**
+ * For LOAD: acc = color * value on every channel of the rectangle.
+ * Leaves the accum buffer untouched if the temporary tile cannot be allocated.
+ */
+static void
+accum_load(struct pipe_context *pipe, GLfloat value,
+           GLint xpos, GLint ypos, GLint width, GLint height,
+           struct pipe_surface *acc_ps,
+           struct pipe_surface *color_ps)
+{
+   GLfloat *buf = (GLfloat *) malloc(width * height * 4 * sizeof(GLfloat));
+   GLint i;
+
+   if (!buf)
+      return;
+   pipe_get_tile_rgba(pipe, color_ps, xpos, ypos, width, height, buf);
+   for (i = 0; i < 4 * width * height; i++)
+      buf[i] = buf[i] * value;
+   acc_put_tile_rgba(pipe, acc_ps, xpos, ypos, width, height, buf);
+   free(buf);
+}
+
+
+/**
+ * For RETURN: color = CLAMP(acc * value, 0, 1) for each channel enabled in
+ * ctx->Color.ColorMask; masked-off channels keep their current color value.
+ * Nothing is written if a temporary tile cannot be allocated.
+ */
+static void
+accum_return(GLcontext *ctx, GLfloat value,
+             GLint xpos, GLint ypos, GLint width, GLint height,
+             struct pipe_surface *acc_ps,
+             struct pipe_surface *color_ps)
+{
+   struct pipe_context *pipe = ctx->st->pipe;
+   const GLubyte *colormask = ctx->Color.ColorMask;
+   GLfloat *abuf, *cbuf = NULL;
+   GLint i;
+
+   abuf = (GLfloat *) malloc(width * height * 4 * sizeof(GLfloat));
+   if (!abuf)
+      return;
+   acc_get_tile_rgba(pipe, acc_ps, xpos, ypos, width, height, abuf);
+   /* the current color is only needed when some channel is masked off */
+   if (!colormask[0] || !colormask[1] || !colormask[2] || !colormask[3]) {
+      cbuf = (GLfloat *) malloc(width * height * 4 * sizeof(GLfloat));
+      if (!cbuf) {
+         free(abuf);
+         return;
+      }
+      pipe_get_tile_rgba(pipe, color_ps, xpos, ypos, width, height, cbuf);
+   }
+
+   for (i = 0; i < 4 * width * height; i++) { /* i % 4 is the channel */
+      const GLfloat val = abuf[i] * value;
+      abuf[i] = colormask[i % 4] ? CLAMP(val, 0.0, 1.0) : cbuf[i];
+   }
+   pipe_put_tile_rgba(pipe, color_ps, xpos, ypos, width, height, abuf);
+   free(abuf);
+   free(cbuf);
+}
+
+
+/**
+ * Accum driver hook (installed by st_init_accum_functions()).
+ * Flushes the render cache, then dispatches 'op' to the matching helper,
+ * operating on the draw buffer's [_Xmin,_Xmax) x [_Ymin,_Ymax) rectangle.
+ * The accum surface comes from the draw buffer's BUFFER_ACCUM attachment,
+ * the color surface from the read buffer's _ColorReadBuffer.
+ */
+static void
+st_Accum(GLcontext *ctx, GLenum op, GLfloat value)
+{
+   struct pipe_context *pipe = ctx->st->pipe;
+   struct pipe_surface *acc_ps = st_renderbuffer(
+      ctx->DrawBuffer->Attachment[BUFFER_ACCUM].Renderbuffer)->surface;
+   struct pipe_surface *color_ps
+      = st_renderbuffer(ctx->ReadBuffer->_ColorReadBuffer)->surface;
+   const GLint x0 = ctx->DrawBuffer->_Xmin;
+   const GLint y0 = ctx->DrawBuffer->_Ymin;
+   const GLint w = ctx->DrawBuffer->_Xmax - x0;
+   const GLint h = ctx->DrawBuffer->_Ymax - y0;
+
+   /* make sure color bufs aren't cached */
+   pipe->flush(pipe, PIPE_FLUSH_RENDER_CACHE, NULL);
+
+   switch (op) {
+   case GL_ADD:      /* acc += value; adding zero is skipped */
+      if (value != 0.0F)
+         accum_mad(pipe, 1.0, value, x0, y0, w, h, acc_ps);
+      break;
+   case GL_MULT:     /* acc *= value; multiplying by one is skipped */
+      if (value != 1.0F)
+         accum_mad(pipe, value, 0.0, x0, y0, w, h, acc_ps);
+      break;
+   case GL_ACCUM:    /* acc += color * value; zero weight is skipped */
+      if (value != 0.0F)
+         accum_accum(pipe, value, x0, y0, w, h, acc_ps, color_ps);
+      break;
+   case GL_LOAD:     /* acc = color * value */
+      accum_load(pipe, value, x0, y0, w, h, acc_ps, color_ps);
+      break;
+   case GL_RETURN:   /* color = clamped acc * value, honoring the color mask */
+      accum_return(ctx, value, x0, y0, w, h, acc_ps, color_ps);
+      break;
+   default:
+      assert(0);
+   }
+}
+
+
+
+void st_init_accum_functions(struct dd_function_table *functions)
+{
+   functions->Accum = st_Accum;   /* install st_Accum in the table's Accum slot */
+}
diff --git a/src/mesa/state_tracker/st_cb_accum.c b/src/mesa/state_tracker/st_cb_accum.c
index a992e08ff6..cf3a99e7e9 100644
--- a/src/mesa/state_tracker/st_cb_accum.c
+++ b/src/mesa/state_tracker/st_cb_accum.c
@@ -42,7 +42,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "util/p_tile.h"
+#include "util/u_tile.h"
 
 
 #define UNCLAMPED_FLOAT_TO_SHORT(us, f)  \
diff --git a/src/mesa/state_tracker/st_cb_bitmap.c b/src/mesa/state_tracker/st_cb_bitmap.c
index d5696a909f..a0c305d66f 100644
--- a/src/mesa/state_tracker/st_cb_bitmap.c
+++ b/src/mesa/state_tracker/st_cb_bitmap.c
@@ -50,7 +50,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "util/p_tile.h"
+#include "util/u_tile.h"
 #include "util/u_draw_quad.h"
 #include "util/u_simple_shaders.h"
 #include "shader/prog_instruction.h"
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index 0c5e21d4ff..4ec7c752df 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -55,7 +55,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "util/p_tile.h"
+#include "util/u_tile.h"
 #include "util/u_draw_quad.h"
 #include "shader/prog_instruction.h"
 #include "cso_cache/cso_context.h"
diff --git a/src/mesa/state_tracker/st_cb_readpixels.c b/src/mesa/state_tracker/st_cb_readpixels.c
index 39f5856f94..c801532788 100644
--- a/src/mesa/state_tracker/st_cb_readpixels.c
+++ b/src/mesa/state_tracker/st_cb_readpixels.c
@@ -41,7 +41,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "util/p_tile.h"
+#include "util/u_tile.h"
 #include "st_context.h"
 #include "st_cb_bitmap.h"
 #include "st_cb_readpixels.h"
diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 6177ac63f0..16bbf3d80f 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -51,7 +51,7 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "util/p_tile.h"
+#include "util/u_tile.h"
 #include "util/u_blit.h"
 
 
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index 325d95e865..936a6e32ea 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -55,7 +55,7 @@
 #define TGSI_DEBUG 0
 
 
-/** XXX we should use the version of this from p_util.h but including
+/** XXX we should use the version of this from u_memory.h but including
  * that header causes symbol collisions.
  */
 static INLINE void *
diff --git a/src/mesa/state_tracker/st_texture.c b/src/mesa/state_tracker/st_texture.c
index 63046a0ecc..73cebff33f 100644
--- a/src/mesa/state_tracker/st_texture.c
+++ b/src/mesa/state_tracker/st_texture.c
@@ -36,7 +36,6 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_inlines.h"
-#include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "util/u_rect.h"
 
-- 
cgit v1.2.3


From 0bb852fa49e7f9a31036089ea4f5dfbd312a4a3a Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Tue, 26 Aug 2008 16:35:12 -0600
Subject: gallium: thread wrapper clean-up

In p_thread.h replace _glthread_* functions with new pipe_* functions.
Remove other old cruft.
---
 .../auxiliary/pipebuffer/pb_buffer_fenced.c        |  24 +-
 src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c |  18 +-
 src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c    |  20 +-
 src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c  |  24 +-
 src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c  |  20 +-
 src/gallium/auxiliary/rtasm/rtasm_execmem.c        |  11 +-
 src/gallium/include/pipe/p_thread.h                | 366 ++++++++-------------
 .../winsys/drm/intel/common/ws_dri_bufmgr.c        | 102 +++---
 .../winsys/drm/intel/common/ws_dri_bufpool.h       |   8 +-
 .../winsys/drm/intel/common/ws_dri_drmpool.c       |   4 +-
 .../winsys/drm/intel/common/ws_dri_fencemgr.c      |  66 ++--
 .../winsys/drm/intel/common/ws_dri_mallocpool.c    |   6 +-
 .../winsys/drm/intel/common/ws_dri_slabpool.c      |  66 ++--
 src/gallium/winsys/drm/intel/dri/intel_lock.c      |   8 +-
 src/gallium/winsys/xlib/glxapi.c                   |  26 +-
 src/gallium/winsys/xlib/xm_api.c                   |   9 +-
 src/gallium/winsys/xlib/xmesaP.h                   |   3 +-
 17 files changed, 328 insertions(+), 453 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
index 410d336fef..17b2781052 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
@@ -69,7 +69,7 @@
 
 struct fenced_buffer_list
 {
-   _glthread_Mutex mutex;
+   pipe_mutex mutex;
    
    struct pipe_winsys *winsys;
    
@@ -240,7 +240,7 @@ fenced_buffer_destroy(struct pb_buffer *buf)
    struct fenced_buffer *fenced_buf = fenced_buffer(buf);   
    struct fenced_buffer_list *fenced_list = fenced_buf->list;
 
-   _glthread_LOCK_MUTEX(fenced_list->mutex);
+   pipe_mutex_lock(fenced_list->mutex);
    assert(fenced_buf->base.base.refcount == 0);
    if (fenced_buf->fence) {
       struct pipe_winsys *winsys = fenced_list->winsys;
@@ -263,7 +263,7 @@ fenced_buffer_destroy(struct pb_buffer *buf)
    else {
       _fenced_buffer_destroy(fenced_buf);
    }
-   _glthread_UNLOCK_MUTEX(fenced_list->mutex);
+   pipe_mutex_unlock(fenced_list->mutex);
 }
 
 
@@ -396,7 +396,7 @@ buffer_fence(struct pb_buffer *buf,
       return;
    }
    
-   _glthread_LOCK_MUTEX(fenced_list->mutex);
+   pipe_mutex_lock(fenced_list->mutex);
    if (fenced_buf->fence)
       _fenced_buffer_remove(fenced_list, fenced_buf);
    if (fence) {
@@ -404,7 +404,7 @@ buffer_fence(struct pb_buffer *buf,
       fenced_buf->flags |= flags & PIPE_BUFFER_USAGE_GPU_READ_WRITE;
       _fenced_buffer_add(fenced_buf);
    }
-   _glthread_UNLOCK_MUTEX(fenced_list->mutex);
+   pipe_mutex_unlock(fenced_list->mutex);
 }
 
 
@@ -423,7 +423,7 @@ fenced_buffer_list_create(struct pipe_winsys *winsys)
 
    fenced_list->numDelayed = 0;
    
-   _glthread_INIT_MUTEX(fenced_list->mutex);
+   pipe_mutex_init(fenced_list->mutex);
 
    return fenced_list;
 }
@@ -433,28 +433,28 @@ void
 fenced_buffer_list_check_free(struct fenced_buffer_list *fenced_list, 
                               int wait)
 {
-   _glthread_LOCK_MUTEX(fenced_list->mutex);
+   pipe_mutex_lock(fenced_list->mutex);
    _fenced_buffer_list_check_free(fenced_list, wait);
-   _glthread_UNLOCK_MUTEX(fenced_list->mutex);
+   pipe_mutex_unlock(fenced_list->mutex);
 }
 
 
 void
 fenced_buffer_list_destroy(struct fenced_buffer_list *fenced_list)
 {
-   _glthread_LOCK_MUTEX(fenced_list->mutex);
+   pipe_mutex_lock(fenced_list->mutex);
 
    /* Wait on outstanding fences */
    while (fenced_list->numDelayed) {
-      _glthread_UNLOCK_MUTEX(fenced_list->mutex);
+      pipe_mutex_unlock(fenced_list->mutex);
 #if defined(PIPE_OS_LINUX)
       sched_yield();
 #endif
       _fenced_buffer_list_check_free(fenced_list, 1);
-      _glthread_LOCK_MUTEX(fenced_list->mutex);
+      pipe_mutex_lock(fenced_list->mutex);
    }
 
-   _glthread_UNLOCK_MUTEX(fenced_list->mutex);
+   pipe_mutex_unlock(fenced_list->mutex);
    
    FREE(fenced_list);
 }
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
index b914c2d0fe..e2b8fe0f98 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
@@ -79,7 +79,7 @@ struct pb_cache_manager
    struct pb_manager *provider;
    unsigned usecs;
    
-   _glthread_Mutex mutex;
+   pipe_mutex mutex;
    
    struct list_head delayed;
    size_t numDelayed;
@@ -153,7 +153,7 @@ pb_cache_buffer_destroy(struct pb_buffer *_buf)
    struct pb_cache_buffer *buf = pb_cache_buffer(_buf);   
    struct pb_cache_manager *mgr = buf->mgr;
 
-   _glthread_LOCK_MUTEX(mgr->mutex);
+   pipe_mutex_lock(mgr->mutex);
    assert(buf->base.base.refcount == 0);
    
    _pb_cache_buffer_list_check_free(mgr);
@@ -162,7 +162,7 @@ pb_cache_buffer_destroy(struct pb_buffer *_buf)
    util_time_add(&buf->start, mgr->usecs, &buf->end);
    LIST_ADDTAIL(&buf->head, &mgr->delayed);
    ++mgr->numDelayed;
-   _glthread_UNLOCK_MUTEX(mgr->mutex);
+   pipe_mutex_unlock(mgr->mutex);
 }
 
 
@@ -235,7 +235,7 @@ pb_cache_manager_create_buffer(struct pb_manager *_mgr,
    struct list_head *curr, *next;
    struct util_time now;
    
-   _glthread_LOCK_MUTEX(mgr->mutex);
+   pipe_mutex_lock(mgr->mutex);
 
    buf = NULL;
    curr = mgr->delayed.next;
@@ -264,12 +264,12 @@ pb_cache_manager_create_buffer(struct pb_manager *_mgr,
    
    if(buf) {
       LIST_DEL(&buf->head);
-      _glthread_UNLOCK_MUTEX(mgr->mutex);
+      pipe_mutex_unlock(mgr->mutex);
       ++buf->base.base.refcount;
       return &buf->base;
    }
    
-   _glthread_UNLOCK_MUTEX(mgr->mutex);
+   pipe_mutex_unlock(mgr->mutex);
 
    buf = CALLOC_STRUCT(pb_cache_buffer);
    if(!buf)
@@ -305,7 +305,7 @@ pb_cache_flush(struct pb_manager *_mgr)
    struct list_head *curr, *next;
    struct pb_cache_buffer *buf;
 
-   _glthread_LOCK_MUTEX(mgr->mutex);
+   pipe_mutex_lock(mgr->mutex);
    curr = mgr->delayed.next;
    next = curr->next;
    while(curr != &mgr->delayed) {
@@ -314,7 +314,7 @@ pb_cache_flush(struct pb_manager *_mgr)
       curr = next; 
       next = curr->next;
    }
-   _glthread_UNLOCK_MUTEX(mgr->mutex);
+   pipe_mutex_unlock(mgr->mutex);
 }
 
 
@@ -345,7 +345,7 @@ pb_cache_manager_create(struct pb_manager *provider,
    mgr->usecs = usecs;
    LIST_INITHEAD(&mgr->delayed);
    mgr->numDelayed = 0;
-   _glthread_INIT_MUTEX(mgr->mutex);
+   pipe_mutex_init(mgr->mutex);
       
    return &mgr->base;
 }
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
index b40eb6cc90..e8c7f8e1f8 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
@@ -53,7 +53,7 @@ struct mm_pb_manager
 {
    struct pb_manager base;
    
-   _glthread_Mutex mutex;
+   pipe_mutex mutex;
    
    size_t size;
    struct mem_block *heap;
@@ -99,10 +99,10 @@ mm_buffer_destroy(struct pb_buffer *buf)
    
    assert(buf->base.refcount == 0);
    
-   _glthread_LOCK_MUTEX(mm->mutex);
+   pipe_mutex_lock(mm->mutex);
    mmFreeMem(mm_buf->block);
    FREE(buf);
-   _glthread_UNLOCK_MUTEX(mm->mutex);
+   pipe_mutex_unlock(mm->mutex);
 }
 
 
@@ -158,11 +158,11 @@ mm_bufmgr_create_buffer(struct pb_manager *mgr,
    if(desc->alignment % (1 << mm->align2))
       return NULL;
    
-   _glthread_LOCK_MUTEX(mm->mutex);
+   pipe_mutex_lock(mm->mutex);
 
    mm_buf = CALLOC_STRUCT(mm_buffer);
    if (!mm_buf) {
-      _glthread_UNLOCK_MUTEX(mm->mutex);
+      pipe_mutex_unlock(mm->mutex);
       return NULL;
    }
 
@@ -185,7 +185,7 @@ mm_bufmgr_create_buffer(struct pb_manager *mgr,
       mm_buf->block = mmAllocMem(mm->heap, size, mm->align2, 0);
       if(!mm_buf->block) {
          FREE(mm_buf);
-         _glthread_UNLOCK_MUTEX(mm->mutex);
+         pipe_mutex_unlock(mm->mutex);
          return NULL;
       }
    }
@@ -194,7 +194,7 @@ mm_bufmgr_create_buffer(struct pb_manager *mgr,
    assert(0 <= (unsigned)mm_buf->block->ofs && (unsigned)mm_buf->block->ofs < mm->size);
    assert(size <= (unsigned)mm_buf->block->size && (unsigned)mm_buf->block->ofs + (unsigned)mm_buf->block->size <= mm->size);
    
-   _glthread_UNLOCK_MUTEX(mm->mutex);
+   pipe_mutex_unlock(mm->mutex);
    return SUPER(mm_buf);
 }
 
@@ -204,14 +204,14 @@ mm_bufmgr_destroy(struct pb_manager *mgr)
 {
    struct mm_pb_manager *mm = mm_pb_manager(mgr);
    
-   _glthread_LOCK_MUTEX(mm->mutex);
+   pipe_mutex_lock(mm->mutex);
 
    mmDestroy(mm->heap);
    
    pb_unmap(mm->buffer);
    pb_reference(&mm->buffer, NULL);
    
-   _glthread_UNLOCK_MUTEX(mm->mutex);
+   pipe_mutex_unlock(mm->mutex);
    
    FREE(mgr);
 }
@@ -236,7 +236,7 @@ mm_bufmgr_create_from_buffer(struct pb_buffer *buffer,
    mm->size = size;
    mm->align2 = align2; /* 64-byte alignment */
 
-   _glthread_INIT_MUTEX(mm->mutex);
+   pipe_mutex_init(mm->mutex);
 
    mm->buffer = buffer; 
 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
index 93d2cc9635..3ef72c5bbb 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
@@ -56,7 +56,7 @@ struct pool_pb_manager
 {
    struct pb_manager base;
    
-   _glthread_Mutex mutex;
+   pipe_mutex mutex;
    
    size_t bufSize;
    size_t bufAlign;
@@ -110,10 +110,10 @@ pool_buffer_destroy(struct pb_buffer *buf)
    
    assert(pool_buf->base.base.refcount == 0);
 
-   _glthread_LOCK_MUTEX(pool->mutex);
+   pipe_mutex_lock(pool->mutex);
    LIST_ADD(&pool_buf->head, &pool->free);
    pool->numFree++;
-   _glthread_UNLOCK_MUTEX(pool->mutex);
+   pipe_mutex_unlock(pool->mutex);
 }
 
 
@@ -124,9 +124,9 @@ pool_buffer_map(struct pb_buffer *buf, unsigned flags)
    struct pool_pb_manager *pool = pool_buf->mgr;
    void *map;
 
-   _glthread_LOCK_MUTEX(pool->mutex);
+   pipe_mutex_lock(pool->mutex);
    map = (unsigned char *) pool->map + pool_buf->start;
-   _glthread_UNLOCK_MUTEX(pool->mutex);
+   pipe_mutex_unlock(pool->mutex);
    return map;
 }
 
@@ -171,10 +171,10 @@ pool_bufmgr_create_buffer(struct pb_manager *mgr,
    assert(size == pool->bufSize);
    assert(pool->bufAlign % desc->alignment == 0);
    
-   _glthread_LOCK_MUTEX(pool->mutex);
+   pipe_mutex_lock(pool->mutex);
 
    if (pool->numFree == 0) {
-      _glthread_UNLOCK_MUTEX(pool->mutex);
+      pipe_mutex_unlock(pool->mutex);
       debug_printf("warning: out of fixed size buffer objects\n");
       return NULL;
    }
@@ -182,7 +182,7 @@ pool_bufmgr_create_buffer(struct pb_manager *mgr,
    item = pool->free.next;
 
    if (item == &pool->free) {
-      _glthread_UNLOCK_MUTEX(pool->mutex);
+      pipe_mutex_unlock(pool->mutex);
       debug_printf("error: fixed size buffer pool corruption\n");
       return NULL;
    }
@@ -190,7 +190,7 @@ pool_bufmgr_create_buffer(struct pb_manager *mgr,
    LIST_DEL(item);
    --pool->numFree;
 
-   _glthread_UNLOCK_MUTEX(pool->mutex);
+   pipe_mutex_unlock(pool->mutex);
    
    pool_buf = LIST_ENTRY(struct pool_buffer, item, head);
    assert(pool_buf->base.base.refcount == 0);
@@ -206,14 +206,14 @@ static void
 pool_bufmgr_destroy(struct pb_manager *mgr)
 {
    struct pool_pb_manager *pool = pool_pb_manager(mgr);
-   _glthread_LOCK_MUTEX(pool->mutex);
+   pipe_mutex_lock(pool->mutex);
 
    FREE(pool->bufs);
    
    pb_unmap(pool->buffer);
    pb_reference(&pool->buffer, NULL);
    
-   _glthread_UNLOCK_MUTEX(pool->mutex);
+   pipe_mutex_unlock(pool->mutex);
    
    FREE(mgr);
 }
@@ -246,7 +246,7 @@ pool_bufmgr_create(struct pb_manager *provider,
    pool->bufSize = bufSize;
    pool->bufAlign = desc->alignment; 
    
-   _glthread_INIT_MUTEX(pool->mutex);
+   pipe_mutex_init(pool->mutex);
 
    pool->buffer = provider->create_buffer(provider, numBufs*bufSize, desc); 
    if (!pool->buffer)
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
index af307e265a..ac0296b26a 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
@@ -57,7 +57,7 @@ struct pb_slab_buffer
    struct list_head head;
    unsigned mapCount;
    size_t start;
-   _glthread_Cond event;
+   pipe_condvar event;
 };
 
 struct pb_slab
@@ -85,7 +85,7 @@ struct pb_slab_manager
    struct list_head slabs;
    struct list_head freeSlabs;
    
-   _glthread_Mutex mutex;
+   pipe_mutex mutex;
 };
 
 /**
@@ -143,7 +143,7 @@ pb_slab_buffer_destroy(struct pb_buffer *_buf)
    struct pb_slab_manager *mgr = slab->mgr;
    struct list_head *list = &buf->head;
 
-   _glthread_LOCK_MUTEX(mgr->mutex);
+   pipe_mutex_lock(mgr->mutex);
    
    assert(buf->base.base.refcount == 0);
    
@@ -179,7 +179,7 @@ pb_slab_buffer_destroy(struct pb_buffer *_buf)
       }
    }
    
-   _glthread_UNLOCK_MUTEX(mgr->mutex);
+   pipe_mutex_unlock(mgr->mutex);
 }
 
 
@@ -201,7 +201,7 @@ pb_slab_buffer_unmap(struct pb_buffer *_buf)
 
    --buf->mapCount;
    if (buf->mapCount == 0) 
-       _glthread_COND_BROADCAST(buf->event);
+       pipe_condvar_broadcast(buf->event);
 }
 
 
@@ -283,7 +283,7 @@ pb_slab_create(struct pb_slab_manager *mgr)
       buf->slab = slab;
       buf->start = i* mgr->bufSize;
       buf->mapCount = 0;
-      _glthread_INIT_COND(buf->event);
+      pipe_condvar_init(buf->event);
       LIST_ADDTAIL(&buf->head, &slab->freeBuffers);
       slab->numFree++;
       buf++;
@@ -328,11 +328,11 @@ pb_slab_manager_create_buffer(struct pb_manager *_mgr,
    if(!pb_check_usage(desc->usage, mgr->desc.usage))
       return NULL;
 
-   _glthread_LOCK_MUTEX(mgr->mutex);
+   pipe_mutex_lock(mgr->mutex);
    if (mgr->slabs.next == &mgr->slabs) {
       (void) pb_slab_create(mgr);
       if (mgr->slabs.next == &mgr->slabs) {
-	 _glthread_UNLOCK_MUTEX(mgr->mutex);
+	 pipe_mutex_unlock(mgr->mutex);
 	 return NULL;
       }
    }
@@ -344,7 +344,7 @@ pb_slab_manager_create_buffer(struct pb_manager *_mgr,
    list = slab->freeBuffers.next;
    LIST_DELINIT(list);
 
-   _glthread_UNLOCK_MUTEX(mgr->mutex);
+   pipe_mutex_unlock(mgr->mutex);
    buf = LIST_ENTRY(struct pb_slab_buffer, list, head);
    
    ++buf->base.base.refcount;
@@ -388,7 +388,7 @@ pb_slab_manager_create(struct pb_manager *provider,
    LIST_INITHEAD(&mgr->slabs);
    LIST_INITHEAD(&mgr->freeSlabs);
    
-   _glthread_INIT_MUTEX(mgr->mutex);
+   pipe_mutex_init(mgr->mutex);
 
    return &mgr->base;
 }
diff --git a/src/gallium/auxiliary/rtasm/rtasm_execmem.c b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
index dfa5c35ab6..19087589a8 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_execmem.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
@@ -47,11 +47,12 @@
 
 #include <unistd.h>
 #include <sys/mman.h>
+#include "pipe/p_thread.h"
 #include "util/u_mm.h"
 
 #define EXEC_HEAP_SIZE (10*1024*1024)
 
-_glthread_DECLARE_STATIC_MUTEX(exec_mutex);
+pipe_static_mutex(exec_mutex);
 
 static struct mem_block *exec_heap = NULL;
 static unsigned char *exec_mem = NULL;
@@ -76,7 +77,7 @@ rtasm_exec_malloc(size_t size)
    struct mem_block *block = NULL;
    void *addr = NULL;
 
-   _glthread_LOCK_MUTEX(exec_mutex);
+   pipe_mutex_lock(exec_mutex);
 
    init_heap();
 
@@ -90,7 +91,7 @@ rtasm_exec_malloc(size_t size)
    else 
       debug_printf("rtasm_exec_malloc failed\n");
    
-   _glthread_UNLOCK_MUTEX(exec_mutex);
+   pipe_mutex_unlock(exec_mutex);
    
    return addr;
 }
@@ -99,7 +100,7 @@ rtasm_exec_malloc(size_t size)
 void 
 rtasm_exec_free(void *addr)
 {
-   _glthread_LOCK_MUTEX(exec_mutex);
+   pipe_mutex_lock(exec_mutex);
 
    if (exec_heap) {
       struct mem_block *block = mmFindBlock(exec_heap, (unsigned char *)addr - exec_mem);
@@ -108,7 +109,7 @@ rtasm_exec_free(void *addr)
 	 mmFreeMem(block);
    }
 
-   _glthread_UNLOCK_MUTEX(exec_mutex);
+   pipe_mutex_unlock(exec_mutex);
 }
 
 
diff --git a/src/gallium/include/pipe/p_thread.h b/src/gallium/include/pipe/p_thread.h
index 6e526b7aa8..4e6f7cbb44 100644
--- a/src/gallium/include/pipe/p_thread.h
+++ b/src/gallium/include/pipe/p_thread.h
@@ -23,307 +23,199 @@
  * 
  **************************************************************************/
 
+
 /**
- * @file
- * Thread
- *
- * Initial version by John Stone (j.stone@acm.org) (johns@cs.umr.edu)
- *                and Christoph Poliwoda (poliwoda@volumegraphics.com)
- * Revised by Keith Whitwell
- * Adapted for new gl dispatcher by Brian Paul
- *
- *
- *
- * DOCUMENTATION
- *
- * This thread module exports the following types:
- *   _glthread_TSD     Thread-specific data area
- *   _glthread_Thread  Thread datatype
- *   _glthread_Mutex   Mutual exclusion lock
- *
- * Macros:
- *   _glthread_DECLARE_STATIC_MUTEX(name)   Declare a non-local mutex
- *   _glthread_INIT_MUTEX(name)             Initialize a mutex
- *   _glthread_LOCK_MUTEX(name)             Lock a mutex
- *   _glthread_UNLOCK_MUTEX(name)           Unlock a mutex
- *
- * Functions:
- *   _glthread_GetID(v)      Get integer thread ID
- *   _glthread_InitTSD()     Initialize thread-specific data
- *   _glthread_GetTSD()      Get thread-specific data
- *   _glthread_SetTSD()      Set thread-specific data
- *
- * If this file is accidentally included by a non-threaded build,
- * it should not cause the build to fail, or otherwise cause problems.
- * In general, it should only be included when needed however.
+ * Thread, mutex, condition var and thread-specific data functions.
  */
 
-#ifndef _P_THREAD_H_
-#define _P_THREAD_H_
 
+#ifndef _P_THREAD2_H_
+#define _P_THREAD2_H_
 
-#if (defined(PTHREADS) || defined(SOLARIS_THREADS) ||\
-     defined(WIN32_THREADS) || defined(USE_XTHREADS) || defined(BEOS_THREADS)) \
-    && !defined(THREADS)
-# define THREADS
-#endif
 
-#ifdef VMS
-#include <GL/vms_x_fix.h>
-#endif
+#include "pipe/p_compiler.h"
 
-/*
- * POSIX threads. This should be your choice in the Unix world
- * whenever possible.  When building with POSIX threads, be sure
- * to enable any compiler flags which will cause the MT-safe
- * libc (if one exists) to be used when linking, as well as any
- * header macros for MT-safe errno, etc.  For Solaris, this is the -mt
- * compiler flag.  On Solaris with gcc, use -D_REENTRANT to enable
- * proper compiling for MT-safe libc etc.
- */
-#if defined(PTHREADS)
-#include <pthread.h> /* POSIX threads headers */
 
-typedef struct {
-   pthread_key_t  key;
-   int initMagic;
-} _glthread_TSD;
+#if defined(PIPE_OS_LINUX)
 
-typedef pthread_t _glthread_Thread;
+#include <pthread.h> /* POSIX threads headers */
+#include <stdio.h> /* for perror() */
 
-typedef pthread_mutex_t _glthread_Mutex;
 
-#define _glthread_DECLARE_STATIC_MUTEX(name) \
-   static _glthread_Mutex name = PTHREAD_MUTEX_INITIALIZER
+typedef pthread_t pipe_thread;
+typedef pthread_mutex_t pipe_mutex;
+typedef pthread_cond_t pipe_condvar;
 
-#define _glthread_INIT_MUTEX(name) \
-   pthread_mutex_init(&(name), NULL)
+#define pipe_static_mutex(mutex) \
+   static pipe_mutex mutex = PTHREAD_MUTEX_INITIALIZER
 
-#define _glthread_DESTROY_MUTEX(name) \
-   pthread_mutex_destroy(&(name))
+#define pipe_mutex_init(mutex) \
+   pthread_mutex_init(&(mutex), NULL)
 
-#define _glthread_LOCK_MUTEX(name) \
-   (void) pthread_mutex_lock(&(name))
+#define pipe_mutex_destroy(mutex) \
+   pthread_mutex_destroy(&(mutex))
 
-#define _glthread_UNLOCK_MUTEX(name) \
-   (void) pthread_mutex_unlock(&(name))
+#define pipe_mutex_lock(mutex) \
+   (void) pthread_mutex_lock(&(mutex))
 
-typedef pthread_cond_t _glthread_Cond;
+#define pipe_mutex_unlock(mutex) \
+   (void) pthread_mutex_unlock(&(mutex))
 
-#define _glthread_DECLARE_STATIC_COND(name) \
-   static _glthread_Cond name = PTHREAD_COND_INITIALIZER
+#define pipe_static_condvar(condvar) \
+   static pipe_condvar condvar = PTHREAD_COND_INITIALIZER
 
-#define _glthread_INIT_COND(cond)			\
+#define pipe_condvar_init(cond)	\
    pthread_cond_init(&(cond), NULL)
 
-#define _glthread_DESTROY_COND(name) \
-   pthread_cond_destroy(&(name))
+#define pipe_condvar_destroy(cond) \
+   pthread_cond_destroy(&(cond))
 
-#define _glthread_COND_WAIT(cond, mutex) \
+#define pipe_condvar_wait(cond, mutex) \
   pthread_cond_wait(&(cond), &(mutex))
 
-#define _glthread_COND_SIGNAL(cond) \
+#define pipe_condvar_signal(cond) \
   pthread_cond_signal(&(cond))
 
-#define _glthread_COND_BROADCAST(cond) \
+#define pipe_condvar_broadcast(cond) \
   pthread_cond_broadcast(&(cond))
 
-#endif /* PTHREADS */
-
-
 
+#elif defined(PIPE_OS_WINDOWS)
 
-/*
- * Solaris threads. Use only up to Solaris 2.4.
- * Solaris 2.5 and higher provide POSIX threads.
- * Be sure to compile with -mt on the Solaris compilers, or
- * use -D_REENTRANT if using gcc.
- */
-#ifdef SOLARIS_THREADS
-#include <thread.h>
-
-typedef struct {
-   thread_key_t key;
-   mutex_t      keylock;
-   int          initMagic;
-} _glthread_TSD;
-
-typedef thread_t _glthread_Thread;
-
-typedef mutex_t _glthread_Mutex;
-
-/* XXX need to really implement mutex-related macros */
-#define _glthread_DECLARE_STATIC_MUTEX(name)  static _glthread_Mutex name = 0
-#define _glthread_INIT_MUTEX(name)  (void) name
-#define _glthread_DESTROY_MUTEX(name) (void) name
-#define _glthread_LOCK_MUTEX(name)  (void) name
-#define _glthread_UNLOCK_MUTEX(name)  (void) name
-
-#endif /* SOLARIS_THREADS */
-
-
-
-
-/*
- * Windows threads. Should work with Windows NT and 95.
- * IMPORTANT: Link with multithreaded runtime library when THREADS are
- * used!
- */
-#ifdef WIN32_THREADS
 #include <windows.h>
 
-typedef struct {
-   DWORD key;
-   int   initMagic;
-} _glthread_TSD;
-
-typedef HANDLE _glthread_Thread;
+typedef HANDLE pipe_thread;
+typedef CRITICAL_SECTION pipe_mutex;
 
-typedef CRITICAL_SECTION _glthread_Mutex;
+#define pipe_static_mutex(name) \
+   /*static*/ pipe_mutex name = {0,0,0,0,0,0}
 
-#define _glthread_DECLARE_STATIC_MUTEX(name)  /*static*/ _glthread_Mutex name = {0,0,0,0,0,0}
-#define _glthread_INIT_MUTEX(name)  InitializeCriticalSection(&name)
-#define _glthread_DESTROY_MUTEX(name)  DeleteCriticalSection(&name)
-#define _glthread_LOCK_MUTEX(name)  EnterCriticalSection(&name)
-#define _glthread_UNLOCK_MUTEX(name)  LeaveCriticalSection(&name)
+#define pipe_mutex_init(name) \
+   InitializeCriticalSection(&(name))
 
-#endif /* WIN32_THREADS */
+#define pipe_mutex_destroy(name) \
+   DeleteCriticalSection(&(name))
 
+#define pipe_mutex_lock(name) \
+   EnterCriticalSection(&(name))
 
+#define pipe_mutex_unlock(name) \
+   LeaveCriticalSection(&(name))
 
 
-/*
- * XFree86 has its own thread wrapper, Xthreads.h
- * We wrap it again for GL.
- */
-#ifdef USE_XTHREADS
-#include <X11/Xthreads.h>
-
-typedef struct {
-   xthread_key_t key;
-   int initMagic;
-} _glthread_TSD;
-
-typedef xthread_t _glthread_Thread;
-
-typedef xmutex_rec _glthread_Mutex;
-
-#ifdef XMUTEX_INITIALIZER
-#define _glthread_DECLARE_STATIC_MUTEX(name) \
-   static _glthread_Mutex name = XMUTEX_INITIALIZER
 #else
-#define _glthread_DECLARE_STATIC_MUTEX(name) \
-   static _glthread_Mutex name
-#endif
-
-#define _glthread_INIT_MUTEX(name) \
-   xmutex_init(&(name))
-
-#define _glthread_DESTROY_MUTEX(name) \
-   xmutex_clear(&(name))
-
-#define _glthread_LOCK_MUTEX(name) \
-   (void) xmutex_lock(&(name))
-
-#define _glthread_UNLOCK_MUTEX(name) \
-   (void) xmutex_unlock(&(name))
-
-#endif /* USE_XTHREADS */
-
-
-
-/*
- * BeOS threads. R5.x required.
- */
-#ifdef BEOS_THREADS
-
-#include <kernel/OS.h>
-#include <support/TLS.h>
-
-typedef struct {
-   int32        key;
-   int          initMagic;
-} _glthread_TSD;
-
-typedef thread_id _glthread_Thread;
-
-/* Use Benaphore, aka speeder semaphore */
-typedef struct {
-    int32   lock;
-    sem_id  sem;
-} benaphore;
-typedef benaphore _glthread_Mutex;
 
-#define _glthread_DECLARE_STATIC_MUTEX(name)  static _glthread_Mutex name = { 0, 0 }
-#define _glthread_INIT_MUTEX(name)    	name.sem = create_sem(0, #name"_benaphore"), name.lock = 0
-#define _glthread_DESTROY_MUTEX(name) 	delete_sem(name.sem), name.lock = 0
-#define _glthread_LOCK_MUTEX(name)    	if (name.sem == 0) _glthread_INIT_MUTEX(name); \
-									  	if (atomic_add(&(name.lock), 1) >= 1) acquire_sem(name.sem)
-#define _glthread_UNLOCK_MUTEX(name)  	if (atomic_add(&(name.lock), -1) > 1) release_sem(name.sem)
+/** Dummy definitions */
 
-#endif /* BEOS_THREADS */
+typedef unsigned pipe_thread;
+typedef unsigned pipe_mutex;
+typedef unsigned pipe_condvar;
+/* pipe_tsd is declared once, for every platform, in the thread-specific data section. */
 
+#define pipe_static_mutex(mutex) \
+   static pipe_mutex mutex = 0
 
+#define pipe_mutex_init(mutex) \
+   (void) mutex
 
-#ifndef THREADS
+#define pipe_mutex_destroy(mutex) \
+   (void) mutex
 
-/*
- * THREADS not defined
- */
-
-typedef unsigned _glthread_TSD;
-
-typedef unsigned _glthread_Thread;
-
-typedef unsigned _glthread_Mutex;
+#define pipe_mutex_lock(mutex) \
+   (void) mutex
 
-#define _glthread_DECLARE_STATIC_MUTEX(name)  static _glthread_Mutex name = 0
+#define pipe_mutex_unlock(mutex) \
+   (void) mutex
 
-#define _glthread_INIT_MUTEX(name)  (void) name
+#define pipe_static_condvar(condvar) \
+   static pipe_condvar condvar = 0
 
-#define _glthread_DESTROY_MUTEX(name)  (void) name
+#define pipe_condvar_init(condvar) \
+   (void) condvar
 
-#define _glthread_LOCK_MUTEX(name)  (void) name
+#define pipe_condvar_destroy(condvar) \
+   (void) condvar
 
-#define _glthread_UNLOCK_MUTEX(name)  (void) name
+#define pipe_condvar_wait(condvar, mutex) \
+   (void) condvar
 
-typedef unsigned _glthread_Cond;
+#define pipe_condvar_signal(condvar) \
+   (void) condvar
 
-#define _glthread_DECLARE_STATIC_COND(name) static _glthread_Cond name = 0
+#define pipe_condvar_broadcast(condvar) \
+   (void) condvar
 
-#define _glthread_INIT_COND(name)  (void) name
 
-#define _glthread_DESTROY_COND(name)  (void) name
-
-#define _glthread_COND_WAIT(name, mutex)  (void) name
-
-#define _glthread_COND_SIGNAL(name)  (void) name
-
-#define _glthread_COND_BROADCAST(name)  (void) name
-
-#endif /* THREADS */
+#endif  /* PIPE_OS_? */
 
 
 /*
- * Platform independent thread specific data API.
+ * Thread-specific data.
  */
 
-extern unsigned long
-_glthread_GetID(void);
-
-
-extern void
-_glthread_InitTSD(_glthread_TSD *);
+typedef struct {
+#if defined(PIPE_OS_LINUX)
+   pthread_key_t key;
+#elif defined(PIPE_OS_WINDOWS)
+   DWORD key;
+#endif
+   int initMagic;
+} pipe_tsd;
 
 
-extern void *
-_glthread_GetTSD(_glthread_TSD *);
+#define PIPE_TSD_INIT_MAGIC 0xff8adc98
 
 
-extern void
-_glthread_SetTSD(_glthread_TSD *, void *);
+static INLINE void
+pipe_tsd_init(pipe_tsd *tsd)   /* create the key (Linux only) and mark tsd as initialized */
+{
+#if defined(PIPE_OS_LINUX)
+   if (pthread_key_create(&tsd->key, NULL/*free*/) != 0) {
+      perror("pthread_key_create(): failed to allocate key for thread specific data");
+      exit(-1);
+   }
+#elif defined(PIPE_OS_WINDOWS)
+   assert(0);   /* no Windows implementation here */
+#endif
+   tsd->initMagic = (int) PIPE_TSD_INIT_MAGIC;   /* same cast as the checks in pipe_tsd_get/set */
+}
+
+static INLINE void *
+pipe_tsd_get(pipe_tsd *tsd)   /* return the value stored under tsd's key */
+{
+   if (tsd->initMagic != (int) PIPE_TSD_INIT_MAGIC) {   /* not initialized yet */
+      pipe_tsd_init(tsd);   /* NOTE(review): this lazy init takes no lock - confirm callers serialize first use */
+   }
+#if defined(PIPE_OS_LINUX)
+   return pthread_getspecific(tsd->key);
+#elif defined(PIPE_OS_WINDOWS)
+   assert(0);   /* no Windows implementation here */
+   return NULL;
+#else
+   assert(0);   /* no implementation for other platforms */
+   return NULL;
+#endif
+}
+
+static INLINE void
+pipe_tsd_set(pipe_tsd *tsd, void *value)   /* store value under tsd's key; exits the process on failure */
+{
+   if (tsd->initMagic != (int) PIPE_TSD_INIT_MAGIC) {
+      pipe_tsd_init(tsd);   /* first use: create the key */
+   }
+#if defined(PIPE_OS_LINUX)
+   if (pthread_setspecific(tsd->key, value) != 0) {
+      perror("pthread_setspecific() failed");
+      exit(-1);
+   }
+#elif defined(PIPE_OS_WINDOWS)
+   assert(0);   /* no Windows implementation here */
+#else
+   assert(0);   /* no implementation for other platforms */
+#endif
+}
 
 
-#endif /* _P_THREAD_H_ */
+#endif /* _P_THREAD2_H_ */
diff --git a/src/gallium/winsys/drm/intel/common/ws_dri_bufmgr.c b/src/gallium/winsys/drm/intel/common/ws_dri_bufmgr.c
index b6d901f85e..499f7bef8f 100644
--- a/src/gallium/winsys/drm/intel/common/ws_dri_bufmgr.c
+++ b/src/gallium/winsys/drm/intel/common/ws_dri_bufmgr.c
@@ -33,7 +33,7 @@
 #include <xf86drm.h>
 #include <stdlib.h>
 #include <stdio.h>
-#include "glthread.h"
+#include "pipe/p_thread.h"
 #include "errno.h"
 #include "ws_dri_bufmgr.h"
 #include "string.h"
@@ -51,8 +51,8 @@
  * driBufferObject mutex - > this rw lock.
  */
 
-_glthread_DECLARE_STATIC_MUTEX(bmMutex);
-_glthread_DECLARE_STATIC_COND(bmCond);
+pipe_static_mutex(bmMutex);
+pipe_static_condvar(bmCond);
 
 static int kernelReaders = 0;
 static int num_buffers = 0;
@@ -241,29 +241,29 @@ static int drmBOResetList(drmBOList *list)
 
 void driWriteLockKernelBO(void)
 {
-    _glthread_LOCK_MUTEX(bmMutex);
+    pipe_mutex_lock(bmMutex);
     while(kernelReaders != 0)
-	_glthread_COND_WAIT(bmCond, bmMutex);
+	pipe_condvar_wait(bmCond, bmMutex);
 }
 
 void driWriteUnlockKernelBO(void)
 {
-    _glthread_UNLOCK_MUTEX(bmMutex);
+    pipe_mutex_unlock(bmMutex);
 }
 
 void driReadLockKernelBO(void)
 {
-    _glthread_LOCK_MUTEX(bmMutex);
+    pipe_mutex_lock(bmMutex);
     kernelReaders++;
-    _glthread_UNLOCK_MUTEX(bmMutex);
+    pipe_mutex_unlock(bmMutex);
 }
 
 void driReadUnlockKernelBO(void)
 {
-    _glthread_LOCK_MUTEX(bmMutex);
+    pipe_mutex_lock(bmMutex);
     if (--kernelReaders == 0)
-        _glthread_COND_BROADCAST(bmCond);
-    _glthread_UNLOCK_MUTEX(bmMutex);
+       pipe_condvar_broadcast(bmCond);
+    pipe_mutex_unlock(bmMutex);
 }
 
 
@@ -277,7 +277,7 @@ void driReadUnlockKernelBO(void)
 typedef struct _DriBufferObject
 {
    DriBufferPool *pool;
-   _glthread_Mutex mutex;
+   pipe_mutex mutex;
    int refCount;
    const char *name;
    uint64_t flags;
@@ -318,12 +318,12 @@ driBOKernel(struct _DriBufferObject *buf)
    drmBO *ret;
 
    driReadLockKernelBO();
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    assert(buf->private != NULL);
    ret = buf->pool->kernel(buf->pool, buf->private);
    if (!ret)
       BM_CKFATAL(-EINVAL);
-   _glthread_UNLOCK_MUTEX(buf->mutex);
+   pipe_mutex_unlock(buf->mutex);
    driReadUnlockKernelBO();
 
    return ret;
@@ -338,9 +338,9 @@ driBOWaitIdle(struct _DriBufferObject *buf, int lazy)
    * that time??
    */
 
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    BM_CKFATAL(buf->pool->waitIdle(buf->pool, buf->private, &buf->mutex, lazy));
-   _glthread_UNLOCK_MUTEX(buf->mutex);
+   pipe_mutex_unlock(buf->mutex);
 }
 
 void *
@@ -353,11 +353,11 @@ driBOMap(struct _DriBufferObject *buf, unsigned flags, unsigned hint)
       return buf->userData;
    }
 
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    assert(buf->private != NULL);
    retval = buf->pool->map(buf->pool, buf->private, flags, hint,
 			   &buf->mutex, &virtual);
-   _glthread_UNLOCK_MUTEX(buf->mutex);
+   pipe_mutex_unlock(buf->mutex);
 
    return retval == 0 ? virtual : NULL;
 }
@@ -369,9 +369,9 @@ driBOUnmap(struct _DriBufferObject *buf)
       return;
 
    assert(buf->private != NULL);
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    BM_CKFATAL(buf->pool->unmap(buf->pool, buf->private));
-   _glthread_UNLOCK_MUTEX(buf->mutex);
+   pipe_mutex_unlock(buf->mutex);
 }
 
 unsigned long
@@ -381,9 +381,9 @@ driBOOffset(struct _DriBufferObject *buf)
 
    assert(buf->private != NULL);
 
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    ret = buf->pool->offset(buf->pool, buf->private);
-   _glthread_UNLOCK_MUTEX(buf->mutex);
+   pipe_mutex_unlock(buf->mutex);
    return ret;
 }
 
@@ -394,9 +394,9 @@ driBOPoolOffset(struct _DriBufferObject *buf)
 
    assert(buf->private != NULL);
 
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    ret = buf->pool->poolOffset(buf->pool, buf->private);
-   _glthread_UNLOCK_MUTEX(buf->mutex);
+   pipe_mutex_unlock(buf->mutex);
    return ret;
 }
 
@@ -408,9 +408,9 @@ driBOFlags(struct _DriBufferObject *buf)
    assert(buf->private != NULL);
 
    driReadLockKernelBO();
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    ret = buf->pool->flags(buf->pool, buf->private);
-   _glthread_UNLOCK_MUTEX(buf->mutex);
+   pipe_mutex_unlock(buf->mutex);
    driReadUnlockKernelBO();
    return ret;
 }
@@ -418,12 +418,12 @@ driBOFlags(struct _DriBufferObject *buf)
 struct _DriBufferObject *
 driBOReference(struct _DriBufferObject *buf)
 {
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    if (++buf->refCount == 1) {
-      _glthread_UNLOCK_MUTEX(buf->mutex);
+      pipe_mutex_unlock(buf->mutex);
       BM_CKFATAL(-EINVAL);
    }
-   _glthread_UNLOCK_MUTEX(buf->mutex);
+   pipe_mutex_unlock(buf->mutex);
    return buf;
 }
 
@@ -435,10 +435,10 @@ driBOUnReference(struct _DriBufferObject *buf)
    if (!buf)
       return;
 
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    tmp = --buf->refCount;
    if (!tmp) {
-      _glthread_UNLOCK_MUTEX(buf->mutex);
+      pipe_mutex_unlock(buf->mutex);
       if (buf->private) {
 	 if (buf->createdByReference)
 	    buf->pool->unreference(buf->pool, buf->private);
@@ -451,7 +451,7 @@ driBOUnReference(struct _DriBufferObject *buf)
 	 num_buffers--;
       free(buf);
    } else
-     _glthread_UNLOCK_MUTEX(buf->mutex);
+     pipe_mutex_unlock(buf->mutex);
 
 }
 
@@ -469,7 +469,7 @@ driBOData(struct _DriBufferObject *buf,
 
    assert(!buf->userBuffer); /* XXX just do a memcpy? */
 
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    pool = buf->pool;
 
    if (pool == NULL && newPool != NULL) {
@@ -556,7 +556,7 @@ driBOData(struct _DriBufferObject *buf,
    }
 
  out:
-   _glthread_UNLOCK_MUTEX(buf->mutex);
+   pipe_mutex_unlock(buf->mutex);
 
    return retval;
 }
@@ -569,7 +569,7 @@ driBOSubData(struct _DriBufferObject *buf,
 
    assert(!buf->userBuffer); /* XXX just do a memcpy? */
 
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    if (size && data) {
       BM_CKFATAL(buf->pool->map(buf->pool, buf->private,
                                 DRM_BO_FLAG_WRITE, 0, &buf->mutex,
@@ -577,7 +577,7 @@ driBOSubData(struct _DriBufferObject *buf,
       memcpy((unsigned char *) virtual + offset, data, size);
       BM_CKFATAL(buf->pool->unmap(buf->pool, buf->private));
    }
-   _glthread_UNLOCK_MUTEX(buf->mutex);
+   pipe_mutex_unlock(buf->mutex);
 }
 
 void
@@ -588,21 +588,21 @@ driBOGetSubData(struct _DriBufferObject *buf,
 
    assert(!buf->userBuffer); /* XXX just do a memcpy? */
 
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    if (size && data) {
       BM_CKFATAL(buf->pool->map(buf->pool, buf->private,
                                 DRM_BO_FLAG_READ, 0, &buf->mutex, &virtual));
       memcpy(data, (unsigned char *) virtual + offset, size);
       BM_CKFATAL(buf->pool->unmap(buf->pool, buf->private));
    }
-   _glthread_UNLOCK_MUTEX(buf->mutex);
+   pipe_mutex_unlock(buf->mutex);
 }
 
 void
 driBOSetReferenced(struct _DriBufferObject *buf,
 		   unsigned long handle)
 {
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    if (buf->private != NULL) {
       assert((size_t)"Invalid buffer for setReferenced\n" & 0);
       BM_CKFATAL(-EINVAL);
@@ -619,7 +619,7 @@ driBOSetReferenced(struct _DriBufferObject *buf,
    }
    buf->createdByReference = TRUE;
    buf->flags = buf->pool->kernel(buf->pool, buf->private)->flags;
-   _glthread_UNLOCK_MUTEX(buf->mutex);
+   pipe_mutex_unlock(buf->mutex);
 }
 
 int
@@ -644,8 +644,8 @@ driGenBuffers(struct _DriBufferPool *pool,
       if (!buf)
 	 return -ENOMEM;
 
-      _glthread_INIT_MUTEX(buf->mutex);
-      _glthread_LOCK_MUTEX(buf->mutex);
+      pipe_mutex_init(buf->mutex);
+      pipe_mutex_lock(buf->mutex);
       buf->refCount = 1;
       buf->flags = flags;
       buf->hint = hint;
@@ -653,7 +653,7 @@ driGenBuffers(struct _DriBufferPool *pool,
       buf->alignment = alignment;
       buf->pool = pool;
       buf->createdByReference = 0;
-      _glthread_UNLOCK_MUTEX(buf->mutex);
+      pipe_mutex_unlock(buf->mutex);
       buffers[i] = buf;
    }
    return 0;
@@ -818,7 +818,7 @@ driBOAddListItem(struct _DriBufferList * list, struct _DriBufferObject *buf,
 {
    int newItem;
 
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    BM_CKFATAL(driAddValidateItem(&list->drmBuffers,
 				 buf->pool->kernel(buf->pool, buf->private),
                                  flags, mask, itemLoc, node));
@@ -827,7 +827,7 @@ driBOAddListItem(struct _DriBufferList * list, struct _DriBufferObject *buf,
    if (newItem)
      buf->refCount++;
 
-   _glthread_UNLOCK_MUTEX(buf->mutex);
+   pipe_mutex_unlock(buf->mutex);
 }
 
 drmBOList *driGetdrmBOList(struct _DriBufferList *list)
@@ -845,10 +845,10 @@ void driPutdrmBOList(struct _DriBufferList *list)
 void
 driBOFence(struct _DriBufferObject *buf, struct _DriFenceObject *fence)
 {
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    if (buf->pool->fence)
        BM_CKFATAL(buf->pool->fence(buf->pool, buf->private, fence));
-   _glthread_UNLOCK_MUTEX(buf->mutex);
+   pipe_mutex_unlock(buf->mutex);
 
 }
 
@@ -908,10 +908,10 @@ driBOValidateUserList(struct _DriBufferList * list)
 
     while (curBuf) {
 	buf = (struct _DriBufferObject *) drmBOListBuf(curBuf);
-	_glthread_LOCK_MUTEX(buf->mutex);
+	pipe_mutex_lock(buf->mutex);
 	if (buf->pool->validate)
 	    BM_CKFATAL(buf->pool->validate(buf->pool, buf->private, &buf->mutex));
-	_glthread_UNLOCK_MUTEX(buf->mutex);
+	pipe_mutex_unlock(buf->mutex);
 	curBuf = drmBOListNext(&list->driBuffers, curBuf);
     }
 }
@@ -929,9 +929,9 @@ driBOSize(struct _DriBufferObject *buf)
 {
   unsigned long size;
 
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    size = buf->pool->size(buf->pool, buf->private);
-   _glthread_UNLOCK_MUTEX(buf->mutex);
+   pipe_mutex_unlock(buf->mutex);
 
   return size;
 
diff --git a/src/gallium/winsys/drm/intel/common/ws_dri_bufpool.h b/src/gallium/winsys/drm/intel/common/ws_dri_bufpool.h
index bf60798924..ad3b6f3931 100644
--- a/src/gallium/winsys/drm/intel/common/ws_dri_bufpool.h
+++ b/src/gallium/winsys/drm/intel/common/ws_dri_bufpool.h
@@ -33,14 +33,14 @@
 #define _PSB_BUFPOOL_H_
 
 #include <xf86drm.h>
-#include <glthread.h>
+#include "pipe/p_thread.h"
 struct _DriFenceObject;
 
 typedef struct _DriBufferPool
 {
    int fd;
    int (*map) (struct _DriBufferPool * pool, void *private,
-               unsigned flags, int hint, _glthread_Mutex *mutex,
+               unsigned flags, int hint, pipe_mutex *mutex,
 	       void **virtual);
    int (*unmap) (struct _DriBufferPool * pool, void *private);
    int (*destroy) (struct _DriBufferPool * pool, void *private);
@@ -55,8 +55,8 @@ typedef struct _DriBufferPool
    int (*fence) (struct _DriBufferPool * pool, void *private,
                  struct _DriFenceObject * fence);
    drmBO *(*kernel) (struct _DriBufferPool * pool, void *private);
-   int (*validate) (struct _DriBufferPool * pool, void *private, _glthread_Mutex *mutex);
-   int (*waitIdle) (struct _DriBufferPool *pool, void *private, _glthread_Mutex *mutex,
+   int (*validate) (struct _DriBufferPool * pool, void *private, pipe_mutex *mutex);
+   int (*waitIdle) (struct _DriBufferPool *pool, void *private, pipe_mutex *mutex,
 		    int lazy);
    int (*setStatus)  (struct _DriBufferPool *pool, void *private,
 		      uint64_t flag_diff, uint64_t old_flags);
diff --git a/src/gallium/winsys/drm/intel/common/ws_dri_drmpool.c b/src/gallium/winsys/drm/intel/common/ws_dri_drmpool.c
index 40929efa2f..54618b1c82 100644
--- a/src/gallium/winsys/drm/intel/common/ws_dri_drmpool.c
+++ b/src/gallium/winsys/drm/intel/common/ws_dri_drmpool.c
@@ -113,7 +113,7 @@ pool_unreference(struct _DriBufferPool *pool, void *private)
 
 static int
 pool_map(struct _DriBufferPool *pool, void *private, unsigned flags,
-         int hint, _glthread_Mutex *mutex, void **virtual)
+         int hint, pipe_mutex *mutex, void **virtual)
 {
    drmBO *buf = (drmBO *) private;
    int ret;
@@ -202,7 +202,7 @@ pool_kernel(struct _DriBufferPool *pool, void *private)
 }
 
 static int
-pool_waitIdle(struct _DriBufferPool *pool, void *private, _glthread_Mutex *mutex,
+pool_waitIdle(struct _DriBufferPool *pool, void *private, pipe_mutex *mutex,
 	      int lazy)
 {
    drmBO *buf = (drmBO *) private;
diff --git a/src/gallium/winsys/drm/intel/common/ws_dri_fencemgr.c b/src/gallium/winsys/drm/intel/common/ws_dri_fencemgr.c
index b56bc269da..831c75d30c 100644
--- a/src/gallium/winsys/drm/intel/common/ws_dri_fencemgr.c
+++ b/src/gallium/winsys/drm/intel/common/ws_dri_fencemgr.c
@@ -1,5 +1,5 @@
 #include "ws_dri_fencemgr.h"
-#include "glthread.h"
+#include "pipe/p_thread.h"
 #include <xf86mm.h>
 #include <string.h>
 #include <unistd.h>
@@ -20,7 +20,7 @@ struct _DriFenceMgr {
     /*
      * These members are protected by this->mutex
      */
-    _glthread_Mutex mutex;
+    pipe_mutex mutex;
     int refCount;
     drmMMListHead *heads;
     int num_fences;
@@ -44,7 +44,7 @@ struct _DriFenceObject {
     /*
      * These members are protected by this->mutex.
      */
-    _glthread_Mutex mutex;
+    pipe_mutex mutex;
     uint32_t signaled_type;
     void *private;
 };
@@ -65,8 +65,8 @@ driFenceMgrCreate(const struct _DriFenceMgrCreateInfo *info)
   if (!tmp)
       return NULL;
 
-  _glthread_INIT_MUTEX(tmp->mutex);
-  _glthread_LOCK_MUTEX(tmp->mutex);
+  pipe_mutex_init(tmp->mutex);
+  pipe_mutex_lock(tmp->mutex);
   tmp->refCount = 1;
   tmp->info = *info;
   tmp->num_fences = 0;
@@ -77,7 +77,7 @@ driFenceMgrCreate(const struct _DriFenceMgrCreateInfo *info)
   for (i=0; i<tmp->info.num_classes; ++i) {
       DRMINITLISTHEAD(&tmp->heads[i]);
   }
-  _glthread_UNLOCK_MUTEX(tmp->mutex);
+  pipe_mutex_unlock(tmp->mutex);
   return tmp;
 
   out_err:
@@ -95,13 +95,13 @@ driFenceMgrUnrefUnlock(struct _DriFenceMgr **pMgr)
     if (--mgr->refCount == 0)
 	free(mgr);
     else
-	_glthread_UNLOCK_MUTEX(mgr->mutex);
+	pipe_mutex_unlock(mgr->mutex);
 }
 
 void
 driFenceMgrUnReference(struct _DriFenceMgr **pMgr)
 {
-    _glthread_LOCK_MUTEX((*pMgr)->mutex);
+    pipe_mutex_lock((*pMgr)->mutex);
     driFenceMgrUnrefUnlock(pMgr);
 }
 
@@ -143,9 +143,9 @@ driSignalPreviousFencesLocked(struct _DriFenceMgr *mgr,
 	 */
 
 	++entry->refCount;
-	_glthread_UNLOCK_MUTEX(mgr->mutex);
-	_glthread_LOCK_MUTEX(entry->mutex);
-	_glthread_LOCK_MUTEX(mgr->mutex);
+	pipe_mutex_unlock(mgr->mutex);
+	pipe_mutex_lock(entry->mutex);
+	pipe_mutex_lock(mgr->mutex);
 
 	prev = list->prev;
 
@@ -157,7 +157,7 @@ driSignalPreviousFencesLocked(struct _DriFenceMgr *mgr,
 		 * Somebody else removed the entry from the list.
 		 */
 
-		_glthread_UNLOCK_MUTEX(entry->mutex);
+		pipe_mutex_unlock(entry->mutex);
 		driFenceUnReferenceLocked(&entry);
 		return;
 	}
@@ -167,7 +167,7 @@ driSignalPreviousFencesLocked(struct _DriFenceMgr *mgr,
 	    DRMLISTDELINIT(list);
 	    mgr->info.unreference(mgr, &entry->private);
 	}
-	_glthread_UNLOCK_MUTEX(entry->mutex);
+	pipe_mutex_unlock(entry->mutex);
 	driFenceUnReferenceLocked(&entry);
 	list = prev;
     }
@@ -181,7 +181,7 @@ driFenceFinish(struct _DriFenceObject *fence, uint32_t fence_type,
     struct _DriFenceMgr *mgr = fence->mgr;
     int ret = 0;
 
-    _glthread_LOCK_MUTEX(fence->mutex);
+    pipe_mutex_lock(fence->mutex);
 
     if ((fence->signaled_type & fence_type) == fence_type)
 	goto out0;
@@ -190,16 +190,16 @@ driFenceFinish(struct _DriFenceObject *fence, uint32_t fence_type,
     if (ret)
 	goto out0;
 
-    _glthread_LOCK_MUTEX(mgr->mutex);
-    _glthread_UNLOCK_MUTEX(fence->mutex);
+    pipe_mutex_lock(mgr->mutex);
+    pipe_mutex_unlock(fence->mutex);
 
     driSignalPreviousFencesLocked(mgr, &fence->head, fence->fence_class,
 				  fence_type);
-    _glthread_UNLOCK_MUTEX(mgr->mutex);
+    pipe_mutex_unlock(mgr->mutex);
     return 0;
 
   out0:
-    _glthread_UNLOCK_MUTEX(fence->mutex);
+    pipe_mutex_unlock(fence->mutex);
     return ret;
 }
 
@@ -207,9 +207,9 @@ uint32_t driFenceSignaledTypeCached(struct _DriFenceObject *fence)
 {
     uint32_t ret;
 
-    _glthread_LOCK_MUTEX(fence->mutex);
+    pipe_mutex_lock(fence->mutex);
     ret = fence->signaled_type;
-    _glthread_UNLOCK_MUTEX(fence->mutex);
+    pipe_mutex_unlock(fence->mutex);
 
     return ret;
 }
@@ -221,7 +221,7 @@ driFenceSignaledType(struct _DriFenceObject *fence, uint32_t flush_type,
     int ret = 0;
     struct _DriFenceMgr *mgr;
 
-    _glthread_LOCK_MUTEX(fence->mutex);
+    pipe_mutex_lock(fence->mutex);
     mgr = fence->mgr;
     *signaled = fence->signaled_type;
     if ((fence->signaled_type & flush_type) == flush_type)
@@ -236,25 +236,25 @@ driFenceSignaledType(struct _DriFenceObject *fence, uint32_t flush_type,
     if ((fence->signaled_type | *signaled) == fence->signaled_type)
 	goto out0;
 
-    _glthread_LOCK_MUTEX(mgr->mutex);
-    _glthread_UNLOCK_MUTEX(fence->mutex);
+    pipe_mutex_lock(mgr->mutex);
+    pipe_mutex_unlock(fence->mutex);
 
     driSignalPreviousFencesLocked(mgr, &fence->head, fence->fence_class,
 				  *signaled);
 
-    _glthread_UNLOCK_MUTEX(mgr->mutex);
+    pipe_mutex_unlock(mgr->mutex);
     return 0;
   out0:
-    _glthread_UNLOCK_MUTEX(fence->mutex);
+    pipe_mutex_unlock(fence->mutex);
     return ret;
 }
 
 struct _DriFenceObject *
 driFenceReference(struct _DriFenceObject *fence)
 {
-    _glthread_LOCK_MUTEX(fence->mgr->mutex);
+    pipe_mutex_lock(fence->mgr->mutex);
     ++fence->refCount;
-    _glthread_UNLOCK_MUTEX(fence->mgr->mutex);
+    pipe_mutex_unlock(fence->mgr->mutex);
     return fence;
 }
 
@@ -267,7 +267,7 @@ driFenceUnReference(struct _DriFenceObject **pFence)
 	return;
 
     mgr = (*pFence)->mgr;
-    _glthread_LOCK_MUTEX(mgr->mutex);
+    pipe_mutex_lock(mgr->mutex);
     ++mgr->refCount;
     driFenceUnReferenceLocked(pFence);
     driFenceMgrUnrefUnlock(&mgr);
@@ -294,15 +294,15 @@ struct _DriFenceObject
 	return NULL;
     }
 
-    _glthread_INIT_MUTEX(fence->mutex);
-    _glthread_LOCK_MUTEX(fence->mutex);
-    _glthread_LOCK_MUTEX(mgr->mutex);
+    pipe_mutex_init(fence->mutex);
+    pipe_mutex_lock(fence->mutex);
+    pipe_mutex_lock(mgr->mutex);
     fence->refCount = 1;
     DRMLISTADDTAIL(&fence->head, &mgr->heads[fence_class]);
     fence->mgr = mgr;
     ++mgr->refCount;
     ++mgr->num_fences;
-    _glthread_UNLOCK_MUTEX(mgr->mutex);
+    pipe_mutex_unlock(mgr->mutex);
     fence->fence_class = fence_class;
     fence->fence_type = fence_type;
     fence->signaled_type = 0;
@@ -312,7 +312,7 @@ struct _DriFenceObject
 	memcpy(fence->private, private, private_size);
     }
 
-    _glthread_UNLOCK_MUTEX(fence->mutex);
+    pipe_mutex_unlock(fence->mutex);
     return fence;
 }
 
diff --git a/src/gallium/winsys/drm/intel/common/ws_dri_mallocpool.c b/src/gallium/winsys/drm/intel/common/ws_dri_mallocpool.c
index a80555c9c7..60924eac9e 100644
--- a/src/gallium/winsys/drm/intel/common/ws_dri_mallocpool.c
+++ b/src/gallium/winsys/drm/intel/common/ws_dri_mallocpool.c
@@ -33,7 +33,7 @@
 #include <stdlib.h>
 #include <errno.h>
 #include "pipe/p_debug.h"
-#include "glthread.h"
+#include "pipe/p_thread.h"
 #include "ws_dri_bufpool.h"
 #include "ws_dri_bufmgr.h"
 
@@ -60,14 +60,14 @@ pool_destroy(struct _DriBufferPool *pool, void *private)
 
 static int
 pool_waitIdle(struct _DriBufferPool *pool, void *private,
-	      _glthread_Mutex *mutex, int lazy)
+	      pipe_mutex *mutex, int lazy)
 {
     return 0;
 }
 
 static int
 pool_map(struct _DriBufferPool *pool, void *private, unsigned flags,
-         int hint, _glthread_Mutex *mutex, void **virtual)
+         int hint, pipe_mutex *mutex, void **virtual)
 {
     *virtual = (void *)((unsigned long *)private + 2);
     return 0;
diff --git a/src/gallium/winsys/drm/intel/common/ws_dri_slabpool.c b/src/gallium/winsys/drm/intel/common/ws_dri_slabpool.c
index dfcf6d6b19..391cea50a7 100644
--- a/src/gallium/winsys/drm/intel/common/ws_dri_slabpool.c
+++ b/src/gallium/winsys/drm/intel/common/ws_dri_slabpool.c
@@ -37,7 +37,7 @@
 #include "ws_dri_bufpool.h"
 #include "ws_dri_fencemgr.h"
 #include "ws_dri_bufmgr.h"
-#include "glthread.h"
+#include "pipe/p_thread.h"
 
 #define DRI_SLABPOOL_ALLOC_RETRIES 100
 
@@ -53,7 +53,7 @@ struct _DriSlabBuffer {
     uint32_t start;
     uint32_t fenceType;
     int unFenced;
-    _glthread_Cond event;
+    pipe_condvar event;
 };
 
 struct _DriKernelBO {
@@ -84,7 +84,7 @@ struct _DriSlabSizeHeader {
     uint32_t numDelayed;
     struct _DriSlabPool *slabPool;
     uint32_t bufSize;
-    _glthread_Mutex mutex;
+    pipe_mutex mutex;
 };
 
 struct _DriFreeSlabManager {
@@ -94,7 +94,7 @@ struct _DriFreeSlabManager {
     drmMMListHead timeoutList;
     drmMMListHead unCached;
     drmMMListHead cached;
-    _glthread_Mutex mutex;
+    pipe_mutex mutex;
 };
 
 
@@ -196,7 +196,7 @@ driSetKernelBOFree(struct _DriFreeSlabManager *fMan,
 {
     struct timeval time;
 
-    _glthread_LOCK_MUTEX(fMan->mutex);
+    pipe_mutex_lock(fMan->mutex);
     gettimeofday(&time, NULL);
     driTimeAdd(&time, &fMan->slabTimeout);
 
@@ -210,7 +210,7 @@ driSetKernelBOFree(struct _DriFreeSlabManager *fMan,
     DRMLISTADDTAIL(&kbo->timeoutHead, &fMan->timeoutList);
     driFreeTimeoutKBOsLocked(fMan, &time);
 
-    _glthread_UNLOCK_MUTEX(fMan->mutex);
+    pipe_mutex_unlock(fMan->mutex);
 }
 
 /*
@@ -237,7 +237,7 @@ driAllocKernelBO(struct _DriSlabSizeHeader *header)
 
     size = (size <= slabPool->maxSlabSize) ? size : slabPool->maxSlabSize;
     size = (size + slabPool->pageSize - 1) & ~(slabPool->pageSize - 1);
-    _glthread_LOCK_MUTEX(fMan->mutex);
+    pipe_mutex_lock(fMan->mutex);
 
     kbo = NULL;
 
@@ -269,7 +269,7 @@ driAllocKernelBO(struct _DriSlabSizeHeader *header)
 	DRMLISTDELINIT(&kbo->timeoutHead);
     }
 
-    _glthread_UNLOCK_MUTEX(fMan->mutex);
+    pipe_mutex_unlock(fMan->mutex);
 
     if (kbo) {
         uint64_t new_mask = kbo->bo.proposedFlags ^ slabPool->proposedFlags;
@@ -360,7 +360,7 @@ driAllocSlab(struct _DriSlabSizeHeader *header)
 	buf->start = i* header->bufSize;
 	buf->mapCount = 0;
 	buf->isSlabBuffer = 1;
-	_glthread_INIT_COND(buf->event);
+	pipe_condvar_init(buf->event);
 	DRMLISTADDTAIL(&buf->head, &slab->freeBuffers);
 	slab->numFree++;
 	buf++;
@@ -494,23 +494,23 @@ driSlabAllocBuffer(struct _DriSlabSizeHeader *header)
     drmMMListHead *list;
     int count = DRI_SLABPOOL_ALLOC_RETRIES;
 
-    _glthread_LOCK_MUTEX(header->mutex);
+    pipe_mutex_lock(header->mutex);
     while(header->slabs.next == &header->slabs && count > 0) {
         driSlabCheckFreeLocked(header, 0);
 	if (header->slabs.next != &header->slabs)
 	  break;
 
-	_glthread_UNLOCK_MUTEX(header->mutex);
+	pipe_mutex_unlock(header->mutex);
 	if (count != DRI_SLABPOOL_ALLOC_RETRIES)
 	    usleep(1);
-	_glthread_LOCK_MUTEX(header->mutex);
+	pipe_mutex_lock(header->mutex);
 	(void) driAllocSlab(header);
 	count--;
     }
 
     list = header->slabs.next;
     if (list == &header->slabs) {
-	_glthread_UNLOCK_MUTEX(header->mutex);
+	pipe_mutex_unlock(header->mutex);
 	return NULL;
     }
     slab = DRMLISTENTRY(struct _DriSlab, list, head);
@@ -520,7 +520,7 @@ driSlabAllocBuffer(struct _DriSlabSizeHeader *header)
     list = slab->freeBuffers.next;
     DRMLISTDELINIT(list);
 
-    _glthread_UNLOCK_MUTEX(header->mutex);
+    pipe_mutex_unlock(header->mutex);
     buf = DRMLISTENTRY(struct _DriSlabBuffer, list, head);
     return buf;
 }
@@ -618,7 +618,7 @@ pool_destroy(struct _DriBufferPool *driPool, void *private)
     slab = buf->parent;
     header = slab->header;
 
-    _glthread_LOCK_MUTEX(header->mutex);
+    pipe_mutex_lock(header->mutex);
     buf->unFenced = 0;
     buf->mapCount = 0;
 
@@ -631,18 +631,18 @@ pool_destroy(struct _DriBufferPool *driPool, void *private)
 	driSlabFreeBufferLocked(buf);
     }
 
-    _glthread_UNLOCK_MUTEX(header->mutex);
+    pipe_mutex_unlock(header->mutex);
     return 0;
 }
 
 static int
 pool_waitIdle(struct _DriBufferPool *driPool, void *private,
-	      _glthread_Mutex *mutex, int lazy)
+	      pipe_mutex *mutex, int lazy)
 {
    struct _DriSlabBuffer *buf = (struct _DriSlabBuffer *) private;
 
    while(buf->unFenced)
-       _glthread_COND_WAIT(buf->event, *mutex);
+       pipe_condvar_wait(buf->event, *mutex);
 
    if (!buf->fence)
      return 0;
@@ -655,7 +655,7 @@ pool_waitIdle(struct _DriBufferPool *driPool, void *private,
 
 static int
 pool_map(struct _DriBufferPool *pool, void *private, unsigned flags,
-         int hint, _glthread_Mutex *mutex, void **virtual)
+         int hint, pipe_mutex *mutex, void **virtual)
 {
    struct _DriSlabBuffer *buf = (struct _DriSlabBuffer *) private;
    int busy;
@@ -689,7 +689,7 @@ pool_unmap(struct _DriBufferPool *pool, void *private)
 
    --buf->mapCount;
    if (buf->mapCount == 0 && buf->isSlabBuffer)
-       _glthread_COND_BROADCAST(buf->event);
+      pipe_condvar_broadcast(buf->event);
 
    return 0;
 }
@@ -760,7 +760,7 @@ pool_fence(struct _DriBufferPool *pool, void *private,
    buf->fenceType = bo->fenceFlags;
 
    buf->unFenced = 0;
-   _glthread_COND_BROADCAST(buf->event);
+   pipe_condvar_broadcast(buf->event);
 
    return 0;
 }
@@ -775,7 +775,7 @@ pool_kernel(struct _DriBufferPool *pool, void *private)
 
 static int
 pool_validate(struct _DriBufferPool *pool, void *private,
-	      _glthread_Mutex *mutex)
+	      pipe_mutex *mutex)
 {
    struct _DriSlabBuffer *buf = (struct _DriSlabBuffer *) private;
 
@@ -783,7 +783,7 @@ pool_validate(struct _DriBufferPool *pool, void *private,
        return 0;
 
    while(buf->mapCount != 0)
-       _glthread_COND_WAIT(buf->event, *mutex);
+      pipe_condvar_wait(buf->event, *mutex);
 
    buf->unFenced = 1;
    return 0;
@@ -799,8 +799,8 @@ driInitFreeSlabManager(uint32_t checkIntervalMsec, uint32_t slabTimeoutMsec)
     if (!tmp)
 	return NULL;
 
-    _glthread_INIT_MUTEX(tmp->mutex);
-    _glthread_LOCK_MUTEX(tmp->mutex);
+    pipe_mutex_init(tmp->mutex);
+    pipe_mutex_lock(tmp->mutex);
     tmp->slabTimeout.tv_usec = slabTimeoutMsec*1000;
     tmp->slabTimeout.tv_sec = tmp->slabTimeout.tv_usec / 1000000;
     tmp->slabTimeout.tv_usec -=  tmp->slabTimeout.tv_sec*1000000;
@@ -814,7 +814,7 @@ driInitFreeSlabManager(uint32_t checkIntervalMsec, uint32_t slabTimeoutMsec)
     DRMINITLISTHEAD(&tmp->timeoutList);
     DRMINITLISTHEAD(&tmp->unCached);
     DRMINITLISTHEAD(&tmp->cached);
-    _glthread_UNLOCK_MUTEX(tmp->mutex);
+    pipe_mutex_unlock(tmp->mutex);
 
     return tmp;
 }
@@ -827,9 +827,9 @@ driFinishFreeSlabManager(struct _DriFreeSlabManager *fMan)
     time = fMan->nextCheck;
     driTimeAdd(&time, &fMan->checkInterval);
 
-    _glthread_LOCK_MUTEX(fMan->mutex);
+    pipe_mutex_lock(fMan->mutex);
     driFreeTimeoutKBOsLocked(fMan, &time);
-    _glthread_UNLOCK_MUTEX(fMan->mutex);
+    pipe_mutex_unlock(fMan->mutex);
 
     assert(fMan->timeoutList.next == &fMan->timeoutList);
     assert(fMan->unCached.next == &fMan->unCached);
@@ -842,8 +842,8 @@ static void
 driInitSizeHeader(struct _DriSlabPool *pool, uint32_t size,
 		  struct _DriSlabSizeHeader *header)
 {
-    _glthread_INIT_MUTEX(header->mutex);
-    _glthread_LOCK_MUTEX(header->mutex);
+    pipe_mutex_init(header->mutex);
+    pipe_mutex_lock(header->mutex);
 
     DRMINITLISTHEAD(&header->slabs);
     DRMINITLISTHEAD(&header->freeSlabs);
@@ -853,7 +853,7 @@ driInitSizeHeader(struct _DriSlabPool *pool, uint32_t size,
     header->slabPool = pool;
     header->bufSize = size;
 
-    _glthread_UNLOCK_MUTEX(header->mutex);
+    pipe_mutex_unlock(header->mutex);
 }
 
 static void
@@ -862,7 +862,7 @@ driFinishSizeHeader(struct _DriSlabSizeHeader *header)
     drmMMListHead *list, *next;
     struct _DriSlabBuffer *buf;
 
-    _glthread_LOCK_MUTEX(header->mutex);
+    pipe_mutex_lock(header->mutex);
     for (list = header->delayedBuffers.next, next = list->next;
 	 list != &header->delayedBuffers;
 	 list = next, next = list->next) {
@@ -875,7 +875,7 @@ driFinishSizeHeader(struct _DriSlabSizeHeader *header)
 	header->numDelayed--;
 	driSlabFreeBufferLocked(buf);
     }
-    _glthread_UNLOCK_MUTEX(header->mutex);
+    pipe_mutex_unlock(header->mutex);
 }
 
 static void
diff --git a/src/gallium/winsys/drm/intel/dri/intel_lock.c b/src/gallium/winsys/drm/intel/dri/intel_lock.c
index 406284c98f..18c7bba0d5 100644
--- a/src/gallium/winsys/drm/intel/dri/intel_lock.c
+++ b/src/gallium/winsys/drm/intel/dri/intel_lock.c
@@ -27,7 +27,7 @@
 
 
 #include "main/glheader.h"
-#include "glapi/glthread.h"
+#include "pipe/p_thread.h"
 #include <GL/internal/glcore.h>
 #include "state_tracker/st_public.h"
 #include "intel_context.h"
@@ -35,7 +35,7 @@
 
 
-_glthread_DECLARE_STATIC_MUTEX( lockMutex );
+pipe_static_mutex( lockMutex );
 
 
 static void
@@ -72,7 +72,7 @@ void LOCK_HARDWARE( struct intel_context *intel )
 {
     char __ret = 0;
 
-    _glthread_LOCK_MUTEX(lockMutex);
+    pipe_mutex_lock(lockMutex);
     assert(!intel->locked);
 
     DRM_CAS(intel->driHwLock, intel->hHWContext,
@@ -96,7 +96,7 @@ void UNLOCK_HARDWARE( struct intel_context *intel )
 
    DRM_UNLOCK(intel->driFd, intel->driHwLock, intel->hHWContext);
 
-   _glthread_UNLOCK_MUTEX(lockMutex);
+   pipe_mutex_unlock(lockMutex);
 
    DBG(LOCK, "%s - unlocked\n", __progname);
 }
diff --git a/src/gallium/winsys/xlib/glxapi.c b/src/gallium/winsys/xlib/glxapi.c
index c2ccce6f52..c059fc3edb 100644
--- a/src/gallium/winsys/xlib/glxapi.c
+++ b/src/gallium/winsys/xlib/glxapi.c
@@ -37,6 +37,7 @@
 #include "main/glheader.h"
 #include "glapi/glapi.h"
 #include "glxapi.h"
+#include "pipe/p_thread.h"
 
 
 extern struct _glxapi_table *_real_GetGLXDispatchTable(void);
@@ -127,26 +128,13 @@ get_dispatch(Display *dpy)
 /**
  * GLX API current context.
  */
-#if defined(GLX_USE_TLS)
-PUBLIC __thread void * CurrentContext
-    __attribute__((tls_model("initial-exec")));
-#elif defined(THREADS)
-static _glthread_TSD ContextTSD;         /**< Per-thread context pointer */
-#else
-static GLXContext CurrentContext = 0;
-#endif
+pipe_tsd ContextTSD;
 
 
 static void
 SetCurrentContext(GLXContext c)
 {
-#if defined(GLX_USE_TLS)
-   CurrentContext = c;
-#elif defined(THREADS)
-   _glthread_SetTSD(&ContextTSD, c);
-#else
-   CurrentContext = c;
-#endif
+   pipe_tsd_set(&ContextTSD, c);
 }
 
 
@@ -238,13 +226,7 @@ glXGetConfig(Display *dpy, XVisualInfo *visinfo, int attrib, int *value)
 GLXContext PUBLIC
 glXGetCurrentContext(void)
 {
-#if defined(GLX_USE_TLS)
-   return CurrentContext;
-#elif defined(THREADS)
-   return (GLXContext) _glthread_GetTSD(&ContextTSD);
-#else
-   return CurrentContext;
-#endif
+   return (GLXContext) pipe_tsd_get(&ContextTSD);
 }
 
 
diff --git a/src/gallium/winsys/xlib/xm_api.c b/src/gallium/winsys/xlib/xm_api.c
index 7256340420..edcadff9c5 100644
--- a/src/gallium/winsys/xlib/xm_api.c
+++ b/src/gallium/winsys/xlib/xm_api.c
@@ -62,7 +62,6 @@
 #include "xmesaP.h"
 #include "main/context.h"
 #include "main/framebuffer.h"
-#include "glapi/glthread.h"
 
 #include "state_tracker/st_public.h"
 #include "state_tracker/st_context.h"
@@ -75,7 +74,7 @@
 /**
  * Global X driver lock
  */
-_glthread_Mutex _xmesa_lock;
+pipe_mutex _xmesa_lock;
 
 
 int xmesa_mode;
@@ -245,10 +244,10 @@ xmesa_get_window_size(XMesaDisplay *dpy, XMesaBuffer b,
 #else
    Status stat;
 
-   _glthread_LOCK_MUTEX(_xmesa_lock);
+   pipe_mutex_lock(_xmesa_lock);
    XSync(b->xm_visual->display, 0); /* added for Chromium */
    stat = get_drawable_size(dpy, b->drawable, width, height);
-   _glthread_UNLOCK_MUTEX(_xmesa_lock);
+   pipe_mutex_unlock(_xmesa_lock);
 
    if (!stat) {
       /* probably querying a window that's recently been destroyed */
@@ -779,7 +778,7 @@ XMesaContext XMesaCreateContext( XMesaVisual v, XMesaContext share_list )
    uint pf;
 
    if (firstTime) {
-      _glthread_INIT_MUTEX(_xmesa_lock);
+      pipe_mutex_init(_xmesa_lock);
       firstTime = GL_FALSE;
    }
 
diff --git a/src/gallium/winsys/xlib/xmesaP.h b/src/gallium/winsys/xlib/xmesaP.h
index 9b15b2ddf9..fcaeee52bc 100644
--- a/src/gallium/winsys/xlib/xmesaP.h
+++ b/src/gallium/winsys/xlib/xmesaP.h
@@ -35,9 +35,10 @@
 
 #include "state_tracker/st_context.h"
 #include "state_tracker/st_public.h"
+#include "pipe/p_thread.h"
 
 
-extern _glthread_Mutex _xmesa_lock;
+extern pipe_mutex _xmesa_lock;
 
 extern XMesaBuffer XMesaBufferList;
 
-- 
cgit v1.2.3


From f3a7463feefcf1f22c1309e1f5b0bfe381859686 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 1 Sep 2008 15:30:26 -0600
Subject: gallium: include u_pointer.h, not p_pointer.h

---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index f4ca282dd9..6d4c081e04 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -27,7 +27,7 @@
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_debug.h"
-#include "pipe/p_pointer.h"
+#include "util/u_pointer.h"
 
 #include "rtasm_execmem.h"
 #include "rtasm_x86sse.h"
-- 
cgit v1.2.3


From bb5becf1e289b2c9240d98299e9447a9673da9fc Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 5 Sep 2008 13:54:14 -0600
Subject: gallium: comments, assertions, etc

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 36 +++++++++++++++++++++++++----
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h | 20 +++++++++-------
 2 files changed, 43 insertions(+), 13 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 285ddc0e3f..fe5beba456 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -300,7 +300,9 @@ void _name (struct spe_function *p, int imm) \
 #include "rtasm_ppc_spe.h"
 
 
-/*
+/**
+ * Initialize an spe_function.
+ * \param code_size  size of instruction buffer to allocate, in bytes.
  */
 void spe_init_func(struct spe_function *p, unsigned code_size)
 {
@@ -324,10 +326,14 @@ void spe_release_func(struct spe_function *p)
 }
 
 
+/**
+ * Allocate an SPE register.
+ * \return register index or -1 if none left.
+ */
 int spe_allocate_available_register(struct spe_function *p)
 {
    unsigned i;
-   for (i = 0; i < 128; i++) {
+   for (i = 0; i < SPE_NUM_REGS; i++) {
       const uint64_t mask = (1ULL << (i % 64));
       const unsigned idx = i / 64;
 
@@ -341,11 +347,15 @@ int spe_allocate_available_register(struct spe_function *p)
 }
 
 
+/**
+ * Mark the given SPE register as "allocated".
+ */
 int spe_allocate_register(struct spe_function *p, int reg)
 {
    const unsigned idx = reg / 64;
    const unsigned bit = reg % 64;
 
+   assert(reg < SPE_NUM_REGS);
    assert((p->regs[idx] & (1ULL << bit)) != 0);
 
    p->regs[idx] &= ~(1ULL << bit);
@@ -353,57 +363,73 @@ int spe_allocate_register(struct spe_function *p, int reg)
 }
 
 
+/**
+ * Mark the given SPE register as "unallocated".
+ */
 void spe_release_register(struct spe_function *p, int reg)
 {
    const unsigned idx = reg / 64;
    const unsigned bit = reg % 64;
 
+   assert(reg < SPE_NUM_REGS);
    assert((p->regs[idx] & (1ULL << bit)) == 0);
 
    p->regs[idx] |= (1ULL << bit);
 }
 
 
+/**
+ * For branch instructions:
+ * \param d  if 1, disable interrupts if branch is taken
+ * \param e  if 1, enable interrupts if branch is taken
+ * If d and e are both zero, don't change interrupt status (right?)
+ */
 
-
+/** Branch Indirect to address in rA */
 void spe_bi(struct spe_function *p, unsigned rA, int d, int e)
 {
     emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4));
 }
 
+/** Interrupt Return */
 void spe_iret(struct spe_function *p, unsigned rA, int d, int e)
 {
     emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4));
 }
 
+/** Branch indirect and set link on external data */
 void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA, int d,
 		int e)
 {
     emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4));
 }
 
+/** Branch indirect and set link.  Save PC in rT, jump to rA. */
 void spe_bisl(struct spe_function *p, unsigned rT, unsigned rA, int d,
 		int e)
 {
     emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4));
 }
 
-void spe_biz(struct spe_function *p, unsigned rT, unsigned rA, int d,
-		int e)
+/** Branch indirect if zero word.  If rT.word[0]==0, jump to rA. */
+void spe_biz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
     emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4));
 }
 
+/** Branch indirect if non-zero word.  If rT.word[0]!=0, jump to rA. */
 void spe_binz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
     emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4));
 }
 
+/** Branch indirect if zero halfword.  If rT.halfword[1]==0, jump to rA. */
 void spe_bihz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
     emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4));
 }
 
+/** Branch indirect if non-zero halfword.  If rT.halfword[1]!=0, jump to rA. */
 void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
     emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4));
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 1cacc717b1..7dd754ba77 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -32,13 +32,17 @@
 #ifndef RTASM_PPC_SPE_H
 #define RTASM_PPC_SPE_H
 
-struct spe_function {
-    /**
-     *
-     */
-    uint32_t *store;
-    uint32_t *csr;
-    const char *fn;
+/** 4 bytes per instruction */
+#define SPE_INST_SIZE 4
+
+/** number of general-purpose SIMD registers */
+#define SPE_NUM_REGS  128
+
+struct spe_function
+{
+    uint32_t *store;  /**< instruction buffer */
+    uint32_t *csr;    /**< next free pos in instruction buffer */
+    const char *fn;   /**< unused */
 
     /**
      * Mask of used / unused registers
@@ -50,7 +54,7 @@ struct spe_function {
      * spe_allocate_register, spe_allocate_available_register,
      * spe_release_register
      */
-    uint64_t regs[2];
+    uint64_t regs[SPE_NUM_REGS / 64];
 };
 
 extern void spe_init_func(struct spe_function *p, unsigned code_size);
-- 
cgit v1.2.3


From ee582fd3a7a9ddbcb5595249201cf213a6c6f014 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 10 Sep 2008 17:11:48 -0600
Subject: gallium: assorted additions and fixes to Cell SPE rtasm code

Fix incorrect opcode for fsmbi.
Added "macro" functions for loading floats/ints, register complement, zero, move.
Added #defines for return address and stack pointer registers.
Added assertions to check that the instruction buffer doesn't overflow.
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 88 +++++++++++++++++++++++------
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h | 38 +++++++++++--
 2 files changed, 105 insertions(+), 21 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index fe5beba456..61010e4333 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -151,8 +151,8 @@ static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rB = rB;
     inst.inst.rA = rA;
     inst.inst.rT = rT;
-    *p->csr = inst.bits;
-    p->csr++;
+    p->store[p->num_inst++] = inst.bits;
+    assert(p->num_inst <= p->max_inst);
 }
 
 
@@ -165,8 +165,8 @@ static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rB = rB;
     inst.inst.rA = rA;
     inst.inst.rC = rC;
-    *p->csr = inst.bits;
-    p->csr++;
+    p->store[p->num_inst++] = inst.bits;
+    assert(p->num_inst <= p->max_inst);
 }
 
 
@@ -178,8 +178,8 @@ static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.i7 = imm;
     inst.inst.rA = rA;
     inst.inst.rT = rT;
-    *p->csr = inst.bits;
-    p->csr++;
+    p->store[p->num_inst++] = inst.bits;
+    assert(p->num_inst <= p->max_inst);
 }
 
 
@@ -192,8 +192,8 @@ static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.i8 = imm;
     inst.inst.rA = rA;
     inst.inst.rT = rT;
-    *p->csr = inst.bits;
-    p->csr++;
+    p->store[p->num_inst++] = inst.bits;
+    assert(p->num_inst <= p->max_inst);
 }
 
 
@@ -206,8 +206,8 @@ static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.i10 = imm;
     inst.inst.rA = rA;
     inst.inst.rT = rT;
-    *p->csr = inst.bits;
-    p->csr++;
+    p->store[p->num_inst++] = inst.bits;
+    assert(p->num_inst <= p->max_inst);
 }
 
 
@@ -218,8 +218,8 @@ static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.op = op;
     inst.inst.i16 = imm;
     inst.inst.rT = rT;
-    *p->csr = inst.bits;
-    p->csr++;
+    p->store[p->num_inst++] = inst.bits;
+    assert(p->num_inst <= p->max_inst);
 }
 
 
@@ -230,8 +230,8 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.op = op;
     inst.inst.i18 = imm;
     inst.inst.rT = rT;
-    *p->csr = inst.bits;
-    p->csr++;
+    p->store[p->num_inst++] = inst.bits;
+    assert(p->num_inst <= p->max_inst);
 }
 
 
@@ -307,8 +307,9 @@ void _name (struct spe_function *p, int imm) \
 void spe_init_func(struct spe_function *p, unsigned code_size)
 {
     p->store = align_malloc(code_size, 16);
-    p->csr = p->store;
-    
+    p->num_inst = 0;
+    p->max_inst = code_size / SPE_INST_SIZE;
+
     /* Conservatively treat R0 - R2 and R80 - R127 as non-volatile.
      */
     p->regs[0] = ~7;
@@ -318,11 +319,11 @@ void spe_init_func(struct spe_function *p, unsigned code_size)
 
 void spe_release_func(struct spe_function *p)
 {
+    assert(p->num_inst <= p->max_inst);
     if (p->store != NULL) {
         align_free(p->store);
     }
     p->store = NULL;
-    p->csr = NULL;
 }
 
 
@@ -337,6 +338,7 @@ int spe_allocate_available_register(struct spe_function *p)
       const uint64_t mask = (1ULL << (i % 64));
       const unsigned idx = i / 64;
 
+      assert(idx < 2);
       if ((p->regs[idx] & mask) != 0) {
          p->regs[idx] &= ~mask;
          return i;
@@ -371,6 +373,8 @@ void spe_release_register(struct spe_function *p, int reg)
    const unsigned idx = reg / 64;
    const unsigned bit = reg % 64;
 
+   assert(idx < 2);
+
    assert(reg < SPE_NUM_REGS);
    assert((p->regs[idx] & (1ULL << bit)) == 0);
 
@@ -458,4 +462,54 @@ EMIT_R   (spe_mfspr, 0x00c);
 EMIT_R   (spe_mtspr, 0x10c);
 #endif
 
+
+/**
+ ** Helper / "macro" instructions.
+ ** Use somewhat verbose names as a reminder that these aren't native
+ ** SPE instructions.
+ **/
+
+
+void
+spe_load_float(struct spe_function *p, unsigned rT, float x)
+{
+   union {
+      float f;
+      unsigned u;
+   } bits;
+   bits.f = x;
+   spe_ilhu(p, rT, bits.u >> 16);
+   spe_iohl(p, rT, bits.u & 0xffff);
+}
+
+
+void
+spe_load_int(struct spe_function *p, unsigned rT, int i)
+{
+   spe_ilhu(p, rT, i >> 16);
+   spe_iohl(p, rT, i & 0xffff);
+}
+
+
+void
+spe_complement(struct spe_function *p, unsigned rT)
+{
+   spe_nor(p, rT, rT, rT);
+}
+
+
+void
+spe_move(struct spe_function *p, unsigned rT, unsigned rA)
+{
+   spe_ori(p, rT, rA, 0);
+}
+
+
+void
+spe_zero(struct spe_function *p, unsigned rT)
+{
+   spe_xor(p, rT, rT, rT);
+}
+
+
 #endif /* GALLIUM_CELL */
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 7dd754ba77..dee8c55c4a 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -25,6 +25,7 @@
 /**
  * \file
  * Real-time assembly generation interface for Cell B.E. SPEs.
+ * For details, see /opt/cell/sdk/docs/arch/SPU_ISA_v1.2_27Jan2007_pub.pdf
  *
  * \author Ian Romanick <idr@us.ibm.com>
  */
@@ -38,11 +39,18 @@
 /** number of general-purpose SIMD registers */
 #define SPE_NUM_REGS  128
 
+/** Return Address register */
+#define SPE_REG_RA  0
+
+/** Stack Pointer register */
+#define SPE_REG_SP  1
+
+
 struct spe_function
 {
-    uint32_t *store;  /**< instruction buffer */
-    uint32_t *csr;    /**< next free pos in instruction buffer */
-    const char *fn;   /**< unused */
+   uint32_t *store;  /**< instruction buffer */
+   uint num_inst;
+   uint max_inst;
 
     /**
      * Mask of used / unused registers
@@ -123,7 +131,8 @@ EMIT_RI16(spe_ilhu,  0x082);
 EMIT_RI16(spe_il,    0x081);
 EMIT_RI18(spe_ila,   0x021);
 EMIT_RI16(spe_iohl,  0x0c1);
-EMIT_RI16(spe_fsmbi, 0x0c5);
+EMIT_RI16(spe_fsmbi, 0x065);
+
 
 
 /* Integer and logical instructions
@@ -275,6 +284,27 @@ extern void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA,
     int d, int e);
 
 
+/** Load/splat immediate float into rT. */
+extern void
+spe_load_float(struct spe_function *p, unsigned rT, float x);
+
+/** Load/splat immediate int into rT. */
+extern void
+spe_load_int(struct spe_function *p, unsigned rT, int i);
+
+/** Complement/invert all bits in rT. */
+extern void
+spe_complement(struct spe_function *p, unsigned rT);
+
+/** rT = rA. */
+extern void
+spe_move(struct spe_function *p, unsigned rT, unsigned rA);
+
+/** rT = {0,0,0,0}. */
+extern void
+spe_zero(struct spe_function *p, unsigned rT);
+
+
 /* Floating-point instructions
  */
 EMIT_RR  (spe_fa,         0x2c4);
-- 
cgit v1.2.3


From 178bbaff80d079606a1135bd65f1a85bac9774c4 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 11 Sep 2008 17:07:30 -0600
Subject: gallium: add special cases in spe_load_float(), spe_load_int(), added
 spe_splat()

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 45 +++++++++++++++++++++++------
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h |  4 +++
 2 files changed, 40 insertions(+), 9 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 61010e4333..a04cc6c4ff 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -473,21 +473,48 @@ EMIT_R   (spe_mtspr, 0x10c);
 void
 spe_load_float(struct spe_function *p, unsigned rT, float x)
 {
-   union {
-      float f;
-      unsigned u;
-   } bits;
-   bits.f = x;
-   spe_ilhu(p, rT, bits.u >> 16);
-   spe_iohl(p, rT, bits.u & 0xffff);
+   if (x == 0.0f) {
+      spe_il(p, rT, 0x0);
+   }
+   else if (x == 0.5f) {
+      spe_ilhu(p, rT, 0x3f00);
+   }
+   else if (x == 1.0f) {
+      spe_ilhu(p, rT, 0x3f80);
+   }
+   else if (x == -1.0f) {
+      spe_ilhu(p, rT, 0xbf80);
+   }
+   else {
+      union {
+         float f;
+         unsigned u;
+      } bits;
+      bits.f = x;
+      spe_ilhu(p, rT, bits.u >> 16);
+      spe_iohl(p, rT, bits.u & 0xffff);
+   }
 }
 
 
 void
 spe_load_int(struct spe_function *p, unsigned rT, int i)
 {
-   spe_ilhu(p, rT, i >> 16);
-   spe_iohl(p, rT, i & 0xffff);
+   if (-32768 <= i && i <= 32767) {
+      spe_il(p, rT, i);
+   }
+   else {
+      spe_ilhu(p, rT, i >> 16);
+      spe_iohl(p, rT, i & 0xffff);
+   }
+}
+
+
+void
+spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
+{
+   spe_ila(p, rT, 66051);
+   spe_shufb(p, rT, rA, rA, rT);
 }
 
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index dee8c55c4a..d95e5aace3 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -292,6 +292,10 @@ spe_load_float(struct spe_function *p, unsigned rT, float x);
 extern void
 spe_load_int(struct spe_function *p, unsigned rT, int i);
 
+/** Replicate word 0 of rA across rT. */
+extern void
+spe_splat(struct spe_function *p, unsigned rT, unsigned rA);
+
 /** Complement/invert all bits in rT. */
 extern void
 spe_complement(struct spe_function *p, unsigned rT);
-- 
cgit v1.2.3


From be5d8bd07886157fe524b8715509cd03ade2fda9 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 12 Sep 2008 08:21:43 -0600
Subject: gallium: initial PPC/Altivec codegen

---
 src/gallium/auxiliary/rtasm/rtasm_ppc.c | 365 ++++++++++++++++++++++++++++++++
 src/gallium/auxiliary/rtasm/rtasm_ppc.h | 181 ++++++++++++++++
 2 files changed, 546 insertions(+)
 create mode 100644 src/gallium/auxiliary/rtasm/rtasm_ppc.c
 create mode 100644 src/gallium/auxiliary/rtasm/rtasm_ppc.h

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
new file mode 100644
index 0000000000..534a23568d
--- /dev/null
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -0,0 +1,365 @@
+/**************************************************************************
+ *
+ * Copyright (C) 2008 Tungsten Graphics, Inc.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * PPC code generation.
+ * \author Brian Paul
+ */
+
+
+#include "util/u_memory.h"
+#include "pipe/p_debug.h"
+#include "rtasm_ppc.h"
+
+
+void
+ppc_init_func(struct ppc_function *p, unsigned max_inst)
+{
+    p->store = align_malloc(max_inst * PPC_INST_SIZE, 16);  /* NOTE(review): result is not checked here */
+    p->num_inst = 0;
+    p->max_inst = max_inst;
+    p->vec_used = ~0;  /* all bits set (the allocator treats a set bit as free); p->reg_used is not set here */
+}
+
+
+void
+ppc_release_func(struct ppc_function *p)
+{
+    assert(p->num_inst <= p->max_inst);
+    if (p->store != NULL) {
+        align_free(p->store);
+    }
+    p->store = NULL;
+}
+
+
+/**
+ * Allocate the lowest-numbered free vector register (\p reg is ignored).
+ * \return register index or -1 if none left.
+ */
+int
+ppc_allocate_vec_register(struct ppc_function *p, int reg)
+{
+   unsigned i;
+   for (i = 0; i < PPC_NUM_VEC_REGS; i++) {
+      const uint32_t mask = (uint32_t) 1 << i;  /* unsigned shift: 1 << 31 on int is undefined */
+      if ((p->vec_used & mask) != 0) {          /* a set bit marks a free register */
+         p->vec_used &= ~mask;                  /* clear the bit: now allocated */
+         return i;
+      }
+   }
+
+   return -1;
+}
+
+
+/**
+ * Mark the given vector register as "unallocated" (sets its bit in vec_used).
+ */
+void
+ppc_release_vec_register(struct ppc_function *p, int reg)
+{
+   assert(reg >= 0 && reg < PPC_NUM_VEC_REGS);          /* reject negative indices too */
+   assert((p->vec_used & ((uint32_t) 1 << reg)) == 0);  /* must currently be allocated */
+
+   p->vec_used |= ((uint32_t) 1 << reg);                /* unsigned shift: 1 << 31 on int is undefined */
+}
+
+
+
+union vx_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned vD:5;
+      unsigned vA:5;
+      unsigned vB:5;
+      unsigned op2:11;
+   } inst;
+};
+
+union vxr_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned vD:5;
+      unsigned vA:5;
+      unsigned vB:5;
+      unsigned rC:1;
+      unsigned op2:10;
+   } inst;
+};
+
+union va_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned vD:5;
+      unsigned vA:5;
+      unsigned vB:5;
+      unsigned vC:5;
+      unsigned op2:6;
+   } inst;
+};
+
+
+static inline void
+emit_vx(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
+{
+   union vx_inst inst;
+   inst.inst.op = 4;       /* primary opcode field */
+   inst.inst.vD = vD;
+   inst.inst.vA = vA;
+   inst.inst.vB = vB;
+   inst.inst.op2 = op2;    /* 11-bit extended opcode selecting the operation */
+   assert(p->num_inst < p->max_inst);   /* check for room before writing, not after */
+   p->store[p->num_inst++] = inst.bits;
+}
+
+static inline void
+emit_vxr(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
+{
+   union vxr_inst inst;
+   inst.inst.op = 4;       /* primary opcode field */
+   inst.inst.vD = vD;
+   inst.inst.vA = vA;
+   inst.inst.vB = vB;
+   inst.inst.rC = 0;       /* the 1-bit rC field is always emitted as 0 here */
+   inst.inst.op2 = op2;    /* 10-bit extended opcode selecting the operation */
+   assert(p->num_inst < p->max_inst);   /* check for room before writing, not after */
+   p->store[p->num_inst++] = inst.bits;
+}
+
+static inline void
+emit_va(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB, uint vC)
+{
+   union va_inst inst;
+   inst.inst.op = 4;       /* primary opcode field */
+   inst.inst.vD = vD;
+   inst.inst.vA = vA;
+   inst.inst.vB = vB;
+   inst.inst.vC = vC;
+   inst.inst.op2 = op2;    /* 6-bit extended opcode selecting the operation */
+   assert(p->num_inst < p->max_inst);   /* check for room before writing, not after */
+   p->store[p->num_inst++] = inst.bits;
+}
+
+
+
+/**
+ ** float vector arithmetic
+ **/
+
+/** vector float add */
+void
+ppc_vaddfp(struct ppc_function *p,uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 10, vD, vA, vB);
+}
+
+/** vector float subtract */
+void
+ppc_vsubfp(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 74, vD, vA, vB);
+}
+
+/** vector float min */
+void
+ppc_vminfp(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1098, vD, vA, vB);
+}
+
+/** vector float max */
+void
+ppc_vmaxfp(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1034, vD, vA, vB);
+}
+
+/** vector float mult add */
+void
+ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
+{
+   emit_va(p, 46, vD, vA, vB, vC);
+}
+
+/** vector float compare greater than */
+void
+ppc_vcmpgtfpx(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vxr(p, 710, vD, vA, vB);
+}
+
+/** vector float compare greater than or equal to */
+void
+ppc_vcmpgefpx(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vxr(p, 454, vD, vA, vB);
+}
+
+/** vector float compare equal */
+void
+ppc_vcmpeqfpx(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vxr(p, 198, vD, vA, vB);
+}
+
+/** vector float 2^x */
+void
+ppc_vexptefp(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 394, vD, 0, vB);
+}
+
+/** vector float log2(x) */
+void
+ppc_vlogefp(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 458, vD, 0, vB);
+}
+
+/** vector float reciprocal estimate */
+void
+ppc_vrefp(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 266, vD, 0, vB);
+}
+
+/** vector float reciprocal sqrt estimate */
+void
+ppc_vrsqrtefp(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 330, vD, 0, vB);
+}
+
+/** vector float round to negative infinity */
+void
+ppc_vrfim(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 714, vD, 0, vB);
+}
+
+/** vector float round to positive infinity */
+void
+ppc_vrfip(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 650, vD, 0, vB);
+}
+
+/** vector float round to nearest int */
+void
+ppc_vrfin(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 522, vD, 0, vB);
+}
+
+/** vector float round to int toward zero */
+void
+ppc_vrfiz(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 586, vD, 0, vB);
+}
+
+
+
+/**
+ ** bitwise operations
+ **/
+
+
+/** vector and */
+void
+ppc_vand(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1028, vD, vA, vB);
+}
+
+/** vector and complement */
+void
+ppc_vandc(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1092, vD, vA, vB);
+}
+
+/** vector or */
+void
+ppc_vor(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1156, vD, vA, vB);
+}
+
+/** vector nor */
+void
+ppc_vnor(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1284, vD, vA, vB);
+}
+
+/** vector xor */
+void
+ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1220, vD, vA, vB);
+}
+
+
+/**
+ ** Vector shuffle / select / splat / etc
+ **/
+
+/** vector permute */
+void
+ppc_vperm(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
+{
+   emit_va(p, 43, vD, vA, vB, vC);
+}
+
+/** vector select */
+void
+ppc_vsel(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
+{
+   emit_va(p, 42, vD, vA, vB, vC);
+}
+
+/** vector splat byte */
+void
+ppc_vspltb(struct ppc_function *p, uint vD, uint vB, uint imm)
+{
+   emit_vx(p, 524, vD, imm, vB);   /* vspltb is VX-form XO 524; 42 is vsel's VA-form opcode */
+}
+
+/** vector splat half word */
+void
+ppc_vsplthw(struct ppc_function *p, uint vD, uint vB, uint imm)
+{
+   emit_vx(p, 588, vD, imm, vB);
+}
+
+/** vector splat word */
+void
+ppc_vspltw(struct ppc_function *p, uint vD, uint vB, uint imm)
+{
+   emit_vx(p, 652, vD, imm, vB);
+}
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
new file mode 100644
index 0000000000..ed14e943df
--- /dev/null
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -0,0 +1,181 @@
+/**************************************************************************
+ *
+ * Copyright (C) 2008 Tungsten Graphics, Inc.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * PPC code generation.
+ * \author Brian Paul
+ */
+
+
+#ifndef RTASM_PPC_H
+#define RTASM_PPC_H
+
+
+#include "pipe/p_compiler.h"
+
+
+#define PPC_INST_SIZE 4  /**< 4 bytes / instruction */
+
+#define PPC_NUM_VEC_REGS 32
+
+
+struct ppc_function
+{
+   uint32_t *store;  /**< instruction buffer */
+   uint num_inst;    /**< number of instructions emitted into store so far */
+   uint max_inst;    /**< capacity of store, in instructions */
+   uint32_t vec_used;   /**< vector register bitmask; a set bit means the register is free */
+   uint32_t reg_used;   /**< general-purpose register bitmask (presumably same convention -- TODO confirm) */
+};
+
+
+
+extern void ppc_init_func(struct ppc_function *p, unsigned max_inst);
+extern void ppc_release_func(struct ppc_function *p);
+
+extern int ppc_allocate_vec_register(struct ppc_function *p, int reg);
+extern void ppc_release_vec_register(struct ppc_function *p, int reg);
+
+
+/**
+ ** float vector arithmetic
+ **/
+
+/** vector float add */
+extern void
+ppc_vaddfp(struct ppc_function *p,uint vD, uint vA, uint vB);
+
+/** vector float subtract */
+extern void
+ppc_vsubfp(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float min */
+extern void
+ppc_vminfp(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float max */
+extern void
+ppc_vmaxfp(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float mult add */
+extern void
+ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC);
+
+/** vector float compare greater than */
+extern void
+ppc_vcmpgtfpx(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float compare greater than or equal to */
+extern void
+ppc_vcmpgefpx(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float compare equal */
+extern void
+ppc_vcmpeqfpx(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float 2^x */
+extern void
+ppc_vexptefp(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float log2(x) */
+extern void
+ppc_vlogefp(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float reciprocal estimate */
+extern void
+ppc_vrefp(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float reciprocal sqrt estimate */
+extern void
+ppc_vrsqrtefp(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float round to negative infinity */
+extern void
+ppc_vrfim(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float round to positive infinity */
+extern void
+ppc_vrfip(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float round to nearest int */
+extern void
+ppc_vrfin(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float round to int toward zero */
+extern void
+ppc_vrfiz(struct ppc_function *p, uint vD, uint vB);
+
+
+
+/**
+ ** bitwise operations
+ **/
+
+
+/** vector and */
+extern void
+ppc_vand(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector and complement */
+extern void
+ppc_vandc(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector or */
+extern void
+ppc_vor(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector nor */
+extern void
+ppc_vnor(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector xor */
+extern void
+ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+
+/**
+ ** Vector shuffle / select / splat / etc
+ **/
+
+/** vector permute */
+extern void
+ppc_vperm(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC);
+
+/** vector select */
+extern void
+ppc_vsel(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC);
+
+/** vector splat byte */
+extern void
+ppc_vspltb(struct ppc_function *p, uint vD, uint vB, uint imm);
+
+/** vector splat half word */
+extern void
+ppc_vsplthw(struct ppc_function *p, uint vD, uint vB, uint imm);
+
+/** vector splat word */
+extern void
+ppc_vspltw(struct ppc_function *p, uint vD, uint vB, uint imm);
+
+
+#endif /* RTASM_PPC_H */
-- 
cgit v1.2.3


From b71f4150c8be662d777da22ed0554663a9d1c84d Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 12 Sep 2008 08:22:15 -0600
Subject: gallium: minor optimization to spe_load_int()

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index a04cc6c4ff..62e3adb357 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -505,7 +505,8 @@ spe_load_int(struct spe_function *p, unsigned rT, int i)
    }
    else {
       spe_ilhu(p, rT, i >> 16);
-      spe_iohl(p, rT, i & 0xffff);
+      if (i & 0xffff)
+         spe_iohl(p, rT, i & 0xffff);
    }
 }
 
-- 
cgit v1.2.3


From 31a112cad4d2e515bc668b58abd4e402b4362c70 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 12 Sep 2008 21:08:01 -0600
Subject: gallium: added spe_splat_word()

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 25 +++++++++++++++++++++++++
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h |  4 ++++
 2 files changed, 29 insertions(+)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 62e3adb357..89f8e24ce6 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -540,4 +540,29 @@ spe_zero(struct spe_function *p, unsigned rT)
 }
 
 
+void
+spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word)
+{
+   assert(word >= 0);
+   assert(word <= 3);
+
+   if (word == 0) {
+      int tmp1 = rT;   /* rT itself holds the shuffle pattern; no extra register is allocated */
+      spe_ila(p, tmp1, 66051);   /* 66051 == 0x00010203; NOTE(review): overwrites rT before rA is read -- rT == rA looks unsafe */
+      spe_shufb(p, rT, rA, rA, tmp1);
+   }
+   else {
+      /* XXX review this, we may not need the rotqbyi instruction */
+      int tmp1 = rT;
+      int tmp2 = spe_allocate_available_register(p);   /* NOTE(review): result is not checked before use */
+
+      spe_ila(p, tmp1, 66051);   /* 66051 == 0x00010203; NOTE(review): overwrites rT before rA is read -- rT == rA looks unsafe */
+      spe_rotqbyi(p, tmp2, rA, 4 * word);   /* byte count 4 * word; presumably moves the requested word into slot 0 -- TODO confirm */
+      spe_shufb(p, rT, tmp2, tmp2, tmp1);
+
+      spe_release_register(p, tmp2);
+   }
+}
+
+
 #endif /* GALLIUM_CELL */
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index d95e5aace3..7a3ab9ace5 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -308,6 +308,10 @@ spe_move(struct spe_function *p, unsigned rT, unsigned rA);
 extern void
 spe_zero(struct spe_function *p, unsigned rT);
 
+/** rT = splat(rA, word) */
+extern void
+spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word);
+
 
 /* Floating-point instructions
  */
-- 
cgit v1.2.3


From 8b5013d232bf6846717fac093465e8a39064e0b6 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 12 Sep 2008 21:52:47 -0600
Subject: gallium: added print/dump code to SPE code emitter

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 128 ++++++++++++++++++++++------
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h |  10 +++
 2 files changed, 113 insertions(+), 25 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 89f8e24ce6..8718be9ded 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -27,12 +27,16 @@
  * Real-time assembly generation interface for Cell B.E. SPEs.
  *
  * \author Ian Romanick <idr@us.ibm.com>
+ * \author Brian Paul
  */
 
+
+#include <stdio.h>
 #include "pipe/p_compiler.h"
 #include "util/u_memory.h"
 #include "rtasm_ppc_spe.h"
 
+
 #ifdef GALLIUM_CELL
 /**
  * SPE instruction types
@@ -143,8 +147,25 @@ union spe_inst_RI18 {
 /*@}*/
 
 
+static void
+indent(const struct spe_function *p)
+{
+   int i;
+   for (i = 0; i < p->indent; i++) {
+      putchar(' ');
+   }
+}
+
+
+static const char *
+rem_prefix(const char *longname)
+{
+   return longname + 4;   /* skips 4 chars; emitters pass __FUNCTION__, presumably to drop a "spe_" prefix -- assumes length >= 4 */
+}
+
+
 static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
-		    unsigned rA, unsigned rB)
+		    unsigned rA, unsigned rB, const char *name)
 {
     union spe_inst_RR inst;
     inst.inst.op = op;
@@ -153,11 +174,15 @@ static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rT = rT;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\tr%d, r%d, r%d\n", rem_prefix(name), rT, rA, rB);
+    }
 }
 
 
 static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
-		    unsigned rA, unsigned rB, unsigned rC)
+                     unsigned rA, unsigned rB, unsigned rC, const char *name)
 {
     union spe_inst_RRR inst;
     inst.inst.op = op;
@@ -167,11 +192,15 @@ static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rC = rC;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\tr%d, r%d, r%d, r%d\n", rem_prefix(name), rT, rA, rB, rB);
+    }
 }
 
 
 static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
-		     unsigned rA, int imm)
+		     unsigned rA, int imm, const char *name)
 {
     union spe_inst_RI7 inst;
     inst.inst.op = op;
@@ -180,12 +209,16 @@ static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rT = rT;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\tr%d, r%d, 0x%x\n", rem_prefix(name), rT, rA, imm);
+    }
 }
 
 
 static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
-		     unsigned rA, int imm)
+		     unsigned rA, int imm, const char *name)
 {
     union spe_inst_RI8 inst;
     inst.inst.op = op;
@@ -194,12 +227,16 @@ static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rT = rT;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\tr%d, r%d, 0x%x\n", rem_prefix(name), rT, rA, imm);
+    }
 }
 
 
 static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
-		      unsigned rA, int imm)
+		      unsigned rA, int imm, const char *name)
 {
     union spe_inst_RI10 inst;
     inst.inst.op = op;
@@ -208,11 +245,15 @@ static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rT = rT;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\tr%d, r%d, 0x%x\n", rem_prefix(name), rT, rA, imm);
+    }
 }
 
 
 static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
-		      int imm)
+		      int imm, const char *name)
 {
     union spe_inst_RI16 inst;
     inst.inst.op = op;
@@ -220,11 +261,15 @@ static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rT = rT;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\tr%d, 0x%x\n", rem_prefix(name), rT, imm);
+    }
 }
 
 
 static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
-		      int imm)
+		      int imm, const char *name)
 {
     union spe_inst_RI18 inst;
     inst.inst.op = op;
@@ -232,6 +277,10 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rT = rT;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\tr%d, 0x%x\n", rem_prefix(name), rT, imm);
+    }
 }
 
 
@@ -240,61 +289,61 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
 #define EMIT_(_name, _op) \
 void _name (struct spe_function *p, unsigned rT) \
 { \
-    emit_RR(p, _op, rT, 0, 0); \
+   emit_RR(p, _op, rT, 0, 0, __FUNCTION__); \
 }
 
 #define EMIT_R(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA) \
 { \
-    emit_RR(p, _op, rT, rA, 0); \
+   emit_RR(p, _op, rT, rA, 0, __FUNCTION__);                 \
 }
 
 #define EMIT_RR(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB) \
 { \
-    emit_RR(p, _op, rT, rA, rB); \
+   emit_RR(p, _op, rT, rA, rB, __FUNCTION__);                \
 }
 
 #define EMIT_RRR(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB, unsigned rC) \
 { \
-    emit_RRR(p, _op, rT, rA, rB, rC); \
+   emit_RRR(p, _op, rT, rA, rB, rC, __FUNCTION__);           \
 }
 
 #define EMIT_RI7(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
 { \
-    emit_RI7(p, _op, rT, rA, imm); \
+   emit_RI7(p, _op, rT, rA, imm, __FUNCTION__);              \
 }
 
 #define EMIT_RI8(_name, _op, bias) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
 { \
-    emit_RI8(p, _op, rT, rA, bias - imm); \
+   emit_RI8(p, _op, rT, rA, bias - imm, __FUNCTION__);       \
 }
 
 #define EMIT_RI10(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
 { \
-    emit_RI10(p, _op, rT, rA, imm); \
+   emit_RI10(p, _op, rT, rA, imm, __FUNCTION__);             \
 }
 
 #define EMIT_RI16(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, int imm) \
 { \
-    emit_RI16(p, _op, rT, imm); \
+   emit_RI16(p, _op, rT, imm, __FUNCTION__);                 \
 }
 
 #define EMIT_RI18(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, int imm) \
 { \
-    emit_RI18(p, _op, rT, imm); \
+   emit_RI18(p, _op, rT, imm, __FUNCTION__);                 \
 }
 
 #define EMIT_I16(_name, _op) \
 void _name (struct spe_function *p, int imm) \
 { \
-    emit_RI16(p, _op, 0, imm); \
+   emit_RI16(p, _op, 0, imm, __FUNCTION__);                  \
 }
 
 #include "rtasm_ppc_spe.h"
@@ -314,6 +363,9 @@ void spe_init_func(struct spe_function *p, unsigned code_size)
      */
     p->regs[0] = ~7;
     p->regs[1] = (1U << (80 - 64)) - 1;
+
+    p->print = false;
+    p->indent = 0;
 }
 
 
@@ -382,6 +434,32 @@ void spe_release_register(struct spe_function *p, int reg)
 }
 
 
+void
+spe_print_code(struct spe_function *p, boolean enable)
+{
+   p->print = enable;
+}
+
+
+void
+spe_indent(struct spe_function *p, int spaces)
+{
+   p->indent += spaces;
+}
+
+
+extern void
+spe_comment(struct spe_function *p, int rel_indent, const char *s)
+{
+   if (p->print) {
+      p->indent += rel_indent;
+      indent(p);
+      p->indent -= rel_indent;
+      printf("%s\n", s);
+   }
+}
+
+
 /**
  * For branch instructions:
  * \param d  if 1, disable interupts if branch is taken
@@ -392,51 +470,51 @@ void spe_release_register(struct spe_function *p, int reg)
 /** Branch Indirect to address in rA */
 void spe_bi(struct spe_function *p, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Interupt Return */
 void spe_iret(struct spe_function *p, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect and set link on external data */
 void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA, int d,
 		int e)
 {
-    emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect and set link.  Save PC in rT, jump to rA. */
 void spe_bisl(struct spe_function *p, unsigned rT, unsigned rA, int d,
 		int e)
 {
-    emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if zero word.  If rT.word[0]==0, jump to rA. */
 void spe_biz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if non-zero word.  If rT.word[0]!=0, jump to rA. */
 void spe_binz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if zero halfword.  If rT.halfword[1]==0, jump to rA. */
 void spe_bihz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if non-zero halfword.  If rT.halfword[1]!=0, jump to rA. */
 void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 7a3ab9ace5..2579045232 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -28,6 +28,7 @@
  * For details, see /opt/cell/sdk/docs/arch/SPU_ISA_v1.2_27Jan2007_pub.pdf
  *
  * \author Ian Romanick <idr@us.ibm.com>
+ * \author Brian Paul
  */
 
 #ifndef RTASM_PPC_SPE_H
@@ -63,8 +64,12 @@ struct spe_function
      * spe_release_register
      */
     uint64_t regs[SPE_NUM_REGS / 64];
+
+    boolean print; /**< print/dump instructions as they're emitted? */
+    int indent;    /**< number of spaces to indent */
 };
 
+
 extern void spe_init_func(struct spe_function *p, unsigned code_size);
 extern void spe_release_func(struct spe_function *p);
 
@@ -72,6 +77,11 @@ extern int spe_allocate_available_register(struct spe_function *p);
 extern int spe_allocate_register(struct spe_function *p, int reg);
 extern void spe_release_register(struct spe_function *p, int reg);
 
+extern void spe_print_code(struct spe_function *p, boolean enable);
+extern void spe_indent(struct spe_function *p, int spaces);
+extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
+
+
 #endif /* RTASM_PPC_SPE_H */
 
 #ifndef EMIT_
-- 
cgit v1.2.3


From 367774a62aa0627c5589e91ab7b411634113c815 Mon Sep 17 00:00:00 2001
From: Jonathan White <jwhite@tungstengraphics.com>
Date: Mon, 15 Sep 2008 11:56:21 -0600
Subject: Fixed emit_RRR

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 8718be9ded..74cd4176e7 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -194,7 +194,7 @@ static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
     assert(p->num_inst <= p->max_inst);
     if (p->print) {
        indent(p);
-       printf("%s\tr%d, r%d, r%d, r%d\n", rem_prefix(name), rT, rA, rB, rB);
+       printf("%s\tr%d, r%d, r%d, r%d\n", rem_prefix(name), rT, rA, rB, rC);
     }
 }
 
-- 
cgit v1.2.3


From ae3373441dd4548702f23fe44bd04830e4902241 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Mon, 15 Sep 2008 15:10:02 -0600
Subject: gallium: emit SPU instructions in assembler-compatible syntax

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 74cd4176e7..870ae802c5 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -176,7 +176,7 @@ static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
     assert(p->num_inst <= p->max_inst);
     if (p->print) {
        indent(p);
-       printf("%s\tr%d, r%d, r%d\n", rem_prefix(name), rT, rA, rB);
+       printf("%s\t$%d, $%d, $%d\n", rem_prefix(name), rT, rA, rB);
     }
 }
 
@@ -194,7 +194,7 @@ static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
     assert(p->num_inst <= p->max_inst);
     if (p->print) {
        indent(p);
-       printf("%s\tr%d, r%d, r%d, r%d\n", rem_prefix(name), rT, rA, rB, rC);
+       printf("%s\t$%d, $%d, $%d, $%d\n", rem_prefix(name), rT, rA, rB, rC);
     }
 }
 
@@ -211,7 +211,7 @@ static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
     assert(p->num_inst <= p->max_inst);
     if (p->print) {
        indent(p);
-       printf("%s\tr%d, r%d, 0x%x\n", rem_prefix(name), rT, rA, imm);
+       printf("%s\t$%d, $%d, 0x%x\n", rem_prefix(name), rT, rA, imm);
     }
 }
 
@@ -229,7 +229,7 @@ static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
     assert(p->num_inst <= p->max_inst);
     if (p->print) {
        indent(p);
-       printf("%s\tr%d, r%d, 0x%x\n", rem_prefix(name), rT, rA, imm);
+       printf("%s\t$%d, $%d, 0x%x\n", rem_prefix(name), rT, rA, imm);
     }
 }
 
@@ -247,7 +247,11 @@ static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
     assert(p->num_inst <= p->max_inst);
     if (p->print) {
        indent(p);
-       printf("%s\tr%d, r%d, 0x%x\n", rem_prefix(name), rT, rA, imm);
+       if (strcmp(name, "spe_lqd") == 0 ||
+           strcmp(name, "spe_stqd") == 0)
+          printf("%s\t$%d, 0x%x($%d)\n", rem_prefix(name), rT, imm, rA);
+       else
+          printf("%s\t$%d, $%d, 0x%x\n", rem_prefix(name), rT, rA, imm);
     }
 }
 
@@ -263,7 +267,7 @@ static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
     assert(p->num_inst <= p->max_inst);
     if (p->print) {
        indent(p);
-       printf("%s\tr%d, 0x%x\n", rem_prefix(name), rT, imm);
+       printf("%s\t$%d, 0x%x\n", rem_prefix(name), rT, imm);
     }
 }
 
@@ -279,7 +283,7 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
     assert(p->num_inst <= p->max_inst);
     if (p->print) {
        indent(p);
-       printf("%s\tr%d, 0x%x\n", rem_prefix(name), rT, imm);
+       printf("%s\t$%d, 0x%x\n", rem_prefix(name), rT, imm);
     }
 }
 
@@ -455,7 +459,7 @@ spe_comment(struct spe_function *p, int rel_indent, const char *s)
       p->indent += rel_indent;
       indent(p);
       p->indent -= rel_indent;
-      printf("%s\n", s);
+      printf("# %s\n", s);
    }
 }
 
-- 
cgit v1.2.3


From f8bba34d4e12ef4c620cac881a4b697a1e668377 Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Thu, 18 Sep 2008 01:29:41 -0600
Subject: CELL: finish fragment ops blending (except for unusual D3D modes)

- Added new "macro" functions spe_float_min() and spe_float_max()
  to rtasm_ppc_spe.{ch}.  These emit instructions that cause
  the minimum or maximum of each element in a vector of floats
  to be saved in the destination register.

- Major changes to cell_gen_fragment.c to implement all the blending
  modes (except for the mysterious D3D-based PIPE_BLENDFACTOR_SRC1_COLOR,
  PIPE_BLENDFACTOR_SRC1_ALPHA, PIPE_BLENDFACTOR_INV_SRC1_COLOR, and
  PIPE_BLENDFACTOR_INV_SRC1_ALPHA).

- Some revamping of code in cell_gen_fragment.c: use the new spe_float_min()
  and spe_float_max() functions (instead of expanding these calculations
  inline via macros); create and use an inline utility function for handling
  "optional" register allocation (for the {1,1,1,1} vector, and the
  blend color vectors) instead of expanding with macros; use the Floating
  Negative Multiply and Subtract (fnms) instruction to simplify and
  optimize many blending calculations.
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c      |  41 +-
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h      |   8 +
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 546 ++++++++++++++---------
 3 files changed, 377 insertions(+), 218 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 870ae802c5..12e0826fb9 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -384,7 +384,7 @@ void spe_release_func(struct spe_function *p)
 
 
 /**
- * Alloate a SPE register.
+ * Allocate a SPE register.
  * \return register index or -1 if none left.
  */
 int spe_allocate_available_register(struct spe_function *p)
@@ -646,5 +646,44 @@ spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word)
    }
 }
 
+/* For each 32-bit float element of rA and rB, choose the smaller of the
+ * two, compositing them into the rT register.
+ * 
+ * The Float Compare Greater Than (fcgt) instruction will put 1s into
+ * compare_reg where rA > rB, and 0s where rA <= rB.
+ *
+ * Then the Select Bits (selb) instruction will take bits from rA where
+ * compare_reg is 0, and from rB where compare_reg is 1; i.e., from rA
+ * where rA <= rB and from rB where rA > rB, which is exactly the
+ * "min" operation.
+ *
+ * The compare_reg could in many cases be the same as rT, unless
+ * rT == rA || rT == rB.  But since this is common in constructions
+ * like "x = min(x, a)", we always allocate a new register to be safe.
+ */
+void 
+spe_float_min(struct spe_function *p, unsigned int rT, unsigned int rA, unsigned int rB)
+{
+   unsigned int compare_reg = spe_allocate_available_register(p);
+   spe_fcgt(p, compare_reg, rA, rB);
+   spe_selb(p, rT, rA, rB, compare_reg);
+   spe_release_register(p, compare_reg);
+}
+
+/* For each 32-bit float element of rA and rB, choose the greater of the
+ * two, compositing them into the rT register.
+ * 
+ * The logic is similar to that of spe_float_min() above; the only
+ * difference is that the registers on spe_selb() have been reversed,
+ * so that the larger of the two is selected instead of the smaller.
+ */
+void 
+spe_float_max(struct spe_function *p, unsigned int rT, unsigned int rA, unsigned int rB)
+{
+   unsigned int compare_reg = spe_allocate_available_register(p);
+   spe_fcgt(p, compare_reg, rA, rB);
+   spe_selb(p, rT, rB, rA, compare_reg);
+   spe_release_register(p, compare_reg);
+}
 
 #endif /* GALLIUM_CELL */
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 2579045232..4ef05ea27d 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -322,6 +322,14 @@ spe_zero(struct spe_function *p, unsigned rT);
 extern void
 spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word);
 
+/** rT = float min(rA, rB) */
+extern void
+spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB);
+
+/** rT = float max(rA, rB) */
+extern void
+spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB);
+
 
 /* Floating-point instructions
  */
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 2c80dd712e..9d25e820ad 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -229,35 +229,26 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
    spe_release_register(f, amask_reg);
 }
 
-/* This is a convenient and oft-used sequence.  It chooses
- * the smaller of each element of reg1 and reg2, and combines them
- * into the result register, as follows:
- * 
- * The Float Compare Greater Than (fcgt) instruction will put
- * 1s into compare_reg where reg1 > reg2, and 0s where reg1 <= reg2.
- *
- * Then the Select Bits (selb) instruction will take bits from
- * reg1 where compare_reg is 0, and from reg2 where compare_reg is
- * 1.  Ergo, result_reg will have the bits from reg1 where reg1 <= reg2,
- * and the bits from reg2 where reg1 > reg2, which is exactly the
- * MIN operation.
+/* This pair of functions is used inline to allocate and deallocate
+ * optional constant registers.  Once a constant is discovered to be 
+ * needed, we will likely need it again, so we don't want to deallocate
+ * it and have to allocate and load it again unnecessarily.
  */
-#define FLOAT_VECTOR_MIN(f, result_reg, reg1, reg2) {\
-   int compare_reg = spe_allocate_available_register(f); \
-   spe_fcgt(f, compare_reg, reg1, reg2); \
-   spe_selb(f, result_reg, reg1, reg2, compare_reg); \
-   spe_release_register(f, compare_reg); \
+static inline void
+setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value)
+{
+   if (*is_already_set) return;
+   *r = spe_allocate_available_register(f);
+   spe_load_float(f, *r, value);
+   *is_already_set = true;
 }
 
-/* The FLOAT_VECTOR_MAX sequence is similar to the FLOAT_VECTOR_MIN 
- * sequence above, except that the registers specified when selecting
- * bits are reversed.
- */
-#define FLOAT_VECTOR_MAX(f, result_reg, reg1, reg2) {\
-   int compare_reg = spe_allocate_available_register(f); \
-   spe_fcgt(f, compare_reg, reg1, reg2); \
-   spe_selb(f, result_reg, reg2, reg1, compare_reg); \
-   spe_release_register(f, compare_reg); \
+static inline void
+release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+{
+    if (!*is_already_set) return;
+    spe_release_register(f, r);
+    *is_already_set = false;
 }
 
 /**
@@ -294,51 +285,15 @@ gen_blend(const struct pipe_blend_state *blend,
 
    int tmp_reg = spe_allocate_available_register(f);
 
-   /* These values might or might not eventually get put into
-    * registers.  We avoid allocating them and setting them until
-    * they're actually needed; then we avoid setting them more than
-    * once, and release them at the end of code generation.
+   /* Optional constant registers we might or might not end up using;
+    * if we do use them, make sure we only allocate them once by
+    * keeping a flag on each one.
     */
-   boolean one_reg_set = false; 
-   int one_reg;
-#define SET_ONE_REG_IF_UNSET(f) if (!one_reg_set) {\
-   one_reg = spe_allocate_available_register(f); \
-   spe_load_float(f, one_reg, 1.0f); \
-   one_reg_set = true; \
-}
-#define RELEASE_ONE_REG_IF_USED(f) if (one_reg_set) {\
-   spe_release_register(f, one_reg); \
-}
-  
-   boolean const_color_set = false;
-   int constR_reg, constG_reg, constB_reg;
-#define SET_CONST_COLOR_IF_UNSET(f, blend_color) if (!const_color_set) {\
-   constR_reg = spe_allocate_available_register(f); \
-   constG_reg = spe_allocate_available_register(f); \
-   constG_reg = spe_allocate_available_register(f); \
-   spe_load_float(f, constR_reg, blend_color->color[0]); \
-   spe_load_float(f, constG_reg, blend_color->color[1]); \
-   spe_load_float(f, constB_reg, blend_color->color[2]); \
-   const_color_set = true;\
-}
-#define RELEASE_CONST_COLOR_IF_USED(f) if (const_color_set) {\
-   spe_release_register(f, constR_reg); \
-   spe_release_register(f, constG_reg); \
-   spe_release_register(f, constB_reg); \
-}
-
-   boolean const_alpha_set = false;
-   int constA_reg;
-#define SET_CONST_ALPHA_IF_UNSET(f, blend_color) if (!const_alpha_set) {\
-   constA_reg = spe_allocate_available_register(f); \
-   spe_load_float(f, constA_reg, blend_color->color[3]); \
-   const_alpha_set = true; \
-}
-#define RELEASE_CONST_ALPHA_IF_USED(f) if (const_alpha_set) {\
-   spe_release_register(f, constA_reg); \
-}
-
-   /* Real code starts here */
+   boolean one_reg_set = false;
+   unsigned int one_reg;
+   boolean constR_reg_set = false, constG_reg_set = false, 
+      constB_reg_set = false, constA_reg_set = false;
+   unsigned int constR_reg, constG_reg, constB_reg, constA_reg;
 
    ASSERT(blend->blend_enable);
 
@@ -419,10 +374,11 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_release_register(f, mask_reg);
    }
 
-
    /*
     * Compute Src RGB terms.  We're actually looking for the value
-    * of (the appropriate RGB factors) * (the incoming source RGB color).
+    * of (the appropriate RGB factors) * (the incoming source RGB color),
+    * because in some cases (like PIPE_BLENDFACTOR_ONE and 
+    * PIPE_BLENDFACTOR_ZERO) we can avoid doing unnecessary math.
     */
    switch (blend->rgb_src_factor) {
    case PIPE_BLENDFACTOR_ONE:
@@ -450,18 +406,13 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_fm(f, term1B_reg, fragB_reg, fragA_reg);
       break;
    case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-      /* factors = (1-R,1-G,1-B), so term = (R*(1-R), G*(1-G), B*(1-B)) */
-      /* we'll need the optional constant {1,1,1,1} register */
-      SET_ONE_REG_IF_UNSET(f)
-      /* tmp = 1 - R */
-      spe_fs(f, tmp_reg, one_reg, fragR_reg);
-      /* term = R * tmp */
-      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
-      /* repeat for G and B */
-      spe_fs(f, tmp_reg, one_reg, fragG_reg);
-      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
-      spe_fs(f, tmp_reg, one_reg, fragB_reg);
-      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      /* factors = (1-R,1-G,1-B), so term = (R*(1-R), G*(1-G), B*(1-B)) 
+       * or in other words term = (R-R*R, G-G*G, B-B*B)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, fragR_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, fragG_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, fragB_reg, fragB_reg);
       break;
    case PIPE_BLENDFACTOR_DST_COLOR:
       /* factors = (Rfb,Gfb,Bfb), so term = (R*Rfb, G*Gfb, B*Bfb) */
@@ -470,30 +421,22 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_fm(f, term1B_reg, fragB_reg, fbB_reg);
       break;
    case PIPE_BLENDFACTOR_INV_DST_COLOR:
-      /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (R*(1-Rfb),G*(1-Gfb),B*(1-Bfb)) */
-      /* we'll need the optional constant {1,1,1,1} register */
-      SET_ONE_REG_IF_UNSET(f)
-      /* tmp = 1 - Rfb */
-      spe_fs(f, tmp_reg, one_reg, fbR_reg);
-      /* term = R * tmp */
-      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
-      /* repeat for G and B */
-      spe_fs(f, tmp_reg, one_reg, fbG_reg);
-      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
-      spe_fs(f, tmp_reg, one_reg, fbB_reg);
-      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (R*(1-Rfb),G*(1-Gfb),B*(1-Bfb))
+       * or term = (R-R*Rfb, G-G*Gfb, B-B*Bfb)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, fbR_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, fbG_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, fbB_reg, fragB_reg);
       break;
    case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-      /* factors = (1-A,1-A,1-A), so term = (R*(1-A),G*(1-A),B*(1-A)) */
-      /* we'll need the optional constant {1,1,1,1} register */
-      SET_ONE_REG_IF_UNSET(f)
-      /* tmp = 1 - A */
-      spe_fs(f, tmp_reg, one_reg, fragA_reg);
-      /* term = R * tmp */
-      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
-      /* repeat for G and B with the same (1-A) factor */
-      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
-      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      /* factors = (1-A,1-A,1-A), so term = (R*(1-A),G*(1-A),B*(1-A))
+       * or term = (R-R*A,G-G*A,B-B*A)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, fragA_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, fragA_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, fragA_reg, fragB_reg);
       break;
    case PIPE_BLENDFACTOR_DST_ALPHA:
       /* factors = (Afb, Afb, Afb), so term = (R*Afb, G*Afb, B*Afb) */
@@ -502,19 +445,19 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_fm(f, term1B_reg, fragB_reg, fbA_reg);
       break;
    case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-      /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (R*(1-Afb),G*(1-Afb),B*(1-Afb)) */
-      /* we'll need the optional constant {1,1,1,1} register */
-      SET_ONE_REG_IF_UNSET(f)
-      /* tmp = 1 - A */
-      spe_fs(f, tmp_reg, one_reg, fbA_reg);
-      /* term = R * tmp, G*tmp, and B*tmp */
-      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
-      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
-      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (R*(1-Afb),G*(1-Afb),B*(1-Afb)) 
+       * or term = (R-R*Afb,G-G*Afb,B-B*Afb)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, fbA_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, fbA_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, fbA_reg, fragB_reg);
       break;
    case PIPE_BLENDFACTOR_CONST_COLOR:
-      /* We'll need the optional blend color registers */
-      SET_CONST_COLOR_IF_UNSET(f,blend_color)
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
       /* now, factor = (Rc,Gc,Bc), so term = (R*Rc,G*Gc,B*Bc) */
       spe_fm(f, term1R_reg, fragR_reg, constR_reg);
       spe_fm(f, term1G_reg, fragG_reg, constG_reg);
@@ -522,55 +465,61 @@ gen_blend(const struct pipe_blend_state *blend,
       break;
    case PIPE_BLENDFACTOR_CONST_ALPHA:
       /* we'll need the optional constant alpha register */
-      SET_CONST_ALPHA_IF_UNSET(f, blend_color)
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
       /* factor = (Ac,Ac,Ac), so term = (R*Ac,G*Ac,B*Ac) */
       spe_fm(f, term1R_reg, fragR_reg, constA_reg);
       spe_fm(f, term1G_reg, fragG_reg, constA_reg);
       spe_fm(f, term1B_reg, fragB_reg, constA_reg);
       break;
    case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-      /* We need both the optional {1,1,1,1} register, and the optional
-       * constant color registers
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* factor = (1-Rc,1-Gc,1-Bc), so term = (R*(1-Rc),G*(1-Gc),B*(1-Bc)) 
+       * or term = (R-R*Rc, G-G*Gc, B-B*Bc)
+       * fnms(a,b,c,d) computes a = d - b*c
        */
-      SET_ONE_REG_IF_UNSET(f)
-      SET_CONST_COLOR_IF_UNSET(f, blend_color)
-      /* factor = (1-Rc,1-Gc,1-Bc), so term = (R*(1-Rc),G*(1-Gc),B*(1-Bc)) */
-      spe_fs(f, tmp_reg, one_reg, constR_reg);
-      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
-      spe_fs(f, tmp_reg, one_reg, constG_reg);
-      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
-      spe_fs(f, tmp_reg, one_reg, constB_reg);
-      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      spe_fnms(f, term1R_reg, fragR_reg, constR_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, constG_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, constB_reg, fragB_reg);
       break;
    case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-      /* We need the optional {1,1,1,1} register and the optional 
-       * constant alpha register
+      /* XXX sets up the constant color registers, but constA_reg is used below */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* factor = (1-Ac,1-Ac,1-Ac), so term = (R*(1-Ac),G*(1-Ac),B*(1-Ac))
+       * or term = (R-R*Ac,G-G*Ac,B-B*Ac)
+       * fnms(a,b,c,d) computes a = d - b*c
        */
-      SET_ONE_REG_IF_UNSET(f)
-      SET_CONST_ALPHA_IF_UNSET(f, blend_color)
-      /* factor = (1-Ac,1-Ac,1-Ac), so term = (R*(1-Ac),G*(1-Ac),B*(1-Ac)) */
-      spe_fs(f, tmp_reg, one_reg, constA_reg);
-      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
-      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
-      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      spe_fnms(f, term1R_reg, fragR_reg, constA_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, constA_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, constA_reg, fragB_reg);
       break;
    case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
       /* We'll need the optional {1,1,1,1} register */
-      SET_ONE_REG_IF_UNSET(f)
+      setup_const_register(f, &one_reg_set, &one_reg, 1.0f);
       /* factor = (min(A,1-Afb),min(A,1-Afb),min(A,1-Afb)), so 
        * term = (R*min(A,1-Afb), G*min(A,1-Afb), B*min(A,1-Afb))
+       * We could expand the term (as a*min(b,c) == min(a*b,a*c)
+       * as long as a is positive), but then we'd have to do three
+       * spe_float_min() functions instead of one, so this is simpler.
        */
       /* tmp = 1 - Afb */
       spe_fs(f, tmp_reg, one_reg, fbA_reg);
       /* tmp = min(A,tmp) */
-      FLOAT_VECTOR_MIN(f, tmp_reg, fragA_reg, tmp_reg)
+      spe_float_min(f, tmp_reg, fragA_reg, tmp_reg);
       /* term = R*tmp */
       spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
       spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
       spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
       break;
 
-      /* non-OpenGL cases? */
+      /* These are special D3D cases involving a second color output
+       * from the fragment shader.  I'm not sure we can support them
+       * yet... XXX
+       */
    case PIPE_BLENDFACTOR_SRC1_COLOR:
    case PIPE_BLENDFACTOR_SRC1_ALPHA:
    case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
@@ -581,132 +530,293 @@ gen_blend(const struct pipe_blend_state *blend,
    }
 
    /*
-    * Compute Src Alpha term
+    * Compute Src Alpha term.  Like the above, we're looking for
+    * the full term A*factor, not just the factor itself, because
+    * in many cases we can avoid doing unnecessary multiplies.
     */
    switch (blend->alpha_src_factor) {
+   case PIPE_BLENDFACTOR_ZERO:
+      /* factor = 0, so term = 0 */
+      spe_load_float(f, term1A_reg, 0.0f);
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* fall through */
    case PIPE_BLENDFACTOR_ONE:
+      /* factor = 1, so term = A */
       spe_move(f, term1A_reg, fragA_reg);
       break;
+
    case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* factor = A, so term = A*A */
       spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
       break;
    case PIPE_BLENDFACTOR_SRC_ALPHA:
       spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
       break;
-      /* XXX more cases */
+
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* factor = 1-A, so term = A*(1-A) = A-A*A */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term1A_reg, fragA_reg, fragA_reg, fragA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* factor = Afb, so term = A*Afb */
+      spe_fm(f, term1A_reg, fragA_reg, fbA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* factor = 1-Afb, so term = A*(1-Afb) = A - A*Afb */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term1A_reg, fragA_reg, fbA_reg, fragA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* We need the optional constA_reg register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = Ac, so term = A*Ac */
+      spe_fm(f, term1A_reg, fragA_reg, constA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* We need the optional constA_reg register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = 1-Ac, so term = A*(1-Ac) = A-A*Ac */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term1A_reg, fragA_reg, constA_reg, fragA_reg);
+      break;
+
+      /* These are special D3D cases involving a second color output
+       * from the fragment shader.  I'm not sure we can support them
+       * yet... XXX
+       */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
    default:
       ASSERT(0);
    }
 
    /*
-    * Compute Dest RGB terms
+    * Compute Dest RGB term.  Like the above, we're looking for
+    * the full term (Rfb,Gfb,Bfb)*(factor), not just the factor itself, because
+    * in many cases we can avoid doing unnecessary multiplies.
     */
    switch (blend->rgb_dst_factor) {
    case PIPE_BLENDFACTOR_ONE:
+      /* factors = (1,1,1), so term = (Rfb,Gfb,Bfb) */
       spe_move(f, term2R_reg, fbR_reg);
       spe_move(f, term2G_reg, fbG_reg);
       spe_move(f, term2B_reg, fbB_reg);
       break;
    case PIPE_BLENDFACTOR_ZERO:
-      spe_zero(f, term2R_reg);
-      spe_zero(f, term2G_reg);
-      spe_zero(f, term2B_reg);
+      /* factors = (0,0,0), so term = (0,0,0) */
+      spe_load_float(f, term2R_reg, 0.0f);
+      spe_load_float(f, term2G_reg, 0.0f);
+      spe_load_float(f, term2B_reg, 0.0f);
       break;
    case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* factors = (R,G,B), so term = (R*Rfb, G*Gfb, B*Bfb) */
       spe_fm(f, term2R_reg, fbR_reg, fragR_reg);
       spe_fm(f, term2G_reg, fbG_reg, fragG_reg);
       spe_fm(f, term2B_reg, fbB_reg, fragB_reg);
       break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* factors = (1-R,1-G,1-B), so term = (Rfb*(1-R), Gfb*(1-G), Bfb*(1-B)) 
+       * or in other words term = (Rfb-Rfb*R, Gfb-Gfb*G, Bfb-Bfb*B)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fragR_reg, fbR_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fragG_reg, fbG_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fragB_reg, fbB_reg, fbB_reg);
+      break;
    case PIPE_BLENDFACTOR_SRC_ALPHA:
+      /* factors = (A,A,A), so term = (Rfb*A, Gfb*A, Bfb*A) */
       spe_fm(f, term2R_reg, fbR_reg, fragA_reg);
       spe_fm(f, term2G_reg, fbG_reg, fragA_reg);
       spe_fm(f, term2B_reg, fbB_reg, fragA_reg);
       break;
    case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-#if 0
-      /* one = {1.0, 1.0, 1.0, 1.0} */
-      if (!one_reg_set) {
-         one_reg = spe_allocate_available_register(f);
-         spe_load_float(f, one_reg, 1.0f);
-         one_reg_set = true;
-      }
-      /* tmp = one - fragA */
-      spe_fs(f, tmp_reg, one_reg, fragA_reg);
-      /* term = fb * tmp */
-      spe_fm(f, term2R_reg, fbR_reg, tmp_reg);
-      spe_fm(f, term2G_reg, fbG_reg, tmp_reg);
-      spe_fm(f, term2B_reg, fbB_reg, tmp_reg);
-#else
-      /* Compute:  term2x = fbx * (1.0 - fragA)
-       * Which is:  term2x = fbx - fbx * fragA
-       * Use fnms t,a,b,c which computes t=c-a*b
-       */
+      /* factors = (1-A,1-A,1-A) so term = (Rfb-Rfb*A,Gfb-Gfb*A,Bfb-Bfb*A) */
+      /* fnms(a,b,c,d) computes a = d - b*c */
       spe_fnms(f, term2R_reg, fbR_reg, fragA_reg, fbR_reg);
       spe_fnms(f, term2G_reg, fbG_reg, fragA_reg, fbG_reg);
       spe_fnms(f, term2B_reg, fbB_reg, fragA_reg, fbB_reg);
-#endif
       break;
-      /* XXX more cases */
-      // GL_ONE_MINUS_SRC_COLOR
-      // GL_DST_COLOR
-      // GL_ONE_MINUS_DST_COLOR
-      // GL_DST_ALPHA
-      // GL_CONSTANT_COLOR
-      // GL_ONE_MINUS_CONSTANT_COLOR
-      // GL_CONSTANT_ALPHA
-      // GL_ONE_MINUS_CONSTANT_ALPHA
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* factors = (Rfb,Gfb,Bfb), so term = (Rfb*Rfb, Gfb*Gfb, Bfb*Bfb) */
+      spe_fm(f, term2R_reg, fbR_reg, fbR_reg);
+      spe_fm(f, term2G_reg, fbG_reg, fbG_reg);
+      spe_fm(f, term2B_reg, fbB_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (Rfb*(1-Rfb),Gfb*(1-Gfb),Bfb*(1-Bfb))
+       * or term = (Rfb-Rfb*Rfb, Gfb-Gfb*Gfb, Bfb-Bfb*Bfb)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, fbR_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, fbG_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, fbB_reg, fbB_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      /* factors = (Afb, Afb, Afb), so term = (Rfb*Afb, Gfb*Afb, Bfb*Afb) */
+      spe_fm(f, term2R_reg, fbR_reg, fbA_reg);
+      spe_fm(f, term2G_reg, fbG_reg, fbA_reg);
+      spe_fm(f, term2B_reg, fbB_reg, fbA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (Rfb*(1-Afb),Gfb*(1-Afb),Bfb*(1-Afb)) 
+       * or term = (Rfb-Rfb*Afb,Gfb-Gfb*Afb,Bfb-Bfb*Afb)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, fbA_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, fbA_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, fbA_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* now, factor = (Rc,Gc,Bc), so term = (Rfb*Rc,Gfb*Gc,Bfb*Bc) */
+      spe_fm(f, term2R_reg, fbR_reg, constR_reg);
+      spe_fm(f, term2G_reg, fbG_reg, constG_reg);
+      spe_fm(f, term2B_reg, fbB_reg, constB_reg);
+      break;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      /* we'll need the optional constant alpha register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = (Ac,Ac,Ac), so term = (Rfb*Ac,Gfb*Ac,Bfb*Ac) */
+      spe_fm(f, term2R_reg, fbR_reg, constA_reg);
+      spe_fm(f, term2G_reg, fbG_reg, constA_reg);
+      spe_fm(f, term2B_reg, fbB_reg, constA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* factor = (1-Rc,1-Gc,1-Bc), so term = (Rfb*(1-Rc),Gfb*(1-Gc),Bfb*(1-Bc)) 
+       * or term = (Rfb-Rfb*Rc, Gfb-Gfb*Gc, Bfb-Bfb*Bc)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, constR_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, constG_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, constB_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      /* XXX sets up the constant color registers, but constA_reg is used below */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* factor = (1-Ac,1-Ac,1-Ac), so term = (Rfb*(1-Ac),Gfb*(1-Ac),Bfb*(1-Ac))
+       * or term = (Rfb-Rfb*Ac,Gfb-Gfb*Ac,Bfb-Bfb*Ac)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, constA_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, constA_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, constA_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest RGB */
+      ASSERT(0);
+      break;
+
+      /* These are special D3D cases involving a second color output
+       * from the fragment shader.  I'm not sure we can support them
+       * yet... XXX
+       */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+
    default:
       ASSERT(0);
    }
 
    /*
-    * Compute Dest Alpha term
+    * Compute Dest Alpha term.  Like the above, we're looking for
+    * the full term Afb*factor, not just the factor itself, because
+    * in many cases we can avoid doing unnecessary multiplies.
     */
    switch (blend->alpha_dst_factor) {
    case PIPE_BLENDFACTOR_ONE:
+      /* factor = 1, so term = Afb */
       spe_move(f, term2A_reg, fbA_reg);
       break;
    case PIPE_BLENDFACTOR_ZERO:
-      spe_zero(f, term2A_reg);
+      /* factor = 0, so term = 0 */
+      spe_load_float(f, term2A_reg, 0.0f);
       break;
-   case PIPE_BLENDFACTOR_SRC_ALPHA:
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* factor = A, so term = Afb*A */
       spe_fm(f, term2A_reg, fbA_reg, fragA_reg);
       break;
-   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-#if 0
-      /* one = {1.0, 1.0, 1.0, 1.0} */
-      if (!one_reg_set) {
-         one_reg = spe_allocate_available_register(f);
-         spe_load_float(f, one_reg, 1.0f);
-         one_reg_set = true;
-      }
-      /* tmp = one - fragA */
-      spe_fs(f, tmp_reg, one_reg, fragA_reg);
-      /* termA = fbA * tmp */
-      spe_fm(f, term2A_reg, fbA_reg, tmp_reg);
-#else
-      /* Compute:  term2A = fbA * (1.0 - fragA)
-       * Which is:  term2A = fbA - fbA * fragA
-       * Use fnms t,a,b,c which computes t=c-a*b
-       */
+
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* factor = 1-A, so term = Afb*(1-A) = Afb-Afb*A */
+      /* fnms(a,b,c,d) computes a = d - b*c */
       spe_fnms(f, term2A_reg, fbA_reg, fragA_reg, fbA_reg);
-#endif
       break;
-      /* XXX more cases */
-      // GL_ONE_MINUS_SRC_COLOR
-      // GL_DST_COLOR
-      // GL_ONE_MINUS_DST_COLOR
-      // GL_DST_ALPHA
-      // GL_CONSTANT_COLOR
-      // GL_ONE_MINUS_CONSTANT_COLOR
-      // GL_CONSTANT_ALPHA
-      // GL_ONE_MINUS_CONSTANT_ALPHA
+
+   case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* factor = Afb, so term = Afb*Afb */
+      spe_fm(f, term2A_reg, fbA_reg, fbA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* factor = 1-Afb, so term = Afb*(1-Afb) = Afb - Afb*Afb */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term2A_reg, fbA_reg, fbA_reg, fbA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* We need the optional constA_reg register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = Ac, so term = Afb*Ac */
+      spe_fm(f, term2A_reg, fbA_reg, constA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* We need the optional constA_reg register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = 1-Ac, so term = Afb*(1-Ac) = Afb-Afb*Ac */
+      /* fnms(a,b,c,d) computes a = d - b*c */
+      spe_fnms(f, term2A_reg, fbA_reg, constA_reg, fbA_reg);
+      break;
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest alpha */
+      ASSERT(0);
+      break;
+
+      /* These are special D3D cases involving a second color output
+       * from the fragment shader.  I'm not sure we can support them
+       * yet... XXX
+       */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
    default:
       ASSERT(0);
    }
 
    /*
-    * Combine Src/Dest RGB terms
+    * Combine Src/Dest RGB terms as per the blend equation.
     */
    switch (blend->rgb_func) {
    case PIPE_BLEND_ADD:
@@ -725,14 +835,14 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_fs(f, fragB_reg, term2B_reg, term1B_reg);
       break;
    case PIPE_BLEND_MIN:
-      FLOAT_VECTOR_MIN(f, fragR_reg, term1R_reg, term2R_reg)
-      FLOAT_VECTOR_MIN(f, fragG_reg, term1G_reg, term2G_reg)
-      FLOAT_VECTOR_MIN(f, fragB_reg, term1B_reg, term2B_reg)
+      spe_float_min(f, fragR_reg, term1R_reg, term2R_reg);
+      spe_float_min(f, fragG_reg, term1G_reg, term2G_reg);
+      spe_float_min(f, fragB_reg, term1B_reg, term2B_reg);
       break;
    case PIPE_BLEND_MAX:
-      FLOAT_VECTOR_MAX(f, fragR_reg, term1R_reg, term2R_reg)
-      FLOAT_VECTOR_MAX(f, fragG_reg, term1G_reg, term2G_reg)
-      FLOAT_VECTOR_MAX(f, fragB_reg, term1B_reg, term2B_reg)
+      spe_float_max(f, fragR_reg, term1R_reg, term2R_reg);
+      spe_float_max(f, fragG_reg, term1G_reg, term2G_reg);
+      spe_float_max(f, fragB_reg, term1B_reg, term2B_reg);
       break;
    default:
       ASSERT(0);
@@ -752,10 +862,10 @@ gen_blend(const struct pipe_blend_state *blend,
       spe_fs(f, fragA_reg, term2A_reg, term1A_reg);
       break;
    case PIPE_BLEND_MIN:
-      FLOAT_VECTOR_MIN(f, fragA_reg, term1A_reg, term2A_reg)
+      spe_float_min(f, fragA_reg, term1A_reg, term2A_reg);
       break;
    case PIPE_BLEND_MAX:
-      FLOAT_VECTOR_MAX(f, fragA_reg, term1A_reg, term2A_reg)
+      spe_float_max(f, fragA_reg, term1A_reg, term2A_reg);
       break;
    default:
       ASSERT(0);
@@ -779,9 +889,11 @@ gen_blend(const struct pipe_blend_state *blend,
    spe_release_register(f, tmp_reg);
 
    /* Free any optional registers that actually got used */
-   RELEASE_ONE_REG_IF_USED(f)
-   RELEASE_CONST_COLOR_IF_USED(f)
-   RELEASE_CONST_ALPHA_IF_USED(f)
+   release_const_register(f, &one_reg_set, one_reg);
+   release_const_register(f, &constR_reg_set, constR_reg);
+   release_const_register(f, &constG_reg_set, constG_reg);
+   release_const_register(f, &constB_reg_set, constB_reg);
+   release_const_register(f, &constA_reg_set, constA_reg);
 }
 
 
-- 
cgit v1.2.3


From a57fbe53dcb54694da9c9b4be1533c9d800079d2 Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Fri, 19 Sep 2008 01:55:00 -0600
Subject: CELL: add codegen for logic op, color mask

- rtasm_ppc_spe.c, rtasm_ppc_spe.h: added a new macro function
  "spe_load_uint" for loading and splatting unsigned integers
  in a register; it will use "ila" for values 18 bits or less,
  "ilh" for word values that are symmetric across halfwords,
  "ilhu" for values that have zeroes in their bottom halfwords,
  or "ilhu" followed by "iohl" for general 32-bit values.

  Of the 15 color masks of interest, 4 are 18 bits or less,
  2 are symmetric across halfwords, 3 are zero in the bottom
  halfword, and 6 require two instructions to load.

- cell_gen_fragment.c: added full codegen for logic op and
  color mask.
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c      |  23 +++-
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h      |   4 +
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 143 ++++++++++++++++++++++-
 3 files changed, 163 insertions(+), 7 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 12e0826fb9..f60bfba3f5 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -592,11 +592,32 @@ spe_load_int(struct spe_function *p, unsigned rT, int i)
    }
 }
 
+void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
+{
+   /* If the whole value is in the lower 18 bits, use ila, which
+    * doesn't sign-extend.  Otherwise, if the two halfwords of
+    * the constant are identical, use ilh.  Otherwise, we have
+    * to use ilhu followed by iohl.
+    */
+   if ((ui & 0xfffc0000) == ui) {
+      spe_ila(p, rT, ui);
+   }
+   else if ((ui >> 16) == (ui & 0xffff)) {
+      spe_ilh(p, rT, ui & 0xffff);
+   }
+   else {
+      spe_ilhu(p, rT, ui >> 16);
+      if (ui & 0xffff)
+         spe_iohl(p, rT, ui & 0xffff);
+   }
+}
+
 
 void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
 {
-   spe_ila(p, rT, 66051);
+   /* Duplicate bytes 0, 1, 2, and 3 across the whole register */
+   spe_ila(p, rT, 0x00010203);
    spe_shufb(p, rT, rA, rA, rT);
 }
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 4ef05ea27d..09400b3fb2 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -302,6 +302,10 @@ spe_load_float(struct spe_function *p, unsigned rT, float x);
 extern void
 spe_load_int(struct spe_function *p, unsigned rT, int i);
 
+/** Load/splat immediate unsigned int into rT. */
+extern void
+spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui);
+
 /** Replicate word 0 of rA across rT. */
 extern void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA);
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 9d25e820ad..899d8423b2 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -902,8 +902,69 @@ gen_logicop(const struct pipe_blend_state *blend,
             struct spe_function *f,
             int fragRGBA_reg, int fbRGBA_reg)
 {
-   /* XXX to-do */
-   /* operate on 32-bit packed pixels, not float colors */
+   /* We've got four 32-bit RGBA packed pixels in each of
+    * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
+    * reds, greens, blues, and alphas.
+    * */
+   ASSERT(blend->logicop_enable);
+
+   switch(blend->logicop_func) {
+      case PIPE_LOGICOP_CLEAR: /* 0 */
+         spe_zero(f, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_NOR: /* ~(s | d) */
+         spe_nor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_AND_INVERTED: /* ~s & d */
+         /* andc R, A, B computes R = A & ~B */
+         spe_andc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_COPY_INVERTED: /* ~s */
+         spe_complement(f, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_AND_REVERSE: /* s & ~d */
+         /* andc R, A, B computes R = A & ~B */
+         spe_andc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_INVERT: /* ~d */
+         /* Note that (A nor A) == ~(A|A) == ~A */
+         spe_nor(f, fragRGBA_reg, fbRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_XOR: /* s ^ d */
+         spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_NAND: /* ~(s & d) */
+         spe_nand(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_AND: /* s & d */
+         spe_and(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_EQUIV: /* ~(s ^ d) */
+         spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         spe_complement(f, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_NOOP: /* d */
+         spe_move(f, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_OR_INVERTED: /* ~s | d */
+         /* orc R, A, B computes R = A | ~B */
+         spe_orc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
+         break;
+      case PIPE_LOGICOP_COPY: /* s */
+         break;
+      case PIPE_LOGICOP_OR_REVERSE: /* s | ~d */
+         /* orc R, A, B computes R = A | ~B */
+         spe_orc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_OR: /* s | d */
+         spe_or(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
+         break;
+      case PIPE_LOGICOP_SET: /* 1 */
+         spe_load_int(f, fragRGBA_reg, 0xffffffff);
+         break;
+      default:
+         ASSERT(0);
+   }
 }
 
 
@@ -912,11 +973,81 @@ gen_colormask(uint colormask,
               struct spe_function *f,
               int fragRGBA_reg, int fbRGBA_reg)
 {
-   /* XXX to-do */
-   /* operate on 32-bit packed pixels, not float colors */
-}
+   /* We've got four 32-bit RGBA packed pixels in each of
+    * fragRGBA_reg and fbRGBA_reg, not sets of floating-point
+    * reds, greens, blues, and alphas.
+    * */
+
+   /* The color mask operation can prevent any set of color
+    * components in the incoming fragment from being written to the frame 
+    * buffer; we do this by replacing the masked components of the 
+    * fragment with the frame buffer values.
+    *
+    * There are only 16 possibilities, with a unique mask for
+    * each of the possibilities.  (Technically, there are only 15
+    * possibilities, since we shouldn't be called for the one mask
+    * that does nothing, but the complete implementation is here
+    * anyway to avoid confusion.)
+    *
+    * We implement this via a constant static array which we'll index 
+    * into to get the correct mask.
+    * 
+    * We're dependent on the mask values being low-order bits,
+    * with particular values for each bit; so we start with a
+    * few assertions, which will fail if any of the values were
+    * to change.
+    */
+   ASSERT(PIPE_MASK_R == 0x1);
+   ASSERT(PIPE_MASK_G == 0x2);
+   ASSERT(PIPE_MASK_B == 0x4);
+   ASSERT(PIPE_MASK_A == 0x8);
 
+   /* Here's the list of all possible colormasks, indexed by the
+    * value of the combined mask specifier.
+    */
+   static const unsigned int colormasks[16] = {
+      0x00000000, /* 0: all colors masked */
+      0xff000000, /* 1: PIPE_MASK_R */
+      0x00ff0000, /* 2: PIPE_MASK_G */
+      0xffff0000, /* 3: PIPE_MASK_R | PIPE_MASK_G */
+      0x0000ff00, /* 4: PIPE_MASK_B */
+      0xff00ff00, /* 5: PIPE_MASK_R | PIPE_MASK_B */
+      0x00ffff00, /* 6: PIPE_MASK_G | PIPE_MASK_B */
+      0xffffff00, /* 7: PIPE_MASK_R | PIPE_MASK_G | PIPE_MASK_B */
+      0x000000ff, /* 8: PIPE_MASK_A */
+      0xff0000ff, /* 9: PIPE_MASK_R | PIPE_MASK_A */
+      0x00ff00ff, /* 10: PIPE_MASK_G | PIPE_MASK_A */
+      0xffff00ff, /* 11: PIPE_MASK_R | PIPE_MASK_G | PIPE_MASK_A */
+      0x0000ffff, /* 12: PIPE_MASK_B | PIPE_MASK_A */
+      0xff00ffff, /* 13: PIPE_MASK_R | PIPE_MASK_B | PIPE_MASK_A */
+      0x00ffffff, /* 14: PIPE_MASK_G | PIPE_MASK_B | PIPE_MASK_A */
+      0xffffffff  /* 15: PIPE_MASK_R | PIPE_MASK_G | PIPE_MASK_B | PIPE_MASK_A */
+   };
+
+   /* Get a temporary register to hold the mask */
+   int colormask_reg = spe_allocate_available_register(f);
+
+   /* Look up the desired mask directly and load it into the mask register.
+    * This will load the same mask into each of the four words in the
+    * mask register.
+    */
+   spe_load_uint(f, colormask_reg, colormasks[colormask]);
+
+   /* Use the mask register to select between the fragment color
+    * values and the frame buffer color values.  Wherever the
+    * mask has a 0 bit, the current frame buffer color should override
+    * the fragment color.  Wherever the mask has a 1 bit, the 
+    * fragment color should persevere.  The Select Bits (selb rt, rA, rB, rM)
+    * instruction will select bits from its first operand rA wherever the
+    * the mask bits rM are 0, and from its second operand rB wherever the
+    * mask bits rM are 1.  That means that the frame buffer color is the
+    * first operand, and the fragment color the second.
+    */
+    spe_selb(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg, colormask_reg);
 
+    /* Release the temporary register and we're done */
+    spe_release_register(f, colormask_reg);
+}
 
 /**
  * Generate code to pack a quad of float colors into a four 32-bit integers.
@@ -1223,7 +1354,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
          gen_logicop(blend, f, rgba_reg, fbRGBA_reg);
       }
 
-      if (blend->colormask != 0xf) {
+      if (blend->colormask != PIPE_MASK_RGBA) {
          gen_colormask(blend->colormask, f, rgba_reg, fbRGBA_reg);
       }
 
-- 
cgit v1.2.3


From 0838b702750d85b0284a97be211fa379e9f8d8d8 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 19 Sep 2008 09:36:29 -0600
Subject: cell: change spe_complement() to take a src and dst reg, like other
 instructions

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c      | 14 ++++++++------
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h      |  4 ++--
 src/gallium/drivers/cell/ppu/cell_gen_fp.c       |  4 ++--
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c |  4 ++--
 4 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index f60bfba3f5..85280f680a 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -623,9 +623,9 @@ spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
 
 
 void
-spe_complement(struct spe_function *p, unsigned rT)
+spe_complement(struct spe_function *p, unsigned rT, unsigned rA)
 {
-   spe_nor(p, rT, rT, rT);
+   spe_nor(p, rT, rA, rA);
 }
 
 
@@ -667,7 +667,8 @@ spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word)
    }
 }
 
-/* For each 32-bit float element of rA and rB, choose the smaller of the
+/**
+ * For each 32-bit float element of rA and rB, choose the smaller of the
  * two, compositing them into the rT register.
  * 
  * The Float Compare Greater Than (fcgt) instruction will put 1s into
@@ -683,7 +684,7 @@ spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word)
  * like "x = min(x, a)", we always allocate a new register to be safe.
  */
 void 
-spe_float_min(struct spe_function *p, unsigned int rT, unsigned int rA, unsigned int rB)
+spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB)
 {
    unsigned int compare_reg = spe_allocate_available_register(p);
    spe_fcgt(p, compare_reg, rA, rB);
@@ -691,7 +692,8 @@ spe_float_min(struct spe_function *p, unsigned int rT, unsigned int rA, unsigned
    spe_release_register(p, compare_reg);
 }
 
-/* For each 32-bit float element of rA and rB, choose the greater of the
+/**
+ * For each 32-bit float element of rA and rB, choose the greater of the
  * two, compositing them into the rT register.
  * 
  * The logic is similar to that of spe_float_min() above; the only
@@ -699,7 +701,7 @@ spe_float_min(struct spe_function *p, unsigned int rT, unsigned int rA, unsigned
  * so that the larger of the two is selected instead of the smaller.
  */
 void 
-spe_float_max(struct spe_function *p, unsigned int rT, unsigned int rA, unsigned int rB)
+spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB)
 {
    unsigned int compare_reg = spe_allocate_available_register(p);
    spe_fcgt(p, compare_reg, rA, rB);
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 09400b3fb2..8a0d70fdac 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -310,9 +310,9 @@ spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui);
 extern void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA);
 
-/** Complement/invert all bits in rT. */
+/** rT = complement_all_bits(rA). */
 extern void
-spe_complement(struct spe_function *p, unsigned rT);
+spe_complement(struct spe_function *p, unsigned rT, unsigned rA);
 
 /** rT = rA. */
 extern void
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 6f2b89c695..d835aae255 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -924,7 +924,7 @@ emit_IF(struct codegen *gen, const struct tgsi_full_instruction *inst)
    /* tmp = (s1_reg == 0) */
    spe_ceqi(gen->f, tmp_reg, s1_reg, 0);
    /* tmp = !tmp */
-   spe_complement(gen->f, tmp_reg);
+   spe_complement(gen->f, tmp_reg, tmp_reg);
    /* exec_mask = exec_mask & tmp */
    spe_and(gen->f, exec_reg, exec_reg, tmp_reg);
 
@@ -944,7 +944,7 @@ emit_ELSE(struct codegen *gen, const struct tgsi_full_instruction *inst)
    spe_comment(gen->f, -4, "ELSE:");
 
    /* exec_mask = !exec_mask */
-   spe_complement(gen->f, exec_reg);
+   spe_complement(gen->f, exec_reg, exec_reg);
 
    return true;
 }
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 899d8423b2..06a9fa102f 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -920,7 +920,7 @@ gen_logicop(const struct pipe_blend_state *blend,
          spe_andc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg);
          break;
       case PIPE_LOGICOP_COPY_INVERTED: /* ~s */
-         spe_complement(f, fragRGBA_reg);
+         spe_complement(f, fragRGBA_reg, fragRGBA_reg);
          break;
       case PIPE_LOGICOP_AND_REVERSE: /* s & ~d */
          /* andc R, A, B computes R = A & ~B */
@@ -941,7 +941,7 @@ gen_logicop(const struct pipe_blend_state *blend,
          break;
       case PIPE_LOGICOP_EQUIV: /* ~(s ^ d) */
          spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg);
-         spe_complement(f, fragRGBA_reg);
+         spe_complement(f, fragRGBA_reg, fragRGBA_reg);
          break;
       case PIPE_LOGICOP_NOOP: /* d */
          spe_move(f, fragRGBA_reg, fbRGBA_reg);
-- 
cgit v1.2.3


From 7af5f944e5709920623c766bc572f8d587709270 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 19 Sep 2008 17:45:51 -0600
Subject: gallium: added spe_code_size()

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 7 +++++++
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h | 1 +
 2 files changed, 8 insertions(+)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 85280f680a..1c3e21b4c0 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -383,6 +383,13 @@ void spe_release_func(struct spe_function *p)
 }
 
 
+/** Return current code size in bytes. */
+unsigned spe_code_size(const struct spe_function *p)
+{
+   return p->num_inst * SPE_INST_SIZE;
+}
+
+
 /**
  * Allocate a SPE register.
  * \return register index or -1 if none left.
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 8a0d70fdac..4165a971a2 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -72,6 +72,7 @@ struct spe_function
 
 extern void spe_init_func(struct spe_function *p, unsigned code_size);
 extern void spe_release_func(struct spe_function *p);
+extern unsigned spe_code_size(const struct spe_function *p);
 
 extern int spe_allocate_available_register(struct spe_function *p);
 extern int spe_allocate_register(struct spe_function *p, int reg);
-- 
cgit v1.2.3


From 99cdfc997b9da10fee57cf1048a55354e1ee4244 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 19 Sep 2008 17:55:54 -0600
Subject: cell: use different opcodes for spe_move() depending on even/odd
 address

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 1c3e21b4c0..491141f190 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -639,7 +639,13 @@ spe_complement(struct spe_function *p, unsigned rT, unsigned rA)
 void
 spe_move(struct spe_function *p, unsigned rT, unsigned rA)
 {
-   spe_ori(p, rT, rA, 0);
+   /* Use different instructions depending on the instruction address
+    * to take advantage of the dual pipelines.
+    */
+   if (p->num_inst & 1)
+      spe_shlqbyi(p, rT, rA, 0);  /* odd pipe */
+   else
+      spe_ori(p, rT, rA, 0);  /* even pipe */
 }
 
 
-- 
cgit v1.2.3


From 938e12c1caee7e34fcc6630f17f422ebdd824ec3 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 26 Sep 2008 17:06:22 -0600
Subject: gallium: SPU register comments

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 4165a971a2..61c7edeb60 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -40,10 +40,10 @@
 /** number of general-purpose SIMD registers */
 #define SPE_NUM_REGS  128
 
-/** Return Address register */
+/** Return Address register (aka $lr / Link Register) */
 #define SPE_REG_RA  0
 
-/** Stack Pointer register */
+/** Stack Pointer register (aka $sp) */
 #define SPE_REG_SP  1
 
 
-- 
cgit v1.2.3


From 6607f2cf19d083a979716a341e6e175aef7d6830 Mon Sep 17 00:00:00 2001
From: José Fonseca <jrfonseca@tungstengraphics.com>
Date: Mon, 29 Sep 2008 19:09:39 +0900
Subject: rtasm: Implement immediate group 1 instructions. Fix SIB emission.

---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c | 66 +++++++++++++++++++++++++-----
 src/gallium/auxiliary/rtasm/rtasm_x86sse.h | 11 ++---
 2 files changed, 62 insertions(+), 15 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index 6d4c081e04..3bba9dcc07 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -240,7 +240,8 @@ static void emit_modrm( struct x86_function *p,
    /* Oh-oh we've stumbled into the SIB thing.
     */
    if (regmem.file == file_REG32 &&
-       regmem.idx == reg_SP) {
+       regmem.idx == reg_SP &&
+       regmem.mod != mod_REG) {
       emit_1ub(p, 0x24);		/* simplistic! */
    }
 
@@ -435,25 +436,70 @@ void x86_call( struct x86_function *p, struct x86_reg reg)
 }
 
 
-/* michal:
- * Temporary. As I need immediate operands, and dont want to mess with the codegen,
- * I load the immediate into general purpose register and use it.
- */
 void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm )
 {
    DUMP_RI( dst, imm );
+   assert(dst.file == file_REG32);
    assert(dst.mod == mod_REG);
    emit_1ub(p, 0xb8 + dst.idx);
    emit_1i(p, imm);
 }
 
-void x86_add_reg_imm8( struct x86_function *p, struct x86_reg dst, ubyte imm )
+/**
+ * Immediate group 1 instructions.
+ */
+static INLINE void 
+x86_group1_imm( struct x86_function *p, 
+                unsigned op, struct x86_reg dst, int imm )
 {
-   DUMP_RI( dst, imm );
+   assert(dst.file == file_REG32);
    assert(dst.mod == mod_REG);
-   emit_1ub(p, 0x80);
-   emit_modrm_noreg(p, 0, dst);
-   emit_1ub(p, imm);
+   if(-0x80 <= imm && imm < 0x80) {
+      emit_1ub(p, 0x83);
+      emit_modrm_noreg(p, op, dst);
+      emit_1b(p, (char)imm);
+   }
+   else {
+      emit_1ub(p, 0x81);
+      emit_modrm_noreg(p, op, dst);
+      emit_1i(p, imm);
+   }
+}
+
+void x86_add_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+   DUMP_RI( dst, imm );
+   x86_group1_imm(p, 0, dst, imm);
+}
+
+void x86_or_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+   DUMP_RI( dst, imm );
+   x86_group1_imm(p, 1, dst, imm);
+}
+
+void x86_and_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+   DUMP_RI( dst, imm );
+   x86_group1_imm(p, 4, dst, imm);
+}
+
+void x86_sub_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+   DUMP_RI( dst, imm );
+   x86_group1_imm(p, 5, dst, imm);
+}
+
+void x86_xor_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+   DUMP_RI( dst, imm );
+   x86_group1_imm(p, 6, dst, imm);
+}
+
+void x86_cmp_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+   DUMP_RI( dst, imm );
+   x86_group1_imm(p, 7, dst, imm);
 }
 
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index af94577aab..510aa1b0de 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -152,12 +152,13 @@ void x86_jmp( struct x86_function *p, int label );
 /* void x86_call( struct x86_function *p, void (*label)() ); */
 void x86_call( struct x86_function *p, struct x86_reg reg);
 
-/* michal:
- * Temporary. As I need immediate operands, and dont want to mess with the codegen,
- * I load the immediate into general purpose register and use it.
- */
 void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm );
-void x86_add_reg_imm8( struct x86_function *p, struct x86_reg dst, ubyte imm );
+void x86_add_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_or_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_and_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_sub_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_xor_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_cmp_imm( struct x86_function *p, struct x86_reg dst, int imm );
 
 
 /* Macro for sse_shufps() and sse2_pshufd():
-- 
cgit v1.2.3


From 102daee1b8971cf39235e220b9524bec1e4a7089 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Thu, 2 Oct 2008 12:46:01 +0100
Subject: rtasm: add prefetch instructions

---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c | 26 ++++++++++++++++++++++++++
 src/gallium/auxiliary/rtasm/rtasm_x86sse.h |  5 +++++
 2 files changed, 31 insertions(+)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index 6d4c081e04..9085f4cc0e 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -629,6 +629,32 @@ void x86_and( struct x86_function *p,
  * SSE instructions
  */
 
+void sse_prefetchnta( struct x86_function *p, struct x86_reg ptr)
+{
+   DUMP_R( ptr );
+   assert(ptr.mod != mod_REG);
+   emit_2ub(p, 0x0f, 0x18);
+   emit_modrm_noreg(p, 0, ptr);
+}
+
+void sse_prefetch0( struct x86_function *p, struct x86_reg ptr)
+{
+   DUMP_R( ptr );
+   assert(ptr.mod != mod_REG);
+   emit_2ub(p, 0x0f, 0x18);
+   emit_modrm_noreg(p, 1, ptr);
+}
+
+void sse_prefetch1( struct x86_function *p, struct x86_reg ptr)
+{
+   DUMP_R( ptr );
+   assert(ptr.mod != mod_REG);
+   emit_2ub(p, 0x0f, 0x18);
+   emit_modrm_noreg(p, 2, ptr);
+}
+
+
+
 
 void sse_movss( struct x86_function *p,
 		struct x86_reg dst,
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index af94577aab..2d7715f965 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -184,6 +184,11 @@ void sse2_pshufd( struct x86_function *p, struct x86_reg dest, struct x86_reg ar
 void sse2_rcpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_rcpss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 
+
+void sse_prefetchnta( struct x86_function *p, struct x86_reg ptr);
+void sse_prefetch0( struct x86_function *p, struct x86_reg ptr);
+void sse_prefetch1( struct x86_function *p, struct x86_reg ptr);
+
 void sse_addps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_addss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_cvtps2pi( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-- 
cgit v1.2.3


From 66d4beb874606baab95fb6539de895eb373b0ccb Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Thu, 2 Oct 2008 12:46:01 +0100
Subject: rtasm: add prefetch instructions

---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c | 26 ++++++++++++++++++++++++++
 src/gallium/auxiliary/rtasm/rtasm_x86sse.h |  5 +++++
 2 files changed, 31 insertions(+)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index 3bba9dcc07..a5abbcde49 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -675,6 +675,32 @@ void x86_and( struct x86_function *p,
  * SSE instructions
  */
 
+void sse_prefetchnta( struct x86_function *p, struct x86_reg ptr)
+{
+   DUMP_R( ptr );
+   assert(ptr.mod != mod_REG);
+   emit_2ub(p, 0x0f, 0x18);
+   emit_modrm_noreg(p, 0, ptr);
+}
+
+void sse_prefetch0( struct x86_function *p, struct x86_reg ptr)
+{
+   DUMP_R( ptr );
+   assert(ptr.mod != mod_REG);
+   emit_2ub(p, 0x0f, 0x18);
+   emit_modrm_noreg(p, 1, ptr);
+}
+
+void sse_prefetch1( struct x86_function *p, struct x86_reg ptr)
+{
+   DUMP_R( ptr );
+   assert(ptr.mod != mod_REG);
+   emit_2ub(p, 0x0f, 0x18);
+   emit_modrm_noreg(p, 2, ptr);
+}
+
+
+
 
 void sse_movss( struct x86_function *p,
 		struct x86_reg dst,
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index 510aa1b0de..86091e7f6b 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -185,6 +185,11 @@ void sse2_pshufd( struct x86_function *p, struct x86_reg dest, struct x86_reg ar
 void sse2_rcpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_rcpss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 
+
+void sse_prefetchnta( struct x86_function *p, struct x86_reg ptr);
+void sse_prefetch0( struct x86_function *p, struct x86_reg ptr);
+void sse_prefetch1( struct x86_function *p, struct x86_reg ptr);
+
 void sse_addps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_addss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_cvtps2pi( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-- 
cgit v1.2.3


From 6965532e14717f71a6f4353fb683c5070c6b7d7a Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Fri, 3 Oct 2008 13:50:34 +0100
Subject: rtasm: add sse_movntps

---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c | 12 ++++++++++++
 src/gallium/auxiliary/rtasm/rtasm_x86sse.h |  2 ++
 2 files changed, 14 insertions(+)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index 9085f4cc0e..cc5871f873 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -653,6 +653,18 @@ void sse_prefetch1( struct x86_function *p, struct x86_reg ptr)
    emit_modrm_noreg(p, 2, ptr);
 }
 
+void sse_movntps( struct x86_function *p, 
+                  struct x86_reg dst,
+                  struct x86_reg src)
+{
+   DUMP_RR( dst, reg );
+
+   assert(dst.mod != mod_REG);
+   assert(src.mod == mod_REG);
+   emit_2ub(p, 0x0f, 0x2b);
+   emit_modrm(p, src, dst);
+}
+
 
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index 2d7715f965..af79f07dd3 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -189,6 +189,8 @@ void sse_prefetchnta( struct x86_function *p, struct x86_reg ptr);
 void sse_prefetch0( struct x86_function *p, struct x86_reg ptr);
 void sse_prefetch1( struct x86_function *p, struct x86_reg ptr);
 
+void sse_movntps( struct x86_function *p, struct x86_reg dst, struct x86_reg src);
+
 void sse_addps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_addss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_cvtps2pi( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
-- 
cgit v1.2.3


From afaa53040bd01ca86762e7d7b1a5a65810767921 Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Fri, 3 Oct 2008 18:00:43 -0600
Subject: CELL: changes to generate SPU code for stenciling

This set of code changes is for stencil code generation
support.  Both one-sided and two-sided stenciling are supported.
In addition to the raw code generation changes, these changes had
to be made elsewhere in the system:

- Added new "register set" feature to the SPE assembly generation.
  A "register set" is a way to allocate multiple registers and free
  them all at the same time, delegating register allocation management
  to the spe_function unit.  It's quite useful in complex register
  allocation schemes (like stenciling).

- Added and improved SPE macro calculations.
  These are operations between registers and unsigned integer
  immediates.  In many cases, the calculation can be performed
  with a single instruction; the macros will generate the
  single instruction if possible, or generate a register load
  and register-to-register operation if not.  These macro
  functions are: spe_load_uint() (which has new ways to
  load a value in a single instruction), spe_and_uint(),
  spe_xor_uint(), spe_compare_equal_uint(), and spe_compare_greater_uint().

- Added facing to fragment generation.  While rendering, the rasterizer
  needs to be able to determine front- and back-facing fragments, in order
  to correctly apply two-sided stencil.  That requires these changes:
  - Added front_winding field to the cell_command_render block, so that
    the state tracker could communicate to the rasterizer what it
    considered to be the front-facing direction.
  - Added fragment facing as an input to the fragment function.
  - Calculated facing is passed during emit_quad().
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c        | 246 +++++-
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h        |  41 +-
 src/gallium/drivers/cell/common.h                  |   1 +
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c   | 881 ++++++++++++++++++---
 src/gallium/drivers/cell/ppu/cell_render.c         |   1 +
 src/gallium/drivers/cell/ppu/cell_vbuf.c           |   1 +
 src/gallium/drivers/cell/spu/spu_main.h            |   3 +-
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c |  19 +-
 src/gallium/drivers/cell/spu/spu_per_fragment_op.h |   3 +-
 src/gallium/drivers/cell/spu/spu_render.c          |   4 +-
 src/gallium/drivers/cell/spu/spu_tri.c             |  35 +-
 src/gallium/drivers/cell/spu/spu_tri.h             |   2 +-
 12 files changed, 1091 insertions(+), 146 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 491141f190..8a87e9abb1 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -359,14 +359,21 @@ void _name (struct spe_function *p, int imm) \
  */
 void spe_init_func(struct spe_function *p, unsigned code_size)
 {
+    register unsigned int i;
+
     p->store = align_malloc(code_size, 16);
     p->num_inst = 0;
     p->max_inst = code_size / SPE_INST_SIZE;
 
+    p->set_count = 0;
+    memset(p->regs, 0, SPE_NUM_REGS * sizeof(p->regs[0]));
+
     /* Conservatively treat R0 - R2 and R80 - R127 as non-volatile.
      */
-    p->regs[0] = ~7;
-    p->regs[1] = (1U << (80 - 64)) - 1;
+    p->regs[0] = p->regs[1] = p->regs[2] = 1;
+    for (i = 80; i <= 127; i++) {
+      p->regs[i] = 1;
+    }
 
     p->print = false;
     p->indent = 0;
@@ -398,12 +405,8 @@ int spe_allocate_available_register(struct spe_function *p)
 {
    unsigned i;
    for (i = 0; i < SPE_NUM_REGS; i++) {
-      const uint64_t mask = (1ULL << (i % 64));
-      const unsigned idx = i / 64;
-
-      assert(idx < 2);
-      if ((p->regs[idx] & mask) != 0) {
-         p->regs[idx] &= ~mask;
+      if (p->regs[i] == 0) {
+         p->regs[i] = 1;
          return i;
       }
    }
@@ -417,31 +420,68 @@ int spe_allocate_available_register(struct spe_function *p)
  */
 int spe_allocate_register(struct spe_function *p, int reg)
 {
-   const unsigned idx = reg / 64;
-   const unsigned bit = reg % 64;
-
    assert(reg < SPE_NUM_REGS);
-   assert((p->regs[idx] & (1ULL << bit)) != 0);
-
-   p->regs[idx] &= ~(1ULL << bit);
+   assert(p->regs[reg] == 0);
+   p->regs[reg] = 1;
    return reg;
 }
 
 
 /**
- * Mark the given SPE register as "unallocated".
+ * Mark the given SPE register as "unallocated".  Note that this should
+ * only be used on registers allocated in the current register set; an
+ * assertion will fail if an attempt is made to deallocate a register
+ * allocated in an earlier register set.
  */
 void spe_release_register(struct spe_function *p, int reg)
 {
-   const unsigned idx = reg / 64;
-   const unsigned bit = reg % 64;
+   assert(reg < SPE_NUM_REGS);
+   assert(p->regs[reg] == 1);
 
-   assert(idx < 2);
+   p->regs[reg] = 0;
+}
 
-   assert(reg < SPE_NUM_REGS);
-   assert((p->regs[idx] & (1ULL << bit)) == 0);
+/**
+ * Start a new set of registers.  This can be called if
+ * it will be difficult later to determine exactly what
+ * registers were actually allocated during a code generation
+ * sequence, and you really just want to deallocate all of them.
+ */
+void spe_allocate_register_set(struct spe_function *p)
+{
+   register unsigned int i;
+
+   /* Keep track of the set count.  If it ever wraps around to 0, 
+    * we're in trouble.
+    */
+   p->set_count++;
+   assert(p->set_count > 0);
+
+   /* Increment the allocation count of all registers currently
+    * allocated.  Then any registers that are allocated in this set
+    * will be the only ones with a count of 1; they'll all be released
+    * when the register set is released.
+    */
+   for (i = 0; i < SPE_NUM_REGS; i++) {
+      if (p->regs[i] > 0) p->regs[i]++;
+   }
+}
+
+void spe_release_register_set(struct spe_function *p)
+{
+   unsigned int i;
+
+   /* If the set count drops below zero, we're in trouble. */
+   assert(p->set_count > 0);
+   p->set_count--;
 
-   p->regs[idx] |= (1ULL << bit);
+   /* Drop the allocation level of all registers.  Any allocated
+    * during this register set will drop to 0 and then become
+    * available.
+    */
+   for (i = 0; i < SPE_NUM_REGS; i++) {
+      if (p->regs[i] > 0) p->regs[i]--;
+   }
 }
 
 
@@ -603,8 +643,10 @@ void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
 {
    /* If the whole value is in the lower 18 bits, use ila, which
     * doesn't sign-extend.  Otherwise, if the two halfwords of
-    * the constant are identical, use ilh.  Otherwise, we have
-    * to use ilhu followed by iohl.
+    * the constant are identical, use ilh.  Otherwise, if every byte of
+    * the desired value is 0x00 or 0xff, we can use Form Select Mask for
+    * Bytes Immediate (fsmbi) to load the value in a single instruction.
+    * Otherwise, in the general case, we have to use ilhu followed by iohl.
     */
    if ((ui & 0xfffc0000) == ui) {
       spe_ila(p, rT, ui);
@@ -612,13 +654,171 @@ void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
    else if ((ui >> 16) == (ui & 0xffff)) {
       spe_ilh(p, rT, ui & 0xffff);
    }
+   else if (
+      ((ui & 0x000000ff) == 0 || (ui & 0x000000ff) == 0x000000ff) &&
+      ((ui & 0x0000ff00) == 0 || (ui & 0x0000ff00) == 0x0000ff00) &&
+      ((ui & 0x00ff0000) == 0 || (ui & 0x00ff0000) == 0x00ff0000) &&
+      ((ui & 0xff000000) == 0 || (ui & 0xff000000) == 0xff000000)
+   ) {
+      unsigned int mask = 0;
+      /* fsmbi duplicates each bit in the given mask eight times,
+       * using a 16-bit value to initialize a 16-byte quadword.
+       * Each 4-bit nybble of the mask corresponds to a full word
+       * of the result; look at the value and figure out the mask
+       * (replicated for each word in the quadword), and then
+       * form the "select mask" to get the value.
+       */
+      if ((ui & 0x000000ff) == 0x000000ff) mask |= 0x1111;
+      if ((ui & 0x0000ff00) == 0x0000ff00) mask |= 0x2222;
+      if ((ui & 0x00ff0000) == 0x00ff0000) mask |= 0x4444;
+      if ((ui & 0xff000000) == 0xff000000) mask |= 0x8888;
+      spe_fsmbi(p, rT, mask);
+   }
    else {
+      /* The general case: this usually uses two instructions, but
+       * may use only one if the low-order 16 bits of each word are 0.
+       */
       spe_ilhu(p, rT, ui >> 16);
       if (ui & 0xffff)
          spe_iohl(p, rT, ui & 0xffff);
    }
 }
 
+/* This function is constructed identically to spe_xor_uint() below.
+ * Changes to one should be made in the other.
+ */
+void spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If we can, emit a single instruction, either And Byte Immediate
+    * (which uses the same constant across each byte), And Halfword Immediate
+    * (which sign-extends a 10-bit immediate to 16 bits and uses that
+    * across each halfword), or And Word Immediate (which sign-extends
+    * a 10-bit immediate to 32 bits).
+    *
+    * Otherwise, we'll need to use a temporary register.
+    */
+   register unsigned int tmp;
+
+   /* If the upper 23 bits are all 0s or all 1s, sign extension
+    * will work and we can use And Word Immediate
+    */
+   tmp = ui & 0xfffffe00;
+   if (tmp == 0xfffffe00 || tmp  == 0) {
+      spe_andi(p, rT, rA, ui & 0x000003ff);
+      return;
+   }
+   
+   /* If the ui field is symmetric along halfword boundaries and
+    * the upper 7 bits of each halfword are all 0s or 1s, we
+    * can use And Halfword Immediate
+    */
+   tmp = ui & 0xfe00fe00;
+   if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) {
+      spe_andhi(p, rT, rA, ui & 0x000003ff);
+      return;
+   }
+
+   /* If the ui field is symmetric in each byte, then we can use
+    * the And Byte Immediate instruction.
+    */
+   tmp = ui & 0x000000ff;
+   if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) {
+      spe_andbi(p, rT, rA, tmp);
+      return;
+   }
+
+   /* Otherwise, we'll have to use a temporary register. */
+   unsigned int tmp_reg = spe_allocate_available_register(p);
+   spe_load_uint(p, tmp_reg, ui);
+   spe_and(p, rT, rA, tmp_reg);
+   spe_release_register(p, tmp_reg);
+}
+
+/* This function is constructed identically to spe_and_uint() above.
+ * Changes to one should be made in the other.
+ */
+void spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If we can, emit a single instruction, either Exclusive Or Byte 
+    * Immediate (which uses the same constant across each byte), Exclusive 
+    * Or Halfword Immediate (which sign-extends a 10-bit immediate to 
+    * 16 bits and uses that across each halfword), or Exclusive Or Word 
+    * Immediate (which sign-extends a 10-bit immediate to 32 bits).
+    *
+    * Otherwise, we'll need to use a temporary register.
+    */
+   register unsigned int tmp;
+
+   /* If the upper 23 bits are all 0s or all 1s, sign extension
+    * will work and we can use Exclusive Or Word Immediate
+    */
+   tmp = ui & 0xfffffe00;
+   if (tmp == 0xfffffe00 || tmp  == 0) {
+      spe_xori(p, rT, rA, ui & 0x000003ff);
+      return;
+   }
+   
+   /* If the ui field is symmetric along halfword boundaries and
+    * the upper 7 bits of each halfword are all 0s or 1s, we
+    * can use Exclusive Or Halfword Immediate
+    */
+   tmp = ui & 0xfe00fe00;
+   if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) {
+      spe_xorhi(p, rT, rA, ui & 0x000003ff);
+      return;
+   }
+
+   /* If the ui field is symmetric in each byte, then we can use
+    * the Exclusive Or Byte Immediate instruction.
+    */
+   tmp = ui & 0x000000ff;
+   if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) {
+      spe_xorbi(p, rT, rA, tmp);
+      return;
+   }
+
+   /* Otherwise, we'll have to use a temporary register. */
+   unsigned int tmp_reg = spe_allocate_available_register(p);
+   spe_load_uint(p, tmp_reg, ui);
+   spe_xor(p, rT, rA, tmp_reg);
+   spe_release_register(p, tmp_reg);
+}
+
+void
+spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If the comparison value is 9 bits or less, it fits inside a
+    * Compare Equal Word Immediate instruction.
+    */
+   if ((ui & 0x000001ff) == ui) {
+      spe_ceqi(p, rT, rA, ui);
+   }
+   /* Otherwise, we're going to have to load a word first. */
+   else {
+      unsigned int tmp_reg = spe_allocate_available_register(p);
+      spe_load_uint(p, tmp_reg, ui);
+      spe_ceq(p, rT, rA, tmp_reg);
+      spe_release_register(p, tmp_reg);
+   }
+}
+
+void
+spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* clgti sign-extends its 10-bit immediate before the unsigned compare,
+    * so only values of 9 bits or less (sign bit clear) are encoded directly.
+    */
+   if ((ui & 0x000001ff) == ui) {
+      spe_clgti(p, rT, rA, ui);
+   }
+   /* Otherwise, we're going to have to load a word first. */
+   else {
+      unsigned int tmp_reg = spe_allocate_available_register(p);
+      spe_load_uint(p, tmp_reg, ui);
+      spe_clgt(p, rT, rA, tmp_reg);
+      spe_release_register(p, tmp_reg);
+   }
+}
 
 void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 61c7edeb60..cd2e245409 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -53,17 +53,26 @@ struct spe_function
    uint num_inst;
    uint max_inst;
 
-    /**
-     * Mask of used / unused registers
-     *
-     * Each set bit corresponds to an available register.  Each cleared bit
-     * corresponds to an allocated register.
+   /**
+    * The "set count" is the number of register sets currently nested
+    * (open).  In the unlikely case that the count wraps around,
+    * register allocation will start to be confused, which is critical
+    * enough that we check for it.
+    */
+   unsigned char set_count;
+
+   /** 
+    * Flags for used and unused registers.  Each byte corresponds to a
+    * register; a 0 in that byte means that the register is available.
+    * A value of 1 means that the register was allocated in the current
+    * register set.  Any other value N means that the register was allocated
+    * N-1 register sets ago.
      *
      * \sa
      * spe_allocate_register, spe_allocate_available_register,
-     * spe_release_register
+     * spe_allocate_register_set, spe_release_register_set, spe_release_register, 
      */
-    uint64_t regs[SPE_NUM_REGS / 64];
+    unsigned char regs[SPE_NUM_REGS];
 
     boolean print; /**< print/dump instructions as they're emitted? */
     int indent;    /**< number of spaces to indent */
@@ -77,6 +86,8 @@ extern unsigned spe_code_size(const struct spe_function *p);
 extern int spe_allocate_available_register(struct spe_function *p);
 extern int spe_allocate_register(struct spe_function *p, int reg);
 extern void spe_release_register(struct spe_function *p, int reg);
+extern void spe_allocate_register_set(struct spe_function *p);
+extern void spe_release_register_set(struct spe_function *p);
 
 extern void spe_print_code(struct spe_function *p, boolean enable);
 extern void spe_indent(struct spe_function *p, int spaces);
@@ -307,6 +318,22 @@ spe_load_int(struct spe_function *p, unsigned rT, int i);
 extern void
 spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui);
 
+/** And immediate value into rT. */
+extern void
+spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Xor immediate value into rT. */
+extern void
+spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Compare equal with immediate value. */
+extern void
+spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Compare greater with immediate value. */
+extern void
+spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
 /** Replicate word 0 of rA across rT. */
 extern void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA);
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 99329fd8e2..c223bc1744 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -227,6 +227,7 @@ struct cell_command_render
    float xmin, ymin, xmax, ymax;  /* XXX another dummy field */
    uint min_index;
    boolean inline_verts;
+   uint front_winding; /* the rasterizer needs to be able to determine facing to apply front/back-facing stencil */
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 653afc235d..f920ae13b4 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -54,10 +54,12 @@
  * \param ifragZ_reg  register containing integer fragment Z values (in)
  * \param ifbZ_reg    register containing integer frame buffer Z values (in/out)
  * \param zmask_reg   register containing result of Z test/comparison (out)
+ *
+ * Returns true if the Z-buffer needs to be updated.
  */
-static void
-gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
-               struct spe_function *f,
+static boolean
+gen_depth_test(struct spe_function *f,
+               const struct pipe_depth_stencil_alpha_state *dsa,
                int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg)
 {
    /* NOTE: we use clgt below, not cgt, because we want to compare _unsigned_
@@ -132,7 +134,10 @@ gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
        * framebufferZ = (ztest_passed ? fragmentZ : framebufferZ;
        */
       spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg);
+      return true;
    }
+
+   return false;
 }
 
 
@@ -238,22 +243,34 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
  * it and have to allocate and load it again unnecessarily.
  */
 static inline void
-setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value)
+setup_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int *r)
 {
    if (*is_already_set) return;
    *r = spe_allocate_available_register(f);
-   spe_load_float(f, *r, value);
-   *is_already_set = true;
 }
 
 static inline void
-release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+release_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
 {
     if (!*is_already_set) return;
     spe_release_register(f, r);
     *is_already_set = false;
 }
 
+static inline void
+setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value)
+{
+   if (*is_already_set) return;
+   setup_optional_register(f, is_already_set, r);
+   spe_load_float(f, *r, value);
+}
+
+static inline void
+release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+{
+    release_optional_register(f, is_already_set, r);
+}
+
 /**
  * Generate SPE code to implement the given blend mode for a quad of pixels.
  * \param f          SPE function to append instruction onto.
@@ -1117,6 +1134,633 @@ gen_colormask(struct spe_function *f,
     spe_release_register(f, colormask_reg);
 }
 
+/* Generate SPE code for the stencil comparison of a quad of fragments.
+ * This is similar to gen_depth_test(), above, except that instead of
+ * comparing two varying values (i.e. fragment and buffer), we compare a
+ * varying value (fbS_reg) with a static one (state->ref_value).  As such,
+ * the Compare Immediate instructions can be used where they fit.
+ *
+ * The return value in the stencil_pass_reg is a bitmask of valid
+ * fragments that also passed the stencil test.  The bitmask of valid
+ * fragments that failed would be found in (mask_reg & ~stencil_pass_reg).
+ */
+static void
+gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state,
+                 unsigned int mask_reg, unsigned int fbS_reg,
+                 unsigned int stencil_pass_reg)
+{
+   /* Generate code that puts the set of passing fragments into the stencil_pass_reg
+    * register, taking into account whether each fragment was active to begin with.
+    */
+   switch (state->func) {
+   case PIPE_FUNC_EQUAL:
+      /* stencil_pass = mask & (s == reference) */
+      spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      /* stencil_fail = mask & ~stencil_pass */
+      break;
+
+   case PIPE_FUNC_NOTEQUAL:
+      /* stencil_pass = mask & ~(s == reference) */
+      spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      break;
+
+   case PIPE_FUNC_GREATER:
+      /* stencil_pass = mask & (s > reference) */
+      spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      break;
+
+   case PIPE_FUNC_LESS: {
+      /* stencil_pass = mask & (reference > s) */
+      /* There's no convenient Compare Less Than Immediate instruction, so
+       * we'll have to do this one the harder way, by loading a register and
+       * comparing directly.  Compare Logical Greater Than Word (clgt)
+       * treats its operands as unsigned - no sign extension.
+       */
+      unsigned int tmp_reg = spe_allocate_available_register(f);
+      spe_load_uint(f, tmp_reg, state->ref_value);
+      spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+      spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      spe_release_register(f, tmp_reg);
+      break;
+   }
+
+   case PIPE_FUNC_LEQUAL:
+      /* stencil_pass = mask & (s <= reference) = mask & ~(s > reference) */
+      spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      break;
+
+   case PIPE_FUNC_GEQUAL: {
+      /* stencil_pass = mask & (s >= reference) = mask & ~(reference > s) */
+      /* As above, we have to do this by loading a register */
+      unsigned int tmp_reg = spe_allocate_available_register(f);
+      spe_load_uint(f, tmp_reg, state->ref_value);
+      spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+      spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      spe_release_register(f, tmp_reg);
+      break;
+   }
+
+   case PIPE_FUNC_NEVER:
+      /* stencil_pass = mask & 0 = 0.  Do not copy mask_reg afterwards:
+       * that would overwrite the zero and behave like PIPE_FUNC_ALWAYS. */
+      spe_load_uint(f, stencil_pass_reg, 0);
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      /* stencil_pass = mask & 1 = mask */
+      spe_move(f, stencil_pass_reg, mask_reg);
+      break;
+   }
+
+   /* The fragments that passed the stencil test are now in stencil_pass_reg.
+    * The fragments that failed would be (mask_reg & ~stencil_pass_reg).
+    */
+}
+
+/* This function generates code that calculates a set of new stencil values
+ * given the earlier values and the operation to apply.  It does not
+ * apply any tests.  It is intended to be called up to 3 times
+ * (for the stencil fail operation, for the stencil pass-z fail operation,
+ * and for the stencil pass-z pass operation) to collect up to three
+ * possible sets of values, and for the caller to combine them based
+ * on the result of the tests.
+ *
+ * stencil_max_value should be (2^n - 1) where n is the number of bits
+ * in the stencil buffer - in other words, it should be usable as a mask.
+ */
+static void
+gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
+                   unsigned int stencil_ref_value, unsigned int stencil_max_value,
+                   unsigned int fbS_reg, unsigned int newS_reg)
+{
+   /* The code below assumes that newS_reg and fbS_reg are not the same
+    * register; if they can be, the calculations below will have to use
+    * an additional temporary register.  For now, mark the assumption
+    * with an assertion that will fail if they are the same.
+    */
+   ASSERT(fbS_reg != newS_reg);
+
+   /* The code also assumes that the stencil_max_value is of the form
+    * 2^n-1 and can therefore be used as a mask for the valid bits in
+    * addition to a maximum.  Make sure this is the case as well.
+    * The clever math below exploits the fact that incrementing a
+    * binary number serves to flip all the bits of a number starting at
+    * the LSB and continuing to (and including) the first zero bit
+    * found.  That means that a number and its increment will always
+    * have at least one bit in common (the high order bit, if nothing
+    * else) *unless* the number is zero, *or* the number is of a form
+    * consisting of some number of 1s in the low-order bits followed
+    * by nothing but 0s in the high-order bits.  The latter case
+    * implies it's of the form 2^n-1.
+    */
+   ASSERT(stencil_max_value > 0 && ((stencil_max_value + 1) & stencil_max_value) == 0);
+
+   switch(stencil_op) {
+   case PIPE_STENCIL_OP_KEEP:
+      /* newS = S */
+      spe_move(f, newS_reg, fbS_reg);
+      break;
+
+   case PIPE_STENCIL_OP_ZERO:
+      /* newS = 0 */
+      spe_zero(f, newS_reg);
+      break;
+
+   case PIPE_STENCIL_OP_REPLACE:
+      /* newS = stencil reference value */
+      spe_load_uint(f, newS_reg, stencil_ref_value);
+      break;
+
+   case PIPE_STENCIL_OP_INCR: {
+      /* newS = (s == max ? max : s + 1) */
+      unsigned int equals_reg = spe_allocate_available_register(f);
+
+      spe_compare_equal_uint(f, equals_reg, fbS_reg, stencil_max_value);
+      /* Add Word Immediate computes rT = rA + 10-bit signed immediate */
+      spe_ai(f, newS_reg, fbS_reg, 1);
+      /* selb takes rB where the mask is set: keep s where s == max, else use s + 1 */
+      spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
+
+      spe_release_register(f, equals_reg);
+      break;
+   }
+   case PIPE_STENCIL_OP_DECR: {
+      /* newS = (s == 0 ? 0 : s - 1) */
+      unsigned int equals_reg = spe_allocate_available_register(f);
+
+      spe_compare_equal_uint(f, equals_reg, fbS_reg, 0);
+      /* Add Word Immediate with a (-1) value works */
+      spe_ai(f, newS_reg, fbS_reg, -1);
+      /* selb takes rB where the mask is set: keep s where s == 0, else use s - 1 */
+      spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
+
+      spe_release_register(f, equals_reg);
+      break;
+   }
+   case PIPE_STENCIL_OP_INCR_WRAP:
+      /* newS = (s == max ? 0 : s + 1), but since max is 2^n-1, we can
+       * do a normal add and mask off the correct bits
+       */
+      spe_ai(f, newS_reg, fbS_reg, 1);
+      spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
+      break;
+
+   case PIPE_STENCIL_OP_DECR_WRAP:
+      /* newS = (s == 0 ? max : s - 1), but we'll pull the same mask trick as above */
+      spe_ai(f, newS_reg, fbS_reg, -1);
+      spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
+      break;
+
+   case PIPE_STENCIL_OP_INVERT:
+      /* newS = ~s.  We take advantage of the mask/max value to invert only
+       * the valid bits for the field so we don't have to do an extra "and".
+       */
+      spe_xor_uint(f, newS_reg, fbS_reg, stencil_max_value);
+      break;
+
+   default:
+      ASSERT(0);
+   }
+}
+
+
+/* This function generates code to get all the necessary possible
+ * stencil values.  For each of the output registers (fail_reg,
+ * zfail_reg, and zpass_reg), it either allocates a new register
+ * and calculates a new set of values based on the stencil operation,
+ * or it reuses a register allocation and calculation done for an
+ * earlier (matching) operation, or it reuses the fbS_reg register
+ * (if the stencil operation is KEEP, which doesn't change the 
+ * stencil buffer).
+ *
+ * Since this function allocates a variable number of registers,
+ * to avoid incurring complex logic to free them, they should
+ * be allocated after a spe_allocate_register_set() call
+ * and released by the corresponding spe_release_register_set() call.
+ */
+static void
+gen_get_stencil_values(struct spe_function *f, const struct pipe_depth_stencil_alpha_state *dsa,
+                       unsigned int fbS_reg, 
+                       unsigned int *fail_reg, unsigned int *zfail_reg, 
+                       unsigned int *zpass_reg, unsigned int *back_fail_reg, 
+                       unsigned int *back_zfail_reg, unsigned int *back_zpass_reg)
+{
+   unsigned zfail_op, back_zfail_op;
+
+   /* Stenciling had better be enabled here */
+   ASSERT(dsa->stencil[0].enabled);
+
+   /* If the depth test is not enabled, it is treated as though it always
+    * passes.  In particular, that means that the "zfail_op" (and the backfacing
+    * counterpart, if active) are not considered - a failing stencil test will
+    * trigger the "fail_op", and a passing stencil test will trigger the
+    * "zpass_op".
+    *
+    * By overriding the operations in this case to be PIPE_STENCIL_OP_KEEP,
+    * we keep them from being calculated.
+    */
+   if (dsa->depth.enabled) {
+      zfail_op = dsa->stencil[0].zfail_op;
+      back_zfail_op = dsa->stencil[1].zfail_op;
+   }
+   else {
+      zfail_op = PIPE_STENCIL_OP_KEEP;
+      back_zfail_op = PIPE_STENCIL_OP_KEEP;
+   }
+
+   /* One-sided or front-facing stencil */
+   if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP) {
+      *fail_reg = fbS_reg;
+   }
+   else {
+      *fail_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, dsa->stencil[0].fail_op, dsa->stencil[0].ref_value, 
+         0xff, fbS_reg, *fail_reg);
+   }
+
+   if (zfail_op == PIPE_STENCIL_OP_KEEP) {
+      *zfail_reg = fbS_reg;
+   }
+   else if (zfail_op == dsa->stencil[0].fail_op) {
+      *zfail_reg = *fail_reg;
+   }
+   else {
+      *zfail_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, dsa->stencil[0].zfail_op, dsa->stencil[0].ref_value, 
+         0xff, fbS_reg, *zfail_reg);
+   }
+
+   if (dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP) {
+      *zpass_reg = fbS_reg;
+   }
+   else if (dsa->stencil[0].zpass_op == dsa->stencil[0].fail_op) {
+      *zpass_reg = *fail_reg;
+   }
+   else if (dsa->stencil[0].zpass_op == zfail_op) {
+      *zpass_reg = *zfail_reg;
+   }
+   else {
+      *zpass_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, dsa->stencil[0].zpass_op, dsa->stencil[0].ref_value, 
+         0xff, fbS_reg, *zpass_reg);
+   }
+
+   /* If two-sided stencil is enabled, we have more work to do. */
+   if (!dsa->stencil[1].enabled) {
+      /* This just flags that the registers need not be deallocated later */
+      *back_fail_reg = fbS_reg;
+      *back_zfail_reg = fbS_reg;
+      *back_zpass_reg = fbS_reg;
+   }
+   else {
+      /* Same calculations as above, but for the back stencil */
+      if (dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP) {
+         *back_fail_reg = fbS_reg;
+      }
+      else if (dsa->stencil[1].fail_op == dsa->stencil[0].fail_op) {
+         *back_fail_reg = *fail_reg;
+      }
+      else if (dsa->stencil[1].fail_op == zfail_op) {
+         *back_fail_reg = *zfail_reg;
+      }
+      else if (dsa->stencil[1].fail_op == dsa->stencil[0].zpass_op) {
+         *back_fail_reg = *zpass_reg;
+      }
+      else {
+         *back_fail_reg = spe_allocate_available_register(f);
+         gen_stencil_values(f, dsa->stencil[1].fail_op, dsa->stencil[1].ref_value, 
+            0xff, fbS_reg, *back_fail_reg);
+      }
+
+      if (back_zfail_op == PIPE_STENCIL_OP_KEEP) {
+         *back_zfail_reg = fbS_reg;
+      }
+      else if (back_zfail_op == dsa->stencil[0].fail_op) {
+         *back_zfail_reg = *fail_reg;
+      }
+      else if (back_zfail_op == zfail_op) {
+         *back_zfail_reg = *zfail_reg;
+      }
+      else if (back_zfail_op == dsa->stencil[0].zpass_op) {
+         *back_zfail_reg = *zpass_reg;
+      }
+      else if (back_zfail_op == dsa->stencil[1].fail_op) {
+         *back_zfail_reg = *back_fail_reg;
+      }
+      else {
+         *back_zfail_reg = spe_allocate_available_register(f);
+         gen_stencil_values(f, dsa->stencil[1].zfail_op, dsa->stencil[1].ref_value, 
+            0xff, fbS_reg, *back_zfail_reg);
+      }
+
+      if (dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) {
+         *back_zpass_reg = fbS_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == dsa->stencil[0].fail_op) {
+         *back_zpass_reg = *fail_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == zfail_op) {
+         *back_zpass_reg = *zfail_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == dsa->stencil[0].zpass_op) {
+         *back_zpass_reg = *zpass_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == dsa->stencil[1].fail_op) {
+         *back_zpass_reg = *back_fail_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == back_zfail_op) {
+         *back_zpass_reg = *back_zfail_reg;
+      }
+      else {
+         *back_zfail_reg = spe_allocate_available_register(f);
+         gen_stencil_values(f, dsa->stencil[1].zpass_op, dsa->stencil[1].ref_value, 
+            0xff, fbS_reg, *back_zpass_reg);
+      }
+   } /* End of calculations for back-facing stencil */
+}
+
+/**
+ * Generate SPE code for the combined stencil and depth tests.
+ *
+ * \param f           function being generated
+ * \param dsa         depth/stencil/alpha state to generate code for
+ * \param facing_reg  select mask: back-face values are used where bits are set
+ * \param mask_reg    live-fragment mask; failing fragments are cleared from it
+ * \param fragZ_reg   incoming fragment Z values
+ * \param fbZ_reg     framebuffer Z values, handed to gen_depth_test()
+ * \param fbS_reg     framebuffer stencil values; receives the new values
+ * \return true if the generated code may require the depth and/or
+ *         stencil values to be written back to the framebuffer
+ */
+static boolean
+gen_stencil_depth_test(struct spe_function *f,
+                       const struct pipe_depth_stencil_alpha_state *dsa,
+                       const int facing_reg,
+                       const int mask_reg, const int fragZ_reg,
+                       const int fbZ_reg, const int fbS_reg)
+{
+   /* True if we've generated code that could require writeback to the
+    * depth and/or stencil buffers
+    */
+   boolean modified_buffers = false;
+
+   boolean need_to_calculate_stencil_values;
+   boolean need_to_writemask_stencil_values;
+
+   /* Registers.  We may or may not actually allocate these, depending
+    * on whether the state values indicate that we need them.
+    */
+   unsigned int stencil_pass_reg, stencil_fail_reg;
+   unsigned int stencil_fail_values, stencil_pass_depth_fail_values, stencil_pass_depth_pass_values;
+   unsigned int stencil_writemask_reg;
+   unsigned int zmask_reg;
+   unsigned int newS_reg;
+
+   /* Stenciling is quite complex: up to six different configurable stencil
+    * operations/calculations can be required (three each for front-facing
+    * and back-facing fragments).  Many of those operations will likely be
+    * identical, so we try to avoid calculating the same values twice.
+    *
+    * To make register management easier, we start a new register set; all
+    * registers allocated from here on are released at once at the end, so
+    * we don't have to track exactly which registers we allocated.
+    */
+   spe_allocate_register_set(f);
+
+   /* Calculate the writemask.  If the writemask is trivial (all 0s: no
+    * stencil value can change, so none need be calculated; or all 1s:
+    * values must be calculated but need no masking), we can avoid
+    * generating masking code.  When back-face stencil is enabled its
+    * writemask must be trivial in the same way for the shortcut to apply.
+    */
+   if (dsa->stencil[0].write_mask == 0x0 && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0x00)) {
+      /* Trivial: don't need to calculate stencil values, and don't need to
+       * write them back to the framebuffer.
+       */
+      need_to_calculate_stencil_values = false;
+      need_to_writemask_stencil_values = false;
+   }
+   else if (dsa->stencil[0].write_mask == 0xff && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0xff)) {
+      /* Still trivial, but a little less so.  We need to write the stencil
+       * values, but we don't need to mask them.
+       */
+      need_to_calculate_stencil_values = true;
+      need_to_writemask_stencil_values = false;
+   }
+   else {
+      /* The general case: calculate, mask, and write */
+      need_to_calculate_stencil_values = true;
+      need_to_writemask_stencil_values = true;
+
+      /* While we're here, generate code that calculates what the
+       * writemask should be.  If backface stenciling is enabled,
+       * and the backface writemask is not the same as the frontface
+       * writemask, we'll have to generate code that merges the
+       * two masks into a single effective mask based on fragment facing.
+       */
+      stencil_writemask_reg = spe_allocate_available_register(f);
+      spe_load_uint(f, stencil_writemask_reg, dsa->stencil[0].write_mask);
+      if (dsa->stencil[1].enabled && dsa->stencil[0].write_mask != dsa->stencil[1].write_mask) {
+         unsigned int back_write_mask_reg = spe_allocate_available_register(f);
+         spe_load_uint(f, back_write_mask_reg, dsa->stencil[1].write_mask);
+         spe_selb(f, stencil_writemask_reg, stencil_writemask_reg, back_write_mask_reg, facing_reg);
+         spe_release_register(f, back_write_mask_reg);
+      }
+   }
+
+   /* At least one-sided stenciling must be on.  Generate code that
+    * runs the stencil test on the basic/front-facing stencil, leaving
+    * the mask of passing stencil bits in stencil_pass_reg.  This mask will
+    * be used both to mask the set of active pixels, and also to
+    * determine how the stencil buffer changes.
+    *
+    * This test will *not* change the value in mask_reg (because we don't
+    * yet know whether to apply the two-sided stencil or one-sided stencil).
+    */
+   stencil_pass_reg = spe_allocate_available_register(f);
+   gen_stencil_test(f, &dsa->stencil[0], mask_reg, fbS_reg, stencil_pass_reg);
+
+   /* If two-sided stenciling is on, generate code to run the stencil
+    * test on the backfacing stencil as well, and combine the two results
+    * into the one correct result based on facing.
+    */
+   if (dsa->stencil[1].enabled) {
+      unsigned int temp_reg = spe_allocate_available_register(f);
+      gen_stencil_test(f, &dsa->stencil[1], mask_reg, fbS_reg, temp_reg);
+      spe_selb(f, stencil_pass_reg, stencil_pass_reg, temp_reg, facing_reg);
+      spe_release_register(f, temp_reg);
+   }
+
+   /* Generate code that, given the mask of valid fragments and the
+    * mask of valid fragments that passed the stencil test, computes
+    * the mask of valid fragments that failed the stencil test.  This
+    * must happen before the depth test, which should not run on
+    * stencil-failed fragments and which itself updates mask_reg.
+    */
+   stencil_fail_reg = spe_allocate_available_register(f);
+   spe_andc(f, stencil_fail_reg, mask_reg, stencil_pass_reg);
+   /* Now remove the stenciled-out pixels from the valid fragment mask,
+    * so we can later use the valid fragment mask in the depth test.
+    */
+   spe_and(f, mask_reg, mask_reg, stencil_pass_reg);
+
+   /* We may not need to calculate stencil values, if the writemask is off */
+   if (need_to_calculate_stencil_values) {
+      unsigned int back_stencil_fail_values, back_stencil_pass_depth_fail_values, back_stencil_pass_depth_pass_values;
+      unsigned int front_stencil_fail_values, front_stencil_pass_depth_fail_values, front_stencil_pass_depth_pass_values;
+
+      /* Generate code that calculates exactly which stencil values we need,
+       * without calculating the same value twice (say, if two different
+       * stencil ops have the same value).  This handles one-sided and
+       * two-sided stenciling, and takes into account whether the depth
+       * test is enabled (if it is off, no zfail results are needed, as a
+       * disabled depth test is considered to always pass).  Any value that
+       * does not need to be calculated comes back as fbS_reg itself.
+       *
+       * Registers allocated here are released with the register set.
+       */
+      gen_get_stencil_values(f, dsa, fbS_reg,
+         &front_stencil_fail_values, &front_stencil_pass_depth_fail_values,
+         &front_stencil_pass_depth_pass_values, &back_stencil_fail_values,
+         &back_stencil_pass_depth_fail_values, &back_stencil_pass_depth_pass_values);
+
+      /* The various stencil values registers may overlap with each other
+       * and with fbS_reg arbitrarily (as any particular operation is
+       * only calculated once and stored in one register, no matter
+       * how many times it is used).  So we can't change the values
+       * within those registers directly without risking corrupting
+       * another value that shares the register.
+       *
+       * Avoid this by allocating new registers to hold the merged results
+       * (there may be 2, if the depth test is off, or 3, if it is on).
+       * These will be released as part of the register set.
+       */
+      if (!dsa->stencil[1].enabled) {
+         /* The easy case: if two-sided stenciling is *not* enabled, we
+          * just use the front-sided values.
+          */
+         stencil_fail_values = front_stencil_fail_values;
+         stencil_pass_depth_fail_values = front_stencil_pass_depth_fail_values;
+         stencil_pass_depth_pass_values = front_stencil_pass_depth_pass_values;
+      }
+      else { /* two-sided stencil enabled */
+         /* Allocate new registers for the needed merged values */
+         stencil_fail_values = spe_allocate_available_register(f);
+         spe_selb(f, stencil_fail_values, front_stencil_fail_values, back_stencil_fail_values, facing_reg);
+         if (dsa->depth.enabled) {
+            stencil_pass_depth_fail_values = spe_allocate_available_register(f);
+            spe_selb(f, stencil_pass_depth_fail_values, front_stencil_pass_depth_fail_values, back_stencil_pass_depth_fail_values, facing_reg);
+         }
+         else {
+            stencil_pass_depth_fail_values = fbS_reg;
+         }
+         stencil_pass_depth_pass_values = spe_allocate_available_register(f);
+         spe_selb(f, stencil_pass_depth_pass_values, front_stencil_pass_depth_pass_values, back_stencil_pass_depth_pass_values, facing_reg);
+      }
+   }
+
+   /* We now have all the stencil values we need.  We also need
+    * the results of the depth test to figure out which
+    * stencil values will become the new stencil values.  (Even if
+    * we aren't actually calculating stencil values, we need to apply
+    * the depth test if it's enabled.)
+    *
+    * The code generated by gen_depth_test() returns the results of the
+    * test in the given register, but also alters the mask_reg based
+    * on the results of the test.
+    */
+   if (dsa->depth.enabled) {
+      zmask_reg = spe_allocate_available_register(f);
+      modified_buffers |= gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
+   }
+
+   if (need_to_calculate_stencil_values) {
+      /* If we need to writemask the stencil values before going into
+       * the stencil buffer, we'll have to use a new register to
+       * hold the new values.  If not, we can just keep using the
+       * current register.
+       */
+      if (need_to_writemask_stencil_values) {
+         newS_reg = spe_allocate_available_register(f);
+         spe_move(f, newS_reg, fbS_reg);
+      }
+      else {
+         newS_reg = fbS_reg;
+      }
+
+      /* Merge in the selected stencil fail values.  Every merge below can
+       * change the stencil buffer, so each one flags a writeback.
+       */
+      if (stencil_fail_values != fbS_reg) {
+         spe_selb(f, newS_reg, newS_reg, stencil_fail_values, stencil_fail_reg);
+         modified_buffers = true;
+      }
+
+      /* Same for the stencil pass/depth fail values.  If this calculation
+       * is not needed (say, if depth test is off), then the
+       * stencil_pass_depth_fail_values register will be equal to fbS_reg
+       * and we'll skip the calculation.
+       */
+      if (stencil_pass_depth_fail_values != fbS_reg) {
+         /* We don't actually have a stencil pass/depth fail mask yet.
+          * Calculate it here from the stencil passing mask and the
+          * depth passing mask.  Note that zmask_reg *must* have been
+          * set above if we're here.
+          */
+         unsigned int stencil_pass_depth_fail_mask = spe_allocate_available_register(f);
+         spe_andc(f, stencil_pass_depth_fail_mask, stencil_pass_reg, zmask_reg);
+         spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_fail_values, stencil_pass_depth_fail_mask);
+         spe_release_register(f, stencil_pass_depth_fail_mask);
+         modified_buffers = true;
+      }
+
+      /* Same for the stencil pass/depth pass values.  Unlike the case
+       * above, we *can* get here with zmask_reg unset (stencil test on,
+       * depth test off).  A disabled depth test always passes, so then
+       * the stencil pass mask alone selects the fragments to update.
+       */
+      if (stencil_pass_depth_pass_values != fbS_reg) {
+         if (dsa->depth.enabled) {
+            unsigned int stencil_pass_depth_pass_mask = spe_allocate_available_register(f);
+            spe_and(f, stencil_pass_depth_pass_mask, stencil_pass_reg, zmask_reg);
+            spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_depth_pass_mask);
+            spe_release_register(f, stencil_pass_depth_pass_mask);
+         }
+         else {
+            spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_reg);
+         }
+         modified_buffers = true;
+      }
+
+      /* Almost done.  If we need to writemask, do it now, leaving the
+       * results in fbS_reg.  Otherwise the results are *already* in
+       * fbS_reg.  selb makes a fine writemask: where a mask bit is 0
+       * the first (original) value is retained, where it is 1 the
+       * second (new) value is taken.
+       */
+      if (need_to_writemask_stencil_values) {
+         spe_selb(f, fbS_reg, fbS_reg, newS_reg, stencil_writemask_reg);
+      }
+   } /* done calculating stencil values */
+
+   /* The stencil and/or depth values have been applied, and the
+    * mask_reg, fbS_reg, and fbZ_reg values have been updated.
+    * Release every register we allocated as part of the register
+    * set, and go home.
+    */
+   spe_release_register_set(f);
+
+   /* Return true if we could have modified the stencil and/or
+    * depth buffers.
+    */
+   return modified_buffers;
+}
+
+
 /**
  * Generate SPE code to implement the fragment operations (alpha test,
  * depth test, stencil test, blending, colormask, and final
@@ -1156,6 +1800,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    const int fragB_reg = 10;  /* vector float */
    const int fragA_reg = 11;  /* vector float */
    const int mask_reg = 12;   /* vector uint */
+   const int facing_reg = 13; /* uint */
 
    /* offset of quad from start of tile
     * XXX assuming 4-byte pixels for color AND Z/stencil!!!!
@@ -1183,6 +1828,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    spe_allocate_register(f, fragB_reg);
    spe_allocate_register(f, fragA_reg);
    spe_allocate_register(f, mask_reg);
+   spe_allocate_register(f, facing_reg);
 
    quad_offset_reg = spe_allocate_available_register(f);
    fbRGBA_reg = spe_allocate_available_register(f);
@@ -1195,6 +1841,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
 
       ASSERT(TILE_SIZE == 32);
 
+      spe_comment(f, 0, "Computing tile location in memory");
       spe_rotmi(f, y2_reg, y_reg, -1);  /* y2 = y / 2 */
       spe_rotmi(f, x2_reg, x_reg, -1);  /* x2 = x / 2 */
       spe_shli(f, y2_reg, y2_reg, 4);   /* y2 *= 16 */
@@ -1205,124 +1852,164 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       spe_release_register(f, y2_reg);
    }
 
-
    if (dsa->alpha.enabled) {
       gen_alpha_test(dsa, f, mask_reg, fragA_reg);
    }
 
+   /* If we need the stencil buffers (because one- or two-sided stencil is
+    * enabled) or the depth buffer (because the depth test is enabled),
+    * go grab them.  Note that if either one- or two-sided stencil is
+    * enabled, dsa->stencil[0].enabled will be true.
+    */
    if (dsa->depth.enabled || dsa->stencil[0].enabled) {
       const enum pipe_format zs_format = cell->framebuffer.zsbuf->format;
       boolean write_depth_stencil;
 
-      int fbZ_reg = spe_allocate_available_register(f); /* Z values */
-      int fbS_reg = spe_allocate_available_register(f); /* Stencil values */
+      /* We may or may not need to allocate a register for Z or stencil values */
+      boolean fbS_reg_set = false, fbZ_reg_set = false;
+      unsigned int fbS_reg, fbZ_reg = 0;
+
+      spe_comment(f, 0, "Loading Z/stencil tile");
 
       /* fetch quad of depth/stencil values from tile at (x,y) */
       /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
+      /* XXX Not sure this is allowed if we've only got a 16-bit Z buffer... */
       spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
 
-      if (dsa->depth.enabled) {
-         /* Extract Z bits from fbZS_reg into fbZ_reg */
-         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
-            int mask_reg = spe_allocate_available_register(f);
-            spe_fsmbi(f, mask_reg, 0x7777);  /* mask[0,1,2,3] = 0x00ffffff */
-            spe_and(f, fbZ_reg, fbZS_reg, mask_reg);  /* fbZ = fbZS & mask */
-            spe_release_register(f, mask_reg);
-            /* OK, fbZ_reg has four 24-bit Z values now */
-         }
-         else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
-                  zs_format == PIPE_FORMAT_Z24X8_UNORM) {
-            spe_rotmi(f, fbZ_reg, fbZS_reg, -8);  /* fbZ = fbZS >> 8 */
-            /* OK, fbZ_reg has four 24-bit Z values now */
-         }
-         else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
-            spe_move(f, fbZ_reg, fbZS_reg);
-            /* OK, fbZ_reg has four 32-bit Z values now */
-         }
-         else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
-            spe_move(f, fbZ_reg, fbZS_reg);
-            /* OK, fbZ_reg has four 16-bit Z values now */
-         }
-         else {
-            ASSERT(0);  /* invalid format */
-         }
-
-         /* Convert fragZ values from float[4] to 16, 24 or 32-bit uint[4] */
-         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_X8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_Z24S8_UNORM ||
-             zs_format == PIPE_FORMAT_Z24X8_UNORM) {
-            /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
-            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
-            /* fragZ = fragZ >> 8 */
-            spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
-         }
-         else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
-            /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
-            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
-         }
-         else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
-            /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
-            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
-            /* fragZ = fragZ >> 16 */
-            spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
-         }
-      }
-      else {
-         /* no Z test, but set Z to zero so we don't OR-in garbage below */
-         spe_load_uint(f, fbZ_reg, 0); /* XXX set to zero for now */
+      /* From the Z/stencil buffer format, pull out the bits we need for
+       * Z and/or stencil.  We'll also convert the incoming fragment Z
+       * value in fragZ_reg from a floating point value in [0.0..1.0] to
+       * an unsigned integer value with the appropriate resolution.
+       */
+      switch(zs_format) {
+
+         case PIPE_FORMAT_S8Z24_UNORM: /* fall through */
+         case PIPE_FORMAT_X8Z24_UNORM:
+            if (dsa->depth.enabled) {
+               /* We need the Z part at least */
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* four 24-bit Z values in the low-order bits */
+               spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff);
+
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 24-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+               spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+            }
+            if (dsa->stencil[0].enabled) {
+               setup_optional_register(f, &fbS_reg_set, &fbS_reg);
+               /* four 8-bit stencil values in the high-order bits */
+               spe_rotmi(f, fbS_reg, fbZS_reg, -24);
+            }
+            break;
+
+         case PIPE_FORMAT_Z24S8_UNORM: /* fall through */
+         case PIPE_FORMAT_Z24X8_UNORM:
+            if (dsa->depth.enabled) {
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* shift by 8 to get the upper 24-bit values */
+               spe_rotmi(f, fbS_reg, fbZS_reg, -8);
+
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 24-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+               spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+            }
+            if (dsa->stencil[0].enabled) {
+               setup_optional_register(f, &fbS_reg_set, &fbS_reg);
+               /* 8-bit stencil in the low-order bits - mask them out */
+               spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff);
+            }
+            break;
+
+         case PIPE_FORMAT_Z32_UNORM:
+            if (dsa->depth.enabled) {
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* Copy over 4 32-bit values */
+               spe_move(f, fbZ_reg, fbZS_reg);
+
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 32-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+            }
+            /* No stencil, so can't do anything there */
+            break;
+
+         case PIPE_FORMAT_Z16_UNORM:
+            if (dsa->depth.enabled) {
+               /* XXX Not sure this is correct, but it was here before, so we're
+                * going with it for now
+                */
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* Copy over 4 32-bit values */
+               spe_move(f, fbZ_reg, fbZS_reg);
+
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 16-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+               spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
+            }
+            /* No stencil */
+            break;
+
+         default:
+            ASSERT(0); /* invalid format */
       }
 
-
+      /* If stencil is enabled, use the stencil-specific code
+       * generator to generate both the stencil and depth (if needed)
+       * tests.  Otherwise, if only depth is enabled, generate
+       * a quick depth test.  The test generators themselves will
+       * report back whether the depth/stencil buffer has to be
+       * written back.
+       */
       if (dsa->stencil[0].enabled) {
-         /* Extract Stencil bit sfrom fbZS_reg into fbS_reg */
-         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
-            /* XXX extract with a shift */
-            ASSERT(0);
-         }
-         else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
-                  zs_format == PIPE_FORMAT_Z24X8_UNORM) {
-            /* XXX extract with a mask */
-            ASSERT(0);
-         }
-      }
-      else {
-         /* no stencil test, but set to zero so we don't OR-in garbage below */
-         spe_load_uint(f, fbS_reg, 0); /* XXX set to zero for now */
-      }
+         /* This will perform the stencil and depth tests, and update
+          * the mask_reg, fbZ_reg, and fbS_reg as required by the
+          * tests.
+          */
+         ASSERT(fbS_reg_set);
+         ASSERT(fbZ_reg_set);
+         spe_comment(f, 0, "Perform stencil test");
 
-      if (dsa->stencil[0].enabled) {
-         /* XXX this may involve depth testing too */
-         // gen_stencil_test(dsa, f, ... );
-         ASSERT(0);
+         write_depth_stencil = gen_stencil_depth_test(f, dsa, facing_reg, mask_reg, fragZ_reg, fbZ_reg, fbS_reg);
       }
       else if (dsa->depth.enabled) {
          int zmask_reg = spe_allocate_available_register(f);
-         gen_depth_test(dsa, f, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
+         spe_comment(f, 0, "Perform depth test");
+         write_depth_stencil = gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
          spe_release_register(f, zmask_reg);
       }
-
-      /* do we need to write Z and/or Stencil back into framebuffer? */
-      write_depth_stencil = (dsa->depth.writemask |
-                             dsa->stencil[0].write_mask |
-                             dsa->stencil[1].write_mask);
+      else {
+         write_depth_stencil = false;
+      }
 
       if (write_depth_stencil) {
          /* Merge latest Z and Stencil values into fbZS_reg.
           * fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
           * fbS_reg has four 8-bit Z values in bits [7..0].
           */
+         spe_comment(f, 0, "Storing depth/stencil values");
          if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
              zs_format == PIPE_FORMAT_X8Z24_UNORM) {
-            spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
-            spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+            if (fbS_reg_set) {
+               spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
+               spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+            }
+            else {
+               spe_move(f, fbZS_reg, fbZ_reg);
+            }
          }
          else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
                   zs_format == PIPE_FORMAT_Z24X8_UNORM) {
             spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
-            spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+            if (fbS_reg_set) {
+               spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+            }
          }
          else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
             spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
@@ -1341,11 +2028,10 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
          spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
       }
 
-      spe_release_register(f, fbZ_reg);
-      spe_release_register(f, fbS_reg);
+      release_optional_register(f, &fbZ_reg_set, fbZ_reg);
+      release_optional_register(f, &fbS_reg_set, fbS_reg);
    }
 
-
    /* Get framebuffer quad/colors.  We'll need these for blending,
     * color masking, and to obey the quad/pixel mask.
     * Load: fbRGBA_reg = memory[color_tile + quad_offset]
@@ -1354,8 +2040,8 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
     */
    spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg);
 
-
    if (blend->blend_enable) {
+      spe_comment(f, 0, "Perform blending");
       gen_blend(blend, blend_color, f, color_format,
                 fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg);
    }
@@ -1369,19 +2055,21 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       int rgba_reg = spe_allocate_available_register(f);
 
       /* Pack four float colors as four 32-bit int colors */
+      spe_comment(f, 0, "Convert fragment colors to framebuffer colors");
       gen_pack_colors(f, color_format,
                       fragR_reg, fragG_reg, fragB_reg, fragA_reg,
                       rgba_reg);
 
       if (blend->logicop_enable) {
+         spe_comment(f, 0, "Compute logic op");
          gen_logicop(blend, f, rgba_reg, fbRGBA_reg);
       }
 
       if (blend->colormask != PIPE_MASK_RGBA) {
+         spe_comment(f, 0, "Compute color mask");
          gen_colormask(f, blend->colormask, color_format, rgba_reg, fbRGBA_reg);
       }
 
-
       /* Mix fragment colors with framebuffer colors using the quad/pixel mask:
        * if (mask[i])
        *    rgba[i] = rgba[i];
@@ -1393,6 +2081,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       /* Store updated quad in tile:
        * memory[color_tile + quad_offset] = rgba_reg;
        */
+      spe_comment(f, 0, "Store framebuffer colors");
       spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg);
 
       spe_release_register(f, rgba_reg);
diff --git a/src/gallium/drivers/cell/ppu/cell_render.c b/src/gallium/drivers/cell/ppu/cell_render.c
index dd25ae880e..79cb8df82f 100644
--- a/src/gallium/drivers/cell/ppu/cell_render.c
+++ b/src/gallium/drivers/cell/ppu/cell_render.c
@@ -152,6 +152,7 @@ cell_flush_prim_buffer(struct cell_context *cell)
       struct cell_command_render *render = &cell_global.command[i].render;
       render->prim_type = PIPE_PRIM_TRIANGLES;
       render->num_verts = cell->prim_buffer.num_verts;
+      render->front_winding = cell->rasterizer->front_winding;
       render->vertex_size = cell->vertex_info->size * 4;
       render->xmin = cell->prim_buffer.xmin;
       render->ymin = cell->prim_buffer.ymin;
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c
index aa63435b93..578ddf62dc 100644
--- a/src/gallium/drivers/cell/ppu/cell_vbuf.c
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c
@@ -214,6 +214,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
 
       render->opcode = CELL_CMD_RENDER;
       render->prim_type = cvbr->prim;
+      render->front_winding = cell->rasterizer->front_winding;
 
       render->num_indexes = nr_indices;
       render->min_index = min_index;
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 29a305232e..1cd577c23c 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -73,7 +73,8 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y,
                                       vector float fragGreen,
                                       vector float fragBlue,
                                       vector float fragAlpha,
-                                      vector unsigned int mask);
+                                      vector unsigned int mask,
+                                      uint facing);
 
 /** Function for running fragment program */
 typedef void (*spu_fragment_program_func)(vector float *inputs,
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index f107764fb2..d252fa6dc1 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -57,7 +57,8 @@ spu_fallback_fragment_ops(uint x, uint y,
                           vector float fragG,
                           vector float fragB,
                           vector float fragA,
-                          vector unsigned int mask)
+                          vector unsigned int mask,
+                          uint facing)
 {
    vector float frag_aos[4];
    unsigned int fbc0, fbc1, fbc2, fbc3 ; /* framebuffer/tile colors */
@@ -433,23 +434,23 @@ spu_fallback_fragment_ops(uint x, uint y,
       /* Form bitmask depending on color buffer format and colormask bits */
       switch (spu.fb.color_format) {
       case PIPE_FORMAT_A8R8G8B8_UNORM:
-         if (spu.blend.colormask & (1<<0))
+         if (spu.blend.colormask & PIPE_MASK_R)
             cmask |= 0x00ff0000; /* red */
-         if (spu.blend.colormask & (1<<1))
+         if (spu.blend.colormask & PIPE_MASK_G)
             cmask |= 0x0000ff00; /* green */
-         if (spu.blend.colormask & (1<<2))
+         if (spu.blend.colormask & PIPE_MASK_B)
             cmask |= 0x000000ff; /* blue */
-         if (spu.blend.colormask & (1<<3))
+         if (spu.blend.colormask & PIPE_MASK_A)
             cmask |= 0xff000000; /* alpha */
          break;
       case PIPE_FORMAT_B8G8R8A8_UNORM:
-         if (spu.blend.colormask & (1<<0))
+         if (spu.blend.colormask & PIPE_MASK_R)
             cmask |= 0x0000ff00; /* red */
-         if (spu.blend.colormask & (1<<1))
+         if (spu.blend.colormask & PIPE_MASK_G)
             cmask |= 0x00ff0000; /* green */
-         if (spu.blend.colormask & (1<<2))
+         if (spu.blend.colormask & PIPE_MASK_B)
             cmask |= 0xff000000; /* blue */
-         if (spu.blend.colormask & (1<<3))
+         if (spu.blend.colormask & PIPE_MASK_A)
             cmask |= 0x000000ff; /* alpha */
          break;
       default:
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
index f817abf046..a61689c83a 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
@@ -38,7 +38,8 @@ spu_fallback_fragment_ops(uint x, uint y,
                           vector float fragGreen,
                           vector float fragBlue,
                           vector float fragAlpha,
-                          vector unsigned int mask);
+                          vector unsigned int mask,
+                          uint facing);
 
 
 #endif /* SPU_PER_FRAGMENT_OP */
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 305dc98881..82dbeb26b7 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -279,7 +279,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
          v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
          v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
 
-         drawn += tri_draw(v0, v1, v2, tx, ty);
+         drawn += tri_draw(v0, v1, v2, tx, ty, render->front_winding);
       }
 
       //printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3);
@@ -297,5 +297,3 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       printf("SPU %u: RENDER done\n",
              spu.init.id);
 }
-
-
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 0a8fb56a62..6039cd80b2 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -118,6 +118,8 @@ struct setup_stage {
 
    float oneoverarea;
 
+   uint facing;
+
    uint tx, ty;
 
    int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy;
@@ -274,7 +276,7 @@ eval_z(float x, float y)
  * overall.
  */
 static INLINE void
-emit_quad( int x, int y, mask_t mask )
+emit_quad( int x, int y, mask_t mask)
 {
    /* If any bits in mask are set... */
    if (spu_extract(spu_orx(mask), 0)) {
@@ -344,7 +346,8 @@ emit_quad( int x, int y, mask_t mask )
                              fragZ,
                              soa_frag[0], soa_frag[1],
                              soa_frag[2], soa_frag[3],
-                             mask);
+                             mask,
+                             setup.facing);
          }
 
       }
@@ -379,7 +382,8 @@ emit_quad( int x, int y, mask_t mask )
                           outputs[0*4+1],
                           outputs[0*4+2],
                           outputs[0*4+3],
-                          mask);
+                          mask,
+                          setup.facing);
       }
    }
 }
@@ -483,7 +487,7 @@ static void flush_spans( void )
     */
    for (x = block(minleft); x <= block(maxright); x += 2) {
 #if 1
-      emit_quad( x, setup.span.y, calculate_mask( x ) );
+      emit_quad( x, setup.span.y, calculate_mask( x ));
 #endif
    }
 
@@ -902,13 +906,28 @@ static void subtriangle( struct edge *eleft,
    eright->sy += lines;
 }
 
+static float
+determinant( const float *v0,
+             const float *v1,
+             const float *v2 )
+{
+   /* 2D edge vectors measured from v2: a = v0 - v2, b = v1 - v2 */
+   const float ax = v0[0] - v2[0], ay = v0[1] - v2[1];
+   const float bx = v1[0] - v2[0], by = v1[1] - v2[1];
+
+   /* Return the z component of cross(a, b).  Only the x and y
+    * coordinates (elements 0 and 1) of each vertex are read.
+    */
+   return ax * by - ay * bx;
+}
+
 
 /**
  * Draw triangle into tile at (tx, ty) (tile coords)
  * The tile data should have already been fetched.
  */
 boolean
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
+tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding)
 {
    setup.tx = tx;
    setup.ty = ty;
@@ -919,6 +938,12 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
    setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
    setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
 
+   /* Before we sort vertices, determine the facing of the triangle,
+    * which will be needed for front/back-face stencil application
+    */
+   float det = determinant(v0, v1, v2);
+   setup.facing = (det > 0.0) ^ (front_winding == PIPE_WINDING_CW);
+
    if (!setup_sort_vertices((struct vertex_header *) v0,
                             (struct vertex_header *) v1,
                             (struct vertex_header *) v2)) {
diff --git a/src/gallium/drivers/cell/spu/spu_tri.h b/src/gallium/drivers/cell/spu/spu_tri.h
index aa694dd7c9..abc3d35160 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.h
+++ b/src/gallium/drivers/cell/spu/spu_tri.h
@@ -31,7 +31,7 @@
 
 
 extern boolean
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty);
+tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding);
 
 
 #endif /* SPU_TRI_H */
-- 
cgit v1.2.3


From 7053f8c902e904495dffbbf6ea55f414cec780e7 Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Mon, 6 Oct 2008 11:54:22 +0100
Subject: rtasm: fix debug build

---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index cc5871f873..dd26d4d9ed 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -657,7 +657,7 @@ void sse_movntps( struct x86_function *p,
                   struct x86_reg dst,
                   struct x86_reg src)
 {
-   DUMP_RR( dst, reg );
+   DUMP_RR( dst, src );
 
    assert(dst.mod != mod_REG);
    assert(src.mod == mod_REG);
-- 
cgit v1.2.3


From f7ee3c979261b4a2b77365b47c7147f69fbfd606 Mon Sep 17 00:00:00 2001
From: Brian <brian.paul@tungstengraphics.com>
Date: Mon, 6 Oct 2008 18:31:56 -0600
Subject: gallium: replace assertion with conditional/recovery code

The assertion failed when we ran out of exec memory.
Found with conform texcombine test.
---
 src/gallium/auxiliary/rtasm/rtasm_x86sse.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index dd26d4d9ed..ad9d8f8ced 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -370,7 +370,11 @@ void x86_jcc( struct x86_function *p,
    DUMP_I(cc);
    
    if (offset < 0) {
-      assert(p->csr - p->store > -offset);
+      /*assert(p->csr - p->store > -offset);*/
+      if (p->csr - p->store <= -offset) {
+         /* probably out of memory (using the error_overflow buffer) */
+         return;
+      }
    }
 
    if (offset <= 127 && offset >= -128) {
-- 
cgit v1.2.3


From 73d00b9e93a9e8a5fecb0de224552741e389fc11 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 8 Oct 2008 16:33:04 -0600
Subject: gallium: better instruction printing for SPE code

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 46 ++++++++++++++++++++++-------
 1 file changed, 36 insertions(+), 10 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 8a87e9abb1..a6dd7ef311 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -164,6 +164,24 @@ rem_prefix(const char *longname)
 }
 
 
+static const char *
+reg_name(int reg)
+{
+   switch (reg) {
+   case SPE_REG_SP:
+      return "$sp";
+   case SPE_REG_RA:
+      return "$lr";
+   default:
+      {
+         static char buf[10];
+         sprintf(buf, "$%d", reg);
+         return buf;
+      }
+   }
+}
+
+
 static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
 		    unsigned rA, unsigned rB, const char *name)
 {
@@ -176,7 +194,8 @@ static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
     assert(p->num_inst <= p->max_inst);
     if (p->print) {
        indent(p);
-       printf("%s\t$%d, $%d, $%d\n", rem_prefix(name), rT, rA, rB);
+       printf("%s\t%s, %s, %s\n",
+              rem_prefix(name), reg_name(rT), reg_name(rA), reg_name(rB));
     }
 }
 
@@ -194,7 +213,8 @@ static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
     assert(p->num_inst <= p->max_inst);
     if (p->print) {
        indent(p);
-       printf("%s\t$%d, $%d, $%d, $%d\n", rem_prefix(name), rT, rA, rB, rC);
+       printf("%s\t%s, %s, %s, %s\n", rem_prefix(name), reg_name(rT),
+              reg_name(rA), reg_name(rB), reg_name(rC));
     }
 }
 
@@ -211,7 +231,8 @@ static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
     assert(p->num_inst <= p->max_inst);
     if (p->print) {
        indent(p);
-       printf("%s\t$%d, $%d, 0x%x\n", rem_prefix(name), rT, rA, imm);
+       printf("%s\t%s, %s, 0x%x\n",
+              rem_prefix(name), reg_name(rT), reg_name(rA), imm);
     }
 }
 
@@ -229,7 +250,8 @@ static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
     assert(p->num_inst <= p->max_inst);
     if (p->print) {
        indent(p);
-       printf("%s\t$%d, $%d, 0x%x\n", rem_prefix(name), rT, rA, imm);
+       printf("%s\t%s, %s, 0x%x\n",
+              rem_prefix(name), reg_name(rT), reg_name(rA), imm);
     }
 }
 
@@ -248,10 +270,14 @@ static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
     if (p->print) {
        indent(p);
        if (strcmp(name, "spe_lqd") == 0 ||
-           strcmp(name, "spe_stqd") == 0)
-          printf("%s\t$%d, 0x%x($%d)\n", rem_prefix(name), rT, imm, rA);
-       else
-          printf("%s\t$%d, $%d, 0x%x\n", rem_prefix(name), rT, rA, imm);
+           strcmp(name, "spe_stqd") == 0) {
+          printf("%s\t%s, %d(%s)\n",
+                 rem_prefix(name), reg_name(rT), imm, reg_name(rA));
+       }
+       else {
+          printf("%s\t%s, %s, 0x%x\n",
+                 rem_prefix(name), reg_name(rT), reg_name(rA), imm);
+       }
     }
 }
 
@@ -267,7 +293,7 @@ static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
     assert(p->num_inst <= p->max_inst);
     if (p->print) {
        indent(p);
-       printf("%s\t$%d, 0x%x\n", rem_prefix(name), rT, imm);
+       printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm);
     }
 }
 
@@ -283,7 +309,7 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
     assert(p->num_inst <= p->max_inst);
     if (p->print) {
        indent(p);
-       printf("%s\t$%d, 0x%x\n", rem_prefix(name), rT, imm);
+       printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm);
     }
 }
 
-- 
cgit v1.2.3


From 5c57cbec32136c25f104872179d979098be9a1a7 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 8 Oct 2008 16:35:40 -0600
Subject: gallium: asst. clean-ups

Don't use register qualifier.  Doxygen-ize comments.  Remove 'extern'.
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index a6dd7ef311..c442b1f6aa 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -385,7 +385,7 @@ void _name (struct spe_function *p, int imm) \
  */
 void spe_init_func(struct spe_function *p, unsigned code_size)
 {
-    register unsigned int i;
+    unsigned int i;
 
     p->store = align_malloc(code_size, 16);
     p->num_inst = 0;
@@ -475,7 +475,7 @@ void spe_release_register(struct spe_function *p, int reg)
  */
 void spe_allocate_register_set(struct spe_function *p)
 {
-   register unsigned int i;
+   unsigned int i;
 
    /* Keep track of the set count.  If it ever wraps around to 0, 
     * we're in trouble.
@@ -489,7 +489,8 @@ void spe_allocate_register_set(struct spe_function *p)
     * when the register set is released.
     */
    for (i = 0; i < SPE_NUM_REGS; i++) {
-      if (p->regs[i] > 0) p->regs[i]++;
+      if (p->regs[i] > 0)
+         p->regs[i]++;
    }
 }
 
@@ -506,7 +507,8 @@ void spe_release_register_set(struct spe_function *p)
     * available.
     */
    for (i = 0; i < SPE_NUM_REGS; i++) {
-      if (p->regs[i] > 0) p->regs[i]--;
+      if (p->regs[i] > 0)
+         p->regs[i]--;
    }
 }
 
@@ -525,7 +527,7 @@ spe_indent(struct spe_function *p, int spaces)
 }
 
 
-extern void
+void
 spe_comment(struct spe_function *p, int rel_indent, const char *s)
 {
    if (p->print) {
@@ -710,10 +712,12 @@ void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
    }
 }
 
-/* This function is constructed identically to spe_sor_uint() below.
+/**
+ * This function is constructed identically to spe_sor_uint() below.
  * Changes to one should be made in the other.
  */
-void spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+void
+spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
 {
    /* If we can, emit a single instruction, either And Byte Immediate
     * (which uses the same constant across each byte), And Halfword Immediate
@@ -723,7 +727,7 @@ void spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int
     *
     * Otherwise, we'll need to use a temporary register.
     */
-   register unsigned int tmp;
+   unsigned int tmp;
 
    /* If the upper 23 bits are all 0s or all 1s, sign extension
     * will work and we can use And Word Immediate
@@ -760,10 +764,12 @@ void spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int
    spe_release_register(p, tmp_reg);
 }
 
-/* This function is constructed identically to spe_and_uint() above.
+/**
+ * This function is constructed identically to spe_and_uint() above.
  * Changes to one should be made in the other.
  */
-void spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+void
+spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
 {
    /* If we can, emit a single instruction, either Exclusive Or Byte 
     * Immediate (which uses the same constant across each byte), Exclusive 
@@ -773,7 +779,7 @@ void spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int
     *
     * Otherwise, we'll need to use a temporary register.
     */
-   register unsigned int tmp;
+   unsigned int tmp;
 
    /* If the upper 23 bits are all 0s or all 1s, sign extension
     * will work and we can use Exclusive Or Word Immediate
-- 
cgit v1.2.3


From d48a92e88040470f93e2186f8eb23e4797a09860 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 8 Oct 2008 20:44:32 -0600
Subject: cell: implement function calls from shader code.  fslight demo runs
 now.

Used for SIN, COS, EXP2, LOG2, POW instructions.  TEX next.

Fixed some bugs in MIN, MAX, DP3, DP4, DPH instructions.

In rtasm code:
  Special-case spe_lqd(), spe_stqd() functions so they take byte offsets but
  low-order 4 bits are shifted out.  This makes things consistent with SPU
  assembly language conventions.
  Added spe_get_registers_used() function.
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c      |  76 ++++++++++--
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h      |  11 +-
 src/gallium/drivers/cell/ppu/cell_gen_fp.c       | 141 +++++++++++++++--------
 src/gallium/drivers/cell/ppu/cell_vertex_fetch.c |  30 ++---
 4 files changed, 182 insertions(+), 76 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index c442b1f6aa..9274bc5e3c 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -174,9 +174,12 @@ reg_name(int reg)
       return "$lr";
    default:
       {
-         static char buf[10];
-         sprintf(buf, "$%d", reg);
-         return buf;
+         /* cycle through four buffers to handle multiple calls per printf */
+         static char buf[4][10];
+         static int b = 0;
+         b = (b + 1) % 4;
+         sprintf(buf[b], "$%d", reg);
+         return buf[b];
       }
    }
 }
@@ -269,15 +272,8 @@ static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
     assert(p->num_inst <= p->max_inst);
     if (p->print) {
        indent(p);
-       if (strcmp(name, "spe_lqd") == 0 ||
-           strcmp(name, "spe_stqd") == 0) {
-          printf("%s\t%s, %d(%s)\n",
-                 rem_prefix(name), reg_name(rT), imm, reg_name(rA));
-       }
-       else {
-          printf("%s\t%s, %s, 0x%x\n",
-                 rem_prefix(name), reg_name(rT), reg_name(rA), imm);
-       }
+       printf("%s\t%s, %s, 0x%x\n",
+              rem_prefix(name), reg_name(rT), reg_name(rA), imm);
     }
 }
 
@@ -379,6 +375,7 @@ void _name (struct spe_function *p, int imm) \
 #include "rtasm_ppc_spe.h"
 
 
+
 /**
  * Initialize an spe_function.
  * \param code_size  size of instruction buffer to allocate, in bytes.
@@ -513,6 +510,20 @@ void spe_release_register_set(struct spe_function *p)
 }
 
 
+unsigned
+spe_get_registers_used(const struct spe_function *p, ubyte used[])
+{
+   unsigned count = 0;
+   unsigned reg;
+   /* consider only the register range that is available to callers */
+   for (reg = 2; reg < 80; reg++) {
+      if (p->regs[reg] != 0)
+         used[count++] = (ubyte) reg;
+   }
+   return count;
+}
+
+
 void
 spe_print_code(struct spe_function *p, boolean enable)
 {
@@ -539,6 +550,46 @@ spe_comment(struct spe_function *p, int rel_indent, const char *s)
 }
 
 
+/**
+ * Load quad word.
+ * NOTE: offset is in bytes and the least significant 4 bits must be zero!
+ */
+void spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
+{
+   const boolean pSave = p->print;
+
+   /* offset is shifted right by 4 below, so it must be a multiple of 16 */
+   assert(offset % 16 == 0);
+   p->print = FALSE;  /* mute emit_RI10(); the byte offset is printed below */
+   emit_RI10(p, 0x034, rT, rA, offset >> 4, "spe_lqd");
+   p->print = pSave;
+   if (p->print) {
+      indent(p);
+      printf("lqd\t%s, %d(%s)\n", reg_name(rT), offset, reg_name(rA));
+   }
+}
+
+
+/**
+ * Store quad word.
+ * NOTE: offset is in bytes and the least significant 4 bits must be zero!
+ */
+void spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
+{
+   const boolean pSave = p->print;
+
+   /* offset is shifted right by 4 below, so it must be a multiple of 16 */
+   assert(offset % 16 == 0);
+   p->print = FALSE;  /* mute emit_RI10(); the byte offset is printed below */
+   emit_RI10(p, 0x024, rT, rA, offset >> 4, "spe_stqd");
+   p->print = pSave;
+   if (p->print) {
+      indent(p);
+      printf("stqd\t%s, %d(%s)\n", reg_name(rT), offset, reg_name(rA));
+   }
+}
+
+
 /**
  * For branch instructions:
  * \param d  if 1, disable interupts if branch is taken
@@ -764,6 +815,7 @@ spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
    spe_release_register(p, tmp_reg);
 }
 
+
 /**
  * This function is constructed identically to spe_and_uint() above.
  * Changes to one should be made in the other.
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index cd2e245409..47dadb343c 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -89,6 +89,9 @@ extern void spe_release_register(struct spe_function *p, int reg);
 extern void spe_allocate_register_set(struct spe_function *p);
 extern void spe_release_register_set(struct spe_function *p);
 
+extern unsigned
+spe_get_registers_used(const struct spe_function *p, ubyte used[]);
+
 extern void spe_print_code(struct spe_function *p, boolean enable);
 extern void spe_indent(struct spe_function *p, int spaces);
 extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
@@ -128,11 +131,9 @@ extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
 
 /* Memory load / store instructions
  */
-EMIT_RI10(spe_lqd,  0x034);
 EMIT_RR  (spe_lqx,  0x1c4);
 EMIT_RI16(spe_lqa,  0x061);
 EMIT_RI16(spe_lqr,  0x067);
-EMIT_RI10(spe_stqd, 0x024);
 EMIT_RR  (spe_stqx, 0x144);
 EMIT_RI16(spe_stqa, 0x041);
 EMIT_RI16(spe_stqr, 0x047);
@@ -290,6 +291,12 @@ EMIT_RI16(spe_brz,       0x040);
 EMIT_RI16(spe_brhnz,     0x046);
 EMIT_RI16(spe_brhz,      0x044);
 
+extern void
+spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset);
+
+extern void
+spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset);
+
 extern void spe_bi(struct spe_function *p, unsigned rA, int d, int e);
 extern void spe_iret(struct spe_function *p, unsigned rA, int d, int e);
 extern void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA,
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 3065869d04..640ebcadbb 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -84,6 +84,8 @@ struct codegen
    /** Index of execution mask register */
    int exec_mask_reg;
 
+   int frame_size;  /**< Stack frame size, in bytes */
+
    struct spe_function *f;
    boolean error;
 };
@@ -208,7 +210,7 @@ get_src_reg(struct codegen *gen,
             reg = get_itemp(gen);
             reg_is_itemp = TRUE;
             /* Load:  reg = memory[(machine_reg) + offset] */
-            spe_lqd(gen->f, reg, gen->inputs_reg, offset);
+            spe_lqd(gen->f, reg, gen->inputs_reg, offset * 16);
          }
          break;
       case TGSI_FILE_IMMEDIATE:
@@ -221,7 +223,7 @@ get_src_reg(struct codegen *gen,
             reg = get_itemp(gen);
             reg_is_itemp = TRUE;
             /* Load:  reg = memory[(machine_reg) + offset] */
-            spe_lqd(gen->f, reg, gen->constants_reg, offset);
+            spe_lqd(gen->f, reg, gen->constants_reg, offset * 16);
          }
          break;
       default:
@@ -325,6 +327,7 @@ store_dest_reg(struct codegen *gen,
       }
       else {
          /* we're not inside a condition or loop: do nothing special */
+
       }
       break;
    case TGSI_FILE_OUTPUT:
@@ -337,17 +340,17 @@ store_dest_reg(struct codegen *gen,
             /* First read the current value from memory:
              * Load:  curval = memory[(machine_reg) + offset]
              */
-            spe_lqd(gen->f, curval_reg, gen->outputs_reg, offset);
+            spe_lqd(gen->f, curval_reg, gen->outputs_reg, offset * 16);
             /* Mix curval with newvalue according to exec mask:
              * d[i] = mask_reg[i] ? value_reg : d_reg
              */
             spe_selb(gen->f, curval_reg, curval_reg, value_reg, exec_reg);
             /* Store: memory[(machine_reg) + offset] = curval */
-            spe_stqd(gen->f, curval_reg, gen->outputs_reg, offset);
+            spe_stqd(gen->f, curval_reg, gen->outputs_reg, offset * 16);
          }
          else {
             /* Store: memory[(machine_reg) + offset] = reg */
-            spe_stqd(gen->f, value_reg, gen->outputs_reg, offset);
+            spe_stqd(gen->f, value_reg, gen->outputs_reg, offset * 16);
          }
       }
       break;
@@ -357,6 +360,41 @@ store_dest_reg(struct codegen *gen,
 }
 
 
+
+static void
+emit_prologue(struct codegen *gen)
+{
+   gen->frame_size = 256+128; /* XXX temporary; used below as a byte count */
+   /* Emit the frame-setup instructions; emit_epilogue() emits the teardown. */
+   spe_comment(gen->f, -4, "Function prologue:");
+
+   /* save $lr on stack     # stqd $lr,16($sp) */
+   spe_stqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
+
+   /* save stack pointer    # stqd $sp,-frameSize($sp) */
+   spe_stqd(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size);
+
+   /* adjust stack pointer  # ai $sp,$sp,-frameSize */
+   spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size);
+}
+
+
+static void
+emit_epilogue(struct codegen *gen)
+{
+   spe_comment(gen->f, -4, "Function epilogue:");
+   /* Reverses emit_prologue(): add frame_size back to $sp, reload $lr. */
+   /* restore stack pointer    # ai $sp,$sp,frameSize */
+   spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, gen->frame_size);
+
+   /* restore $lr from the slot the prologue stored it in  # lqd $lr,16($sp) */
+   spe_lqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
+
+   /* return from function call */
+   spe_bi(gen->f, SPE_REG_RA, 0, 0);
+}
+
+
 static boolean
 emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
@@ -588,6 +626,7 @@ emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst)
    int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
    int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
    int tmp_reg = get_itemp(gen);
+
    /* t = x0 * x1 */
    spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
 
@@ -603,7 +642,9 @@ emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst)
 
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         spe_move(gen->f, d_reg, tmp_reg);
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
       }
    }
 
@@ -623,6 +664,7 @@ emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst)
    int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
    int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
    int tmp_reg = get_itemp(gen);
+
    /* t = x0 * x1 */
    spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
 
@@ -643,6 +685,8 @@ emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst)
 
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         spe_move(gen->f, d_reg, tmp_reg);
          store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]);
       }
    }
@@ -683,6 +727,8 @@ emit_DPH(struct codegen *gen, const struct tgsi_full_instruction *inst)
 
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         spe_move(gen->f, d_reg, tmp_reg);
          store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]);
       }
    }
@@ -1112,9 +1158,6 @@ emit_function_call(struct codegen *gen,
    uint addr;
    int ch;
 
-   /* XXX temporary value */
-   const int frameSize = 64; /* stack frame (activation record) size */
-
    assert(num_args <= 3);
 
    /* lookup function address */
@@ -1136,48 +1179,45 @@ emit_function_call(struct codegen *gen,
 
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
-         int s_regs[3];
-         uint a;
+         int s_regs[3], d_reg;
+         ubyte usedRegs[SPE_NUM_REGS];
+         uint a, i, numUsed;
+
          for (a = 0; a < num_args; a++) {
             s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]);
          }
+         d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
 
-         /* Basically:
-          * save registers on stack
-          * move parameters to registers 3, 4, 5...
-          * call function
-          * save return value (reg 3)
-          * restore registers from stack
-          */
+         numUsed = spe_get_registers_used(gen->f, usedRegs);
+         assert(numUsed < gen->frame_size / 16 - 32);
 
-         /* XXX hack: load first function param */
-         spe_move(gen->f, 3, s_regs[0]);
-
-         /* save $lr on stack     # stqd $lr,16($sp) */
-         spe_stqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
-         /* save stack pointer    # stqd $sp,-frameSize($sp) */
-         spe_stqd(gen->f, SPE_REG_SP, SPE_REG_SP, -frameSize);
-
-         /* XXX save registers to stack here */
+         /* save registers to stack */
+         for (i = 0; i < numUsed; i++) {
+            uint reg = usedRegs[i];
+            int offset = 2 + i;
+            spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+         }
 
-         /* adjust stack pointer  # ai $sp,$sp,-frameSize */
-         spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, -frameSize);
+         /* setup function arguments */
+         for (a = 0; a < num_args; a++) {
+            spe_move(gen->f, 3 + a, s_regs[a]);
+         }
 
          /* branch to function, save return addr */
          spe_brasl(gen->f, SPE_REG_RA, addr);
 
-         /* restore stack pointer # ai $sp,$sp,frameSize */
-         spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, frameSize);
-
-         /* XXX restore registers from stack here */
-
-         /* restore $lr           # lqd $lr,16($sp) */
-         spe_lqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
-
-         /* XXX hack: save function's return value */
+         /* save function's return value */
          spe_move(gen->f, d_reg, 3);
 
+         /* restore registers from stack */
+         for (i = 0; i < numUsed; i++) {
+            uint reg = usedRegs[i];
+            if (reg != d_reg) {
+               int offset = 2 + i;
+               spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+            }
+         }
+
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
       }
@@ -1202,10 +1242,11 @@ emit_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst)
          int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
          int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
          int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         int tmp_reg = get_itemp(gen);
 
          /* d = (s1 > s2) ? s1 : s2 */
-         spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
-         spe_selb(gen->f, d_reg, s2_reg, s1_reg, d_reg);
+         spe_fcgt(gen->f, tmp_reg, s1_reg, s2_reg);
+         spe_selb(gen->f, d_reg, s2_reg, s1_reg, tmp_reg);
 
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
@@ -1230,10 +1271,11 @@ emit_MIN(struct codegen *gen, const struct tgsi_full_instruction *inst)
          int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
          int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
          int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         int tmp_reg = get_itemp(gen);
 
          /* d = (s2 > s1) ? s1 : s2 */
-         spe_fcgt(gen->f, d_reg, s2_reg, s1_reg);
-         spe_selb(gen->f, d_reg, s2_reg, s1_reg, d_reg);
+         spe_fcgt(gen->f, tmp_reg, s2_reg, s1_reg);
+         spe_selb(gen->f, d_reg, s2_reg, s1_reg, tmp_reg);
 
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
@@ -1346,8 +1388,7 @@ static boolean
 emit_END(struct codegen *gen)
 {
    spe_comment(gen->f, -4, "END:");
-   /* return from function call */
-   spe_bi(gen->f, SPE_REG_RA, 0, 0);
+   emit_epilogue(gen);
    return true;
 }
 
@@ -1420,6 +1461,10 @@ emit_instruction(struct codegen *gen,
       return emit_function_call(gen, inst, "spu_sin", 1);
    case TGSI_OPCODE_POW:
       return emit_function_call(gen, inst, "spu_pow", 2);
+   case TGSI_OPCODE_EXPBASE2:
+      return emit_function_call(gen, inst, "spu_exp2", 1);
+   case TGSI_OPCODE_LOGBASE2:
+      return emit_function_call(gen, inst, "spu_log2", 1);
 
    case TGSI_OPCODE_IF:
       return emit_IF(gen, inst);
@@ -1532,6 +1577,7 @@ emit_declaration(struct cell_context *cell,
 }
 
 
+
 /**
  * Translate TGSI shader code to SPE instructions.  This is done when
  * the state tracker gives us a new shader (via pipe->create_fs_state()).
@@ -1571,12 +1617,14 @@ cell_gen_fragment_program(struct cell_context *cell,
 
    tgsi_parse_init(&parse, tokens);
 
+   emit_prologue(&gen);
+
    while (!tgsi_parse_end_of_tokens(&parse) && !gen.error) {
       tgsi_parse_token(&parse);
 
       switch (parse.FullToken.Token.Type) {
       case TGSI_TOKEN_TYPE_IMMEDIATE:
-         if (!emit_immediate(&gen,  &parse.FullToken.FullImmediate))
+         if (!emit_immediate(&gen, &parse.FullToken.FullImmediate))
             gen.error = true;
          break;
 
@@ -1595,7 +1643,6 @@ cell_gen_fragment_program(struct cell_context *cell,
       }
    }
 
-
    if (gen.error) {
       /* terminate the SPE code */
       return emit_END(&gen);
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
index 566df7f59e..18969005b0 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
@@ -73,8 +73,8 @@ emit_matrix_transpose(struct spe_function *p,
    int col3;
 
 
-   spe_lqd(p, shuf_hi, shuf_ptr, 3);
-   spe_lqd(p, shuf_lo, shuf_ptr, 4);
+   spe_lqd(p, shuf_hi, shuf_ptr, 3*16);
+   spe_lqd(p, shuf_lo, shuf_ptr, 4*16);
    spe_shufb(p, t1, row0, row2, shuf_hi);
    spe_shufb(p, t2, row0, row2, shuf_lo);
 
@@ -122,13 +122,13 @@ emit_matrix_transpose(struct spe_function *p,
     */
    switch (count) {
    case 4:
-      spe_stqd(p, col3, dest_ptr, 3);
+      spe_stqd(p, col3, dest_ptr, 3 * 16);
    case 3:
-      spe_stqd(p, col2, dest_ptr, 2);
+      spe_stqd(p, col2, dest_ptr, 2 * 16);
    case 2:
-      spe_stqd(p, col1, dest_ptr, 1);
+      spe_stqd(p, col1, dest_ptr, 1 * 16);
    case 1:
-      spe_stqd(p, col0, dest_ptr, 0);
+      spe_stqd(p, col0, dest_ptr, 0 * 16);
    }
 
 
@@ -166,17 +166,17 @@ emit_fetch(struct spe_function *p,
    float scale_signed = 0.0;
    float scale_unsigned = 0.0;
 
-   spe_lqd(p, v0, in_ptr, 0 + offset[0]);
-   spe_lqd(p, v1, in_ptr, 1 + offset[0]);
-   spe_lqd(p, v2, in_ptr, 2 + offset[0]);
-   spe_lqd(p, v3, in_ptr, 3 + offset[0]);
+   spe_lqd(p, v0, in_ptr, (0 + offset[0]) * 16);
+   spe_lqd(p, v1, in_ptr, (1 + offset[0]) * 16);
+   spe_lqd(p, v2, in_ptr, (2 + offset[0]) * 16);
+   spe_lqd(p, v3, in_ptr, (3 + offset[0]) * 16);
    offset[0] += 4;
    
    switch (bytes) {
    case 1:
       scale_signed = 1.0f / 127.0f;
       scale_unsigned = 1.0f / 255.0f;
-      spe_lqd(p, tmp, shuf_ptr, 1);
+      spe_lqd(p, tmp, shuf_ptr, 1 * 16);
       spe_shufb(p, v0, v0, v0, tmp);
       spe_shufb(p, v1, v1, v1, tmp);
       spe_shufb(p, v2, v2, v2, tmp);
@@ -185,7 +185,7 @@ emit_fetch(struct spe_function *p,
    case 2:
       scale_signed = 1.0f / 32767.0f;
       scale_unsigned = 1.0f / 65535.0f;
-      spe_lqd(p, tmp, shuf_ptr, 2);
+      spe_lqd(p, tmp, shuf_ptr, 2 * 16);
       spe_shufb(p, v0, v0, v0, tmp);
       spe_shufb(p, v1, v1, v1, tmp);
       spe_shufb(p, v2, v2, v2, tmp);
@@ -241,11 +241,11 @@ emit_fetch(struct spe_function *p,
 
    switch (count) {
    case 1:
-      spe_stqd(p, float_zero, out_ptr, 1);
+      spe_stqd(p, float_zero, out_ptr, 1 * 16);
    case 2:
-      spe_stqd(p, float_zero, out_ptr, 2);
+      spe_stqd(p, float_zero, out_ptr, 2 * 16);
    case 3:
-      spe_stqd(p, float_one, out_ptr, 3);
+      spe_stqd(p, float_one, out_ptr, 3 * 16);
    }
 
    if (float_zero != -1) {
-- 
cgit v1.2.3


From 7ac1fc77661faf0897507fef0437fe69d0ba53ac Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Thu, 9 Oct 2008 19:54:46 -0600
Subject: cell: fix incorrect bitmask in spe_load_uint()

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 9274bc5e3c..cc35f0ba5b 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -727,7 +727,7 @@ void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
     * Bytes Immediate (fsmbi) to load the value in a single instruction.
     * Otherwise, in the general case, we have to use ilhu followed by iohl.
     */
-   if ((ui & 0xfffc0000) == ui) {
+   if ((ui & 0x3ffff) == ui) {
       spe_ila(p, rT, ui);
    }
    else if ((ui >> 16) == (ui & 0xffff)) {
-- 
cgit v1.2.3


From adeed0f90fdd46ea139d5c4b3b75d5dc79b2a0c7 Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Fri, 10 Oct 2008 14:13:13 -0600
Subject: CELL: fixing stencil bugs

These are the defects found and fixed so far.  Several more have
been observed; I'm working on them.

- Fixed an error in spe_load_uint() that caused incorrect values to be
  loaded if the given unsigned value had the low 18 bits as 0,
  and that caused inefficient code to be emitted if the given value
  had the high 14 bits as 0.

- Fixed a problem in stencil code generation where optional registers
  weren't tracked correctly.

- Fixed a problem that the stencil function NEVER was acting as ALWAYS.

- Fixed several problems that could occur if stenciling were enabled but
  depth was disabled.

- Fixed a problem with two-sided stencil writemask handling that could
  cause a stencil writemask to not be applied.

- Fixed several state permutations that were incorrectly flagged as
  not requiring stencil values to be calculated.
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c      |  4 +-
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c | 88 +++++++++++++++++++-----
 2 files changed, 72 insertions(+), 20 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index cc35f0ba5b..9bf3b9bf0c 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -727,7 +727,7 @@ void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
     * Bytes Immediate (fsmbi) to load the value in a single instruction.
     * Otherwise, in the general case, we have to use ilhu followed by iohl.
     */
-   if ((ui & 0x3ffff) == ui) {
+   if ((ui & 0x0003ffff) == ui) {
       spe_ila(p, rT, ui);
    }
    else if ((ui >> 16) == (ui & 0xffff)) {
@@ -764,7 +764,7 @@ void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
 }
 
 /**
- * This function is constructed identically to spe_sor_uint() below.
+ * This function is constructed identically to spe_xor_uint() below.
  * Changes to one should be made in the other.
  */
 void
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index de170d1036..4e1e53ecdc 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -247,6 +247,7 @@ setup_optional_register(struct spe_function *f, boolean *is_already_set, unsigne
 {
    if (*is_already_set) return;
    *r = spe_allocate_available_register(f);
+   *is_already_set = true;
 }
 
 static inline void
@@ -1157,7 +1158,6 @@ gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state,
       /* stencil_pass = mask & (s == reference) */
       spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
       spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
-      /* stencil_fail = mask & ~stencil_pass */
       break;
 
    case PIPE_FUNC_NOTEQUAL:
@@ -1207,7 +1207,6 @@ gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state,
    case PIPE_FUNC_NEVER:
       /* stencil_pass = mask & 0 = 0 */
       spe_load_uint(f, stencil_pass_reg, 0);
-      spe_move(f, stencil_pass_reg, mask_reg);  /* zmask = mask */
       break;
 
    case PIPE_FUNC_ALWAYS:
@@ -1483,6 +1482,10 @@ gen_get_stencil_values(struct spe_function *f, const struct pipe_depth_stencil_a
    } /* End of calculations for back-facing stencil */
 }
 
+/* Note that fbZ_reg may *not* be set on entry, if in fact
+ * the depth test is not enabled.  This function must not use
+ * the register if depth is not enabled.
+ */
 static boolean
 gen_stencil_depth_test(struct spe_function *f, 
                        const struct pipe_depth_stencil_alpha_state *dsa, 
@@ -1522,6 +1525,7 @@ gen_stencil_depth_test(struct spe_function *f,
     * have to spend the complexity to track the more difficult variant
     * register usage scenarios.
     */
+   spe_comment(f, 0, "Allocating stencil register set");
    spe_allocate_register_set(f);
 
    /* Calculate the writemask.  If the writemask is trivial (either
@@ -1538,7 +1542,7 @@ gen_stencil_depth_test(struct spe_function *f,
       need_to_calculate_stencil_values = false;
       need_to_writemask_stencil_values = false;
    }
-   else if (dsa->stencil[0].write_mask == 0xff && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0x00)) {
+   else if (dsa->stencil[0].write_mask == 0xff && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0xff)) {
       /* Still trivial, but a little less so.  We need to write the stencil
        * values, but we don't need to mask them.
        */
@@ -1556,10 +1560,12 @@ gen_stencil_depth_test(struct spe_function *f,
        * writemask, we'll have to generate code that merges the
        * two masks into a single effective mask based on fragment facing.
        */
+      spe_comment(f, 0, "Computing stencil writemask");
       stencil_writemask_reg = spe_allocate_available_register(f);
       spe_load_uint(f, stencil_writemask_reg, dsa->stencil[0].write_mask);
       if (dsa->stencil[1].enabled && dsa->stencil[0].write_mask != dsa->stencil[1].write_mask) {
          unsigned int back_write_mask_reg = spe_allocate_available_register(f);
+         spe_comment(f, 0, "Resolving two-sided stencil writemask");
          spe_load_uint(f, back_write_mask_reg, dsa->stencil[1].write_mask);
          spe_selb(f, stencil_writemask_reg, stencil_writemask_reg, back_write_mask_reg, facing_reg);
          spe_release_register(f, back_write_mask_reg);
@@ -1575,6 +1581,7 @@ gen_stencil_depth_test(struct spe_function *f,
     * This test will *not* change the value in mask_reg (because we don't
     * yet know whether to apply the two-sided stencil or one-sided stencil).
     */
+   spe_comment(f, 0, "Running basic stencil test");
    stencil_pass_reg = spe_allocate_available_register(f);
    gen_stencil_test(f, &dsa->stencil[0], mask_reg, fbS_reg, stencil_pass_reg);
 
@@ -1584,6 +1591,7 @@ gen_stencil_depth_test(struct spe_function *f,
     */
    if (dsa->stencil[1].enabled) {
       unsigned int temp_reg = spe_allocate_available_register(f);
+      spe_comment(f, 0, "Running backface stencil test");
       gen_stencil_test(f, &dsa->stencil[1], mask_reg, fbS_reg, temp_reg);
       spe_selb(f, stencil_pass_reg, stencil_pass_reg, temp_reg, facing_reg);
       spe_release_register(f, temp_reg);
@@ -1597,6 +1605,7 @@ gen_stencil_depth_test(struct spe_function *f,
     * stencil test, and because the depth test will update the 
     * mask of valid fragments based on the results of the depth test).
     */
+   spe_comment(f, 0, "Computing stencil fail mask and updating fragment mask");
    stencil_fail_reg = spe_allocate_available_register(f);
    spe_andc(f, stencil_fail_reg, mask_reg, stencil_pass_reg);
    /* Now remove the stenciled-out pixels from the valid fragment mask,
@@ -1623,6 +1632,7 @@ gen_stencil_depth_test(struct spe_function *f,
        * This function will allocate a variant number of registers that
        * will be released as part of the register set.
        */
+      spe_comment(f, 0, "Computing stencil values");
       gen_get_stencil_values(f, dsa, fbS_reg, 
          &front_stencil_fail_values, &front_stencil_pass_depth_fail_values, 
          &front_stencil_pass_depth_pass_values, &back_stencil_fail_values, 
@@ -1652,6 +1662,7 @@ gen_stencil_depth_test(struct spe_function *f,
          stencil_pass_depth_pass_values = front_stencil_pass_depth_pass_values;
       }
       else { /* two-sided stencil enabled */
+         spe_comment(f, 0, "Resolving backface stencil values");
          /* Allocate new registers for the needed merged values */
          stencil_fail_values = spe_allocate_available_register(f);
          spe_selb(f, stencil_fail_values, front_stencil_fail_values, back_stencil_fail_values, facing_reg);
@@ -1678,11 +1689,13 @@ gen_stencil_depth_test(struct spe_function *f,
     * on the results of the test.
     */
    if (dsa->depth.enabled) {
+      spe_comment(f, 0, "Running stencil depth test");
       zmask_reg = spe_allocate_available_register(f);
       modified_buffers |= gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
    }
 
    if (need_to_calculate_stencil_values) {
+
       /* If we need to writemask the stencil values before going into
        * the stencil buffer, we'll have to use a new register to
        * hold the new values.  If not, we can just keep using the
@@ -1690,8 +1703,8 @@ gen_stencil_depth_test(struct spe_function *f,
        */
       if (need_to_writemask_stencil_values) {
          newS_reg = spe_allocate_available_register(f);
+         spe_comment(f, 0, "Saving current stencil values for writemasking");
          spe_move(f, newS_reg, fbS_reg);
-         modified_buffers = true;
       }
       else {
          newS_reg = fbS_reg;
@@ -1699,7 +1712,9 @@ gen_stencil_depth_test(struct spe_function *f,
 
       /* Merge in the selected stencil fail values */
       if (stencil_fail_values != fbS_reg) {
+         spe_comment(f, 0, "Loading stencil fail values");
          spe_selb(f, newS_reg, newS_reg, stencil_fail_values, stencil_fail_reg);
+         modified_buffers = true;
       }
 
       /* Same for the stencil pass/depth fail values.  If this calculation
@@ -1714,20 +1729,36 @@ gen_stencil_depth_test(struct spe_function *f,
           * set above if we're here.
           */
          unsigned int stencil_pass_depth_fail_mask = spe_allocate_available_register(f);
+         spe_comment(f, 0, "Loading stencil pass/depth fail values");
          spe_andc(f, stencil_pass_depth_fail_mask, stencil_pass_reg, zmask_reg);
 
          spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_fail_values, stencil_pass_depth_fail_mask);
 
          spe_release_register(f, stencil_pass_depth_fail_mask);
+         modified_buffers = true;
       }
 
-      /* Same for the stencil pass/depth pass mask */
+      /* Same for the stencil pass/depth pass mask.  Note that we
+       * *can* get here with zmask_reg being unset (if the depth
+       * test is off but the stencil test is on).  In this case,
+       * we assume the depth test passes, and don't need to mask
+       * the stencil pass mask with the Z mask.
+       */
       if (stencil_pass_depth_pass_values != fbS_reg) {
-         unsigned int stencil_pass_depth_pass_mask = spe_allocate_available_register(f);
-         spe_and(f, stencil_pass_depth_pass_mask, stencil_pass_reg, zmask_reg);
-
-         spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_depth_pass_mask);
-         spe_release_register(f, stencil_pass_depth_pass_mask);
+         if (dsa->depth.enabled) {
+            unsigned int stencil_pass_depth_pass_mask = spe_allocate_available_register(f);
+            /* We'll need a separate register */
+            spe_comment(f, 0, "Loading stencil pass/depth pass values");
+            spe_and(f, stencil_pass_depth_pass_mask, stencil_pass_reg, zmask_reg);
+            spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_depth_pass_mask);
+            spe_release_register(f, stencil_pass_depth_pass_mask);
+         }
+         else {
+            /* We can use the same stencil-pass register */
+            spe_comment(f, 0, "Loading stencil pass values");
+            spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_reg);
+         }
+         modified_buffers = true;
       }
 
       /* Almost done.  If we need to writemask, do it now, leaving the
@@ -1736,14 +1767,16 @@ gen_stencil_depth_test(struct spe_function *f,
        * so there's nothing more to do.
        */
 
-      if (need_to_writemask_stencil_values) {
+      if (need_to_writemask_stencil_values && modified_buffers) {
          /* The Select Bytes command makes a fine writemask.  Where
           * the mask is 0, the first (original) values are retained,
           * effectively masking out changes.  Where the mask is 1, the
           * second (new) values are retained, incorporating changes.
           */
+         spe_comment(f, 0, "Writemasking new stencil values");
          spe_selb(f, fbS_reg, fbS_reg, newS_reg, stencil_writemask_reg);
       }
+
    } /* done calculating stencil values */
 
    /* The stencil and/or depth values have been applied, and the
@@ -1752,6 +1785,7 @@ gen_stencil_depth_test(struct spe_function *f,
     * of registers that we didn't bother tracking.  Release all
     * those registers as part of the register set, and go home.
     */
+   spe_comment(f, 0, "Releasing stencil register set");
    spe_release_register_set(f);
 
    /* Return true if we could have modified the stencil and/or
@@ -1869,7 +1903,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       boolean fbS_reg_set = false, fbZ_reg_set = false;
       unsigned int fbS_reg, fbZ_reg = 0;
 
-      spe_comment(f, 0, "Fetch quad's Z/stencil values from tile");
+      spe_comment(f, 0, "Fetching Z/stencil quad from tile");
 
       /* fetch quad of depth/stencil values from tile at (x,y) */
       /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
@@ -1973,13 +2007,18 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
           * tests.
           */
          ASSERT(fbS_reg_set);
-         ASSERT(fbZ_reg_set);
          spe_comment(f, 0, "Perform stencil test");
 
+         /* Note that fbZ_reg may not be set on entry, if stenciling
+          * is enabled but there's no Z-buffer.  The 
+          * gen_stencil_depth_test() function must ignore the
+          * fbZ_reg register if depth is not enabled.
+          */
          write_depth_stencil = gen_stencil_depth_test(f, dsa, facing_reg, mask_reg, fragZ_reg, fbZ_reg, fbS_reg);
       }
       else if (dsa->depth.enabled) {
          int zmask_reg = spe_allocate_available_register(f);
+         ASSERT(fbZ_reg_set);
          spe_comment(f, 0, "Perform depth test");
          write_depth_stencil = gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
          spe_release_register(f, zmask_reg);
@@ -1996,26 +2035,39 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
          spe_comment(f, 0, "Store quad's depth/stencil values in tile");
          if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
              zs_format == PIPE_FORMAT_X8Z24_UNORM) {
-            if (fbS_reg_set) {
+            if (fbS_reg_set && fbZ_reg_set) {
                spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
                spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
             }
+            else if (fbS_reg_set) {
+               spe_shli(f, fbZS_reg, fbS_reg, 24); /* fbZS = fbS << 24 */
+            }
             else {
                spe_move(f, fbZS_reg, fbZ_reg);
             }
          }
          else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
                   zs_format == PIPE_FORMAT_Z24X8_UNORM) {
-            spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
-            if (fbS_reg_set) {
+            if (fbS_reg_set && fbZ_reg_set) {
+               spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
                spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
             }
+            else if (fbS_reg_set) {
+               spe_move(f, fbZS_reg, fbS_reg);
+            }
+            else {
+               spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
+            }
          }
          else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
-            spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
+            if (fbZ_reg_set) {
+               spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
+            }
          }
          else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
-            spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
+            if (fbZ_reg_set) {
+               spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
+            }
          }
          else if (zs_format == PIPE_FORMAT_S8_UNORM) {
             ASSERT(0);   /* XXX to do */
-- 
cgit v1.2.3


From 78c67a726fff052abeb03417283504a5dd521665 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 10 Oct 2008 14:35:56 -0600
Subject: cell: fix assertions in spe_lqd(), spe_stqd()

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 9bf3b9bf0c..5b0f6bdd48 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -559,7 +559,7 @@ void spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
    const boolean pSave = p->print;
 
    p->print = FALSE;
-   assert(offset % 4 == 0);
+   assert(offset % 16 == 0);
    emit_RI10(p, 0x034, rT, rA, offset >> 4, "spe_lqd");
    p->print = pSave;
 
@@ -579,7 +579,7 @@ void spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
    const boolean pSave = p->print;
 
    p->print = FALSE;
-   assert(offset % 4 == 0);
+   assert(offset % 16 == 0);
    emit_RI10(p, 0x024, rT, rA, offset >> 4, "spe_stqd");
    p->print = pSave;
 
-- 
cgit v1.2.3


From f42ef6f39d213b4c6315ba95791c16ca2b1a4b21 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 10 Oct 2008 14:44:52 -0600
Subject: cell: additional 'offset' checking in spe_lqd(), spe_stqd()

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 5b0f6bdd48..d0bacd08a6 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -552,14 +552,19 @@ spe_comment(struct spe_function *p, int rel_indent, const char *s)
 
 /**
  * Load quad word.
- * NOTE: imm is in bytes and the least significant 4 bits must be zero!
+ * NOTE: offset is in bytes and the least significant 4 bits must be zero!
  */
 void spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
 {
    const boolean pSave = p->print;
 
-   p->print = FALSE;
+   /* offset must be a multiple of 16 */
    assert(offset % 16 == 0);
+   /* offset must fit in 10-bit signed int field, after shifting */
+   assert((offset >> 4) <= 511);
+   assert((offset >> 4) >= -512);
+
+   p->print = FALSE;
    emit_RI10(p, 0x034, rT, rA, offset >> 4, "spe_lqd");
    p->print = pSave;
 
@@ -572,14 +577,19 @@ void spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
 
 /**
  * Store quad word.
- * NOTE: imm is in bytes and the least significant 4 bits must be zero!
+ * NOTE: offset is in bytes and the least significant 4 bits must be zero!
  */
 void spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
 {
    const boolean pSave = p->print;
 
-   p->print = FALSE;
+   /* offset must be a multiple of 16 */
    assert(offset % 16 == 0);
+   /* offset must fit in 10-bit signed int field, after shifting */
+   assert((offset >> 4) <= 511);
+   assert((offset >> 4) >= -512);
+
+   p->print = FALSE;
    emit_RI10(p, 0x024, rT, rA, offset >> 4, "spe_stqd");
    p->print = pSave;
 
-- 
cgit v1.2.3


From d3403b5482ee1c0faa0f42b8782ee3093a2f7b5e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Fri, 10 Oct 2008 14:57:57 -0600
Subject: cell: add emit_RI10s() which does range checking on the 10-bit signed
 immediate field

This type of checking should be expanded to cover more instructions...
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 16 ++++++++++++++++
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h | 24 ++++++++++++++----------
 2 files changed, 30 insertions(+), 10 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index d0bacd08a6..dea1aed032 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -278,6 +278,16 @@ static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
 }
 
 
+/** Same as emit_RI10(), but first asserts that imm lies in the signed 10-bit range [-512, 511]. */
+static void emit_RI10s(struct spe_function *p, unsigned op, unsigned rT,
+                       unsigned rA, int imm, const char *name)
+{
+    assert(imm <= 511);   /* upper bound of the checked range */
+    assert(imm >= -512);  /* lower bound of the checked range */
+    emit_RI10(p, op, rT, rA, imm, name);   /* the actual encoding is delegated unchanged */
+}
+
+
 static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
 		      int imm, const char *name)
 {
@@ -354,6 +364,12 @@ void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
    emit_RI10(p, _op, rT, rA, imm, __FUNCTION__);             \
 }
 
+#define EMIT_RI10s(_name, _op) \
+void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
+{ \
+   emit_RI10s(p, _op, rT, rA, imm, __FUNCTION__);             \
+}
+
 #define EMIT_RI16(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, int imm) \
 { \
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 47dadb343c..d6a3c02f20 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -119,6 +119,9 @@ extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
 #define EMIT_RI10(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
 			   int imm)
+#define EMIT_RI10s(_name, _op) \
+    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
+			   int imm)
 #define EMIT_RI16(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT, int imm)
 #define EMIT_RI18(_name, _op) \
@@ -163,7 +166,7 @@ EMIT_RI16(spe_fsmbi, 0x065);
 EMIT_RR  (spe_ah,      0x0c8);
 EMIT_RI10(spe_ahi,     0x01d);
 EMIT_RR  (spe_a,       0x0c0);
-EMIT_RI10(spe_ai,      0x01c);
+EMIT_RI10s(spe_ai,      0x01c);
 EMIT_RR  (spe_sfh,     0x048);
 EMIT_RI10(spe_sfhi,    0x00d);
 EMIT_RR  (spe_sf,      0x040);
@@ -201,19 +204,19 @@ EMIT_R   (spe_xshw,    0x2ae);
 EMIT_R   (spe_xswd,    0x2a6);
 EMIT_RR  (spe_and,     0x0c1);
 EMIT_RR  (spe_andc,    0x2c1);
-EMIT_RI10(spe_andbi,   0x016);
-EMIT_RI10(spe_andhi,   0x015);
-EMIT_RI10(spe_andi,    0x014);
+EMIT_RI10s(spe_andbi,   0x016);
+EMIT_RI10s(spe_andhi,   0x015);
+EMIT_RI10s(spe_andi,    0x014);
 EMIT_RR  (spe_or,      0x041);
 EMIT_RR  (spe_orc,     0x2c9);
-EMIT_RI10(spe_orbi,    0x006);
-EMIT_RI10(spe_orhi,    0x005);
-EMIT_RI10(spe_ori,     0x004);
+EMIT_RI10s(spe_orbi,    0x006);
+EMIT_RI10s(spe_orhi,    0x005);
+EMIT_RI10s(spe_ori,     0x004);
 EMIT_R   (spe_orx,     0x1f0);
 EMIT_RR  (spe_xor,     0x241);
-EMIT_RI10(spe_xorbi,   0x026);
-EMIT_RI10(spe_xorhi,   0x025);
-EMIT_RI10(spe_xori,    0x024);
+EMIT_RI10s(spe_xorbi,   0x026);
+EMIT_RI10s(spe_xorhi,   0x025);
+EMIT_RI10s(spe_xori,    0x024);
 EMIT_RR  (spe_nand,    0x0c9);
 EMIT_RR  (spe_nor,     0x049);
 EMIT_RR  (spe_eqv,     0x249);
@@ -422,6 +425,7 @@ EMIT_R   (spe_wrch,       0x10d);
 #undef EMIT_RI7
 #undef EMIT_RI8
 #undef EMIT_RI10
+#undef EMIT_RI10s
 #undef EMIT_RI16
 #undef EMIT_RI18
 #undef EMIT_I16
-- 
cgit v1.2.3


From e0c6653a5fda956119239ef921daf1e3b950dfc8 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 10:35:38 -0600
Subject: cell: implement many more PPC instructions for code gen

---
 src/gallium/auxiliary/rtasm/Makefile    |   1 +
 src/gallium/auxiliary/rtasm/rtasm_ppc.c | 603 ++++++++++++++++++++++++++++++--
 src/gallium/auxiliary/rtasm/rtasm_ppc.h | 141 +++++++-
 3 files changed, 704 insertions(+), 41 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/Makefile b/src/gallium/auxiliary/rtasm/Makefile
index 39b8a4dbd7..252dc5274a 100644
--- a/src/gallium/auxiliary/rtasm/Makefile
+++ b/src/gallium/auxiliary/rtasm/Makefile
@@ -7,6 +7,7 @@ C_SOURCES = \
 	rtasm_cpu.c \
 	rtasm_execmem.c \
 	rtasm_x86sse.c \
+	rtasm_ppc.c \
 	rtasm_ppc_spe.c
 
 include ../../Makefile.template
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index 534a23568d..4a94ed0460 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -23,10 +23,19 @@
 
 /**
  * PPC code generation.
+ * For reference, see http://www.power.org/resources/reading/PowerISA_V2.05.pdf
+ * ABI info: http://www.cs.utsa.edu/~whaley/teach/cs6463FHPO/LEC/lec12_ho.pdf
+ *
+ * Other PPC refs:
+ * http://www-01.ibm.com/chips/techlib/techlib.nsf/techdocs/852569B20050FF778525699600719DF2
+ * http://www.ibm.com/developerworks/eserver/library/es-archguide-v2.html
+ * http://www.freescale.com/files/product/doc/MPCFPE32B.pdf
+ *
  * \author Brian Paul
  */
 
 
+#include <stdio.h>
 #include "util/u_memory.h"
 #include "pipe/p_debug.h"
 #include "rtasm_ppc.h"
@@ -35,30 +44,125 @@
 void
 ppc_init_func(struct ppc_function *p, unsigned max_inst)
 {
-    p->store = align_malloc(max_inst * PPC_INST_SIZE, 16);
-    p->num_inst = 0;
-    p->max_inst = max_inst;
-    p->vec_used = ~0;
+   uint i;
+
+   p->store = align_malloc(max_inst * PPC_INST_SIZE, 16);
+   p->num_inst = 0;
+   p->max_inst = max_inst;
+   p->fp_used = ~0x0;
+   p->vec_used = ~0x0;
+
+   /* only allow using gp registers 7..12 for now */
+   p->reg_used = 0x0;
+   for (i = 7; i < 13; i++)
+      p->reg_used |= (1 << i);
 }
 
 
 void
 ppc_release_func(struct ppc_function *p)
 {
-    assert(p->num_inst <= p->max_inst);
-    if (p->store != NULL) {
-        align_free(p->store);
-    }
-    p->store = NULL;
+   assert(p->num_inst <= p->max_inst);
+   if (p->store != NULL) {
+      align_free(p->store);
+   }
+   p->store = NULL;
+}
+
+
+void (*ppc_get_func(struct ppc_function *p))(void)
+{
+#if 0
+   DUMP_END();
+   if (DISASSEM && p->store)
+      debug_printf("disassemble %p %p\n", p->store, p->csr);
+
+   if (p->store == p->error_overflow)
+      return (void (*)(void)) NULL;
+   else
+#endif
+      return (void (*)(void)) p->store;
+}
+
+
+void
+ppc_dump_func(const struct ppc_function *p)
+{
+   uint i;
+   for (i = 0; i < p->num_inst; i++) {
+      debug_printf("%3u: 0x%08x\n", i, p->store[i]);
+   }
+}
+
+
+/**
+ * Allocate a general purpose register.
+ * \return register index or -1 if none left.
+ */
+int
+ppc_allocate_register(struct ppc_function *p)
+{
+   unsigned i;
+   for (i = 0; i < PPC_NUM_REGS; i++) {
+      const uint64_t mask = 1 << i;
+      if ((p->reg_used & mask) != 0) {
+         p->reg_used &= ~mask;
+         return i;
+      }
+   }
+   return -1;
 }
 
 
 /**
- * Alloate a vector register.
+ * Mark the given general purpose register as "unallocated".
+ */
+void
+ppc_release_register(struct ppc_function *p, int reg)
+{
+   assert(reg < PPC_NUM_REGS);
+   assert((p->reg_used & (1 << reg)) == 0);
+   p->reg_used |= (1 << reg);
+}
+
+
+/**
+ * Allocate a floating point register.
  * \return register index or -1 if none left.
  */
 int
-ppc_allocate_vec_register(struct ppc_function *p, int reg)
+ppc_allocate_fp_register(struct ppc_function *p)
+{
+   unsigned i;
+   for (i = 0; i < PPC_NUM_FP_REGS; i++) {
+      const uint64_t mask = 1 << i;
+      if ((p->fp_used & mask) != 0) {
+         p->fp_used &= ~mask;
+         return i;
+      }
+   }
+   return -1;
+}
+
+
+/**
+ * Mark the given floating point register as "unallocated".
+ */
+void
+ppc_release_fp_register(struct ppc_function *p, int reg)
+{
+   assert(reg < PPC_NUM_FP_REGS);
+   assert((p->fp_used & (1 << reg)) == 0);
+   p->fp_used |= (1 << reg);
+}
+
+
+/**
+ * Allocate a vector register.
+ * \return register index or -1 if none left.
+ */
+int
+ppc_allocate_vec_register(struct ppc_function *p)
 {
    unsigned i;
    for (i = 0; i < PPC_NUM_VEC_REGS; i++) {
@@ -68,7 +172,6 @@ ppc_allocate_vec_register(struct ppc_function *p, int reg)
          return i;
       }
    }
-
    return -1;
 }
 
@@ -81,7 +184,6 @@ ppc_release_vec_register(struct ppc_function *p, int reg)
 {
    assert(reg < PPC_NUM_VEC_REGS);
    assert((p->vec_used & (1 << reg)) == 0);
-
    p->vec_used |= (1 << reg);
 }
 
@@ -98,6 +200,20 @@ union vx_inst {
    } inst;
 };
 
+static inline void
+emit_vx(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
+{
+   union vx_inst inst;
+   inst.inst.op = 4;
+   inst.inst.vD = vD;
+   inst.inst.vA = vA;
+   inst.inst.vB = vB;
+   inst.inst.op2 = op2;
+   p->store[p->num_inst++] = inst.bits;
+   assert(p->num_inst <= p->max_inst);
+};
+
+
 union vxr_inst {
    uint32_t bits;
    struct {
@@ -110,6 +226,21 @@ union vxr_inst {
    } inst;
 };
 
+static inline void
+emit_vxr(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
+{
+   union vxr_inst inst;
+   inst.inst.op = 4;
+   inst.inst.vD = vD;
+   inst.inst.vA = vA;
+   inst.inst.vB = vB;
+   inst.inst.rC = 0;
+   inst.inst.op2 = op2;
+   p->store[p->num_inst++] = inst.bits;
+   assert(p->num_inst <= p->max_inst);
+};
+
+
 union va_inst {
    uint32_t bits;
    struct {
@@ -122,49 +253,204 @@ union va_inst {
    } inst;
 };
 
-
 static inline void
-emit_vx(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
+emit_va(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB, uint vC)
 {
-   union vx_inst inst;
+   union va_inst inst;
    inst.inst.op = 4;
    inst.inst.vD = vD;
    inst.inst.vA = vA;
    inst.inst.vB = vB;
+   inst.inst.vC = vC;
    inst.inst.op2 = op2;
    p->store[p->num_inst++] = inst.bits;
    assert(p->num_inst <= p->max_inst);
 };
 
-static inline void
-emit_vxr(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
+
+union i_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned li:24;
+      unsigned aa:1;
+      unsigned lk:1;
+   } inst;
+};
+
+static INLINE void
+emit_i(struct ppc_function *p, uint op, uint li, uint aa, uint lk)
 {
-   union vxr_inst inst;
-   inst.inst.op = 4;
-   inst.inst.vD = vD;
-   inst.inst.vA = vA;
-   inst.inst.vB = vB;
-   inst.inst.rC = 0;
+   union i_inst inst;
+   inst.inst.op = op;
+   inst.inst.li = li;
+   inst.inst.aa = aa;
+   inst.inst.lk = lk;
+   p->store[p->num_inst++] = inst.bits;
+   assert(p->num_inst <= p->max_inst);
+}
+
+
+union xl_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned bo:5;
+      unsigned bi:5;
+      unsigned unused:3;
+      unsigned bh:2;
+      unsigned op2:10;
+      unsigned lk:1;
+   } inst;
+};
+
+static INLINE void
+emit_xl(struct ppc_function *p, uint op, uint bo, uint bi, uint bh,
+        uint op2, uint lk)
+{
+   union xl_inst inst;
+   inst.inst.op = op;
+   inst.inst.bo = bo;
+   inst.inst.bi = bi;
+   inst.inst.unused = 0x0;
+   inst.inst.bh = bh;
    inst.inst.op2 = op2;
+   inst.inst.lk = lk;
    p->store[p->num_inst++] = inst.bits;
    assert(p->num_inst <= p->max_inst);
+}
+
+static INLINE void
+dump_xl(const char *name, uint inst)
+{
+   union xl_inst i;
+
+   i.bits = inst;
+   debug_printf("%s = 0x%08x\n", name, inst);
+   debug_printf(" op: %d 0x%x\n", i.inst.op, i.inst.op);
+   debug_printf(" bo: %d 0x%x\n", i.inst.bo, i.inst.bo);
+   debug_printf(" bi: %d 0x%x\n", i.inst.bi, i.inst.bi);
+   debug_printf(" unused: %d 0x%x\n", i.inst.unused, i.inst.unused);
+   debug_printf(" bh: %d 0x%x\n", i.inst.bh, i.inst.bh);
+   debug_printf(" op2: %d 0x%x\n", i.inst.op2, i.inst.op2);
+   debug_printf(" lk: %d 0x%x\n", i.inst.lk, i.inst.lk);
+}
+
+
+union x_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned vrs:5;
+      unsigned ra:5;
+      unsigned rb:5;
+      unsigned op2:10;
+      unsigned unused:1;
+   } inst;
 };
 
-static inline void
-emit_va(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB, uint vC)
+static INLINE void
+emit_x(struct ppc_function *p, uint op, uint vrs, uint ra, uint rb, uint op2)
 {
-   union va_inst inst;
-   inst.inst.op = 4;
-   inst.inst.vD = vD;
-   inst.inst.vA = vA;
-   inst.inst.vB = vB;
-   inst.inst.vC = vC;
+   union x_inst inst;
+   inst.inst.op = op;
+   inst.inst.vrs = vrs;
+   inst.inst.ra = ra;
+   inst.inst.rb = rb;
    inst.inst.op2 = op2;
+   inst.inst.unused = 0x0;
+   p->store[p->num_inst++] = inst.bits;
+   assert(p->num_inst <= p->max_inst);
+}
+
+
+union d_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned rt:5;
+      unsigned ra:5;
+      unsigned si:16;
+   } inst;
+};
+
+static inline void
+emit_d(struct ppc_function *p, uint op, uint rt, uint ra, int si)
+{
+   union d_inst inst;
+   assert(si >= -32768);
+   assert(si <= 32767);
+   inst.inst.op = op;
+   inst.inst.rt = rt;
+   inst.inst.ra = ra;
+   inst.inst.si = (unsigned) (si & 0xffff);
    p->store[p->num_inst++] = inst.bits;
    assert(p->num_inst <= p->max_inst);
 };
 
 
+union a_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned frt:5;
+      unsigned fra:5;
+      unsigned frb:5;
+      unsigned unused:5;
+      unsigned op2:5;
+      unsigned rc:1;
+   } inst;
+};
+
+static inline void
+emit_a(struct ppc_function *p, uint op, uint frt, uint fra, uint frb, uint op2,
+       uint rc)
+{
+   union a_inst inst;
+   inst.inst.op = op;
+   inst.inst.frt = frt;
+   inst.inst.fra = fra;
+   inst.inst.frb = frb;
+   inst.inst.unused = 0x0;
+   inst.inst.op2 = op2;
+   inst.inst.rc = rc;
+   p->store[p->num_inst++] = inst.bits;
+   assert(p->num_inst <= p->max_inst);
+};
+
+
+union xo_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned rt:5;
+      unsigned ra:5;
+      unsigned rb:5;
+      unsigned oe:1;
+      unsigned op2:9;
+      unsigned rc:1;
+   } inst;
+};
+
+static INLINE void
+emit_xo(struct ppc_function *p, uint op, uint rt, uint ra, uint rb, uint oe,
+        uint op2, uint rc)
+{
+   union xo_inst inst;
+   inst.inst.op = op;
+   inst.inst.rt = rt;
+   inst.inst.ra = ra;
+   inst.inst.rb = rb;
+   inst.inst.oe = oe;
+   inst.inst.op2 = op2;
+   inst.inst.rc = rc;
+   p->store[p->num_inst++] = inst.bits;
+   assert(p->num_inst <= p->max_inst);
+}
+
+
+
+
 
 /**
  ** float vector arithmetic
@@ -172,7 +458,7 @@ emit_va(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB, uint vC)
 
 /** vector float add */
 void
-ppc_vaddfp(struct ppc_function *p,uint vD, uint vA, uint vB)
+ppc_vaddfp(struct ppc_function *p, uint vD, uint vA, uint vB)
 {
    emit_vx(p, 10, vD, vA, vB);
 }
@@ -198,11 +484,11 @@ ppc_vmaxfp(struct ppc_function *p, uint vD, uint vA, uint vB)
    emit_vx(p, 1034, vD, vA, vB);
 }
 
-/** vector float mult add */
+/** vector float mult add: vD = vA * vB + vC */
 void
 ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
 {
-   emit_va(p, 46, vD, vA, vB, vC);
+   emit_va(p, 46, vD, vA, vC, vB); /* note arg order */
 }
 
 /** vector float compare greater than */
@@ -282,13 +568,26 @@ ppc_vrfiz(struct ppc_function *p, uint vD, uint vB)
    emit_vx(p, 586, vD, 0, vB);
 }
 
+/** vector store: store vR at mem[vA+vB] */
+void
+ppc_stvx(struct ppc_function *p, uint vR, uint vA, uint vB)
+{
+   emit_x(p, 31, vR, vA, vB, 231);
+}
+
+/** vector load: vR = mem[vA+vB] */
+void
+ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB)
+{
+   emit_x(p, 31, vR, vA, vB, 103);
+}
+
 
 
 /**
- ** bitwise operations
+ ** vector bitwise operations
  **/
 
-
 /** vector and */
 void
 ppc_vand(struct ppc_function *p, uint vD, uint vA, uint vB)
@@ -324,6 +623,14 @@ ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB)
    emit_vx(p, 1220, vD, vA, vB);
 }
 
+/** Pseudo-instruction: vector move */
+void
+ppc_vecmove(struct ppc_function *p, uint vD, uint vA)
+{
+   ppc_vor(p, vD, vA, vA);
+}
+
+
 
 /**
  ** Vector shuffle / select / splat / etc
@@ -363,3 +670,225 @@ ppc_vspltw(struct ppc_function *p, uint vD, uint vB, uint imm)
 {
    emit_vx(p, 652, vD, imm, vB);
 }
+
+/** vector splat signed immediate word: imm is a 5-bit signed value, -16..15 */
+void
+ppc_vspltisw(struct ppc_function *p, uint vD, int imm)
+{
+   assert(imm >= -16);
+   assert(imm <= 15);   /* 15 is a legal immediate; "< 15" wrongly rejected it */
+   emit_vx(p, 908, vD, imm, 0);
+}
+
+/** vector shift left word: vD[word] = vA[word] << (vB[word] & 0x1f) */
+void
+ppc_vslw(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 388, vD, vA, vB);
+}
+
+
+
+
+/**
+ ** integer arithmetic
+ **/
+
+/** rt = ra + imm */
+void
+ppc_addi(struct ppc_function *p, uint rt, uint ra, int imm)
+{
+   emit_d(p, 14, rt, ra, imm);
+}
+
+/** rt = ra + (imm << 16) */
+void
+ppc_addis(struct ppc_function *p, uint rt, uint ra, int imm)
+{
+   emit_d(p, 15, rt, ra, imm);
+}
+
+/** rt = ra + rb */
+void
+ppc_add(struct ppc_function *p, uint rt, uint ra, uint rb)
+{
+   emit_xo(p, 31, rt, ra, rb, 0, 266, 0);
+}
+
+/** rt = ra AND rb */
+void
+ppc_and(struct ppc_function *p, uint rt, uint ra, uint rb)
+{
+   emit_x(p, 31, ra, rt, rb, 28);  /* note argument order */
+}
+
+/** rt = ra AND imm */
+void
+ppc_andi(struct ppc_function *p, uint rt, uint ra, int imm)
+{
+   emit_d(p, 28, ra, rt, imm);  /* note argument order */
+}
+
+/** rt = ra OR rb */
+void
+ppc_or(struct ppc_function *p, uint rt, uint ra, uint rb)
+{
+   emit_x(p, 31, ra, rt, rb, 444);  /* note argument order */
+}
+
+/** rt = ra OR imm */
+void
+ppc_ori(struct ppc_function *p, uint rt, uint ra, int imm)
+{
+   emit_d(p, 24, ra, rt, imm);  /* note argument order */
+}
+
+/** rt = ra XOR rb */
+void
+ppc_xor(struct ppc_function *p, uint rt, uint ra, uint rb)
+{
+   emit_x(p, 31, ra, rt, rb, 316);  /* note argument order */
+}
+
+/** rt = ra XOR imm */
+void
+ppc_xori(struct ppc_function *p, uint rt, uint ra, int imm)
+{
+   emit_d(p, 26, ra, rt, imm);  /* note argument order */
+}
+
+/** pseudo instruction: move: rt = ra */
+void
+ppc_mr(struct ppc_function *p, uint rt, uint ra)
+{
+   ppc_or(p, rt, ra, ra);
+}
+
+/** pseudo instruction: load immediate: rt = imm */
+void
+ppc_li(struct ppc_function *p, uint rt, int imm)
+{
+   ppc_addi(p, rt, 0, imm);
+}
+
+/** rt = imm << 16 */
+void
+ppc_lis(struct ppc_function *p, uint rt, int imm)
+{
+   ppc_addis(p, rt, 0, imm);
+}
+
+/** Load a full 32-bit constant: rt = imm, emitted as lis (high half) followed by ori (low half) */
+void
+ppc_load_int(struct ppc_function *p, uint rt, int imm)
+{
+   ppc_lis(p, rt, (imm >> 16));          /* rt = (imm >> 16) << 16 */
+   ppc_ori(p, rt, rt, ((imm & 0xffff) ^ 0x8000) - 0x8000);  /* rt |= imm & 0xffff; low half wrapped into -32768..32767 so emit_d's range asserts hold (emit_d masks with 0xffff, so the encoded bits are unchanged) */
+}
+
+
+
+
+/**
+ ** integer load/store
+ **/
+
+/** store rs at memory[(ra)+d],
+ * then update ra = (ra)+d
+ */
+void
+ppc_stwu(struct ppc_function *p, uint rs, uint ra, int d)
+{
+   emit_d(p, 37, rs, ra, d);
+}
+
+/** store rs at memory[(ra)+d] */
+void
+ppc_stw(struct ppc_function *p, uint rs, uint ra, int d)
+{
+   emit_d(p, 36, rs, ra, d);
+}
+
+/** Load word and zero: rt = mem[(ra)+d], then set the high 32 bits of rt to zero. */
+void
+ppc_lwz(struct ppc_function *p, uint rt, uint ra, int d)
+{
+   emit_d(p, 32, rt, ra, d);
+}
+
+
+
+/**
+ ** Float (non-vector) arithmetic
+ **/
+
+/** add: frt = fra + frb */
+void
+ppc_fadd(struct ppc_function *p, uint frt, uint fra, uint frb)
+{
+   emit_a(p, 63, frt, fra, frb, 21, 0);
+}
+
+/** sub: frt = fra - frb */
+void
+ppc_fsub(struct ppc_function *p, uint frt, uint fra, uint frb)
+{
+   emit_a(p, 63, frt, fra, frb, 20, 0);
+}
+
+/** convert to int: rt = (int) fra */
+void
+ppc_fctiwz(struct ppc_function *p, uint rt, uint fra)
+{
+   emit_x(p, 63, rt, 0, fra, 15);
+}
+
+/** store frs at mem[(ra)+offset] */
+void
+ppc_stfs(struct ppc_function *p, uint frs, uint ra, int offset)
+{
+   emit_d(p, 52, frs, ra, offset);
+}
+
+/** store frs at mem[(ra)+(rb)] */
+void
+ppc_stfiwx(struct ppc_function *p, uint frs, uint ra, uint rb)
+{
+   emit_x(p, 31, frs, ra, rb, 983);
+}
+
+/** load frt = mem[(ra)+offset] */
+void
+ppc_lfs(struct ppc_function *p, uint frt, uint ra, int offset)
+{
+   emit_d(p, 48, frt, ra, offset);
+}
+
+
+
+
+
+/**
+ ** branch instructions
+ **/
+
+/** BLR: branch unconditionally to the address in the link register (extended mnemonic for bclr 20,0) */
+void
+ppc_blr(struct ppc_function *p)
+{
+   emit_xl(p, 19, BRANCH_COND_ALWAYS, 0, BRANCH_HINT_SUB_RETURN, 16, 0);  /* XL-form bclr; the previous I-form op 18 with LK=1, LI=0 encoded "bl" to offset 0 */
+}
+
+/** Branch Conditional to Link Register (p. 36) */
+void
+ppc_bclr(struct ppc_function *p, uint condOp, uint branchHint, uint condReg)
+{
+   emit_xl(p, 19, condOp, condReg, branchHint, 16, 0);
+}
+
+/** Pseudo instruction: return from subroutine */
+void
+ppc_return(struct ppc_function *p)
+{
+   ppc_bclr(p, BRANCH_COND_ALWAYS, BRANCH_HINT_SUB_RETURN, 0);
+}
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
index ed14e943df..6370b60494 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -36,27 +36,46 @@
 
 #define PPC_INST_SIZE 4  /**< 4 bytes / instruction */
 
+#define PPC_NUM_REGS 32
+#define PPC_NUM_FP_REGS 32
 #define PPC_NUM_VEC_REGS 32
 
+/** Stack pointer register */
+#define PPC_REG_SP 1
+
+/** Branch conditions */
+#define BRANCH_COND_ALWAYS       0x14  /* binary 1z1zz (z=ignored) */
+
+/** Branch hints */
+#define BRANCH_HINT_SUB_RETURN   0x0   /* binary 00 */
+
 
 struct ppc_function
 {
    uint32_t *store;  /**< instruction buffer */
    uint num_inst;
    uint max_inst;
-   uint32_t vec_used;   /** used/free vector registers bitmask */
    uint32_t reg_used;   /** used/free general-purpose registers bitmask */
+   uint32_t fp_used;   /** used/free floating point registers bitmask */
+   uint32_t vec_used;   /** used/free vector registers bitmask */
 };
 
 
 extern void ppc_init_func(struct ppc_function *p, unsigned max_inst);
 extern void ppc_release_func(struct ppc_function *p);
-
-extern int ppc_allocate_vec_register(struct ppc_function *p, int reg);
+extern void (*ppc_get_func( struct ppc_function *p ))( void );
+extern void ppc_dump_func(const struct ppc_function *p);
+
+extern int ppc_allocate_register(struct ppc_function *p);
+extern void ppc_release_register(struct ppc_function *p, int reg);
+extern int ppc_allocate_fp_register(struct ppc_function *p);
+extern void ppc_release_fp_register(struct ppc_function *p, int reg);
+extern int ppc_allocate_vec_register(struct ppc_function *p);
 extern void ppc_release_vec_register(struct ppc_function *p, int reg);
 
 
+
 /**
  ** float vector arithmetic
  **/
@@ -126,9 +145,18 @@ extern void
 ppc_vrfiz(struct ppc_function *p, uint vD, uint vB);
 
 
+/** vector store: store vR at mem[vA+vB] */
+extern void
+ppc_stvx(struct ppc_function *p, uint vR, uint vA, uint vB);
+
+/** vector load: vR = mem[vA+vB] */
+extern void
+ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB);
+
+
 
 /**
- ** bitwise operations
+ ** vector bitwise operations
  **/
 
 
@@ -152,6 +180,10 @@ ppc_vnor(struct ppc_function *p, uint vD, uint vA, uint vB);
 extern void
 ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB);
 
+/** Pseudo-instruction: vector move */
+extern void
+ppc_vecmove(struct ppc_function *p, uint vD, uint vA);
+
 
 /**
  ** Vector shuffle / select / splat / etc
@@ -177,5 +209,106 @@ ppc_vsplthw(struct ppc_function *p, uint vD, uint vB, uint imm);
 extern void
 ppc_vspltw(struct ppc_function *p, uint vD, uint vB, uint imm);
 
+/** vector splat signed immediate word */
+extern void
+ppc_vspltisw(struct ppc_function *p, uint vD, int imm);
+
+/** vector shift left word: vD[word] = vA[word] << (vB[word] & 0x1f) */
+extern void
+ppc_vslw(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+
+
+/**
+ ** scalar arithmetic
+ **/
+
+extern void
+ppc_add(struct ppc_function *p, uint rt, uint ra, uint rb);
+
+extern void
+ppc_addi(struct ppc_function *p, uint rt, uint ra, int imm);
+
+extern void
+ppc_and(struct ppc_function *p, uint rt, uint ra, uint rb);
+
+extern void
+ppc_andi(struct ppc_function *p, uint rt, uint ra, int imm);
+
+extern void
+ppc_or(struct ppc_function *p, uint rt, uint ra, uint rb);
+
+extern void
+ppc_ori(struct ppc_function *p, uint rt, uint ra, int imm);
+
+extern void
+ppc_xor(struct ppc_function *p, uint rt, uint ra, uint rb);
+
+extern void
+ppc_xori(struct ppc_function *p, uint rt, uint ra, int imm);
+
+extern void
+ppc_mr(struct ppc_function *p, uint rt, uint ra);
+
+extern void
+ppc_li(struct ppc_function *p, uint rt, int imm);
+
+extern void
+ppc_lis(struct ppc_function *p, uint rt, int imm);
+
+extern void
+ppc_load_int(struct ppc_function *p, uint rt, int imm);
+
+
+
+/**
+ ** scalar load/store
+ **/
+
+extern void
+ppc_stwu(struct ppc_function *p, uint rs, uint ra, int d);
+
+extern void
+ppc_stw(struct ppc_function *p, uint rs, uint ra, int d);
+
+extern void
+ppc_lwz(struct ppc_function *p, uint rs, uint ra, int d);
+
+
+
+/**
+ ** Float (non-vector) arithmetic
+ **/
+
+extern void
+ppc_fadd(struct ppc_function *p, uint frt, uint fra, uint frb);
+
+extern void
+ppc_fsub(struct ppc_function *p, uint frt, uint fra, uint frb);
+
+extern void
+ppc_fctiwz(struct ppc_function *p, uint rt, uint ra);
+
+extern void
+ppc_stfs(struct ppc_function *p, uint frs, uint ra, int offset);
+
+extern void
+ppc_stfiwx(struct ppc_function *p, uint frs, uint ra, uint rb);
+
+
+
+/**
+ ** branch instructions
+ **/
+
+extern void
+ppc_blr(struct ppc_function *p);
+
+void
+ppc_bclr(struct ppc_function *p, uint condOp, uint branchHint, uint condReg);
+
+extern void
+ppc_return(struct ppc_function *p);
+
 
 #endif /* RTASM_PPC_H */
-- 
cgit v1.2.3


From 049f57f86a2cb8ff08fba819c581a034ca7ea52c Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 11:06:39 -0600
Subject: gallium: added ppc_lvewx()

---
 src/gallium/auxiliary/rtasm/rtasm_ppc.c | 7 +++++++
 src/gallium/auxiliary/rtasm/rtasm_ppc.h | 4 ++++
 2 files changed, 11 insertions(+)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index 4a94ed0460..aaec2d2191 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -582,6 +582,13 @@ ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB)
    emit_x(p, 31, vR, vA, vB, 103);
 }
 
+/** load vector element word: vR = mem_word[vA+vB] */
+void
+ppc_lvewx(struct ppc_function *p, uint vR, uint vA, uint vB)
+{
+   emit_x(p, 31, vR, vA, vB, 71);
+}
+
 
 
 /**
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
index 6370b60494..53d5746dc8 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -153,6 +153,10 @@ ppc_stvx(struct ppc_function *p, uint vR, uint vA, uint vB);
 extern void
 ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB);
 
+/** load vector element word: vR = mem_word[vA+vB] */
+extern void
+ppc_lvewx(struct ppc_function *p, uint vR, uint vA, uint vB);
+
 
 
 /**
-- 
cgit v1.2.3


From ebdc399d83d6bd2f4e3594874483dbca5f9f5c0e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 13:57:56 -0600
Subject: gallium: fix-up confusing register allocation masks in rtasm_ppc.c

Plus, add ppc_reserve_register() func.
---
 src/gallium/auxiliary/rtasm/rtasm_ppc.c | 56 ++++++++++++++++++++-------------
 src/gallium/auxiliary/rtasm/rtasm_ppc.h |  1 +
 2 files changed, 36 insertions(+), 21 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index aaec2d2191..2d9f4e079e 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -49,13 +49,15 @@ ppc_init_func(struct ppc_function *p, unsigned max_inst)
    p->store = align_malloc(max_inst * PPC_INST_SIZE, 16);
    p->num_inst = 0;
    p->max_inst = max_inst;
-   p->fp_used = ~0x0;
-   p->vec_used = ~0x0;
-
-   /* only allow using gp registers 7..12 for now */
    p->reg_used = 0x0;
-   for (i = 7; i < 13; i++)
-      p->reg_used |= (1 << i);
+   p->fp_used = 0x0;
+   p->vec_used = 0x0;
+
+   /* only allow using gp registers 3..12 for now */
+   for (i = 0; i < 3; i++)
+      ppc_reserve_register(p, i);
+   for (i = 12; i < PPC_NUM_REGS; i++)
+      ppc_reserve_register(p, i);
 }
 
 
@@ -95,6 +97,18 @@ ppc_dump_func(const struct ppc_function *p)
 }
 
 
+/**
+ * Mark a register as being unavailable.
+ */
+int
+ppc_reserve_register(struct ppc_function *p, int reg)
+{
+   assert(reg < PPC_NUM_REGS);
+   p->reg_used |= (1 << reg);
+   return reg;
+}
+
+
 /**
  * Allocate a general purpose register.
  * \return register index or -1 if none left.
@@ -105,8 +119,8 @@ ppc_allocate_register(struct ppc_function *p)
    unsigned i;
    for (i = 0; i < PPC_NUM_REGS; i++) {
       const uint64_t mask = 1 << i;
-      if ((p->reg_used & mask) != 0) {
-         p->reg_used &= ~mask;
+      if ((p->reg_used & mask) == 0) {
+         p->reg_used |= mask;
          return i;
       }
    }
@@ -121,8 +135,8 @@ void
 ppc_release_register(struct ppc_function *p, int reg)
 {
    assert(reg < PPC_NUM_REGS);
-   assert((p->reg_used & (1 << reg)) == 0);
-   p->reg_used |= (1 << reg);
+   assert(p->reg_used & (1 << reg));
+   p->reg_used &= ~(1 << reg);
 }
 
 
@@ -136,8 +150,8 @@ ppc_allocate_fp_register(struct ppc_function *p)
    unsigned i;
    for (i = 0; i < PPC_NUM_FP_REGS; i++) {
       const uint64_t mask = 1 << i;
-      if ((p->fp_used & mask) != 0) {
-         p->fp_used &= ~mask;
+      if ((p->fp_used & mask) == 0) {
+         p->fp_used |= mask;
          return i;
       }
    }
@@ -152,8 +166,8 @@ void
 ppc_release_fp_register(struct ppc_function *p, int reg)
 {
    assert(reg < PPC_NUM_FP_REGS);
-   assert((p->fp_used & (1 << reg)) == 0);
-   p->fp_used |= (1 << reg);
+   assert(p->fp_used & (1 << reg));
+   p->fp_used &= ~(1 << reg);
 }
 
 
@@ -167,8 +181,8 @@ ppc_allocate_vec_register(struct ppc_function *p)
    unsigned i;
    for (i = 0; i < PPC_NUM_VEC_REGS; i++) {
       const uint64_t mask = 1 << i;
-      if ((p->vec_used & mask) != 0) {
-         p->vec_used &= ~mask;
+      if ((p->vec_used & mask) == 0) {
+         p->vec_used |= mask;
          return i;
       }
    }
@@ -183,8 +197,8 @@ void
 ppc_release_vec_register(struct ppc_function *p, int reg)
 {
    assert(reg < PPC_NUM_VEC_REGS);
-   assert((p->vec_used & (1 << reg)) == 0);
-   p->vec_used |= (1 << reg);
+   assert(p->vec_used & (1 << reg));
+   p->vec_used &= ~(1 << reg);
 }
 
 
@@ -582,11 +596,11 @@ ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB)
    emit_x(p, 31, vR, vA, vB, 103);
 }
 
-/** load vector element word: vR = mem_word[vA+vB] */
+/** load vector element word: vR = mem_word[ra+rb] */
 void
-ppc_lvewx(struct ppc_function *p, uint vR, uint vA, uint vB)
+ppc_lvewx(struct ppc_function *p, uint vr, uint ra, uint rb)
 {
-   emit_x(p, 31, vR, vA, vB, 71);
+   emit_x(p, 31, vr, ra, rb, 71);
 }
 
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
index 53d5746dc8..85679b4886 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -67,6 +67,7 @@ extern void ppc_release_func(struct ppc_function *p);
 extern void (*ppc_get_func( struct ppc_function *p ))( void );
 extern void ppc_dump_func(const struct ppc_function *p);
 
+extern int ppc_reserve_register(struct ppc_function *p, int reg);
 extern int ppc_allocate_register(struct ppc_function *p);
 extern void ppc_release_register(struct ppc_function *p, int reg);
 extern int ppc_allocate_fp_register(struct ppc_function *p);
-- 
cgit v1.2.3


From b06d0720194dfecaf45dc97cbd178411aed5205f Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 14:48:33 -0600
Subject: gallium: added ppc_vload_float(), for limited cases

---
 src/gallium/auxiliary/rtasm/rtasm_ppc.c | 18 ++++++++++++++++++
 src/gallium/auxiliary/rtasm/rtasm_ppc.h |  4 ++++
 2 files changed, 22 insertions(+)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index 2d9f4e079e..65df676eae 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -603,6 +603,24 @@ ppc_lvewx(struct ppc_function *p, uint vr, uint ra, uint rb)
    emit_x(p, 31, vr, ra, rb, 71);
 }
 
+/** vector load float: vr = splats(imm) */
+void
+ppc_vload_float(struct ppc_function *p, uint vr, float imm)
+{
+   if (imm == 0.0f) {
+      ppc_vxor(p, vr, vr, vr);
+   }
+   else if (imm == 1.0f) {
+      /* use 2^0=1 to get 1.0 */
+      ppc_vxor(p, vr, vr, vr);  /* vr = {0,0,0,0} */
+      ppc_vexptefp(p, vr, vr);  /* vr = 0^0 */
+   }
+   else {
+      assert(0);
+   }
+}
+
+
 
 
 /**
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
index 85679b4886..9f1e3fcd84 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -158,6 +158,10 @@ ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB);
 extern void
 ppc_lvewx(struct ppc_function *p, uint vR, uint vA, uint vB);
 
+/** vector load float: vr = splats(imm) */
+extern void
+ppc_vload_float(struct ppc_function *p, uint vr, float imm);
+
 
 
 /**
-- 
cgit v1.2.3


From 3026616c48487a7561d8545c08950539f0ad51d1 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 17:17:11 -0600
Subject: gallium: added ppc_vzero()

---
 src/gallium/auxiliary/rtasm/rtasm_ppc.c | 8 ++++++++
 src/gallium/auxiliary/rtasm/rtasm_ppc.h | 5 +++++
 2 files changed, 13 insertions(+)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index 65df676eae..51d9b53657 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -669,6 +669,14 @@ ppc_vecmove(struct ppc_function *p, uint vD, uint vA)
    ppc_vor(p, vD, vA, vA);
 }
 
+/** Set vector register to {0,0,0,0} */
+void
+ppc_vzero(struct ppc_function *p, uint vr)
+{
+   ppc_vxor(p, vr, vr, vr);
+}
+
+
 
 
 /**
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
index 9f1e3fcd84..f194d3be13 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -193,6 +193,11 @@ ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB);
 extern void
 ppc_vecmove(struct ppc_function *p, uint vD, uint vA);
 
+/** Set vector register to {0,0,0,0} */
+extern void
+ppc_vzero(struct ppc_function *p, uint vr);
+
+
 
 /**
  ** Vector shuffle / select / splat / etc
-- 
cgit v1.2.3


From f8ab4feb75f4a592e23859813c093dcdbd4b8988 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 22 Oct 2008 17:21:43 -0600
Subject: gallium: remove ppc_vload_float(), rename ppc_vecmove() ->
 ppc_vmove().

---
 src/gallium/auxiliary/rtasm/rtasm_ppc.c | 19 +------------------
 src/gallium/auxiliary/rtasm/rtasm_ppc.h |  6 +-----
 src/gallium/auxiliary/tgsi/tgsi_ppc.c   |  2 +-
 3 files changed, 3 insertions(+), 24 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index 51d9b53657..7dd8263749 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -603,23 +603,6 @@ ppc_lvewx(struct ppc_function *p, uint vr, uint ra, uint rb)
    emit_x(p, 31, vr, ra, rb, 71);
 }
 
-/** vector load float: vr = splats(imm) */
-void
-ppc_vload_float(struct ppc_function *p, uint vr, float imm)
-{
-   if (imm == 0.0f) {
-      ppc_vxor(p, vr, vr, vr);
-   }
-   else if (imm == 1.0f) {
-      /* use 2^0=1 to get 1.0 */
-      ppc_vxor(p, vr, vr, vr);  /* vr = {0,0,0,0} */
-      ppc_vexptefp(p, vr, vr);  /* vr = 0^0 */
-   }
-   else {
-      assert(0);
-   }
-}
-
 
 
@@ -664,7 +647,7 @@ ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB)
 
 /** Pseudo-instruction: vector move */
 void
-ppc_vecmove(struct ppc_function *p, uint vD, uint vA)
+ppc_vmove(struct ppc_function *p, uint vD, uint vA)
 {
    ppc_vor(p, vD, vA, vA);
 }
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
index f194d3be13..f938d8d759 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -158,10 +158,6 @@ ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB);
 extern void
 ppc_lvewx(struct ppc_function *p, uint vR, uint vA, uint vB);
 
-/** vector load float: vr = splats(imm) */
-extern void
-ppc_vload_float(struct ppc_function *p, uint vr, float imm);
-
 
 
 /**
@@ -191,7 +187,7 @@ ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB);
 
 /** Pseudo-instruction: vector move */
 extern void
-ppc_vecmove(struct ppc_function *p, uint vD, uint vA);
+ppc_vmove(struct ppc_function *p, uint vD, uint vA);
 
 /** Set vector register to {0,0,0,0} */
 extern void
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 6b05fd16cf..96beec0cc6 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -229,7 +229,7 @@ emit_fetch(struct gen_context *gen,
    case TGSI_EXTSWIZZLE_ONE:
       {
          int one_vec = gen_one_vec(gen);
-         ppc_vecmove(gen->f, dst_vec, one_vec);
+         ppc_vmove(gen->f, dst_vec, one_vec);
       }
       break;
    default:
-- 
cgit v1.2.3


From 6b69e3c71741d99a54c6f4dcb605a3c241239aeb Mon Sep 17 00:00:00 2001
From: Michel Dänzer <michel@tungstengraphics.com>
Date: Thu, 23 Oct 2008 10:28:48 +0200
Subject: scons: ppc support.

---
 SConstruct                             | 2 ++
 common.py                              | 3 ++-
 scons/gallium.py                       | 1 +
 src/gallium/auxiliary/draw/SConscript  | 1 +
 src/gallium/auxiliary/rtasm/SConscript | 1 +
 src/gallium/auxiliary/tgsi/SConscript  | 1 +
 src/mesa/SConscript                    | 4 ++++
 7 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/SConstruct b/SConstruct
index c1dc624651..8c96817dae 100644
--- a/SConstruct
+++ b/SConstruct
@@ -70,12 +70,14 @@ platform = env['platform']
 
 # derived options
 x86 = machine == 'x86'
+ppc = machine == 'ppc'
 gcc = platform in ('linux', 'freebsd', 'darwin')
 msvc = platform in ('windows', 'winddk')
 
 Export([
 	'debug', 
 	'x86', 
+	'ppc', 
 	'dri', 
 	'llvm',
 	'platform',
diff --git a/common.py b/common.py
index dd64e0f434..cc2582f1a4 100644
--- a/common.py
+++ b/common.py
@@ -24,6 +24,7 @@ _machine_map = {
 	'i486': 'x86',
 	'i586': 'x86',
 	'i686': 'x86',
+	'ppc' : 'ppc',
 	'x86_64': 'x86_64',
 }
 if 'PROCESSOR_ARCHITECTURE' in os.environ:
@@ -56,7 +57,7 @@ def AddOptions(opts):
 	opts.Add(BoolOption('profile', 'profile build', 'no'))
 	#opts.Add(BoolOption('quiet', 'quiet command lines', 'no'))
 	opts.Add(EnumOption('machine', 'use machine-specific assembly code', default_machine,
-											 allowed_values=('generic', 'x86', 'x86_64')))
+											 allowed_values=('generic', 'ppc', 'x86', 'x86_64')))
 	opts.Add(EnumOption('platform', 'target platform', default_platform,
 											 allowed_values=('linux', 'cell', 'windows', 'winddk', 'wince')))
 	opts.Add(BoolOption('llvm', 'use LLVM', 'no'))
diff --git a/scons/gallium.py b/scons/gallium.py
index 3631607e66..2a42bdf2bb 100644
--- a/scons/gallium.py
+++ b/scons/gallium.py
@@ -175,6 +175,7 @@ def generate(env):
     machine = env['machine']
     platform = env['platform']
     x86 = env['machine'] == 'x86'
+    ppc = env['machine'] == 'ppc'
     gcc = env['platform'] in ('linux', 'freebsd', 'darwin')
     msvc = env['platform'] in ('windows', 'winddk', 'wince')
 
diff --git a/src/gallium/auxiliary/draw/SConscript b/src/gallium/auxiliary/draw/SConscript
index 544a04918b..5f05aa324a 100644
--- a/src/gallium/auxiliary/draw/SConscript
+++ b/src/gallium/auxiliary/draw/SConscript
@@ -38,6 +38,7 @@ draw = env.ConvenienceLibrary(
 		'draw_vs_aos_machine.c',
 		'draw_vs_exec.c',
 		'draw_vs_llvm.c',
+		'draw_vs_ppc.c',
 		'draw_vs_sse.c',
 		'draw_vs_varient.c'
 	])
diff --git a/src/gallium/auxiliary/rtasm/SConscript b/src/gallium/auxiliary/rtasm/SConscript
index 8ea25922aa..eb48368acc 100644
--- a/src/gallium/auxiliary/rtasm/SConscript
+++ b/src/gallium/auxiliary/rtasm/SConscript
@@ -6,6 +6,7 @@ rtasm = env.ConvenienceLibrary(
 		'rtasm_cpu.c',
 		'rtasm_execmem.c',
 		'rtasm_x86sse.c',
+		'rtasm_ppc.c',
 		'rtasm_ppc_spe.c',
 	])
 
diff --git a/src/gallium/auxiliary/tgsi/SConscript b/src/gallium/auxiliary/tgsi/SConscript
index 45bf3f6d57..8200cce42f 100644
--- a/src/gallium/auxiliary/tgsi/SConscript
+++ b/src/gallium/auxiliary/tgsi/SConscript
@@ -12,6 +12,7 @@ tgsi = env.ConvenienceLibrary(
 		'tgsi_parse.c',
 		'tgsi_sanity.c',
 		'tgsi_scan.c',
+		'tgsi_ppc.c',
 		'tgsi_sse2.c',
 		'tgsi_text.c',
 		'tgsi_transform.c',
diff --git a/src/mesa/SConscript b/src/mesa/SConscript
index af8dfcb493..89b98b37ab 100644
--- a/src/mesa/SConscript
+++ b/src/mesa/SConscript
@@ -283,6 +283,10 @@ if env['platform'] != 'winddk':
 			'x86-64/glapi_x86-64.S'
 		]
 	elif gcc and env['machine'] == 'ppc':
+		env.Append(CPPDEFINES = [
+			'USE_PPC_ASM', 
+			'USE_VMX_ASM', 
+		])
 		mesa_sources += [
 			'ppc/common_ppc.c',
 		]
-- 
cgit v1.2.3


From 7640264064c2cbc9922f7f3df51f7caa7b449e8e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 29 Oct 2008 11:03:51 -0600
Subject: gallium: added ppc_vnmsubfp()

---
 src/gallium/auxiliary/rtasm/rtasm_ppc.c | 7 +++++++
 src/gallium/auxiliary/rtasm/rtasm_ppc.h | 6 +++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index 7dd8263749..a90b5587b0 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -505,6 +505,13 @@ ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
    emit_va(p, 46, vD, vA, vC, vB); /* note arg order */
 }
 
+/** vector float negative mult subtract: vD = vA - vB * vC */
+void
+ppc_vnmsubfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
+{
+   emit_va(p, 47, vD, vB, vA, vC); /* note arg order */
+}
+
 /** vector float compare greater than */
 void
 ppc_vcmpgtfpx(struct ppc_function *p, uint vD, uint vA, uint vB)
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
index f938d8d759..561e139bce 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -97,10 +97,14 @@ ppc_vminfp(struct ppc_function *p, uint vD, uint vA, uint vB);
 extern void
 ppc_vmaxfp(struct ppc_function *p, uint vD, uint vA, uint vB);
 
-/** vector float mult add */
+/** vector float mult add: vD = vA * vB + vC */
 extern void
 ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC);
 
+/** vector float negative mult subtract: vD = vA - vB * vC */
+extern void
+ppc_vnmsubfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC);
+
 /** vector float compare greater than */
 extern void
 ppc_vcmpgtfpx(struct ppc_function *p, uint vD, uint vA, uint vB);
-- 
cgit v1.2.3


From 09570d2e737a4c9f3f24edd78af3b897ee261733 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 29 Oct 2008 14:08:13 -0600
Subject: gallium: test for PIPE_OS_LINUX instead of __linux__

---
 src/gallium/auxiliary/rtasm/rtasm_execmem.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_execmem.c b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
index 19087589a8..864bd4d3fe 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_execmem.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
@@ -38,12 +38,13 @@
 #include "rtasm_execmem.h"
 
 
-#if defined(__linux__)
+#if defined(PIPE_OS_LINUX)
+
 
 /*
  * Allocate a large block of memory which can hold code then dole it out
  * in pieces by means of the generic memory manager code.
-*/
+ */
 
 #include <unistd.h>
 #include <sys/mman.h>
@@ -113,7 +114,7 @@ rtasm_exec_free(void *addr)
 }
 
 
-#else
+#else /* PIPE_OS_LINUX */
 
 /*
  * Just use regular memory.
@@ -133,4 +134,4 @@ rtasm_exec_free(void *addr)
 }
 
 
-#endif
+#endif /* PIPE_OS_LINUX */
-- 
cgit v1.2.3


From 3ad56968f09397a8dd417eae025b9506efaf8414 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 29 Oct 2008 14:19:12 -0600
Subject: gallium: prefix memory manager functions with u_ to differentiate
 from functions in mesa/main/mm.c

---
 src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c | 10 +++++-----
 src/gallium/auxiliary/rtasm/rtasm_execmem.c     |  8 ++++----
 src/gallium/auxiliary/util/u_mm.c               | 12 ++++++------
 src/gallium/auxiliary/util/u_mm.h               | 12 ++++++------
 4 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
index fe80ca30ee..6e10cf1806 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
@@ -100,7 +100,7 @@ mm_buffer_destroy(struct pb_buffer *buf)
    assert(buf->base.refcount == 0);
    
    pipe_mutex_lock(mm->mutex);
-   mmFreeMem(mm_buf->block);
+   u_mmFreeMem(mm_buf->block);
    FREE(buf);
    pipe_mutex_unlock(mm->mutex);
 }
@@ -175,14 +175,14 @@ mm_bufmgr_create_buffer(struct pb_manager *mgr,
    
    mm_buf->mgr = mm;
    
-   mm_buf->block = mmAllocMem(mm->heap, size, mm->align2, 0);
+   mm_buf->block = u_mmAllocMem(mm->heap, size, mm->align2, 0);
    if(!mm_buf->block) {
       debug_printf("warning: heap full\n");
 #if 0
       mmDumpMemInfo(mm->heap);
 #endif
       
-      mm_buf->block = mmAllocMem(mm->heap, size, mm->align2, 0);
+      mm_buf->block = u_mmAllocMem(mm->heap, size, mm->align2, 0);
       if(!mm_buf->block) {
          FREE(mm_buf);
          pipe_mutex_unlock(mm->mutex);
@@ -213,7 +213,7 @@ mm_bufmgr_destroy(struct pb_manager *mgr)
    
    pipe_mutex_lock(mm->mutex);
 
-   mmDestroy(mm->heap);
+   u_mmDestroy(mm->heap);
    
    pb_unmap(mm->buffer);
    pb_reference(&mm->buffer, NULL);
@@ -254,7 +254,7 @@ mm_bufmgr_create_from_buffer(struct pb_buffer *buffer,
    if(!mm->map)
       goto failure;
 
-   mm->heap = mmInit(0, size); 
+   mm->heap = u_mmInit(0, size); 
    if (!mm->heap)
       goto failure;
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_execmem.c b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
index 864bd4d3fe..df353633e8 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_execmem.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
@@ -63,7 +63,7 @@ static void
 init_heap(void)
 {
    if (!exec_heap)
-      exec_heap = mmInit( 0, EXEC_HEAP_SIZE );
+      exec_heap = u_mmInit( 0, EXEC_HEAP_SIZE );
    
    if (!exec_mem)
       exec_mem = (unsigned char *) mmap(0, EXEC_HEAP_SIZE, 
@@ -84,7 +84,7 @@ rtasm_exec_malloc(size_t size)
 
    if (exec_heap) {
       size = (size + 31) & ~31;
-      block = mmAllocMem( exec_heap, size, 32, 0 );
+      block = u_mmAllocMem( exec_heap, size, 32, 0 );
    }
 
    if (block)
@@ -104,10 +104,10 @@ rtasm_exec_free(void *addr)
    pipe_mutex_lock(exec_mutex);
 
    if (exec_heap) {
-      struct mem_block *block = mmFindBlock(exec_heap, (unsigned char *)addr - exec_mem);
+      struct mem_block *block = u_mmFindBlock(exec_heap, (unsigned char *)addr - exec_mem);
    
       if (block)
-	 mmFreeMem(block);
+	 u_mmFreeMem(block);
    }
 
    pipe_mutex_unlock(exec_mutex);
diff --git a/src/gallium/auxiliary/util/u_mm.c b/src/gallium/auxiliary/util/u_mm.c
index 0f51dd5977..592ace00fc 100644
--- a/src/gallium/auxiliary/util/u_mm.c
+++ b/src/gallium/auxiliary/util/u_mm.c
@@ -31,7 +31,7 @@
 
 
 void
-mmDumpMemInfo(const struct mem_block *heap)
+u_mmDumpMemInfo(const struct mem_block *heap)
 {
    debug_printf("Memory heap %p:\n", (void *)heap);
    if (heap == 0) {
@@ -58,7 +58,7 @@ mmDumpMemInfo(const struct mem_block *heap)
 }
 
 struct mem_block *
-mmInit(int ofs, int size)
+u_mmInit(int ofs, int size)
 {
    struct mem_block *heap, *block;
   
@@ -165,7 +165,7 @@ SliceBlock(struct mem_block *p,
 
 
 struct mem_block *
-mmAllocMem(struct mem_block *heap, int size, int align2, int startSearch)
+u_mmAllocMem(struct mem_block *heap, int size, int align2, int startSearch)
 {
    struct mem_block *p;
    const int mask = (1 << align2)-1;
@@ -198,7 +198,7 @@ mmAllocMem(struct mem_block *heap, int size, int align2, int startSearch)
 
 
 struct mem_block *
-mmFindBlock(struct mem_block *heap, int start)
+u_mmFindBlock(struct mem_block *heap, int start)
 {
    struct mem_block *p;
 
@@ -237,7 +237,7 @@ Join2Blocks(struct mem_block *p)
 }
 
 int
-mmFreeMem(struct mem_block *b)
+u_mmFreeMem(struct mem_block *b)
 {
    if (!b)
       return 0;
@@ -266,7 +266,7 @@ mmFreeMem(struct mem_block *b)
 
 
 void
-mmDestroy(struct mem_block *heap)
+u_mmDestroy(struct mem_block *heap)
 {
    struct mem_block *p;
 
diff --git a/src/gallium/auxiliary/util/u_mm.h b/src/gallium/auxiliary/util/u_mm.h
index b226b101cb..ce20e48763 100644
--- a/src/gallium/auxiliary/util/u_mm.h
+++ b/src/gallium/auxiliary/util/u_mm.h
@@ -49,7 +49,7 @@ struct mem_block {
  * input: total size in bytes
  * return: a heap pointer if OK, NULL if error
  */
-extern struct mem_block *mmInit(int ofs, int size);
+extern struct mem_block *u_mmInit(int ofs, int size);
 
 /**
  * Allocate 'size' bytes with 2^align2 bytes alignment,
@@ -61,7 +61,7 @@ extern struct mem_block *mmInit(int ofs, int size);
  *		startSearch = linear offset from start of heap to begin search
  * return: pointer to the allocated block, 0 if error
  */
-extern struct mem_block *mmAllocMem(struct mem_block *heap, int size, int align2, 
+extern struct mem_block *u_mmAllocMem(struct mem_block *heap, int size, int align2, 
                             int startSearch);
 
 /**
@@ -69,23 +69,23 @@ extern struct mem_block *mmAllocMem(struct mem_block *heap, int size, int align2
  * input: pointer to a block
  * return: 0 if OK, -1 if error
  */
-extern int mmFreeMem(struct mem_block *b);
+extern int u_mmFreeMem(struct mem_block *b);
 
 /**
  * Free block starts at offset
  * input: pointer to a heap, start offset
  * return: pointer to a block
  */
-extern struct mem_block *mmFindBlock(struct mem_block *heap, int start);
+extern struct mem_block *u_mmFindBlock(struct mem_block *heap, int start);
 
 /**
  * destroy MM
  */
-extern void mmDestroy(struct mem_block *mmInit);
+extern void u_mmDestroy(struct mem_block *mmInit);
 
 /**
  * For debuging purpose.
  */
-extern void mmDumpMemInfo(const struct mem_block *mmInit);
+extern void u_mmDumpMemInfo(const struct mem_block *mmInit);
 
 #endif
-- 
cgit v1.2.3


From 8828d52348d81e1b9ec985200a430554873b5f4e Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 29 Oct 2008 14:28:57 -0600
Subject: gallium: fix alignment parameter passed to u_mmAllocMem()

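Was 32, now 5.  The parameter is expressed as a power-of-two exponent:
u_mmAllocMem() builds its alignment mask as (1 << align2) - 1, so passing 32
shifted by the full width of a 32-bit int, which is undefined behavior in C.
The net effect is that the alignment was a no-op on x86, but on PPC we
always got the same memory address every time rtasm_exec_malloc() was called.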
---
 src/gallium/auxiliary/rtasm/rtasm_execmem.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_execmem.c b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
index df353633e8..be7433baf8 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_execmem.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
@@ -83,8 +83,8 @@ rtasm_exec_malloc(size_t size)
    init_heap();
 
    if (exec_heap) {
-      size = (size + 31) & ~31;
-      block = u_mmAllocMem( exec_heap, size, 32, 0 );
+      size = (size + 31) & ~31;  /* next multiple of 32 bytes */
+      block = u_mmAllocMem( exec_heap, size, 5, 0 ); /* 5 -> 32-byte alignment */
    }
 
    if (block)
-- 
cgit v1.2.3


From a5d920297a2affe34c535d30a2c49588f92f69ad Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 29 Oct 2008 16:26:10 -0600
Subject: gallium: use execmem for PPC code, grow instruction buffer as needed

---
 src/gallium/auxiliary/rtasm/rtasm_ppc.c | 70 +++++++++++++++++++++++----------
 src/gallium/auxiliary/rtasm/rtasm_ppc.h |  1 +
 src/gallium/auxiliary/tgsi/tgsi_ppc.c   |  8 ++++
 3 files changed, 58 insertions(+), 21 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index a90b5587b0..e73ed71a0b 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -38,6 +38,7 @@
 #include <stdio.h>
 #include "util/u_memory.h"
 #include "pipe/p_debug.h"
+#include "rtasm_execmem.h"
 #include "rtasm_ppc.h"
 
 
@@ -46,9 +47,9 @@ ppc_init_func(struct ppc_function *p, unsigned max_inst)
 {
    uint i;
 
-   p->store = align_malloc(max_inst * PPC_INST_SIZE, 16);
    p->num_inst = 0;
-   p->max_inst = max_inst;
+   p->max_inst = 100; /* first guess at buffer size */
+   p->store = rtasm_exec_malloc(p->max_inst * PPC_INST_SIZE);
    p->reg_used = 0x0;
    p->fp_used = 0x0;
    p->vec_used = 0x0;
@@ -66,12 +67,19 @@ ppc_release_func(struct ppc_function *p)
 {
    assert(p->num_inst <= p->max_inst);
    if (p->store != NULL) {
-      align_free(p->store);
+      rtasm_exec_free(p->store);
    }
    p->store = NULL;
 }
 
 
+uint
+ppc_num_instructions(const struct ppc_function *p)
+{
+   return p->num_inst;
+}
+
+
 void (*ppc_get_func(struct ppc_function *p))(void)
 {
 #if 0
@@ -202,6 +210,35 @@ ppc_release_vec_register(struct ppc_function *p, int reg)
 }
 
 
+/**
+ * Append instruction to instruction buffer.  Grow buffer if out of room.
+ */
+static void
+emit_instruction(struct ppc_function *p, uint32_t inst_bits)
+{
+   if (!p->store)
+      return;  /* out of memory, drop the instruction */
+
+   if (p->num_inst == p->max_inst) {
+      /* allocate larger buffer */
+      uint32_t *newbuf;
+      p->max_inst *= 2;  /* 2x larger */
+      newbuf = rtasm_exec_malloc(p->max_inst * PPC_INST_SIZE);
+      if (newbuf) {
+         memcpy(newbuf, p->store, p->num_inst * PPC_INST_SIZE);
+      }
+      rtasm_exec_free(p->store);
+      p->store = newbuf;
+      if (!p->store) {
+         /* out of memory */
+         p->num_inst = 0;
+         return;
+      }
+   }
+
+   p->store[p->num_inst++] = inst_bits;
+}
+
 
 union vx_inst {
    uint32_t bits;
@@ -223,8 +260,7 @@ emit_vx(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
    inst.inst.vA = vA;
    inst.inst.vB = vB;
    inst.inst.op2 = op2;
-   p->store[p->num_inst++] = inst.bits;
-   assert(p->num_inst <= p->max_inst);
+   emit_instruction(p, inst.bits);
 };
 
 
@@ -250,8 +286,7 @@ emit_vxr(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
    inst.inst.vB = vB;
    inst.inst.rC = 0;
    inst.inst.op2 = op2;
-   p->store[p->num_inst++] = inst.bits;
-   assert(p->num_inst <= p->max_inst);
+   emit_instruction(p, inst.bits);
 };
 
 
@@ -277,8 +312,7 @@ emit_va(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB, uint vC)
    inst.inst.vB = vB;
    inst.inst.vC = vC;
    inst.inst.op2 = op2;
-   p->store[p->num_inst++] = inst.bits;
-   assert(p->num_inst <= p->max_inst);
+   emit_instruction(p, inst.bits);
 };
 
 
@@ -300,8 +334,7 @@ emit_i(struct ppc_function *p, uint op, uint li, uint aa, uint lk)
    inst.inst.li = li;
    inst.inst.aa = aa;
    inst.inst.lk = lk;
-   p->store[p->num_inst++] = inst.bits;
-   assert(p->num_inst <= p->max_inst);
+   emit_instruction(p, inst.bits);
 }
 
 
@@ -330,8 +363,7 @@ emit_xl(struct ppc_function *p, uint op, uint bo, uint bi, uint bh,
    inst.inst.bh = bh;
    inst.inst.op2 = op2;
    inst.inst.lk = lk;
-   p->store[p->num_inst++] = inst.bits;
-   assert(p->num_inst <= p->max_inst);
+   emit_instruction(p, inst.bits);
 }
 
 static INLINE void
@@ -373,8 +405,7 @@ emit_x(struct ppc_function *p, uint op, uint vrs, uint ra, uint rb, uint op2)
    inst.inst.rb = rb;
    inst.inst.op2 = op2;
    inst.inst.unused = 0x0;
-   p->store[p->num_inst++] = inst.bits;
-   assert(p->num_inst <= p->max_inst);
+   emit_instruction(p, inst.bits);
 }
 
 
@@ -398,8 +429,7 @@ emit_d(struct ppc_function *p, uint op, uint rt, uint ra, int si)
    inst.inst.rt = rt;
    inst.inst.ra = ra;
    inst.inst.si = (unsigned) (si & 0xffff);
-   p->store[p->num_inst++] = inst.bits;
-   assert(p->num_inst <= p->max_inst);
+   emit_instruction(p, inst.bits);
 };
 
 
@@ -428,8 +458,7 @@ emit_a(struct ppc_function *p, uint op, uint frt, uint fra, uint frb, uint op2,
    inst.inst.unused = 0x0;
    inst.inst.op2 = op2;
    inst.inst.rc = rc;
-   p->store[p->num_inst++] = inst.bits;
-   assert(p->num_inst <= p->max_inst);
+   emit_instruction(p, inst.bits);
 };
 
 
@@ -458,8 +487,7 @@ emit_xo(struct ppc_function *p, uint op, uint rt, uint ra, uint rb, uint oe,
    inst.inst.oe = oe;
    inst.inst.op2 = op2;
    inst.inst.rc = rc;
-   p->store[p->num_inst++] = inst.bits;
-   assert(p->num_inst <= p->max_inst);
+   emit_instruction(p, inst.bits);
 }
 
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
index 561e139bce..d0477dec94 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -64,6 +64,7 @@ struct ppc_function
 
 extern void ppc_init_func(struct ppc_function *p, unsigned max_inst);
 extern void ppc_release_func(struct ppc_function *p);
+extern uint ppc_num_instructions(const struct ppc_function *p);
 extern void (*ppc_get_func( struct ppc_function *p ))( void );
 extern void ppc_dump_func(const struct ppc_function *p);
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index 5d13070922..a92b1902e3 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -1315,6 +1315,14 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
 
    tgsi_parse_free( &parse );
 
+   if (ppc_num_instructions(func) == 0) {
+      /* ran out of memory for instructions */
+      ok = FALSE;
+   }
+
+   if (!ok)
+      debug_printf("TGSI->PPC translation failed\n");
+
    return ok;
 }
 
-- 
cgit v1.2.3


From 725ba94ce5701aa8690c7ab2ea792dda86cbbe7a Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 29 Oct 2008 16:35:59 -0600
Subject: gallium: no longer pass max_inst to ppc_init_func()

---
 src/gallium/auxiliary/draw/draw_vs_ppc.c | 2 +-
 src/gallium/auxiliary/rtasm/rtasm_ppc.c  | 2 +-
 src/gallium/auxiliary/rtasm/rtasm_ppc.h  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c
index d720c7bbd5..8b75136144 100644
--- a/src/gallium/auxiliary/draw/draw_vs_ppc.c
+++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c
@@ -197,7 +197,7 @@ draw_create_vs_ppc(struct draw_context *draw,
    vs->base.immediates = align_malloc(TGSI_EXEC_NUM_IMMEDIATES * 4 *
                                       sizeof(float), 16);
 
-   ppc_init_func( &vs->ppc_program, 2000 ); /* XXX fix limit */
+   ppc_init_func( &vs->ppc_program );
 
    if (!tgsi_emit_ppc( (struct tgsi_token *) vs->base.state.tokens,
 			&vs->ppc_program, 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index e73ed71a0b..6d11263be8 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -43,7 +43,7 @@
 
 
 void
-ppc_init_func(struct ppc_function *p, unsigned max_inst)
+ppc_init_func(struct ppc_function *p)
 {
    uint i;
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
index d0477dec94..afb4704c39 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -62,7 +62,7 @@ struct ppc_function
 
 
-extern void ppc_init_func(struct ppc_function *p, unsigned max_inst);
+extern void ppc_init_func(struct ppc_function *p);
 extern void ppc_release_func(struct ppc_function *p);
 extern uint ppc_num_instructions(const struct ppc_function *p);
 extern void (*ppc_get_func( struct ppc_function *p ))( void );
-- 
cgit v1.2.3


From f952aac1da432336f330122cacc30a87f52b4101 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 29 Oct 2008 16:56:28 -0600
Subject: gallium: grow SPE instruction buffer as needed

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 57 +++++++++++++++++++++--------
 1 file changed, 41 insertions(+), 16 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index dea1aed032..f8568f690b 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -185,6 +185,34 @@ reg_name(int reg)
 }
 
 
+static void
+emit_instruction(struct spe_function *p, uint32_t inst_bits)
+{
+   if (!p->store)
+      return;  /* out of memory, drop the instruction */
+
+   if (p->num_inst == p->max_inst) {
+      /* allocate larger buffer */
+      uint32_t *newbuf;
+      p->max_inst *= 2;  /* 2x larger */
+      newbuf = align_malloc(p->max_inst * SPE_INST_SIZE, 16);
+      if (newbuf) {
+         memcpy(newbuf, p->store, p->num_inst * SPE_INST_SIZE);
+      }
+      align_free(p->store);
+      p->store = newbuf;
+      if (!p->store) {
+         /* out of memory */
+         p->num_inst = 0;
+         return;
+      }
+   }
+
+   p->store[p->num_inst++] = inst_bits;
+}
+
+
+
 static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
 		    unsigned rA, unsigned rB, const char *name)
 {
@@ -193,8 +221,7 @@ static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rB = rB;
     inst.inst.rA = rA;
     inst.inst.rT = rT;
-    p->store[p->num_inst++] = inst.bits;
-    assert(p->num_inst <= p->max_inst);
+    emit_instruction(p, inst.bits);
     if (p->print) {
        indent(p);
        printf("%s\t%s, %s, %s\n",
@@ -212,8 +239,7 @@ static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rB = rB;
     inst.inst.rA = rA;
     inst.inst.rC = rC;
-    p->store[p->num_inst++] = inst.bits;
-    assert(p->num_inst <= p->max_inst);
+    emit_instruction(p, inst.bits);
     if (p->print) {
        indent(p);
        printf("%s\t%s, %s, %s, %s\n", rem_prefix(name), reg_name(rT),
@@ -230,8 +256,7 @@ static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.i7 = imm;
     inst.inst.rA = rA;
     inst.inst.rT = rT;
-    p->store[p->num_inst++] = inst.bits;
-    assert(p->num_inst <= p->max_inst);
+    emit_instruction(p, inst.bits);
     if (p->print) {
        indent(p);
        printf("%s\t%s, %s, 0x%x\n",
@@ -249,8 +274,7 @@ static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.i8 = imm;
     inst.inst.rA = rA;
     inst.inst.rT = rT;
-    p->store[p->num_inst++] = inst.bits;
-    assert(p->num_inst <= p->max_inst);
+    emit_instruction(p, inst.bits);
     if (p->print) {
        indent(p);
        printf("%s\t%s, %s, 0x%x\n",
@@ -268,8 +292,7 @@ static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.i10 = imm;
     inst.inst.rA = rA;
     inst.inst.rT = rT;
-    p->store[p->num_inst++] = inst.bits;
-    assert(p->num_inst <= p->max_inst);
+    emit_instruction(p, inst.bits);
     if (p->print) {
        indent(p);
        printf("%s\t%s, %s, 0x%x\n",
@@ -295,8 +318,7 @@ static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.op = op;
     inst.inst.i16 = imm;
     inst.inst.rT = rT;
-    p->store[p->num_inst++] = inst.bits;
-    assert(p->num_inst <= p->max_inst);
+    emit_instruction(p, inst.bits);
     if (p->print) {
        indent(p);
        printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm);
@@ -311,8 +333,7 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.op = op;
     inst.inst.i18 = imm;
     inst.inst.rT = rT;
-    p->store[p->num_inst++] = inst.bits;
-    assert(p->num_inst <= p->max_inst);
+    emit_instruction(p, inst.bits);
     if (p->print) {
        indent(p);
        printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm);
@@ -394,15 +415,19 @@ void _name (struct spe_function *p, int imm) \
 
 /**
  * Initialize an spe_function.
- * \param code_size  size of instruction buffer to allocate, in bytes.
+ * \param code_size  initial size of instruction buffer to allocate, in bytes.
+ *                   If zero, use a default.
  */
 void spe_init_func(struct spe_function *p, unsigned code_size)
 {
     unsigned int i;
 
-    p->store = align_malloc(code_size, 16);
+    if (!code_size)
+       code_size = 64;
+
     p->num_inst = 0;
     p->max_inst = code_size / SPE_INST_SIZE;
+    p->store = align_malloc(code_size, 16);
 
     p->set_count = 0;
     memset(p->regs, 0, SPE_NUM_REGS * sizeof(p->regs[0]));
-- 
cgit v1.2.3


From 90027f85786406133a5180998a75fb612b6a221e Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Tue, 11 Nov 2008 13:57:10 -0700
Subject: CELL: two-sided stencil fixes

With these changes, the tests/stencil_twoside test now works.

- Eliminate blending from the stencil_twoside test, as it produces an
  unneeded dependency on having blending working

- The spe_splat() function will now work if the register being splatted
  and the destination register are the same

- Separate fragment code generated for front-facing and back-facing
  fragments.  Often these are the same; if two-sided stenciling is on,
  they can be different.  This is easier and faster than generating
  code that does both tests and merges the results.

- Fixed a cut/paste bug where if the back Z-pass stencil operation
  were different from all the other operations, the back Z-fail
  results were incorrect.
---
 progs/tests/stencil_twoside.c                      |   2 -
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c        |   7 +-
 src/gallium/drivers/cell/common.h                  |   6 +-
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c   | 239 ++++++---------------
 src/gallium/drivers/cell/ppu/cell_gen_fragment.h   |   2 +-
 src/gallium/drivers/cell/ppu/cell_state_emit.c     |  19 +-
 src/gallium/drivers/cell/spu/spu_command.c         |   6 +-
 src/gallium/drivers/cell/spu/spu_main.c            |   6 +-
 src/gallium/drivers/cell/spu/spu_main.h            |  10 +-
 src/gallium/drivers/cell/spu/spu_per_fragment_op.c |   3 +-
 src/gallium/drivers/cell/spu/spu_per_fragment_op.h |   3 +-
 src/gallium/drivers/cell/spu/spu_tri.c             |  20 +-
 12 files changed, 115 insertions(+), 208 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/progs/tests/stencil_twoside.c b/progs/tests/stencil_twoside.c
index be9d9a776a..8826c46fc2 100644
--- a/progs/tests/stencil_twoside.c
+++ b/progs/tests/stencil_twoside.c
@@ -115,7 +115,6 @@ static void Display( void )
    glVertex2f(-1,  1);
    glEnd();
 
-
    if (use20syntax) {
       stencil_func_separate(GL_FRONT, GL_ALWAYS, 0, ~0);
       stencil_func_separate(GL_BACK, GL_ALWAYS, 0, ~0);
@@ -279,7 +278,6 @@ static void Init( void )
    stencil_op_separate = glutGetProcAddress( "glStencilOpSeparate" );
 
    printf("\nAll 5 squares should be the same color.\n");
-   glEnable( GL_BLEND );
 }
 
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index f8568f690b..1bd9f1c8dd 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -958,9 +958,12 @@ spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsig
 void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
 {
+   /* Use a temporary, just in case rT == rA */
+   unsigned int tmp_reg = spe_allocate_available_register(p);
    /* Duplicate bytes 0, 1, 2, and 3 across the whole register */
-   spe_ila(p, rT, 0x00010203);
-   spe_shufb(p, rT, rA, rA, rT);
+   spe_ila(p, tmp_reg, 0x00010203);
+   spe_shufb(p, rT, rA, rA, tmp_reg);
+   spe_release_register(p, tmp_reg);
 }
 
 
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 87488ea2d7..a670ed3c6e 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -130,6 +130,9 @@
 #define CELL_FENCE_EMITTED   1
 #define CELL_FENCE_SIGNALLED 2
 
+#define CELL_FACING_FRONT    0
+#define CELL_FACING_BACK     1
+
 struct cell_fence
 {
    /** There's a 16-byte status qword per SPU */
@@ -160,7 +163,8 @@ struct cell_command_fragment_ops
    struct pipe_depth_stencil_alpha_state dsa;
    struct pipe_blend_state blend;
    struct pipe_blend_color blend_color;
-   unsigned code[SPU_MAX_FRAGMENT_OPS_INSTS];
+   unsigned code_front[SPU_MAX_FRAGMENT_OPS_INSTS];
+   unsigned code_back[SPU_MAX_FRAGMENT_OPS_INSTS];
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index d9c3ff3f4d..6e425eafaa 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -1412,144 +1412,72 @@ gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
  * and released by the corresponding spe_release_register_set() call.
  */
 static void
-gen_get_stencil_values(struct spe_function *f, const struct pipe_depth_stencil_alpha_state *dsa,
+gen_get_stencil_values(struct spe_function *f, const struct pipe_stencil_state *stencil,
+                       const unsigned int depth_enabled,
                        unsigned int fbS_reg, 
                        unsigned int *fail_reg, unsigned int *zfail_reg, 
-                       unsigned int *zpass_reg, unsigned int *back_fail_reg, 
-                       unsigned int *back_zfail_reg, unsigned int *back_zpass_reg)
+                       unsigned int *zpass_reg)
 {
-   unsigned zfail_op, back_zfail_op;
+   unsigned zfail_op;
 
    /* Stenciling had better be enabled here */
-   ASSERT(dsa->stencil[0].enabled);
+   ASSERT(stencil->enabled);
 
    /* If the depth test is not enabled, it is treated as though it always
-    * passes.  In particular, that means that the "zfail_op" (and the backfacing
-    * counterpart, if active) are not considered - a failing stencil test will
-    * trigger the "fail_op", and a passing stencil test will trigger the
-    * "zpass_op".
+    * passes, which means that the zfail_op is not considered - a
+    * failing stencil test triggers the fail_op, and a passing one
+    * triggers the zpass_op.
     *
-    * By overriding the operations in this case to be PIPE_STENCIL_OP_KEEP,
-    * we keep them from being calculated.
+    * As an optimization, override calculation of the zfail_op values
+    * if they aren't going to be used.  By setting the value of
+    * the operation to PIPE_STENCIL_OP_KEEP, its value will be assumed
+    * to match the incoming stencil values, and no calculation will
+    * be done.
     */
-   if (dsa->depth.enabled) {
-      zfail_op = dsa->stencil[0].zfail_op;
-      back_zfail_op = dsa->stencil[1].zfail_op;
+   if (depth_enabled) {
+      zfail_op = stencil->zfail_op;
    }
    else {
       zfail_op = PIPE_STENCIL_OP_KEEP;
-      back_zfail_op = PIPE_STENCIL_OP_KEEP;
    }
 
    /* One-sided or front-facing stencil */
-   if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP) {
+   if (stencil->fail_op == PIPE_STENCIL_OP_KEEP) {
       *fail_reg = fbS_reg;
    }
    else {
       *fail_reg = spe_allocate_available_register(f);
-      gen_stencil_values(f, dsa->stencil[0].fail_op, dsa->stencil[0].ref_value, 
+      gen_stencil_values(f, stencil->fail_op, stencil->ref_value, 
          0xff, fbS_reg, *fail_reg);
    }
 
+   /* Check the possibly overridden value, not the structure value */
    if (zfail_op == PIPE_STENCIL_OP_KEEP) {
       *zfail_reg = fbS_reg;
    }
-   else if (zfail_op == dsa->stencil[0].fail_op) {
+   else if (zfail_op == stencil->fail_op) {
       *zfail_reg = *fail_reg;
    }
    else {
       *zfail_reg = spe_allocate_available_register(f);
-      gen_stencil_values(f, dsa->stencil[0].zfail_op, dsa->stencil[0].ref_value, 
+      gen_stencil_values(f, stencil->zfail_op, stencil->ref_value, 
          0xff, fbS_reg, *zfail_reg);
    }
 
-   if (dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP) {
+   if (stencil->zpass_op == PIPE_STENCIL_OP_KEEP) {
       *zpass_reg = fbS_reg;
    }
-   else if (dsa->stencil[0].zpass_op == dsa->stencil[0].fail_op) {
+   else if (stencil->zpass_op == stencil->fail_op) {
       *zpass_reg = *fail_reg;
    }
-   else if (dsa->stencil[0].zpass_op == zfail_op) {
+   else if (stencil->zpass_op == zfail_op) {
       *zpass_reg = *zfail_reg;
    }
    else {
       *zpass_reg = spe_allocate_available_register(f);
-      gen_stencil_values(f, dsa->stencil[0].zpass_op, dsa->stencil[0].ref_value, 
+      gen_stencil_values(f, stencil->zpass_op, stencil->ref_value, 
          0xff, fbS_reg, *zpass_reg);
    }
-
-   /* If two-sided stencil is enabled, we have more work to do. */
-   if (!dsa->stencil[1].enabled) {
-      /* This just flags that the registers need not be deallocated later */
-      *back_fail_reg = fbS_reg;
-      *back_zfail_reg = fbS_reg;
-      *back_zpass_reg = fbS_reg;
-   }
-   else {
-      /* Same calculations as above, but for the back stencil */
-      if (dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP) {
-         *back_fail_reg = fbS_reg;
-      }
-      else if (dsa->stencil[1].fail_op == dsa->stencil[0].fail_op) {
-         *back_fail_reg = *fail_reg;
-      }
-      else if (dsa->stencil[1].fail_op == zfail_op) {
-         *back_fail_reg = *zfail_reg;
-      }
-      else if (dsa->stencil[1].fail_op == dsa->stencil[0].zpass_op) {
-         *back_fail_reg = *zpass_reg;
-      }
-      else {
-         *back_fail_reg = spe_allocate_available_register(f);
-         gen_stencil_values(f, dsa->stencil[1].fail_op, dsa->stencil[1].ref_value, 
-            0xff, fbS_reg, *back_fail_reg);
-      }
-
-      if (back_zfail_op == PIPE_STENCIL_OP_KEEP) {
-         *back_zfail_reg = fbS_reg;
-      }
-      else if (back_zfail_op == dsa->stencil[0].fail_op) {
-         *back_zfail_reg = *fail_reg;
-      }
-      else if (back_zfail_op == zfail_op) {
-         *back_zfail_reg = *zfail_reg;
-      }
-      else if (back_zfail_op == dsa->stencil[0].zpass_op) {
-         *back_zfail_reg = *zpass_reg;
-      }
-      else if (back_zfail_op == dsa->stencil[1].fail_op) {
-         *back_zfail_reg = *back_fail_reg;
-      }
-      else {
-         *back_zfail_reg = spe_allocate_available_register(f);
-         gen_stencil_values(f, dsa->stencil[1].zfail_op, dsa->stencil[1].ref_value, 
-            0xff, fbS_reg, *back_zfail_reg);
-      }
-
-      if (dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) {
-         *back_zpass_reg = fbS_reg;
-      }
-      else if (dsa->stencil[1].zpass_op == dsa->stencil[0].fail_op) {
-         *back_zpass_reg = *fail_reg;
-      }
-      else if (dsa->stencil[1].zpass_op == zfail_op) {
-         *back_zpass_reg = *zfail_reg;
-      }
-      else if (dsa->stencil[1].zpass_op == dsa->stencil[0].zpass_op) {
-         *back_zpass_reg = *zpass_reg;
-      }
-      else if (dsa->stencil[1].zpass_op == dsa->stencil[1].fail_op) {
-         *back_zpass_reg = *back_fail_reg;
-      }
-      else if (dsa->stencil[1].zpass_op == back_zfail_op) {
-         *back_zpass_reg = *back_zfail_reg;
-      }
-      else {
-         *back_zfail_reg = spe_allocate_available_register(f);
-         gen_stencil_values(f, dsa->stencil[1].zpass_op, dsa->stencil[1].ref_value, 
-            0xff, fbS_reg, *back_zpass_reg);
-      }
-   } /* End of calculations for back-facing stencil */
 }
 
 /* Note that fbZ_reg may *not* be set on entry, if in fact
@@ -1559,7 +1487,7 @@ gen_get_stencil_values(struct spe_function *f, const struct pipe_depth_stencil_a
 static boolean
 gen_stencil_depth_test(struct spe_function *f, 
                        const struct pipe_depth_stencil_alpha_state *dsa, 
-                       const int const facing_reg,
+                       const uint facing,
                        const int mask_reg, const int fragZ_reg, 
                        const int fbZ_reg, const int fbS_reg)
 {
@@ -1571,6 +1499,8 @@ gen_stencil_depth_test(struct spe_function *f,
    boolean need_to_calculate_stencil_values;
    boolean need_to_writemask_stencil_values;
 
+   const struct pipe_stencil_state *stencil;  /* read-only view into the const dsa */
+
    /* Registers.  We may or may not actually allocate these, depending
     * on whether the state values indicate that we need them.
     */
@@ -1598,6 +1528,20 @@ gen_stencil_depth_test(struct spe_function *f,
    spe_comment(f, 0, "Allocating stencil register set");
    spe_allocate_register_set(f);
 
+   /* The facing we're given is the fragment facing; it doesn't
+    * exactly match the stencil facing.  If stencil is enabled,
+    * but two-sided stencil is *not* enabled, we use the same
+    * stencil settings for both front- and back-facing fragments.
+    * We only use the "back-facing" stencil for backfacing fragments
+    * if two-sided stenciling is enabled.
+    */
+   if (facing == CELL_FACING_BACK && dsa->stencil[1].enabled) {
+      stencil = &dsa->stencil[1];
+   }
+   else {
+      stencil = &dsa->stencil[0];
+   }
+
    /* Calculate the writemask.  If the writemask is trivial (either
     * all 0s, meaning that we don't need to calculate any stencil values
     * because they're not going to change the stencil anyway, or all 1s,
@@ -1608,24 +1552,20 @@ gen_stencil_depth_test(struct spe_function *f,
     * Note that if the backface stencil is *not* enabled, the backface
     * stencil will have the same values as the frontface stencil.
     */
-   if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP &&
-       dsa->stencil[0].zfail_op == PIPE_STENCIL_OP_KEEP &&
-       dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP &&
-       dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP &&
-       dsa->stencil[1].zfail_op == PIPE_STENCIL_OP_KEEP &&
-       dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) {
-       /* No changes to any stencil values */
+   if (stencil->fail_op == PIPE_STENCIL_OP_KEEP &&
+       stencil->zfail_op == PIPE_STENCIL_OP_KEEP &&
+       stencil->zpass_op == PIPE_STENCIL_OP_KEEP) {
        need_to_calculate_stencil_values = false;
        need_to_writemask_stencil_values = false;
     }
-    else if (dsa->stencil[0].write_mask == 0x0 && dsa->stencil[1].write_mask == 0x0) {
+    else if (stencil->write_mask == 0x0) {
       /* All changes are writemasked out, so no need to calculate
        * what those changes might be, and no need to write anything back.
        */
       need_to_calculate_stencil_values = false;
       need_to_writemask_stencil_values = false;
    }
-   else if (dsa->stencil[0].write_mask == 0xff && dsa->stencil[1].write_mask == 0xff) {
+   else if (stencil->write_mask == 0xff) {
       /* Still trivial, but a little less so.  We need to write the stencil
        * values, but we don't need to mask them.
        */
@@ -1645,14 +1585,7 @@ gen_stencil_depth_test(struct spe_function *f,
        */
       spe_comment(f, 0, "Computing stencil writemask");
       stencil_writemask_reg = spe_allocate_available_register(f);
-      spe_load_uint(f, stencil_writemask_reg, dsa->stencil[0].write_mask);
-      if (dsa->stencil[1].enabled && dsa->stencil[0].write_mask != dsa->stencil[1].write_mask) {
-         unsigned int back_write_mask_reg = spe_allocate_available_register(f);
-         spe_comment(f, 0, "Resolving two-sided stencil writemask");
-         spe_load_uint(f, back_write_mask_reg, dsa->stencil[1].write_mask);
-         spe_selb(f, stencil_writemask_reg, stencil_writemask_reg, back_write_mask_reg, facing_reg);
-         spe_release_register(f, back_write_mask_reg);
-      }
+      spe_load_uint(f, stencil_writemask_reg, stencil->write_mask);  /* stencil[0] unless two-sided is on */
    }
 
    /* At least one-sided stenciling must be on.  Generate code that
@@ -1666,19 +1599,7 @@ gen_stencil_depth_test(struct spe_function *f,
     */
    spe_comment(f, 0, "Running basic stencil test");
    stencil_pass_reg = spe_allocate_available_register(f);
-   gen_stencil_test(f, &dsa->stencil[0], 0xff, mask_reg, fbS_reg, stencil_pass_reg);
-
-   /* If two-sided stenciling is on, generate code to run the stencil
-    * test on the backfacing stencil as well, and combine the two results
-    * into the one correct result based on facing.
-    */
-   if (dsa->stencil[1].enabled) {
-      unsigned int temp_reg = spe_allocate_available_register(f);
-      spe_comment(f, 0, "Running backface stencil test");
-      gen_stencil_test(f, &dsa->stencil[1], 0xff, mask_reg, fbS_reg, temp_reg);
-      spe_selb(f, stencil_pass_reg, stencil_pass_reg, temp_reg, facing_reg);
-      spe_release_register(f, temp_reg);
-   }
+   gen_stencil_test(f, stencil, 0xff, mask_reg, fbS_reg, stencil_pass_reg);
 
    /* Generate code that, given the mask of valid fragments and the
     * mask of valid fragments that passed the stencil test, computes
@@ -1698,9 +1619,6 @@ gen_stencil_depth_test(struct spe_function *f,
 
    /* We may not need to calculate stencil values, if the writemask is off */
    if (need_to_calculate_stencil_values) {
-      unsigned int back_stencil_fail_values, back_stencil_pass_depth_fail_values, back_stencil_pass_depth_pass_values;
-      unsigned int front_stencil_fail_values, front_stencil_pass_depth_fail_values, front_stencil_pass_depth_pass_values;
-
       /* Generate code that calculates exactly which stencil values we need,
        * without calculating the same value twice (say, if two different
        * stencil ops have the same value).  This code will work for one-sided
@@ -1715,51 +1633,11 @@ gen_stencil_depth_test(struct spe_function *f,
        * This function will allocate a variant number of registers that
        * will be released as part of the register set.
        */
-      spe_comment(f, 0, "Computing stencil values");
-      gen_get_stencil_values(f, dsa, fbS_reg, 
-         &front_stencil_fail_values, &front_stencil_pass_depth_fail_values, 
-         &front_stencil_pass_depth_pass_values, &back_stencil_fail_values, 
-         &back_stencil_pass_depth_fail_values, &back_stencil_pass_depth_pass_values);
-
-      /* Tricky, tricky, tricky - the things we do to create optimal
-       * code...
-       *
-       * The various stencil values registers may overlap with each other
-       * and with fbS_reg arbitrarily (as any particular operation is
-       * only calculated once and stored in one register, no matter
-       * how many times it is used).  So we can't change the values 
-       * within those registers directly - if we change a value in a
-       * register that's being referenced by two different calculations,
-       * we've just unwittingly changed the second value as well...
-       *
-       * Avoid this by allocating new registers to hold the results
-       * (there may be 2, if the depth test is off, or 3, if it is on).
-       * These will be released as part of the register set.
-       */
-      if (!dsa->stencil[1].enabled) {
-         /* The easy case: if two-sided stenciling is *not* enabled, we
-          * just use the front-sided values.
-          */
-         stencil_fail_values = front_stencil_fail_values;
-         stencil_pass_depth_fail_values = front_stencil_pass_depth_fail_values;
-         stencil_pass_depth_pass_values = front_stencil_pass_depth_pass_values;
-      }
-      else { /* two-sided stencil enabled */
-         spe_comment(f, 0, "Resolving backface stencil values");
-         /* Allocate new registers for the needed merged values */
-         stencil_fail_values = spe_allocate_available_register(f);
-         spe_selb(f, stencil_fail_values, front_stencil_fail_values, back_stencil_fail_values, facing_reg);
-         if (dsa->depth.enabled) {
-            stencil_pass_depth_fail_values = spe_allocate_available_register(f);
-            spe_selb(f, stencil_pass_depth_fail_values, front_stencil_pass_depth_fail_values, back_stencil_pass_depth_fail_values, facing_reg);
-         }
-         else {
-            stencil_pass_depth_fail_values = fbS_reg;
-         }
-         stencil_pass_depth_pass_values = spe_allocate_available_register(f);
-         spe_selb(f, stencil_pass_depth_pass_values, front_stencil_pass_depth_pass_values, back_stencil_pass_depth_pass_values, facing_reg);
-      }
-   }
+      spe_comment(f, 0, facing == CELL_FACING_FRONT ? "Computing front-facing stencil values" : "Computing back-facing stencil values");
+      gen_get_stencil_values(f, stencil, dsa->depth.enabled, fbS_reg, 
+         &stencil_fail_values, &stencil_pass_depth_fail_values, 
+         &stencil_pass_depth_pass_values);
+   }  
 
    /* We now have all the stencil values we need.  We also need 
     * the results of the depth test to figure out which
@@ -1896,10 +1774,12 @@ gen_stencil_depth_test(struct spe_function *f,
  * should be much faster.
  *
  * \param cell  the rendering context (in)
+ * \param facing whether the generated code is for front-facing or 
+ *              back-facing fragments
  * \param f     the generated function (out)
  */
 void
-cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
+cell_gen_fragment_function(struct cell_context *cell, uint facing, struct spe_function *f)
 {
    const struct pipe_depth_stencil_alpha_state *dsa = cell->depth_stencil;
    const struct pipe_blend_state *blend = cell->blend;
@@ -1917,7 +1797,8 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    const int fragB_reg = 10;  /* vector float */
    const int fragA_reg = 11;  /* vector float */
    const int mask_reg = 12;   /* vector uint */
-   const int facing_reg = 13; /* uint */
+
+   ASSERT(facing == CELL_FACING_FRONT || facing == CELL_FACING_BACK);
 
    /* offset of quad from start of tile
     * XXX assuming 4-byte pixels for color AND Z/stencil!!!!
@@ -1945,7 +1826,6 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    spe_allocate_register(f, fragB_reg);
    spe_allocate_register(f, fragA_reg);
    spe_allocate_register(f, mask_reg);
-   spe_allocate_register(f, facing_reg);
 
    quad_offset_reg = spe_allocate_available_register(f);
    fbRGBA_reg = spe_allocate_available_register(f);
@@ -1969,6 +1849,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       spe_release_register(f, y2_reg);
    }
 
+   /* Generate the alpha test, if needed. */
    if (dsa->alpha.enabled) {
       gen_alpha_test(dsa, f, mask_reg, fragA_reg);
    }
@@ -2095,7 +1976,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
           * gen_stencil_depth_test() function must ignore the
           * fbZ_reg register if depth is not enabled.
           */
-         write_depth_stencil = gen_stencil_depth_test(f, dsa, facing_reg, mask_reg, fragZ_reg, fbZ_reg, fbS_reg);
+         write_depth_stencil = gen_stencil_depth_test(f, dsa, facing, mask_reg, fragZ_reg, fbZ_reg, fbS_reg);
       }
       else if (dsa->depth.enabled) {
          int zmask_reg = spe_allocate_available_register(f);
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
index b59de198dc..2fabfdfb08 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
@@ -31,7 +31,7 @@
 
 
 extern void
-cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f);
+cell_gen_fragment_function(struct cell_context *cell, uint facing, struct spe_function *f);
 
 
 #endif /* CELL_GEN_FRAGMENT_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index dd2d7f7d1e..031b27f11f 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -75,23 +75,29 @@ lookup_fragment_ops(struct cell_context *cell)
     * If not found, create/save new fragment ops command.
     */
    if (!ops) {
-      struct spe_function spe_code;
+      struct spe_function spe_code_front, spe_code_back;
 
       if (0)
          debug_printf("**** Create New Fragment Ops\n");
 
       /* Prepare the buffer that will hold the generated code. */
-      spe_init_func(&spe_code, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+      spe_init_func(&spe_code_front, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+      spe_init_func(&spe_code_back, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
 
-      /* generate new code */
-      cell_gen_fragment_function(cell, &spe_code);
+      /* generate new code.  Always generate new code for both front-facing
+       * and back-facing fragments, even if it's the same code in both
+       * cases.
+       */
+      cell_gen_fragment_function(cell, CELL_FACING_FRONT, &spe_code_front);
+      cell_gen_fragment_function(cell, CELL_FACING_BACK, &spe_code_back);
 
       /* alloc new fragment ops command */
       ops = CALLOC_STRUCT(cell_command_fragment_ops);
 
       /* populate the new cell_command_fragment_ops object */
       ops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
-      memcpy(ops->code, spe_code.store, spe_code_size(&spe_code));
+      memcpy(ops->code_front, spe_code_front.store, spe_code_size(&spe_code_front));
+      memcpy(ops->code_back, spe_code_back.store, spe_code_size(&spe_code_back));
       ops->dsa = *cell->depth_stencil;
       ops->blend = *cell->blend;
 
@@ -99,7 +105,8 @@ lookup_fragment_ops(struct cell_context *cell)
       util_keymap_insert(cell->fragment_ops_cache, &key, ops, NULL);
 
       /* release rtasm buffer */
-      spe_release_func(&spe_code);
+      spe_release_func(&spe_code_front);
+      spe_release_func(&spe_code_back);
    }
    else {
       if (0)
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index d726622d94..d5faf4e3aa 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -214,7 +214,8 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
 
    D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_OPS\n");
    /* Copy SPU code from batch buffer to spu buffer */
-   memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
+   memcpy(spu.fragment_ops_code_front, fops->code_front, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
+   memcpy(spu.fragment_ops_code_back, fops->code_back, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
    /* Copy state info (for fallback case only) */
    memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
    memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
@@ -234,7 +235,8 @@ cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
     * raw state records that the fallback code requires.
     */
    if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) == 0) {
-      spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code;
+      spu.fragment_ops[CELL_FACING_FRONT] = (spu_fragment_ops_func) spu.fragment_ops_code_front;
+      spu.fragment_ops[CELL_FACING_BACK] = (spu_fragment_ops_func) spu.fragment_ops_code_back;
    }
    else {
       /* otherwise, the default fallback code remains in place */
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index c8bb251905..7033f6037d 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -63,7 +63,8 @@ one_time_init(void)
     * This will normally be overriden by a code-gen'd function
     * unless CELL_FORCE_FRAGMENT_OPS_FALLBACK is set.
     */
-   spu.fragment_ops = spu_fallback_fragment_ops;
+   spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
+   spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
 }
 
 
@@ -90,7 +91,8 @@ main(main_param_t speid, main_param_t argp)
 
    ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4);
    ASSERT(sizeof(struct cell_command_render) % 8 == 0);
-   ASSERT(((unsigned long) &spu.fragment_ops_code) % 8 == 0);
+   ASSERT(((unsigned long) &spu.fragment_ops_code_front) % 8 == 0);
+   ASSERT(((unsigned long) &spu.fragment_ops_code_back) % 8 == 0);
    ASSERT(((unsigned long) &spu.fragment_program_code) % 8 == 0);
 
    one_time_init();
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 692790c9f3..24cf7d77ce 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -85,8 +85,7 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y,
                                       vector float fragGreen,
                                       vector float fragBlue,
                                       vector float fragAlpha,
-                                      vector unsigned int mask,
-                                      uint facing);
+                                      vector unsigned int mask);
 
 /** Function for running fragment program */
 typedef vector unsigned int (*spu_fragment_program_func)(vector float *inputs,
@@ -170,9 +169,10 @@ struct spu_global
    ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
    /** Current fragment ops machine code, at 8-byte boundary */
-   uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB;
-   /** Current fragment ops function */
-   spu_fragment_ops_func fragment_ops;
+   uint fragment_ops_code_front[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB;
+   uint fragment_ops_code_back[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB;
+   /** Current fragment ops functions, 0 = frontfacing, 1 = backfacing */
+   spu_fragment_ops_func fragment_ops[2];
 
    /** Current fragment program machine code, at 8-byte boundary */
    uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS] ALIGN8_ATTRIB;
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index f8ffc70492..683664e8a4 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -75,8 +75,7 @@ spu_fallback_fragment_ops(uint x, uint y,
                           vector float fragG,
                           vector float fragB,
                           vector float fragA,
-                          vector unsigned int mask,
-                          uint facing)
+                          vector unsigned int mask)
 {
    vector float frag_aos[4];
    unsigned int fbc0, fbc1, fbc2, fbc3 ; /* framebuffer/tile colors */
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
index a61689c83a..f817abf046 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
@@ -38,8 +38,7 @@ spu_fallback_fragment_ops(uint x, uint y,
                           vector float fragGreen,
                           vector float fragBlue,
                           vector float fragAlpha,
-                          vector unsigned int mask,
-                          uint facing);
+                          vector unsigned int mask);
 
 
 #endif /* SPU_PER_FRAGMENT_OP */
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 5f908159bb..22e51a86ae 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -275,15 +275,20 @@ emit_quad( int x, int y, mask_t mask)
 
          /* Execute per-fragment/quad operations, including:
           * alpha test, z test, stencil test, blend and framebuffer writing.
+          * Note that there are two different fragment operations functions
+          * that can be called, one for front-facing fragments, and one
+          * for back-facing fragments.  (Often the two are the same;
+          * but in some cases, like two-sided stenciling, they can be
+          * very different.)  So choose the correct function depending
+          * on the calculated facing.
           */
-         spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
+         spu.fragment_ops[setup.facing](ix, iy, &spu.ctile, &spu.ztile,
                           fragZ,
                           outputs[0*4+0],
                           outputs[0*4+1],
                           outputs[0*4+2],
                           outputs[0*4+3],
-                          mask,
-                          setup.facing);
+                          mask);
       }
    }
 }
@@ -519,7 +524,14 @@ setup_sort_vertices(const struct vertex_header *v0,
 
    setup.oneOverArea = 1.0f / area;
 
-   /* The product of area * sign indicates front/back orientation (0/1) */
+   /* The product of area * sign indicates front/back orientation (0/1).
+    * Just in case someone gets the bright idea of switching the front
+    * and back constants without noticing that we're assuming their
+    * values in this operation, also assert that the values are
+    * what we think they are.
+    */
+   ASSERT(CELL_FACING_FRONT == 0);
+   ASSERT(CELL_FACING_BACK == 1);
    setup.facing = (area * sign > 0.0f)
       ^ (spu.rasterizer.front_winding == PIPE_WINDING_CW);
 
-- 
cgit v1.2.3


From 8fee30064e35488bccf8e6e7478d56ca783ebac1 Mon Sep 17 00:00:00 2001
From: Michal Krol <michal@tungstengraphics.com>
Date: Wed, 12 Nov 2008 18:13:58 +0100
Subject: rtasm: Compile only for GALLIUM_CELL.

---
 src/gallium/auxiliary/rtasm/rtasm_ppc.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index 6d11263be8..5e7bc02ed3 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -42,6 +42,8 @@
 #include "rtasm_ppc.h"
 
 
+#ifdef GALLIUM_CELL
+
 void
 ppc_init_func(struct ppc_function *p)
 {
@@ -957,3 +959,5 @@ ppc_return(struct ppc_function *p)
 {
    ppc_bclr(p, BRANCH_COND_ALWAYS, BRANCH_HINT_SUB_RETURN, 0);
 }
+
+#endif /* GALLIUM_CELL */
-- 
cgit v1.2.3


From 87f77105ce7207d601ee95bc29ca8c0ea1731d78 Mon Sep 17 00:00:00 2001
From: Michal Krol <michal@tungstengraphics.com>
Date: Wed, 12 Nov 2008 18:44:20 +0100
Subject: rtasm: Use INLINE keyword. Compile for all platforms, not only
 GALLIUM_CELL.

---
 src/gallium/auxiliary/rtasm/rtasm_ppc.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index 5e7bc02ed3..b65bfa7bbd 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -42,8 +42,6 @@
 #include "rtasm_ppc.h"
 
 
-#ifdef GALLIUM_CELL
-
 void
 ppc_init_func(struct ppc_function *p)
 {
@@ -253,7 +251,7 @@ union vx_inst {
    } inst;
 };
 
-static inline void
+static INLINE void
 emit_vx(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
 {
    union vx_inst inst;
@@ -278,7 +276,7 @@ union vxr_inst {
    } inst;
 };
 
-static inline void
+static INLINE void
 emit_vxr(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
 {
    union vxr_inst inst;
@@ -304,7 +302,7 @@ union va_inst {
    } inst;
 };
 
-static inline void
+static INLINE void
 emit_va(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB, uint vC)
 {
    union va_inst inst;
@@ -421,7 +419,7 @@ union d_inst {
    } inst;
 };
 
-static inline void
+static INLINE void
 emit_d(struct ppc_function *p, uint op, uint rt, uint ra, int si)
 {
    union d_inst inst;
@@ -448,7 +446,7 @@ union a_inst {
    } inst;
 };
 
-static inline void
+static INLINE void
 emit_a(struct ppc_function *p, uint op, uint frt, uint fra, uint frb, uint op2,
        uint rc)
 {
@@ -959,5 +957,3 @@ ppc_return(struct ppc_function *p)
 {
    ppc_bclr(p, BRANCH_COND_ALWAYS, BRANCH_HINT_SUB_RETURN, 0);
 }
-
-#endif /* GALLIUM_CELL */
-- 
cgit v1.2.3


From 7f15e34cfadbeb460d22f9549511694c2bd27495 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 12 Nov 2008 11:01:40 -0700
Subject: cell: fix typo in EMIT_ macro

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index d6a3c02f20..4cde080a2c 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -100,7 +100,7 @@ extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
 #endif /* RTASM_PPC_SPE_H */
 
 #ifndef EMIT_
-#define EMIT_(name, _op) \
+#define EMIT_(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT)
 #define EMIT_R(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT, unsigned rA)
-- 
cgit v1.2.3


From 1cd15f03706f921f3a9995a4ee860b91496f4bd2 Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 12 Nov 2008 11:05:34 -0700
Subject: cell: move semicolons to silence warnings w/ other compilers

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h | 378 ++++++++++++++--------------
 1 file changed, 189 insertions(+), 189 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 4cde080a2c..f1500cef29 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -101,198 +101,198 @@ extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
 
 #ifndef EMIT_
 #define EMIT_(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT)
+    extern void _name (struct spe_function *p, unsigned rT);
 #define EMIT_R(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, unsigned rA)
+    extern void _name (struct spe_function *p, unsigned rT, unsigned rA);
 #define EMIT_RR(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-			   unsigned rB)
+                       unsigned rB);
 #define EMIT_RRR(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-			   unsigned rB, unsigned rC)
+                       unsigned rB, unsigned rC);
 #define EMIT_RI7(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-			   int imm)
+                       int imm);
 #define EMIT_RI8(_name, _op, bias) \
     extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-			   int imm)
+                       int imm);
 #define EMIT_RI10(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-			   int imm)
+                       int imm);
 #define EMIT_RI10s(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-			   int imm)
+                       int imm);
 #define EMIT_RI16(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, int imm)
+    extern void _name (struct spe_function *p, unsigned rT, int imm);
 #define EMIT_RI18(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, int imm)
+    extern void _name (struct spe_function *p, unsigned rT, int imm);
 #define EMIT_I16(_name, _op) \
-    extern void _name (struct spe_function *p, int imm)
+    extern void _name (struct spe_function *p, int imm);
 #define UNDEF_EMIT_MACROS
 #endif /* EMIT_ */
 
 
 /* Memory load / store instructions
  */
-EMIT_RR  (spe_lqx,  0x1c4);
-EMIT_RI16(spe_lqa,  0x061);
-EMIT_RI16(spe_lqr,  0x067);
-EMIT_RR  (spe_stqx, 0x144);
-EMIT_RI16(spe_stqa, 0x041);
-EMIT_RI16(spe_stqr, 0x047);
-EMIT_RI7 (spe_cbd,  0x1f4);
-EMIT_RR  (spe_cbx,  0x1d4);
-EMIT_RI7 (spe_chd,  0x1f5);
-EMIT_RI7 (spe_chx,  0x1d5);
-EMIT_RI7 (spe_cwd,  0x1f6);
-EMIT_RI7 (spe_cwx,  0x1d6);
-EMIT_RI7 (spe_cdd,  0x1f7);
-EMIT_RI7 (spe_cdx,  0x1d7);
+EMIT_RR  (spe_lqx,  0x1c4)
+EMIT_RI16(spe_lqa,  0x061)
+EMIT_RI16(spe_lqr,  0x067)
+EMIT_RR  (spe_stqx, 0x144)
+EMIT_RI16(spe_stqa, 0x041)
+EMIT_RI16(spe_stqr, 0x047)
+EMIT_RI7 (spe_cbd,  0x1f4)
+EMIT_RR  (spe_cbx,  0x1d4)
+EMIT_RI7 (spe_chd,  0x1f5)
+EMIT_RI7 (spe_chx,  0x1d5)
+EMIT_RI7 (spe_cwd,  0x1f6)
+EMIT_RI7 (spe_cwx,  0x1d6)
+EMIT_RI7 (spe_cdd,  0x1f7)
+EMIT_RI7 (spe_cdx,  0x1d7)
 
 
 /* Constant formation instructions
  */
-EMIT_RI16(spe_ilh,   0x083);
-EMIT_RI16(spe_ilhu,  0x082);
-EMIT_RI16(spe_il,    0x081);
-EMIT_RI18(spe_ila,   0x021);
-EMIT_RI16(spe_iohl,  0x0c1);
-EMIT_RI16(spe_fsmbi, 0x065);
+EMIT_RI16(spe_ilh,   0x083)
+EMIT_RI16(spe_ilhu,  0x082)
+EMIT_RI16(spe_il,    0x081)
+EMIT_RI18(spe_ila,   0x021)
+EMIT_RI16(spe_iohl,  0x0c1)
+EMIT_RI16(spe_fsmbi, 0x065)
 
 
 /* Integer and logical instructions
  */
-EMIT_RR  (spe_ah,      0x0c8);
-EMIT_RI10(spe_ahi,     0x01d);
-EMIT_RR  (spe_a,       0x0c0);
-EMIT_RI10s(spe_ai,      0x01c);
-EMIT_RR  (spe_sfh,     0x048);
-EMIT_RI10(spe_sfhi,    0x00d);
-EMIT_RR  (spe_sf,      0x040);
-EMIT_RI10(spe_sfi,     0x00c);
-EMIT_RR  (spe_addx,    0x340);
-EMIT_RR  (spe_cg,      0x0c2);
-EMIT_RR  (spe_cgx,     0x342);
-EMIT_RR  (spe_sfx,     0x341);
-EMIT_RR  (spe_bg,      0x042);
-EMIT_RR  (spe_bgx,     0x343);
-EMIT_RR  (spe_mpy,     0x3c4);
-EMIT_RR  (spe_mpyu,    0x3cc);
-EMIT_RI10(spe_mpyi,    0x074);
-EMIT_RI10(spe_mpyui,   0x075);
-EMIT_RRR (spe_mpya,    0x00c);
-EMIT_RR  (spe_mpyh,    0x3c5);
-EMIT_RR  (spe_mpys,    0x3c7);
-EMIT_RR  (spe_mpyhh,   0x3c6);
-EMIT_RR  (spe_mpyhha,  0x346);
-EMIT_RR  (spe_mpyhhu,  0x3ce);
-EMIT_RR  (spe_mpyhhau, 0x34e);
-EMIT_R   (spe_clz,     0x2a5);
-EMIT_R   (spe_cntb,    0x2b4);
-EMIT_R   (spe_fsmb,    0x1b6);
-EMIT_R   (spe_fsmh,    0x1b5);
-EMIT_R   (spe_fsm,     0x1b4);
-EMIT_R   (spe_gbb,     0x1b2);
-EMIT_R   (spe_gbh,     0x1b1);
-EMIT_R   (spe_gb,      0x1b0);
-EMIT_RR  (spe_avgb,    0x0d3);
-EMIT_RR  (spe_absdb,   0x053);
-EMIT_RR  (spe_sumb,    0x253);
-EMIT_R   (spe_xsbh,    0x2b6);
-EMIT_R   (spe_xshw,    0x2ae);
-EMIT_R   (spe_xswd,    0x2a6);
-EMIT_RR  (spe_and,     0x0c1);
-EMIT_RR  (spe_andc,    0x2c1);
-EMIT_RI10s(spe_andbi,   0x016);
-EMIT_RI10s(spe_andhi,   0x015);
-EMIT_RI10s(spe_andi,    0x014);
-EMIT_RR  (spe_or,      0x041);
-EMIT_RR  (spe_orc,     0x2c9);
-EMIT_RI10s(spe_orbi,    0x006);
-EMIT_RI10s(spe_orhi,    0x005);
-EMIT_RI10s(spe_ori,     0x004);
-EMIT_R   (spe_orx,     0x1f0);
-EMIT_RR  (spe_xor,     0x241);
-EMIT_RI10s(spe_xorbi,   0x026);
-EMIT_RI10s(spe_xorhi,   0x025);
-EMIT_RI10s(spe_xori,    0x024);
-EMIT_RR  (spe_nand,    0x0c9);
-EMIT_RR  (spe_nor,     0x049);
-EMIT_RR  (spe_eqv,     0x249);
-EMIT_RRR (spe_selb,    0x008);
-EMIT_RRR (spe_shufb,   0x00b);
+EMIT_RR  (spe_ah,      0x0c8)
+EMIT_RI10(spe_ahi,     0x01d)
+EMIT_RR  (spe_a,       0x0c0)
+EMIT_RI10s(spe_ai,      0x01c)
+EMIT_RR  (spe_sfh,     0x048)
+EMIT_RI10(spe_sfhi,    0x00d)
+EMIT_RR  (spe_sf,      0x040)
+EMIT_RI10(spe_sfi,     0x00c)
+EMIT_RR  (spe_addx,    0x340)
+EMIT_RR  (spe_cg,      0x0c2)
+EMIT_RR  (spe_cgx,     0x342)
+EMIT_RR  (spe_sfx,     0x341)
+EMIT_RR  (spe_bg,      0x042)
+EMIT_RR  (spe_bgx,     0x343)
+EMIT_RR  (spe_mpy,     0x3c4)
+EMIT_RR  (spe_mpyu,    0x3cc)
+EMIT_RI10(spe_mpyi,    0x074)
+EMIT_RI10(spe_mpyui,   0x075)
+EMIT_RRR (spe_mpya,    0x00c)
+EMIT_RR  (spe_mpyh,    0x3c5)
+EMIT_RR  (spe_mpys,    0x3c7)
+EMIT_RR  (spe_mpyhh,   0x3c6)
+EMIT_RR  (spe_mpyhha,  0x346)
+EMIT_RR  (spe_mpyhhu,  0x3ce)
+EMIT_RR  (spe_mpyhhau, 0x34e)
+EMIT_R   (spe_clz,     0x2a5)
+EMIT_R   (spe_cntb,    0x2b4)
+EMIT_R   (spe_fsmb,    0x1b6)
+EMIT_R   (spe_fsmh,    0x1b5)
+EMIT_R   (spe_fsm,     0x1b4)
+EMIT_R   (spe_gbb,     0x1b2)
+EMIT_R   (spe_gbh,     0x1b1)
+EMIT_R   (spe_gb,      0x1b0)
+EMIT_RR  (spe_avgb,    0x0d3)
+EMIT_RR  (spe_absdb,   0x053)
+EMIT_RR  (spe_sumb,    0x253)
+EMIT_R   (spe_xsbh,    0x2b6)
+EMIT_R   (spe_xshw,    0x2ae)
+EMIT_R   (spe_xswd,    0x2a6)
+EMIT_RR  (spe_and,     0x0c1)
+EMIT_RR  (spe_andc,    0x2c1)
+EMIT_RI10s(spe_andbi,   0x016)
+EMIT_RI10s(spe_andhi,   0x015)
+EMIT_RI10s(spe_andi,    0x014)
+EMIT_RR  (spe_or,      0x041)
+EMIT_RR  (spe_orc,     0x2c9)
+EMIT_RI10s(spe_orbi,    0x006)
+EMIT_RI10s(spe_orhi,    0x005)
+EMIT_RI10s(spe_ori,     0x004)
+EMIT_R   (spe_orx,     0x1f0)
+EMIT_RR  (spe_xor,     0x241)
+EMIT_RI10s(spe_xorbi,   0x026)
+EMIT_RI10s(spe_xorhi,   0x025)
+EMIT_RI10s(spe_xori,    0x024)
+EMIT_RR  (spe_nand,    0x0c9)
+EMIT_RR  (spe_nor,     0x049)
+EMIT_RR  (spe_eqv,     0x249)
+EMIT_RRR (spe_selb,    0x008)
+EMIT_RRR (spe_shufb,   0x00b)
 
 
 /* Shift and rotate instructions
  */
-EMIT_RR  (spe_shlh,      0x05f);
-EMIT_RI7 (spe_shlhi,     0x07f);
-EMIT_RR  (spe_shl,       0x05b);
-EMIT_RI7 (spe_shli,      0x07b);
-EMIT_RR  (spe_shlqbi,    0x1db);
-EMIT_RI7 (spe_shlqbii,   0x1fb);
-EMIT_RR  (spe_shlqby,    0x1df);
-EMIT_RI7 (spe_shlqbyi,   0x1ff);
-EMIT_RR  (spe_shlqbybi,  0x1cf);
-EMIT_RR  (spe_roth,      0x05c);
-EMIT_RI7 (spe_rothi,     0x07c);
-EMIT_RR  (spe_rot,       0x058);
-EMIT_RI7 (spe_roti,      0x078);
-EMIT_RR  (spe_rotqby,    0x1dc);
-EMIT_RI7 (spe_rotqbyi,   0x1fc);
-EMIT_RR  (spe_rotqbybi,  0x1cc);
-EMIT_RR  (spe_rotqbi,    0x1d8);
-EMIT_RI7 (spe_rotqbii,   0x1f8);
-EMIT_RR  (spe_rothm,     0x05d);
-EMIT_RI7 (spe_rothmi,    0x07d);
-EMIT_RR  (spe_rotm,      0x059);
-EMIT_RI7 (spe_rotmi,     0x079);
-EMIT_RR  (spe_rotqmby,   0x1dd);
-EMIT_RI7 (spe_rotqmbyi,  0x1fd);
-EMIT_RR  (spe_rotqmbybi, 0x1cd);
-EMIT_RR  (spe_rotqmbi,   0x1c9);
-EMIT_RI7 (spe_rotqmbii,  0x1f9);
-EMIT_RR  (spe_rotmah,    0x05e);
-EMIT_RI7 (spe_rotmahi,   0x07e);
-EMIT_RR  (spe_rotma,     0x05a);
-EMIT_RI7 (spe_rotmai,    0x07a);
+EMIT_RR  (spe_shlh,      0x05f)
+EMIT_RI7 (spe_shlhi,     0x07f)
+EMIT_RR  (spe_shl,       0x05b)
+EMIT_RI7 (spe_shli,      0x07b)
+EMIT_RR  (spe_shlqbi,    0x1db)
+EMIT_RI7 (spe_shlqbii,   0x1fb)
+EMIT_RR  (spe_shlqby,    0x1df)
+EMIT_RI7 (spe_shlqbyi,   0x1ff)
+EMIT_RR  (spe_shlqbybi,  0x1cf)
+EMIT_RR  (spe_roth,      0x05c)
+EMIT_RI7 (spe_rothi,     0x07c)
+EMIT_RR  (spe_rot,       0x058)
+EMIT_RI7 (spe_roti,      0x078)
+EMIT_RR  (spe_rotqby,    0x1dc)
+EMIT_RI7 (spe_rotqbyi,   0x1fc)
+EMIT_RR  (spe_rotqbybi,  0x1cc)
+EMIT_RR  (spe_rotqbi,    0x1d8)
+EMIT_RI7 (spe_rotqbii,   0x1f8)
+EMIT_RR  (spe_rothm,     0x05d)
+EMIT_RI7 (spe_rothmi,    0x07d)
+EMIT_RR  (spe_rotm,      0x059)
+EMIT_RI7 (spe_rotmi,     0x079)
+EMIT_RR  (spe_rotqmby,   0x1dd)
+EMIT_RI7 (spe_rotqmbyi,  0x1fd)
+EMIT_RR  (spe_rotqmbybi, 0x1cd)
+EMIT_RR  (spe_rotqmbi,   0x1c9)
+EMIT_RI7 (spe_rotqmbii,  0x1f9)
+EMIT_RR  (spe_rotmah,    0x05e)
+EMIT_RI7 (spe_rotmahi,   0x07e)
+EMIT_RR  (spe_rotma,     0x05a)
+EMIT_RI7 (spe_rotmai,    0x07a)
 
 
 /* Compare, branch, and halt instructions
  */
-EMIT_RR  (spe_heq,       0x3d8);
-EMIT_RI10(spe_heqi,      0x07f);
-EMIT_RR  (spe_hgt,       0x258);
-EMIT_RI10(spe_hgti,      0x04f);
-EMIT_RR  (spe_hlgt,      0x2d8);
-EMIT_RI10(spe_hlgti,     0x05f);
-EMIT_RR  (spe_ceqb,      0x3d0);
-EMIT_RI10(spe_ceqbi,     0x07e);
-EMIT_RR  (spe_ceqh,      0x3c8);
-EMIT_RI10(spe_ceqhi,     0x07d);
-EMIT_RR  (spe_ceq,       0x3c0);
-EMIT_RI10(spe_ceqi,      0x07c);
-EMIT_RR  (spe_cgtb,      0x250);
-EMIT_RI10(spe_cgtbi,     0x04e);
-EMIT_RR  (spe_cgth,      0x248);
-EMIT_RI10(spe_cgthi,     0x04d);
-EMIT_RR  (spe_cgt,       0x240);
-EMIT_RI10(spe_cgti,      0x04c);
-EMIT_RR  (spe_clgtb,     0x2d0);
-EMIT_RI10(spe_clgtbi,    0x05e);
-EMIT_RR  (spe_clgth,     0x2c8);
-EMIT_RI10(spe_clgthi,    0x05d);
-EMIT_RR  (spe_clgt,      0x2c0);
-EMIT_RI10(spe_clgti,     0x05c);
-EMIT_I16 (spe_br,        0x064);
-EMIT_I16 (spe_bra,       0x060);
-EMIT_RI16(spe_brsl,      0x066);
-EMIT_RI16(spe_brasl,     0x062);
-EMIT_RI16(spe_brnz,      0x042);
-EMIT_RI16(spe_brz,       0x040);
-EMIT_RI16(spe_brhnz,     0x046);
-EMIT_RI16(spe_brhz,      0x044);
+EMIT_RR  (spe_heq,       0x3d8)
+EMIT_RI10(spe_heqi,      0x07f)
+EMIT_RR  (spe_hgt,       0x258)
+EMIT_RI10(spe_hgti,      0x04f)
+EMIT_RR  (spe_hlgt,      0x2d8)
+EMIT_RI10(spe_hlgti,     0x05f)
+EMIT_RR  (spe_ceqb,      0x3d0)
+EMIT_RI10(spe_ceqbi,     0x07e)
+EMIT_RR  (spe_ceqh,      0x3c8)
+EMIT_RI10(spe_ceqhi,     0x07d)
+EMIT_RR  (spe_ceq,       0x3c0)
+EMIT_RI10(spe_ceqi,      0x07c)
+EMIT_RR  (spe_cgtb,      0x250)
+EMIT_RI10(spe_cgtbi,     0x04e)
+EMIT_RR  (spe_cgth,      0x248)
+EMIT_RI10(spe_cgthi,     0x04d)
+EMIT_RR  (spe_cgt,       0x240)
+EMIT_RI10(spe_cgti,      0x04c)
+EMIT_RR  (spe_clgtb,     0x2d0)
+EMIT_RI10(spe_clgtbi,    0x05e)
+EMIT_RR  (spe_clgth,     0x2c8)
+EMIT_RI10(spe_clgthi,    0x05d)
+EMIT_RR  (spe_clgt,      0x2c0)
+EMIT_RI10(spe_clgti,     0x05c)
+EMIT_I16 (spe_br,        0x064)
+EMIT_I16 (spe_bra,       0x060)
+EMIT_RI16(spe_brsl,      0x066)
+EMIT_RI16(spe_brasl,     0x062)
+EMIT_RI16(spe_brnz,      0x042)
+EMIT_RI16(spe_brz,       0x040)
+EMIT_RI16(spe_brhnz,     0x046)
+EMIT_RI16(spe_brhz,      0x044)
 
 extern void
 spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset);
@@ -375,46 +375,46 @@ spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB);
 
 /* Floating-point instructions
  */
-EMIT_RR  (spe_fa,         0x2c4);
-EMIT_RR  (spe_dfa,        0x2cc);
-EMIT_RR  (spe_fs,         0x2c5);
-EMIT_RR  (spe_dfs,        0x2cd);
-EMIT_RR  (spe_fm,         0x2c6);
-EMIT_RR  (spe_dfm,        0x2ce);
-EMIT_RRR (spe_fma,        0x00e);
-EMIT_RR  (spe_dfma,       0x35c);
-EMIT_RRR (spe_fnms,       0x00d);
-EMIT_RR  (spe_dfnms,      0x35e);
-EMIT_RRR (spe_fms,        0x00f);
-EMIT_RR  (spe_dfms,       0x35d);
-EMIT_RR  (spe_dfnma,      0x35f);
-EMIT_R   (spe_frest,      0x1b8);
-EMIT_R   (spe_frsqest,    0x1b9);
-EMIT_RR  (spe_fi,         0x3d4);
-EMIT_RI8 (spe_csflt,      0x1da, 155);
-EMIT_RI8 (spe_cflts,      0x1d8, 173);
-EMIT_RI8 (spe_cuflt,      0x1db, 155);
-EMIT_RI8 (spe_cfltu,      0x1d9, 173);
-EMIT_R   (spe_frds,       0x3b9);
-EMIT_R   (spe_fesd,       0x3b8);
-EMIT_RR  (spe_dfceq,      0x3c3);
-EMIT_RR  (spe_dfcmeq,     0x3cb);
-EMIT_RR  (spe_dfcgt,      0x2c3);
-EMIT_RR  (spe_dfcmgt,     0x2cb);
-EMIT_RI7 (spe_dftsv,      0x3bf);
-EMIT_RR  (spe_fceq,       0x3c2);
-EMIT_RR  (spe_fcmeq,      0x3ca);
-EMIT_RR  (spe_fcgt,       0x2c2);
-EMIT_RR  (spe_fcmgt,      0x2ca);
-EMIT_R   (spe_fscrwr,     0x3ba);
-EMIT_    (spe_fscrrd,     0x398);
+EMIT_RR  (spe_fa,         0x2c4)
+EMIT_RR  (spe_dfa,        0x2cc)
+EMIT_RR  (spe_fs,         0x2c5)
+EMIT_RR  (spe_dfs,        0x2cd)
+EMIT_RR  (spe_fm,         0x2c6)
+EMIT_RR  (spe_dfm,        0x2ce)
+EMIT_RRR (spe_fma,        0x00e)
+EMIT_RR  (spe_dfma,       0x35c)
+EMIT_RRR (spe_fnms,       0x00d)
+EMIT_RR  (spe_dfnms,      0x35e)
+EMIT_RRR (spe_fms,        0x00f)
+EMIT_RR  (spe_dfms,       0x35d)
+EMIT_RR  (spe_dfnma,      0x35f)
+EMIT_R   (spe_frest,      0x1b8)
+EMIT_R   (spe_frsqest,    0x1b9)
+EMIT_RR  (spe_fi,         0x3d4)
+EMIT_RI8 (spe_csflt,      0x1da, 155)
+EMIT_RI8 (spe_cflts,      0x1d8, 173)
+EMIT_RI8 (spe_cuflt,      0x1db, 155)
+EMIT_RI8 (spe_cfltu,      0x1d9, 173)
+EMIT_R   (spe_frds,       0x3b9)
+EMIT_R   (spe_fesd,       0x3b8)
+EMIT_RR  (spe_dfceq,      0x3c3)
+EMIT_RR  (spe_dfcmeq,     0x3cb)
+EMIT_RR  (spe_dfcgt,      0x2c3)
+EMIT_RR  (spe_dfcmgt,     0x2cb)
+EMIT_RI7 (spe_dftsv,      0x3bf)
+EMIT_RR  (spe_fceq,       0x3c2)
+EMIT_RR  (spe_fcmeq,      0x3ca)
+EMIT_RR  (spe_fcgt,       0x2c2)
+EMIT_RR  (spe_fcmgt,      0x2ca)
+EMIT_R   (spe_fscrwr,     0x3ba)
+EMIT_    (spe_fscrrd,     0x398)
 
 
 /* Channel instructions
  */
-EMIT_R   (spe_rdch,       0x00d);
-EMIT_R   (spe_rdchcnt,    0x00f);
-EMIT_R   (spe_wrch,       0x10d);
+EMIT_R   (spe_rdch,       0x00d)
+EMIT_R   (spe_rdchcnt,    0x00f)
+EMIT_R   (spe_wrch,       0x10d)
 
 
 #ifdef UNDEF_EMIT_MACROS
-- 
cgit v1.2.3


From b44ec717c831bb2e3363ee79ae1faca7e0665bea Mon Sep 17 00:00:00 2001
From: Brian Paul <brian.paul@tungstengraphics.com>
Date: Wed, 12 Nov 2008 11:09:12 -0700
Subject: gallium: add missing prototypes

---
 src/gallium/auxiliary/rtasm/rtasm_ppc.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
index afb4704c39..08212a2a25 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -244,6 +244,9 @@ ppc_add(struct ppc_function *p, uint rt, uint ra, uint rb);
 extern void
 ppc_addi(struct ppc_function *p, uint rt, uint ra, int imm);
 
+extern void
+ppc_addis(struct ppc_function *p, uint rt, uint ra, int imm);
+
 extern void
 ppc_and(struct ppc_function *p, uint rt, uint ra, uint rb);
 
@@ -310,6 +313,9 @@ ppc_stfs(struct ppc_function *p, uint frs, uint ra, int offset);
 extern void
 ppc_stfiwx(struct ppc_function *p, uint frs, uint ra, uint rb);
 
+extern void
+ppc_lfs(struct ppc_function *p, uint frt, uint ra, int offset);
+
 
 
 /**
-- 
cgit v1.2.3


From 2c29a6896a4a026ed3568db9caf90f422b711d8b Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Thu, 13 Nov 2008 11:22:12 -0700
Subject: CELL: fix stencil twiddling, stencil invert

Many stencil tests were failing because of a failure to read the
stencil buffer, due to "twiddling" (or "untwiddling") "an unsupported
texture format".  This is fixed for the case of a stencil/Z S8Z24 format
(which twiddles just like the 32-bit color formats).

tests/stencilwrap.c was failing on the GL_INVERT test, because
the emitted code for "spe_xori" turned out not to be an actual
"xori" instruction, but rather a "stqd" instruction, because
of a typo in the rtasm code.  This is now fixed, and
tests/stencilwrap now works.
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h | 6 +++---
 src/gallium/drivers/cell/ppu/cell_texture.c | 6 ++++--
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index f1500cef29..7c211ffc51 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -214,9 +214,9 @@ EMIT_RI10s(spe_orhi,    0x005)
 EMIT_RI10s(spe_ori,     0x004)
 EMIT_R   (spe_orx,     0x1f0)
 EMIT_RR  (spe_xor,     0x241)
-EMIT_RI10s(spe_xorbi,   0x026)
-EMIT_RI10s(spe_xorhi,   0x025)
-EMIT_RI10s(spe_xori,    0x024)
+EMIT_RI10s(spe_xorbi,   0x046)
+EMIT_RI10s(spe_xorhi,   0x045)
+EMIT_RI10s(spe_xori,    0x044)
 EMIT_RR  (spe_nand,    0x0c9)
 EMIT_RR  (spe_nor,     0x049)
 EMIT_RR  (spe_eqv,     0x249)
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index ae88d06912..47cd9605c8 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -314,6 +314,7 @@ cell_twiddle_texture(struct pipe_screen *screen,
    switch (ct->base.format) {
    case PIPE_FORMAT_A8R8G8B8_UNORM:
    case PIPE_FORMAT_B8G8R8A8_UNORM:
+   case PIPE_FORMAT_S8Z24_UNORM:
       {
          int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1;
          int offset = bufWidth * bufHeight * 4 * surface->face;
@@ -337,7 +338,7 @@ cell_twiddle_texture(struct pipe_screen *screen,
       }
       break;
    default:
-      printf("Cell: twiddle unsupported texture format\n");
+      printf("Cell: twiddle unsupported texture format 0x%x\n", ct->base.format);
       ;
    }
 
@@ -363,6 +364,7 @@ cell_untwiddle_texture(struct pipe_screen *screen,
    switch (ct->base.format) {
    case PIPE_FORMAT_A8R8G8B8_UNORM:
    case PIPE_FORMAT_B8G8R8A8_UNORM:
+   case PIPE_FORMAT_S8Z24_UNORM:
       {
          int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1;
          int offset = surface->stride * texHeight * 4 * surface->face;
@@ -382,7 +384,7 @@ cell_untwiddle_texture(struct pipe_screen *screen,
    default:
       {
          ct->untiled_data[level] = NULL;
-         printf("Cell: untwiddle unsupported texture format\n");
+         printf("Cell: untwiddle unsupported texture format 0x%x\n", ct->base.format);
       }
    }
 
-- 
cgit v1.2.3


From 11fc390f6478526d4f0bdb4b7e628284da31b3b9 Mon Sep 17 00:00:00 2001
From: Robert Ellison <papillo@tungstengraphics.com>
Date: Fri, 21 Nov 2008 11:42:14 -0700
Subject: CELL: use variant-length fragment ops programs

This is a set of changes that optimizes the memory use of fragment
operation programs (by using and transmitting only as much memory as is
needed for the fragment ops programs, instead of maximal sizes), as well
as eliminate the dependency on hard-coded maximal program sizes.  State
that is not dependent on fragment facing (i.e. that isn't using
two-sided stenciling) will only save and transmit a single
fragment operation program, instead of two identical programs.

- Added the ability to emit a LNOP (No Operation (Load)) instruction.
  This is used to pad the generated fragment operations programs to
  a multiple of 8 bytes, which is necessary for proper operation of
  the dual instruction pipeline, and also required for proper SPU-side
  decoding.

- Added the ability to allocate and manage a variant-length
  struct cell_command_fragment_ops.  This structure now puts the
  generated function field at the end, where it can be as large
  as necessary.

- On the PPU side, we now combine the generated front-facing and
  back-facing code into a single variant-length buffer (and only use one
  if the two sets of code are identical) for transmission to the SPU.

- On the SPU side, we pull the correct sizes out of the buffer,
  allocate a new code buffer if the one we have isn't large enough,
  and save the code to that buffer.  The buffer is deallocated when
  the SPU exits.

- Commented out the emit_fetch() static function, which was not being used.
---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c      |   7 +-
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h      |  11 ++-
 src/gallium/auxiliary/util/u_memory.h            |   2 +
 src/gallium/drivers/cell/common.h                |  31 +++++--
 src/gallium/drivers/cell/ppu/cell_gen_fragment.c |   7 +-
 src/gallium/drivers/cell/ppu/cell_state_emit.c   |  77 ++++++++++++++--
 src/gallium/drivers/cell/ppu/cell_vertex_fetch.c |   3 +
 src/gallium/drivers/cell/spu/spu_command.c       | 111 +++++++++++++++++------
 src/gallium/drivers/cell/spu/spu_command.h       |  32 ++++++-
 src/gallium/drivers/cell/spu/spu_main.c          |  15 +--
 src/gallium/drivers/cell/spu/spu_main.h          |   4 +-
 11 files changed, 232 insertions(+), 68 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 1bd9f1c8dd..b9a75ae559 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -341,7 +341,11 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
 }
 
 
-
+#define EMIT(_name, _op) \
+void _name (struct spe_function *p) \
+{ \
+   emit_RR(p, _op, 0, 0, 0, __FUNCTION__); \
+}
 
 #define EMIT_(_name, _op) \
 void _name (struct spe_function *p, unsigned rT) \
@@ -713,7 +717,6 @@ hbrr;
 #if 0
 stop;
 EMIT_RR  (spe_stopd, 0x140);
-EMIT_    (spe_lnop,  0x001);
 EMIT_    (spe_nop,   0x201);
 sync;
 EMIT_    (spe_dsync, 0x003);
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 7c211ffc51..f9ad2acacd 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -99,7 +99,9 @@ extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
 
 #endif /* RTASM_PPC_SPE_H */
 
-#ifndef EMIT_
+#ifndef EMIT
+#define EMIT(_name, _op) \
+    extern void _name (struct spe_function *p);
 #define EMIT_(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT);
 #define EMIT_R(_name, _op) \
@@ -129,7 +131,7 @@ extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
 #define EMIT_I16(_name, _op) \
     extern void _name (struct spe_function *p, int imm);
 #define UNDEF_EMIT_MACROS
-#endif /* EMIT_ */
+#endif /* EMIT */
 
 
 /* Memory load / store instructions
@@ -294,6 +296,10 @@ EMIT_RI16(spe_brz,       0x040)
 EMIT_RI16(spe_brhnz,     0x046)
 EMIT_RI16(spe_brhz,      0x044)
 
+/* Control instructions
+ */
+EMIT     (spe_lnop,      0x001)
+
 extern void
 spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset);
 
@@ -418,6 +424,7 @@ EMIT_R   (spe_wrch,       0x10d)
 
 
 #ifdef UNDEF_EMIT_MACROS
+#undef EMIT
 #undef EMIT_
 #undef EMIT_R
 #undef EMIT_RR
diff --git a/src/gallium/auxiliary/util/u_memory.h b/src/gallium/auxiliary/util/u_memory.h
index 857102719d..1a6b596421 100644
--- a/src/gallium/auxiliary/util/u_memory.h
+++ b/src/gallium/auxiliary/util/u_memory.h
@@ -151,6 +151,8 @@ REALLOC( void *old_ptr, unsigned old_size, unsigned new_size )
 
 #define CALLOC_STRUCT(T)   (struct T *) CALLOC(1, sizeof(struct T))
 
+#define CALLOC_VARIANT_LENGTH_STRUCT(T,more_size)   ((struct T *) CALLOC(1, sizeof(struct T) + more_size))
+
 
 /**
  * Return memory on given byte alignment
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index a670ed3c6e..98554d7f52 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -121,11 +121,6 @@
 #define CELL_DEBUG_CMD                  (1 << 5)
 #define CELL_DEBUG_CACHE                (1 << 6)
 
-/** Max instructions for doing per-fragment operations */
-#define SPU_MAX_FRAGMENT_OPS_INSTS 128
-
-
-
 #define CELL_FENCE_IDLE      0
 #define CELL_FENCE_EMITTED   1
 #define CELL_FENCE_SIGNALLED 2
@@ -153,18 +148,36 @@ struct cell_command_fence
 
 /**
  * Command to specify per-fragment operations state and generated code.
- * Note that the dsa, blend, blend_color fields are really only needed
+ * Note that this is a variant-length structure, allocated with as 
+ * much memory as needed to hold the generated code; the "code"
+ * field *must* be the last field in the structure.  Also, the entire
+ * length of the structure (including the variant code field) must be
+ * a multiple of 8 bytes; we require that this structure itself be
+ * a multiple of 8 bytes, and that the generated code also be a multiple
+ * of 8 bytes.
+ *
+ * Also note that the dsa, blend, blend_color fields are really only needed
  * for the fallback/C per-pixel code.  They're not used when we generate
- * dynamic SPU fragment code (which is the normal case).
+ * dynamic SPU fragment code (which is the normal case), and will eventually
+ * be removed from this structure.
  */
 struct cell_command_fragment_ops
 {
    uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_OPS */
+
+   /* Fields for the fallback case */
    struct pipe_depth_stencil_alpha_state dsa;
    struct pipe_blend_state blend;
    struct pipe_blend_color blend_color;
-   unsigned code_front[SPU_MAX_FRAGMENT_OPS_INSTS];
-   unsigned code_back[SPU_MAX_FRAGMENT_OPS_INSTS];
+
+   /* Fields for the generated SPU code */
+   unsigned total_code_size;
+   unsigned front_code_index;
+   unsigned back_code_index;
+   /* this field has variant length, and must be the last field in 
+    * the structure
+    */
+   unsigned code[0];
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 82336d6635..2c64eb1bcc 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -1776,7 +1776,10 @@ gen_stencil_depth_test(struct spe_function *f,
  * \param cell  the rendering context (in)
  * \param facing whether the generated code is for front-facing or 
  *              back-facing fragments
- * \param f     the generated function (out)
+ * \param f     the generated function (in/out); on input, the function
+ *              must already have been initialized.  On exit, whatever
+ *              instructions within the generated function have had
+ *              the fragment ops appended.
  */
 void
 cell_gen_fragment_function(struct cell_context *cell, const uint facing, struct spe_function *f)
@@ -1808,8 +1811,6 @@ cell_gen_fragment_function(struct cell_context *cell, const uint facing, struct
    int fbRGBA_reg;  /**< framebuffer's RGBA colors for quad */
    int fbZS_reg;    /**< framebuffer's combined z/stencil values for quad */
 
-   spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
-
    if (cell->debug_flags & CELL_DEBUG_ASM) {
       spe_print_code(f, true);
       spe_indent(f, 8);
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 031b27f11f..0a0af81f53 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -76,30 +76,86 @@ lookup_fragment_ops(struct cell_context *cell)
     */
    if (!ops) {
       struct spe_function spe_code_front, spe_code_back;
+      unsigned int facing_dependent, total_code_size;
 
       if (0)
          debug_printf("**** Create New Fragment Ops\n");
 
-      /* Prepare the buffer that will hold the generated code. */
-      spe_init_func(&spe_code_front, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
-      spe_init_func(&spe_code_back, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+      /* Prepare the buffer that will hold the generated code.  The
+       * "0" passed in for the size means that the SPE code will
+       * use a default size.
+       */
+      spe_init_func(&spe_code_front, 0);
+      spe_init_func(&spe_code_back, 0);
 
-      /* generate new code.  Always generate new code for both front-facing
+      /* Generate new code.  Always generate new code for both front-facing
        * and back-facing fragments, even if it's the same code in both
        * cases.
        */
       cell_gen_fragment_function(cell, CELL_FACING_FRONT, &spe_code_front);
       cell_gen_fragment_function(cell, CELL_FACING_BACK, &spe_code_back);
 
-      /* alloc new fragment ops command */
-      ops = CALLOC_STRUCT(cell_command_fragment_ops);
+      /* Make sure the code is a multiple of 8 bytes long; this is
+       * required to ensure that the dual pipe instruction alignment
+       * is correct.  It's also important for the SPU unpacking,
+       * which assumes 8-byte boundaries.
+       */
+      unsigned int front_code_size = spe_code_size(&spe_code_front);
+      while (front_code_size % 8 != 0) {
+         spe_lnop(&spe_code_front);
+         front_code_size = spe_code_size(&spe_code_front);
+      }
+      unsigned int back_code_size = spe_code_size(&spe_code_back);
+      while (back_code_size % 8 != 0) {
+         spe_lnop(&spe_code_back);
+         back_code_size = spe_code_size(&spe_code_back);
+      }
 
+      /* Determine whether the code we generated is facing-dependent, by
+       * determining whether the generated code is different for the front-
+       * and back-facing fragments.
+       */
+      if (front_code_size == back_code_size && memcmp(spe_code_front.store, spe_code_back.store, front_code_size) == 0) {
+         /* Code is identical; only need one copy. */
+         facing_dependent = 0;
+         total_code_size = front_code_size;
+      }
+      else {
+         /* Code is different for front-facing and back-facing fragments.
+          * Need to send both copies.
+          */
+         facing_dependent = 1;
+         total_code_size = front_code_size + back_code_size;
+      }
+
+      /* alloc new fragment ops command.  Note that this structure
+       * has variant length based on the total code size required.
+       */
+      ops = CALLOC_VARIANT_LENGTH_STRUCT(cell_command_fragment_ops, total_code_size);
       /* populate the new cell_command_fragment_ops object */
       ops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
-      memcpy(ops->code_front, spe_code_front.store, spe_code_size(&spe_code_front));
-      memcpy(ops->code_back, spe_code_back.store, spe_code_size(&spe_code_back));
+      ops->total_code_size = total_code_size;
+      ops->front_code_index = 0;
+      memcpy(ops->code, spe_code_front.store, front_code_size);
+      if (facing_dependent) {
+        /* We have separate front- and back-facing code.  Append the
+         * back-facing code to the buffer.  Be careful because the code
+         * size is in bytes, but the buffer is of unsigned elements.
+         */
+        ops->back_code_index = front_code_size / sizeof(spe_code_front.store[0]);
+        memcpy(ops->code + ops->back_code_index, spe_code_back.store, back_code_size);
+      }
+      else {
+        /* Use the same code for front- and back-facing fragments */
+        ops->back_code_index = ops->front_code_index;
+      }
+
+      /* Set the fields for the fallback case.  Note that these fields
+       * (and the whole fallback case) will eventually go away.
+       */
       ops->dsa = *cell->depth_stencil;
       ops->blend = *cell->blend;
+      ops->blend_color = cell->blend_color;
 
       /* insert cell_command_fragment_ops object into keymap/cache */
       util_keymap_insert(cell->fragment_ops_cache, &key, ops, NULL);
@@ -200,9 +256,10 @@ cell_emit_state(struct cell_context *cell)
                       CELL_NEW_DEPTH_STENCIL |
                       CELL_NEW_BLEND)) {
       struct cell_command_fragment_ops *fops, *fops_cmd;
-      fops_cmd = cell_batch_alloc(cell, sizeof(*fops_cmd));
+      /* Note that cell_command_fragment_ops is a variant-sized record */
       fops = lookup_fragment_ops(cell);
-      memcpy(fops_cmd, fops, sizeof(*fops));
+      fops_cmd = cell_batch_alloc(cell, sizeof(*fops_cmd) + fops->total_code_size);
+      memcpy(fops_cmd, fops, sizeof(*fops) + fops->total_code_size);
    }
 
    if (cell->dirty & CELL_NEW_SAMPLER) {
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
index 18969005b0..9cba537d9e 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
@@ -145,6 +145,8 @@ emit_matrix_transpose(struct spe_function *p,
 }
 
 
+#if 0
+/* This appears to not be used currently */
 static void
 emit_fetch(struct spe_function *p,
 	   unsigned in_ptr, unsigned *offset,
@@ -256,6 +258,7 @@ emit_fetch(struct spe_function *p,
       spe_release_register(p, float_one);
    }
 }
+#endif
 
 
 void cell_update_vertex_fetch(struct draw_context *draw)
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index d5faf4e3aa..8500d19754 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -210,45 +210,72 @@ cmd_release_verts(const struct cell_command_release_verts *release)
 static void
 cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
 {
-   static int warned = 0;
-
    D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_OPS\n");
-   /* Copy SPU code from batch buffer to spu buffer */
-   memcpy(spu.fragment_ops_code_front, fops->code_front, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
-   memcpy(spu.fragment_ops_code_back, fops->code_back, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
-   /* Copy state info (for fallback case only) */
+
+   /* Copy state info (for fallback case only - this will eventually
+    * go away when the fallback case goes away)
+    */
    memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
    memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
    memcpy(&spu.blend_color, &fops->blend_color, sizeof(fops->blend_color));
 
-   /* Parity twist!  For now, always use the fallback code by default,
-    * only switching to codegen when specifically requested.  This
-    * allows us to develop freely without risking taking down the
-    * branch.
-    *
-    * Later, the parity of this check will be reversed, so that
-    * codegen is *always* used, unless we specifically indicate that
-    * we don't want it.
-    *
-    * Eventually, the option will be removed completely, because in
-    * final code we'll always use codegen and won't even provide the
-    * raw state records that the fallback code requires.
+   /* Make sure the SPU knows which buffers it's expected to read when
+    * it's told to pull tiles.
     */
-   if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) == 0) {
-      spu.fragment_ops[CELL_FACING_FRONT] = (spu_fragment_ops_func) spu.fragment_ops_code_front;
-      spu.fragment_ops[CELL_FACING_BACK] = (spu_fragment_ops_func) spu.fragment_ops_code_back;
-   }
-   else {
-      /* otherwise, the default fallback code remains in place */
+   spu.read_depth_stencil = (spu.depth_stencil_alpha.depth.enabled || spu.depth_stencil_alpha.stencil[0].enabled);
+
+   /* If we're forcing the fallback code to be used (for debug purposes),
+    * install that.  Otherwise install the incoming SPU code.
+    */
+   if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) != 0) {
+      static unsigned int warned = 0;
       if (!warned) {
          fprintf(stderr, "Cell Warning: using fallback per-fragment code\n");
          warned = 1;
       }
+      /* The following two lines aren't really necessary if you
+       * know the debug flags won't change during a run, and if you
+       * know that the function pointers are initialized correctly.
+       * We set them here to allow a person to change the debug
+       * flags during a run (from inside a debugger).
+       */
+      spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
+      spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
+      return;
    }
 
-   spu.read_depth_stencil = (spu.depth_stencil_alpha.depth.enabled || spu.depth_stencil_alpha.stencil[0].enabled);
-}
+   /* Make sure the SPU code buffer is large enough to hold the incoming code.
+    * Note that we *don't* use align_malloc() and align_free(), because
+    * those utility functions are *not* available in SPU code.
+    * */
+   if (spu.fragment_ops_code_size < fops->total_code_size) {
+      if (spu.fragment_ops_code != NULL) {
+         free(spu.fragment_ops_code);
+      }
+      spu.fragment_ops_code_size = fops->total_code_size;
+      spu.fragment_ops_code = malloc(fops->total_code_size);
+      if (spu.fragment_ops_code == NULL) {
+         /* Whoops. */
+         fprintf(stderr, "CELL Warning: failed to allocate fragment ops code (%d bytes) - using fallback\n", fops->total_code_size);
+         spu.fragment_ops_code = NULL;
+         spu.fragment_ops_code_size = 0;
+         spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
+         spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
+         return;
+      }
+   }
 
+   /* Copy the SPU code from the command buffer to the spu buffer */
+   memcpy(spu.fragment_ops_code, fops->code, fops->total_code_size);
+
+   /* Set the pointers for the front-facing and back-facing fragments
+    * to the specified offsets within the code.  Note that if the
+    * front-facing and back-facing code are the same, they'll have
+    * the same offset.
+    */
+   spu.fragment_ops[CELL_FACING_FRONT] = (spu_fragment_ops_func) &spu.fragment_ops_code[fops->front_code_index];
+   spu.fragment_ops[CELL_FACING_BACK] = (spu_fragment_ops_func) &spu.fragment_ops_code[fops->back_code_index];
+}
 
 static void
 cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
@@ -588,7 +615,8 @@ cmd_batch(uint opcode)
             struct cell_command_fragment_ops *fops
                = (struct cell_command_fragment_ops *) &buffer[pos];
             cmd_state_fragment_ops(fops);
-            pos += sizeof(*fops) / 8;
+            /* This is a variant-sized command */
+            pos += (sizeof(*fops) + fops->total_code_size)/ 8;
          }
          break;
       case CELL_CMD_STATE_FRAGMENT_PROGRAM:
@@ -756,3 +784,32 @@ command_loop(void)
    if (spu.init.debug_flags & CELL_DEBUG_CACHE)
       spu_dcache_report();
 }
+
+/* Initialize this module; we manage the fragment ops buffer here. */
+void
+spu_command_init(void)
+{
+   /* Install default/fallback fragment processing function.
+    * This will normally be overridden by a code-gen'd function
+    * unless the CELL_DEBUG_FRAGMENT_OP_FALLBACK debug flag is set.
+    */
+   spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
+   spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
+
+   /* Set up the basic empty buffer for code-gen'ed fragment ops */
+   spu.fragment_ops_code = NULL;
+   spu.fragment_ops_code_size = 0;
+}
+
+void
+spu_command_close(void)
+{
+   /* Deallocate the code-gen buffer for fragment ops, and reset the
+    * fragment ops functions to their initial setting (just to leave
+    * things in a good state).
+    */
+   if (spu.fragment_ops_code != NULL) {
+      free(spu.fragment_ops_code);
+   }
+   spu_command_init();
+}
diff --git a/src/gallium/drivers/cell/spu/spu_command.h b/src/gallium/drivers/cell/spu/spu_command.h
index 853e9aa549..83dcdade28 100644
--- a/src/gallium/drivers/cell/spu/spu_command.h
+++ b/src/gallium/drivers/cell/spu/spu_command.h
@@ -1,7 +1,35 @@
-
-
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
 
 extern void
 command_loop(void);
 
+extern void
+spu_command_init(void);
 
+extern void
+spu_command_close(void);
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index 7033f6037d..97c86d194d 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -58,17 +58,8 @@ one_time_init(void)
    memset(spu.ctile_status, TILE_STATUS_DEFINED, sizeof(spu.ctile_status));
    memset(spu.ztile_status, TILE_STATUS_DEFINED, sizeof(spu.ztile_status));
    invalidate_tex_cache();
-
-   /* Install default/fallback fragment processing function.
-    * This will normally be overriden by a code-gen'd function
-    * unless CELL_FORCE_FRAGMENT_OPS_FALLBACK is set.
-    */
-   spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops;
-   spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops;
 }
 
-
-
 /* In some versions of the SDK the SPE main takes 'unsigned long' as a
  * parameter.  In others it takes 'unsigned long long'.  Use a define to
  * select between the two.
@@ -91,11 +82,11 @@ main(main_param_t speid, main_param_t argp)
 
    ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4);
    ASSERT(sizeof(struct cell_command_render) % 8 == 0);
-   ASSERT(((unsigned long) &spu.fragment_ops_code_front) % 8 == 0);
-   ASSERT(((unsigned long) &spu.fragment_ops_code_back) % 8 == 0);
+   ASSERT(sizeof(struct cell_command_fragment_ops) % 8 == 0);
    ASSERT(((unsigned long) &spu.fragment_program_code) % 8 == 0);
 
    one_time_init();
+   spu_command_init();
 
    D_PRINTF(CELL_DEBUG_CMD, "main() speid=%lu\n", (unsigned long) speid);
    D_PRINTF(CELL_DEBUG_FRAGMENT_OP_FALLBACK, "using fragment op fallback\n");
@@ -120,5 +111,7 @@ main(main_param_t speid, main_param_t argp)
 
    command_loop();
 
+   spu_command_close();
+
    return 0;
 }
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 24cf7d77ce..33767e7c51 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -169,8 +169,8 @@ struct spu_global
    ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
    /** Current fragment ops machine code, at 8-byte boundary */
-   uint fragment_ops_code_front[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB;
-   uint fragment_ops_code_back[SPU_MAX_FRAGMENT_OPS_INSTS] ALIGN8_ATTRIB;
+   uint *fragment_ops_code;
+   uint fragment_ops_code_size;
    /** Current fragment ops functions, 0 = frontfacing, 1 = backfacing */
    spu_fragment_ops_func fragment_ops[2];
 
-- 
cgit v1.2.3


From 42d00790029da4bc7e77f68c8f1c22ac9c417e42 Mon Sep 17 00:00:00 2001
From: José Fonseca <jfonseca@vmware.com>
Date: Tue, 30 Dec 2008 17:06:51 +0000
Subject: rtasm: Remove spurious semi-colons after function bodies.

---
 src/gallium/auxiliary/rtasm/rtasm_ppc.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index b65bfa7bbd..e9015ec2eb 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -261,7 +261,7 @@ emit_vx(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
    inst.inst.vB = vB;
    inst.inst.op2 = op2;
    emit_instruction(p, inst.bits);
-};
+}
 
 
 union vxr_inst {
@@ -287,7 +287,7 @@ emit_vxr(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
    inst.inst.rC = 0;
    inst.inst.op2 = op2;
    emit_instruction(p, inst.bits);
-};
+}
 
 
 union va_inst {
@@ -313,7 +313,7 @@ emit_va(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB, uint vC)
    inst.inst.vC = vC;
    inst.inst.op2 = op2;
    emit_instruction(p, inst.bits);
-};
+}
 
 
 union i_inst {
@@ -430,7 +430,7 @@ emit_d(struct ppc_function *p, uint op, uint rt, uint ra, int si)
    inst.inst.ra = ra;
    inst.inst.si = (unsigned) (si & 0xffff);
    emit_instruction(p, inst.bits);
-};
+}
 
 
 union a_inst {
@@ -459,7 +459,7 @@ emit_a(struct ppc_function *p, uint op, uint frt, uint fra, uint frb, uint op2,
    inst.inst.op2 = op2;
    inst.inst.rc = rc;
    emit_instruction(p, inst.bits);
-};
+}
 
 
 union xo_inst {
-- 
cgit v1.2.3


From 2b26a92cd34f8d83cc0ae621d1cfeb3955de57fa Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Fri, 9 Jan 2009 20:57:14 -0700
Subject: gallium: s/false/FALSE/

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index b9a75ae559..071bc2015c 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -443,7 +443,7 @@ void spe_init_func(struct spe_function *p, unsigned code_size)
       p->regs[i] = 1;
     }
 
-    p->print = false;
+    p->print = FALSE;
     p->indent = 0;
 }
 
-- 
cgit v1.2.3


From 7acaeb87750226e7407908bc2dfa9989049202fa Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Fri, 9 Jan 2009 21:42:17 -0700
Subject: gallium: added comment/annotation support to PPC rtasm

---
 src/gallium/auxiliary/rtasm/rtasm_ppc.c | 242 ++++++++++++++++++++++++--------
 src/gallium/auxiliary/rtasm/rtasm_ppc.h |   7 +
 2 files changed, 187 insertions(+), 62 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index e9015ec2eb..1bb9026205 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -1,6 +1,7 @@
 /**************************************************************************
  *
  * Copyright (C) 2008 Tungsten Graphics, Inc.   All Rights Reserved.
+ * Copyright (C) 2009 VMware, Inc.  All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -47,6 +48,8 @@ ppc_init_func(struct ppc_function *p)
 {
    uint i;
 
+   memset(p, 0, sizeof(*p));
+
    p->num_inst = 0;
    p->max_inst = 100; /* first guess at buffer size */
    p->store = rtasm_exec_malloc(p->max_inst * PPC_INST_SIZE);
@@ -54,6 +57,9 @@ ppc_init_func(struct ppc_function *p)
    p->fp_used = 0x0;
    p->vec_used = 0x0;
 
+   p->print = FALSE;
+   p->indent = 0;
+
    /* only allow using gp registers 3..12 for now */
    for (i = 0; i < 3; i++)
       ppc_reserve_register(p, i);
@@ -105,6 +111,42 @@ ppc_dump_func(const struct ppc_function *p)
 }
 
 
+void
+ppc_print_code(struct ppc_function *p, boolean enable)
+{
+   p->print = enable;
+}
+
+
+void
+ppc_indent(struct ppc_function *p, int spaces)
+{
+   p->indent += spaces;
+}
+
+
+static void
+indent(const struct ppc_function *p)
+{
+   int i;
+   for (i = 0; i < p->indent; i++) {
+      putchar(' ');
+   }
+}
+
+
+void
+ppc_comment(struct ppc_function *p, int rel_indent, const char *s)
+{
+   if (p->print) {
+      p->indent += rel_indent;
+      indent(p);
+      p->indent -= rel_indent;
+      printf("# %s\n", s);
+   }
+}
+
+
 /**
  * Mark a register as being unavailable.
  */
@@ -132,6 +174,7 @@ ppc_allocate_register(struct ppc_function *p)
          return i;
       }
    }
+   printf("OUT OF PPC registers!\n");
    return -1;
 }
 
@@ -163,6 +206,7 @@ ppc_allocate_fp_register(struct ppc_function *p)
          return i;
       }
    }
+   printf("OUT OF PPC FP registers!\n");
    return -1;
 }
 
@@ -194,6 +238,7 @@ ppc_allocate_vec_register(struct ppc_function *p)
          return i;
       }
    }
+   printf("OUT OF PPC VEC registers!\n");
    return -1;
 }
 
@@ -252,7 +297,8 @@ union vx_inst {
 };
 
 static INLINE void
-emit_vx(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
+emit_vx(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB,
+        const char *format, boolean transpose)
 {
    union vx_inst inst;
    inst.inst.op = 4;
@@ -261,6 +307,13 @@ emit_vx(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
    inst.inst.vB = vB;
    inst.inst.op2 = op2;
    emit_instruction(p, inst.bits);
+   if (p->print) {
+      indent(p);
+      if (transpose)
+         printf(format, vD, vB, vA);
+      else
+         printf(format, vD, vA, vB);
+   }
 }
 
 
@@ -277,7 +330,8 @@ union vxr_inst {
 };
 
 static INLINE void
-emit_vxr(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
+emit_vxr(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB,
+         const char *format)
 {
    union vxr_inst inst;
    inst.inst.op = 4;
@@ -287,6 +341,10 @@ emit_vxr(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
    inst.inst.rC = 0;
    inst.inst.op2 = op2;
    emit_instruction(p, inst.bits);
+   if (p->print) {
+      indent(p);
+      printf(format, vD, vA, vB);
+   }
 }
 
 
@@ -303,7 +361,8 @@ union va_inst {
 };
 
 static INLINE void
-emit_va(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB, uint vC)
+emit_va(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB, uint vC,
+        const char *format)
 {
    union va_inst inst;
    inst.inst.op = 4;
@@ -313,6 +372,10 @@ emit_va(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB, uint vC)
    inst.inst.vC = vC;
    inst.inst.op2 = op2;
    emit_instruction(p, inst.bits);
+   if (p->print) {
+      indent(p);
+      printf(format, vD, vA, vB, vC);
+   }
 }
 
 
@@ -396,7 +459,8 @@ union x_inst {
 };
 
 static INLINE void
-emit_x(struct ppc_function *p, uint op, uint vrs, uint ra, uint rb, uint op2)
+emit_x(struct ppc_function *p, uint op, uint vrs, uint ra, uint rb, uint op2,
+       const char *format)
 {
    union x_inst inst;
    inst.inst.op = op;
@@ -406,6 +470,10 @@ emit_x(struct ppc_function *p, uint op, uint vrs, uint ra, uint rb, uint op2)
    inst.inst.op2 = op2;
    inst.inst.unused = 0x0;
    emit_instruction(p, inst.bits);
+   if (p->print) {
+      indent(p);
+      printf(format, vrs, ra, rb);
+   }
 }
 
 
@@ -420,7 +488,8 @@ union d_inst {
 };
 
 static INLINE void
-emit_d(struct ppc_function *p, uint op, uint rt, uint ra, int si)
+emit_d(struct ppc_function *p, uint op, uint rt, uint ra, int si,
+       const char *format, boolean transpose)
 {
    union d_inst inst;
    assert(si >= -32768);
@@ -430,6 +499,13 @@ emit_d(struct ppc_function *p, uint op, uint rt, uint ra, int si)
    inst.inst.ra = ra;
    inst.inst.si = (unsigned) (si & 0xffff);
    emit_instruction(p, inst.bits);
+   if (p->print) {
+      indent(p);
+      if (transpose)
+         printf(format, rt, si, ra);
+      else
+         printf(format, rt, ra, si);
+   }
 }
 
 
@@ -448,7 +524,7 @@ union a_inst {
 
 static INLINE void
 emit_a(struct ppc_function *p, uint op, uint frt, uint fra, uint frb, uint op2,
-       uint rc)
+       uint rc, const char *format)
 {
    union a_inst inst;
    inst.inst.op = op;
@@ -459,6 +535,10 @@ emit_a(struct ppc_function *p, uint op, uint frt, uint fra, uint frb, uint op2,
    inst.inst.op2 = op2;
    inst.inst.rc = rc;
    emit_instruction(p, inst.bits);
+   if (p->print) {
+      indent(p);
+      printf(format, frt, fra, frb);
+   }
 }
 
 
@@ -477,7 +557,7 @@ union xo_inst {
 
 static INLINE void
 emit_xo(struct ppc_function *p, uint op, uint rt, uint ra, uint rb, uint oe,
-        uint op2, uint rc)
+        uint op2, uint rc, const char *format)
 {
    union xo_inst inst;
    inst.inst.op = op;
@@ -488,6 +568,10 @@ emit_xo(struct ppc_function *p, uint op, uint rt, uint ra, uint rb, uint oe,
    inst.inst.op2 = op2;
    inst.inst.rc = rc;
    emit_instruction(p, inst.bits);
+   if (p->print) {
+      indent(p);
+      printf(format, rt, ra, rb);
+   }
 }
 
 
@@ -502,140 +586,142 @@ emit_xo(struct ppc_function *p, uint op, uint rt, uint ra, uint rb, uint oe,
 void
 ppc_vaddfp(struct ppc_function *p, uint vD, uint vA, uint vB)
 {
-   emit_vx(p, 10, vD, vA, vB);
+   emit_vx(p, 10, vD, vA, vB, "vaddfp\tv%u, v%u, v%u\n", FALSE);
 }
 
 /** vector float substract */
 void
 ppc_vsubfp(struct ppc_function *p, uint vD, uint vA, uint vB)
 {
-   emit_vx(p, 74, vD, vA, vB);
+   emit_vx(p, 74, vD, vA, vB, "vsubfp\tv%u, v%u, v%u\n", FALSE);
 }
 
 /** vector float min */
 void
 ppc_vminfp(struct ppc_function *p, uint vD, uint vA, uint vB)
 {
-   emit_vx(p, 1098, vD, vA, vB);
+   emit_vx(p, 1098, vD, vA, vB, "vminfp\tv%u, v%u, v%u\n", FALSE);
 }
 
 /** vector float max */
 void
 ppc_vmaxfp(struct ppc_function *p, uint vD, uint vA, uint vB)
 {
-   emit_vx(p, 1034, vD, vA, vB);
+   emit_vx(p, 1034, vD, vA, vB, "vmaxfp\tv%u, v%u, v%u\n", FALSE);
 }
 
 /** vector float mult add: vD = vA * vB + vC */
 void
 ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
 {
-   emit_va(p, 46, vD, vA, vC, vB); /* note arg order */
+   /* note arg order */
+   emit_va(p, 46, vD, vA, vC, vB, "vmaddfp\tv%u, v%u, v%u, v%u\n");
 }
 
 /** vector float negative mult subtract: vD = vA - vB * vC */
 void
 ppc_vnmsubfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
 {
-   emit_va(p, 47, vD, vB, vA, vC); /* note arg order */
+   /* note arg order */
+   emit_va(p, 47, vD, vB, vA, vC, "vnmsubfp\tv%u, v%u, v%u, v%u\n");
 }
 
 /** vector float compare greater than */
 void
 ppc_vcmpgtfpx(struct ppc_function *p, uint vD, uint vA, uint vB)
 {
-   emit_vxr(p, 710, vD, vA, vB);
+   emit_vxr(p, 710, vD, vA, vB, "vcmpgtfpx\tv%u, v%u, v%u\n");
 }
 
 /** vector float compare greater than or equal to */
 void
 ppc_vcmpgefpx(struct ppc_function *p, uint vD, uint vA, uint vB)
 {
-   emit_vxr(p, 454, vD, vA, vB);
+   emit_vxr(p, 454, vD, vA, vB, "vcmpgefpx\tv%u, v%u, v%u\n");
 }
 
 /** vector float compare equal */
 void
 ppc_vcmpeqfpx(struct ppc_function *p, uint vD, uint vA, uint vB)
 {
-   emit_vxr(p, 198, vD, vA, vB);
+   emit_vxr(p, 198, vD, vA, vB, "vcmpeqfpx\tv%u, v%u, v%u\n");
 }
 
 /** vector float 2^x */
 void
 ppc_vexptefp(struct ppc_function *p, uint vD, uint vB)
 {
-   emit_vx(p, 394, vD, 0, vB);
+   emit_vx(p, 394, vD, 0, vB, "vexptefp\tv%u, 0%u, v%u\n", FALSE);
 }
 
 /** vector float log2(x) */
 void
 ppc_vlogefp(struct ppc_function *p, uint vD, uint vB)
 {
-   emit_vx(p, 458, vD, 0, vB);
+   emit_vx(p, 458, vD, 0, vB, "vlogefp\tv%u, 0%u, v%u\n", FALSE);
 }
 
 /** vector float reciprocol */
 void
 ppc_vrefp(struct ppc_function *p, uint vD, uint vB)
 {
-   emit_vx(p, 266, vD, 0, vB);
+   emit_vx(p, 266, vD, 0, vB, "vrefp\tv%u, 0%u, v%u\n", FALSE);
 }
 
 /** vector float reciprocol sqrt estimate */
 void
 ppc_vrsqrtefp(struct ppc_function *p, uint vD, uint vB)
 {
-   emit_vx(p, 330, vD, 0, vB);
+   emit_vx(p, 330, vD, 0, vB, "vrsqrtefp\tv%u, 0%u, v%u\n", FALSE);
 }
 
 /** vector float round to negative infinity */
 void
 ppc_vrfim(struct ppc_function *p, uint vD, uint vB)
 {
-   emit_vx(p, 714, vD, 0, vB);
+   emit_vx(p, 714, vD, 0, vB, "vrfim\tv%u, 0%u, v%u\n", FALSE);
 }
 
 /** vector float round to positive infinity */
 void
 ppc_vrfip(struct ppc_function *p, uint vD, uint vB)
 {
-   emit_vx(p, 650, vD, 0, vB);
+   emit_vx(p, 650, vD, 0, vB, "vrfip\tv%u, 0%u, v%u\n", FALSE);
 }
 
 /** vector float round to nearest int */
 void
 ppc_vrfin(struct ppc_function *p, uint vD, uint vB)
 {
-   emit_vx(p, 522, vD, 0, vB);
+   emit_vx(p, 522, vD, 0, vB, "vrfin\tv%u, 0%u, v%u\n", FALSE);
 }
 
 /** vector float round to int toward zero */
 void
 ppc_vrfiz(struct ppc_function *p, uint vD, uint vB)
 {
-   emit_vx(p, 586, vD, 0, vB);
+   emit_vx(p, 586, vD, 0, vB, "vrfiz\tv%u, 0%u, v%u\n", FALSE);
 }
 
-/** vector store: store vR at mem[vA+vB] */
+/** vector store: store vR at mem[rA+rB] */
 void
-ppc_stvx(struct ppc_function *p, uint vR, uint vA, uint vB)
+ppc_stvx(struct ppc_function *p, uint vR, uint rA, uint rB)
 {
-   emit_x(p, 31, vR, vA, vB, 231);
+   emit_x(p, 31, vR, rA, rB, 231, "stvx\tv%u, r%u, r%u\n");
 }
 
-/** vector load: vR = mem[vA+vB] */
+/** vector load: vR = mem[rA+rB] */
 void
-ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB)
+ppc_lvx(struct ppc_function *p, uint vR, uint rA, uint rB)
 {
-   emit_x(p, 31, vR, vA, vB, 103);
+   emit_x(p, 31, vR, rA, rB, 103, "lvx\tv%u, r%u, r%u\n");
 }
 
 /** load vector element word: vR = mem_word[ra+rb] */
 void
-ppc_lvewx(struct ppc_function *p, uint vr, uint ra, uint rb)
+ppc_lvewx(struct ppc_function *p, uint vR, uint rA, uint rB)
 {
-   emit_x(p, 31, vr, ra, rb, 71);
+   emit_x(p, 31, vR, rA, rB, 71, "lvewx\tv%u, r%u, r%u\n");
 }
 
 
@@ -649,49 +735,63 @@ ppc_lvewx(struct ppc_function *p, uint vr, uint ra, uint rb)
 void
 ppc_vand(struct ppc_function *p, uint vD, uint vA, uint vB)
 {
-   emit_vx(p, 1028, vD, vA, vB);
+   emit_vx(p, 1028, vD, vA, vB, "vand\tv%u, v%u, v%u\n", FALSE);
 }
 
 /** vector and complement */
 void
 ppc_vandc(struct ppc_function *p, uint vD, uint vA, uint vB)
 {
-   emit_vx(p, 1092, vD, vA, vB);
+   emit_vx(p, 1092, vD, vA, vB, "vandc\tv%u, v%u, v%u\n", FALSE);
 }
 
 /** vector or */
 void
 ppc_vor(struct ppc_function *p, uint vD, uint vA, uint vB)
 {
-   emit_vx(p, 1156, vD, vA, vB);
+   emit_vx(p, 1156, vD, vA, vB, "vor\tv%u, v%u, v%u\n", FALSE);
 }
 
 /** vector nor */
 void
 ppc_vnor(struct ppc_function *p, uint vD, uint vA, uint vB)
 {
-   emit_vx(p, 1284, vD, vA, vB);
+   emit_vx(p, 1284, vD, vA, vB, "vnor\tv%u, v%u, v%u\n", FALSE);
 }
 
 /** vector xor */
 void
 ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB)
 {
-   emit_vx(p, 1220, vD, vA, vB);
+   emit_vx(p, 1220, vD, vA, vB, "vxor\tv%u, v%u, v%u\n", FALSE);
 }
 
 /** Pseudo-instruction: vector move */
 void
 ppc_vmove(struct ppc_function *p, uint vD, uint vA)
 {
+   boolean print = p->print;
+   p->print = FALSE;
    ppc_vor(p, vD, vA, vA);
+   if (print) {
+      indent(p);
+      printf("vor\tv%u, v%u, v%u \t# v%u = v%u\n", vD, vA, vA, vD, vA);
+   }
+   p->print = print;
 }
 
 /** Set vector register to {0,0,0,0} */
 void
 ppc_vzero(struct ppc_function *p, uint vr)
 {
+   boolean print = p->print;
+   p->print = FALSE;
    ppc_vxor(p, vr, vr, vr);
+   if (print) {
+      indent(p);
+      printf("vxor\tv%u, v%u, v%u \t# v%u = {0,0,0,0}\n", vr, vr, vr, vr);
+   }
+   p->print = print;
 }
 
 
@@ -705,35 +805,35 @@ ppc_vzero(struct ppc_function *p, uint vr)
 void
 ppc_vperm(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
 {
-   emit_va(p, 43, vD, vA, vB, vC);
+   emit_va(p, 43, vD, vA, vB, vC, "vperm\tv%u, v%u, v%u, v%u\n");
 }
 
 /** vector select */
 void
 ppc_vsel(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
 {
-   emit_va(p, 42, vD, vA, vB, vC);
+   emit_va(p, 42, vD, vA, vB, vC, "vsel\tv%u, v%u, v%u, v%u\n");
 }
 
 /** vector splat byte */
 void
 ppc_vspltb(struct ppc_function *p, uint vD, uint vB, uint imm)
 {
-   emit_vx(p, 42, vD, imm, vB);
+   emit_vx(p, 524, vD, imm, vB, "vspltb\tv%u, v%u, %u\n", TRUE);
 }
 
 /** vector splat half word */
 void
 ppc_vsplthw(struct ppc_function *p, uint vD, uint vB, uint imm)
 {
-   emit_vx(p, 588, vD, imm, vB);
+   emit_vx(p, 588, vD, imm, vB, "vsplthw\tv%u, v%u, %u\n", TRUE);
 }
 
 /** vector splat word */
 void
 ppc_vspltw(struct ppc_function *p, uint vD, uint vB, uint imm)
 {
-   emit_vx(p, 652, vD, imm, vB);
+   emit_vx(p, 652, vD, imm, vB, "vspltw\tv%u, v%u, %u\n", TRUE);
 }
 
 /** vector splat signed immediate word */
@@ -742,14 +842,14 @@ ppc_vspltisw(struct ppc_function *p, uint vD, int imm)
 {
    assert(imm >= -16);
    assert(imm < 15);
-   emit_vx(p, 908, vD, imm, 0);
+   emit_vx(p, 908, vD, imm, 0, "vspltisw\tv%u, %d, %u\n", FALSE);
 }
 
 /** vector shift left word: vD[word] = vA[word] << (vB[word] & 0x1f) */
 void
 ppc_vslw(struct ppc_function *p, uint vD, uint vA, uint vB)
 {
-   emit_vx(p, 388, vD, vA, vB);
+   emit_vx(p, 388, vD, vA, vB, "vslw\tv%u, v%u, v%u\n", FALSE);
 }
 
 
@@ -763,63 +863,66 @@ ppc_vslw(struct ppc_function *p, uint vD, uint vA, uint vB)
 void
 ppc_addi(struct ppc_function *p, uint rt, uint ra, int imm)
 {
-   emit_d(p, 14, rt, ra, imm);
+   emit_d(p, 14, rt, ra, imm, "addi\tr%u, r%u, %d\n", FALSE);
 }
 
 /** rt = ra + (imm << 16) */
 void
 ppc_addis(struct ppc_function *p, uint rt, uint ra, int imm)
 {
-   emit_d(p, 15, rt, ra, imm);
+   emit_d(p, 15, rt, ra, imm, "addis\tr%u, r%u, %d\n", FALSE);
 }
 
 /** rt = ra + rb */
 void
 ppc_add(struct ppc_function *p, uint rt, uint ra, uint rb)
 {
-   emit_xo(p, 31, rt, ra, rb, 0, 266, 0);
+   emit_xo(p, 31, rt, ra, rb, 0, 266, 0, "add\tr%u, r%u, r%u\n");
 }
 
 /** rt = ra AND ra */
 void
 ppc_and(struct ppc_function *p, uint rt, uint ra, uint rb)
 {
-   emit_x(p, 31, ra, rt, rb, 28);  /* note argument order */
+   emit_x(p, 31, ra, rt, rb, 28, "and\tr%u, r%u, r%u\n");  /* note argument order */
 }
 
 /** rt = ra AND imm */
 void
 ppc_andi(struct ppc_function *p, uint rt, uint ra, int imm)
 {
-   emit_d(p, 28, ra, rt, imm);  /* note argument order */
+   /* note argument order */
+   emit_d(p, 28, ra, rt, imm, "andi\tr%u, r%u, %d\n", FALSE);
 }
 
 /** rt = ra OR ra */
 void
 ppc_or(struct ppc_function *p, uint rt, uint ra, uint rb)
 {
-   emit_x(p, 31, ra, rt, rb, 444);  /* note argument order */
+   emit_x(p, 31, ra, rt, rb, 444, "or\tr%u, r%u, r%u\n");  /* note argument order */
 }
 
 /** rt = ra OR imm */
 void
 ppc_ori(struct ppc_function *p, uint rt, uint ra, int imm)
 {
-   emit_d(p, 24, ra, rt, imm);  /* note argument order */
+   /* note argument order */
+   emit_d(p, 24, ra, rt, imm, "ori\tr%u, r%u, %d\n", FALSE);
 }
 
 /** rt = ra XOR ra */
 void
 ppc_xor(struct ppc_function *p, uint rt, uint ra, uint rb)
 {
-   emit_x(p, 31, ra, rt, rb, 316);  /* note argument order */
+   emit_x(p, 31, ra, rt, rb, 316, "xor\tr%u, r%u, r%u\n");  /* note argument order */
 }
 
 /** rt = ra XOR imm */
 void
 ppc_xori(struct ppc_function *p, uint rt, uint ra, int imm)
 {
-   emit_d(p, 26, ra, rt, imm);  /* note argument order */
+   /* note argument order */
+   emit_d(p, 26, ra, rt, imm, "xori\tr%u, r%u, %d\n", FALSE);
 }
 
 /** pseudo instruction: move: rt = ra */
@@ -833,7 +936,14 @@ ppc_mr(struct ppc_function *p, uint rt, uint ra)
 void
 ppc_li(struct ppc_function *p, uint rt, int imm)
 {
+   boolean print = p->print;
+   p->print = FALSE;
    ppc_addi(p, rt, 0, imm);
+   if (print) {
+      indent(p);
+      printf("addi\tr%u, r0, %d \t# r%u = %d\n", rt, imm, rt, imm);
+   }
+   p->print = print;
 }
 
 /** rt = imm << 16 */
@@ -864,21 +974,21 @@ ppc_load_int(struct ppc_function *p, uint rt, int imm)
 void
 ppc_stwu(struct ppc_function *p, uint rs, uint ra, int d)
 {
-   emit_d(p, 37, rs, ra, d);
+   emit_d(p, 37, rs, ra, d, "stwu\tr%u, %d(r%u)\n", TRUE);
 }
 
 /** store rs at memory[(ra)+d] */
 void
 ppc_stw(struct ppc_function *p, uint rs, uint ra, int d)
 {
-   emit_d(p, 36, rs, ra, d);
+   emit_d(p, 36, rs, ra, d, "stw\tr%u, %d(r%u)\n", TRUE);
 }
 
 /** Load rt = mem[(ra)+d];  then zero set high 32 bits to zero. */
 void
 ppc_lwz(struct ppc_function *p, uint rt, uint ra, int d)
 {
-   emit_d(p, 32, rt, ra, d);
+   emit_d(p, 32, rt, ra, d, "lwz\tr%u, %d(r%u)\n", TRUE);
 }
 
 
@@ -891,42 +1001,42 @@ ppc_lwz(struct ppc_function *p, uint rt, uint ra, int d)
 void
 ppc_fadd(struct ppc_function *p, uint frt, uint fra, uint frb)
 {
-   emit_a(p, 63, frt, fra, frb, 21, 0);
+   emit_a(p, 63, frt, fra, frb, 21, 0, "fadd\tf%u, f%u, f%u\n");
 }
 
 /** sub: frt = fra - frb */
 void
 ppc_fsub(struct ppc_function *p, uint frt, uint fra, uint frb)
 {
-   emit_a(p, 63, frt, fra, frb, 20, 0);
+   emit_a(p, 63, frt, fra, frb, 20, 0, "fsub\tf%u, f%u, f%u\n");
 }
 
 /** convert to int: rt = (int) ra */
 void
 ppc_fctiwz(struct ppc_function *p, uint rt, uint fra)
 {
-   emit_x(p, 63, rt, 0, fra, 15);
+   emit_x(p, 63, rt, 0, fra, 15, "fctiwz\tr%u, r%u, r%u\n");
 }
 
 /** store frs at mem[(ra)+offset] */
 void
 ppc_stfs(struct ppc_function *p, uint frs, uint ra, int offset)
 {
-   emit_d(p, 52, frs, ra, offset);
+   emit_d(p, 52, frs, ra, offset, "stfs\tf%u, %d(r%u)\n", TRUE);
 }
 
 /** store frs at mem[(ra)+(rb)] */
 void
 ppc_stfiwx(struct ppc_function *p, uint frs, uint ra, uint rb)
 {
-   emit_x(p, 31, frs, ra, rb, 983);
+   emit_x(p, 31, frs, ra, rb, 983, "stfiwx\tr%u, r%u, r%u\n");
 }
 
 /** load frt = mem[(ra)+offset] */
 void
 ppc_lfs(struct ppc_function *p, uint frt, uint ra, int offset)
 {
-   emit_d(p, 48, frt, ra, offset);
+   emit_d(p, 48, frt, ra, offset, "lfs\tf%u, %d(r%u)\n", TRUE);
 }
 
 
@@ -942,6 +1052,10 @@ void
 ppc_blr(struct ppc_function *p)
 {
    emit_i(p, 18, 0, 0, 1);
+   if (p->print) {
+      indent(p);
+      printf("blr\n");
+   }
 }
 
 /** Branch Conditional to Link Register (p. 36) */
@@ -949,6 +1063,10 @@ void
 ppc_bclr(struct ppc_function *p, uint condOp, uint branchHint, uint condReg)
 {
    emit_xl(p, 19, condOp, condReg, branchHint, 16, 0);
+   if (p->print) {
+      indent(p);
+      printf("bclr\t%u %u %u\n", condOp, branchHint, condReg);
+   }
 }
 
 /** Pseudo instruction: return from subroutine */
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
index 08212a2a25..93e5f5187d 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -1,6 +1,7 @@
 /**************************************************************************
  *
  * Copyright (C) 2008 Tungsten Graphics, Inc.   All Rights Reserved.
+ * Copyright (C) 2009 VMware, Inc.  All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -58,6 +59,8 @@ struct ppc_function
    uint32_t reg_used;   /** used/free general-purpose registers bitmask */
    uint32_t fp_used;   /** used/free floating point registers bitmask */
    uint32_t vec_used;   /** used/free vector registers bitmask */
+   int indent;
+   boolean print;
 };
 
 
@@ -68,6 +71,10 @@ extern uint ppc_num_instructions(const struct ppc_function *p);
 extern void (*ppc_get_func( struct ppc_function *p ))( void );
 extern void ppc_dump_func(const struct ppc_function *p);
 
+extern void ppc_print_code(struct ppc_function *p, boolean enable);
+extern void ppc_indent(struct ppc_function *p, int spaces);
+extern void ppc_comment(struct ppc_function *p, int rel_indent, const char *s);
+
 extern int ppc_reserve_register(struct ppc_function *p, int reg);
 extern int ppc_allocate_register(struct ppc_function *p);
 extern void ppc_release_register(struct ppc_function *p, int reg);
-- 
cgit v1.2.3


From c4a782041b19cb4a08712384b19be25b79acba3c Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Sun, 11 Jan 2009 14:22:00 -0700
Subject: cell: datatype clean-ups in SPE rtasm

---
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c | 123 ++++++++++++++--------------
 src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h |  81 +++++++++---------
 2 files changed, 99 insertions(+), 105 deletions(-)

(limited to 'src/gallium/auxiliary/rtasm')

diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 071bc2015c..53a0e722cf 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -213,8 +213,8 @@ emit_instruction(struct spe_function *p, uint32_t inst_bits)
 
 
-static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
-		    unsigned rA, unsigned rB, const char *name)
+static void emit_RR(struct spe_function *p, unsigned op, int rT,
+		    int rA, int rB, const char *name)
 {
     union spe_inst_RR inst;
     inst.inst.op = op;
@@ -230,8 +230,8 @@ static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
 }
 
 
-static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
-                     unsigned rA, unsigned rB, unsigned rC, const char *name)
+static void emit_RRR(struct spe_function *p, unsigned op, int rT,
+                     int rA, int rB, int rC, const char *name)
 {
     union spe_inst_RRR inst;
     inst.inst.op = op;
@@ -248,8 +248,8 @@ static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
 }
 
 
-static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
-		     unsigned rA, int imm, const char *name)
+static void emit_RI7(struct spe_function *p, unsigned op, int rT,
+		     int rA, int imm, const char *name)
 {
     union spe_inst_RI7 inst;
     inst.inst.op = op;
@@ -266,8 +266,8 @@ static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
 
 
-static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
-		     unsigned rA, int imm, const char *name)
+static void emit_RI8(struct spe_function *p, unsigned op, int rT,
+		     int rA, int imm, const char *name)
 {
     union spe_inst_RI8 inst;
     inst.inst.op = op;
@@ -284,8 +284,8 @@ static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
 
 
-static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
-		      unsigned rA, int imm, const char *name)
+static void emit_RI10(struct spe_function *p, unsigned op, int rT,
+		      int rA, int imm, const char *name)
 {
     union spe_inst_RI10 inst;
     inst.inst.op = op;
@@ -302,8 +302,8 @@ static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
 
 
 /** As above, but do range checking on signed immediate value */
-static void emit_RI10s(struct spe_function *p, unsigned op, unsigned rT,
-                       unsigned rA, int imm, const char *name)
+static void emit_RI10s(struct spe_function *p, unsigned op, int rT,
+                       int rA, int imm, const char *name)
 {
     assert(imm <= 511);
     assert(imm >= -512);
@@ -311,7 +311,7 @@ static void emit_RI10s(struct spe_function *p, unsigned op, unsigned rT,
 }
 
 
-static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
+static void emit_RI16(struct spe_function *p, unsigned op, int rT,
 		      int imm, const char *name)
 {
     union spe_inst_RI16 inst;
@@ -326,7 +326,7 @@ static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
 }
 
 
-static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
+static void emit_RI18(struct spe_function *p, unsigned op, int rT,
 		      int imm, const char *name)
 {
     union spe_inst_RI18 inst;
@@ -348,61 +348,61 @@ void _name (struct spe_function *p) \
 }
 
 #define EMIT_(_name, _op) \
-void _name (struct spe_function *p, unsigned rT) \
+void _name (struct spe_function *p, int rT) \
 { \
    emit_RR(p, _op, rT, 0, 0, __FUNCTION__); \
 }
 
 #define EMIT_R(_name, _op) \
-void _name (struct spe_function *p, unsigned rT, unsigned rA) \
+void _name (struct spe_function *p, int rT, int rA) \
 { \
    emit_RR(p, _op, rT, rA, 0, __FUNCTION__);                 \
 }
 
 #define EMIT_RR(_name, _op) \
-void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB) \
+void _name (struct spe_function *p, int rT, int rA, int rB) \
 { \
    emit_RR(p, _op, rT, rA, rB, __FUNCTION__);                \
 }
 
 #define EMIT_RRR(_name, _op) \
-void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB, unsigned rC) \
+void _name (struct spe_function *p, int rT, int rA, int rB, int rC) \
 { \
    emit_RRR(p, _op, rT, rA, rB, rC, __FUNCTION__);           \
 }
 
 #define EMIT_RI7(_name, _op) \
-void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
+void _name (struct spe_function *p, int rT, int rA, int imm) \
 { \
    emit_RI7(p, _op, rT, rA, imm, __FUNCTION__);              \
 }
 
 #define EMIT_RI8(_name, _op, bias) \
-void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
+void _name (struct spe_function *p, int rT, int rA, int imm) \
 { \
    emit_RI8(p, _op, rT, rA, bias - imm, __FUNCTION__);       \
 }
 
 #define EMIT_RI10(_name, _op) \
-void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
+void _name (struct spe_function *p, int rT, int rA, int imm) \
 { \
    emit_RI10(p, _op, rT, rA, imm, __FUNCTION__);             \
 }
 
 #define EMIT_RI10s(_name, _op) \
-void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
+void _name (struct spe_function *p, int rT, int rA, int imm) \
 { \
    emit_RI10s(p, _op, rT, rA, imm, __FUNCTION__);             \
 }
 
 #define EMIT_RI16(_name, _op) \
-void _name (struct spe_function *p, unsigned rT, int imm) \
+void _name (struct spe_function *p, int rT, int imm) \
 { \
    emit_RI16(p, _op, rT, imm, __FUNCTION__);                 \
 }
 
 #define EMIT_RI18(_name, _op) \
-void _name (struct spe_function *p, unsigned rT, int imm) \
+void _name (struct spe_function *p, int rT, int imm) \
 { \
    emit_RI18(p, _op, rT, imm, __FUNCTION__);                 \
 }
@@ -424,7 +424,7 @@ void _name (struct spe_function *p, int imm) \
  */
 void spe_init_func(struct spe_function *p, unsigned code_size)
 {
-    unsigned int i;
+    uint i;
 
     if (!code_size)
        code_size = 64;
@@ -503,6 +503,7 @@ int spe_allocate_register(struct spe_function *p, int reg)
  */
 void spe_release_register(struct spe_function *p, int reg)
 {
+   assert(reg >= 0);
    assert(reg < SPE_NUM_REGS);
    assert(p->regs[reg] == 1);
 
@@ -517,7 +518,7 @@ void spe_release_register(struct spe_function *p, int reg)
  */
 void spe_allocate_register_set(struct spe_function *p)
 {
-   unsigned int i;
+   uint i;
 
    /* Keep track of the set count.  If it ever wraps around to 0, 
     * we're in trouble.
@@ -538,7 +539,7 @@ void spe_allocate_register_set(struct spe_function *p)
 
 void spe_release_register_set(struct spe_function *p)
 {
-   unsigned int i;
+   uint i;
 
    /* If the set count drops below zero, we're in trouble. */
    assert(p->set_count > 0);
@@ -599,7 +600,7 @@ spe_comment(struct spe_function *p, int rel_indent, const char *s)
  * Load quad word.
  * NOTE: offset is in bytes and the least significant 4 bits must be zero!
  */
-void spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
+void spe_lqd(struct spe_function *p, int rT, int rA, int offset)
 {
    const boolean pSave = p->print;
 
@@ -624,7 +625,7 @@ void spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
  * Store quad word.
  * NOTE: offset is in bytes and the least significant 4 bits must be zero!
  */
-void spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
+void spe_stqd(struct spe_function *p, int rT, int rA, int offset)
 {
    const boolean pSave = p->print;
 
@@ -653,51 +654,51 @@ void spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
  */
 
 /** Branch Indirect to address in rA */
-void spe_bi(struct spe_function *p, unsigned rA, int d, int e)
+void spe_bi(struct spe_function *p, int rA, int d, int e)
 {
    emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Interupt Return */
-void spe_iret(struct spe_function *p, unsigned rA, int d, int e)
+void spe_iret(struct spe_function *p, int rA, int d, int e)
 {
    emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect and set link on external data */
-void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA, int d,
+void spe_bisled(struct spe_function *p, int rT, int rA, int d,
 		int e)
 {
    emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect and set link.  Save PC in rT, jump to rA. */
-void spe_bisl(struct spe_function *p, unsigned rT, unsigned rA, int d,
+void spe_bisl(struct spe_function *p, int rT, int rA, int d,
 		int e)
 {
    emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if zero word.  If rT.word[0]==0, jump to rA. */
-void spe_biz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
+void spe_biz(struct spe_function *p, int rT, int rA, int d, int e)
 {
    emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if non-zero word.  If rT.word[0]!=0, jump to rA. */
-void spe_binz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
+void spe_binz(struct spe_function *p, int rT, int rA, int d, int e)
 {
    emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if zero halfword.  If rT.halfword[1]==0, jump to rA. */
-void spe_bihz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
+void spe_bihz(struct spe_function *p, int rT, int rA, int d, int e)
 {
    emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if non-zero halfword.  If rT.halfword[1]!=0, jump to rA. */
-void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
+void spe_bihnz(struct spe_function *p, int rT, int rA, int d, int e)
 {
    emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
@@ -733,7 +734,7 @@ EMIT_R   (spe_mtspr, 0x10c);
 
 
 void
-spe_load_float(struct spe_function *p, unsigned rT, float x)
+spe_load_float(struct spe_function *p, int rT, float x)
 {
    if (x == 0.0f) {
       spe_il(p, rT, 0x0);
@@ -760,7 +761,7 @@ spe_load_float(struct spe_function *p, unsigned rT, float x)
 
 
 void
-spe_load_int(struct spe_function *p, unsigned rT, int i)
+spe_load_int(struct spe_function *p, int rT, int i)
 {
    if (-32768 <= i && i <= 32767) {
       spe_il(p, rT, i);
@@ -772,7 +773,7 @@ spe_load_int(struct spe_function *p, unsigned rT, int i)
    }
 }
 
-void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
+void spe_load_uint(struct spe_function *p, int rT, uint ui)
 {
    /* If the whole value is in the lower 18 bits, use ila, which
     * doesn't sign-extend.  Otherwise, if the two halfwords of
@@ -793,7 +794,7 @@ void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
       ((ui & 0x00ff0000) == 0 || (ui & 0x00ff0000) == 0x00ff0000) &&
       ((ui & 0xff000000) == 0 || (ui & 0xff000000) == 0xff000000)
    ) {
-      unsigned int mask = 0;
+      uint mask = 0;
       /* fsmbi duplicates each bit in the given mask eight times,
        * using a 16-bit value to initialize a 16-byte quadword.
        * Each 4-bit nybble of the mask corresponds to a full word
@@ -822,7 +823,7 @@ void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
  * Changes to one should be made in the other.
  */
 void
-spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+spe_and_uint(struct spe_function *p, int rT, int rA, uint ui)
 {
    /* If we can, emit a single instruction, either And Byte Immediate
     * (which uses the same constant across each byte), And Halfword Immediate
@@ -832,7 +833,7 @@ spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
     *
     * Otherwise, we'll need to use a temporary register.
     */
-   unsigned int tmp;
+   uint tmp;
 
    /* If the upper 23 bits are all 0s or all 1s, sign extension
     * will work and we can use And Word Immediate
@@ -863,7 +864,7 @@ spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
    }
 
    /* Otherwise, we'll have to use a temporary register. */
-   unsigned int tmp_reg = spe_allocate_available_register(p);
+   int tmp_reg = spe_allocate_available_register(p);
    spe_load_uint(p, tmp_reg, ui);
    spe_and(p, rT, rA, tmp_reg);
    spe_release_register(p, tmp_reg);
@@ -875,7 +876,7 @@ spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
  * Changes to one should be made in the other.
  */
 void
-spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+spe_xor_uint(struct spe_function *p, int rT, int rA, uint ui)
 {
    /* If we can, emit a single instruction, either Exclusive Or Byte 
     * Immediate (which uses the same constant across each byte), Exclusive 
@@ -885,7 +886,7 @@ spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
     *
     * Otherwise, we'll need to use a temporary register.
     */
-   unsigned int tmp;
+   uint tmp;
 
    /* If the upper 23 bits are all 0s or all 1s, sign extension
     * will work and we can use Exclusive Or Word Immediate
@@ -916,14 +917,14 @@ spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
    }
 
    /* Otherwise, we'll have to use a temporary register. */
-   unsigned int tmp_reg = spe_allocate_available_register(p);
+   int tmp_reg = spe_allocate_available_register(p);
    spe_load_uint(p, tmp_reg, ui);
    spe_xor(p, rT, rA, tmp_reg);
    spe_release_register(p, tmp_reg);
 }
 
 void
-spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+spe_compare_equal_uint(struct spe_function *p, int rT, int rA, uint ui)
 {
    /* If the comparison value is 9 bits or less, it fits inside a
     * Compare Equal Word Immediate instruction.
@@ -933,7 +934,7 @@ spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigne
    }
    /* Otherwise, we're going to have to load a word first. */
    else {
-      unsigned int tmp_reg = spe_allocate_available_register(p);
+      int tmp_reg = spe_allocate_available_register(p);
       spe_load_uint(p, tmp_reg, ui);
       spe_ceq(p, rT, rA, tmp_reg);
       spe_release_register(p, tmp_reg);
@@ -941,7 +942,7 @@ spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigne
 }
 
 void
-spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+spe_compare_greater_uint(struct spe_function *p, int rT, int rA, uint ui)
 {
    /* If the comparison value is 10 bits or less, it fits inside a
     * Compare Logical Greater Than Word Immediate instruction.
@@ -951,7 +952,7 @@ spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsig
    }
    /* Otherwise, we're going to have to load a word first. */
    else {
-      unsigned int tmp_reg = spe_allocate_available_register(p);
+      int tmp_reg = spe_allocate_available_register(p);
       spe_load_uint(p, tmp_reg, ui);
       spe_clgt(p, rT, rA, tmp_reg);
       spe_release_register(p, tmp_reg);
@@ -959,10 +960,10 @@ spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsig
 }
 
 void
-spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
+spe_splat(struct spe_function *p, int rT, int rA)
 {
    /* Use a temporary, just in case rT == rA */
-   unsigned int tmp_reg = spe_allocate_available_register(p);
+   int tmp_reg = spe_allocate_available_register(p);
    /* Duplicate bytes 0, 1, 2, and 3 across the whole register */
    spe_ila(p, tmp_reg, 0x00010203);
    spe_shufb(p, rT, rA, rA, tmp_reg);
@@ -971,14 +972,14 @@ spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
 
 
 void
-spe_complement(struct spe_function *p, unsigned rT, unsigned rA)
+spe_complement(struct spe_function *p, int rT, int rA)
 {
    spe_nor(p, rT, rA, rA);
 }
 
 
 void
-spe_move(struct spe_function *p, unsigned rT, unsigned rA)
+spe_move(struct spe_function *p, int rT, int rA)
 {
    /* Use different instructions depending on the instruction address
     * to take advantage of the dual pipelines.
@@ -991,14 +992,14 @@ spe_move(struct spe_function *p, unsigned rT, unsigned rA)
 
 
 void
-spe_zero(struct spe_function *p, unsigned rT)
+spe_zero(struct spe_function *p, int rT)
 {
    spe_xor(p, rT, rT, rT);
 }
 
 
 void
-spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word)
+spe_splat_word(struct spe_function *p, int rT, int rA, int word)
 {
    assert(word >= 0);
    assert(word <= 3);
@@ -1038,9 +1039,9 @@ spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word)
  * like "x = min(x, a)", we always allocate a new register to be safe.
  */
 void 
-spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB)
+spe_float_min(struct spe_function *p, int rT, int rA, int rB)
 {
-   unsigned int compare_reg = spe_allocate_available_register(p);
+   int compare_reg = spe_allocate_available_register(p);
    spe_fcgt(p, compare_reg, rA, rB);
    spe_selb(p, rT, rA, rB, compare_reg);
    spe_release_register(p, compare_reg);
@@ -1055,9 +1056,9 @@ spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB)
  * so that the larger of the two is selected instead of the smaller.
  */
 void 
-spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB)
+spe_float_max(struct spe_function *p, int rT, int rA, int rB)
 {
-   unsigned int compare_reg = spe_allocate_available_register(p);
+   int compare_reg = spe_allocate_available_register(p);
    spe_fcgt(p, compare_reg, rA, rB);
    spe_selb(p, rT, rB, rA, compare_reg);
    spe_release_register(p, compare_reg);
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index f9ad2acacd..65d9c77415 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -79,9 +79,9 @@ struct spe_function
 };
 
 
-extern void spe_init_func(struct spe_function *p, unsigned code_size);
+extern void spe_init_func(struct spe_function *p, uint code_size);
 extern void spe_release_func(struct spe_function *p);
-extern unsigned spe_code_size(const struct spe_function *p);
+extern uint spe_code_size(const struct spe_function *p);
 
 extern int spe_allocate_available_register(struct spe_function *p);
 extern int spe_allocate_register(struct spe_function *p, int reg);
@@ -89,8 +89,7 @@ extern void spe_release_register(struct spe_function *p, int reg);
 extern void spe_allocate_register_set(struct spe_function *p);
 extern void spe_release_register_set(struct spe_function *p);
 
-extern unsigned
-spe_get_registers_used(const struct spe_function *p, ubyte used[]);
+extern uint spe_get_registers_used(const struct spe_function *p, ubyte used[]);
 
 extern void spe_print_code(struct spe_function *p, boolean enable);
 extern void spe_indent(struct spe_function *p, int spaces);
@@ -103,31 +102,25 @@ extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
 #define EMIT(_name, _op) \
     extern void _name (struct spe_function *p);
 #define EMIT_(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT);
+    extern void _name (struct spe_function *p, int rT);
 #define EMIT_R(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, unsigned rA);
+    extern void _name (struct spe_function *p, int rT, int rA);
 #define EMIT_RR(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-                       unsigned rB);
+    extern void _name (struct spe_function *p, int rT, int rA, int rB);
 #define EMIT_RRR(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-                       unsigned rB, unsigned rC);
+    extern void _name (struct spe_function *p, int rT, int rA, int rB, int rC);
 #define EMIT_RI7(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-                       int imm);
+    extern void _name (struct spe_function *p, int rT, int rA, int imm);
 #define EMIT_RI8(_name, _op, bias) \
-    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-                       int imm);
+    extern void _name (struct spe_function *p, int rT, int rA, int imm);
 #define EMIT_RI10(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-                       int imm);
+    extern void _name (struct spe_function *p, int rT, int rA, int imm);
 #define EMIT_RI10s(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
-                       int imm);
+    extern void _name (struct spe_function *p, int rT, int rA, int imm);
 #define EMIT_RI16(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, int imm);
+    extern void _name (struct spe_function *p, int rT, int imm);
 #define EMIT_RI18(_name, _op) \
-    extern void _name (struct spe_function *p, unsigned rT, int imm);
+    extern void _name (struct spe_function *p, int rT, int imm);
 #define EMIT_I16(_name, _op) \
     extern void _name (struct spe_function *p, int imm);
 #define UNDEF_EMIT_MACROS
@@ -301,82 +294,82 @@ EMIT_RI16(spe_brhz,      0x044)
 EMIT     (spe_lnop,      0x001)
 
 extern void
-spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset);
+spe_lqd(struct spe_function *p, int rT, int rA, int offset);
 
 extern void
-spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset);
+spe_stqd(struct spe_function *p, int rT, int rA, int offset);
 
-extern void spe_bi(struct spe_function *p, unsigned rA, int d, int e);
-extern void spe_iret(struct spe_function *p, unsigned rA, int d, int e);
-extern void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA,
+extern void spe_bi(struct spe_function *p, int rA, int d, int e);
+extern void spe_iret(struct spe_function *p, int rA, int d, int e);
+extern void spe_bisled(struct spe_function *p, int rT, int rA,
     int d, int e);
-extern void spe_bisl(struct spe_function *p, unsigned rT, unsigned rA,
+extern void spe_bisl(struct spe_function *p, int rT, int rA,
     int d, int e);
-extern void spe_biz(struct spe_function *p, unsigned rT, unsigned rA,
+extern void spe_biz(struct spe_function *p, int rT, int rA,
     int d, int e);
-extern void spe_binz(struct spe_function *p, unsigned rT, unsigned rA,
+extern void spe_binz(struct spe_function *p, int rT, int rA,
     int d, int e);
-extern void spe_bihz(struct spe_function *p, unsigned rT, unsigned rA,
+extern void spe_bihz(struct spe_function *p, int rT, int rA,
     int d, int e);
-extern void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA,
+extern void spe_bihnz(struct spe_function *p, int rT, int rA,
     int d, int e);
 
 
 /** Load/splat immediate float into rT. */
 extern void
-spe_load_float(struct spe_function *p, unsigned rT, float x);
+spe_load_float(struct spe_function *p, int rT, float x);
 
 /** Load/splat immediate int into rT. */
 extern void
-spe_load_int(struct spe_function *p, unsigned rT, int i);
+spe_load_int(struct spe_function *p, int rT, int i);
 
 /** Load/splat immediate unsigned int into rT. */
 extern void
-spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui);
+spe_load_uint(struct spe_function *p, int rT, uint ui);
 
 /** And immediate value into rT. */
 extern void
-spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+spe_and_uint(struct spe_function *p, int rT, int rA, uint ui);
 
 /** Xor immediate value into rT. */
 extern void
-spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+spe_xor_uint(struct spe_function *p, int rT, int rA, uint ui);
 
 /** Compare equal with immediate value. */
 extern void
-spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+spe_compare_equal_uint(struct spe_function *p, int rT, int rA, uint ui);
 
 /** Compare greater with immediate value. */
 extern void
-spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+spe_compare_greater_uint(struct spe_function *p, int rT, int rA, uint ui);
 
 /** Replicate word 0 of rA across rT. */
 extern void
-spe_splat(struct spe_function *p, unsigned rT, unsigned rA);
+spe_splat(struct spe_function *p, int rT, int rA);
 
 /** rT = complement_all_bits(rA). */
 extern void
-spe_complement(struct spe_function *p, unsigned rT, unsigned rA);
+spe_complement(struct spe_function *p, int rT, int rA);
 
 /** rT = rA. */
 extern void
-spe_move(struct spe_function *p, unsigned rT, unsigned rA);
+spe_move(struct spe_function *p, int rT, int rA);
 
 /** rT = {0,0,0,0}. */
 extern void
-spe_zero(struct spe_function *p, unsigned rT);
+spe_zero(struct spe_function *p, int rT);
 
 /** rT = splat(rA, word) */
 extern void
-spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word);
+spe_splat_word(struct spe_function *p, int rT, int rA, int word);
 
 /** rT = float min(rA, rB) */
 extern void
-spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB);
+spe_float_min(struct spe_function *p, int rT, int rA, int rB);
 
 /** rT = float max(rA, rB) */
 extern void
-spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB);
+spe_float_max(struct spe_function *p, int rT, int rA, int rB);
 
 
 /* Floating-point instructions
-- 
cgit v1.2.3