12 files changed, 595 insertions, 301 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_const.h b/src/gallium/auxiliary/gallivm/lp_bld_const.h
index c749a7a315..69718eb4b3 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_const.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_const.h
@@ -125,4 +125,22 @@ lp_build_const_float(struct gallivm_state *gallivm, float x)
 }
 
 
+/** Return host address 'ptr' as an LLVM value of type pointer-to-(pointer-sized int) */
+static INLINE LLVMValueRef
+lp_build_const_int_pointer(struct gallivm_state *gallivm, const void *ptr)
+{
+   LLVMTypeRef int_type;
+   LLVMValueRef v;
+
+   /* int type large enough to hold a pointer: 8 bits per byte of void* */
+   int_type = LLVMIntTypeInContext(gallivm->context, 8 * sizeof(void *));
+   v = LLVMConstInt(int_type, (uintptr_t) ptr, 0);   /* last arg: no sign extension */
+   v = LLVMBuildIntToPtr(gallivm->builder, v,
+                         LLVMPointerType(int_type, 0),
+                         "cast int to ptr");
+   return v;
+}
+
+
+
 #endif /* !LP_BLD_CONST_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index c43ee8ac63..c261d76116 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -231,44 +231,53 @@ lp_build_unsigned_norm_to_float(struct gallivm_state *gallivm,
 
    assert(dst_type.floating);
 
-   /* Special-case int8->float, though most cases could be handled
-    * this way:
-    */
-   if (src_width == 8) {
-      scale = 1.0/255.0;
+   mantissa = lp_mantissa(dst_type);
+
+   if (src_width <= (mantissa + 1)) {
+      /*
+       * The source width fits what can be represented in floating
+       * point (i.e., mantissa + 1 bits). So do a straight multiplication
+       * followed by casting. No further rounding is necessary.
+       */
+
+      scale = 1.0/(double)((1ULL << src_width) - 1);
       res = LLVMBuildSIToFP(builder, src, vec_type, "");
       res = LLVMBuildFMul(builder, res,
                           lp_build_const_vec(gallivm, dst_type, scale), "");
       return res;
    }
+   else {
+      /*
+       * The source width exceeds what can be represented in floating
+       * point. So truncate the incoming values.
+       */
 
-   mantissa = lp_mantissa(dst_type);
-
-   n = MIN2(mantissa, src_width);
+      n = MIN2(mantissa, src_width);
 
-   ubound = ((unsigned long long)1 << n);
-   mask = ubound - 1;
-   scale = (double)ubound/mask;
-   bias = (double)((unsigned long long)1 << (mantissa - n));
+      ubound = ((unsigned long long)1 << n);
+      mask = ubound - 1;
+      scale = (double)ubound/mask;
+      bias = (double)((unsigned long long)1 << (mantissa - n));
 
-   res = src;
+      res = src;
 
-   if(src_width > mantissa) {
-      int shift = src_width - mantissa;
-      res = LLVMBuildLShr(builder, res,
-                          lp_build_const_int_vec(gallivm, dst_type, shift), "");
-   }
+      if (src_width > mantissa) {
+         int shift = src_width - mantissa;
+         res = LLVMBuildLShr(builder, res,
+                             lp_build_const_int_vec(gallivm, dst_type, shift), "");
+      }
 
-   bias_ = lp_build_const_vec(gallivm, dst_type, bias);
+      bias_ = lp_build_const_vec(gallivm, dst_type, bias);
 
-   res = LLVMBuildOr(builder,
-                     res,
-                     LLVMBuildBitCast(builder, bias_, int_vec_type, ""), "");
+      res = LLVMBuildOr(builder,
+                        res,
+                        LLVMBuildBitCast(builder, bias_, int_vec_type, ""), "");
 
-   res = LLVMBuildBitCast(builder, res, vec_type, "");
+      res = LLVMBuildBitCast(builder, res, vec_type, "");
 
-   res = LLVMBuildFSub(builder, res, bias_, "");
-   res = LLVMBuildFMul(builder, res, lp_build_const_vec(gallivm, dst_type, scale), "");
+      res = LLVMBuildFSub(builder, res, bias_, "");
+      res = LLVMBuildFMul(builder, res, lp_build_const_vec(gallivm, dst_type, scale), "");
+   }
 
    return res;
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.c b/src/gallium/auxiliary/gallivm/lp_bld_debug.c
deleted file mode 100644
index 93e56553d7..0000000000
--- a/src/gallium/auxiliary/gallivm/lp_bld_debug.c
+++ /dev/null
@@ -1,141 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2009 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-#ifdef HAVE_UDIS86
-#include <udis86.h>
-#endif
-
-#include "util/u_math.h"
-#include "util/u_debug.h"
-#include "lp_bld_debug.h"
-
-
-/**
- * Check alignment.
- *
- * It is important that this check is not implemented as a macro or inlined
- * function, as the compiler assumptions in respect to alignment of global
- * and stack variables would often make the check a no op, defeating the
- * whole purpose of the exercise.
- */
-boolean
-lp_check_alignment(const void *ptr, unsigned alignment)
-{
-   assert(util_is_power_of_two(alignment));
-   return ((uintptr_t)ptr & (alignment - 1)) == 0;
-}
-
-
-void
-lp_disassemble(const void* func)
-{
-#ifdef HAVE_UDIS86
-   ud_t ud_obj;
-   uint64_t max_jmp_pc;
-   uint inst_no;
-   boolean emit_addrs = TRUE, emit_line_nos = FALSE;
-
-   ud_init(&ud_obj);
-
-   ud_set_input_buffer(&ud_obj, (void*)func, 0xffff);
-
-   max_jmp_pc = (uint64_t) (uintptr_t) func;
-   ud_set_pc(&ud_obj, max_jmp_pc);
-
-#ifdef PIPE_ARCH_X86
-   ud_set_mode(&ud_obj, 32);
-#endif
-#ifdef PIPE_ARCH_X86_64
-   ud_set_mode(&ud_obj, 64);
-#endif
-
-   ud_set_syntax(&ud_obj, UD_SYN_ATT);
-
-   while (ud_disassemble(&ud_obj)) {
-
-      if (emit_addrs) {
-#ifdef PIPE_ARCH_X86
-         debug_printf("0x%08lx:\t", (unsigned long)ud_insn_off(&ud_obj));
-#endif
-#ifdef PIPE_ARCH_X86_64
-         debug_printf("0x%016llx:\t", (unsigned long long)ud_insn_off(&ud_obj));
-#endif
-      }
-      else if (emit_line_nos) {
-         debug_printf("%6d:\t", inst_no);
-         inst_no++;
-      }
-#if 0
-      debug_printf("%-16s ", ud_insn_hex(&ud_obj));
-#endif
-
-      debug_printf("%s\n", ud_insn_asm(&ud_obj));
-
-      if(ud_obj.mnemonic != UD_Icall) {
-         unsigned i;
-         for(i = 0; i < 3; ++i) {
-            const struct ud_operand *op = &ud_obj.operand[i];
-            if (op->type == UD_OP_JIMM){
-               uint64_t pc = ud_obj.pc;
-
-               switch (op->size) {
-               case 8:
-                  pc += op->lval.sbyte;
-                  break;
-               case 16:
-                  pc += op->lval.sword;
-                  break;
-               case 32:
-                  pc += op->lval.sdword;
-                  break;
-               default:
-                  break;
-               }
-               if(pc > max_jmp_pc)
-                  max_jmp_pc = pc;
-            }
-         }
-      }
-
-      if (ud_obj.mnemonic == UD_Iinvalid ||
-          (ud_insn_off(&ud_obj) >= max_jmp_pc &&
-           (ud_obj.mnemonic == UD_Iret ||
-            ud_obj.mnemonic == UD_Ijmp)))
-         break;
-   }
-
-#if 0
-   /* Print GDB command, useful to verify udis86 output */
-   debug_printf("disassemble %p %p\n", func, (void*)(uintptr_t)ud_obj.pc);
-#endif
-
-   debug_printf("\n");
-#else
-   (void)func;
-#endif
-}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
new file mode 100644
index 0000000000..bb2c82fe0e
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
@@ -0,0 +1,357 @@
+/**************************************************************************
+ *
+ * Copyright 2009-2011 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include <llvm-c/Core.h>
+#include <llvm/Target/TargetMachine.h>
+#include <llvm/Target/TargetRegistry.h>
+#include <llvm/Target/TargetSelect.h>
+#include <llvm/Target/TargetInstrInfo.h>
+#include <llvm/Support/raw_ostream.h>
+#include <llvm/Support/MemoryObject.h>
+#include <llvm/System/Host.h>
+
+#if HAVE_LLVM >= 0x0207
+#include <llvm/MC/MCDisassembler.h>
+#include <llvm/MC/MCAsmInfo.h>
+#include <llvm/MC/MCInst.h>
+#include <llvm/MC/MCInstPrinter.h>
+#endif /* HAVE_LLVM >= 0x0207 */
+
+#include "util/u_math.h"
+#include "util/u_debug.h"
+
+#include "lp_bld_debug.h"
+
+
+
+/**
+ * Return whether ptr is a multiple of alignment (asserted power of two).
+ *
+ * It is important that this check is not implemented as a macro or inlined
+ * function, as the compiler assumptions in respect to alignment of global
+ * and stack variables would often make the check a no op, defeating the
+ * whole purpose of the exercise.
+ */
+extern "C" boolean
+lp_check_alignment(const void *ptr, unsigned alignment)
+{
+   assert(util_is_power_of_two(alignment));
+   return ((uintptr_t)ptr & (alignment - 1)) == 0;
+}
+
+
+/** raw_ostream whose output is handed to write_impl(), which tracks it in pos. */
+class raw_debug_ostream : public llvm::raw_ostream
+{
+   uint64_t pos;   /* number of bytes passed to write_impl() so far */
+   void write_impl(const char *Ptr, size_t Size);
+   uint64_t current_pos() { return pos; }
+   uint64_t current_pos() const { return pos; }
+#if HAVE_LLVM >= 0x0207
+   uint64_t preferred_buffer_size() { return 512; }
+#else
+   size_t preferred_buffer_size() { return 512; }
+#endif
+public:
+   raw_debug_ostream() : pos(0) {}   /* current_pos() must not read garbage */
+};
+
+
+/* Emit Size bytes starting at Ptr through _debug_printf() and advance pos. */
+void
+raw_debug_ostream::write_impl(const char *Ptr, size_t Size)
+{
+   if (Size > 0) {
+      /* Ptr is const and not NUL-terminated, so bound the output with an
+       * int precision ("%.*s") rather than temporarily overwriting the byte
+       * past the end of the buffer with a terminator. */
+      _debug_printf("%.*s", (int)Size, Ptr);
+      pos += Size;
+   }
+}
+
+
+/**
+ * Dump value via raw_debug_ostream on non-MSVC Windows/embedded, else LLVMDumpValue.
+ */
+extern "C" void
+lp_debug_dump_value(LLVMValueRef value)
+{
+#if (defined(PIPE_OS_WINDOWS) && !defined(PIPE_CC_MSVC)) || defined(PIPE_OS_EMBEDDED)
+   raw_debug_ostream os;
+   llvm::unwrap(value)->print(os);
+   os.flush();   /* push anything still buffered out before os is destroyed */
+#else
+   LLVMDumpValue(value);
+#endif
+}
+
+
+#if HAVE_LLVM >= 0x0207
+/*
+ * Read-only MemoryObject view of the first 'length' bytes at 'bytes', with
+ * addresses starting at 0; used to feed the MC disassembler.
+ */
+class BufferMemoryObject:
+   public llvm::MemoryObject
+{
+private:
+   const uint8_t *Bytes;
+   uint64_t Length;
+public:
+   BufferMemoryObject(const uint8_t *bytes, uint64_t length) :
+      Bytes(bytes), Length(length)
+   {
+   }
+
+   uint64_t getBase() const   /* addresses are offsets into Bytes */
+   {
+      return 0;
+   }
+
+   uint64_t getExtent() const   /* number of readable bytes */
+   {
+      return Length;
+   }
+
+   int readByte(uint64_t addr, uint8_t *byte) const   /* 0 on success, -1 if out of range */
+   {
+      if (addr >= getExtent())   /* valid offsets are [0, Length) */
+         return -1;
+      *byte = Bytes[addr];
+      return 0;
+   }
+};
+#endif /* HAVE_LLVM >= 0x0207 */
+
+
+/*
+ * Disassemble a function, using the LLVM MC disassembler.
+ * Output goes through debug_printf(); does nothing when HAVE_LLVM < 0x0207.
+ *
+ * See also:
+ * - http://blog.llvm.org/2010/01/x86-disassembler.html
+ * - http://blog.llvm.org/2010/04/intro-to-llvm-mc-project.html
+ */
+extern "C" void
+lp_disassemble(const void* func)
+{
+#if HAVE_LLVM >= 0x0207
+   using namespace llvm;
+
+   const uint8_t *bytes = (const uint8_t *)func;
+
+   /*
+    * Limit disassembly to this extent
+    */
+   const uint64_t extent = 0x10000;
+
+   uint64_t max_pc = 0;
+
+   /*
+    * Initialize all used objects.
+    */
+
+   std::string Triple = sys::getHostTriple();
+
+   std::string Error;
+   const Target *T = TargetRegistry::lookupTarget(Triple, Error);
+   if (!T) {
+      debug_printf("error: %s\n", Error.c_str());
+      return;
+   }
+
+#if HAVE_LLVM >= 0x0208
+   InitializeNativeTargetAsmPrinter();
+#else
+   InitializeAllAsmPrinters();
+#endif
+
+   InitializeAllDisassemblers();
+
+   OwningPtr<const MCAsmInfo> AsmInfo(T->createAsmInfo(Triple));
+
+   if (!AsmInfo) {
+      debug_printf("error: no assembly info for target %s\n", Triple.c_str());
+      return;
+   }
+
+   OwningPtr<const MCDisassembler> DisAsm(T->createMCDisassembler());
+   if (!DisAsm) {
+      debug_printf("error: no disassembler for target %s\n", Triple.c_str());
+      return;
+   }
+
+   raw_debug_ostream Out;
+
+   int AsmPrinterVariant = AsmInfo->getAssemblerDialect();
+#if HAVE_LLVM >= 0x0208
+   OwningPtr<MCInstPrinter> Printer(
+         T->createMCInstPrinter(AsmPrinterVariant, *AsmInfo));
+#else
+   OwningPtr<MCInstPrinter> Printer(
+         T->createMCInstPrinter(AsmPrinterVariant, *AsmInfo, Out));
+#endif
+   if (!Printer) {
+      debug_printf("error: no instruction printer for target %s\n", Triple.c_str());
+      return;
+   }
+
+   /* Owned here so that it is released on every exit path. */
+   OwningPtr<TargetMachine> TM(T->createTargetMachine(Triple, ""));
+   if (!TM) {
+      debug_printf("error: no target machine for target %s\n", Triple.c_str());
+      return;
+   }
+
+   const TargetInstrInfo *TII = TM->getInstrInfo();
+
+   /*
+    * Wrap the data in a MemoryObject
+    */
+   BufferMemoryObject memoryObject((const uint8_t *)bytes, extent);
+
+   uint64_t pc = 0;
+   while (pc < extent) {
+      MCInst Inst;
+      uint64_t Size;
+
+      /*
+       * Print address.  We use addresses relative to the start of the function,
+       * so that the output is comparable between runs.
+       */
+
+      debug_printf("%6lu:\t", (unsigned long)pc);
+
+      if (!DisAsm->getInstruction(Inst, Size, memoryObject,
+                                 pc,
+                                 nulls())) {
+         /*
+          * Nothing was decoded: skip one byte and retry, rather than printing
+          * and classifying an empty MCInst and advancing by a stale Size.
+          */
+         debug_printf("invalid\n");
+         pc += 1;
+         continue;
+      }
+
+      /*
+       * Output the bytes in hexadecimal format.
+       */
+
+      if (0) {
+         unsigned i;
+         for (i = 0; i < Size; ++i) {
+            debug_printf("%02x ", ((const uint8_t*)bytes)[pc + i]);
+         }
+         for (; i < 16; ++i) {
+            debug_printf("   ");
+         }
+      }
+
+      /*
+       * Print the instruction.
+       */
+
+#if HAVE_LLVM >= 0x0208
+      Printer->printInst(&Inst, Out);
+#else
+      Printer->printInst(&Inst);
+#endif
+      Out.flush();
+
+      /*
+       * Advance.
+       */
+
+      pc += Size;
+
+      const TargetInstrDesc &TID = TII->get(Inst.getOpcode());
+
+      /*
+       * Keep track of forward jumps to a nearby address.
+       */
+
+      if (TID.isBranch()) {
+         for (unsigned i = 0; i < Inst.getNumOperands(); ++i) {
+            const MCOperand &operand = Inst.getOperand(i);
+            if (operand.isImm()) {
+               /*
+                * FIXME: Handle both relative and absolute addresses correctly.
+                * EDInstInfo actually has this info, but operandTypes and
+                * operandFlags enums are not exposed in the public interface.
+                * Until then, every immediate is treated as PC relative.
+                */
+               uint64_t jump = pc + operand.getImm();
+
+               /*
+                * Output the address relative to the function start, given
+                * that MC will print the addresses relative to the current pc.
+                */
+               debug_printf("\t\t; %lu", (unsigned long)jump);
+
+               /*
+                * Ignore far jumps given it could be actually a tail return to
+                * a random address.
+                */
+
+               if (jump > max_pc &&
+                   jump < extent) {
+                  max_pc = jump;
+               }
+            }
+         }
+      }
+
+      debug_printf("\n");
+
+      /*
+       * Stop disassembling on return statements, if there is no record of a
+       * jump to a successive address.
+       */
+
+      if (TID.isReturn()) {
+         if (pc > max_pc) {
+            break;
+         }
+      }
+   }
+
+   /*
+    * Print GDB command, useful to verify output.
+    */
+
+   if (0) {
+      debug_printf("disassemble %p %p\n", bytes, bytes + pc);
+   }
+
+   debug_printf("\n");
+#else /* HAVE_LLVM < 0x0207 */
+   (void)func;
+#endif /* HAVE_LLVM < 0x0207 */
+}
+
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.h b/src/gallium/auxiliary/gallivm/lp_bld_debug.h
index 8a58f95b78..da873f30b2 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_debug.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.h
@@ -45,6 +45,11 @@
 #define GALLIVM_DEBUG_GC            (1 << 6)
 
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
 #ifdef DEBUG
 extern unsigned gallivm_debug;
 #else
@@ -81,4 +86,9 @@ void
 lp_disassemble(const void* func);
 
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif /* !LP_BLD_DEBUG_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index 75d2e666f0..82ab19eda1 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -36,6 +36,7 @@
 #include "util/u_format.h"
 #include "util/u_memory.h"
 #include "util/u_math.h"
+#include "util/u_pointer.h"
 #include "util/u_string.h"
 
 #include "lp_bld_arit.h"
@@ -511,8 +512,6 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
        * or incentive to optimize.
        */
 
-      LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(gallivm->builder)));
-      char name[256];
       LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
       LLVMTypeRef pi8t = LLVMPointerType(i8t, 0);
       LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
@@ -522,19 +521,20 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
       LLVMValueRef res;
       unsigned k;
 
-      util_snprintf(name, sizeof name, "util_format_%s_fetch_rgba_8unorm",
-                    format_desc->short_name);
-
       if (gallivm_debug & GALLIVM_DEBUG_PERF) {
-         debug_printf("%s: falling back to %s\n", __FUNCTION__, name);
+         debug_printf("%s: falling back to util_format_%s_fetch_rgba_8unorm\n",
+                      __FUNCTION__, format_desc->short_name);
       }
 
       /*
        * Declare and bind format_desc->fetch_rgba_8unorm().
        */
 
-      function = LLVMGetNamedFunction(module, name);
-      if (!function) {
+      {
+         /*
+          * Function to call looks like:
+          *   fetch(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
+          */
          LLVMTypeRef ret_type;
          LLVMTypeRef arg_types[4];
          LLVMTypeRef function_type;
@@ -542,17 +542,19 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
          ret_type = LLVMVoidTypeInContext(gallivm->context);
          arg_types[0] = pi8t;
          arg_types[1] = pi8t;
-         arg_types[3] = arg_types[2] = LLVMIntTypeInContext(gallivm->context, sizeof(unsigned) * 8);
-         function_type = LLVMFunctionType(ret_type, arg_types, Elements(arg_types), 0);
-         function = LLVMAddFunction(module, name, function_type);
-
-         LLVMSetFunctionCallConv(function, LLVMCCallConv);
-         LLVMSetLinkage(function, LLVMExternalLinkage);
-
-         assert(LLVMIsDeclaration(function));
-
-         LLVMAddGlobalMapping(gallivm->engine, function,
-                              func_to_pointer((func_pointer)format_desc->fetch_rgba_8unorm));
+         arg_types[2] = i32t;
+         arg_types[3] = i32t;
+         function_type = LLVMFunctionType(ret_type, arg_types,
+                                          Elements(arg_types), 0);
+
+         /* make const pointer for the C fetch_rgba_8unorm function */
+         function = lp_build_const_int_pointer(gallivm,
+            func_to_pointer((func_pointer) format_desc->fetch_rgba_8unorm));
+
+         /* cast the callee pointer to the function's type */
+         function = LLVMBuildBitCast(builder, function,
+                                     LLVMPointerType(function_type, 0),
+                                     "cast callee");
       }
 
       tmp_ptr = lp_build_alloca(gallivm, i32t, "");
@@ -614,48 +616,55 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
        * or incentive to optimize.
        */
 
-      LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder)));
-      char name[256];
       LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context);
       LLVMTypeRef f32x4t = LLVMVectorType(f32t, 4);
       LLVMTypeRef pf32t = LLVMPointerType(f32t, 0);
+      LLVMTypeRef pi8t = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0);
+      LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
       LLVMValueRef function;
       LLVMValueRef tmp_ptr;
       LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
       LLVMValueRef res;
       unsigned k;
 
-      util_snprintf(name, sizeof name, "util_format_%s_fetch_rgba_float",
-                    format_desc->short_name);
-
       if (gallivm_debug & GALLIVM_DEBUG_PERF) {
-         debug_printf("%s: falling back to %s\n", __FUNCTION__, name);
+         debug_printf("%s: falling back to util_format_%s_fetch_rgba_float\n",
+                      __FUNCTION__, format_desc->short_name);
       }
 
       /*
        * Declare and bind format_desc->fetch_rgba_float().
        */
 
-      function = LLVMGetNamedFunction(module, name);
-      if (!function) {
+      {
+         /*
+          * Function to call looks like:
+          *   fetch(float *dst, const uint8_t *src, unsigned i, unsigned j)
+          */
          LLVMTypeRef ret_type;
          LLVMTypeRef arg_types[4];
          LLVMTypeRef function_type;
 
          ret_type = LLVMVoidTypeInContext(gallivm->context);
          arg_types[0] = pf32t;
-         arg_types[1] = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0);
-         arg_types[3] = arg_types[2] = LLVMIntTypeInContext(gallivm->context, sizeof(unsigned) * 8);
-         function_type = LLVMFunctionType(ret_type, arg_types, Elements(arg_types), 0);
-         function = LLVMAddFunction(module, name, function_type);
+         arg_types[1] = pi8t;
+         arg_types[2] = i32t;
+         arg_types[3] = i32t;
+         function_type = LLVMFunctionType(ret_type, arg_types,
+                                          Elements(arg_types), 0);
 
-         LLVMSetFunctionCallConv(function, LLVMCCallConv);
-         LLVMSetLinkage(function, LLVMExternalLinkage);
+         /* Note: we're using this casting here instead of LLVMAddGlobalMapping()
+          * to work around a bug in LLVM 2.6, and for efficiency/simplicity.
+          */
 
-         assert(LLVMIsDeclaration(function));
+         /* make const pointer for the C fetch_rgba_float function */
+         function = lp_build_const_int_pointer(gallivm,
+            func_to_pointer((func_pointer) format_desc->fetch_rgba_float));
 
-         LLVMAddGlobalMapping(gallivm->engine, function,
-                              func_to_pointer((func_pointer)format_desc->fetch_rgba_float));
+         /* cast the callee pointer to the function's type */
+         function = LLVMBuildBitCast(builder, function,
+                                     LLVMPointerType(function_type, 0),
+                                     "cast callee");
       }
 
       tmp_ptr = lp_build_alloca(gallivm, f32x4t, "");
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index efe8d38b8f..45addee8fa 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -30,6 +30,7 @@
 #include "util/u_cpu_detect.h"
 #include "util/u_debug.h"
 #include "util/u_memory.h"
+#include "util/u_simple_list.h"
 #include "lp_bld_debug.h"
 #include "lp_bld_init.h"
 
@@ -221,11 +222,12 @@ free_gallivm_state(struct gallivm_state *gallivm)
 static boolean
 init_gallivm_state(struct gallivm_state *gallivm)
 {
-   assert(gallivm_initialized);
    assert(!gallivm->context);
    assert(!gallivm->module);
    assert(!gallivm->provider);
 
+   lp_build_init();
+
    gallivm->context = LLVMContextCreate();
    if (!gallivm->context)
       goto fail;
@@ -291,12 +293,12 @@ struct callback
 {
    garbage_collect_callback_func func;
    void *cb_data;
+   struct callback *prev, *next;
 };
 
 
-#define MAX_CALLBACKS 32
-static struct callback Callbacks[MAX_CALLBACKS];
-static unsigned NumCallbacks = 0;
+/** list of all garbage collector callbacks */
+static struct callback callback_list = {NULL, NULL, NULL, NULL};
 
 
 /**
@@ -307,20 +309,24 @@ void
 gallivm_register_garbage_collector_callback(garbage_collect_callback_func func,
                                             void *cb_data)
 {
-   unsigned i;
+   struct callback *cb;
+
+   if (!callback_list.prev) {
+      make_empty_list(&callback_list);
+   }
 
-   for (i = 0; i < NumCallbacks; i++) {
-      if (Callbacks[i].func == func && Callbacks[i].cb_data == cb_data) {
-         /* already in list: no-op */
+   /* see if already in list */
+   foreach(cb, &callback_list) {
+      if (cb->func == func && cb->cb_data == cb_data)
          return;
-      }
    }
 
-   assert(NumCallbacks < MAX_CALLBACKS);
-   if (NumCallbacks < MAX_CALLBACKS) {
-      Callbacks[NumCallbacks].func = func;
-      Callbacks[NumCallbacks].cb_data = cb_data;
-      NumCallbacks++;
+   /* add to list */
+   cb = CALLOC_STRUCT(callback);
+   if (cb) {
+      cb->func = func;
+      cb->cb_data = cb_data;
+      insert_at_head(&callback_list, cb);
    }
 }
 
@@ -332,15 +338,13 @@ void
 gallivm_remove_garbage_collector_callback(garbage_collect_callback_func func,
                                           void *cb_data)
 {
-   unsigned i;
-
-   for (i = 0; i < NumCallbacks; i++) {
-      if (Callbacks[i].func == func && Callbacks[i].cb_data == cb_data) {
-         /* found, now remove it */
-         NumCallbacks--;
-         for ( ; i < NumCallbacks; i++) {
-            Callbacks[i] = Callbacks[i + 1];
-         }
+   struct callback *cb;
+
+   /* search list */
+   foreach(cb, &callback_list) {
+      if (cb->func == func && cb->cb_data == cb_data) {
+         /* found, remove it */
+         remove_from_list(cb);
          return;
       }
    }
@@ -354,10 +358,9 @@ gallivm_remove_garbage_collector_callback(garbage_collect_callback_func func,
 static void
 call_garbage_collector_callbacks(void)
 {
-   unsigned i;
-
-   for (i = 0; i < NumCallbacks; i++) {
-      Callbacks[i].func(Callbacks[i].cb_data);
+   struct callback *cb = callback_list.next;   /* NULL until first registration */
+   for (; cb && cb != &callback_list; cb = cb->next) {
+      cb->func(cb->cb_data);
    }
 }
 
@@ -385,6 +388,9 @@ gallivm_garbage_collect(struct gallivm_state *gallivm)
 void
 lp_build_init(void)
 {
+   if (gallivm_initialized)
+      return;
+
 #ifdef DEBUG
    gallivm_debug = debug_get_option_gallivm_debug();
 #endif
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
index f56ddee7fd..843a14a500 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
@@ -46,66 +46,6 @@
 #include "util/u_debug.h"
 
 
-#if (defined(PIPE_OS_WINDOWS) && !defined(PIPE_CC_MSVC)) || defined(PIPE_OS_EMBDDED)
-
-#include "llvm/Support/raw_ostream.h"
-
-class raw_debug_ostream :
-   public llvm::raw_ostream
-{
-   uint64_t pos;
-
-   void write_impl(const char *Ptr, size_t Size);
-   uint64_t current_pos() { return pos; }
-   uint64_t current_pos() const { return pos; }
-
-#if HAVE_LLVM >= 0x207
-   uint64_t preferred_buffer_size() { return 512; }
-#else
-   size_t preferred_buffer_size() { return 512; }
-#endif
-};
-
-
-void
-raw_debug_ostream::write_impl(const char *Ptr, size_t Size)
-{
-   if (Size > 0) {
-      char *lastPtr = (char *)&Ptr[Size];
-      char last = *lastPtr;
-      *lastPtr = 0;
-      _debug_printf("%*s", Size, Ptr);
-      *lastPtr = last;
-      pos += Size;
-   }
-}
-
-
-/**
- * Same as LLVMDumpValue, but through our debugging channels.
- */
-extern "C" void
-lp_debug_dump_value(LLVMValueRef value)
-{
-   raw_debug_ostream os;
-   llvm::unwrap(value)->print(os);
-   os.flush();
-}
-
-
-#else
-
-
-extern "C" void
-lp_debug_dump_value(LLVMValueRef value)
-{
-   LLVMDumpValue(value);
-}
-
-
-#endif
-
-
 /**
  * Register the engine with oprofile.
  *
@@ -144,6 +84,7 @@ lp_set_target_options(void)
    llvm::UnsafeFPMath = true;
 #endif
 
+#if HAVE_LLVM < 0x0209
    /*
     * LLVM will generate MMX instructions for vectors <= 64 bits, leading to
     * innefficient code, and in 32bit systems, to the corruption of the FPU
@@ -162,6 +103,7 @@ lp_set_target_options(void)
       llvm::cl::ParseCommandLineOptions(2, const_cast<char**>(options));
       first = FALSE;
    }
+#endif
 
    /*
     * By default LLVM adds a signal handler to output a pretty stack trace.
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
index 991f6fa5ef..e61cf9541e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
@@ -1098,6 +1098,4 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
       texel_out[2] = unswizzled[2];
       texel_out[3] = unswizzled[3];
    }
-
-   apply_sampler_swizzle(bld, texel_out);
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index cf46e2be83..9961ba08f3 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -187,8 +187,6 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
                                            border_chan_vec, texel_out[chan]);
       }
    }
-
-   apply_sampler_swizzle(bld, texel_out);
 }
 
 
@@ -834,14 +832,14 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
                        LLVMValueRef *colors_out)
 {
    LLVMBuilderRef builder = bld->gallivm->builder;
-   LLVMValueRef size0;
-   LLVMValueRef size1;
-   LLVMValueRef row_stride0_vec;
-   LLVMValueRef row_stride1_vec;
-   LLVMValueRef img_stride0_vec;
-   LLVMValueRef img_stride1_vec;
-   LLVMValueRef data_ptr0;
-   LLVMValueRef data_ptr1;
+   LLVMValueRef size0 = NULL;
+   LLVMValueRef size1 = NULL;
+   LLVMValueRef row_stride0_vec = NULL;
+   LLVMValueRef row_stride1_vec = NULL;
+   LLVMValueRef img_stride0_vec = NULL;
+   LLVMValueRef img_stride1_vec = NULL;
+   LLVMValueRef data_ptr0 = NULL;
+   LLVMValueRef data_ptr1 = NULL;
    LLVMValueRef colors0[4], colors1[4];
    unsigned chan;
 
@@ -1110,6 +1108,11 @@ lp_build_sample_compare(struct lp_build_sample_context *bld,
                       coord, tex);
    }
 
+   /* Clamp p coords to [0,1] */
+   p = lp_build_clamp(&bld->coord_bld, p,
+                      bld->coord_bld.zero,
+                      bld->coord_bld.one);
+
    /* result = (p FUNC texel) ? 1 : 0 */
    res = lp_build_cmp(texel_bld, bld->static_state->compare_func,
                       p, texel[chan]);
@@ -1268,4 +1271,6 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
    }
 
    lp_build_sample_compare(&bld, r, texel_out);
+
+   apply_sampler_swizzle(&bld, texel_out);
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
index 40186befb9..9713d10048 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -180,6 +180,7 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
                   struct lp_type type,
                   struct lp_build_mask_context *mask,
                   LLVMValueRef consts_ptr,
+                  LLVMValueRef system_values_array,
                   const LLVMValueRef *pos,
                   const LLVMValueRef (*inputs)[4],
                   LLVMValueRef (*outputs)[4],
@@ -199,4 +200,11 @@ lp_build_tgsi_aos(struct gallivm_state *gallivm,
                   const struct tgsi_shader_info *info);
 
 
+LLVMValueRef
+lp_build_system_values_array(struct gallivm_state *gallivm,
+                             const struct tgsi_shader_info *info,
+                             LLVMValueRef instance_id, /* integer; converted to float */
+                             LLVMValueRef facing);     /* not read by the definition yet */
+
+
 #endif /* LP_BLD_TGSI_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 1b5a8a5903..d1585c8e2b 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -157,6 +157,8 @@ struct lp_build_tgsi_soa_context
     */
    LLVMValueRef inputs_array;
 
+   LLVMValueRef system_values_array;
+
    const struct tgsi_shader_info *info;
    /** bitmask indicating which register files are accessed indirectly */
    unsigned indirect_files;
@@ -759,6 +761,23 @@ emit_fetch(
       }
       break;
 
+   case TGSI_FILE_SYSTEM_VALUE:
+      assert(!reg->Register.Indirect);
+      {
+         LLVMValueRef index;  /* index into the system value array */
+         LLVMValueRef scalar, scalar_ptr;
+
+         index = lp_build_const_int32(gallivm,
+                                      reg->Register.Index * 4 + swizzle);
+
+         scalar_ptr = LLVMBuildGEP(builder, bld->system_values_array,
+                                   &index, 1, "");
+         scalar = LLVMBuildLoad(builder, scalar_ptr, "");
+
+         res = lp_build_broadcast_scalar(&bld->base, scalar);
+      }
+      break;
+
    default:
       assert(0 && "invalid src register in emit_fetch()");
       return bld->base.undef;
@@ -2322,6 +2341,7 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
                   struct lp_type type,
                   struct lp_build_mask_context *mask,
                   LLVMValueRef consts_ptr,
+                  LLVMValueRef system_values_array,
                   const LLVMValueRef *pos,
                   const LLVMValueRef (*inputs)[NUM_CHANNELS],
                   LLVMValueRef (*outputs)[NUM_CHANNELS],
@@ -2411,6 +2431,8 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
       }
    }
 
+   bld.system_values_array = system_values_array;
+
    tgsi_parse_init( &parse, tokens );
 
    while( !tgsi_parse_end_of_tokens( &parse ) ) {
@@ -2512,3 +2534,54 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
    FREE( bld.instructions );
 }
 
+
+/**
+ * Build up the system values array out of individual values such as
+ * the instance ID, front-face, primitive ID, etc.  The shader info is
+ * used to determine which system values are needed and where to put
+ * them in the system values array.
+ *
+ * XXX only instance ID is implemented; 'facing' is accepted but unused.
+ *
+ * The system values register file is similar to the constants buffer.
+ * Example declaration:
+ *    DCL SV[0], INSTANCEID
+ * Example instruction:
+ *    MOV foo, SV[0].xxxx;
+ *
+ * \return  LLVM float array (float [][4]); only element 0 of each row is set
+ */
+LLVMValueRef
+lp_build_system_values_array(struct gallivm_state *gallivm,
+                             const struct tgsi_shader_info *info,
+                             LLVMValueRef instance_id,
+                             LLVMValueRef facing)
+{
+   LLVMValueRef size = lp_build_const_int32(gallivm, 4 * info->num_system_values);
+   LLVMTypeRef float_t = LLVMFloatTypeInContext(gallivm->context);
+   LLVMValueRef array = lp_build_array_alloca(gallivm, float_t,
+                                              size, "sysvals_array");
+   unsigned i;
+
+   for (i = 0; i < info->num_system_values; i++) {
+      LLVMValueRef index = lp_build_const_int32(gallivm, i * 4); /* SV[i].x */
+      LLVMValueRef ptr, value;
+
+      switch (info->system_value_semantic_name[i]) {
+      case TGSI_SEMANTIC_INSTANCEID:
+         /* convert instance ID from int to float */
+         value = LLVMBuildSIToFP(gallivm->builder, instance_id, float_t,
+                                 "sysval_instanceid");
+         break;
+      case TGSI_SEMANTIC_FACE:  /* not implemented yet: fall-through */
+      default:
+         assert(0 && "unexpected semantic in build_system_values_array()");
+         continue;  /* asserts disabled: 'value' is unset, skip the store */
+      }
+
+      ptr = LLVMBuildGEP(gallivm->builder, array, &index, 1, "");
+      LLVMBuildStore(gallivm->builder, value, ptr);
+   }
+
+   return array;
+}