gallivm: faster iround implementation for sse2

sse2 supports round to nearest directly (or rather, assuming default nearest rounding mode in MXCSR). Use intrinsic to use this rather than round (sse41) or bit manipulation whenever possible.
author: Roland Scheidegger <sroland@vmware.com> 2010-10-08 18:38:25 +0200
committer: Roland Scheidegger <sroland@vmware.com> 2010-10-09 00:36:37 +0200
commit: cb3af2b43439088fc0be0085df8b1ee1a91f33c7 (patch)
tree: ba1562bd09e7539a9bfc4ac5bffbc62c0684d606
parent: 0ed8c56bfe4ff5e3d451ec4791ee2cd64fb3daf2 (diff)
1 files changed, 53 insertions, 1 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 4f108f6e81..6ab13506e1 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -1053,6 +1053,54 @@ lp_build_round_sse41(struct lp_build_context *bld,
 }
 
 
+static INLINE LLVMValueRef
+lp_build_iround_nearest_sse2(struct lp_build_context *bld,
+                             LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef i32t = LLVMInt32Type();
+   LLVMTypeRef ret_type = lp_build_int_vec_type(type);
+   const char *intrinsic;
+   LLVMValueRef res;
+
+   assert(type.floating);
+   /* using the double precision conversions is a bit more complicated */
+   assert(type.width == 32);
+
+   assert(lp_check_value(type, a));
+   assert(util_cpu_caps.has_sse2);
+
+   /* This is relying on MXCSR rounding mode, which should always be nearest. */
+   if (type.length == 1) {
+      LLVMTypeRef vec_type;
+      LLVMValueRef undef;
+      LLVMValueRef arg;
+      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
+
+      vec_type = LLVMVectorType(bld->elem_type, 4);
+
+      intrinsic = "llvm.x86.sse.cvtss2si";
+
+      undef = LLVMGetUndef(vec_type);
+
+      arg = LLVMBuildInsertElement(bld->builder, undef, a, index0, "");
+
+      res = lp_build_intrinsic_unary(bld->builder, intrinsic,
+                                     ret_type, arg);
+   }
+   else {
+      assert(type.width*type.length == 128);
+
+      intrinsic = "llvm.x86.sse2.cvtps2dq";
+
+      res = lp_build_intrinsic_unary(bld->builder, intrinsic,
+                                     ret_type, a);
+   }
+
+   return res;
+}
+
+
 /**
  * Return the integer part of a float (vector) value (== round toward zero).
  * The returned value is a float (vector).
@@ -1217,7 +1265,11 @@ lp_build_iround(struct lp_build_context *bld,
 
    assert(lp_check_value(type, a));
 
-   if (util_cpu_caps.has_sse4_1 &&
+   if (util_cpu_caps.has_sse2 &&
+       ((type.width == 32) && (type.length == 1 || type.length == 4))) {
+      return lp_build_iround_nearest_sse2(bld, a);
+   }
+   else if (util_cpu_caps.has_sse4_1 &&
        (type.length == 1 || type.width*type.length == 128)) {
       res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
    }
author	Roland Scheidegger <sroland@vmware.com>	2010-10-08 18:38:25 +0200
committer	Roland Scheidegger <sroland@vmware.com>	2010-10-09 00:36:37 +0200
commit	cb3af2b43439088fc0be0085df8b1ee1a91f33c7 (patch)
tree	ba1562bd09e7539a9bfc4ac5bffbc62c0684d606
parent	0ed8c56bfe4ff5e3d451ec4791ee2cd64fb3daf2 (diff)