Diffstat (limited to 'src/mesa/pipe')
-rw-r--r--   src/mesa/pipe/cell/spu/spu_vertex_fetch.c    484
-rw-r--r--   src/mesa/pipe/cell/spu/spu_vertex_shader.h     3
2 files changed, 368 insertions, 119 deletions
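
The change replaces the old per-vertex DMA (an mfc_get into a 32-byte staging buffer) with quadword reads through a small software-managed cache (cache-api.h), and replaces per-component scalar conversion with whole-quadword SIMD conversion. Since SPU loads are always 16-byte aligned, the new fetch_unaligned() builds an unaligned read from the aligned quadwords that straddle the address, byte-shifting them together with spu_slqwbyte()/spu_rlmaskqwbyte(). Below is a minimal host-side sketch of that realignment in plain C; read_unaligned_16 and every name in it are invented for illustration and are not part of the patch.

/* Sketch: emulate the SPU's two-aligned-loads-plus-byte-shift trick in
 * plain C.  out[0..15] receives the 16 bytes starting at unaligned
 * offset 'ea' within 'mem'. */
#include <stdint.h>
#include <stdio.h>

static void read_unaligned_16(const uint8_t *mem, unsigned ea, uint8_t out[16])
{
   const unsigned shift = ea & 0x0f;          /* byte offset into a quadword */
   const uint8_t *lo = mem + (ea & ~0x0fu);   /* first straddled quadword */
   const uint8_t *hi = lo + 16;               /* quadword after it */
   unsigned i;

   /* Byte-level equivalent of
    * spu_slqwbyte(lo, shift) | spu_rlmaskqwbyte(hi, shift - 16):
    * the low quadword supplies the first 16-shift bytes, the high one the
    * rest.  When shift == 0 the high quadword is never read, matching the
    * aligned fast path in fetch_unaligned(). */
   for (i = 0; i < 16; i++)
      out[i] = (i < 16 - shift) ? lo[i + shift] : hi[i - (16 - shift)];
}

int main(void)
{
   uint8_t mem[32], out[16];
   unsigned i;

   for (i = 0; i < 32; i++)
      mem[i] = (uint8_t) i;

   read_unaligned_16(mem, 5, out);   /* fetch bytes 5..20 */
   for (i = 0; i < 16; i++)
      printf("%u ", out[i]);
   printf("\n");
   return 0;
}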
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
index ec10bb99df..f6ffcae90e 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -40,41 +40,83 @@
 #include "spu_vertex_shader.h"
 #include "spu_main.h"
 
+#define CACHE_NAME            attribute
+#define CACHED_TYPE           qword
+#define CACHE_TYPE            CACHE_TYPE_RO
+#define CACHE_SET_TAGID(set)  TAG_VERTEX_BUFFER
+#define CACHE_LOG2NNWAY       2
+#define CACHE_LOG2NSETS       6
+#include <cache-api.h>
+
+/* Yes folks, this is ugly.
+ */
+#undef CACHE_NWAY
+#undef CACHE_NSETS
+#define CACHE_NAME            attribute
+#define CACHE_NWAY            4
+#define CACHE_NSETS           (1U << 6)
+
 #define DRAW_DBG 0
 
 static const vec_float4 defaults = { 0.0, 0.0, 0.0, 1.0 };
 
-static INLINE qword
-fetch_unaligned_qword(const void *ptr)
+/**
+ * Fetch between 1 and 32 bytes from an unaligned address
+ */
+static INLINE void
+fetch_unaligned(qword *dst, unsigned ea, unsigned size)
 {
-    const int shift = (unsigned)(ptr) & 0x0f;
-    const qword x = *(qword *)(ptr);
-    const qword y = *(qword *)(ptr + 16);
+   qword tmp[4];
+   const int shift = ea & 0x0f;
+   const unsigned aligned_start_ea = ea & ~0x0f;
+   const unsigned aligned_end_ea = (ea + size) & ~0x0f;
+   const unsigned num_entries = ((aligned_end_ea - aligned_start_ea) / 16) + 1;
+   unsigned i;
+
-    return si_or((qword) spu_slqwbyte(x, shift),
-                 (qword) spu_rlmaskqwbyte(y, shift - 16));
+   if (shift == 0) {
+      /* Data is already aligned.  Fetch directly into the destination buffer.
+       */
+      for (i = 0; i < num_entries; i++) {
+         dst[i] = cache_rd(attribute, (ea & ~0x0f) + (i * 16));
+      }
+   } else {
+      /* Fetch data from the cache to the local buffer.
+       */
+      for (i = 0; i < num_entries; i++) {
+         tmp[i] = cache_rd(attribute, (ea & ~0x0f) + (i * 16));
+      }
+
+
+      /* Fix the alignment of the data and write to the destination buffer.
+       */
+      for (i = 0; i < ((size + 15) / 16); i++) {
+         dst[i] = si_or((qword) spu_slqwbyte(tmp[i], shift),
+                        (qword) spu_rlmaskqwbyte(tmp[i + 1], shift - 16));
+      }
+   }
 }
 
 static qword
-fetch_R32G32B32A32_FLOAT(const void *ptr)
+fetch_R32G32B32A32_FLOAT(const qword *qw)
 {
-    return fetch_unaligned_qword(ptr);
+    return *qw;
 }
 
 static qword
-fetch_R32G32B32A32_USCALED(const void *ptr)
+fetch_R32G32B32A32_USCALED(const qword *qw)
 {
-    return si_cuflt(fetch_unaligned_qword(ptr), 0);
+    return si_cuflt(*qw, 0);
 }
 
 static qword
-fetch_R32G32B32A32_UNORM(const void *ptr)
+fetch_R32G32B32A32_UNORM(const qword *qw)
 {
-    qword x = si_cuflt(fetch_unaligned_qword(ptr), 0);
+    qword x = si_cuflt(*qw, 0);
     vec_float4 scale = spu_splats(1.0f / 255.0f);
 
     return si_fm(x, (qword) scale);
@@ -82,11 +124,146 @@ fetch_R32G32B32A32_UNORM(const void *ptr)
 
 static qword
-fetch_R32G32B32A32_SSCALED(const void *ptr)
+fetch_R32G32B32A32_SSCALED(const qword *qw)
+{
+    return si_csflt(*qw, 0);
+}
+
+
+#define CVT_32_FLOAT(q)    (*q)
+
+static INLINE qword
+CVT_64_FLOAT(const qword *qw)
+{
+   qword shuf_first = (qword) {
+      0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+      0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+   };
+
+   qword a = si_frds(qw[0]);
+   qword b = si_frds(si_rotqbyi(qw[0], 8));
+   qword c = si_frds(qw[1]);
+   qword d = si_frds(si_rotqbyi(qw[1], 8));
+
+   qword ab = si_shufb(a, b, shuf_first);
+   qword cd = si_shufb(c, d, si_rotqbyi(shuf_first, 8));
+
+   return si_or(ab, cd);
+}
+
+
+static INLINE qword
+CVT_8_USCALED(const qword *qw)
+{
+   qword shuffle = (qword) {
+      0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80,
+      0x02, 0x80, 0x80, 0x80, 0x03, 0x80, 0x80, 0x80,
+   };
+
+   return si_cuflt(si_shufb(*qw, *qw, shuffle), 0);
+}
+
+
+static INLINE qword
+CVT_16_USCALED(const qword *qw)
+{
+   qword shuffle = (qword) {
+      0x00, 0x01, 0x80, 0x80, 0x02, 0x03, 0x80, 0x80,
+      0x04, 0x05, 0x80, 0x80, 0x06, 0x07, 0x80, 0x80,
+   };
+
+   return si_cuflt(si_shufb(*qw, *qw, shuffle), 0);
+}
+
+
+static INLINE qword
+CVT_32_USCALED(const qword *qw)
+{
+   return si_cuflt(*qw, 0);
+}
+
+
+static INLINE qword
+CVT_8_SSCALED(const qword *qw)
+{
+   qword shuffle = (qword) {
+      0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80,
+      0x02, 0x80, 0x80, 0x80, 0x03, 0x80, 0x80, 0x80,
+   };
+
+   return si_csflt(si_shufb(*qw, *qw, shuffle), 0);
+}
+
+
+static INLINE qword
+CVT_16_SSCALED(const qword *qw)
+{
+   qword shuffle = (qword) {
+      0x00, 0x01, 0x80, 0x80, 0x02, 0x03, 0x80, 0x80,
+      0x04, 0x05, 0x80, 0x80, 0x06, 0x07, 0x80, 0x80,
+   };
+
+   return si_csflt(si_shufb(*qw, *qw, shuffle), 0);
+}
+
+
+static INLINE qword
+CVT_32_SSCALED(const qword *qw)
+{
+   return si_csflt(*qw, 0);
+}
+
+
+static INLINE qword
+CVT_8_UNORM(const qword *qw)
+{
+   const qword scale = (qword) spu_splats(1.0f / 255.0f);
+   return si_fm(CVT_8_USCALED(qw), scale);
+}
+
+
+static INLINE qword
+CVT_16_UNORM(const qword *qw)
+{
+   const qword scale = (qword) spu_splats(1.0f / 65535.0f);
+   return si_fm(CVT_16_USCALED(qw), scale);
+}
+
+
+static INLINE qword
+CVT_32_UNORM(const qword *qw)
+{
+   const qword scale = (qword) spu_splats(1.0f / 4294967295.0f);
+   return si_fm(CVT_32_USCALED(qw), scale);
+}
+
+
+static INLINE qword
+CVT_8_SNORM(const qword *qw)
+{
+   const qword scale = (qword) spu_splats(1.0f / 127.0f);
+   return si_fm(CVT_8_SSCALED(qw), scale);
+}
+
+
+static INLINE qword
+CVT_16_SNORM(const qword *qw)
+{
+   const qword scale = (qword) spu_splats(1.0f / 32767.0f);
+   return si_fm(CVT_16_SSCALED(qw), scale);
+}
+
+
+static INLINE qword
+CVT_32_SNORM(const qword *qw)
 {
-    return si_csflt(fetch_unaligned_qword(ptr), 0);
+   const qword scale = (qword) spu_splats(1.0f / 2147483647.0f);
+   return si_fm(CVT_32_SSCALED(qw), scale);
 }
 
+#define SZ_4 si_il(0U)
+#define SZ_3 si_rotqmbyi(si_il(~0), -12)
+#define SZ_2 si_rotqmbyi(si_il(~0), -8)
+#define SZ_1 si_rotqmbyi(si_il(~0), -4)
 
 /**
  * Fetch a float[4] vertex attribute from memory, doing format/type
@@ -97,117 +274,84 @@ fetch_R32G32B32A32_SSCALED(const void *ptr)
  */
 #define FETCH_ATTRIB( NAME, SZ, CVT )			\
 static qword						\
-fetch_##NAME(const void *ptr)				\
+fetch_##NAME(const qword *qw)				\
 {							\
-   vec_float4 attrib = defaults;			\
-   int i;						\
-							\
-   for (i = 0; i < SZ; i++) {				\
-      attrib = spu_insert(CVT, attrib, i);		\
-   }							\
-   return (qword) attrib;				\
+   qword expanded = CVT(qw);				\
+   return si_selb(expanded, (qword) defaults, SZ);	\
 }
 
-#define CVT_64_FLOAT   (float) ((double *) ptr)[i]
-#define CVT_32_FLOAT   ((float *) ptr)[i]
+FETCH_ATTRIB( R64G64B64A64_FLOAT,   SZ_4, CVT_64_FLOAT )
+FETCH_ATTRIB( R64G64B64_FLOAT,      SZ_3, CVT_64_FLOAT )
+FETCH_ATTRIB( R64G64_FLOAT,         SZ_2, CVT_64_FLOAT )
+FETCH_ATTRIB( R64_FLOAT,            SZ_1, CVT_64_FLOAT )
 
-#define CVT_8_USCALED  (float) ((unsigned char *) ptr)[i]
-#define CVT_16_USCALED (float) ((unsigned short *) ptr)[i]
-#define CVT_32_USCALED (float) ((unsigned int *) ptr)[i]
+FETCH_ATTRIB( R32G32B32_FLOAT,      SZ_3, CVT_32_FLOAT )
+FETCH_ATTRIB( R32G32_FLOAT,         SZ_2, CVT_32_FLOAT )
+FETCH_ATTRIB( R32_FLOAT,            SZ_1, CVT_32_FLOAT )
 
-#define CVT_8_SSCALED  (float) ((char *) ptr)[i]
-#define CVT_16_SSCALED (float) ((short *) ptr)[i]
-#define CVT_32_SSCALED (float) ((int *) ptr)[i]
+FETCH_ATTRIB( R32G32B32_USCALED,    SZ_3, CVT_32_USCALED )
+FETCH_ATTRIB( R32G32_USCALED,       SZ_2, CVT_32_USCALED )
+FETCH_ATTRIB( R32_USCALED,          SZ_1, CVT_32_USCALED )
 
-#define CVT_8_UNORM    (float) ((unsigned char *) ptr)[i] / 255.0f
-#define CVT_16_UNORM   (float) ((unsigned short *) ptr)[i] / 65535.0f
-#define CVT_32_UNORM   (float) ((unsigned int *) ptr)[i] / 4294967295.0f
+FETCH_ATTRIB( R32G32B32_SSCALED,    SZ_3, CVT_32_SSCALED )
+FETCH_ATTRIB( R32G32_SSCALED,       SZ_2, CVT_32_SSCALED )
+FETCH_ATTRIB( R32_SSCALED,          SZ_1, CVT_32_SSCALED )
 
-#define CVT_8_SNORM    (float) ((char *) ptr)[i] / 127.0f
-#define CVT_16_SNORM   (float) ((short *) ptr)[i] / 32767.0f
-#define CVT_32_SNORM   (float) ((int *) ptr)[i] / 2147483647.0f
+FETCH_ATTRIB( R32G32B32_UNORM,    SZ_3, CVT_32_UNORM )
+FETCH_ATTRIB( R32G32_UNORM,       SZ_2, CVT_32_UNORM )
+FETCH_ATTRIB( R32_UNORM,          SZ_1, CVT_32_UNORM )
 
-FETCH_ATTRIB( R64G64B64A64_FLOAT,   4, CVT_64_FLOAT )
-FETCH_ATTRIB( R64G64B64_FLOAT,      3, CVT_64_FLOAT )
-FETCH_ATTRIB( R64G64_FLOAT,         2, CVT_64_FLOAT )
-FETCH_ATTRIB( R64_FLOAT,            1, CVT_64_FLOAT )
+FETCH_ATTRIB( R32G32B32A32_SNORM, SZ_4, CVT_32_SNORM )
+FETCH_ATTRIB( R32G32B32_SNORM,    SZ_3, CVT_32_SNORM )
+FETCH_ATTRIB( R32G32_SNORM,       SZ_2, CVT_32_SNORM )
+FETCH_ATTRIB( R32_SNORM,          SZ_1, CVT_32_SNORM )
 
-FETCH_ATTRIB( R32G32B32_FLOAT,      3, CVT_32_FLOAT )
-FETCH_ATTRIB( R32G32_FLOAT,         2, CVT_32_FLOAT )
-FETCH_ATTRIB( R32_FLOAT,            1, CVT_32_FLOAT )
+FETCH_ATTRIB( R16G16B16A16_USCALED, SZ_4, CVT_16_USCALED )
+FETCH_ATTRIB( R16G16B16_USCALED,    SZ_3, CVT_16_USCALED )
+FETCH_ATTRIB( R16G16_USCALED,       SZ_2, CVT_16_USCALED )
+FETCH_ATTRIB( R16_USCALED,          SZ_1, CVT_16_USCALED )
 
-FETCH_ATTRIB( R32G32B32_USCALED,    3, CVT_32_USCALED )
-FETCH_ATTRIB( R32G32_USCALED,       2, CVT_32_USCALED )
-FETCH_ATTRIB( R32_USCALED,          1, CVT_32_USCALED )
+FETCH_ATTRIB( R16G16B16A16_SSCALED, SZ_4, CVT_16_SSCALED )
+FETCH_ATTRIB( R16G16B16_SSCALED,    SZ_3, CVT_16_SSCALED )
+FETCH_ATTRIB( R16G16_SSCALED,       SZ_2, CVT_16_SSCALED )
+FETCH_ATTRIB( R16_SSCALED,          SZ_1, CVT_16_SSCALED )
 
-FETCH_ATTRIB( R32G32B32_SSCALED,    3, CVT_32_SSCALED )
-FETCH_ATTRIB( R32G32_SSCALED,       2, CVT_32_SSCALED )
-FETCH_ATTRIB( R32_SSCALED,          1, CVT_32_SSCALED )
+FETCH_ATTRIB( R16G16B16A16_UNORM, SZ_4, CVT_16_UNORM )
+FETCH_ATTRIB( R16G16B16_UNORM,    SZ_3, CVT_16_UNORM )
+FETCH_ATTRIB( R16G16_UNORM,       SZ_2, CVT_16_UNORM )
+FETCH_ATTRIB( R16_UNORM,          SZ_1, CVT_16_UNORM )
 
-FETCH_ATTRIB( R32G32B32_UNORM,    3, CVT_32_UNORM )
-FETCH_ATTRIB( R32G32_UNORM,       2, CVT_32_UNORM )
-FETCH_ATTRIB( R32_UNORM,          1, CVT_32_UNORM )
+FETCH_ATTRIB( R16G16B16A16_SNORM, SZ_4, CVT_16_SNORM )
+FETCH_ATTRIB( R16G16B16_SNORM,    SZ_3, CVT_16_SNORM )
+FETCH_ATTRIB( R16G16_SNORM,       SZ_2, CVT_16_SNORM )
+FETCH_ATTRIB( R16_SNORM,          SZ_1, CVT_16_SNORM )
 
-FETCH_ATTRIB( R32G32B32A32_SNORM, 4, CVT_32_SNORM )
-FETCH_ATTRIB( R32G32B32_SNORM,    3, CVT_32_SNORM )
-FETCH_ATTRIB( R32G32_SNORM,       2, CVT_32_SNORM )
-FETCH_ATTRIB( R32_SNORM,          1, CVT_32_SNORM )
+FETCH_ATTRIB( R8G8B8A8_USCALED,   SZ_4, CVT_8_USCALED )
+FETCH_ATTRIB( R8G8B8_USCALED,     SZ_3, CVT_8_USCALED )
+FETCH_ATTRIB( R8G8_USCALED,       SZ_2, CVT_8_USCALED )
+FETCH_ATTRIB( R8_USCALED,         SZ_1, CVT_8_USCALED )
 
-FETCH_ATTRIB( R16G16B16A16_USCALED, 4, CVT_16_USCALED )
-FETCH_ATTRIB( R16G16B16_USCALED,    3, CVT_16_USCALED )
-FETCH_ATTRIB( R16G16_USCALED,       2, CVT_16_USCALED )
-FETCH_ATTRIB( R16_USCALED,          1, CVT_16_USCALED )
+FETCH_ATTRIB( R8G8B8A8_SSCALED,  SZ_4, CVT_8_SSCALED )
+FETCH_ATTRIB( R8G8B8_SSCALED,    SZ_3, CVT_8_SSCALED )
+FETCH_ATTRIB( R8G8_SSCALED,      SZ_2, CVT_8_SSCALED )
+FETCH_ATTRIB( R8_SSCALED,        SZ_1, CVT_8_SSCALED )
 
-FETCH_ATTRIB( R16G16B16A16_SSCALED, 4, CVT_16_SSCALED )
-FETCH_ATTRIB( R16G16B16_SSCALED,    3, CVT_16_SSCALED )
-FETCH_ATTRIB( R16G16_SSCALED,       2, CVT_16_SSCALED )
-FETCH_ATTRIB( R16_SSCALED,          1, CVT_16_SSCALED )
+FETCH_ATTRIB( R8G8B8A8_UNORM,  SZ_4, CVT_8_UNORM )
+FETCH_ATTRIB( R8G8B8_UNORM,    SZ_3, CVT_8_UNORM )
+FETCH_ATTRIB( R8G8_UNORM,      SZ_2, CVT_8_UNORM )
+FETCH_ATTRIB( R8_UNORM,        SZ_1, CVT_8_UNORM )
 
-FETCH_ATTRIB( R16G16B16A16_UNORM, 4, CVT_16_UNORM )
-FETCH_ATTRIB( R16G16B16_UNORM,    3, CVT_16_UNORM )
-FETCH_ATTRIB( R16G16_UNORM,       2, CVT_16_UNORM )
-FETCH_ATTRIB( R16_UNORM,          1, CVT_16_UNORM )
+FETCH_ATTRIB( R8G8B8A8_SNORM,  SZ_4, CVT_8_SNORM )
+FETCH_ATTRIB( R8G8B8_SNORM,    SZ_3, CVT_8_SNORM )
+FETCH_ATTRIB( R8G8_SNORM,      SZ_2, CVT_8_SNORM )
+FETCH_ATTRIB( R8_SNORM,        SZ_1, CVT_8_SNORM )
 
-FETCH_ATTRIB( R16G16B16A16_SNORM, 4, CVT_16_SNORM )
-FETCH_ATTRIB( R16G16B16_SNORM,    3, CVT_16_SNORM )
-FETCH_ATTRIB( R16G16_SNORM,       2, CVT_16_SNORM )
-FETCH_ATTRIB( R16_SNORM,          1, CVT_16_SNORM )
-
-FETCH_ATTRIB( R8G8B8A8_USCALED,   4, CVT_8_USCALED )
-FETCH_ATTRIB( R8G8B8_USCALED,     3, CVT_8_USCALED )
-FETCH_ATTRIB( R8G8_USCALED,       2, CVT_8_USCALED )
-FETCH_ATTRIB( R8_USCALED,         1, CVT_8_USCALED )
-
-FETCH_ATTRIB( R8G8B8A8_SSCALED,  4, CVT_8_SSCALED )
-FETCH_ATTRIB( R8G8B8_SSCALED,    3, CVT_8_SSCALED )
-FETCH_ATTRIB( R8G8_SSCALED,      2, CVT_8_SSCALED )
-FETCH_ATTRIB( R8_SSCALED,        1, CVT_8_SSCALED )
-
-FETCH_ATTRIB( R8G8B8A8_UNORM,  4, CVT_8_UNORM )
-FETCH_ATTRIB( R8G8B8_UNORM,    3, CVT_8_UNORM )
-FETCH_ATTRIB( R8G8_UNORM,      2, CVT_8_UNORM )
-FETCH_ATTRIB( R8_UNORM,        1, CVT_8_UNORM )
-
-FETCH_ATTRIB( R8G8B8A8_SNORM,  4, CVT_8_SNORM )
-FETCH_ATTRIB( R8G8B8_SNORM,    3, CVT_8_SNORM )
-FETCH_ATTRIB( R8G8_SNORM,      2, CVT_8_SNORM )
-FETCH_ATTRIB( R8_SNORM,        1, CVT_8_SNORM )
-
-FETCH_ATTRIB( A8R8G8B8_UNORM,       4, CVT_8_UNORM )
-//FETCH_ATTRIB( R8G8B8A8_UNORM,       4, CVT_8_UNORM )
+FETCH_ATTRIB( A8R8G8B8_UNORM,       SZ_4, CVT_8_UNORM )
 
 static spu_fetch_func get_fetch_func( enum pipe_format format )
 {
-#if 0
-   {
-      char tmp[80];
-      pf_sprint_name(tmp, format);
-      _mesa_printf("%s: %s\n", __FUNCTION__, tmp);
-   }
-#endif
-
    switch (format) {
    case PIPE_FORMAT_R64_FLOAT:
       return fetch_R64_FLOAT;
@@ -348,6 +492,96 @@ static spu_fetch_func get_fetch_func( enum pipe_format format )
 }
 
 
+static unsigned get_vertex_size( enum pipe_format format )
+{
+   switch (format) {
+   case PIPE_FORMAT_R64_FLOAT:
+      return 8;
+   case PIPE_FORMAT_R64G64_FLOAT:
+      return 2 * 8;
+   case PIPE_FORMAT_R64G64B64_FLOAT:
+      return 3 * 8;
+   case PIPE_FORMAT_R64G64B64A64_FLOAT:
+      return 4 * 8;
+
+   case PIPE_FORMAT_R32_SSCALED:
+   case PIPE_FORMAT_R32_SNORM:
+   case PIPE_FORMAT_R32_USCALED:
+   case PIPE_FORMAT_R32_UNORM:
+   case PIPE_FORMAT_R32_FLOAT:
+      return 4;
+   case PIPE_FORMAT_R32G32_SSCALED:
+   case PIPE_FORMAT_R32G32_SNORM:
+   case PIPE_FORMAT_R32G32_USCALED:
+   case PIPE_FORMAT_R32G32_UNORM:
+   case PIPE_FORMAT_R32G32_FLOAT:
+      return 2 * 4;
+   case PIPE_FORMAT_R32G32B32_SSCALED:
+   case PIPE_FORMAT_R32G32B32_SNORM:
+   case PIPE_FORMAT_R32G32B32_USCALED:
+   case PIPE_FORMAT_R32G32B32_UNORM:
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+      return 3 * 4;
+   case PIPE_FORMAT_R32G32B32A32_SSCALED:
+   case PIPE_FORMAT_R32G32B32A32_SNORM:
+   case PIPE_FORMAT_R32G32B32A32_USCALED:
+   case PIPE_FORMAT_R32G32B32A32_UNORM:
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+      return 4 * 4;
+
+   case PIPE_FORMAT_R16_SSCALED:
+   case PIPE_FORMAT_R16_SNORM:
+   case PIPE_FORMAT_R16_UNORM:
+   case PIPE_FORMAT_R16_USCALED:
+      return 2;
+   case PIPE_FORMAT_R16G16_SSCALED:
+   case PIPE_FORMAT_R16G16_SNORM:
+   case PIPE_FORMAT_R16G16_USCALED:
+   case PIPE_FORMAT_R16G16_UNORM:
+      return 2 * 2;
+   case PIPE_FORMAT_R16G16B16_SSCALED:
+   case PIPE_FORMAT_R16G16B16_SNORM:
+   case PIPE_FORMAT_R16G16B16_USCALED:
+   case PIPE_FORMAT_R16G16B16_UNORM:
+      return 3 * 2;
+   case PIPE_FORMAT_R16G16B16A16_SSCALED:
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+   case PIPE_FORMAT_R16G16B16A16_USCALED:
+   case PIPE_FORMAT_R16G16B16A16_UNORM:
+      return 4 * 2;
+
+   case PIPE_FORMAT_R8_SSCALED:
+   case PIPE_FORMAT_R8_SNORM:
+   case PIPE_FORMAT_R8_USCALED:
+   case PIPE_FORMAT_R8_UNORM:
+      return 1;
+   case PIPE_FORMAT_R8G8_SSCALED:
+   case PIPE_FORMAT_R8G8_SNORM:
+   case PIPE_FORMAT_R8G8_USCALED:
+   case PIPE_FORMAT_R8G8_UNORM:
+      return 2 * 1;
+   case PIPE_FORMAT_R8G8B8_SSCALED:
+   case PIPE_FORMAT_R8G8B8_SNORM:
+   case PIPE_FORMAT_R8G8B8_USCALED:
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      return 3 * 1;
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+   case PIPE_FORMAT_R8G8B8A8_SSCALED:
+   case PIPE_FORMAT_R8G8B8A8_SNORM:
+   case PIPE_FORMAT_R8G8B8A8_USCALED:
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      return 4 * 1;
+
+   case 0:
+      return 0;		/* not sure why this is needed */
+
+   default:
+      assert(0);
+      return 0;
+   }
+}
+
+
 /**
  * Fetch vertex attributes for 'count' vertices.
  */
@@ -361,8 +595,6 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
 
    assert(count <= 4);
 
-   wait_on_mask(1 << TAG_VERTEX_BUFFER);
-
 #if DRAW_DBG
    printf("SPU: %s count = %u, nr_attrs = %u\n",
           __FUNCTION__, count, nr_attrs);
@@ -375,33 +607,40 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
       const uint64_t src = draw->vertex_fetch.src_ptr[attr];
       const spu_fetch_func fetch = draw->vertex_fetch.fetch[attr];
       unsigned i;
+      unsigned idx;
+      const unsigned bytes_per_entry = draw->vertex_fetch.size[attr];
+      const unsigned quads_per_entry = (bytes_per_entry + 15) / 16;
       qword p[4];
+      qword in[2 * 4];
 
       /* Fetch four attributes for four vertices.
-       *
-       * Could fetch directly into AOS format, but this is meant to be
-       * a prototype for an sse implementation, which would have
-       * difficulties doing that.
        */
+      idx = 0;
       for (i = 0; i < count; i++) {
-         uint8_t buffer[32] ALIGN16_ATTRIB;
         const uint64_t addr = src + (elts[i] * pitch);
-         const unsigned size = ((addr & 0x0f) == 0) ? 16 : 32;
 
 #if DRAW_DBG
         printf("SPU: fetching = 0x%llx\n", addr);
 #endif
 
-         mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0);
-         wait_on_mask(1 << TAG_VERTEX_BUFFER);
-         p[i] = (*fetch)(buffer + (addr & 0x0f));
+         fetch_unaligned(& in[idx], addr, bytes_per_entry);
+         idx += quads_per_entry;
      }
 
-      /* Be nice and zero out any missing vertices:
+      /* Be nice and zero out any missing vertices.
+       */
+      (void) memset(& in[idx], 0, (8 - idx) * sizeof(qword));
+
+
+      /* Convert all 4 vertices to vectors of float.
        */
-      for (/* empty */; i < 4; i++)
-          p[i] = si_xor(p[i], p[i]);
+      idx = 0;
+      for (i = 0; i < 4; i++) {
+         p[i] = (*fetch)(in + idx);
+         idx += quads_per_entry;
+      }
+
 
       /* Transpose/swizzle into vector-friendly format.  Currently
        * assuming that all vertex shader inputs are float[4], but this
@@ -422,9 +661,18 @@ void spu_update_vertex_fetch( struct spu_vs_context *draw )
 
    unsigned i;
 
+   /* Invalidate the vertex cache.
+    */
+   for (i = 0; i < (CACHE_NWAY * CACHE_NSETS); i++) {
+      CACHELINE_CLEARVALID(i);
+   }
+
+
    for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) {
       draw->vertex_fetch.fetch[i] =
          get_fetch_func(draw->vertex_fetch.format[i]);
+      draw->vertex_fetch.size[i] =
+          get_vertex_size(draw->vertex_fetch.format[i]);
    }
 
    draw->vertex_fetch.fetch_func = generic_vertex_fetch;
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.h b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
index c96b93ff0a..ea044e841d 100644
--- a/src/mesa/pipe/cell/spu/spu_vertex_shader.h
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
@@ -6,7 +6,7 @@
 
 struct spu_vs_context;
 
-typedef qword (*spu_fetch_func)(const void *ptr);
+typedef qword (*spu_fetch_func)(const qword *qw);
 typedef void (*spu_full_fetch_func)( struct spu_vs_context *draw,
                                      struct spu_exec_machine *machine,
                                      const unsigned *elts,
@@ -18,6 +18,7 @@ struct spu_vs_context {
    struct {
       uint64_t src_ptr[PIPE_ATTRIB_MAX];
       unsigned pitch[PIPE_ATTRIB_MAX];
+      unsigned size[PIPE_ATTRIB_MAX];
       enum pipe_format format[PIPE_ATTRIB_MAX];
       unsigned nr_attrs;
       boolean dirty;
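
The other piece worth a sketch is the rewritten FETCH_ATTRIB macro. Instead of a scalar spu_insert() loop over components, each CVT_* helper converts all four lanes at once and si_selb() fills the lanes beyond the attribute's width from the defaults vector { 0, 0, 0, 1 }: the SZ_n masks (an all-ones register shifted right by 4*n bytes via si_rotqmbyi) are all-ones exactly in the lanes to be replaced. Below is a scalar model of that select in plain C; fetch_select is an invented name for illustration, not from the patch.

/* Sketch: scalar equivalent of si_selb(expanded, defaults, SZ_n) in the
 * rewritten FETCH_ATTRIB macro.  Lanes past the attribute's width are
 * filled from the default vector {0, 0, 0, 1}. */
#include <stdio.h>

static void fetch_select(const float converted[4], unsigned num_components,
                         float out[4])
{
   static const float defaults[4] = { 0.0f, 0.0f, 0.0f, 1.0f };
   unsigned i;

   /* si_selb takes bits from its second operand wherever the mask is 1;
    * SZ_n is all-ones in lanes >= n, so those lanes come from defaults. */
   for (i = 0; i < 4; i++)
      out[i] = (i < num_components) ? converted[i] : defaults[i];
}

int main(void)
{
   /* e.g. an R32G32_FLOAT attribute: lanes 2 and 3 hold junk after CVT */
   const float two_floats[4] = { 0.5f, 0.25f, 99.0f, 99.0f };
   float out[4];

   fetch_select(two_floats, 2, out);   /* emulates the SZ_2 mask */
   printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);   /* 0.5 0.25 0 1 */
   return 0;
}

Compared with the old per-component loop, the select form is branch-free: after conversion the whole fetch is two vector operations, which is the point of the rewrite.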
