summaryrefslogtreecommitdiff
path: root/src/mesa/drivers/dri/i965/brw_wm_glsl.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/mesa/drivers/dri/i965/brw_wm_glsl.c')
-rw-r--r--src/mesa/drivers/dri/i965/brw_wm_glsl.c2096
1 files changed, 2096 insertions, 0 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_wm_glsl.c b/src/mesa/drivers/dri/i965/brw_wm_glsl.c
new file mode 100644
index 0000000000..cb728190f5
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_wm_glsl.c
@@ -0,0 +1,2096 @@
+#include "main/macros.h"
+#include "shader/prog_parameter.h"
+#include "brw_context.h"
+#include "brw_eu.h"
+#include "brw_wm.h"
+
+enum _subroutine {
+ SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
+};
+
+/* Only guess, need a flag in gl_fragment_program later */
+GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
+{
+ int i;
+ for (i = 0; i < fp->Base.NumInstructions; i++) {
+ struct prog_instruction *inst = &fp->Base.Instructions[i];
+ switch (inst->Opcode) {
+ case OPCODE_IF:
+ case OPCODE_TRUNC:
+ case OPCODE_ENDIF:
+ case OPCODE_CAL:
+ case OPCODE_BRK:
+ case OPCODE_RET:
+ case OPCODE_DDX:
+ case OPCODE_DDY:
+ case OPCODE_NOISE1:
+ case OPCODE_NOISE2:
+ case OPCODE_NOISE3:
+ case OPCODE_NOISE4:
+ case OPCODE_BGNLOOP:
+ return GL_TRUE;
+ default:
+ break;
+ }
+ }
+ return GL_FALSE;
+}
+
+static void set_reg(struct brw_wm_compile *c, int file, int index,
+ int component, struct brw_reg reg)
+{
+ c->wm_regs[file][index][component].reg = reg;
+ c->wm_regs[file][index][component].inited = GL_TRUE;
+}
+
+static int get_scalar_dst_index(struct prog_instruction *inst)
+{
+ int i;
+ for (i = 0; i < 4; i++)
+ if (inst->DstReg.WriteMask & (1<<i))
+ break;
+ return i;
+}
+
+static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
+{
+ struct brw_reg reg;
+ if(c->tmp_index == c->tmp_max)
+ c->tmp_regs[ c->tmp_max++ ] = c->reg_index++;
+
+ reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
+ return reg;
+}
+
+static int mark_tmps(struct brw_wm_compile *c)
+{
+ return c->tmp_index;
+}
+
+static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index )
+{
+ return brw_vec8_grf( c->tmp_regs[ index ], 0 );
+}
+
+static void release_tmps(struct brw_wm_compile *c, int mark)
+{
+ c->tmp_index = mark;
+}
+
+static struct brw_reg
+get_reg(struct brw_wm_compile *c, int file, int index, int component, int nr, GLuint neg, GLuint abs)
+{
+ struct brw_reg reg;
+ switch (file) {
+ case PROGRAM_STATE_VAR:
+ case PROGRAM_CONSTANT:
+ case PROGRAM_UNIFORM:
+ file = PROGRAM_STATE_VAR;
+ break;
+ case PROGRAM_UNDEFINED:
+ return brw_null_reg();
+ default:
+ break;
+ }
+
+ if(c->wm_regs[file][index][component].inited)
+ reg = c->wm_regs[file][index][component].reg;
+ else
+ reg = brw_vec8_grf(c->reg_index, 0);
+
+ if(!c->wm_regs[file][index][component].inited) {
+ set_reg(c, file, index, component, reg);
+ c->reg_index++;
+ }
+
+ if (neg & (1<< component)) {
+ reg = negate(reg);
+ }
+ if (abs)
+ reg = brw_abs(reg);
+ return reg;
+}
+
+static void prealloc_reg(struct brw_wm_compile *c)
+{
+ int i, j;
+ struct brw_reg reg;
+ int nr_interp_regs = 0;
+ GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
+
+ for (i = 0; i < 4; i++) {
+ reg = (i < c->key.nr_depth_regs)
+ ? brw_vec8_grf(i*2, 0) : brw_vec8_grf(0, 0);
+ set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
+ }
+ c->reg_index += 2*c->key.nr_depth_regs;
+ {
+ int nr_params = c->fp->program.Base.Parameters->NumParameters;
+ struct gl_program_parameter_list *plist =
+ c->fp->program.Base.Parameters;
+ int index = 0;
+ c->prog_data.nr_params = 4*nr_params;
+ for (i = 0; i < nr_params; i++) {
+ for (j = 0; j < 4; j++, index++) {
+ reg = brw_vec1_grf(c->reg_index + index/8,
+ index%8);
+ c->prog_data.param[index] =
+ &plist->ParameterValues[i][j];
+ set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
+ }
+ }
+ c->nr_creg = 2*((4*nr_params+15)/16);
+ c->reg_index += c->nr_creg;
+ }
+ for (i = 0; i < FRAG_ATTRIB_MAX; i++) {
+ if (inputs & (1<<i)) {
+ nr_interp_regs++;
+ reg = brw_vec8_grf(c->reg_index, 0);
+ for (j = 0; j < 4; j++)
+ set_reg(c, PROGRAM_PAYLOAD, i, j, reg);
+ c->reg_index += 2;
+
+ }
+ }
+ c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
+ c->prog_data.urb_read_length = nr_interp_regs * 2;
+ c->prog_data.curb_read_length = c->nr_creg;
+ c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
+ c->reg_index++;
+ c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
+ c->reg_index += 2;
+}
+
+static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
+ struct prog_instruction *inst, int component, int nr)
+{
+ return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
+ 0, 0);
+}
+
+static struct brw_reg get_src_reg(struct brw_wm_compile *c,
+ struct prog_src_register *src, int index, int nr)
+{
+ int component = GET_SWZ(src->Swizzle, index);
+ return get_reg(c, src->File, src->Index, component, nr,
+ src->NegateBase, src->Abs);
+}
+
+/* Subroutines are minimal support for resusable instruction sequences.
+ They are implemented as simply as possible to minimise overhead: there
+ is no explicit support for communication between the caller and callee
+ other than saving the return address in a temporary register, nor is
+ there any automatic local storage. This implies that great care is
+ required before attempting reentrancy or any kind of nested
+ subroutine invocations. */
+static void invoke_subroutine( struct brw_wm_compile *c,
+ enum _subroutine subroutine,
+ void (*emit)( struct brw_wm_compile * ) )
+{
+ struct brw_compile *p = &c->func;
+
+ assert( subroutine < BRW_WM_MAX_SUBROUTINE );
+
+ if( c->subroutines[ subroutine ] ) {
+ /* subroutine previously emitted: reuse existing instructions */
+
+ int mark = mark_tmps( c );
+ struct brw_reg return_address = retype( alloc_tmp( c ),
+ BRW_REGISTER_TYPE_UD );
+ int here = p->nr_insn;
+
+ brw_push_insn_state(p);
+ brw_set_mask_control(p, BRW_MASK_DISABLE);
+ brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) );
+
+ brw_ADD( p, brw_ip_reg(), brw_ip_reg(),
+ brw_imm_d( ( c->subroutines[ subroutine ] -
+ here - 1 ) << 4 ) );
+ brw_pop_insn_state(p);
+
+ release_tmps( c, mark );
+ } else {
+ /* previously unused subroutine: emit, and mark for later reuse */
+
+ int mark = mark_tmps( c );
+ struct brw_reg return_address = retype( alloc_tmp( c ),
+ BRW_REGISTER_TYPE_UD );
+ struct brw_instruction *calc;
+ int base = p->nr_insn;
+
+ brw_push_insn_state(p);
+ brw_set_mask_control(p, BRW_MASK_DISABLE);
+ calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) );
+ brw_pop_insn_state(p);
+
+ c->subroutines[ subroutine ] = p->nr_insn;
+
+ emit( c );
+
+ brw_push_insn_state(p);
+ brw_set_mask_control(p, BRW_MASK_DISABLE);
+ brw_MOV( p, brw_ip_reg(), return_address );
+ brw_pop_insn_state(p);
+
+ brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) );
+
+ release_tmps( c, mark );
+ }
+}
+
+static void emit_abs( struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ int i;
+ struct brw_compile *p = &c->func;
+ brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
+ for (i = 0; i < 4; i++) {
+ if (inst->DstReg.WriteMask & (1<<i)) {
+ struct brw_reg src, dst;
+ dst = get_dst_reg(c, inst, i, 1);
+ src = get_src_reg(c, &inst->SrcReg[0], i, 1);
+ brw_MOV(p, dst, brw_abs(src));
+ }
+ }
+ brw_set_saturate(p, 0);
+}
+
+static void emit_trunc( struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ int i;
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+ brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
+ for (i = 0; i < 4; i++) {
+ if (mask & (1<<i)) {
+ struct brw_reg src, dst;
+ dst = get_dst_reg(c, inst, i, 1) ;
+ src = get_src_reg(c, &inst->SrcReg[0], i, 1);
+ brw_RNDD(p, dst, src);
+ }
+ }
+ brw_set_saturate(p, 0);
+}
+
+static void emit_mov( struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ int i;
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+ brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
+ for (i = 0; i < 4; i++) {
+ if (mask & (1<<i)) {
+ struct brw_reg src, dst;
+ dst = get_dst_reg(c, inst, i, 1);
+ src = get_src_reg(c, &inst->SrcReg[0], i, 1);
+ brw_MOV(p, dst, src);
+ }
+ }
+ brw_set_saturate(p, 0);
+}
+
+static void emit_pixel_xy(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ struct brw_reg r1 = brw_vec1_grf(1, 0);
+ struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
+
+ struct brw_reg dst0, dst1;
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+
+ dst0 = get_dst_reg(c, inst, 0, 1);
+ dst1 = get_dst_reg(c, inst, 1, 1);
+ /* Calculate pixel centers by adding 1 or 0 to each of the
+ * micro-tile coordinates passed in r1.
+ */
+ if (mask & WRITEMASK_X) {
+ brw_ADD(p,
+ vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
+ stride(suboffset(r1_uw, 4), 2, 4, 0),
+ brw_imm_v(0x10101010));
+ }
+
+ if (mask & WRITEMASK_Y) {
+ brw_ADD(p,
+ vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
+ stride(suboffset(r1_uw, 5), 2, 4, 0),
+ brw_imm_v(0x11001100));
+ }
+
+}
+
+static void emit_delta_xy(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ struct brw_reg r1 = brw_vec1_grf(1, 0);
+ struct brw_reg dst0, dst1, src0, src1;
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+
+ dst0 = get_dst_reg(c, inst, 0, 1);
+ dst1 = get_dst_reg(c, inst, 1, 1);
+ src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
+ src1 = get_src_reg(c, &inst->SrcReg[0], 1, 1);
+ /* Calc delta X,Y by subtracting origin in r1 from the pixel
+ * centers.
+ */
+ if (mask & WRITEMASK_X) {
+ brw_ADD(p,
+ dst0,
+ retype(src0, BRW_REGISTER_TYPE_UW),
+ negate(r1));
+ }
+
+ if (mask & WRITEMASK_Y) {
+ brw_ADD(p,
+ dst1,
+ retype(src1, BRW_REGISTER_TYPE_UW),
+ negate(suboffset(r1,1)));
+
+ }
+
+}
+
+
+static void fire_fb_write( struct brw_wm_compile *c,
+ GLuint base_reg,
+ GLuint nr,
+ GLuint target,
+ GLuint eot)
+{
+ struct brw_compile *p = &c->func;
+ /* Pass through control information:
+ */
+ /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */
+ {
+ brw_push_insn_state(p);
+ brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
+ brw_MOV(p,
+ brw_message_reg(base_reg + 1),
+ brw_vec8_grf(1, 0));
+ brw_pop_insn_state(p);
+ }
+ /* Send framebuffer write message: */
+ brw_fb_WRITE(p,
+ retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
+ base_reg,
+ retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
+ target,
+ nr,
+ 0,
+ eot);
+}
+
+static void emit_fb_write(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ int nr = 2;
+ int channel;
+ GLuint target, eot;
+ struct brw_reg src0;
+
+ /* Reserve a space for AA - may not be needed:
+ */
+ if (c->key.aa_dest_stencil_reg)
+ nr += 1;
+ {
+ brw_push_insn_state(p);
+ for (channel = 0; channel < 4; channel++) {
+ src0 = get_src_reg(c, &inst->SrcReg[0], channel, 1);
+ /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */
+ /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */
+ brw_MOV(p, brw_message_reg(nr + channel), src0);
+ }
+ /* skip over the regs populated above: */
+ nr += 8;
+ brw_pop_insn_state(p);
+ }
+
+ if (c->key.source_depth_to_render_target)
+ {
+ if (c->key.computes_depth) {
+ src0 = get_src_reg(c, &inst->SrcReg[2], 2, 1);
+ brw_MOV(p, brw_message_reg(nr), src0);
+ } else {
+ src0 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
+ brw_MOV(p, brw_message_reg(nr), src0);
+ }
+
+ nr += 2;
+ }
+ target = inst->Sampler >> 1;
+ eot = inst->Sampler & 1;
+ fire_fb_write(c, 0, nr, target, eot);
+}
+
+static void emit_pixel_w( struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+ if (mask & WRITEMASK_W) {
+ struct brw_reg dst, src0, delta0, delta1;
+ struct brw_reg interp3;
+
+ dst = get_dst_reg(c, inst, 3, 1);
+ src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
+ delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
+ delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
+
+ interp3 = brw_vec1_grf(src0.nr+1, 4);
+ /* Calc 1/w - just linterp wpos[3] optimized by putting the
+ * result straight into a message reg.
+ */
+ brw_LINE(p, brw_null_reg(), interp3, delta0);
+ brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1);
+
+ /* Calc w */
+ brw_math_16( p, dst,
+ BRW_MATH_FUNCTION_INV,
+ BRW_MATH_SATURATE_NONE,
+ 2, brw_null_reg(),
+ BRW_MATH_PRECISION_FULL);
+ }
+}
+
+static void emit_linterp(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+ struct brw_reg interp[4];
+ struct brw_reg dst, delta0, delta1;
+ struct brw_reg src0;
+
+ src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
+ delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
+ delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
+ GLuint nr = src0.nr;
+ int i;
+
+ interp[0] = brw_vec1_grf(nr, 0);
+ interp[1] = brw_vec1_grf(nr, 4);
+ interp[2] = brw_vec1_grf(nr+1, 0);
+ interp[3] = brw_vec1_grf(nr+1, 4);
+
+ for(i = 0; i < 4; i++ ) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i, 1);
+ brw_LINE(p, brw_null_reg(), interp[i], delta0);
+ brw_MAC(p, dst, suboffset(interp[i],1), delta1);
+ }
+ }
+}
+
+static void emit_cinterp(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+
+ struct brw_reg interp[4];
+ struct brw_reg dst, src0;
+
+ src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
+ GLuint nr = src0.nr;
+ int i;
+
+ interp[0] = brw_vec1_grf(nr, 0);
+ interp[1] = brw_vec1_grf(nr, 4);
+ interp[2] = brw_vec1_grf(nr+1, 0);
+ interp[3] = brw_vec1_grf(nr+1, 4);
+
+ for(i = 0; i < 4; i++ ) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i, 1);
+ brw_MOV(p, dst, suboffset(interp[i],3));
+ }
+ }
+}
+
+static void emit_pinterp(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+
+ struct brw_reg interp[4];
+ struct brw_reg dst, delta0, delta1;
+ struct brw_reg src0, w;
+
+ src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
+ delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
+ delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
+ w = get_src_reg(c, &inst->SrcReg[2], 3, 1);
+ GLuint nr = src0.nr;
+ int i;
+
+ interp[0] = brw_vec1_grf(nr, 0);
+ interp[1] = brw_vec1_grf(nr, 4);
+ interp[2] = brw_vec1_grf(nr+1, 0);
+ interp[3] = brw_vec1_grf(nr+1, 4);
+
+ for(i = 0; i < 4; i++ ) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i, 1);
+ brw_LINE(p, brw_null_reg(), interp[i], delta0);
+ brw_MAC(p, dst, suboffset(interp[i],1),
+ delta1);
+ brw_MUL(p, dst, dst, w);
+ }
+ }
+}
+
+static void emit_xpd(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ int i;
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+ for (i = 0; i < 4; i++) {
+ GLuint i2 = (i+2)%3;
+ GLuint i1 = (i+1)%3;
+ if (mask & (1<<i)) {
+ struct brw_reg src0, src1, dst;
+ dst = get_dst_reg(c, inst, i, 1);
+ src0 = negate(get_src_reg(c, &inst->SrcReg[0], i2, 1));
+ src1 = get_src_reg(c, &inst->SrcReg[1], i1, 1);
+ brw_MUL(p, brw_null_reg(), src0, src1);
+ src0 = get_src_reg(c, &inst->SrcReg[0], i1, 1);
+ src1 = get_src_reg(c, &inst->SrcReg[1], i2, 1);
+ brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
+ brw_MAC(p, dst, src0, src1);
+ brw_set_saturate(p, 0);
+ }
+ }
+ brw_set_saturate(p, 0);
+}
+
+static void emit_dp3(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ struct brw_reg src0[3], src1[3], dst;
+ int i;
+ struct brw_compile *p = &c->func;
+ for (i = 0; i < 3; i++) {
+ src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
+ src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1);
+ }
+
+ dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
+ brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
+ brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
+ brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+ brw_MAC(p, dst, src0[2], src1[2]);
+ brw_set_saturate(p, 0);
+}
+
+static void emit_dp4(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ struct brw_reg src0[4], src1[4], dst;
+ int i;
+ struct brw_compile *p = &c->func;
+ for (i = 0; i < 4; i++) {
+ src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
+ src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1);
+ }
+ dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
+ brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
+ brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
+ brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
+ brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+ brw_MAC(p, dst, src0[3], src1[3]);
+ brw_set_saturate(p, 0);
+}
+
+static void emit_dph(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ struct brw_reg src0[4], src1[4], dst;
+ int i;
+ struct brw_compile *p = &c->func;
+ for (i = 0; i < 4; i++) {
+ src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
+ src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1);
+ }
+ dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
+ brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
+ brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
+ brw_MAC(p, dst, src0[2], src1[2]);
+ brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+ brw_ADD(p, dst, src0[3], src1[3]);
+ brw_set_saturate(p, 0);
+}
+
+static void emit_math1(struct brw_wm_compile *c,
+ struct prog_instruction *inst, GLuint func)
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg src0, dst;
+
+ src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
+ dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
+ brw_MOV(p, brw_message_reg(2), src0);
+ brw_math(p,
+ dst,
+ func,
+ (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
+ 2,
+ brw_null_reg(),
+ BRW_MATH_DATA_VECTOR,
+ BRW_MATH_PRECISION_FULL);
+}
+
+static void emit_rcp(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
+}
+
+static void emit_rsq(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
+}
+
+static void emit_sin(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
+}
+
+static void emit_cos(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
+}
+
+static void emit_ex2(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
+}
+
+static void emit_lg2(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
+}
+
+static void emit_add(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg src0, src1, dst;
+ GLuint mask = inst->DstReg.WriteMask;
+ int i;
+ brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+ for (i = 0 ; i < 4; i++) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i, 1);
+ src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
+ src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
+ brw_ADD(p, dst, src0, src1);
+ }
+ }
+ brw_set_saturate(p, 0);
+}
+
+static void emit_sub(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg src0, src1, dst;
+ GLuint mask = inst->DstReg.WriteMask;
+ int i;
+ brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+ for (i = 0 ; i < 4; i++) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i, 1);
+ src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
+ src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
+ brw_ADD(p, dst, src0, negate(src1));
+ }
+ }
+ brw_set_saturate(p, 0);
+}
+
+static void emit_mul(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg src0, src1, dst;
+ GLuint mask = inst->DstReg.WriteMask;
+ int i;
+ brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+ for (i = 0 ; i < 4; i++) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i, 1);
+ src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
+ src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
+ brw_MUL(p, dst, src0, src1);
+ }
+ }
+ brw_set_saturate(p, 0);
+}
+
+static void emit_frc(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg src0, dst;
+ GLuint mask = inst->DstReg.WriteMask;
+ int i;
+ brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+ for (i = 0 ; i < 4; i++) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i, 1);
+ src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
+ brw_FRC(p, dst, src0);
+ }
+ }
+ if (inst->SaturateMode != SATURATE_OFF)
+ brw_set_saturate(p, 0);
+}
+
+static void emit_flr(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg src0, dst;
+ GLuint mask = inst->DstReg.WriteMask;
+ int i;
+ brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+ for (i = 0 ; i < 4; i++) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i, 1);
+ src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
+ brw_RNDD(p, dst, src0);
+ }
+ }
+ brw_set_saturate(p, 0);
+}
+
+static void emit_max(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+ struct brw_reg src0, src1, dst;
+ int i;
+ brw_push_insn_state(p);
+ for (i = 0; i < 4; i++) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i, 1);
+ src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
+ src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
+ brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+ brw_MOV(p, dst, src0);
+ brw_set_saturate(p, 0);
+
+ brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src0, src1);
+ brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+ brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
+ brw_MOV(p, dst, src1);
+ brw_set_saturate(p, 0);
+ brw_set_predicate_control_flag_value(p, 0xff);
+ }
+ }
+ brw_pop_insn_state(p);
+}
+
+static void emit_min(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+ struct brw_reg src0, src1, dst;
+ int i;
+ brw_push_insn_state(p);
+ for (i = 0; i < 4; i++) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i, 1);
+ src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
+ src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
+ brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+ brw_MOV(p, dst, src0);
+ brw_set_saturate(p, 0);
+
+ brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
+ brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+ brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
+ brw_MOV(p, dst, src1);
+ brw_set_saturate(p, 0);
+ brw_set_predicate_control_flag_value(p, 0xff);
+ }
+ }
+ brw_pop_insn_state(p);
+}
+
+static void emit_pow(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg dst, src0, src1;
+ dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
+ src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
+ src1 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
+
+ brw_MOV(p, brw_message_reg(2), src0);
+ brw_MOV(p, brw_message_reg(3), src1);
+
+ brw_math(p,
+ dst,
+ BRW_MATH_FUNCTION_POW,
+ (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
+ 2,
+ brw_null_reg(),
+ BRW_MATH_DATA_VECTOR,
+ BRW_MATH_PRECISION_FULL);
+}
+
+static void emit_lrp(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+ struct brw_reg dst, tmp1, tmp2, src0, src1, src2;
+ int i;
+ int mark = mark_tmps(c);
+ for (i = 0; i < 4; i++) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i, 1);
+ src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
+
+ src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
+
+ if (src1.nr == dst.nr) {
+ tmp1 = alloc_tmp(c);
+ brw_MOV(p, tmp1, src1);
+ } else
+ tmp1 = src1;
+
+ src2 = get_src_reg(c, &inst->SrcReg[2], i, 1);
+ if (src2.nr == dst.nr) {
+ tmp2 = alloc_tmp(c);
+ brw_MOV(p, tmp2, src2);
+ } else
+ tmp2 = src2;
+
+ brw_ADD(p, dst, negate(src0), brw_imm_f(1.0));
+ brw_MUL(p, brw_null_reg(), dst, tmp2);
+ brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+ brw_MAC(p, dst, src0, tmp1);
+ brw_set_saturate(p, 0);
+ }
+ release_tmps(c, mark);
+ }
+}
+
+static void emit_kil(struct brw_wm_compile *c)
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
+ brw_push_insn_state(p);
+ brw_set_mask_control(p, BRW_MASK_DISABLE);
+ brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
+ brw_AND(p, depth, c->emit_mask_reg, depth);
+ brw_pop_insn_state(p);
+}
+
+static void emit_mad(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+ struct brw_reg dst, src0, src1, src2;
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i, 1);
+ src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
+ src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
+ src2 = get_src_reg(c, &inst->SrcReg[2], i, 1);
+ brw_MUL(p, dst, src0, src1);
+
+ brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
+ brw_ADD(p, dst, dst, src2);
+ brw_set_saturate(p, 0);
+ }
+ }
+}
+
+static void emit_sop(struct brw_wm_compile *c,
+ struct prog_instruction *inst, GLuint cond)
+{
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+ struct brw_reg dst, src0, src1;
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i, 1);
+ src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
+ src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
+ brw_push_insn_state(p);
+ brw_CMP(p, brw_null_reg(), cond, src0, src1);
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+ brw_MOV(p, dst, brw_imm_f(0.0));
+ brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
+ brw_MOV(p, dst, brw_imm_f(1.0));
+ brw_pop_insn_state(p);
+ }
+ }
+}
+
+static void emit_slt(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ emit_sop(c, inst, BRW_CONDITIONAL_L);
+}
+
+static void emit_sle(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ emit_sop(c, inst, BRW_CONDITIONAL_LE);
+}
+
+static void emit_sgt(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ emit_sop(c, inst, BRW_CONDITIONAL_G);
+}
+
+static void emit_sge(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ emit_sop(c, inst, BRW_CONDITIONAL_GE);
+}
+
+static void emit_seq(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ emit_sop(c, inst, BRW_CONDITIONAL_EQ);
+}
+
+static void emit_sne(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
+}
+
+static void emit_ddx(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+ struct brw_reg interp[4];
+ struct brw_reg dst;
+ struct brw_reg src0, w;
+ GLuint nr, i;
+ src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
+ w = get_src_reg(c, &inst->SrcReg[1], 3, 1);
+ nr = src0.nr;
+ interp[0] = brw_vec1_grf(nr, 0);
+ interp[1] = brw_vec1_grf(nr, 4);
+ interp[2] = brw_vec1_grf(nr+1, 0);
+ interp[3] = brw_vec1_grf(nr+1, 4);
+ brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
+ for(i = 0; i < 4; i++ ) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i, 1);
+ brw_MOV(p, dst, interp[i]);
+ brw_MUL(p, dst, dst, w);
+ }
+ }
+ brw_set_saturate(p, 0);
+}
+
+static void emit_ddy(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+ struct brw_reg interp[4];
+ struct brw_reg dst;
+ struct brw_reg src0, w;
+ GLuint nr, i;
+
+ src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
+ nr = src0.nr;
+ w = get_src_reg(c, &inst->SrcReg[1], 3, 1);
+ interp[0] = brw_vec1_grf(nr, 0);
+ interp[1] = brw_vec1_grf(nr, 4);
+ interp[2] = brw_vec1_grf(nr+1, 0);
+ interp[3] = brw_vec1_grf(nr+1, 4);
+ brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
+ for(i = 0; i < 4; i++ ) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i, 1);
+ brw_MOV(p, dst, suboffset(interp[i], 1));
+ brw_MUL(p, dst, dst, w);
+ }
+ }
+ brw_set_saturate(p, 0);
+}
+
+static __inline struct brw_reg high_words( struct brw_reg reg )
+{
+ return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
+ 0, 8, 2 );
+}
+
+static __inline struct brw_reg low_words( struct brw_reg reg )
+{
+ return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
+}
+
+static __inline struct brw_reg even_bytes( struct brw_reg reg )
+{
+ return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
+}
+
+static __inline struct brw_reg odd_bytes( struct brw_reg reg )
+{
+ return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
+ 0, 16, 2 );
+}
+
+/* One-, two- and three-dimensional Perlin noise, similar to the description
+ in _Improving Noise_, Ken Perlin, Computer Graphics vol. 35 no. 3. */
+static void noise1_sub( struct brw_wm_compile *c ) {
+
+ struct brw_compile *p = &c->func;
+ struct brw_reg param,
+ x0, x1, /* gradients at each end */
+ t, tmp[ 2 ], /* float temporaries */
+ itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */
+ int i;
+ int mark = mark_tmps( c );
+
+ x0 = alloc_tmp( c );
+ x1 = alloc_tmp( c );
+ t = alloc_tmp( c );
+ tmp[ 0 ] = alloc_tmp( c );
+ tmp[ 1 ] = alloc_tmp( c );
+ itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD );
+ itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD );
+ itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD );
+ itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD );
+ itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD );
+
+ param = lookup_tmp( c, mark - 2 );
+
+ brw_set_access_mode( p, BRW_ALIGN_1 );
+
+ brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
+
+ /* Arrange the two end coordinates into scalars (itmp0/itmp1) to
+ be hashed. Also compute the remainder (offset within the unit
+ length), interleaved to reduce register dependency penalties. */
+ brw_RNDD( p, itmp[ 0 ], param );
+ brw_FRC( p, param, param );
+ brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) );
+ brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
+ brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
+
+ /* We're now ready to perform the hashing. The two hashes are
+ interleaved for performance. The hash function used is
+ designed to rapidly achieve avalanche and require only 32x16
+ bit multiplication, and 16-bit swizzles (which we get for
+ free). We can't use immediate operands in the multiplies,
+ because immediates are permitted only in src1 and the 16-bit
+ factor is permitted only in src0. */
+ for( i = 0; i < 2; i++ )
+ brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] );
+ for( i = 0; i < 2; i++ )
+ brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
+ high_words( itmp[ i ] ) );
+ for( i = 0; i < 2; i++ )
+ brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] );
+ for( i = 0; i < 2; i++ )
+ brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
+ high_words( itmp[ i ] ) );
+ for( i = 0; i < 2; i++ )
+ brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
+ for( i = 0; i < 2; i++ )
+ brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
+ high_words( itmp[ i ] ) );
+
+ /* Now we want to initialise the two gradients based on the
+ hashes. Format conversion from signed integer to float leaves
+ everything scaled too high by a factor of pow( 2, 31 ), but
+ we correct for that right at the end. */
+ brw_ADD( p, t, param, brw_imm_f( -1.0 ) );
+ brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) );
+ brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) );
+
+ brw_MUL( p, x0, x0, param );
+ brw_MUL( p, x1, x1, t );
+
+ /* We interpolate between the gradients using the polynomial
+ 6t^5 - 15t^4 + 10t^3 (Perlin). */
+ brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) );
+ brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
+ brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
+ brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
+ brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
+ brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the
+ pipeline */
+ brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param );
+ brw_MUL( p, param, tmp[ 0 ], param );
+ brw_MUL( p, x1, x1, param );
+ brw_ADD( p, x0, x0, x1 );
+ /* scale by pow( 2, -30 ), to compensate for the format conversion
+ above and an extra factor of 2 so that a single gradient covers
+ the [-1,1] range */
+ brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) );
+
+ release_tmps( c, mark );
+}
+
+static void emit_noise1( struct brw_wm_compile *c,
+ struct prog_instruction *inst )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg src, param, dst;
+ GLuint mask = inst->DstReg.WriteMask;
+ int i;
+ int mark = mark_tmps( c );
+
+ assert( mark == 0 );
+
+ src = get_src_reg( c, inst->SrcReg, 0, 1 );
+
+ param = alloc_tmp( c );
+
+ brw_MOV( p, param, src );
+
+ invoke_subroutine( c, SUB_NOISE1, noise1_sub );
+
+ /* Fill in the result: */
+ brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
+ for (i = 0 ; i < 4; i++) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i, 1);
+ brw_MOV( p, dst, param );
+ }
+ }
+ if( inst->SaturateMode == SATURATE_ZERO_ONE )
+ brw_set_saturate( p, 0 );
+
+ release_tmps( c, mark );
+}
+
+static void noise2_sub( struct brw_wm_compile *c ) {
+
+ struct brw_compile *p = &c->func;
+ struct brw_reg param0, param1,
+ x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */
+ t, tmp[ 4 ], /* float temporaries */
+ itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */
+ int i;
+ int mark = mark_tmps( c );
+
+ x0y0 = alloc_tmp( c );
+ x0y1 = alloc_tmp( c );
+ x1y0 = alloc_tmp( c );
+ x1y1 = alloc_tmp( c );
+ t = alloc_tmp( c );
+ for( i = 0; i < 4; i++ ) {
+ tmp[ i ] = alloc_tmp( c );
+ itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
+ }
+ itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD );
+ itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD );
+ itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD );
+
+ param0 = lookup_tmp( c, mark - 3 );
+ param1 = lookup_tmp( c, mark - 2 );
+
+ brw_set_access_mode( p, BRW_ALIGN_1 );
+
+ /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to
+ be hashed. Also compute the remainders (offsets within the unit
+ square), interleaved to reduce register dependency penalties. */
+ brw_RNDD( p, itmp[ 0 ], param0 );
+ brw_RNDD( p, itmp[ 1 ], param1 );
+ brw_FRC( p, param0, param0 );
+ brw_FRC( p, param1, param1 );
+ brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */
+ brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ),
+ low_words( itmp[ 1 ] ) );
+ brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */
+ brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */
+ brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) );
+ brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) );
+ brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) );
+
+ /* We're now ready to perform the hashing. The four hashes are
+ interleaved for performance. The hash function used is
+ designed to rapidly achieve avalanche and require only 32x16
+ bit multiplication, and 16-bit swizzles (which we get for
+ free). We can't use immediate operands in the multiplies,
+ because immediates are permitted only in src1 and the 16-bit
+ factor is permitted only in src0. */
+ for( i = 0; i < 4; i++ )
+ brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] );
+ for( i = 0; i < 4; i++ )
+ brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
+ high_words( itmp[ i ] ) );
+ for( i = 0; i < 4; i++ )
+ brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] );
+ for( i = 0; i < 4; i++ )
+ brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
+ high_words( itmp[ i ] ) );
+ for( i = 0; i < 4; i++ )
+ brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] );
+ for( i = 0; i < 4; i++ )
+ brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ),
+ high_words( itmp[ i ] ) );
+
+ /* Now we want to initialise the four gradients based on the
+ hashes. Format conversion from signed integer to float leaves
+ everything scaled too high by a factor of pow( 2, 15 ), but
+ we correct for that right at the end. */
+ brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
+ brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
+ brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
+ brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) );
+ brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) );
+
+ brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) );
+ brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) );
+ brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) );
+ brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) );
+
+ brw_MUL( p, x1y0, x1y0, t );
+ brw_MUL( p, x1y1, x1y1, t );
+ brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
+ brw_MUL( p, x0y0, x0y0, param0 );
+ brw_MUL( p, x0y1, x0y1, param0 );
+
+ brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 );
+ brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 );
+ brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t );
+ brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t );
+
+ brw_ADD( p, x0y0, x0y0, tmp[ 0 ] );
+ brw_ADD( p, x1y0, x1y0, tmp[ 2 ] );
+ brw_ADD( p, x0y1, x0y1, tmp[ 1 ] );
+ brw_ADD( p, x1y1, x1y1, tmp[ 3 ] );
+
+ /* We interpolate between the gradients using the polynomial
+ 6t^5 - 15t^4 + 10t^3 (Perlin). */
+ brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) );
+ brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) );
+ brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) );
+ brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) );
+ brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
+ brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
+ brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the
+ pipeline */
+ brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) );
+ brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) );
+ brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
+ brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
+ brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the
+ pipeline */
+ brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 );
+ brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 );
+ brw_MUL( p, param0, tmp[ 0 ], param0 );
+ brw_MUL( p, param1, tmp[ 1 ], param1 );
+
+ /* Here we interpolate in the y dimension... */
+ brw_MUL( p, x0y1, x0y1, param1 );
+ brw_MUL( p, x1y1, x1y1, param1 );
+ brw_ADD( p, x0y0, x0y0, x0y1 );
+ brw_ADD( p, x1y0, x1y0, x1y1 );
+
+ /* And now in x. There are horrible register dependencies here,
+ but we have nothing else to do. */
+ brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
+ brw_MUL( p, x1y0, x1y0, param0 );
+ brw_ADD( p, x0y0, x0y0, x1y0 );
+
+ /* scale by pow( 2, -15 ), as described above */
+ brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) );
+
+ release_tmps( c, mark );
+}
+
+static void emit_noise2( struct brw_wm_compile *c,
+ struct prog_instruction *inst )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg src0, src1, param0, param1, dst;
+ GLuint mask = inst->DstReg.WriteMask;
+ int i;
+ int mark = mark_tmps( c );
+
+ assert( mark == 0 );
+
+ src0 = get_src_reg( c, inst->SrcReg, 0, 1 );
+ src1 = get_src_reg( c, inst->SrcReg, 1, 1 );
+
+ param0 = alloc_tmp( c );
+ param1 = alloc_tmp( c );
+
+ brw_MOV( p, param0, src0 );
+ brw_MOV( p, param1, src1 );
+
+ invoke_subroutine( c, SUB_NOISE2, noise2_sub );
+
+ /* Fill in the result: */
+ brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
+ for (i = 0 ; i < 4; i++) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i, 1);
+ brw_MOV( p, dst, param0 );
+ }
+ }
+ if( inst->SaturateMode == SATURATE_ZERO_ONE )
+ brw_set_saturate( p, 0 );
+
+ release_tmps( c, mark );
+}
+
+/* The three-dimensional case is much like the one- and two- versions above,
+ but since the number of corners is rapidly growing we now pack 16 16-bit
+ hashes into each register to extract more parallelism from the EUs. */
+static void noise3_sub( struct brw_wm_compile *c ) {
+
+ struct brw_compile *p = &c->func;
+ struct brw_reg param0, param1, param2,
+ x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
+ xi, yi, zi, /* interpolation coefficients */
+ t, tmp[ 8 ], /* float temporaries */
+ itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */
+ wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */
+ int i;
+ int mark = mark_tmps( c );
+
+ x0y0 = alloc_tmp( c );
+ x0y1 = alloc_tmp( c );
+ x1y0 = alloc_tmp( c );
+ x1y1 = alloc_tmp( c );
+ xi = alloc_tmp( c );
+ yi = alloc_tmp( c );
+ zi = alloc_tmp( c );
+ t = alloc_tmp( c );
+ for( i = 0; i < 8; i++ ) {
+ tmp[ i ] = alloc_tmp( c );
+ itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD );
+ wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 );
+ }
+
+ param0 = lookup_tmp( c, mark - 4 );
+ param1 = lookup_tmp( c, mark - 3 );
+ param2 = lookup_tmp( c, mark - 2 );
+
+ brw_set_access_mode( p, BRW_ALIGN_1 );
+
+ /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to
+ be hashed. Also compute the remainders (offsets within the unit
+ cube), interleaved to reduce register dependency penalties. */
+ brw_RNDD( p, itmp[ 0 ], param0 );
+ brw_RNDD( p, itmp[ 1 ], param1 );
+ brw_RNDD( p, itmp[ 2 ], param2 );
+ brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBC8F ) ); /* constant used later */
+ brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0xD0BD ) ); /* constant used later */
+ brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0x9B93 ) ); /* constant used later */
+ brw_FRC( p, param0, param0 );
+ brw_FRC( p, param1, param1 );
+ brw_FRC( p, param2, param2 );
+ /* Since we now have only 16 bits of precision in the hash, we must
+ be more careful about thorough mixing to maintain entropy as we
+ squash the input vector into a small scalar. */
+ brw_MUL( p, brw_acc_reg(), itmp[ 4 ], itmp[ 0 ] );
+ brw_MAC( p, brw_acc_reg(), itmp[ 5 ], itmp[ 1 ] );
+ brw_MAC( p, itmp[ 0 ], itmp[ 6 ], itmp[ 2 ] );
+ brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ),
+ brw_imm_uw( 0xBC8F ) );
+
+ /* Temporarily disable the execution mask while we work with ExecSize=16
+ channels (the mask is set for ExecSize=8 and is probably incorrect).
+ Although this might cause execution of unwanted channels, the code
+ writes only to temporary registers and has no side effects, so
+ disabling the mask is harmless. */
+ brw_push_insn_state( p );
+ brw_set_mask_control( p, BRW_MASK_DISABLE );
+ brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) );
+ brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) );
+ brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) );
+
+ /* We're now ready to perform the hashing. The eight hashes are
+ interleaved for performance. The hash function used is
+ designed to rapidly achieve avalanche and require only 16x16
+ bit multiplication, and 8-bit swizzles (which we get for
+ free). */
+ for( i = 0; i < 4; i++ )
+ brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) );
+ for( i = 0; i < 4; i++ )
+ brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
+ odd_bytes( wtmp[ i ] ) );
+ for( i = 0; i < 4; i++ )
+ brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) );
+ for( i = 0; i < 4; i++ )
+ brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ),
+ odd_bytes( wtmp[ i ] ) );
+ brw_pop_insn_state( p );
+
+ /* Now we want to initialise the four rear gradients based on the
+ hashes. Format conversion from signed integer to float leaves
+ everything scaled too high by a factor of pow( 2, 15 ), but
+ we correct for that right at the end. */
+ /* x component */
+ brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
+ brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) );
+ brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) );
+ brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) );
+ brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) );
+
+ brw_push_insn_state( p );
+ brw_set_mask_control( p, BRW_MASK_DISABLE );
+ brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
+ brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
+ brw_pop_insn_state( p );
+
+ brw_MUL( p, x1y0, x1y0, t );
+ brw_MUL( p, x1y1, x1y1, t );
+ brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
+ brw_MUL( p, x0y0, x0y0, param0 );
+ brw_MUL( p, x0y1, x0y1, param0 );
+
+ /* y component */
+ brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
+ brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
+ brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
+ brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
+
+ brw_push_insn_state( p );
+ brw_set_mask_control( p, BRW_MASK_DISABLE );
+ brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) );
+ brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) );
+ brw_pop_insn_state( p );
+
+ brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
+ brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
+ brw_ADD( p, t, param0, brw_imm_f( -1.0 ) );
+ brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
+ brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
+
+ brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
+ brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
+ brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
+ brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
+
+ /* z component */
+ brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) );
+ brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) );
+ brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) );
+ brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) );
+
+ brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 );
+ brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 );
+ brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 );
+ brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 );
+
+ brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
+ brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
+ brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
+ brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
+
+ /* We interpolate between the gradients using the polynomial
+ 6t^5 - 15t^4 + 10t^3 (Perlin). */
+ brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) );
+ brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) );
+ brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) );
+ brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) );
+ brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) );
+ brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) );
+ brw_MUL( p, xi, xi, param0 );
+ brw_MUL( p, yi, yi, param1 );
+ brw_MUL( p, zi, zi, param2 );
+ brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) );
+ brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) );
+ brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) );
+ brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */
+ brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */
+ brw_MUL( p, xi, xi, param0 );
+ brw_MUL( p, yi, yi, param1 );
+ brw_MUL( p, zi, zi, param2 );
+ brw_MUL( p, xi, xi, param0 );
+ brw_MUL( p, yi, yi, param1 );
+ brw_MUL( p, zi, zi, param2 );
+ brw_MUL( p, xi, xi, param0 );
+ brw_MUL( p, yi, yi, param1 );
+ brw_MUL( p, zi, zi, param2 );
+
+ /* Here we interpolate in the y dimension... */
+ brw_MUL( p, x0y1, x0y1, yi );
+ brw_MUL( p, x1y1, x1y1, yi );
+ brw_ADD( p, x0y0, x0y0, x0y1 );
+ brw_ADD( p, x1y0, x1y0, x1y1 );
+
+ /* And now in x. Leave the result in tmp[ 0 ] (see below)... */
+ brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
+ brw_MUL( p, x1y0, x1y0, xi );
+ brw_ADD( p, tmp[ 0 ], x0y0, x1y0 );
+
+ /* Now do the same thing for the front four gradients... */
+ /* x component */
+ brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) );
+ brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) );
+ brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) );
+ brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) );
+
+ brw_push_insn_state( p );
+ brw_set_mask_control( p, BRW_MASK_DISABLE );
+ brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
+ brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
+ brw_pop_insn_state( p );
+
+ brw_MUL( p, x1y0, x1y0, t );
+ brw_MUL( p, x1y1, x1y1, t );
+ brw_ADD( p, t, param1, brw_imm_f( -1.0 ) );
+ brw_MUL( p, x0y0, x0y0, param0 );
+ brw_MUL( p, x0y1, x0y1, param0 );
+
+ /* y component */
+ brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
+ brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
+ brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
+ brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
+
+ brw_push_insn_state( p );
+ brw_set_mask_control( p, BRW_MASK_DISABLE );
+ brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) );
+ brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) );
+ brw_pop_insn_state( p );
+
+ brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
+ brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
+ brw_ADD( p, t, param2, brw_imm_f( -1.0 ) );
+ brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 );
+ brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 );
+
+ brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
+ brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
+ brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
+ brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
+
+ /* z component */
+ brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) );
+ brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) );
+ brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) );
+ brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) );
+
+ brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t );
+ brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t );
+ brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t );
+ brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t );
+
+ brw_ADD( p, x0y0, x0y0, tmp[ 4 ] );
+ brw_ADD( p, x0y1, x0y1, tmp[ 5 ] );
+ brw_ADD( p, x1y0, x1y0, tmp[ 6 ] );
+ brw_ADD( p, x1y1, x1y1, tmp[ 7 ] );
+
+ /* The interpolation coefficients are still around from last time, so
+ again interpolate in the y dimension... */
+ brw_ADD( p, x0y1, x0y1, negate( x0y0 ) );
+ brw_ADD( p, x1y1, x1y1, negate( x1y0 ) );
+ brw_MUL( p, x0y1, x0y1, yi );
+ brw_MUL( p, x1y1, x1y1, yi );
+ brw_ADD( p, x0y0, x0y0, x0y1 );
+ brw_ADD( p, x1y0, x1y0, x1y1 );
+
+ /* And now in x. The rear face is in tmp[ 0 ] (see above), so this
+ time put the front face in tmp[ 1 ] and we're nearly there... */
+ brw_ADD( p, x1y0, x1y0, negate( x0y0 ) );
+ brw_MUL( p, x1y0, x1y0, xi );
+ brw_ADD( p, tmp[ 1 ], x0y0, x1y0 );
+
+ /* The final interpolation, in the z dimension: */
+ brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) );
+ brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi );
+ brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] );
+
+ /* scale by pow( 2, -15 ), as described above */
+ brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) );
+
+ release_tmps( c, mark );
+}
+
+static void emit_noise3( struct brw_wm_compile *c,
+ struct prog_instruction *inst )
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg src0, src1, src2, param0, param1, param2, dst;
+ GLuint mask = inst->DstReg.WriteMask;
+ int i;
+ int mark = mark_tmps( c );
+
+ assert( mark == 0 );
+
+ src0 = get_src_reg( c, inst->SrcReg, 0, 1 );
+ src1 = get_src_reg( c, inst->SrcReg, 1, 1 );
+ src2 = get_src_reg( c, inst->SrcReg, 2, 1 );
+
+ param0 = alloc_tmp( c );
+ param1 = alloc_tmp( c );
+ param2 = alloc_tmp( c );
+
+ brw_MOV( p, param0, src0 );
+ brw_MOV( p, param1, src1 );
+ brw_MOV( p, param2, src2 );
+
+ invoke_subroutine( c, SUB_NOISE3, noise3_sub );
+
+ /* Fill in the result: */
+ brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
+ for (i = 0 ; i < 4; i++) {
+ if (mask & (1<<i)) {
+ dst = get_dst_reg(c, inst, i, 1);
+ brw_MOV( p, dst, param0 );
+ }
+ }
+ if( inst->SaturateMode == SATURATE_ZERO_ONE )
+ brw_set_saturate( p, 0 );
+
+ release_tmps( c, mark );
+}
+
+static void emit_wpos_xy(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ GLuint mask = inst->DstReg.WriteMask;
+ struct brw_reg src0[2], dst[2];
+
+ dst[0] = get_dst_reg(c, inst, 0, 1);
+ dst[1] = get_dst_reg(c, inst, 1, 1);
+
+ src0[0] = get_src_reg(c, &inst->SrcReg[0], 0, 1);
+ src0[1] = get_src_reg(c, &inst->SrcReg[0], 1, 1);
+
+ /* Calculate the pixel offset from window bottom left into destination
+ * X and Y channels.
+ */
+ if (mask & WRITEMASK_X) {
+ /* X' = X - origin_x */
+ brw_ADD(p,
+ dst[0],
+ retype(src0[0], BRW_REGISTER_TYPE_W),
+ brw_imm_d(0 - c->key.origin_x));
+ }
+
+ if (mask & WRITEMASK_Y) {
+ /* Y' = height - (Y - origin_y) = height + origin_y - Y */
+ brw_ADD(p,
+ dst[1],
+ negate(retype(src0[1], BRW_REGISTER_TYPE_W)),
+ brw_imm_d(c->key.origin_y + c->key.drawable_height - 1));
+ }
+}
+
+/* TODO
+ BIAS on SIMD8 not workind yet...
+ */
+static void emit_txb(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg dst[4], src[4], payload_reg;
+ GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
+
+ GLuint i;
+ payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
+ for (i = 0; i < 4; i++)
+ dst[i] = get_dst_reg(c, inst, i, 1);
+ for (i = 0; i < 4; i++)
+ src[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
+
+ switch (inst->TexSrcTarget) {
+ case TEXTURE_1D_INDEX:
+ brw_MOV(p, brw_message_reg(2), src[0]);
+ brw_MOV(p, brw_message_reg(3), brw_imm_f(0));
+ brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
+ break;
+ case TEXTURE_2D_INDEX:
+ case TEXTURE_RECT_INDEX:
+ brw_MOV(p, brw_message_reg(2), src[0]);
+ brw_MOV(p, brw_message_reg(3), src[1]);
+ brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
+ break;
+ default:
+ brw_MOV(p, brw_message_reg(2), src[0]);
+ brw_MOV(p, brw_message_reg(3), src[1]);
+ brw_MOV(p, brw_message_reg(4), src[2]);
+ break;
+ }
+ brw_MOV(p, brw_message_reg(5), src[3]);
+ brw_MOV(p, brw_message_reg(6), brw_imm_f(0));
+ brw_SAMPLE(p,
+ retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),
+ 1,
+ retype(payload_reg, BRW_REGISTER_TYPE_UW),
+ unit + MAX_DRAW_BUFFERS, /* surface */
+ unit, /* sampler */
+ inst->DstReg.WriteMask,
+ BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS,
+ 4,
+ 4,
+ 0);
+}
+
+static void emit_tex(struct brw_wm_compile *c,
+ struct prog_instruction *inst)
+{
+ struct brw_compile *p = &c->func;
+ struct brw_reg dst[4], src[4], payload_reg;
+ GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
+
+ GLuint msg_len;
+ GLuint i, nr;
+ GLuint emit;
+ GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
+
+ payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
+
+ for (i = 0; i < 4; i++)
+ dst[i] = get_dst_reg(c, inst, i, 1);
+ for (i = 0; i < 4; i++)
+ src[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
+
+
+ switch (inst->TexSrcTarget) {
+ case TEXTURE_1D_INDEX:
+ emit = WRITEMASK_X;
+ nr = 1;
+ break;
+ case TEXTURE_2D_INDEX:
+ case TEXTURE_RECT_INDEX:
+ emit = WRITEMASK_XY;
+ nr = 2;
+ break;
+ default:
+ emit = WRITEMASK_XYZ;
+ nr = 3;
+ break;
+ }
+ msg_len = 1;
+
+ for (i = 0; i < nr; i++) {
+ static const GLuint swz[4] = {0,1,2,2};
+ if (emit & (1<<i))
+ brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]);
+ else
+ brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0));
+ msg_len += 1;
+ }
+
+ if (shadow) {
+ brw_MOV(p, brw_message_reg(5), brw_imm_f(0));
+ brw_MOV(p, brw_message_reg(6), src[2]);
+ }
+
+ brw_SAMPLE(p,
+ retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),
+ 1,
+ retype(payload_reg, BRW_REGISTER_TYPE_UW),
+ unit + MAX_DRAW_BUFFERS, /* surface */
+ unit, /* sampler */
+ inst->DstReg.WriteMask,
+ BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE,
+ 4,
+ shadow ? 6 : 4,
+ 0);
+
+ if (shadow)
+ brw_MOV(p, dst[3], brw_imm_f(1.0));
+}
+
+static void post_wm_emit( struct brw_wm_compile *c )
+{
+ GLuint nr_insns = c->fp->program.Base.NumInstructions;
+ GLuint insn, target_insn;
+ struct prog_instruction *inst1, *inst2;
+ struct brw_instruction *brw_inst1, *brw_inst2;
+ int offset;
+ for (insn = 0; insn < nr_insns; insn++) {
+ inst1 = &c->fp->program.Base.Instructions[insn];
+ brw_inst1 = inst1->Data;
+ switch (inst1->Opcode) {
+ case OPCODE_CAL:
+ target_insn = inst1->BranchTarget;
+ inst2 = &c->fp->program.Base.Instructions[target_insn];
+ brw_inst2 = inst2->Data;
+ offset = brw_inst2 - brw_inst1;
+ brw_set_src1(brw_inst1, brw_imm_d(offset*16));
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
+{
+#define MAX_IFSN 32
+#define MAX_LOOP_DEPTH 32
+ struct brw_instruction *if_inst[MAX_IFSN], *loop_inst[MAX_LOOP_DEPTH];
+ struct brw_instruction *inst0, *inst1;
+ int i, if_insn = 0, loop_insn = 0;
+ struct brw_compile *p = &c->func;
+ struct brw_indirect stack_index = brw_indirect(0, 0);
+
+ c->reg_index = 0;
+ prealloc_reg(c);
+ brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+ brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
+
+ for (i = 0; i < c->nr_fp_insns; i++) {
+ struct prog_instruction *inst = &c->prog_instructions[i];
+ struct prog_instruction *orig_inst;
+
+ if ((orig_inst = inst->Data) != 0)
+ orig_inst->Data = current_insn(p);
+
+ if (inst->CondUpdate)
+ brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+ else
+ brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
+
+ switch (inst->Opcode) {
+ case WM_PIXELXY:
+ emit_pixel_xy(c, inst);
+ break;
+ case WM_DELTAXY:
+ emit_delta_xy(c, inst);
+ break;
+ case WM_PIXELW:
+ emit_pixel_w(c, inst);
+ break;
+ case WM_LINTERP:
+ emit_linterp(c, inst);
+ break;
+ case WM_PINTERP:
+ emit_pinterp(c, inst);
+ break;
+ case WM_CINTERP:
+ emit_cinterp(c, inst);
+ break;
+ case WM_WPOSXY:
+ emit_wpos_xy(c, inst);
+ break;
+ case WM_FB_WRITE:
+ emit_fb_write(c, inst);
+ break;
+ case OPCODE_ABS:
+ emit_abs(c, inst);
+ break;
+ case OPCODE_ADD:
+ emit_add(c, inst);
+ break;
+ case OPCODE_SUB:
+ emit_sub(c, inst);
+ break;
+ case OPCODE_FRC:
+ emit_frc(c, inst);
+ break;
+ case OPCODE_FLR:
+ emit_flr(c, inst);
+ break;
+ case OPCODE_LRP:
+ emit_lrp(c, inst);
+ break;
+ case OPCODE_TRUNC:
+ emit_trunc(c, inst);
+ break;
+ case OPCODE_MOV:
+ emit_mov(c, inst);
+ break;
+ case OPCODE_DP3:
+ emit_dp3(c, inst);
+ break;
+ case OPCODE_DP4:
+ emit_dp4(c, inst);
+ break;
+ case OPCODE_XPD:
+ emit_xpd(c, inst);
+ break;
+ case OPCODE_DPH:
+ emit_dph(c, inst);
+ break;
+ case OPCODE_RCP:
+ emit_rcp(c, inst);
+ break;
+ case OPCODE_RSQ:
+ emit_rsq(c, inst);
+ break;
+ case OPCODE_SIN:
+ emit_sin(c, inst);
+ break;
+ case OPCODE_COS:
+ emit_cos(c, inst);
+ break;
+ case OPCODE_EX2:
+ emit_ex2(c, inst);
+ break;
+ case OPCODE_LG2:
+ emit_lg2(c, inst);
+ break;
+ case OPCODE_MAX:
+ emit_max(c, inst);
+ break;
+ case OPCODE_MIN:
+ emit_min(c, inst);
+ break;
+ case OPCODE_DDX:
+ emit_ddx(c, inst);
+ break;
+ case OPCODE_DDY:
+ emit_ddy(c, inst);
+ break;
+ case OPCODE_SLT:
+ emit_slt(c, inst);
+ break;
+ case OPCODE_SLE:
+ emit_sle(c, inst);
+ break;
+ case OPCODE_SGT:
+ emit_sgt(c, inst);
+ break;
+ case OPCODE_SGE:
+ emit_sge(c, inst);
+ break;
+ case OPCODE_SEQ:
+ emit_seq(c, inst);
+ break;
+ case OPCODE_SNE:
+ emit_sne(c, inst);
+ break;
+ case OPCODE_MUL:
+ emit_mul(c, inst);
+ break;
+ case OPCODE_POW:
+ emit_pow(c, inst);
+ break;
+ case OPCODE_MAD:
+ emit_mad(c, inst);
+ break;
+ case OPCODE_NOISE1:
+ emit_noise1(c, inst);
+ break;
+ case OPCODE_NOISE2:
+ emit_noise2(c, inst);
+ break;
+ case OPCODE_NOISE3:
+ emit_noise3(c, inst);
+ break;
+ /* case OPCODE_NOISE4: */
+ /* not yet implemented */
+ case OPCODE_TEX:
+ emit_tex(c, inst);
+ break;
+ case OPCODE_TXB:
+ emit_txb(c, inst);
+ break;
+ case OPCODE_KIL_NV:
+ emit_kil(c);
+ break;
+ case OPCODE_IF:
+ assert(if_insn < MAX_IFSN);
+ if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
+ break;
+ case OPCODE_ELSE:
+ if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]);
+ break;
+ case OPCODE_ENDIF:
+ assert(if_insn > 0);
+ brw_ENDIF(p, if_inst[--if_insn]);
+ break;
+ case OPCODE_BGNSUB:
+ case OPCODE_ENDSUB:
+ break;
+ case OPCODE_CAL:
+ brw_push_insn_state(p);
+ brw_set_mask_control(p, BRW_MASK_DISABLE);
+ brw_set_access_mode(p, BRW_ALIGN_1);
+ brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
+ brw_set_access_mode(p, BRW_ALIGN_16);
+ brw_ADD(p, get_addr_reg(stack_index),
+ get_addr_reg(stack_index), brw_imm_d(4));
+ orig_inst = inst->Data;
+ orig_inst->Data = &p->store[p->nr_insn];
+ brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
+ brw_pop_insn_state(p);
+ break;
+
+ case OPCODE_RET:
+ brw_push_insn_state(p);
+ brw_set_mask_control(p, BRW_MASK_DISABLE);
+ brw_ADD(p, get_addr_reg(stack_index),
+ get_addr_reg(stack_index), brw_imm_d(-4));
+ brw_set_access_mode(p, BRW_ALIGN_1);
+ brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
+ brw_set_access_mode(p, BRW_ALIGN_16);
+ brw_pop_insn_state(p);
+
+ break;
+ case OPCODE_BGNLOOP:
+ loop_inst[loop_insn++] = brw_DO(p, BRW_EXECUTE_8);
+ break;
+ case OPCODE_BRK:
+ brw_BREAK(p);
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+ break;
+ case OPCODE_CONT:
+ brw_CONT(p);
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+ break;
+ case OPCODE_ENDLOOP:
+ loop_insn--;
+ inst0 = inst1 = brw_WHILE(p, loop_inst[loop_insn]);
+ /* patch all the BREAK instructions from
+ last BEGINLOOP */
+ while (inst0 > loop_inst[loop_insn]) {
+ inst0--;
+ if (inst0->header.opcode == BRW_OPCODE_BREAK) {
+ inst0->bits3.if_else.jump_count = inst1 - inst0 + 1;
+ inst0->bits3.if_else.pop_count = 0;
+ } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
+ inst0->bits3.if_else.jump_count = inst1 - inst0;
+ inst0->bits3.if_else.pop_count = 0;
+ }
+ }
+ break;
+ default:
+ _mesa_printf("unsupported IR in fragment shader %d\n",
+ inst->Opcode);
+ }
+ if (inst->CondUpdate)
+ brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
+ else
+ brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+ }
+ post_wm_emit(c);
+ for (i = 0; i < c->fp->program.Base.NumInstructions; i++)
+ c->fp->program.Base.Instructions[i].Data = NULL;
+}
+
+void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
+{
+ brw_wm_pass_fp(c);
+ brw_wm_emit_glsl(brw, c);
+ c->prog_data.total_grf = c->reg_index;
+ c->prog_data.total_scratch = 0;
+}