#include "main/macros.h" #include "shader/prog_parameter.h" #include "brw_context.h" #include "brw_eu.h" #include "brw_wm.h" enum _subroutine { SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4 }; /* Only guess, need a flag in gl_fragment_program later */ GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp) { int i; for (i = 0; i < fp->Base.NumInstructions; i++) { struct prog_instruction *inst = &fp->Base.Instructions[i]; switch (inst->Opcode) { case OPCODE_IF: case OPCODE_TRUNC: case OPCODE_ENDIF: case OPCODE_CAL: case OPCODE_BRK: case OPCODE_RET: case OPCODE_DDX: case OPCODE_DDY: case OPCODE_NOISE1: case OPCODE_NOISE2: case OPCODE_NOISE3: case OPCODE_NOISE4: case OPCODE_BGNLOOP: return GL_TRUE; default: break; } } return GL_FALSE; } static void set_reg(struct brw_wm_compile *c, int file, int index, int component, struct brw_reg reg) { c->wm_regs[file][index][component].reg = reg; c->wm_regs[file][index][component].inited = GL_TRUE; } static int get_scalar_dst_index(struct prog_instruction *inst) { int i; for (i = 0; i < 4; i++) if (inst->DstReg.WriteMask & (1<tmp_index == c->tmp_max) c->tmp_regs[ c->tmp_max++ ] = c->reg_index++; reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0); return reg; } static int mark_tmps(struct brw_wm_compile *c) { return c->tmp_index; } static struct brw_reg lookup_tmp( struct brw_wm_compile *c, int index ) { return brw_vec8_grf( c->tmp_regs[ index ], 0 ); } static void release_tmps(struct brw_wm_compile *c, int mark) { c->tmp_index = mark; } static struct brw_reg get_reg(struct brw_wm_compile *c, int file, int index, int component, int nr, GLuint neg, GLuint abs) { struct brw_reg reg; switch (file) { case PROGRAM_STATE_VAR: case PROGRAM_CONSTANT: case PROGRAM_UNIFORM: file = PROGRAM_STATE_VAR; break; case PROGRAM_UNDEFINED: return brw_null_reg(); default: break; } if(c->wm_regs[file][index][component].inited) reg = c->wm_regs[file][index][component].reg; else reg = brw_vec8_grf(c->reg_index, 0); if(!c->wm_regs[file][index][component].inited) { set_reg(c, file, index, component, reg); c->reg_index++; } if (neg & (1<< component)) { reg = negate(reg); } if (abs) reg = brw_abs(reg); return reg; } static void prealloc_reg(struct brw_wm_compile *c) { int i, j; struct brw_reg reg; int nr_interp_regs = 0; GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted; for (i = 0; i < 4; i++) { reg = (i < c->key.nr_depth_regs) ? brw_vec8_grf(i*2, 0) : brw_vec8_grf(0, 0); set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg); } c->reg_index += 2*c->key.nr_depth_regs; { int nr_params = c->fp->program.Base.Parameters->NumParameters; struct gl_program_parameter_list *plist = c->fp->program.Base.Parameters; int index = 0; c->prog_data.nr_params = 4*nr_params; for (i = 0; i < nr_params; i++) { for (j = 0; j < 4; j++, index++) { reg = brw_vec1_grf(c->reg_index + index/8, index%8); c->prog_data.param[index] = &plist->ParameterValues[i][j]; set_reg(c, PROGRAM_STATE_VAR, i, j, reg); } } c->nr_creg = 2*((4*nr_params+15)/16); c->reg_index += c->nr_creg; } for (i = 0; i < FRAG_ATTRIB_MAX; i++) { if (inputs & (1<reg_index, 0); for (j = 0; j < 4; j++) set_reg(c, PROGRAM_PAYLOAD, i, j, reg); c->reg_index += 2; } } c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2; c->prog_data.urb_read_length = nr_interp_regs * 2; c->prog_data.curb_read_length = c->nr_creg; c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0); c->reg_index++; c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0); c->reg_index += 2; } static struct brw_reg get_dst_reg(struct brw_wm_compile *c, struct prog_instruction *inst, int component, int nr) { return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr, 0, 0); } static struct brw_reg get_src_reg(struct brw_wm_compile *c, struct prog_src_register *src, int index, int nr) { int component = GET_SWZ(src->Swizzle, index); return get_reg(c, src->File, src->Index, component, nr, src->NegateBase, src->Abs); } /* Subroutines are minimal support for resusable instruction sequences. They are implemented as simply as possible to minimise overhead: there is no explicit support for communication between the caller and callee other than saving the return address in a temporary register, nor is there any automatic local storage. This implies that great care is required before attempting reentrancy or any kind of nested subroutine invocations. */ static void invoke_subroutine( struct brw_wm_compile *c, enum _subroutine subroutine, void (*emit)( struct brw_wm_compile * ) ) { struct brw_compile *p = &c->func; assert( subroutine < BRW_WM_MAX_SUBROUTINE ); if( c->subroutines[ subroutine ] ) { /* subroutine previously emitted: reuse existing instructions */ int mark = mark_tmps( c ); struct brw_reg return_address = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD ); int here = p->nr_insn; brw_push_insn_state(p); brw_set_mask_control(p, BRW_MASK_DISABLE); brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 2 << 4 ) ); brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( ( c->subroutines[ subroutine ] - here - 1 ) << 4 ) ); brw_pop_insn_state(p); release_tmps( c, mark ); } else { /* previously unused subroutine: emit, and mark for later reuse */ int mark = mark_tmps( c ); struct brw_reg return_address = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD ); struct brw_instruction *calc; int base = p->nr_insn; brw_push_insn_state(p); brw_set_mask_control(p, BRW_MASK_DISABLE); calc = brw_ADD( p, return_address, brw_ip_reg(), brw_imm_ud( 0 ) ); brw_pop_insn_state(p); c->subroutines[ subroutine ] = p->nr_insn; emit( c ); brw_push_insn_state(p); brw_set_mask_control(p, BRW_MASK_DISABLE); brw_MOV( p, brw_ip_reg(), return_address ); brw_pop_insn_state(p); brw_set_src1( calc, brw_imm_ud( ( p->nr_insn - base ) << 4 ) ); release_tmps( c, mark ); } } static void emit_abs( struct brw_wm_compile *c, struct prog_instruction *inst) { int i; struct brw_compile *p = &c->func; brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF); for (i = 0; i < 4; i++) { if (inst->DstReg.WriteMask & (1<SrcReg[0], i, 1); brw_MOV(p, dst, brw_abs(src)); } } brw_set_saturate(p, 0); } static void emit_trunc( struct brw_wm_compile *c, struct prog_instruction *inst) { int i; struct brw_compile *p = &c->func; GLuint mask = inst->DstReg.WriteMask; brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF); for (i = 0; i < 4; i++) { if (mask & (1<SrcReg[0], i, 1); brw_RNDZ(p, dst, src); } } brw_set_saturate(p, 0); } static void emit_mov( struct brw_wm_compile *c, struct prog_instruction *inst) { int i; struct brw_compile *p = &c->func; GLuint mask = inst->DstReg.WriteMask; brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF); for (i = 0; i < 4; i++) { if (mask & (1<SrcReg[0], i, 1); brw_MOV(p, dst, src); } } brw_set_saturate(p, 0); } static void emit_pixel_xy(struct brw_wm_compile *c, struct prog_instruction *inst) { struct brw_reg r1 = brw_vec1_grf(1, 0); struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW); struct brw_reg dst0, dst1; struct brw_compile *p = &c->func; GLuint mask = inst->DstReg.WriteMask; dst0 = get_dst_reg(c, inst, 0, 1); dst1 = get_dst_reg(c, inst, 1, 1); /* Calculate pixel centers by adding 1 or 0 to each of the * micro-tile coordinates passed in r1. */ if (mask & WRITEMASK_X) { brw_ADD(p, vec8(retype(dst0, BRW_REGISTER_TYPE_UW)), stride(suboffset(r1_uw, 4), 2, 4, 0), brw_imm_v(0x10101010)); } if (mask & WRITEMASK_Y) { brw_ADD(p, vec8(retype(dst1, BRW_REGISTER_TYPE_UW)), stride(suboffset(r1_uw, 5), 2, 4, 0), brw_imm_v(0x11001100)); } } static void emit_delta_xy(struct brw_wm_compile *c, struct prog_instruction *inst) { struct brw_reg r1 = brw_vec1_grf(1, 0); struct brw_reg dst0, dst1, src0, src1; struct brw_compile *p = &c->func; GLuint mask = inst->DstReg.WriteMask; dst0 = get_dst_reg(c, inst, 0, 1); dst1 = get_dst_reg(c, inst, 1, 1); src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1); src1 = get_src_reg(c, &inst->SrcReg[0], 1, 1); /* Calc delta X,Y by subtracting origin in r1 from the pixel * centers. */ if (mask & WRITEMASK_X) { brw_ADD(p, dst0, retype(src0, BRW_REGISTER_TYPE_UW), negate(r1)); } if (mask & WRITEMASK_Y) { brw_ADD(p, dst1, retype(src1, BRW_REGISTER_TYPE_UW), negate(suboffset(r1,1))); } } static void fire_fb_write( struct brw_wm_compile *c, GLuint base_reg, GLuint nr, GLuint target, GLuint eot) { struct brw_compile *p = &c->func; /* Pass through control information: */ /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */ { brw_push_insn_state(p); brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */ brw_MOV(p, brw_message_reg(base_reg + 1), brw_vec8_grf(1, 0)); brw_pop_insn_state(p); } /* Send framebuffer write message: */ brw_fb_WRITE(p, retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW), base_reg, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW), target, nr, 0, eot); } static void emit_fb_write(struct brw_wm_compile *c, struct prog_instruction *inst) { struct brw_compile *p = &c->func; int nr = 2; int channel; GLuint target, eot; struct brw_reg src0; /* Reserve a space for AA - may not be needed: */ if (c->key.aa_dest_stencil_reg) nr += 1; { brw_push_insn_state(p); for (channel = 0; channel < 4; channel++) { src0 = get_src_reg(c, &inst->SrcReg[0], channel, 1); /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */ /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */ brw_MOV(p, brw_message_reg(nr + channel), src0); } /* skip over the regs populated above: */ nr += 8; brw_pop_insn_state(p); } if (c->key.source_depth_to_render_target) { if (c->key.computes_depth) { src0 = get_src_reg(c, &inst->SrcReg[2], 2, 1); brw_MOV(p, brw_message_reg(nr), src0); } else { src0 = get_src_reg(c, &inst->SrcReg[1], 1, 1); brw_MOV(p, brw_message_reg(nr), src0); } nr += 2; } target = inst->Sampler >> 1; eot = inst->Sampler & 1; fire_fb_write(c, 0, nr, target, eot); } static void emit_pixel_w( struct brw_wm_compile *c, struct prog_instruction *inst) { struct brw_compile *p = &c->func; GLuint mask = inst->DstReg.WriteMask; if (mask & WRITEMASK_W) { struct brw_reg dst, src0, delta0, delta1; struct brw_reg interp3; dst = get_dst_reg(c, inst, 3, 1); src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1); delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1); delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1); interp3 = brw_vec1_grf(src0.nr+1, 4); /* Calc 1/w - just linterp wpos[3] optimized by putting the * result straight into a message reg. */ brw_LINE(p, brw_null_reg(), interp3, delta0); brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), delta1); /* Calc w */ brw_math_16( p, dst, BRW_MATH_FUNCTION_INV, BRW_MATH_SATURATE_NONE, 2, brw_null_reg(), BRW_MATH_PRECISION_FULL); } } static void emit_linterp(struct brw_wm_compile *c, struct prog_instruction *inst) { struct brw_compile *p = &c->func; GLuint mask = inst->DstReg.WriteMask; struct brw_reg interp[4]; struct brw_reg dst, delta0, delta1; struct brw_reg src0; src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1); delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1); delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1); GLuint nr = src0.nr; int i; interp[0] = brw_vec1_grf(nr, 0); interp[1] = brw_vec1_grf(nr, 4); interp[2] = brw_vec1_grf(nr+1, 0); interp[3] = brw_vec1_grf(nr+1, 4); for(i = 0; i < 4; i++ ) { if (mask & (1<func; GLuint mask = inst->DstReg.WriteMask; struct brw_reg interp[4]; struct brw_reg dst, src0; src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1); GLuint nr = src0.nr; int i; interp[0] = brw_vec1_grf(nr, 0); interp[1] = brw_vec1_grf(nr, 4); interp[2] = brw_vec1_grf(nr+1, 0); interp[3] = brw_vec1_grf(nr+1, 4); for(i = 0; i < 4; i++ ) { if (mask & (1<func; GLuint mask = inst->DstReg.WriteMask; struct brw_reg interp[4]; struct brw_reg dst, delta0, delta1; struct brw_reg src0, w; src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1); delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1); delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1); w = get_src_reg(c, &inst->SrcReg[2], 3, 1); GLuint nr = src0.nr; int i; interp[0] = brw_vec1_grf(nr, 0); interp[1] = brw_vec1_grf(nr, 4); interp[2] = brw_vec1_grf(nr+1, 0); interp[3] = brw_vec1_grf(nr+1, 4); for(i = 0; i < 4; i++ ) { if (mask & (1<func; GLuint mask = inst->DstReg.WriteMask; for (i = 0; i < 4; i++) { GLuint i2 = (i+2)%3; GLuint i1 = (i+1)%3; if (mask & (1<SrcReg[0], i2, 1)); src1 = get_src_reg(c, &inst->SrcReg[1], i1, 1); brw_MUL(p, brw_null_reg(), src0, src1); src0 = get_src_reg(c, &inst->SrcReg[0], i1, 1); src1 = get_src_reg(c, &inst->SrcReg[1], i2, 1); brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF); brw_MAC(p, dst, src0, src1); brw_set_saturate(p, 0); } } brw_set_saturate(p, 0); } static void emit_dp3(struct brw_wm_compile *c, struct prog_instruction *inst) { struct brw_reg src0[3], src1[3], dst; int i; struct brw_compile *p = &c->func; for (i = 0; i < 3; i++) { src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1); src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1); } dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1); brw_MUL(p, brw_null_reg(), src0[0], src1[0]); brw_MAC(p, brw_null_reg(), src0[1], src1[1]); brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0); brw_MAC(p, dst, src0[2], src1[2]); brw_set_saturate(p, 0); } static void emit_dp4(struct brw_wm_compile *c, struct prog_instruction *inst) { struct brw_reg src0[4], src1[4], dst; int i; struct brw_compile *p = &c->func; for (i = 0; i < 4; i++) { src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1); src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1); } dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1); brw_MUL(p, brw_null_reg(), src0[0], src1[0]); brw_MAC(p, brw_null_reg(), src0[1], src1[1]); brw_MAC(p, brw_null_reg(), src0[2], src1[2]); brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0); brw_MAC(p, dst, src0[3], src1[3]); brw_set_saturate(p, 0); } static void emit_dph(struct brw_wm_compile *c, struct prog_instruction *inst) { struct brw_reg src0[4], src1[4], dst; int i; struct brw_compile *p = &c->func; for (i = 0; i < 4; i++) { src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1); src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1); } dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1); brw_MUL(p, brw_null_reg(), src0[0], src1[0]); brw_MAC(p, brw_null_reg(), src0[1], src1[1]); brw_MAC(p, dst, src0[2], src1[2]); brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0); brw_ADD(p, dst, dst, src1[3]); brw_set_saturate(p, 0); } static void emit_math1(struct brw_wm_compile *c, struct prog_instruction *inst, GLuint func) { struct brw_compile *p = &c->func; struct brw_reg src0, dst; src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1); dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1); brw_MOV(p, brw_message_reg(2), src0); brw_math(p, dst, func, (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE, 2, brw_null_reg(), BRW_MATH_DATA_VECTOR, BRW_MATH_PRECISION_FULL); } static void emit_rcp(struct brw_wm_compile *c, struct prog_instruction *inst) { emit_math1(c, inst, BRW_MATH_FUNCTION_INV); } static void emit_rsq(struct brw_wm_compile *c, struct prog_instruction *inst) { emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ); } static void emit_sin(struct brw_wm_compile *c, struct prog_instruction *inst) { emit_math1(c, inst, BRW_MATH_FUNCTION_SIN); } static void emit_cos(struct brw_wm_compile *c, struct prog_instruction *inst) { emit_math1(c, inst, BRW_MATH_FUNCTION_COS); } static void emit_ex2(struct brw_wm_compile *c, struct prog_instruction *inst) { emit_math1(c, inst, BRW_MATH_FUNCTION_EXP); } static void emit_lg2(struct brw_wm_compile *c, struct prog_instruction *inst) { emit_math1(c, inst, BRW_MATH_FUNCTION_LOG); } static void emit_add(struct brw_wm_compile *c, struct prog_instruction *inst) { struct brw_compile *p = &c->func; struct brw_reg src0, src1, dst; GLuint mask = inst->DstReg.WriteMask; int i; brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0); for (i = 0 ; i < 4; i++) { if (mask & (1<SrcReg[0], i, 1); src1 = get_src_reg(c, &inst->SrcReg[1], i, 1); brw_ADD(p, dst, src0, src1); } } brw_set_saturate(p, 0); } static void emit_sub(struct brw_wm_compile *c, struct prog_instruction *inst) { struct brw_compile *p = &c->func; struct brw_reg src0, src1, dst; GLuint mask = inst->DstReg.WriteMask; int i; brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0); for (i = 0 ; i < 4; i++) { if (mask & (1<SrcReg[0], i, 1); src1 = get_src_reg(c, &inst->SrcReg[1], i, 1); brw_ADD(p, dst, src0, negate(src1)); } } brw_set_saturate(p, 0); } static void emit_mul(struct brw_wm_compile *c, struct prog_instruction *inst) { struct brw_compile *p = &c->func; struct brw_reg src0, src1, dst; GLuint mask = inst->DstReg.WriteMask; int i; brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0); for (i = 0 ; i < 4; i++) { if (mask & (1<SrcReg[0], i, 1); src1 = get_src_reg(c, &inst->SrcReg[1], i, 1); brw_MUL(p, dst, src0, src1); } } brw_set_saturate(p, 0); } static void emit_frc(struct brw_wm_compile *c, struct prog_instruction *inst) { struct brw_compile *p = &c->func; struct brw_reg src0, dst; GLuint mask = inst->DstReg.WriteMask; int i; brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0); for (i = 0 ; i < 4; i++) { if (mask & (1<SrcReg[0], i, 1); brw_FRC(p, dst, src0); } } if (inst->SaturateMode != SATURATE_OFF) brw_set_saturate(p, 0); } static void emit_flr(struct brw_wm_compile *c, struct prog_instruction *inst) { struct brw_compile *p = &c->func; struct brw_reg src0, dst; GLuint mask = inst->DstReg.WriteMask; int i; brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0); for (i = 0 ; i < 4; i++) { if (mask & (1<SrcReg[0], i, 1); brw_RNDD(p, dst, src0); } } brw_set_saturate(p, 0); } static void emit_max(struct brw_wm_compile *c, struct prog_instruction *inst) { struct brw_compile *p = &c->func; GLuint mask = inst->DstReg.WriteMask; struct brw_reg src0, src1, dst; int i; brw_push_insn_state(p); for (i = 0; i < 4; i++) { if (mask & (1<SrcReg[0], i, 1); src1 = get_src_reg(c, &inst->SrcReg[1], i, 1); brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0); brw_MOV(p, dst, src0); brw_set_saturate(p, 0); brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src0, src1); brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0); brw_set_predicate_control(p, BRW_PREDICATE_NORMAL); brw_MOV(p, dst, src1); brw_set_saturate(p, 0); brw_set_predicate_control_flag_value(p, 0xff); } } brw_pop_insn_state(p); } static void emit_min(struct brw_wm_compile *c, struct prog_instruction *inst) { struct brw_compile *p = &c->func; GLuint mask = inst->DstReg.WriteMask; struct brw_reg src0, src1, dst; int i; brw_push_insn_state(p); for (i = 0; i < 4; i++) { if (mask & (1<SrcReg[0], i, 1); src1 = get_src_reg(c, &inst->SrcReg[1], i, 1); brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0); brw_MOV(p, dst, src0); brw_set_saturate(p, 0); brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0); brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0); brw_set_predicate_control(p, BRW_PREDICATE_NORMAL); brw_MOV(p, dst, src1); brw_set_saturate(p, 0); brw_set_predicate_control_flag_value(p, 0xff); } } brw_pop_insn_state(p); } static void emit_pow(struct brw_wm_compile *c, struct prog_instruction *inst) { struct brw_compile *p = &c->func; struct brw_reg dst, src0, src1; dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1); src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1); src1 = get_src_reg(c, &inst->SrcReg[1], 0, 1); brw_MOV(p, brw_message_reg(2), src0); brw_MOV(p, brw_message_reg(3), src1); brw_math(p, dst, BRW_MATH_FUNCTION_POW, (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE, 2, brw_null_reg(), BRW_MATH_DATA_VECTOR, BRW_MATH_PRECISION_FULL); } static void emit_lrp(struct brw_wm_compile *c, struct prog_instruction *inst) { struct brw_compile *p = &c->func; GLuint mask = inst->DstReg.WriteMask; struct brw_reg dst, tmp1, tmp2, src0, src1, src2; int i; int mark = mark_tmps(c); for (i = 0; i < 4; i++) { if (mask & (1<SrcReg[0], i, 1); src1 = get_src_reg(c, &inst->SrcReg[1], i, 1); if (src1.nr == dst.nr) { tmp1 = alloc_tmp(c); brw_MOV(p, tmp1, src1); } else tmp1 = src1; src2 = get_src_reg(c, &inst->SrcReg[2], i, 1); if (src2.nr == dst.nr) { tmp2 = alloc_tmp(c); brw_MOV(p, tmp2, src2); } else tmp2 = src2; brw_ADD(p, dst, negate(src0), brw_imm_f(1.0)); brw_MUL(p, brw_null_reg(), dst, tmp2); brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0); brw_MAC(p, dst, src0, tmp1); brw_set_saturate(p, 0); } release_tmps(c, mark); } } /** * For GLSL shaders, this KIL will be unconditional. * It may be contained inside an IF/ENDIF structure of course. */ static void emit_kil(struct brw_wm_compile *c) { struct brw_compile *p = &c->func; struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW); brw_push_insn_state(p); brw_set_mask_control(p, BRW_MASK_DISABLE); brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK brw_AND(p, depth, c->emit_mask_reg, depth); brw_pop_insn_state(p); } static void emit_mad(struct brw_wm_compile *c, struct prog_instruction *inst) { struct brw_compile *p = &c->func; GLuint mask = inst->DstReg.WriteMask; struct brw_reg dst, src0, src1, src2; int i; for (i = 0; i < 4; i++) { if (mask & (1<SrcReg[0], i, 1); src1 = get_src_reg(c, &inst->SrcReg[1], i, 1); src2 = get_src_reg(c, &inst->SrcReg[2], i, 1); brw_MUL(p, dst, src0, src1); brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0); brw_ADD(p, dst, dst, src2); brw_set_saturate(p, 0); } } } static void emit_sop(struct brw_wm_compile *c, struct prog_instruction *inst, GLuint cond) { struct brw_compile *p = &c->func; GLuint mask = inst->DstReg.WriteMask; struct brw_reg dst, src0, src1; int i; for (i = 0; i < 4; i++) { if (mask & (1<SrcReg[0], i, 1); src1 = get_src_reg(c, &inst->SrcReg[1], i, 1); brw_push_insn_state(p); brw_CMP(p, brw_null_reg(), cond, src0, src1); brw_set_predicate_control(p, BRW_PREDICATE_NONE); brw_MOV(p, dst, brw_imm_f(0.0)); brw_set_predicate_control(p, BRW_PREDICATE_NORMAL); brw_MOV(p, dst, brw_imm_f(1.0)); brw_pop_insn_state(p); } } } static void emit_slt(struct brw_wm_compile *c, struct prog_instruction *inst) { emit_sop(c, inst, BRW_CONDITIONAL_L); } static void emit_sle(struct brw_wm_compile *c, struct prog_instruction *inst) { emit_sop(c, inst, BRW_CONDITIONAL_LE); } static void emit_sgt(struct brw_wm_compile *c, struct prog_instruction *inst) { emit_sop(c, inst, BRW_CONDITIONAL_G); } static void emit_sge(struct brw_wm_compile *c, struct prog_instruction *inst) { emit_sop(c, inst, BRW_CONDITIONAL_GE); } static void emit_seq(struct brw_wm_compile *c, struct prog_instruction *inst) { emit_sop(c, inst, BRW_CONDITIONAL_EQ); } static void emit_sne(struct brw_wm_compile *c, struct prog_instruction *inst) { emit_sop(c, inst, BRW_CONDITIONAL_NEQ); } static void emit_ddx(struct brw_wm_compile *c, struct prog_instruction *inst) { struct brw_compile *p = &c->func; GLuint mask = inst->DstReg.WriteMask; struct brw_reg interp[4]; struct brw_reg dst; struct brw_reg src0, w; GLuint nr, i; src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1); w = get_src_reg(c, &inst->SrcReg[1], 3, 1); nr = src0.nr; interp[0] = brw_vec1_grf(nr, 0); interp[1] = brw_vec1_grf(nr, 4); interp[2] = brw_vec1_grf(nr+1, 0); interp[3] = brw_vec1_grf(nr+1, 4); brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF); for(i = 0; i < 4; i++ ) { if (mask & (1<func; GLuint mask = inst->DstReg.WriteMask; struct brw_reg interp[4]; struct brw_reg dst; struct brw_reg src0, w; GLuint nr, i; src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1); nr = src0.nr; w = get_src_reg(c, &inst->SrcReg[1], 3, 1); interp[0] = brw_vec1_grf(nr, 0); interp[1] = brw_vec1_grf(nr, 4); interp[2] = brw_vec1_grf(nr+1, 0); interp[3] = brw_vec1_grf(nr+1, 4); brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF); for(i = 0; i < 4; i++ ) { if (mask & (1<func; struct brw_reg param, x0, x1, /* gradients at each end */ t, tmp[ 2 ], /* float temporaries */ itmp[ 5 ]; /* unsigned integer temporaries (aliases of floats above) */ int i; int mark = mark_tmps( c ); x0 = alloc_tmp( c ); x1 = alloc_tmp( c ); t = alloc_tmp( c ); tmp[ 0 ] = alloc_tmp( c ); tmp[ 1 ] = alloc_tmp( c ); itmp[ 0 ] = retype( tmp[ 0 ], BRW_REGISTER_TYPE_UD ); itmp[ 1 ] = retype( tmp[ 1 ], BRW_REGISTER_TYPE_UD ); itmp[ 2 ] = retype( x0, BRW_REGISTER_TYPE_UD ); itmp[ 3 ] = retype( x1, BRW_REGISTER_TYPE_UD ); itmp[ 4 ] = retype( t, BRW_REGISTER_TYPE_UD ); param = lookup_tmp( c, mark - 2 ); brw_set_access_mode( p, BRW_ALIGN_1 ); brw_MOV( p, itmp[ 2 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */ /* Arrange the two end coordinates into scalars (itmp0/itmp1) to be hashed. Also compute the remainder (offset within the unit length), interleaved to reduce register dependency penalties. */ brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param ); brw_FRC( p, param, param ); brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 1 ) ); brw_MOV( p, itmp[ 3 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */ brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */ /* We're now ready to perform the hashing. The two hashes are interleaved for performance. The hash function used is designed to rapidly achieve avalanche and require only 32x16 bit multiplication, and 16-bit swizzles (which we get for free). We can't use immediate operands in the multiplies, because immediates are permitted only in src1 and the 16-bit factor is permitted only in src0. */ for( i = 0; i < 2; i++ ) brw_MUL( p, itmp[ i ], itmp[ 2 ], itmp[ i ] ); for( i = 0; i < 2; i++ ) brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ), high_words( itmp[ i ] ) ); for( i = 0; i < 2; i++ ) brw_MUL( p, itmp[ i ], itmp[ 3 ], itmp[ i ] ); for( i = 0; i < 2; i++ ) brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ), high_words( itmp[ i ] ) ); for( i = 0; i < 2; i++ ) brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] ); for( i = 0; i < 2; i++ ) brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ), high_words( itmp[ i ] ) ); /* Now we want to initialise the two gradients based on the hashes. Format conversion from signed integer to float leaves everything scaled too high by a factor of pow( 2, 31 ), but we correct for that right at the end. */ brw_ADD( p, t, param, brw_imm_f( -1.0 ) ); brw_MOV( p, x0, retype( tmp[ 0 ], BRW_REGISTER_TYPE_D ) ); brw_MOV( p, x1, retype( tmp[ 1 ], BRW_REGISTER_TYPE_D ) ); brw_MUL( p, x0, x0, param ); brw_MUL( p, x1, x1, t ); /* We interpolate between the gradients using the polynomial 6t^5 - 15t^4 + 10t^3 (Perlin). */ brw_MUL( p, tmp[ 0 ], param, brw_imm_f( 6.0 ) ); brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) ); brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param ); brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) ); brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param ); brw_ADD( p, x1, x1, negate( x0 ) ); /* unrelated work to fill the pipeline */ brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param ); brw_MUL( p, param, tmp[ 0 ], param ); brw_MUL( p, x1, x1, param ); brw_ADD( p, x0, x0, x1 ); /* scale by pow( 2, -30 ), to compensate for the format conversion above and an extra factor of 2 so that a single gradient covers the [-1,1] range */ brw_MUL( p, param, x0, brw_imm_f( 0.000000000931322574615478515625 ) ); release_tmps( c, mark ); } static void emit_noise1( struct brw_wm_compile *c, struct prog_instruction *inst ) { struct brw_compile *p = &c->func; struct brw_reg src, param, dst; GLuint mask = inst->DstReg.WriteMask; int i; int mark = mark_tmps( c ); assert( mark == 0 ); src = get_src_reg( c, inst->SrcReg, 0, 1 ); param = alloc_tmp( c ); brw_MOV( p, param, src ); invoke_subroutine( c, SUB_NOISE1, noise1_sub ); /* Fill in the result: */ brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE ); for (i = 0 ; i < 4; i++) { if (mask & (1<SaturateMode == SATURATE_ZERO_ONE ) brw_set_saturate( p, 0 ); release_tmps( c, mark ); } static void noise2_sub( struct brw_wm_compile *c ) { struct brw_compile *p = &c->func; struct brw_reg param0, param1, x0y0, x0y1, x1y0, x1y1, /* gradients at each corner */ t, tmp[ 4 ], /* float temporaries */ itmp[ 7 ]; /* unsigned integer temporaries (aliases of floats above) */ int i; int mark = mark_tmps( c ); x0y0 = alloc_tmp( c ); x0y1 = alloc_tmp( c ); x1y0 = alloc_tmp( c ); x1y1 = alloc_tmp( c ); t = alloc_tmp( c ); for( i = 0; i < 4; i++ ) { tmp[ i ] = alloc_tmp( c ); itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD ); } itmp[ 4 ] = retype( x0y0, BRW_REGISTER_TYPE_UD ); itmp[ 5 ] = retype( x0y1, BRW_REGISTER_TYPE_UD ); itmp[ 6 ] = retype( x1y0, BRW_REGISTER_TYPE_UD ); param0 = lookup_tmp( c, mark - 3 ); param1 = lookup_tmp( c, mark - 2 ); brw_set_access_mode( p, BRW_ALIGN_1 ); /* Arrange the four corner coordinates into scalars (itmp0..itmp3) to be hashed. Also compute the remainders (offsets within the unit square), interleaved to reduce register dependency penalties. */ brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 ); brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 ); brw_FRC( p, param0, param0 ); brw_FRC( p, param1, param1 ); brw_MOV( p, itmp[ 4 ], brw_imm_ud( 0xBA97 ) ); /* constant used later */ brw_ADD( p, high_words( itmp[ 0 ] ), high_words( itmp[ 0 ] ), low_words( itmp[ 1 ] ) ); brw_MOV( p, itmp[ 5 ], brw_imm_ud( 0x79D9 ) ); /* constant used later */ brw_MOV( p, itmp[ 6 ], brw_imm_ud( 0xD5B1 ) ); /* constant used later */ brw_ADD( p, itmp[ 1 ], itmp[ 0 ], brw_imm_ud( 0x10000 ) ); brw_ADD( p, itmp[ 2 ], itmp[ 0 ], brw_imm_ud( 0x1 ) ); brw_ADD( p, itmp[ 3 ], itmp[ 0 ], brw_imm_ud( 0x10001 ) ); /* We're now ready to perform the hashing. The four hashes are interleaved for performance. The hash function used is designed to rapidly achieve avalanche and require only 32x16 bit multiplication, and 16-bit swizzles (which we get for free). We can't use immediate operands in the multiplies, because immediates are permitted only in src1 and the 16-bit factor is permitted only in src0. */ for( i = 0; i < 4; i++ ) brw_MUL( p, itmp[ i ], itmp[ 4 ], itmp[ i ] ); for( i = 0; i < 4; i++ ) brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ), high_words( itmp[ i ] ) ); for( i = 0; i < 4; i++ ) brw_MUL( p, itmp[ i ], itmp[ 5 ], itmp[ i ] ); for( i = 0; i < 4; i++ ) brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ), high_words( itmp[ i ] ) ); for( i = 0; i < 4; i++ ) brw_MUL( p, itmp[ i ], itmp[ 6 ], itmp[ i ] ); for( i = 0; i < 4; i++ ) brw_XOR( p, low_words( itmp[ i ] ), low_words( itmp[ i ] ), high_words( itmp[ i ] ) ); /* Now we want to initialise the four gradients based on the hashes. Format conversion from signed integer to float leaves everything scaled too high by a factor of pow( 2, 15 ), but we correct for that right at the end. */ brw_ADD( p, t, param0, brw_imm_f( -1.0 ) ); brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) ); brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) ); brw_MOV( p, x1y0, low_words( tmp[ 2 ] ) ); brw_MOV( p, x1y1, low_words( tmp[ 3 ] ) ); brw_MOV( p, tmp[ 0 ], high_words( tmp[ 0 ] ) ); brw_MOV( p, tmp[ 1 ], high_words( tmp[ 1 ] ) ); brw_MOV( p, tmp[ 2 ], high_words( tmp[ 2 ] ) ); brw_MOV( p, tmp[ 3 ], high_words( tmp[ 3 ] ) ); brw_MUL( p, x1y0, x1y0, t ); brw_MUL( p, x1y1, x1y1, t ); brw_ADD( p, t, param1, brw_imm_f( -1.0 ) ); brw_MUL( p, x0y0, x0y0, param0 ); brw_MUL( p, x0y1, x0y1, param0 ); brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param1 ); brw_MUL( p, tmp[ 2 ], tmp[ 2 ], param1 ); brw_MUL( p, tmp[ 1 ], tmp[ 1 ], t ); brw_MUL( p, tmp[ 3 ], tmp[ 3 ], t ); brw_ADD( p, x0y0, x0y0, tmp[ 0 ] ); brw_ADD( p, x1y0, x1y0, tmp[ 2 ] ); brw_ADD( p, x0y1, x0y1, tmp[ 1 ] ); brw_ADD( p, x1y1, x1y1, tmp[ 3 ] ); /* We interpolate between the gradients using the polynomial 6t^5 - 15t^4 + 10t^3 (Perlin). */ brw_MUL( p, tmp[ 0 ], param0, brw_imm_f( 6.0 ) ); brw_MUL( p, tmp[ 1 ], param1, brw_imm_f( 6.0 ) ); brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( -15.0 ) ); brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( -15.0 ) ); brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 ); brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 ); brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work to fill the pipeline */ brw_ADD( p, tmp[ 0 ], tmp[ 0 ], brw_imm_f( 10.0 ) ); brw_ADD( p, tmp[ 1 ], tmp[ 1 ], brw_imm_f( 10.0 ) ); brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 ); brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 ); brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work to fill the pipeline */ brw_MUL( p, tmp[ 0 ], tmp[ 0 ], param0 ); brw_MUL( p, tmp[ 1 ], tmp[ 1 ], param1 ); brw_MUL( p, param0, tmp[ 0 ], param0 ); brw_MUL( p, param1, tmp[ 1 ], param1 ); /* Here we interpolate in the y dimension... */ brw_MUL( p, x0y1, x0y1, param1 ); brw_MUL( p, x1y1, x1y1, param1 ); brw_ADD( p, x0y0, x0y0, x0y1 ); brw_ADD( p, x1y0, x1y0, x1y1 ); /* And now in x. There are horrible register dependencies here, but we have nothing else to do. */ brw_ADD( p, x1y0, x1y0, negate( x0y0 ) ); brw_MUL( p, x1y0, x1y0, param0 ); brw_ADD( p, x0y0, x0y0, x1y0 ); /* scale by pow( 2, -15 ), as described above */ brw_MUL( p, param0, x0y0, brw_imm_f( 0.000030517578125 ) ); release_tmps( c, mark ); } static void emit_noise2( struct brw_wm_compile *c, struct prog_instruction *inst ) { struct brw_compile *p = &c->func; struct brw_reg src0, src1, param0, param1, dst; GLuint mask = inst->DstReg.WriteMask; int i; int mark = mark_tmps( c ); assert( mark == 0 ); src0 = get_src_reg( c, inst->SrcReg, 0, 1 ); src1 = get_src_reg( c, inst->SrcReg, 1, 1 ); param0 = alloc_tmp( c ); param1 = alloc_tmp( c ); brw_MOV( p, param0, src0 ); brw_MOV( p, param1, src1 ); invoke_subroutine( c, SUB_NOISE2, noise2_sub ); /* Fill in the result: */ brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE ); for (i = 0 ; i < 4; i++) { if (mask & (1<SaturateMode == SATURATE_ZERO_ONE ) brw_set_saturate( p, 0 ); release_tmps( c, mark ); } /* The three-dimensional case is much like the one- and two- versions above, but since the number of corners is rapidly growing we now pack 16 16-bit hashes into each register to extract more parallelism from the EUs. */ static void noise3_sub( struct brw_wm_compile *c ) { struct brw_compile *p = &c->func; struct brw_reg param0, param1, param2, x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */ xi, yi, zi, /* interpolation coefficients */ t, tmp[ 8 ], /* float temporaries */ itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */ wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */ int i; int mark = mark_tmps( c ); x0y0 = alloc_tmp( c ); x0y1 = alloc_tmp( c ); x1y0 = alloc_tmp( c ); x1y1 = alloc_tmp( c ); xi = alloc_tmp( c ); yi = alloc_tmp( c ); zi = alloc_tmp( c ); t = alloc_tmp( c ); for( i = 0; i < 8; i++ ) { tmp[ i ] = alloc_tmp( c ); itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD ); wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 ); } param0 = lookup_tmp( c, mark - 4 ); param1 = lookup_tmp( c, mark - 3 ); param2 = lookup_tmp( c, mark - 2 ); brw_set_access_mode( p, BRW_ALIGN_1 ); /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to be hashed. Also compute the remainders (offsets within the unit cube), interleaved to reduce register dependency penalties. */ brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param0 ); brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param1 ); brw_RNDD( p, retype( itmp[ 2 ], BRW_REGISTER_TYPE_D ), param2 ); brw_FRC( p, param0, param0 ); brw_FRC( p, param1, param1 ); brw_FRC( p, param2, param2 ); /* Since we now have only 16 bits of precision in the hash, we must be more careful about thorough mixing to maintain entropy as we squash the input vector into a small scalar. */ brw_MUL( p, brw_null_reg(), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) ); brw_MAC( p, brw_null_reg(), low_words( itmp[ 1 ] ), brw_imm_uw( 0xD0BD ) ); brw_MAC( p, low_words( itmp[ 0 ] ), low_words( itmp[ 2 ] ), brw_imm_uw( 0x9B93 ) ); brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) ); /* Temporarily disable the execution mask while we work with ExecSize=16 channels (the mask is set for ExecSize=8 and is probably incorrect). Although this might cause execution of unwanted channels, the code writes only to temporary registers and has no side effects, so disabling the mask is harmless. */ brw_push_insn_state( p ); brw_set_mask_control( p, BRW_MASK_DISABLE ); brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) ); brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) ); brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) ); /* We're now ready to perform the hashing. The eight hashes are interleaved for performance. The hash function used is designed to rapidly achieve avalanche and require only 16x16 bit multiplication, and 8-bit swizzles (which we get for free). */ for( i = 0; i < 4; i++ ) brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) ); for( i = 0; i < 4; i++ ) brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ), odd_bytes( wtmp[ i ] ) ); for( i = 0; i < 4; i++ ) brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) ); for( i = 0; i < 4; i++ ) brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ), odd_bytes( wtmp[ i ] ) ); brw_pop_insn_state( p ); /* Now we want to initialise the four rear gradients based on the hashes. Format conversion from signed integer to float leaves everything scaled too high by a factor of pow( 2, 15 ), but we correct for that right at the end. */ /* x component */ brw_ADD( p, t, param0, brw_imm_f( -1.0 ) ); brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) ); brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) ); brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) ); brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) ); brw_push_insn_state( p ); brw_set_mask_control( p, BRW_MASK_DISABLE ); brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) ); brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) ); brw_pop_insn_state( p ); brw_MUL( p, x1y0, x1y0, t ); brw_MUL( p, x1y1, x1y1, t ); brw_ADD( p, t, param1, brw_imm_f( -1.0 ) ); brw_MUL( p, x0y0, x0y0, param0 ); brw_MUL( p, x0y1, x0y1, param0 ); /* y component */ brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) ); brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) ); brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) ); brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) ); brw_push_insn_state( p ); brw_set_mask_control( p, BRW_MASK_DISABLE ); brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 5 ) ); brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 5 ) ); brw_pop_insn_state( p ); brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t ); brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t ); brw_ADD( p, t, param0, brw_imm_f( -1.0 ) ); brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 ); brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 ); brw_ADD( p, x0y1, x0y1, tmp[ 5 ] ); brw_ADD( p, x1y1, x1y1, tmp[ 7 ] ); brw_ADD( p, x0y0, x0y0, tmp[ 4 ] ); brw_ADD( p, x1y0, x1y0, tmp[ 6 ] ); /* z component */ brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) ); brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) ); brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) ); brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) ); brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param2 ); brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param2 ); brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param2 ); brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param2 ); brw_ADD( p, x0y0, x0y0, tmp[ 4 ] ); brw_ADD( p, x0y1, x0y1, tmp[ 5 ] ); brw_ADD( p, x1y0, x1y0, tmp[ 6 ] ); brw_ADD( p, x1y1, x1y1, tmp[ 7 ] ); /* We interpolate between the gradients using the polynomial 6t^5 - 15t^4 + 10t^3 (Perlin). */ brw_MUL( p, xi, param0, brw_imm_f( 6.0 ) ); brw_MUL( p, yi, param1, brw_imm_f( 6.0 ) ); brw_MUL( p, zi, param2, brw_imm_f( 6.0 ) ); brw_ADD( p, xi, xi, brw_imm_f( -15.0 ) ); brw_ADD( p, yi, yi, brw_imm_f( -15.0 ) ); brw_ADD( p, zi, zi, brw_imm_f( -15.0 ) ); brw_MUL( p, xi, xi, param0 ); brw_MUL( p, yi, yi, param1 ); brw_MUL( p, zi, zi, param2 ); brw_ADD( p, xi, xi, brw_imm_f( 10.0 ) ); brw_ADD( p, yi, yi, brw_imm_f( 10.0 ) ); brw_ADD( p, zi, zi, brw_imm_f( 10.0 ) ); brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); /* unrelated work */ brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); /* unrelated work */ brw_MUL( p, xi, xi, param0 ); brw_MUL( p, yi, yi, param1 ); brw_MUL( p, zi, zi, param2 ); brw_MUL( p, xi, xi, param0 ); brw_MUL( p, yi, yi, param1 ); brw_MUL( p, zi, zi, param2 ); brw_MUL( p, xi, xi, param0 ); brw_MUL( p, yi, yi, param1 ); brw_MUL( p, zi, zi, param2 ); /* Here we interpolate in the y dimension... */ brw_MUL( p, x0y1, x0y1, yi ); brw_MUL( p, x1y1, x1y1, yi ); brw_ADD( p, x0y0, x0y0, x0y1 ); brw_ADD( p, x1y0, x1y0, x1y1 ); /* And now in x. Leave the result in tmp[ 0 ] (see below)... */ brw_ADD( p, x1y0, x1y0, negate( x0y0 ) ); brw_MUL( p, x1y0, x1y0, xi ); brw_ADD( p, tmp[ 0 ], x0y0, x1y0 ); /* Now do the same thing for the front four gradients... */ /* x component */ brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) ); brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) ); brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) ); brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) ); brw_push_insn_state( p ); brw_set_mask_control( p, BRW_MASK_DISABLE ); brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) ); brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) ); brw_pop_insn_state( p ); brw_MUL( p, x1y0, x1y0, t ); brw_MUL( p, x1y1, x1y1, t ); brw_ADD( p, t, param1, brw_imm_f( -1.0 ) ); brw_MUL( p, x0y0, x0y0, param0 ); brw_MUL( p, x0y1, x0y1, param0 ); /* y component */ brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) ); brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) ); brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) ); brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) ); brw_push_insn_state( p ); brw_set_mask_control( p, BRW_MASK_DISABLE ); brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 5 ) ); brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 5 ) ); brw_pop_insn_state( p ); brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t ); brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t ); brw_ADD( p, t, param2, brw_imm_f( -1.0 ) ); brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param1 ); brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param1 ); brw_ADD( p, x0y1, x0y1, tmp[ 5 ] ); brw_ADD( p, x1y1, x1y1, tmp[ 7 ] ); brw_ADD( p, x0y0, x0y0, tmp[ 4 ] ); brw_ADD( p, x1y0, x1y0, tmp[ 6 ] ); /* z component */ brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) ); brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) ); brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) ); brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) ); brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t ); brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t ); brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t ); brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t ); brw_ADD( p, x0y0, x0y0, tmp[ 4 ] ); brw_ADD( p, x0y1, x0y1, tmp[ 5 ] ); brw_ADD( p, x1y0, x1y0, tmp[ 6 ] ); brw_ADD( p, x1y1, x1y1, tmp[ 7 ] ); /* The interpolation coefficients are still around from last time, so again interpolate in the y dimension... */ brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); brw_MUL( p, x0y1, x0y1, yi ); brw_MUL( p, x1y1, x1y1, yi ); brw_ADD( p, x0y0, x0y0, x0y1 ); brw_ADD( p, x1y0, x1y0, x1y1 ); /* And now in x. The rear face is in tmp[ 0 ] (see above), so this time put the front face in tmp[ 1 ] and we're nearly there... */ brw_ADD( p, x1y0, x1y0, negate( x0y0 ) ); brw_MUL( p, x1y0, x1y0, xi ); brw_ADD( p, tmp[ 1 ], x0y0, x1y0 ); /* The final interpolation, in the z dimension: */ brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) ); brw_MUL( p, tmp[ 1 ], tmp[ 1 ], zi ); brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] ); /* scale by pow( 2, -15 ), as described above */ brw_MUL( p, param0, tmp[ 0 ], brw_imm_f( 0.000030517578125 ) ); release_tmps( c, mark ); } static void emit_noise3( struct brw_wm_compile *c, struct prog_instruction *inst ) { struct brw_compile *p = &c->func; struct brw_reg src0, src1, src2, param0, param1, param2, dst; GLuint mask = inst->DstReg.WriteMask; int i; int mark = mark_tmps( c ); assert( mark == 0 ); src0 = get_src_reg( c, inst->SrcReg, 0, 1 ); src1 = get_src_reg( c, inst->SrcReg, 1, 1 ); src2 = get_src_reg( c, inst->SrcReg, 2, 1 ); param0 = alloc_tmp( c ); param1 = alloc_tmp( c ); param2 = alloc_tmp( c ); brw_MOV( p, param0, src0 ); brw_MOV( p, param1, src1 ); brw_MOV( p, param2, src2 ); invoke_subroutine( c, SUB_NOISE3, noise3_sub ); /* Fill in the result: */ brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE ); for (i = 0 ; i < 4; i++) { if (mask & (1<SaturateMode == SATURATE_ZERO_ONE ) brw_set_saturate( p, 0 ); release_tmps( c, mark ); } /* For the four-dimensional case, the little micro-optimisation benefits we obtain by unrolling all the loops aren't worth the massive bloat it now causes. Instead, we loop twice around performing a similar operation to noise3, once for the w=0 cube and once for the w=1, with a bit more code to glue it all together. */ static void noise4_sub( struct brw_wm_compile *c ) { struct brw_compile *p = &c->func; struct brw_reg param[ 4 ], x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */ w0, /* noise for the w=0 cube */ floors[ 2 ], /* integer coordinates of base corner of hypercube */ interp[ 4 ], /* interpolation coefficients */ t, tmp[ 8 ], /* float temporaries */ itmp[ 8 ], /* unsigned integer temporaries (aliases of floats above) */ wtmp[ 8 ]; /* 16-way unsigned word temporaries (aliases of above) */ int i, j; int mark = mark_tmps( c ); GLuint loop, origin; x0y0 = alloc_tmp( c ); x0y1 = alloc_tmp( c ); x1y0 = alloc_tmp( c ); x1y1 = alloc_tmp( c ); t = alloc_tmp( c ); w0 = alloc_tmp( c ); floors[ 0 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD ); floors[ 1 ] = retype( alloc_tmp( c ), BRW_REGISTER_TYPE_UD ); for( i = 0; i < 4; i++ ) { param[ i ] = lookup_tmp( c, mark - 5 + i ); interp[ i ] = alloc_tmp( c ); } for( i = 0; i < 8; i++ ) { tmp[ i ] = alloc_tmp( c ); itmp[ i ] = retype( tmp[ i ], BRW_REGISTER_TYPE_UD ); wtmp[ i ] = brw_uw16_grf( tmp[ i ].nr, 0 ); } brw_set_access_mode( p, BRW_ALIGN_1 ); /* We only want 16 bits of precision from the integral part of each co-ordinate, but unfortunately the RNDD semantics would saturate at 16 bits if we performed the operation directly to a 16-bit destination. Therefore, we round to 32-bit temporaries where appropriate, and then store only the lower 16 bits. */ brw_RNDD( p, retype( floors[ 0 ], BRW_REGISTER_TYPE_D ), param[ 0 ] ); brw_RNDD( p, retype( itmp[ 0 ], BRW_REGISTER_TYPE_D ), param[ 1 ] ); brw_RNDD( p, retype( floors[ 1 ], BRW_REGISTER_TYPE_D ), param[ 2 ] ); brw_RNDD( p, retype( itmp[ 1 ], BRW_REGISTER_TYPE_D ), param[ 3 ] ); brw_MOV( p, high_words( floors[ 0 ] ), low_words( itmp[ 0 ] ) ); brw_MOV( p, high_words( floors[ 1 ] ), low_words( itmp[ 1 ] ) ); /* Modify the flag register here, because the side effect is useful later (see below). We know for certain that all flags will be cleared, since the FRC instruction cannot possibly generate negative results. Even for exceptional inputs (infinities, denormals, NaNs), the architecture guarantees that the L conditional is false. */ brw_set_conditionalmod( p, BRW_CONDITIONAL_L ); brw_FRC( p, param[ 0 ], param[ 0 ] ); brw_set_predicate_control( p, BRW_PREDICATE_NONE ); for( i = 1; i < 4; i++ ) brw_FRC( p, param[ i ], param[ i ] ); /* Calculate the interpolation coefficients (6t^5 - 15t^4 + 10t^3) first of all. */ for( i = 0; i < 4; i++ ) brw_MUL( p, interp[ i ], param[ i ], brw_imm_f( 6.0 ) ); for( i = 0; i < 4; i++ ) brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( -15.0 ) ); for( i = 0; i < 4; i++ ) brw_MUL( p, interp[ i ], interp[ i ], param[ i ] ); for( i = 0; i < 4; i++ ) brw_ADD( p, interp[ i ], interp[ i ], brw_imm_f( 10.0 ) ); for( j = 0; j < 3; j++ ) for( i = 0; i < 4; i++ ) brw_MUL( p, interp[ i ], interp[ i ], param[ i ] ); /* Mark the current address, as it will be a jump destination. The following code will be executed twice: first, with the flag register clear indicating the w=0 case, and second with flags set for w=1. */ loop = p->nr_insn; /* Arrange the eight corner coordinates into scalars (itmp0..itmp3) to be hashed. Since we have only 16 bits of precision in the hash, we must be careful about thorough mixing to maintain entropy as we squash the input vector into a small scalar. */ brw_MUL( p, brw_null_reg(), low_words( floors[ 0 ] ), brw_imm_uw( 0xBC8F ) ); brw_MAC( p, brw_null_reg(), high_words( floors[ 0 ] ), brw_imm_uw( 0xD0BD ) ); brw_MAC( p, brw_null_reg(), low_words( floors[ 1 ] ), brw_imm_uw( 0x9B93 ) ); brw_MAC( p, low_words( itmp[ 0 ] ), high_words( floors[ 1 ] ), brw_imm_uw( 0xA359 ) ); brw_ADD( p, high_words( itmp[ 0 ] ), low_words( itmp[ 0 ] ), brw_imm_uw( 0xBC8F ) ); /* Temporarily disable the execution mask while we work with ExecSize=16 channels (the mask is set for ExecSize=8 and is probably incorrect). Although this might cause execution of unwanted channels, the code writes only to temporary registers and has no side effects, so disabling the mask is harmless. */ brw_push_insn_state( p ); brw_set_mask_control( p, BRW_MASK_DISABLE ); brw_ADD( p, wtmp[ 1 ], wtmp[ 0 ], brw_imm_uw( 0xD0BD ) ); brw_ADD( p, wtmp[ 2 ], wtmp[ 0 ], brw_imm_uw( 0x9B93 ) ); brw_ADD( p, wtmp[ 3 ], wtmp[ 1 ], brw_imm_uw( 0x9B93 ) ); /* We're now ready to perform the hashing. The eight hashes are interleaved for performance. The hash function used is designed to rapidly achieve avalanche and require only 16x16 bit multiplication, and 8-bit swizzles (which we get for free). */ for( i = 0; i < 4; i++ ) brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0x28D9 ) ); for( i = 0; i < 4; i++ ) brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ), odd_bytes( wtmp[ i ] ) ); for( i = 0; i < 4; i++ ) brw_MUL( p, wtmp[ i ], wtmp[ i ], brw_imm_uw( 0xC6D5 ) ); for( i = 0; i < 4; i++ ) brw_XOR( p, even_bytes( wtmp[ i ] ), even_bytes( wtmp[ i ] ), odd_bytes( wtmp[ i ] ) ); brw_pop_insn_state( p ); /* Now we want to initialise the four rear gradients based on the hashes. Format conversion from signed integer to float leaves everything scaled too high by a factor of pow( 2, 15 ), but we correct for that right at the end. */ /* x component */ brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) ); brw_MOV( p, x0y0, low_words( tmp[ 0 ] ) ); brw_MOV( p, x0y1, low_words( tmp[ 1 ] ) ); brw_MOV( p, x1y0, high_words( tmp[ 0 ] ) ); brw_MOV( p, x1y1, high_words( tmp[ 1 ] ) ); brw_push_insn_state( p ); brw_set_mask_control( p, BRW_MASK_DISABLE ); brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) ); brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) ); brw_pop_insn_state( p ); brw_MUL( p, x1y0, x1y0, t ); brw_MUL( p, x1y1, x1y1, t ); brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) ); brw_MUL( p, x0y0, x0y0, param[ 0 ] ); brw_MUL( p, x0y1, x0y1, param[ 0 ] ); /* y component */ brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) ); brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) ); brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) ); brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) ); brw_push_insn_state( p ); brw_set_mask_control( p, BRW_MASK_DISABLE ); brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) ); brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) ); brw_pop_insn_state( p ); brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t ); brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t ); /* prepare t for the w component (used below): w the first time through the loop; w - 1 the second time) */ brw_set_predicate_control( p, BRW_PREDICATE_NORMAL ); brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) ); p->current->header.predicate_inverse = 1; brw_MOV( p, t, param[ 3 ] ); p->current->header.predicate_inverse = 0; brw_set_predicate_control( p, BRW_PREDICATE_NONE ); brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] ); brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] ); brw_ADD( p, x0y1, x0y1, tmp[ 5 ] ); brw_ADD( p, x1y1, x1y1, tmp[ 7 ] ); brw_ADD( p, x0y0, x0y0, tmp[ 4 ] ); brw_ADD( p, x1y0, x1y0, tmp[ 6 ] ); /* z component */ brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) ); brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) ); brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) ); brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) ); brw_push_insn_state( p ); brw_set_mask_control( p, BRW_MASK_DISABLE ); brw_SHL( p, wtmp[ 0 ], wtmp[ 0 ], brw_imm_uw( 4 ) ); brw_SHL( p, wtmp[ 1 ], wtmp[ 1 ], brw_imm_uw( 4 ) ); brw_pop_insn_state( p ); brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 2 ] ); brw_MUL( p, tmp[ 5 ], tmp[ 5 ], param[ 2 ] ); brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 2 ] ); brw_MUL( p, tmp[ 7 ], tmp[ 7 ], param[ 2 ] ); brw_ADD( p, x0y0, x0y0, tmp[ 4 ] ); brw_ADD( p, x0y1, x0y1, tmp[ 5 ] ); brw_ADD( p, x1y0, x1y0, tmp[ 6 ] ); brw_ADD( p, x1y1, x1y1, tmp[ 7 ] ); /* w component */ brw_MOV( p, tmp[ 4 ], low_words( tmp[ 0 ] ) ); brw_MOV( p, tmp[ 5 ], low_words( tmp[ 1 ] ) ); brw_MOV( p, tmp[ 6 ], high_words( tmp[ 0 ] ) ); brw_MOV( p, tmp[ 7 ], high_words( tmp[ 1 ] ) ); brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t ); brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t ); brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t ); brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t ); brw_ADD( p, t, param[ 0 ], brw_imm_f( -1.0 ) ); brw_ADD( p, x0y0, x0y0, tmp[ 4 ] ); brw_ADD( p, x0y1, x0y1, tmp[ 5 ] ); brw_ADD( p, x1y0, x1y0, tmp[ 6 ] ); brw_ADD( p, x1y1, x1y1, tmp[ 7 ] ); /* Here we interpolate in the y dimension... */ brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); brw_MUL( p, x0y1, x0y1, interp[ 1 ] ); brw_MUL( p, x1y1, x1y1, interp[ 1 ] ); brw_ADD( p, x0y0, x0y0, x0y1 ); brw_ADD( p, x1y0, x1y0, x1y1 ); /* And now in x. Leave the result in tmp[ 0 ] (see below)... */ brw_ADD( p, x1y0, x1y0, negate( x0y0 ) ); brw_MUL( p, x1y0, x1y0, interp[ 0 ] ); brw_ADD( p, tmp[ 0 ], x0y0, x1y0 ); /* Now do the same thing for the front four gradients... */ /* x component */ brw_MOV( p, x0y0, low_words( tmp[ 2 ] ) ); brw_MOV( p, x0y1, low_words( tmp[ 3 ] ) ); brw_MOV( p, x1y0, high_words( tmp[ 2 ] ) ); brw_MOV( p, x1y1, high_words( tmp[ 3 ] ) ); brw_push_insn_state( p ); brw_set_mask_control( p, BRW_MASK_DISABLE ); brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) ); brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) ); brw_pop_insn_state( p ); brw_MUL( p, x1y0, x1y0, t ); brw_MUL( p, x1y1, x1y1, t ); brw_ADD( p, t, param[ 1 ], brw_imm_f( -1.0 ) ); brw_MUL( p, x0y0, x0y0, param[ 0 ] ); brw_MUL( p, x0y1, x0y1, param[ 0 ] ); /* y component */ brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) ); brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) ); brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) ); brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) ); brw_push_insn_state( p ); brw_set_mask_control( p, BRW_MASK_DISABLE ); brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) ); brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) ); brw_pop_insn_state( p ); brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t ); brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t ); brw_ADD( p, t, param[ 2 ], brw_imm_f( -1.0 ) ); brw_MUL( p, tmp[ 4 ], tmp[ 4 ], param[ 1 ] ); brw_MUL( p, tmp[ 6 ], tmp[ 6 ], param[ 1 ] ); brw_ADD( p, x0y1, x0y1, tmp[ 5 ] ); brw_ADD( p, x1y1, x1y1, tmp[ 7 ] ); brw_ADD( p, x0y0, x0y0, tmp[ 4 ] ); brw_ADD( p, x1y0, x1y0, tmp[ 6 ] ); /* z component */ brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) ); brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) ); brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) ); brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) ); brw_push_insn_state( p ); brw_set_mask_control( p, BRW_MASK_DISABLE ); brw_SHL( p, wtmp[ 2 ], wtmp[ 2 ], brw_imm_uw( 4 ) ); brw_SHL( p, wtmp[ 3 ], wtmp[ 3 ], brw_imm_uw( 4 ) ); brw_pop_insn_state( p ); brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t ); brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t ); brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t ); brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t ); /* prepare t for the w component (used below): w the first time through the loop; w - 1 the second time) */ brw_set_predicate_control( p, BRW_PREDICATE_NORMAL ); brw_ADD( p, t, param[ 3 ], brw_imm_f( -1.0 ) ); p->current->header.predicate_inverse = 1; brw_MOV( p, t, param[ 3 ] ); p->current->header.predicate_inverse = 0; brw_set_predicate_control( p, BRW_PREDICATE_NONE ); brw_ADD( p, x0y0, x0y0, tmp[ 4 ] ); brw_ADD( p, x0y1, x0y1, tmp[ 5 ] ); brw_ADD( p, x1y0, x1y0, tmp[ 6 ] ); brw_ADD( p, x1y1, x1y1, tmp[ 7 ] ); /* w component */ brw_MOV( p, tmp[ 4 ], low_words( tmp[ 2 ] ) ); brw_MOV( p, tmp[ 5 ], low_words( tmp[ 3 ] ) ); brw_MOV( p, tmp[ 6 ], high_words( tmp[ 2 ] ) ); brw_MOV( p, tmp[ 7 ], high_words( tmp[ 3 ] ) ); brw_MUL( p, tmp[ 4 ], tmp[ 4 ], t ); brw_MUL( p, tmp[ 5 ], tmp[ 5 ], t ); brw_MUL( p, tmp[ 6 ], tmp[ 6 ], t ); brw_MUL( p, tmp[ 7 ], tmp[ 7 ], t ); brw_ADD( p, x0y0, x0y0, tmp[ 4 ] ); brw_ADD( p, x0y1, x0y1, tmp[ 5 ] ); brw_ADD( p, x1y0, x1y0, tmp[ 6 ] ); brw_ADD( p, x1y1, x1y1, tmp[ 7 ] ); /* Interpolate in the y dimension: */ brw_ADD( p, x0y1, x0y1, negate( x0y0 ) ); brw_ADD( p, x1y1, x1y1, negate( x1y0 ) ); brw_MUL( p, x0y1, x0y1, interp[ 1 ] ); brw_MUL( p, x1y1, x1y1, interp[ 1 ] ); brw_ADD( p, x0y0, x0y0, x0y1 ); brw_ADD( p, x1y0, x1y0, x1y1 ); /* And now in x. The rear face is in tmp[ 0 ] (see above), so this time put the front face in tmp[ 1 ] and we're nearly there... */ brw_ADD( p, x1y0, x1y0, negate( x0y0 ) ); brw_MUL( p, x1y0, x1y0, interp[ 0 ] ); brw_ADD( p, tmp[ 1 ], x0y0, x1y0 ); /* Another interpolation, in the z dimension: */ brw_ADD( p, tmp[ 1 ], tmp[ 1 ], negate( tmp[ 0 ] ) ); brw_MUL( p, tmp[ 1 ], tmp[ 1 ], interp[ 2 ] ); brw_ADD( p, tmp[ 0 ], tmp[ 0 ], tmp[ 1 ] ); /* Exit the loop if we've computed both cubes... */ origin = p->nr_insn; brw_push_insn_state( p ); brw_set_predicate_control( p, BRW_PREDICATE_NORMAL ); brw_set_mask_control( p, BRW_MASK_DISABLE ); brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( 0 ) ); brw_pop_insn_state( p ); /* Save the result for the w=0 case, and increment the w coordinate: */ brw_MOV( p, w0, tmp[ 0 ] ); brw_ADD( p, high_words( floors[ 1 ] ), high_words( floors[ 1 ] ), brw_imm_uw( 1 ) ); /* Loop around for the other cube. Explicitly set the flag register (unfortunately we must spend an extra instruction to do this: we can't rely on a side effect of the previous MOV or ADD because conditional modifiers which are normally true might be false in exceptional circumstances, e.g. given a NaN input; the add to brw_ip_reg() is not suitable because the IP is not an 8-vector). */ brw_push_insn_state( p ); brw_set_mask_control( p, BRW_MASK_DISABLE ); brw_MOV( p, brw_flag_reg(), brw_imm_uw( 0xFF ) ); brw_ADD( p, brw_ip_reg(), brw_ip_reg(), brw_imm_d( ( loop - p->nr_insn ) << 4 ) ); brw_pop_insn_state( p ); /* Patch the previous conditional branch now that we know the destination address. */ brw_set_src1( p->store + origin, brw_imm_d( ( p->nr_insn - origin ) << 4 ) ); /* The very last interpolation. */ brw_ADD( p, tmp[ 0 ], tmp[ 0 ], negate( w0 ) ); brw_MUL( p, tmp[ 0 ], tmp[ 0 ], interp[ 3 ] ); brw_ADD( p, tmp[ 0 ], tmp[ 0 ], w0 ); /* scale by pow( 2, -15 ), as described above */ brw_MUL( p, param[ 0 ], tmp[ 0 ], brw_imm_f( 0.000030517578125 ) ); release_tmps( c, mark ); } static void emit_noise4( struct brw_wm_compile *c, struct prog_instruction *inst ) { struct brw_compile *p = &c->func; struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst; GLuint mask = inst->DstReg.WriteMask; int i; int mark = mark_tmps( c ); assert( mark == 0 ); src0 = get_src_reg( c, inst->SrcReg, 0, 1 ); src1 = get_src_reg( c, inst->SrcReg, 1, 1 ); src2 = get_src_reg( c, inst->SrcReg, 2, 1 ); src3 = get_src_reg( c, inst->SrcReg, 3, 1 ); param0 = alloc_tmp( c ); param1 = alloc_tmp( c ); param2 = alloc_tmp( c ); param3 = alloc_tmp( c ); brw_MOV( p, param0, src0 ); brw_MOV( p, param1, src1 ); brw_MOV( p, param2, src2 ); brw_MOV( p, param3, src3 ); invoke_subroutine( c, SUB_NOISE4, noise4_sub ); /* Fill in the result: */ brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE ); for (i = 0 ; i < 4; i++) { if (mask & (1<SaturateMode == SATURATE_ZERO_ONE ) brw_set_saturate( p, 0 ); release_tmps( c, mark ); } static void emit_wpos_xy(struct brw_wm_compile *c, struct prog_instruction *inst) { struct brw_compile *p = &c->func; GLuint mask = inst->DstReg.WriteMask; struct brw_reg src0[2], dst[2]; dst[0] = get_dst_reg(c, inst, 0, 1); dst[1] = get_dst_reg(c, inst, 1, 1); src0[0] = get_src_reg(c, &inst->SrcReg[0], 0, 1); src0[1] = get_src_reg(c, &inst->SrcReg[0], 1, 1); /* Calculate the pixel offset from window bottom left into destination * X and Y channels. */ if (mask & WRITEMASK_X) { /* X' = X - origin_x */ brw_ADD(p, dst[0], retype(src0[0], BRW_REGISTER_TYPE_W), brw_imm_d(0 - c->key.origin_x)); } if (mask & WRITEMASK_Y) { /* Y' = height - (Y - origin_y) = height + origin_y - Y */ brw_ADD(p, dst[1], negate(retype(src0[1], BRW_REGISTER_TYPE_W)), brw_imm_d(c->key.origin_y + c->key.drawable_height - 1)); } } /* TODO BIAS on SIMD8 not workind yet... */ static void emit_txb(struct brw_wm_compile *c, struct prog_instruction *inst) { struct brw_compile *p = &c->func; struct brw_reg dst[4], src[4], payload_reg; GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit]; GLuint i; payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0); for (i = 0; i < 4; i++) dst[i] = get_dst_reg(c, inst, i, 1); for (i = 0; i < 4; i++) src[i] = get_src_reg(c, &inst->SrcReg[0], i, 1); switch (inst->TexSrcTarget) { case TEXTURE_1D_INDEX: brw_MOV(p, brw_message_reg(2), src[0]); brw_MOV(p, brw_message_reg(3), brw_imm_f(0)); brw_MOV(p, brw_message_reg(4), brw_imm_f(0)); break; case TEXTURE_2D_INDEX: case TEXTURE_RECT_INDEX: brw_MOV(p, brw_message_reg(2), src[0]); brw_MOV(p, brw_message_reg(3), src[1]); brw_MOV(p, brw_message_reg(4), brw_imm_f(0)); break; default: brw_MOV(p, brw_message_reg(2), src[0]); brw_MOV(p, brw_message_reg(3), src[1]); brw_MOV(p, brw_message_reg(4), src[2]); break; } brw_MOV(p, brw_message_reg(5), src[3]); brw_MOV(p, brw_message_reg(6), brw_imm_f(0)); brw_SAMPLE(p, retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), 1, retype(payload_reg, BRW_REGISTER_TYPE_UW), unit + MAX_DRAW_BUFFERS, /* surface */ unit, /* sampler */ inst->DstReg.WriteMask, BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS, 4, 4, 0); } static void emit_tex(struct brw_wm_compile *c, struct prog_instruction *inst) { struct brw_compile *p = &c->func; struct brw_reg dst[4], src[4], payload_reg; GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit]; GLuint msg_len; GLuint i, nr; GLuint emit; GLboolean shadow = (c->key.shadowtex_mask & (1<SrcReg[0], i, 1); switch (inst->TexSrcTarget) { case TEXTURE_1D_INDEX: emit = WRITEMASK_X; nr = 1; break; case TEXTURE_2D_INDEX: case TEXTURE_RECT_INDEX: emit = WRITEMASK_XY; nr = 2; break; default: emit = WRITEMASK_XYZ; nr = 3; break; } msg_len = 1; for (i = 0; i < nr; i++) { static const GLuint swz[4] = {0,1,2,2}; if (emit & (1<DstReg.WriteMask, BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE, 4, shadow ? 6 : 4, 0); if (shadow) brw_MOV(p, dst[3], brw_imm_f(1.0)); } static void post_wm_emit( struct brw_wm_compile *c ) { GLuint nr_insns = c->fp->program.Base.NumInstructions; GLuint insn, target_insn; struct prog_instruction *inst1, *inst2; struct brw_instruction *brw_inst1, *brw_inst2; int offset; for (insn = 0; insn < nr_insns; insn++) { inst1 = &c->fp->program.Base.Instructions[insn]; brw_inst1 = inst1->Data; switch (inst1->Opcode) { case OPCODE_CAL: target_insn = inst1->BranchTarget; inst2 = &c->fp->program.Base.Instructions[target_insn]; brw_inst2 = inst2->Data; offset = brw_inst2 - brw_inst1; brw_set_src1(brw_inst1, brw_imm_d(offset*16)); break; default: break; } } } static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c) { #define MAX_IFSN 32 #define MAX_LOOP_DEPTH 32 struct brw_instruction *if_inst[MAX_IFSN], *loop_inst[MAX_LOOP_DEPTH]; struct brw_instruction *inst0, *inst1; int i, if_insn = 0, loop_insn = 0; struct brw_compile *p = &c->func; struct brw_indirect stack_index = brw_indirect(0, 0); c->reg_index = 0; prealloc_reg(c); brw_set_compression_control(p, BRW_COMPRESSION_NONE); brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack)); for (i = 0; i < c->nr_fp_insns; i++) { struct prog_instruction *inst = &c->prog_instructions[i]; struct prog_instruction *orig_inst; if ((orig_inst = inst->Data) != 0) orig_inst->Data = current_insn(p); if (inst->CondUpdate) brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); else brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE); switch (inst->Opcode) { case WM_PIXELXY: emit_pixel_xy(c, inst); break; case WM_DELTAXY: emit_delta_xy(c, inst); break; case WM_PIXELW: emit_pixel_w(c, inst); break; case WM_LINTERP: emit_linterp(c, inst); break; case WM_PINTERP: emit_pinterp(c, inst); break; case WM_CINTERP: emit_cinterp(c, inst); break; case WM_WPOSXY: emit_wpos_xy(c, inst); break; case WM_FB_WRITE: emit_fb_write(c, inst); break; case OPCODE_ABS: emit_abs(c, inst); break; case OPCODE_ADD: emit_add(c, inst); break; case OPCODE_SUB: emit_sub(c, inst); break; case OPCODE_FRC: emit_frc(c, inst); break; case OPCODE_FLR: emit_flr(c, inst); break; case OPCODE_LRP: emit_lrp(c, inst); break; case OPCODE_TRUNC: emit_trunc(c, inst); break; case OPCODE_MOV: emit_mov(c, inst); break; case OPCODE_DP3: emit_dp3(c, inst); break; case OPCODE_DP4: emit_dp4(c, inst); break; case OPCODE_XPD: emit_xpd(c, inst); break; case OPCODE_DPH: emit_dph(c, inst); break; case OPCODE_RCP: emit_rcp(c, inst); break; case OPCODE_RSQ: emit_rsq(c, inst); break; case OPCODE_SIN: emit_sin(c, inst); break; case OPCODE_COS: emit_cos(c, inst); break; case OPCODE_EX2: emit_ex2(c, inst); break; case OPCODE_LG2: emit_lg2(c, inst); break; case OPCODE_MAX: emit_max(c, inst); break; case OPCODE_MIN: emit_min(c, inst); break; case OPCODE_DDX: emit_ddx(c, inst); break; case OPCODE_DDY: emit_ddy(c, inst); break; case OPCODE_SLT: emit_slt(c, inst); break; case OPCODE_SLE: emit_sle(c, inst); break; case OPCODE_SGT: emit_sgt(c, inst); break; case OPCODE_SGE: emit_sge(c, inst); break; case OPCODE_SEQ: emit_seq(c, inst); break; case OPCODE_SNE: emit_sne(c, inst); break; case OPCODE_MUL: emit_mul(c, inst); break; case OPCODE_POW: emit_pow(c, inst); break; case OPCODE_MAD: emit_mad(c, inst); break; case OPCODE_NOISE1: emit_noise1(c, inst); break; case OPCODE_NOISE2: emit_noise2(c, inst); break; case OPCODE_NOISE3: emit_noise3(c, inst); break; case OPCODE_NOISE4: emit_noise4(c, inst); break; case OPCODE_TEX: emit_tex(c, inst); break; case OPCODE_TXB: emit_txb(c, inst); break; case OPCODE_KIL_NV: emit_kil(c); break; case OPCODE_IF: assert(if_insn < MAX_IFSN); if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8); break; case OPCODE_ELSE: if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]); break; case OPCODE_ENDIF: assert(if_insn > 0); brw_ENDIF(p, if_inst[--if_insn]); break; case OPCODE_BGNSUB: case OPCODE_ENDSUB: break; case OPCODE_CAL: brw_push_insn_state(p); brw_set_mask_control(p, BRW_MASK_DISABLE); brw_set_access_mode(p, BRW_ALIGN_1); brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16)); brw_set_access_mode(p, BRW_ALIGN_16); brw_ADD(p, get_addr_reg(stack_index), get_addr_reg(stack_index), brw_imm_d(4)); orig_inst = inst->Data; orig_inst->Data = &p->store[p->nr_insn]; brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16)); brw_pop_insn_state(p); break; case OPCODE_RET: brw_push_insn_state(p); brw_set_mask_control(p, BRW_MASK_DISABLE); brw_ADD(p, get_addr_reg(stack_index), get_addr_reg(stack_index), brw_imm_d(-4)); brw_set_access_mode(p, BRW_ALIGN_1); brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0)); brw_set_access_mode(p, BRW_ALIGN_16); brw_pop_insn_state(p); break; case OPCODE_BGNLOOP: loop_inst[loop_insn++] = brw_DO(p, BRW_EXECUTE_8); break; case OPCODE_BRK: brw_BREAK(p); brw_set_predicate_control(p, BRW_PREDICATE_NONE); break; case OPCODE_CONT: brw_CONT(p); brw_set_predicate_control(p, BRW_PREDICATE_NONE); break; case OPCODE_ENDLOOP: loop_insn--; inst0 = inst1 = brw_WHILE(p, loop_inst[loop_insn]); /* patch all the BREAK instructions from last BEGINLOOP */ while (inst0 > loop_inst[loop_insn]) { inst0--; if (inst0->header.opcode == BRW_OPCODE_BREAK) { inst0->bits3.if_else.jump_count = inst1 - inst0 + 1; inst0->bits3.if_else.pop_count = 0; } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) { inst0->bits3.if_else.jump_count = inst1 - inst0; inst0->bits3.if_else.pop_count = 0; } } break; default: _mesa_printf("unsupported IR in fragment shader %d\n", inst->Opcode); } if (inst->CondUpdate) brw_set_predicate_control(p, BRW_PREDICATE_NORMAL); else brw_set_predicate_control(p, BRW_PREDICATE_NONE); } post_wm_emit(c); for (i = 0; i < c->fp->program.Base.NumInstructions; i++) c->fp->program.Base.Instructions[i].Data = NULL; } void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c) { brw_wm_pass_fp(c); brw_wm_emit_glsl(brw, c); c->prog_data.total_grf = c->reg_index; c->prog_data.total_scratch = 0; }