diff options
Diffstat (limited to 'src/mesa/drivers/dri/i965')
54 files changed, 2150 insertions, 1403 deletions
diff --git a/src/mesa/drivers/dri/i965/Makefile b/src/mesa/drivers/dri/i965/Makefile index 7c3ac0c14e..b05ba35d65 100644 --- a/src/mesa/drivers/dri/i965/Makefile +++ b/src/mesa/drivers/dri/i965/Makefile @@ -108,6 +108,7 @@ CXX_SOURCES = \ brw_fs.cpp \ brw_fs_channel_expressions.cpp \ brw_fs_reg_allocate.cpp \ + brw_fs_schedule_instructions.cpp \ brw_fs_vector_splitting.cpp ASM_SOURCES = diff --git a/src/mesa/drivers/dri/i965/brw_cc.c b/src/mesa/drivers/dri/i965/brw_cc.c index d3a1233aac..412d82ab3c 100644 --- a/src/mesa/drivers/dri/i965/brw_cc.c +++ b/src/mesa/drivers/dri/i965/brw_cc.c @@ -35,6 +35,7 @@ #include "brw_defines.h" #include "brw_util.h" #include "main/macros.h" +#include "intel_batchbuffer.h" void brw_update_cc_vp(struct brw_context *brw) @@ -92,61 +93,61 @@ static void upload_cc_unit(struct brw_context *brw) { struct intel_context *intel = &brw->intel; struct gl_context *ctx = &brw->intel.ctx; - struct brw_cc_unit_state cc; - void *map; + struct brw_cc_unit_state *cc; - memset(&cc, 0, sizeof(cc)); + cc = brw_state_batch(brw, sizeof(*cc), 64, &brw->cc.state_offset); + memset(cc, 0, sizeof(*cc)); /* _NEW_STENCIL */ if (ctx->Stencil._Enabled) { const unsigned back = ctx->Stencil._BackFace; - cc.cc0.stencil_enable = 1; - cc.cc0.stencil_func = + cc->cc0.stencil_enable = 1; + cc->cc0.stencil_func = intel_translate_compare_func(ctx->Stencil.Function[0]); - cc.cc0.stencil_fail_op = + cc->cc0.stencil_fail_op = intel_translate_stencil_op(ctx->Stencil.FailFunc[0]); - cc.cc0.stencil_pass_depth_fail_op = + cc->cc0.stencil_pass_depth_fail_op = intel_translate_stencil_op(ctx->Stencil.ZFailFunc[0]); - cc.cc0.stencil_pass_depth_pass_op = + cc->cc0.stencil_pass_depth_pass_op = intel_translate_stencil_op(ctx->Stencil.ZPassFunc[0]); - cc.cc1.stencil_ref = ctx->Stencil.Ref[0]; - cc.cc1.stencil_write_mask = ctx->Stencil.WriteMask[0]; - cc.cc1.stencil_test_mask = ctx->Stencil.ValueMask[0]; + cc->cc1.stencil_ref = ctx->Stencil.Ref[0]; + cc->cc1.stencil_write_mask = ctx->Stencil.WriteMask[0]; + cc->cc1.stencil_test_mask = ctx->Stencil.ValueMask[0]; if (ctx->Stencil._TestTwoSide) { - cc.cc0.bf_stencil_enable = 1; - cc.cc0.bf_stencil_func = + cc->cc0.bf_stencil_enable = 1; + cc->cc0.bf_stencil_func = intel_translate_compare_func(ctx->Stencil.Function[back]); - cc.cc0.bf_stencil_fail_op = + cc->cc0.bf_stencil_fail_op = intel_translate_stencil_op(ctx->Stencil.FailFunc[back]); - cc.cc0.bf_stencil_pass_depth_fail_op = + cc->cc0.bf_stencil_pass_depth_fail_op = intel_translate_stencil_op(ctx->Stencil.ZFailFunc[back]); - cc.cc0.bf_stencil_pass_depth_pass_op = + cc->cc0.bf_stencil_pass_depth_pass_op = intel_translate_stencil_op(ctx->Stencil.ZPassFunc[back]); - cc.cc1.bf_stencil_ref = ctx->Stencil.Ref[back]; - cc.cc2.bf_stencil_write_mask = ctx->Stencil.WriteMask[back]; - cc.cc2.bf_stencil_test_mask = ctx->Stencil.ValueMask[back]; + cc->cc1.bf_stencil_ref = ctx->Stencil.Ref[back]; + cc->cc2.bf_stencil_write_mask = ctx->Stencil.WriteMask[back]; + cc->cc2.bf_stencil_test_mask = ctx->Stencil.ValueMask[back]; } /* Not really sure about this: */ if (ctx->Stencil.WriteMask[0] || (ctx->Stencil._TestTwoSide && ctx->Stencil.WriteMask[back])) - cc.cc0.stencil_write_enable = 1; + cc->cc0.stencil_write_enable = 1; } /* _NEW_COLOR */ if (ctx->Color._LogicOpEnabled && ctx->Color.LogicOp != GL_COPY) { - cc.cc2.logicop_enable = 1; - cc.cc5.logicop_func = intel_translate_logic_op(ctx->Color.LogicOp); + cc->cc2.logicop_enable = 1; + cc->cc5.logicop_func = intel_translate_logic_op(ctx->Color.LogicOp); } else if (ctx->Color.BlendEnabled) { - GLenum eqRGB = ctx->Color.BlendEquationRGB; - GLenum eqA = ctx->Color.BlendEquationA; - GLenum srcRGB = ctx->Color.BlendSrcRGB; - GLenum dstRGB = ctx->Color.BlendDstRGB; - GLenum srcA = ctx->Color.BlendSrcA; - GLenum dstA = ctx->Color.BlendDstA; + GLenum eqRGB = ctx->Color.Blend[0].EquationRGB; + GLenum eqA = ctx->Color.Blend[0].EquationA; + GLenum srcRGB = ctx->Color.Blend[0].SrcRGB; + GLenum dstRGB = ctx->Color.Blend[0].DstRGB; + GLenum srcA = ctx->Color.Blend[0].SrcA; + GLenum dstA = ctx->Color.Blend[0].DstA; /* If the renderbuffer is XRGB, we have to frob the blend function to * force the destination alpha to 1.0. This means replacing GL_DST_ALPHA @@ -167,58 +168,55 @@ static void upload_cc_unit(struct brw_context *brw) srcA = dstA = GL_ONE; } - cc.cc6.dest_blend_factor = brw_translate_blend_factor(dstRGB); - cc.cc6.src_blend_factor = brw_translate_blend_factor(srcRGB); - cc.cc6.blend_function = brw_translate_blend_equation(eqRGB); + cc->cc6.dest_blend_factor = brw_translate_blend_factor(dstRGB); + cc->cc6.src_blend_factor = brw_translate_blend_factor(srcRGB); + cc->cc6.blend_function = brw_translate_blend_equation(eqRGB); - cc.cc5.ia_dest_blend_factor = brw_translate_blend_factor(dstA); - cc.cc5.ia_src_blend_factor = brw_translate_blend_factor(srcA); - cc.cc5.ia_blend_function = brw_translate_blend_equation(eqA); + cc->cc5.ia_dest_blend_factor = brw_translate_blend_factor(dstA); + cc->cc5.ia_src_blend_factor = brw_translate_blend_factor(srcA); + cc->cc5.ia_blend_function = brw_translate_blend_equation(eqA); - cc.cc3.blend_enable = 1; - cc.cc3.ia_blend_enable = (srcA != srcRGB || + cc->cc3.blend_enable = 1; + cc->cc3.ia_blend_enable = (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB); } if (ctx->Color.AlphaEnabled) { - cc.cc3.alpha_test = 1; - cc.cc3.alpha_test_func = + cc->cc3.alpha_test = 1; + cc->cc3.alpha_test_func = intel_translate_compare_func(ctx->Color.AlphaFunc); - cc.cc3.alpha_test_format = BRW_ALPHATEST_FORMAT_UNORM8; + cc->cc3.alpha_test_format = BRW_ALPHATEST_FORMAT_UNORM8; - UNCLAMPED_FLOAT_TO_UBYTE(cc.cc7.alpha_ref.ub[0], ctx->Color.AlphaRef); + UNCLAMPED_FLOAT_TO_UBYTE(cc->cc7.alpha_ref.ub[0], ctx->Color.AlphaRef); } if (ctx->Color.DitherFlag) { - cc.cc5.dither_enable = 1; - cc.cc6.y_dither_offset = 0; - cc.cc6.x_dither_offset = 0; + cc->cc5.dither_enable = 1; + cc->cc6.y_dither_offset = 0; + cc->cc6.x_dither_offset = 0; } /* _NEW_DEPTH */ if (ctx->Depth.Test) { - cc.cc2.depth_test = 1; - cc.cc2.depth_test_function = + cc->cc2.depth_test = 1; + cc->cc2.depth_test_function = intel_translate_compare_func(ctx->Depth.Func); - cc.cc2.depth_write_enable = ctx->Depth.Mask; + cc->cc2.depth_write_enable = ctx->Depth.Mask; } if (intel->stats_wm || unlikely(INTEL_DEBUG & DEBUG_STATS)) - cc.cc5.statistics_enable = 1; + cc->cc5.statistics_enable = 1; /* CACHE_NEW_CC_VP */ - cc.cc4.cc_viewport_state_offset = brw->cc.vp_bo->offset >> 5; /* reloc */ + cc->cc4.cc_viewport_state_offset = brw->cc.vp_bo->offset >> 5; /* reloc */ - map = brw_state_batch(brw, sizeof(cc), 64, - &brw->cc.state_bo, &brw->cc.state_offset); - memcpy(map, &cc, sizeof(cc)); brw->state.dirty.cache |= CACHE_NEW_CC_UNIT; /* Emit CC viewport relocation */ - drm_intel_bo_emit_reloc(brw->cc.state_bo, (brw->cc.state_offset + - offsetof(struct brw_cc_unit_state, - cc4)), + drm_intel_bo_emit_reloc(brw->intel.batch.bo, + (brw->cc.state_offset + + offsetof(struct brw_cc_unit_state, cc4)), brw->cc.vp_bo, 0, I915_GEM_DOMAIN_INSTRUCTION, 0); } @@ -235,18 +233,16 @@ const struct brw_tracked_state brw_cc_unit = { static void upload_blend_constant_color(struct brw_context *brw) { - struct gl_context *ctx = &brw->intel.ctx; - struct brw_blend_constant_color bcc; - - memset(&bcc, 0, sizeof(bcc)); - bcc.header.opcode = CMD_BLEND_CONSTANT_COLOR; - bcc.header.length = sizeof(bcc)/4-2; - bcc.blend_constant_color[0] = ctx->Color.BlendColor[0]; - bcc.blend_constant_color[1] = ctx->Color.BlendColor[1]; - bcc.blend_constant_color[2] = ctx->Color.BlendColor[2]; - bcc.blend_constant_color[3] = ctx->Color.BlendColor[3]; - - BRW_CACHED_BATCH_STRUCT(brw, &bcc); + struct intel_context *intel = &brw->intel; + struct gl_context *ctx = &intel->ctx; + + BEGIN_BATCH(5); + OUT_BATCH(_3DSTATE_BLEND_CONSTANT_COLOR << 16 | (5-2)); + OUT_BATCH_F(ctx->Color.BlendColor[0]); + OUT_BATCH_F(ctx->Color.BlendColor[1]); + OUT_BATCH_F(ctx->Color.BlendColor[2]); + OUT_BATCH_F(ctx->Color.BlendColor[3]); + CACHED_BATCH(); } const struct brw_tracked_state brw_blend_constant_color = { diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c index 28549f2574..9483ec69d9 100644 --- a/src/mesa/drivers/dri/i965/brw_context.c +++ b/src/mesa/drivers/dri/i965/brw_context.c @@ -151,6 +151,22 @@ GLboolean brwCreateContext( int api, MIN2(ctx->Const.FragmentProgram.MaxNativeParameters, ctx->Const.FragmentProgram.MaxEnvParams); + /* Fragment shaders use real, 32-bit twos-complement integers for all + * integer types. + */ + ctx->Const.FragmentProgram.LowInt.RangeMin = 31; + ctx->Const.FragmentProgram.LowInt.RangeMax = 30; + ctx->Const.FragmentProgram.LowInt.Precision = 0; + ctx->Const.FragmentProgram.HighInt = ctx->Const.FragmentProgram.MediumInt + = ctx->Const.FragmentProgram.LowInt; + + /* Gen6 converts quads to polygon in beginning of 3D pipeline, + but we're not sure how it's actually done for vertex order, + that affect provoking vertex decision. Always use last vertex + convention for quad primitive which works as expected for now. */ + if (intel->gen == 6) + ctx->Const.QuadsFollowProvokingVertexConvention = GL_FALSE; + if (intel->is_g4x || intel->gen >= 5) { brw->CMD_VF_STATISTICS = CMD_VF_STATISTICS_GM45; brw->CMD_PIPELINE_SELECT = CMD_PIPELINE_SELECT_GM45; diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 7069724466..7b0551a92b 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -145,7 +145,7 @@ struct brw_context; #define BRW_NEW_NR_VS_SURFACES 0x80000 #define BRW_NEW_INDEX_BUFFER 0x100000 #define BRW_NEW_VS_CONSTBUF 0x200000 -#define BRW_NEW_WM_CONSTBUF 0x200000 +#define BRW_NEW_WM_CONSTBUF 0x400000 struct brw_state_flags { /** State update flags signalled by mesa internals */ @@ -408,21 +408,24 @@ struct brw_cached_batch_item { */ #define ATTRIB_BIT_DWORDS ((VERT_ATTRIB_MAX+31)/32) +struct brw_vertex_buffer { + /** Buffer object containing the uploaded vertex data */ + drm_intel_bo *bo; + uint32_t offset; + /** Byte stride between elements in the uploaded array */ + GLuint stride; +}; struct brw_vertex_element { const struct gl_client_array *glarray; + int buffer; + /** The corresponding Mesa vertex attribute */ gl_vert_attrib attrib; /** Size of a complete element */ GLuint element_size; - /** Number of uploaded elements for this input. */ - GLuint count; - /** Byte stride between elements in the uploaded array */ - GLuint stride; /** Offset of the first element within the buffer object */ unsigned int offset; - /** Buffer object containing the uploaded vertex data */ - drm_intel_bo *bo; }; @@ -457,12 +460,10 @@ struct brw_context GLboolean has_negative_rhw_bug; GLboolean has_aa_line_parameters; GLboolean has_pln; -; + struct { struct brw_state_flags dirty; - GLuint nr_color_regions; - struct intel_region *color_regions[MAX_DRAW_BUFFERS]; struct intel_region *depth_region; /** @@ -485,23 +486,27 @@ struct brw_context struct { struct brw_vertex_element inputs[VERT_ATTRIB_MAX]; + struct brw_vertex_buffer buffers[VERT_ATTRIB_MAX]; + struct { + uint32_t handle; + uint32_t offset; + uint32_t stride; + } current_buffers[VERT_ATTRIB_MAX]; struct brw_vertex_element *enabled[VERT_ATTRIB_MAX]; GLuint nr_enabled; - -#define BRW_NR_UPLOAD_BUFS 17 -#define BRW_UPLOAD_INIT_SIZE (128*1024) - - struct { - drm_intel_bo *bo; - GLuint offset; - } upload; + GLuint nr_buffers, nr_current_buffers; /* Summary of size and varying of active arrays, so we can check * for changes to this state: */ struct brw_vertex_info info; unsigned int min_index, max_index; + + /* Offset from start of vertex buffer so we can avoid redefining + * the same VB packed over and over again. + */ + unsigned int start_vertex_bias; } vb; struct { @@ -512,10 +517,10 @@ struct brw_context */ const struct _mesa_index_buffer *ib; - /* Updates to these fields are signaled by BRW_NEW_INDEX_BUFFER. */ + /* Updates are signaled by BRW_NEW_INDEX_BUFFER. */ drm_intel_bo *bo; - unsigned int offset; - unsigned int size; + GLuint type; + /* Offset to index buffer index to use in CMD_3D_PRIM so that we can * avoid re-uploading the IB packet over and over if we're actually * referencing the same index buffer. @@ -528,11 +533,6 @@ struct brw_context const struct gl_vertex_program *vertex_program; const struct gl_fragment_program *fragment_program; - - /* For populating the gtt: - */ - GLuint next_free_page; - /* hw-dependent 3DSTATE_VF_STATISTICS opcode */ uint32_t CMD_VF_STATISTICS; /* hw-dependent 3DSTATE_PIPELINE_SELECT opcode */ @@ -612,9 +612,7 @@ struct brw_context drm_intel_bo *const_bo; /** Binding table of pointers to surf_bo entries */ - drm_intel_bo *bind_bo; uint32_t bind_bo_offset; - drm_intel_bo *surf_bo[BRW_VS_MAX_SURF]; uint32_t surf_offset[BRW_VS_MAX_SURF]; GLuint nr_surfaces; } vs; @@ -666,9 +664,7 @@ struct brw_context drm_intel_bo *sampler_bo; /** Binding table of pointers to surf_bo entries */ - drm_intel_bo *bind_bo; uint32_t bind_bo_offset; - drm_intel_bo *surf_bo[BRW_WM_MAX_SURF]; uint32_t surf_offset[BRW_WM_MAX_SURF]; drm_intel_bo *prog_bo; @@ -693,7 +689,6 @@ struct brw_context drm_intel_bo *depth_stencil_state_bo; drm_intel_bo *color_calc_state_bo; - drm_intel_bo *state_bo; uint32_t state_offset; } cc; @@ -706,6 +701,9 @@ struct brw_context /* Used to give every program string a unique id */ GLuint program_id; + + int num_prepare_atoms, num_emit_atoms; + struct brw_tracked_state prepare_atoms[64], emit_atoms[64]; }; @@ -841,4 +839,3 @@ float convert_param(enum param_conversion conversion, float param) GLboolean brw_do_cubemap_normalize(struct exec_list *instructions); #endif - diff --git a/src/mesa/drivers/dri/i965/brw_cubemap_normalize.cpp b/src/mesa/drivers/dri/i965/brw_cubemap_normalize.cpp index 35bea68121..8574169e47 100644 --- a/src/mesa/drivers/dri/i965/brw_cubemap_normalize.cpp +++ b/src/mesa/drivers/dri/i965/brw_cubemap_normalize.cpp @@ -51,7 +51,7 @@ brw_cubemap_normalize_visitor::visit_leave(ir_texture *ir) if (ir->sampler->type->sampler_dimensionality != GLSL_SAMPLER_DIM_CUBE) return visit_continue; - void *mem_ctx = talloc_parent(ir); + void *mem_ctx = ralloc_parent(ir); ir_variable *var = new(mem_ctx) ir_variable(ir->coordinate->type, "coordinate", ir_var_auto); diff --git a/src/mesa/drivers/dri/i965/brw_curbe.c b/src/mesa/drivers/dri/i965/brw_curbe.c index 877b22fec1..ae11c487a2 100644 --- a/src/mesa/drivers/dri/i965/brw_curbe.c +++ b/src/mesa/drivers/dri/i965/brw_curbe.c @@ -146,22 +146,24 @@ const struct brw_tracked_state brw_curbe_offsets = { */ void brw_upload_cs_urb_state(struct brw_context *brw) { - struct brw_cs_urb_state cs_urb; - memset(&cs_urb, 0, sizeof(cs_urb)); + struct intel_context *intel = &brw->intel; + BEGIN_BATCH(2); /* It appears that this is the state packet for the CS unit, ie. the * urb entries detailed here are housed in the CS range from the * URB_FENCE command. */ - cs_urb.header.opcode = CMD_CS_URB_STATE; - cs_urb.header.length = sizeof(cs_urb)/4 - 2; + OUT_BATCH(CMD_CS_URB_STATE << 16 | (2-2)); /* BRW_NEW_URB_FENCE */ - cs_urb.bits0.nr_urb_entries = brw->urb.nr_cs_entries; - cs_urb.bits0.urb_entry_size = brw->urb.csize - 1; - - assert(brw->urb.nr_cs_entries); - BRW_CACHED_BATCH_STRUCT(brw, &cs_urb); + if (brw->urb.csize == 0) { + OUT_BATCH(0); + } else { + /* BRW_NEW_URB_FENCE */ + assert(brw->urb.nr_cs_entries); + OUT_BATCH((brw->urb.csize - 1) << 4 | brw->urb.nr_cs_entries); + } + CACHED_BATCH(); } static GLfloat fixed_plane[6][4] = { diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index 5c5b8259e1..5496b4fdd3 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -35,28 +35,6 @@ /* 3D state: */ -#define _3DOP_3DSTATE_PIPELINED 0x0 -#define _3DOP_3DSTATE_NONPIPELINED 0x1 -#define _3DOP_3DCONTROL 0x2 -#define _3DOP_3DPRIMITIVE 0x3 - -#define _3DSTATE_PIPELINED_POINTERS 0x00 -#define _3DSTATE_BINDING_TABLE_POINTERS 0x01 -#define _3DSTATE_VERTEX_BUFFERS 0x08 -#define _3DSTATE_VERTEX_ELEMENTS 0x09 -#define _3DSTATE_INDEX_BUFFER 0x0A -#define _3DSTATE_VF_STATISTICS 0x0B -#define _3DSTATE_DRAWING_RECTANGLE 0x00 -#define _3DSTATE_CONSTANT_COLOR 0x01 -#define _3DSTATE_SAMPLER_PALETTE_LOAD 0x02 -#define _3DSTATE_CHROMA_KEY 0x04 -#define _3DSTATE_DEPTH_BUFFER 0x05 -#define _3DSTATE_POLY_STIPPLE_OFFSET 0x06 -#define _3DSTATE_POLY_STIPPLE_PATTERN 0x07 -#define _3DSTATE_LINE_STIPPLE 0x08 -#define _3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP 0x09 -#define _3DCONTROL 0x00 - #define PIPE_CONTROL_NOWRITE 0x00 #define PIPE_CONTROL_WRITEIMMEDIATE 0x01 #define PIPE_CONTROL_WRITEDEPTH 0x02 @@ -389,6 +367,7 @@ #define BRW_SURFACEFORMAT_R8_SSCALED 0x149 #define BRW_SURFACEFORMAT_R8_USCALED 0x14A #define BRW_SURFACEFORMAT_L8_UNORM_SRGB 0x14C +#define BRW_SURFACEFORMAT_DXT1_RGB_SRGB 0x180 #define BRW_SURFACEFORMAT_R1_UINT 0x181 #define BRW_SURFACEFORMAT_YCRCB_NORMAL 0x182 #define BRW_SURFACEFORMAT_YCRCB_SWAPUVY 0x183 @@ -700,6 +679,8 @@ #define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS 2 #define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_COMPARE 0 #define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE 2 +#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE 0 +#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE 1 #define BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO 2 #define BRW_SAMPLER_MESSAGE_SIMD8_RESINFO 2 #define BRW_SAMPLER_MESSAGE_SIMD16_RESINFO 2 @@ -838,13 +819,13 @@ #define CMD_PIPELINE_SELECT_965 0x6104 #define CMD_PIPELINE_SELECT_GM45 0x6904 -#define CMD_PIPELINED_STATE_POINTERS 0x7800 -#define CMD_BINDING_TABLE_PTRS 0x7801 +#define _3DSTATE_PIPELINED_POINTERS 0x7800 +#define _3DSTATE_BINDING_TABLE_POINTERS 0x7801 # define GEN6_BINDING_TABLE_MODIFY_VS (1 << 8) # define GEN6_BINDING_TABLE_MODIFY_GS (1 << 9) # define GEN6_BINDING_TABLE_MODIFY_PS (1 << 12) -#define CMD_3D_SAMPLER_STATE_POINTERS 0x7802 /* SNB+ */ +#define _3DSTATE_SAMPLER_STATE_POINTERS 0x7802 /* GEN6+ */ # define PS_SAMPLER_STATE_CHANGE (1 << 12) # define GS_SAMPLER_STATE_CHANGE (1 << 9) # define VS_SAMPLER_STATE_CHANGE (1 << 8) @@ -885,27 +866,29 @@ #define CMD_INDEX_BUFFER 0x780a #define CMD_VF_STATISTICS_965 0x780b #define CMD_VF_STATISTICS_GM45 0x680b -#define CMD_3D_CC_STATE_POINTERS 0x780e /* GEN6+ */ +#define _3DSTATE_CC_STATE_POINTERS 0x780e /* GEN6+ */ -#define CMD_URB 0x7805 /* GEN6+ */ +#define _3DSTATE_URB 0x7805 /* GEN6+ */ # define GEN6_URB_VS_SIZE_SHIFT 16 # define GEN6_URB_VS_ENTRIES_SHIFT 0 # define GEN6_URB_GS_ENTRIES_SHIFT 8 # define GEN6_URB_GS_SIZE_SHIFT 0 -#define CMD_VIEWPORT_STATE_POINTERS 0x780d /* GEN6+ */ +#define _3DSTATE_VIEWPORT_STATE_POINTERS 0x780d /* GEN6+ */ # define GEN6_CC_VIEWPORT_MODIFY (1 << 12) # define GEN6_SF_VIEWPORT_MODIFY (1 << 11) # define GEN6_CLIP_VIEWPORT_MODIFY (1 << 10) -#define CMD_3D_SCISSOR_STATE_POINTERS 0x780f /* GEN6+ */ +#define _3DSTATE_SCISSOR_STATE_POINTERS 0x780f /* GEN6+ */ -#define CMD_3D_VS_STATE 0x7810 /* GEN6+ */ +#define _3DSTATE_VS 0x7810 /* GEN6+ */ /* DW2 */ # define GEN6_VS_SPF_MODE (1 << 31) # define GEN6_VS_VECTOR_MASK_ENABLE (1 << 30) # define GEN6_VS_SAMPLER_COUNT_SHIFT 27 # define GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT 18 +# define GEN6_VS_FLOATING_POINT_MODE_IEEE_754 (0 << 16) +# define GEN6_VS_FLOATING_POINT_MODE_ALT (1 << 16) /* DW4 */ # define GEN6_VS_DISPATCH_START_GRF_SHIFT 20 # define GEN6_VS_URB_READ_LENGTH_SHIFT 11 @@ -916,7 +899,7 @@ # define GEN6_VS_CACHE_DISABLE (1 << 1) # define GEN6_VS_ENABLE (1 << 0) -#define CMD_3D_GS_STATE 0x7811 /* GEN6+ */ +#define _3DSTATE_GS 0x7811 /* GEN6+ */ /* DW2 */ # define GEN6_GS_SPF_MODE (1 << 31) # define GEN6_GS_VECTOR_MASK_ENABLE (1 << 30) @@ -934,7 +917,7 @@ /* DW6 */ # define GEN6_GS_ENABLE (1 << 15) -#define CMD_3D_CLIP_STATE 0x7812 /* GEN6+ */ +#define _3DSTATE_CLIP 0x7812 /* GEN6+ */ /* DW1 */ # define GEN6_CLIP_STATISTICS_ENABLE (1 << 10) /** @@ -964,7 +947,7 @@ # define GEN6_CLIP_MAX_POINT_WIDTH_SHIFT 6 # define GEN6_CLIP_FORCE_ZERO_RTAINDEX (1 << 5) -#define CMD_3D_SF_STATE 0x7813 /* GEN6+ */ +#define _3DSTATE_SF 0x7813 /* GEN6+ */ /* DW1 */ # define GEN6_SF_NUM_OUTPUTS_SHIFT 22 # define GEN6_SF_SWIZZLE_ENABLE (1 << 21) @@ -1029,18 +1012,27 @@ # define ATTRIBUTE_0_CONST_SOURCE_SHIFT 9 # define ATTRIBUTE_0_SWIZZLE_SHIFT 6 # define ATTRIBUTE_0_SOURCE_SHIFT 0 + +# define ATTRIBUTE_SWIZZLE_INPUTATTR 0 +# define ATTRIBUTE_SWIZZLE_INPUTATTR_FACING 1 +# define ATTRIBUTE_SWIZZLE_INPUTATTR_W 2 +# define ATTRIBUTE_SWIZZLE_INPUTATTR_FACING_W 3 +# define ATTRIBUTE_SWIZZLE_SHIFT 6 + /* DW16: Point sprite texture coordinate enables */ /* DW17: Constant interpolation enables */ /* DW18: attr 0-7 wrap shortest enables */ /* DW19: attr 8-16 wrap shortest enables */ -#define CMD_3D_WM_STATE 0x7814 /* GEN6+ */ +#define _3DSTATE_WM 0x7814 /* GEN6+ */ /* DW1: kernel pointer */ /* DW2 */ # define GEN6_WM_SPF_MODE (1 << 31) # define GEN6_WM_VECTOR_MASK_ENABLE (1 << 30) # define GEN6_WM_SAMPLER_COUNT_SHIFT 27 # define GEN6_WM_BINDING_TABLE_ENTRY_COUNT_SHIFT 18 +# define GEN6_WM_FLOATING_POINT_MODE_IEEE_754 (0 << 16) +# define GEN6_WM_FLOATING_POINT_MODE_ALT (1 << 16) /* DW3: scratch space */ /* DW4 */ # define GEN6_WM_STATISTICS_ENABLE (1 << 31) @@ -1095,34 +1087,34 @@ /* DW7: kernel 1 pointer */ /* DW8: kernel 2 pointer */ -#define CMD_3D_CONSTANT_VS_STATE 0x7815 /* GEN6+ */ -#define CMD_3D_CONSTANT_GS_STATE 0x7816 /* GEN6+ */ -#define CMD_3D_CONSTANT_PS_STATE 0x7817 /* GEN6+ */ +#define _3DSTATE_CONSTANT_VS 0x7815 /* GEN6+ */ +#define _3DSTATE_CONSTANT_GS 0x7816 /* GEN6+ */ +#define _3DSTATE_CONSTANT_PS 0x7817 /* GEN6+ */ # define GEN6_CONSTANT_BUFFER_3_ENABLE (1 << 15) # define GEN6_CONSTANT_BUFFER_2_ENABLE (1 << 14) # define GEN6_CONSTANT_BUFFER_1_ENABLE (1 << 13) # define GEN6_CONSTANT_BUFFER_0_ENABLE (1 << 12) -#define CMD_3D_SAMPLE_MASK 0x7818 /* GEN6+ */ +#define _3DSTATE_SAMPLE_MASK 0x7818 /* GEN6+ */ -#define CMD_DRAW_RECT 0x7900 -#define CMD_BLEND_CONSTANT_COLOR 0x7901 -#define CMD_CHROMA_KEY 0x7904 -#define CMD_DEPTH_BUFFER 0x7905 -#define CMD_POLY_STIPPLE_OFFSET 0x7906 -#define CMD_POLY_STIPPLE_PATTERN 0x7907 -#define CMD_LINE_STIPPLE_PATTERN 0x7908 -#define CMD_GLOBAL_DEPTH_OFFSET_CLAMP 0x7909 -#define CMD_AA_LINE_PARAMETERS 0x790a +#define _3DSTATE_DRAWING_RECTANGLE 0x7900 +#define _3DSTATE_BLEND_CONSTANT_COLOR 0x7901 +#define _3DSTATE_CHROMA_KEY 0x7904 +#define _3DSTATE_DEPTH_BUFFER 0x7905 +#define _3DSTATE_POLY_STIPPLE_OFFSET 0x7906 +#define _3DSTATE_POLY_STIPPLE_PATTERN 0x7907 +#define _3DSTATE_LINE_STIPPLE_PATTERN 0x7908 +#define _3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP 0x7909 +#define _3DSTATE_AA_LINE_PARAMETERS 0x790a /* G45+ */ -#define CMD_GS_SVB_INDEX 0x790b /* CTG+ */ +#define _3DSTATE_GS_SVB_INDEX 0x790b /* CTG+ */ /* DW1 */ # define SVB_INDEX_SHIFT 29 # define SVB_LOAD_INTERNAL_VERTEX_COUNT (1 << 0) /* SNB+ */ /* DW2: SVB index */ /* DW3: SVB maximum index */ -#define CMD_3D_MULTISAMPLE 0x790d /* SNB+ */ +#define _3DSTATE_MULTISAMPLE 0x790d /* GEN6+ */ /* DW1 */ # define MS_PIXEL_LOCATION_CENTER (0 << 4) # define MS_PIXEL_LOCATION_UPPER_LEFT (1 << 4) @@ -1130,7 +1122,10 @@ # define MS_NUMSAMPLES_4 (2 << 1) # define MS_NUMSAMPLES_8 (3 << 1) -#define CMD_3D_CLEAR_PARAMS 0x7910 /* ILK+ */ +#define _3DSTATE_STENCIL_BUFFER 0x790e /* ILK, SNB */ +#define _3DSTATE_HIER_DEPTH_BUFFER 0x790f /* ILK, SNB */ + +#define _3DSTATE_CLEAR_PARAMS 0x7910 /* ILK+ */ # define DEPTH_CLEAR_VALID (1 << 15) /* DW1: depth clear value */ diff --git a/src/mesa/drivers/dri/i965/brw_disasm.c b/src/mesa/drivers/dri/i965/brw_disasm.c index 6b61f7af15..111cb9974e 100644 --- a/src/mesa/drivers/dri/i965/brw_disasm.c +++ b/src/mesa/drivers/dri/i965/brw_disasm.c @@ -973,7 +973,7 @@ int brw_disasm (FILE *file, struct brw_instruction *inst, int gen) inst->bits3.dp_render_cache.send_commit_msg, inst->bits3.dp_render_cache.msg_length, inst->bits3.dp_render_cache.response_length); - } else if (gen >= 5) { + } else if (gen >= 5 /* FINISHME: || is_g4x */) { format (file, " (%d, %d, %d)", inst->bits3.dp_read_gen5.binding_table_index, inst->bits3.dp_read_gen5.msg_control, diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c index a1f403ca4e..f5abe021c4 100644 --- a/src/mesa/drivers/dri/i965/brw_draw.c +++ b/src/mesa/drivers/dri/i965/brw_draw.c @@ -145,9 +145,14 @@ static void brw_emit_prim(struct brw_context *brw, prim_packet.start_vert_location = prim->start; if (prim->indexed) prim_packet.start_vert_location += brw->ib.start_vertex_offset; + else + prim_packet.start_vert_location += brw->vb.start_vertex_bias; prim_packet.instance_count = 1; prim_packet.start_instance_location = 0; prim_packet.base_vert_location = prim->basevertex; + if (prim->indexed) + prim_packet.base_vert_location += brw->vb.start_vertex_bias; + /* If we're set to always flush, do it before and after the primitive emit. * We want to catch both missed flushes that hurt instruction/state cache @@ -155,14 +160,14 @@ static void brw_emit_prim(struct brw_context *brw, * the besides the draw code. */ if (intel->always_flush_cache) { - intel_batchbuffer_emit_mi_flush(intel->batch); + intel_batchbuffer_emit_mi_flush(intel); } if (prim_packet.verts_per_instance) { - intel_batchbuffer_data( brw->intel.batch, &prim_packet, - sizeof(prim_packet)); + intel_batchbuffer_data(&brw->intel, &prim_packet, + sizeof(prim_packet), false); } if (intel->always_flush_cache) { - intel_batchbuffer_emit_mi_flush(intel->batch); + intel_batchbuffer_emit_mi_flush(intel); } } @@ -172,13 +177,16 @@ static void brw_merge_inputs( struct brw_context *brw, struct brw_vertex_info old = brw->vb.info; GLuint i; - for (i = 0; i < VERT_ATTRIB_MAX; i++) - drm_intel_bo_unreference(brw->vb.inputs[i].bo); + for (i = 0; i < brw->vb.nr_buffers; i++) { + drm_intel_bo_unreference(brw->vb.buffers[i].bo); + brw->vb.buffers[i].bo = NULL; + } + brw->vb.nr_buffers = 0; - memset(&brw->vb.inputs, 0, sizeof(brw->vb.inputs)); memset(&brw->vb.info, 0, sizeof(brw->vb.info)); for (i = 0; i < VERT_ATTRIB_MAX; i++) { + brw->vb.inputs[i].buffer = -1; brw->vb.inputs[i].glarray = arrays[i]; brw->vb.inputs[i].attrib = (gl_vert_attrib) i; @@ -303,7 +311,6 @@ static GLboolean brw_try_draw_prims( struct gl_context *ctx, struct brw_context *brw = brw_context(ctx); GLboolean retval = GL_FALSE; GLboolean warn = GL_FALSE; - GLboolean first_time = GL_TRUE; GLuint i; if (ctx->NewState) @@ -351,13 +358,10 @@ static GLboolean brw_try_draw_prims( struct gl_context *ctx, * an upper bound of how much we might emit in a single * brw_try_draw_prims(). */ - intel_batchbuffer_require_space(intel->batch, intel->batch->size / 4); + intel_batchbuffer_require_space(intel, 1024, false); hw_prim = brw_set_prim(brw, &prim[i]); - - if (first_time || (brw->state.dirty.brw & BRW_NEW_PRIMITIVE)) { - first_time = GL_FALSE; - + if (brw->state.dirty.brw) { brw_validate_state(brw); /* Various fallback checks: */ @@ -370,7 +374,7 @@ static GLboolean brw_try_draw_prims( struct gl_context *ctx, if (dri_bufmgr_check_aperture_space(brw->state.validated_bos, brw->state.validated_bo_count)) { static GLboolean warned; - intel_batchbuffer_flush(intel->batch); + intel_batchbuffer_flush(intel); /* Validate the state after we flushed the batch (which would have * changed the set of dirty state). If we still fail to @@ -399,7 +403,7 @@ static GLboolean brw_try_draw_prims( struct gl_context *ctx, } if (intel->always_flush_batch) - intel_batchbuffer_flush(intel->batch); + intel_batchbuffer_flush(intel); out: brw_state_cache_check_size(brw); @@ -460,25 +464,32 @@ void brw_draw_init( struct brw_context *brw ) { struct gl_context *ctx = &brw->intel.ctx; struct vbo_context *vbo = vbo_context(ctx); + int i; /* Register our drawing function: */ vbo->draw_prims = brw_draw_prims; + + for (i = 0; i < VERT_ATTRIB_MAX; i++) + brw->vb.inputs[i].buffer = -1; + brw->vb.nr_buffers = 0; + brw->vb.nr_enabled = 0; } void brw_draw_destroy( struct brw_context *brw ) { int i; - if (brw->vb.upload.bo != NULL) { - drm_intel_bo_unreference(brw->vb.upload.bo); - brw->vb.upload.bo = NULL; + for (i = 0; i < brw->vb.nr_buffers; i++) { + drm_intel_bo_unreference(brw->vb.buffers[i].bo); + brw->vb.buffers[i].bo = NULL; } + brw->vb.nr_buffers = 0; - for (i = 0; i < VERT_ATTRIB_MAX; i++) { - drm_intel_bo_unreference(brw->vb.inputs[i].bo); - brw->vb.inputs[i].bo = NULL; + for (i = 0; i < brw->vb.nr_enabled; i++) { + brw->vb.enabled[i]->buffer = -1; } + brw->vb.nr_enabled = 0; drm_intel_bo_unreference(brw->ib.bo); brw->ib.bo = NULL; diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c index 405e161bdb..78885b58a8 100644 --- a/src/mesa/drivers/dri/i965/brw_draw_upload.c +++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c @@ -25,6 +25,7 @@ * **************************************************************************/ +#undef NDEBUG #include "main/glheader.h" #include "main/bufferobj.h" @@ -209,7 +210,7 @@ static GLuint get_surface_type( GLenum type, GLuint size, case GL_UNSIGNED_BYTE: return ubyte_types_scale[size]; case GL_FIXED: return float_types[size]; /* was uploaded as floats */ default: assert(0); return 0; - } + } } } @@ -227,11 +228,11 @@ static GLuint get_size( GLenum type ) case GL_UNSIGNED_SHORT: return sizeof(GLushort); case GL_UNSIGNED_BYTE: return sizeof(GLubyte); case GL_FIXED: return sizeof(GLfloat); /* will be uploaded as floats */ - default: return 0; - } + default: assert(0); return 0; + } } -static GLuint get_index_type(GLenum type) +static GLuint get_index_type(GLenum type) { switch (type) { case GL_UNSIGNED_BYTE: return BRW_INDEX_BYTE; @@ -241,60 +242,23 @@ static GLuint get_index_type(GLenum type) } } -static void wrap_buffers( struct brw_context *brw, - GLuint size ) -{ - if (size < BRW_UPLOAD_INIT_SIZE) - size = BRW_UPLOAD_INIT_SIZE; - - brw->vb.upload.offset = 0; - - if (brw->vb.upload.bo != NULL) - drm_intel_bo_unreference(brw->vb.upload.bo); - brw->vb.upload.bo = drm_intel_bo_alloc(brw->intel.bufmgr, "temporary VBO", - size, 1); -} - -static void get_space( struct brw_context *brw, - GLuint size, - drm_intel_bo **bo_return, - GLuint *offset_return ) -{ - size = ALIGN(size, 64); - - if (brw->vb.upload.bo == NULL || - brw->vb.upload.offset + size > brw->vb.upload.bo->size) { - wrap_buffers(brw, size); - } - - assert(*bo_return == NULL); - drm_intel_bo_reference(brw->vb.upload.bo); - *bo_return = brw->vb.upload.bo; - *offset_return = brw->vb.upload.offset; - brw->vb.upload.offset += size; -} - static void -copy_array_to_vbo_array( struct brw_context *brw, - struct brw_vertex_element *element, - GLuint dst_stride) +copy_array_to_vbo_array(struct brw_context *brw, + struct brw_vertex_element *element, + int min, int max, + struct brw_vertex_buffer *buffer, + GLuint dst_stride) { - GLuint size = element->count * dst_stride; - - get_space(brw, size, &element->bo, &element->offset); - - if (element->glarray->StrideB == 0) { - assert(element->count == 1); - element->stride = 0; - } else { - element->stride = dst_stride; - } + int src_stride = element->glarray->StrideB; + const unsigned char *src = element->glarray->Ptr + min * src_stride; + int count = max - min + 1; + GLuint size = count * dst_stride; /* upload as floats */ if (element->glarray->Type == GL_FIXED) { + char * const map = intel_upload_map(&brw->intel, size, dst_stride); + char *dst = map; drm_intel_bo *src_bo = NULL; - const char *src; - char *dst; GLint i, j; /* map source bo */ @@ -305,68 +269,58 @@ copy_array_to_vbo_array( struct brw_context *brw, src_bo = intel_bufferobj_buffer(&brw->intel, intel_buffer, INTEL_READ); drm_intel_gem_bo_map_gtt(src_bo); - src = (const char *) element->bo->virtual + - (unsigned long) element->glarray->Ptr; - } - else { - src = (const char *) element->glarray->Ptr; + src = (const unsigned char *) src_bo->virtual + (unsigned long) src; } - drm_intel_gem_bo_map_gtt(element->bo); - dst = (char *) element->bo->virtual + element->offset; - - for (i = 0; i < element->count; i++) { + while (count--) { const GLint *s = (GLint *) src; GLfloat *d = (GLfloat *) dst; + int i; - for (j = 0; j < element->glarray->Size; j++) - d[j] = s[j] / 65536.0f; + for (i = 0; i < element->glarray->Size; i++) + d[i] = s[i] / 65536.0f; - src += element->glarray->StrideB; - dst += dst_stride; + src += src_stride; + dst += dst_stride; } - drm_intel_gem_bo_unmap_gtt(element->bo); + intel_upload_unmap(&brw->intel, map, size, dst_stride, + &buffer->bo, &buffer->offset); + if (src_bo) drm_intel_gem_bo_unmap_gtt(src_bo); return; } - if (dst_stride == element->glarray->StrideB) { - drm_intel_gem_bo_map_gtt(element->bo); - memcpy((char *)element->bo->virtual + element->offset, - element->glarray->Ptr, size); - drm_intel_gem_bo_unmap_gtt(element->bo); + if (dst_stride == src_stride) { + intel_upload_data(&brw->intel, src, size, dst_stride, + &buffer->bo, &buffer->offset); } else { - char *dest; - const unsigned char *src = element->glarray->Ptr; - int i; - - drm_intel_gem_bo_map_gtt(element->bo); - dest = element->bo->virtual; - dest += element->offset; - - for (i = 0; i < element->count; i++) { - memcpy(dest, src, dst_stride); - src += element->glarray->StrideB; - dest += dst_stride; - } + char * const map = intel_upload_map(&brw->intel, size, dst_stride); + char *dst = map; - drm_intel_gem_bo_unmap_gtt(element->bo); + while (count--) { + memcpy(dst, src, dst_stride); + src += src_stride; + dst += dst_stride; + } + intel_upload_unmap(&brw->intel, map, size, dst_stride, + &buffer->bo, &buffer->offset); } + buffer->stride = dst_stride; } static void brw_prepare_vertices(struct brw_context *brw) { struct gl_context *ctx = &brw->intel.ctx; struct intel_context *intel = intel_context(ctx); - GLbitfield vs_inputs = brw->vs.prog_data->inputs_read; - GLuint i; + GLbitfield vs_inputs = brw->vs.prog_data->inputs_read; const unsigned char *ptr = NULL; - GLuint interleave = 0; + GLuint interleaved = 0, total_size = 0; unsigned int min_index = brw->vb.min_index; unsigned int max_index = brw->vb.max_index; + int delta, i, j; struct brw_vertex_element *upload[VERT_ATTRIB_MAX]; GLuint nr_uploads = 0; @@ -379,13 +333,20 @@ static void brw_prepare_vertices(struct brw_context *brw) /* Accumulate the list of enabled arrays. */ brw->vb.nr_enabled = 0; while (vs_inputs) { - GLuint i = _mesa_ffsll(vs_inputs) - 1; + GLuint i = ffs(vs_inputs) - 1; struct brw_vertex_element *input = &brw->vb.inputs[i]; vs_inputs &= ~(1 << i); - brw->vb.enabled[brw->vb.nr_enabled++] = input; + if (input->glarray->Size && get_size(input->glarray->Type)) + brw->vb.enabled[brw->vb.nr_enabled++] = input; } + if (brw->vb.nr_enabled == 0) + return; + + if (brw->vb.nr_buffers) + goto validate; + /* XXX: In the rare cases where this happens we fallback all * the way to software rasterization, although a tnl fallback * would be sufficient. I don't know of *any* real world @@ -397,24 +358,44 @@ static void brw_prepare_vertices(struct brw_context *brw) return; } - for (i = 0; i < brw->vb.nr_enabled; i++) { + for (i = j = 0; i < brw->vb.nr_enabled; i++) { struct brw_vertex_element *input = brw->vb.enabled[i]; + const struct gl_client_array *glarray = input->glarray; + int type_size = get_size(glarray->Type); - input->element_size = get_size(input->glarray->Type) * input->glarray->Size; + input->element_size = type_size * glarray->Size; - if (_mesa_is_bufferobj(input->glarray->BufferObj) && - input->glarray->Type != GL_FIXED) { + if (_mesa_is_bufferobj(glarray->BufferObj) && + glarray->Type != GL_FIXED) { struct intel_buffer_object *intel_buffer = - intel_buffer_object(input->glarray->BufferObj); - - /* Named buffer object: Just reference its contents directly. */ - drm_intel_bo_unreference(input->bo); - input->bo = intel_bufferobj_buffer(intel, intel_buffer, - INTEL_READ); - drm_intel_bo_reference(input->bo); - input->offset = (unsigned long)input->glarray->Ptr; - input->stride = input->glarray->StrideB; - input->count = input->glarray->_MaxElement; + intel_buffer_object(glarray->BufferObj); + int k; + + for (k = 0; k < i; k++) { + const struct gl_client_array *other = brw->vb.enabled[k]->glarray; + if (glarray->BufferObj == other->BufferObj && + glarray->StrideB == other->StrideB && + (uintptr_t)(glarray->Ptr - other->Ptr) < glarray->StrideB) + { + input->buffer = brw->vb.enabled[k]->buffer; + input->offset = glarray->Ptr - other->Ptr; + break; + } + } + if (k == i) { + struct brw_vertex_buffer *buffer = &brw->vb.buffers[j]; + + /* Named buffer object: Just reference its contents directly. */ + buffer->bo = intel_bufferobj_source(intel, + intel_buffer, type_size, + &buffer->offset); + drm_intel_bo_reference(buffer->bo); + buffer->offset += (uintptr_t)glarray->Ptr; + buffer->stride = glarray->StrideB; + + input->buffer = j++; + input->offset = 0; + } /* This is a common place to reach if the user mistakenly supplies * a pointer in place of a VBO offset. If we just let it go through, @@ -428,71 +409,170 @@ static void brw_prepare_vertices(struct brw_context *brw) * probably a service to the poor programmer to do so rather than * trying to just not render. */ - assert(input->offset < input->bo->size); + assert(input->offset < brw->vb.buffers[input->buffer].bo->size); } else { - input->count = input->glarray->StrideB ? max_index + 1 : 1; - if (input->bo != NULL) { - /* Already-uploaded vertex data is present from a previous - * prepare_vertices, but we had to re-validate state due to - * check_aperture failing and a new batch being produced. - */ - continue; - } - /* Queue the buffer object up to be uploaded in the next pass, * when we've decided if we're doing interleaved or not. */ - if (input->attrib == VERT_ATTRIB_POS) { + if (nr_uploads == 0) { /* Position array not properly enabled: */ - if (input->glarray->StrideB == 0) { + if (input->attrib == VERT_ATTRIB_POS && glarray->StrideB == 0) { intel->Fallback = GL_TRUE; /* boolean, not bitfield */ return; } - interleave = input->glarray->StrideB; - ptr = input->glarray->Ptr; + interleaved = glarray->StrideB; + ptr = glarray->Ptr; + } + else if (interleaved != glarray->StrideB || + (uintptr_t)(glarray->Ptr - ptr) > interleaved) + { + interleaved = 0; } - else if (interleave != input->glarray->StrideB || - (const unsigned char *)input->glarray->Ptr - ptr < 0 || - (const unsigned char *)input->glarray->Ptr - ptr > interleave) + else if ((uintptr_t)(glarray->Ptr - ptr) & (type_size -1)) { - interleave = 0; + /* enforce natural alignment (for doubles) */ + interleaved = 0; } upload[nr_uploads++] = input; + total_size = ALIGN(total_size, type_size); + total_size += input->element_size; } } + /* If we need to upload all the arrays, then we can trim those arrays to + * only the used elements [min_index, max_index] so long as we adjust all + * the values used in the 3DPRIMITIVE i.e. by setting the vertex bias. + */ + brw->vb.start_vertex_bias = 0; + delta = min_index; + if (nr_uploads == brw->vb.nr_enabled) { + brw->vb.start_vertex_bias = -delta; + delta = 0; + } + if (delta && !brw->intel.intelScreen->relaxed_relocations) + min_index = delta = 0; + /* Handle any arrays to be uploaded. */ - if (nr_uploads > 1 && interleave && interleave <= 256) { - /* All uploads are interleaved, so upload the arrays together as - * interleaved. First, upload the contents and set up upload[0]. - */ - copy_array_to_vbo_array(brw, upload[0], interleave); - - for (i = 1; i < nr_uploads; i++) { - /* Then, just point upload[i] at upload[0]'s buffer. */ - upload[i]->stride = interleave; - upload[i]->offset = upload[0]->offset + - ((const unsigned char *)upload[i]->glarray->Ptr - ptr); - upload[i]->bo = upload[0]->bo; - drm_intel_bo_reference(upload[i]->bo); + if (nr_uploads > 1) { + if (interleaved && interleaved <= 2*total_size) { + struct brw_vertex_buffer *buffer = &brw->vb.buffers[j]; + /* All uploads are interleaved, so upload the arrays together as + * interleaved. First, upload the contents and set up upload[0]. + */ + copy_array_to_vbo_array(brw, upload[0], min_index, max_index, + buffer, interleaved); + buffer->offset -= delta * interleaved; + + for (i = 0; i < nr_uploads; i++) { + /* Then, just point upload[i] at upload[0]'s buffer. */ + upload[i]->offset = + ((const unsigned char *)upload[i]->glarray->Ptr - ptr); + upload[i]->buffer = j; + } + j++; + + nr_uploads = 0; } - } - else { - /* Upload non-interleaved arrays */ - for (i = 0; i < nr_uploads; i++) { - copy_array_to_vbo_array(brw, upload[i], upload[i]->element_size); + else if (total_size < 2048) { + /* Upload non-interleaved arrays into a single interleaved array */ + struct brw_vertex_buffer *buffer; + int count = max_index - min_index + 1; + int offset; + char *map; + + map = intel_upload_map(&brw->intel, total_size * count, total_size); + for (i = offset = 0; i < nr_uploads; i++) { + const unsigned char *src = upload[i]->glarray->Ptr; + int size = upload[i]->element_size; + int stride = upload[i]->glarray->StrideB; + char *dst; + int n; + + offset = ALIGN(offset, get_size(upload[i]->glarray->Type)); + dst = map + offset; + src += min_index * stride; + + if (upload[i]->glarray->Type == GL_FIXED) { + for (n = 0; n < count; n++) { + const GLint *s = (GLint *) src; + GLfloat *d = (GLfloat *) dst; + int k; + + for (k = 0; k < upload[i]->glarray->Size; k++) { + d[k] = s[k] / 65536.0f; + } + + src += stride; + dst += total_size; + } + } + else { + for (n = 0; n < count; n++) { + memcpy(dst, src, size); + src += stride; + dst += total_size; + } + } + + upload[i]->offset = offset; + upload[i]->buffer = j; + + offset += size; + } + assert(offset == total_size); + buffer = &brw->vb.buffers[j++]; + intel_upload_unmap(&brw->intel, map, offset * count, offset, + &buffer->bo, &buffer->offset); + buffer->stride = offset; + buffer->offset -= delta * offset; + + nr_uploads = 0; } } + /* Upload non-interleaved arrays */ + for (i = 0; i < nr_uploads; i++) { + struct brw_vertex_buffer *buffer = &brw->vb.buffers[j]; + copy_array_to_vbo_array(brw, upload[i], min_index, max_index, + buffer, upload[i]->element_size); + buffer->offset -= delta * buffer->stride; + upload[i]->buffer = j++; + upload[i]->offset = 0; + } - brw_prepare_query_begin(brw); + /* can we simply extend the current vb? */ + if (j == brw->vb.nr_current_buffers) { + int delta = 0; + for (i = 0; i < j; i++) { + int d; + + if (brw->vb.current_buffers[i].handle != brw->vb.buffers[i].bo->handle || + brw->vb.current_buffers[i].stride != brw->vb.buffers[i].stride) + break; + + d = brw->vb.buffers[i].offset - brw->vb.current_buffers[i].offset; + if (i == 0) + delta = d / brw->vb.current_buffers[i].stride; + if (delta * brw->vb.current_buffers[i].stride != d) + break; + } - for (i = 0; i < brw->vb.nr_enabled; i++) { - struct brw_vertex_element *input = brw->vb.enabled[i]; + if (i == j) { + brw->vb.start_vertex_bias += delta; + while (--j >= 0) + drm_intel_bo_unreference(brw->vb.buffers[j].bo); + j = 0; + } + } - brw_add_validated_bo(brw, input->bo); + brw->vb.nr_buffers = j; + +validate: + brw_prepare_query_begin(brw); + for (i = 0; i < brw->vb.nr_buffers; i++) { + brw_add_validated_bo(brw, brw->vb.buffers[i].bo); } } @@ -529,49 +609,44 @@ static void brw_emit_vertices(struct brw_context *brw) (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) | (BRW_VE1_COMPONENT_STORE_1_FLT << BRW_VE1_COMPONENT_3_SHIFT)); - ADVANCE_BATCH(); + CACHED_BATCH(); return; } /* Now emit VB and VEP state packets. - * - * This still defines a hardware VB for each input, even if they - * are interleaved or from the same VBO. TBD if this makes a - * performance difference. */ - BEGIN_BATCH(1 + brw->vb.nr_enabled * 4); - OUT_BATCH((CMD_VERTEX_BUFFER << 16) | - ((1 + brw->vb.nr_enabled * 4) - 2)); - for (i = 0; i < brw->vb.nr_enabled; i++) { - struct brw_vertex_element *input = brw->vb.enabled[i]; - uint32_t dw0; + if (brw->vb.nr_buffers) { + BEGIN_BATCH(1 + 4*brw->vb.nr_buffers); + OUT_BATCH((CMD_VERTEX_BUFFER << 16) | (4*brw->vb.nr_buffers - 1)); + for (i = 0; i < brw->vb.nr_buffers; i++) { + struct brw_vertex_buffer *buffer = &brw->vb.buffers[i]; + uint32_t dw0; + + if (intel->gen >= 6) { + dw0 = GEN6_VB0_ACCESS_VERTEXDATA | (i << GEN6_VB0_INDEX_SHIFT); + } else { + dw0 = BRW_VB0_ACCESS_VERTEXDATA | (i << BRW_VB0_INDEX_SHIFT); + } - if (intel->gen >= 6) { - dw0 = GEN6_VB0_ACCESS_VERTEXDATA | - (i << GEN6_VB0_INDEX_SHIFT); - } else { - dw0 = BRW_VB0_ACCESS_VERTEXDATA | - (i << BRW_VB0_INDEX_SHIFT); + OUT_BATCH(dw0 | (buffer->stride << BRW_VB0_PITCH_SHIFT)); + OUT_RELOC(buffer->bo, I915_GEM_DOMAIN_VERTEX, 0, buffer->offset); + if (intel->gen >= 5) { + OUT_RELOC(buffer->bo, I915_GEM_DOMAIN_VERTEX, 0, buffer->bo->size - 1); + } else + OUT_BATCH(buffer->bo->size / buffer->stride); + OUT_BATCH(0); /* Instance data step rate */ + + brw->vb.current_buffers[i].handle = buffer->bo->handle; + brw->vb.current_buffers[i].offset = buffer->offset; + brw->vb.current_buffers[i].stride = buffer->stride; } - - OUT_BATCH(dw0 | - (input->stride << BRW_VB0_PITCH_SHIFT)); - OUT_RELOC(input->bo, - I915_GEM_DOMAIN_VERTEX, 0, - input->offset); - if (intel->gen >= 5) { - OUT_RELOC(input->bo, - I915_GEM_DOMAIN_VERTEX, 0, - input->bo->size - 1); - } else - OUT_BATCH(input->stride ? input->count : 0); - OUT_BATCH(0); /* Instance data step rate */ + brw->vb.nr_current_buffers = i; + ADVANCE_BATCH(); } - ADVANCE_BATCH(); BEGIN_BATCH(1 + brw->vb.nr_enabled * 2); - OUT_BATCH((CMD_VERTEX_ELEMENT << 16) | ((1 + brw->vb.nr_enabled * 2) - 2)); + OUT_BATCH((CMD_VERTEX_ELEMENT << 16) | (2*brw->vb.nr_enabled - 1)); for (i = 0; i < brw->vb.nr_enabled; i++) { struct brw_vertex_element *input = brw->vb.enabled[i]; uint32_t format = get_surface_type(input->glarray->Type, @@ -592,15 +667,15 @@ static void brw_emit_vertices(struct brw_context *brw) } if (intel->gen >= 6) { - OUT_BATCH((i << GEN6_VE0_INDEX_SHIFT) | + OUT_BATCH((input->buffer << GEN6_VE0_INDEX_SHIFT) | GEN6_VE0_VALID | (format << BRW_VE0_FORMAT_SHIFT) | - (0 << BRW_VE0_SRC_OFFSET_SHIFT)); + (input->offset << BRW_VE0_SRC_OFFSET_SHIFT)); } else { - OUT_BATCH((i << BRW_VE0_INDEX_SHIFT) | + OUT_BATCH((input->buffer << BRW_VE0_INDEX_SHIFT) | BRW_VE0_VALID | (format << BRW_VE0_FORMAT_SHIFT) | - (0 << BRW_VE0_SRC_OFFSET_SHIFT)); + (input->offset << BRW_VE0_SRC_OFFSET_SHIFT)); } if (intel->gen >= 5) @@ -615,7 +690,7 @@ static void brw_emit_vertices(struct brw_context *brw) (comp3 << BRW_VE1_COMPONENT_3_SHIFT) | ((i * 4) << BRW_VE1_DST_OFFSET_SHIFT)); } - ADVANCE_BATCH(); + CACHED_BATCH(); } const struct brw_tracked_state brw_vertices = { @@ -644,25 +719,19 @@ static void brw_prepare_indices(struct brw_context *brw) ib_type_size = get_size(index_buffer->type); ib_size = ib_type_size * index_buffer->count; - bufferobj = index_buffer->obj;; + bufferobj = index_buffer->obj; /* Turn into a proper VBO: */ if (!_mesa_is_bufferobj(bufferobj)) { - brw->ib.start_vertex_offset = 0; /* Get new bufferobj, offset: */ - get_space(brw, ib_size, &bo, &offset); - - /* Straight upload - */ - drm_intel_gem_bo_map_gtt(bo); - memcpy((char *)bo->virtual + offset, index_buffer->ptr, ib_size); - drm_intel_gem_bo_unmap_gtt(bo); + intel_upload_data(&brw->intel, index_buffer->ptr, ib_size, ib_type_size, + &bo, &offset); + brw->ib.start_vertex_offset = offset / ib_type_size; } else { offset = (GLuint) (unsigned long) index_buffer->ptr; - brw->ib.start_vertex_offset = 0; /* If the index buffer isn't aligned to its element size, we have to * rebase it into a temporary. @@ -674,41 +743,42 @@ static void brw_prepare_indices(struct brw_context *brw) bufferobj); map += offset; - get_space(brw, ib_size, &bo, &offset); - - drm_intel_bo_subdata(bo, offset, ib_size, map); + intel_upload_data(&brw->intel, map, ib_size, ib_type_size, + &bo, &offset); + brw->ib.start_vertex_offset = offset / ib_type_size; ctx->Driver.UnmapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER_ARB, bufferobj); } else { - bo = intel_bufferobj_buffer(intel, intel_buffer_object(bufferobj), - INTEL_READ); - drm_intel_bo_reference(bo); - /* Use CMD_3D_PRIM's start_vertex_offset to avoid re-uploading * the index buffer state when we're just moving the start index * of our drawing. */ brw->ib.start_vertex_offset = offset / ib_type_size; - offset = 0; - ib_size = bo->size; + + bo = intel_bufferobj_source(intel, + intel_buffer_object(bufferobj), + ib_type_size, + &offset); + drm_intel_bo_reference(bo); + + brw->ib.start_vertex_offset += offset / ib_type_size; } } - if (brw->ib.bo != bo || - brw->ib.offset != offset || - brw->ib.size != ib_size) - { + if (brw->ib.bo != bo) { drm_intel_bo_unreference(brw->ib.bo); brw->ib.bo = bo; - brw->ib.offset = offset; - brw->ib.size = ib_size; + brw_add_validated_bo(brw, brw->ib.bo); brw->state.dirty.brw |= BRW_NEW_INDEX_BUFFER; } else { drm_intel_bo_unreference(bo); } - brw_add_validated_bo(brw, brw->ib.bo); + if (index_buffer->type != brw->ib.type) { + brw->ib.type = index_buffer->type; + brw->state.dirty.brw |= BRW_NEW_INDEX_BUFFER; + } } const struct brw_tracked_state brw_indices = { @@ -728,29 +798,18 @@ static void brw_emit_index_buffer(struct brw_context *brw) if (index_buffer == NULL) return; - /* Emit the indexbuffer packet: - */ - { - struct brw_indexbuffer ib; - - memset(&ib, 0, sizeof(ib)); - - ib.header.bits.opcode = CMD_INDEX_BUFFER; - ib.header.bits.length = sizeof(ib)/4 - 2; - ib.header.bits.index_format = get_index_type(index_buffer->type); - ib.header.bits.cut_index_enable = 0; - - BEGIN_BATCH(4); - OUT_BATCH( ib.header.dword ); - OUT_RELOC(brw->ib.bo, - I915_GEM_DOMAIN_VERTEX, 0, - brw->ib.offset); - OUT_RELOC(brw->ib.bo, - I915_GEM_DOMAIN_VERTEX, 0, - brw->ib.offset + brw->ib.size - 1); - OUT_BATCH( 0 ); - ADVANCE_BATCH(); - } + BEGIN_BATCH(3); + OUT_BATCH(CMD_INDEX_BUFFER << 16 | + /* cut index enable << 10 */ + get_index_type(index_buffer->type) << 8 | + 1); + OUT_RELOC(brw->ib.bo, + I915_GEM_DOMAIN_VERTEX, 0, + 0); + OUT_RELOC(brw->ib.bo, + I915_GEM_DOMAIN_VERTEX, 0, + brw->ib.bo->size - 1); + ADVANCE_BATCH(); } const struct brw_tracked_state brw_index_buffer = { diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h index 4dbdc52210..119ffc7237 100644 --- a/src/mesa/drivers/dri/i965/brw_eu.h +++ b/src/mesa/drivers/dri/i965/brw_eu.h @@ -861,7 +861,8 @@ void brw_fb_WRITE(struct brw_compile *p, GLuint binding_table_index, GLuint msg_length, GLuint response_length, - GLboolean eot); + GLboolean eot, + GLboolean header_present); void brw_SAMPLE(struct brw_compile *p, struct brw_reg dest, diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c index f62fc7ebfb..88131c432e 100644 --- a/src/mesa/drivers/dri/i965/brw_eu_emit.c +++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c @@ -536,6 +536,16 @@ brw_set_dp_read_message(struct brw_context *brw, insn->bits3.dp_read_gen5.end_of_thread = 0; insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_READ; insn->bits2.send_gen5.end_of_thread = 0; + } else if (intel->is_g4x) { + insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/ + insn->bits3.dp_read_g4x.msg_control = msg_control; /*8:10*/ + insn->bits3.dp_read_g4x.msg_type = msg_type; /*11:13*/ + insn->bits3.dp_read_g4x.target_cache = target_cache; /*14:15*/ + insn->bits3.dp_read_g4x.response_length = response_length; /*16:19*/ + insn->bits3.dp_read_g4x.msg_length = msg_length; /*20:23*/ + insn->bits3.dp_read_g4x.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/ + insn->bits3.dp_read_g4x.pad1 = 0; + insn->bits3.dp_read_g4x.end_of_thread = 0; } else { insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/ insn->bits3.dp_read.msg_control = msg_control; /*8:11*/ @@ -1708,29 +1718,22 @@ void brw_dp_READ_4_vs(struct brw_compile *p, GLuint location, GLuint bind_table_index) { + struct intel_context *intel = &p->brw->intel; struct brw_instruction *insn; GLuint msg_reg_nr = 1; - struct brw_reg b; - /* - printf("vs const read msg, location %u, msg_reg_nr %d\n", - location, msg_reg_nr); - */ + if (intel->gen >= 6) + location /= 16; /* Setup MRF[1] with location/offset into const buffer */ brw_push_insn_state(p); + brw_set_access_mode(p, BRW_ALIGN_1); brw_set_compression_control(p, BRW_COMPRESSION_NONE); brw_set_mask_control(p, BRW_MASK_DISABLE); brw_set_predicate_control(p, BRW_PREDICATE_NONE); - - /* XXX I think we're setting all the dwords of MRF[1] to 'location'. - * when the docs say only dword[2] should be set. Hmmm. But it works. - */ - b = brw_message_reg(msg_reg_nr); - b = retype(b, BRW_REGISTER_TYPE_UD); - /*b = get_element_ud(b, 2);*/ - brw_MOV(p, b, brw_imm_ud(location)); - + brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 2), + BRW_REGISTER_TYPE_UD), + brw_imm_ud(location)); brw_pop_insn_state(p); insn = next_insn(p, BRW_OPCODE_SEND); @@ -1741,7 +1744,11 @@ void brw_dp_READ_4_vs(struct brw_compile *p, insn->header.mask_control = BRW_MASK_DISABLE; brw_set_dest(p, insn, dest); - brw_set_src0(insn, brw_null_reg()); + if (intel->gen >= 6) { + brw_set_src0(insn, brw_message_reg(msg_reg_nr)); + } else { + brw_set_src0(insn, brw_null_reg()); + } brw_set_dp_read_message(p->brw, insn, @@ -1768,6 +1775,7 @@ void brw_dp_READ_4_vs_relative(struct brw_compile *p, /* Setup MRF[1] with offset into const buffer */ brw_push_insn_state(p); + brw_set_access_mode(p, BRW_ALIGN_1); brw_set_compression_control(p, BRW_COMPRESSION_NONE); brw_set_mask_control(p, BRW_MASK_DISABLE); brw_set_predicate_control(p, BRW_PREDICATE_NONE); @@ -1775,7 +1783,7 @@ void brw_dp_READ_4_vs_relative(struct brw_compile *p, /* M1.0 is block offset 0, M1.4 is block offset 1, all other * fields ignored. */ - brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), + brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_D), addr_reg, brw_imm_d(offset)); brw_pop_insn_state(p); @@ -1816,12 +1824,12 @@ void brw_fb_WRITE(struct brw_compile *p, GLuint binding_table_index, GLuint msg_length, GLuint response_length, - GLboolean eot) + GLboolean eot, + GLboolean header_present) { struct intel_context *intel = &p->brw->intel; struct brw_instruction *insn; GLuint msg_control, msg_type; - GLboolean header_present = GL_TRUE; if (intel->gen >= 6 && binding_table_index == 0) { insn = next_insn(p, BRW_OPCODE_SENDC); @@ -1833,9 +1841,6 @@ void brw_fb_WRITE(struct brw_compile *p, insn->header.compression_control = BRW_COMPRESSION_NONE; if (intel->gen >= 6) { - if (msg_length == 4) - header_present = GL_FALSE; - /* headerless version, just submit color payload */ src0 = brw_message_reg(msg_reg_nr); @@ -1940,7 +1945,8 @@ void brw_SAMPLE(struct brw_compile *p, brw_set_compression_control(p, BRW_COMPRESSION_NONE); brw_set_mask_control(p, BRW_MASK_DISABLE); - brw_MOV(p, m1, brw_vec8_grf(0,0)); + brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD), + retype(brw_vec8_grf(0,0), BRW_REGISTER_TYPE_UD)); brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12)); brw_pop_insn_state(p); @@ -2001,7 +2007,8 @@ void brw_SAMPLE(struct brw_compile *p, */ brw_push_insn_state(p); brw_set_compression_control(p, BRW_COMPRESSION_NONE); - brw_MOV(p, reg, reg); + brw_MOV(p, retype(reg, BRW_REGISTER_TYPE_UD), + retype(reg, BRW_REGISTER_TYPE_UD)); brw_pop_insn_state(p); } @@ -2033,7 +2040,8 @@ void brw_urb_WRITE(struct brw_compile *p, if (intel->gen >= 6) { brw_push_insn_state(p); brw_set_mask_control( p, BRW_MASK_DISABLE ); - brw_MOV(p, brw_message_reg(msg_reg_nr), src0); + brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD), + retype(src0, BRW_REGISTER_TYPE_UD)); brw_pop_insn_state(p); src0 = brw_message_reg(msg_reg_nr); } diff --git a/src/mesa/drivers/dri/i965/brw_fallback.c b/src/mesa/drivers/dri/i965/brw_fallback.c index 6796fb208d..d0b0c22abf 100644 --- a/src/mesa/drivers/dri/i965/brw_fallback.c +++ b/src/mesa/drivers/dri/i965/brw_fallback.c @@ -36,8 +36,6 @@ #include "swrast/swrast.h" #include "tnl/tnl.h" #include "brw_context.h" -#include "intel_fbo.h" -#include "intel_regions.h" #define FILE_DEBUG_FLAG DEBUG_FALLBACKS @@ -63,49 +61,14 @@ static GLboolean do_check_fallback(struct brw_context *brw) for (i = 0; i < BRW_MAX_TEX_UNIT; i++) { struct gl_texture_unit *texUnit = &ctx->Texture.Unit[i]; if (texUnit->_ReallyEnabled) { - struct intel_texture_object *intelObj = intel_texture_object(texUnit->_Current); - struct gl_texture_image *texImage = intelObj->base.Image[0][intelObj->firstLevel]; + struct gl_texture_object *tex_obj = texUnit->_Current; + struct gl_texture_image *texImage = tex_obj->Image[0][tex_obj->BaseLevel]; if (texImage->Border) { DBG("FALLBACK: texture border\n"); return GL_TRUE; } } } - - /* _NEW_STENCIL - */ - if (ctx->Stencil._Enabled && - (ctx->DrawBuffer->Name == 0 && !brw->intel.hw_stencil)) { - DBG("FALLBACK: stencil\n"); - return GL_TRUE; - } - - /* _NEW_BUFFERS */ - if (!brw->has_surface_tile_offset) { - for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) { - struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i]; - struct intel_renderbuffer *irb = intel_renderbuffer(rb); - - /* The original gen4 hardware couldn't set up WM surfaces pointing - * at an offset within a tile, which can happen when rendering to - * anything but the base level of a texture or the +X face/0 depth. - * This was fixed with the 4 Series hardware. - * - * For these original chips, you would have to make the depth and - * color destination surfaces include information on the texture - * type, LOD, face, and various limits to use them as a destination. - * I would have done this, but there's also a nasty requirement that - * the depth and the color surfaces all be of the same LOD, which - * may be a worse requirement than this alignment. (Also, we may - * want to just demote the texture to untiled, instead). - */ - if (irb->region && irb->region->tiling != I915_TILING_NONE && - (irb->region->draw_offset & 4095)) { - DBG("FALLBACK: non-tile-aligned destination for tiled FBO\n"); - return GL_TRUE; - } - } - } return GL_FALSE; } @@ -117,7 +80,7 @@ static void check_fallback(struct brw_context *brw) const struct brw_tracked_state brw_check_fallback = { .dirty = { - .mesa = _NEW_BUFFERS | _NEW_RENDERMODE | _NEW_TEXTURE | _NEW_STENCIL, + .mesa = _NEW_RENDERMODE | _NEW_TEXTURE | _NEW_STENCIL, .brw = 0, .cache = 0 }, diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 6bb195b487..2c997b4eb3 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -41,13 +41,13 @@ extern "C" { #include "brw_context.h" #include "brw_eu.h" #include "brw_wm.h" -#include "talloc.h" } #include "brw_fs.h" #include "../glsl/glsl_types.h" #include "../glsl/ir_optimization.h" #include "../glsl/ir_print_visitor.h" +#define MAX_INSTRUCTION (1 << 30) static struct brw_reg brw_reg_from_fs_reg(class fs_reg *reg); struct gl_shader * @@ -55,7 +55,7 @@ brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type) { struct brw_shader *shader; - shader = talloc_zero(NULL, struct brw_shader); + shader = rzalloc(NULL, struct brw_shader); if (shader) { shader->base.Type = type; shader->base.Name = name; @@ -69,7 +69,7 @@ struct gl_shader_program * brw_new_shader_program(struct gl_context *ctx, GLuint name) { struct brw_shader_program *prog; - prog = talloc_zero(NULL, struct brw_shader_program); + prog = rzalloc(NULL, struct brw_shader_program); if (prog) { prog->base.Name = name; _mesa_init_shader_program(ctx, &prog->base); @@ -89,14 +89,17 @@ brw_compile_shader(struct gl_context *ctx, struct gl_shader *shader) GLboolean brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog) { + struct brw_context *brw = brw_context(ctx); + struct intel_context *intel = &brw->intel; + struct brw_shader *shader = (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; if (shader != NULL) { - void *mem_ctx = talloc_new(NULL); + void *mem_ctx = ralloc_context(NULL); bool progress; if (shader->ir) - talloc_free(shader->ir); + ralloc_free(shader->ir); shader->ir = new(shader) exec_list; clone_ir_list(mem_ctx, shader->ir, shader->base.ir); @@ -107,8 +110,24 @@ brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog) SUB_TO_ADD_NEG | EXP_TO_EXP2 | LOG_TO_LOG2); + + /* Pre-gen6 HW can only nest if-statements 16 deep. Beyond this, + * if-statements need to be flattened. + */ + if (intel->gen < 6) + lower_if_to_cond_assign(shader->ir, 16); + do_lower_texture_projection(shader->ir); + do_vec_index_to_cond_assign(shader->ir); brw_do_cubemap_normalize(shader->ir); + lower_noise(shader->ir); + lower_quadop_vector(shader->ir, false); + lower_variable_index_to_cond_assign(shader->ir, + GL_TRUE, /* input */ + GL_TRUE, /* output */ + GL_TRUE, /* temp */ + GL_TRUE /* uniform */ + ); do { progress = false; @@ -123,22 +142,12 @@ brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog) ) || progress; progress = do_common_optimization(shader->ir, true, 32) || progress; - - progress = lower_noise(shader->ir) || progress; - progress = - lower_variable_index_to_cond_assign(shader->ir, - GL_TRUE, /* input */ - GL_TRUE, /* output */ - GL_TRUE, /* temp */ - GL_TRUE /* uniform */ - ) || progress; - progress = lower_quadop_vector(shader->ir, false) || progress; } while (progress); validate_ir_tree(shader->ir); reparent_ir(shader->ir, shader->ir); - talloc_free(mem_ctx); + ralloc_free(mem_ctx); } if (!_mesa_ir_link_shader(ctx, prog)) @@ -202,6 +211,7 @@ fs_visitor::implied_mrf_writes(fs_inst *inst) return 2; case FS_OPCODE_TEX: case FS_OPCODE_TXB: + case FS_OPCODE_TXD: case FS_OPCODE_TXL: return 1; case FS_OPCODE_FB_WRITE: @@ -225,8 +235,8 @@ fs_visitor::virtual_grf_alloc(int size) virtual_grf_array_size = 16; else virtual_grf_array_size *= 2; - virtual_grf_sizes = talloc_realloc(mem_ctx, virtual_grf_sizes, - int, virtual_grf_array_size); + virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int, + virtual_grf_array_size); /* This slot is always unused. */ virtual_grf_sizes[0] = 0; @@ -304,7 +314,6 @@ int fs_visitor::setup_uniform_values(int loc, const glsl_type *type) { unsigned int offset = 0; - float *vec_values; if (type->is_matrix()) { const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT, @@ -323,7 +332,6 @@ fs_visitor::setup_uniform_values(int loc, const glsl_type *type) case GLSL_TYPE_UINT: case GLSL_TYPE_INT: case GLSL_TYPE_BOOL: - vec_values = fp->Base.Parameters->ParameterValues[loc]; for (unsigned int i = 0; i < type->vector_elements; i++) { unsigned int param = c->prog_data.nr_params++; @@ -347,8 +355,8 @@ fs_visitor::setup_uniform_values(int loc, const glsl_type *type) c->prog_data.param_convert[param] = PARAM_NO_CONVERT; break; } - - c->prog_data.param[param] = &vec_values[i]; + this->param_index[param] = loc; + this->param_offset[param] = i; } return 1; @@ -419,7 +427,6 @@ fs_visitor::setup_builtin_uniform_values(ir_variable *ir) */ int index = _mesa_add_state_reference(this->fp->Base.Parameters, (gl_state_index *)tokens); - float *vec_values = this->fp->Base.Parameters->ParameterValues[index]; /* Add each of the unique swizzles of the element as a * parameter. This'll end up matching the expected layout of @@ -434,7 +441,9 @@ fs_visitor::setup_builtin_uniform_values(ir_variable *ir) c->prog_data.param_convert[c->prog_data.nr_params] = PARAM_NO_CONVERT; - c->prog_data.param[c->prog_data.nr_params++] = &vec_values[swiz]; + this->param_index[c->prog_data.nr_params] = index; + this->param_offset[c->prog_data.nr_params] = swiz; + c->prog_data.nr_params++; } } } @@ -474,8 +483,13 @@ fs_visitor::emit_fragcoord_interpolation(ir_variable *ir) wpos.reg_offset++; /* gl_FragCoord.z */ - emit(fs_inst(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y, - interp_reg(FRAG_ATTRIB_WPOS, 2))); + if (intel->gen >= 6) { + emit(fs_inst(BRW_OPCODE_MOV, wpos, + fs_reg(brw_vec8_grf(c->source_depth_reg, 0)))); + } else { + emit(fs_inst(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y, + interp_reg(FRAG_ATTRIB_WPOS, 2))); + } wpos.reg_offset++; /* gl_FragCoord.w: Already set up in emit_interpolation */ @@ -518,25 +532,40 @@ fs_visitor::emit_general_interpolation(ir_variable *ir) continue; } - for (unsigned int c = 0; c < type->vector_elements; c++) { - struct brw_reg interp = interp_reg(location, c); - emit(fs_inst(FS_OPCODE_LINTERP, - attr, - this->delta_x, - this->delta_y, - fs_reg(interp))); - attr.reg_offset++; - } - - if (intel->gen < 6) { - attr.reg_offset -= type->vector_elements; + if (c->key.flat_shade && (location == FRAG_ATTRIB_COL0 || + location == FRAG_ATTRIB_COL1)) { + /* Constant interpolation (flat shading) case. The SF has + * handed us defined values in only the constant offset + * field of the setup reg. + */ for (unsigned int c = 0; c < type->vector_elements; c++) { - emit(fs_inst(BRW_OPCODE_MUL, - attr, + struct brw_reg interp = interp_reg(location, c); + interp = suboffset(interp, 3); + emit(fs_inst(FS_OPCODE_CINTERP, attr, fs_reg(interp))); + attr.reg_offset++; + } + } else { + /* Perspective interpolation case. */ + for (unsigned int c = 0; c < type->vector_elements; c++) { + struct brw_reg interp = interp_reg(location, c); + emit(fs_inst(FS_OPCODE_LINTERP, attr, - this->pixel_w)); + this->delta_x, + this->delta_y, + fs_reg(interp))); attr.reg_offset++; } + + if (intel->gen < 6) { + attr.reg_offset -= type->vector_elements; + for (unsigned int c = 0; c < type->vector_elements; c++) { + emit(fs_inst(BRW_OPCODE_MUL, + attr, + attr, + this->pixel_w)); + attr.reg_offset++; + } + } } location++; } @@ -631,14 +660,18 @@ fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1) assert(opcode == FS_OPCODE_POW); if (intel->gen >= 6) { - /* Can't do hstride == 0 args to gen6 math, so expand it out. */ - if (src0.file == UNIFORM) { + /* Can't do hstride == 0 args to gen6 math, so expand it out. + * + * The hardware ignores source modifiers (negate and abs) on math + * instructions, so we also move to a temp to set those up. + */ + if (src0.file == UNIFORM || src0.abs || src0.negate) { fs_reg expanded = fs_reg(this, glsl_type::float_type); emit(fs_inst(BRW_OPCODE_MOV, expanded, src0)); src0 = expanded; } - if (src1.file == UNIFORM) { + if (src1.file == UNIFORM || src1.abs || src1.negate) { fs_reg expanded = fs_reg(this, glsl_type::float_type); emit(fs_inst(BRW_OPCODE_MOV, expanded, src1)); src1 = expanded; @@ -770,6 +803,30 @@ fs_visitor::try_emit_saturate(ir_expression *ir) return true; } +static uint32_t +brw_conditional_for_comparison(unsigned int op) +{ + switch (op) { + case ir_binop_less: + return BRW_CONDITIONAL_L; + case ir_binop_greater: + return BRW_CONDITIONAL_G; + case ir_binop_lequal: + return BRW_CONDITIONAL_LE; + case ir_binop_gequal: + return BRW_CONDITIONAL_GE; + case ir_binop_equal: + case ir_binop_all_equal: /* same as equal for scalars */ + return BRW_CONDITIONAL_Z; + case ir_binop_nequal: + case ir_binop_any_nequal: /* same as nequal for scalars */ + return BRW_CONDITIONAL_NZ; + default: + assert(!"not reached: bad operation for comparison"); + return BRW_CONDITIONAL_NZ; + } +} + void fs_visitor::visit(ir_expression *ir) { @@ -819,6 +876,7 @@ fs_visitor::visit(ir_expression *ir) break; case ir_unop_abs: op[0].abs = true; + op[0].negate = false; this->result = op[0]; break; case ir_unop_sign: @@ -885,35 +943,20 @@ fs_visitor::visit(ir_expression *ir) break; case ir_binop_less: - inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1])); - inst->conditional_mod = BRW_CONDITIONAL_L; - emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1))); - break; case ir_binop_greater: - inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1])); - inst->conditional_mod = BRW_CONDITIONAL_G; - emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1))); - break; case ir_binop_lequal: - inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1])); - inst->conditional_mod = BRW_CONDITIONAL_LE; - emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1))); - break; case ir_binop_gequal: - inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1])); - inst->conditional_mod = BRW_CONDITIONAL_GE; - emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1))); - break; case ir_binop_equal: - case ir_binop_all_equal: /* same as nequal for scalars */ - inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1])); - inst->conditional_mod = BRW_CONDITIONAL_Z; - emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1))); - break; + case ir_binop_all_equal: case ir_binop_nequal: - case ir_binop_any_nequal: /* same as nequal for scalars */ - inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1])); - inst->conditional_mod = BRW_CONDITIONAL_NZ; + case ir_binop_any_nequal: + temp = this->result; + /* original gen4 does implicit conversion before comparison. */ + if (intel->gen < 5) + temp.type = op[0].type; + + inst = emit(fs_inst(BRW_OPCODE_CMP, temp, op[0], op[1])); + inst->conditional_mod = brw_conditional_for_comparison(ir->operation); emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1))); break; @@ -958,7 +1001,12 @@ fs_visitor::visit(ir_expression *ir) break; case ir_unop_f2b: case ir_unop_i2b: - inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], fs_reg(0.0f))); + temp = this->result; + /* original gen4 does implicit conversion before comparison. */ + if (intel->gen < 5) + temp.type = op[0].type; + + inst = emit(fs_inst(BRW_OPCODE_CMP, temp, op[0], fs_reg(0.0f))); inst->conditional_mod = BRW_CONDITIONAL_NZ; inst = emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(1))); @@ -1151,6 +1199,8 @@ fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate) } /* gen4's SIMD8 sampler always has the slots for u,v,r present. */ mlen += 3; + } else if (ir->op == ir_txd) { + assert(!"TXD isn't supported on gen4 yet."); } else { /* Oh joy. gen4 doesn't have SIMD8 non-shadow-compare bias/lod * instructions. We'll need to do SIMD16 here. @@ -1204,6 +1254,8 @@ fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate) inst = emit(fs_inst(FS_OPCODE_TXL, dst)); break; case ir_txd: + inst = emit(fs_inst(FS_OPCODE_TXD, dst)); + break; case ir_txf: assert(!"GLSL 1.30 features unsupported"); break; @@ -1292,6 +1344,37 @@ fs_visitor::visit(ir_texture *ir) ir->coordinate->accept(this); fs_reg coordinate = this->result; + if (ir->offset != NULL) { + ir_constant *offset = ir->offset->as_constant(); + assert(offset != NULL); + + signed char offsets[3]; + for (unsigned i = 0; i < ir->offset->type->vector_elements; i++) + offsets[i] = (signed char) offset->value.i[i]; + + /* Combine all three offsets into a single unsigned dword: + * + * bits 11:8 - U Offset (X component) + * bits 7:4 - V Offset (Y component) + * bits 3:0 - R Offset (Z component) + */ + unsigned offset_bits = 0; + for (unsigned i = 0; i < ir->offset->type->vector_elements; i++) { + const unsigned shift = 4 * (2 - i); + offset_bits |= (offsets[i] << shift) & (0xF << shift); + } + + /* Explicitly set up the message header by copying g0 to msg reg m1. */ + emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, 1, BRW_REGISTER_TYPE_UD), + fs_reg(GRF, 0, BRW_REGISTER_TYPE_UD))); + + /* Then set the offset bits in DWord 2 of the message header. */ + emit(fs_inst(BRW_OPCODE_MOV, + fs_reg(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 1, 2), + BRW_REGISTER_TYPE_UD)), + fs_reg(brw_imm_uw(offset_bits)))); + } + /* Should be lowered by do_lower_texture_projection */ assert(!ir->projector); @@ -1323,10 +1406,13 @@ fs_visitor::visit(ir_texture *ir) fs_reg scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1); GLuint index = _mesa_add_state_reference(params, (gl_state_index *)tokens); - float *vec_values = this->fp->Base.Parameters->ParameterValues[index]; - c->prog_data.param[c->prog_data.nr_params++] = &vec_values[0]; - c->prog_data.param[c->prog_data.nr_params++] = &vec_values[1]; + this->param_index[c->prog_data.nr_params] = index; + this->param_offset[c->prog_data.nr_params] = 0; + c->prog_data.nr_params++; + this->param_index[c->prog_data.nr_params] = index; + this->param_offset[c->prog_data.nr_params] = 1; + c->prog_data.nr_params++; fs_reg dst = fs_reg(this, ir->coordinate->type); fs_reg src = coordinate; @@ -1349,6 +1435,14 @@ fs_visitor::visit(ir_texture *ir) inst = emit_texture_gen5(ir, dst, coordinate); } + /* If there's an offset, we already set up m1. To avoid the implied move, + * use the null register. Otherwise, we want an implied move from g0. + */ + if (ir->offset != NULL) + inst->src[0] = fs_reg(brw_null_reg()); + else + inst->src[0] = fs_reg(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW)); + inst->sampler = sampler; this->result = dst; @@ -1356,7 +1450,10 @@ fs_visitor::visit(ir_texture *ir) if (ir->shadow_comparitor) inst->shadow_compare = true; - if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) { + if (ir->type == glsl_type::float_type) { + /* Ignore DEPTH_TEXTURE_MODE swizzling. */ + assert(ir->sampler->type->sampler_shadow); + } else if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) { fs_reg swizzle_dst = fs_reg(this, glsl_type::vec4_type); for (int i = 0; i < 4; i++) { @@ -1541,7 +1638,7 @@ fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir) inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0.0f))); } else { - inst = emit(fs_inst(BRW_OPCODE_MOV, reg_null_d, op[0])); + inst = emit(fs_inst(BRW_OPCODE_MOV, reg_null_f, op[0])); } inst->conditional_mod = BRW_CONDITIONAL_NZ; break; @@ -1556,31 +1653,18 @@ fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir) break; case ir_binop_greater: - inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], op[1])); - inst->conditional_mod = BRW_CONDITIONAL_G; - break; case ir_binop_gequal: - inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], op[1])); - inst->conditional_mod = BRW_CONDITIONAL_GE; - break; case ir_binop_less: - inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], op[1])); - inst->conditional_mod = BRW_CONDITIONAL_L; - break; case ir_binop_lequal: - inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], op[1])); - inst->conditional_mod = BRW_CONDITIONAL_LE; - break; case ir_binop_equal: case ir_binop_all_equal: - inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], op[1])); - inst->conditional_mod = BRW_CONDITIONAL_Z; - break; case ir_binop_nequal: case ir_binop_any_nequal: - inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], op[1])); - inst->conditional_mod = BRW_CONDITIONAL_NZ; + inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1])); + inst->conditional_mod = + brw_conditional_for_comparison(expr->operation); break; + default: assert(!"not reached"); this->fail = true; @@ -1659,30 +1743,16 @@ fs_visitor::emit_if_gen6(ir_if *ir) return; case ir_binop_greater: - inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1])); - inst->conditional_mod = BRW_CONDITIONAL_G; - return; case ir_binop_gequal: - inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1])); - inst->conditional_mod = BRW_CONDITIONAL_GE; - return; case ir_binop_less: - inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1])); - inst->conditional_mod = BRW_CONDITIONAL_L; - return; case ir_binop_lequal: - inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1])); - inst->conditional_mod = BRW_CONDITIONAL_LE; - return; case ir_binop_equal: case ir_binop_all_equal: - inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1])); - inst->conditional_mod = BRW_CONDITIONAL_Z; - return; case ir_binop_nequal: case ir_binop_any_nequal: inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1])); - inst->conditional_mod = BRW_CONDITIONAL_NZ; + inst->conditional_mod = + brw_conditional_for_comparison(expr->operation); return; default: assert(!"not reached"); @@ -1764,32 +1834,9 @@ fs_visitor::visit(ir_loop *ir) this->base_ir = ir->to; ir->to->accept(this); - fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, + fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_cmp, counter, this->result)); - switch (ir->cmp) { - case ir_binop_equal: - inst->conditional_mod = BRW_CONDITIONAL_Z; - break; - case ir_binop_nequal: - inst->conditional_mod = BRW_CONDITIONAL_NZ; - break; - case ir_binop_gequal: - inst->conditional_mod = BRW_CONDITIONAL_GE; - break; - case ir_binop_lequal: - inst->conditional_mod = BRW_CONDITIONAL_LE; - break; - case ir_binop_greater: - inst->conditional_mod = BRW_CONDITIONAL_G; - break; - case ir_binop_less: - inst->conditional_mod = BRW_CONDITIONAL_L; - break; - default: - assert(!"not reached: unknown loop condition"); - this->fail = true; - break; - } + inst->conditional_mod = brw_conditional_for_comparison(ir->cmp); inst = emit(fs_inst(BRW_OPCODE_BREAK)); inst->predicated = true; @@ -2067,7 +2114,7 @@ fs_visitor::emit_fb_writes() } for (int target = 0; target < c->key.nr_color_regions; target++) { - this->current_annotation = talloc_asprintf(this->mem_ctx, + this->current_annotation = ralloc_asprintf(this->mem_ctx, "FB write target %d", target); if (this->frag_color || this->frag_data) { @@ -2093,6 +2140,17 @@ fs_visitor::emit_fb_writes() } if (c->key.nr_color_regions == 0) { + if (c->key.alpha_test && (this->frag_color || this->frag_data)) { + /* If the alpha test is enabled but there's no color buffer, + * we still need to send alpha out the pipeline to our null + * renderbuffer. + */ + color.reg_offset += 3; + emit(fs_inst(BRW_OPCODE_MOV, + fs_reg(MRF, color_mrf + 3), + color)); + } + fs_inst *inst = emit(fs_inst(FS_OPCODE_FB_WRITE, reg_undef, reg_undef)); inst->base_mrf = 0; @@ -2158,7 +2216,8 @@ fs_visitor::generate_fb_write(fs_inst *inst) inst->target, inst->mlen, 0, - eot); + eot, + inst->header_present); } void @@ -2244,7 +2303,7 @@ fs_visitor::generate_math(fs_inst *inst, } void -fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst) +fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src) { int msg_type = -1; int rlen = 4; @@ -2266,6 +2325,16 @@ fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst) msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5; } break; + case FS_OPCODE_TXL: + if (inst->shadow_compare) { + msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE_GEN5; + } else { + msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_LOD_GEN5; + } + break; + case FS_OPCODE_TXD: + assert(!"TXD isn't supported on gen5+ yet."); + break; } } else { switch (inst->opcode) { @@ -2283,13 +2352,26 @@ fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst) case FS_OPCODE_TXB: if (inst->shadow_compare) { assert(inst->mlen == 6); - msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE; + msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE; } else { assert(inst->mlen == 9); msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS; simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; } break; + case FS_OPCODE_TXL: + if (inst->shadow_compare) { + assert(inst->mlen == 6); + msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE; + } else { + assert(inst->mlen == 9); + msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD; + simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16; + } + break; + case FS_OPCODE_TXD: + assert(!"TXD isn't supported on gen4 yet."); + break; } } assert(msg_type != -1); @@ -2302,7 +2384,7 @@ fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst) brw_SAMPLE(p, retype(dst, BRW_REGISTER_TYPE_UW), inst->base_mrf, - retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW), + src, SURF_INDEX_TEXTURE(inst->sampler), inst->sampler, WRITEMASK_XYZW, @@ -2502,6 +2584,22 @@ fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst) } } +/** + * To be called after the last _mesa_add_state_reference() call, to + * set up prog_data.param[] for assign_curb_setup() and + * setup_pull_constants(). + */ +void +fs_visitor::setup_paramvalues_refs() +{ + /* Set up the pointers to ParamValues now that that array is finalized. */ + for (unsigned int i = 0; i < c->prog_data.nr_params; i++) { + c->prog_data.param[i] = + fp->Base.Parameters->ParameterValues[this->param_index[i]] + + this->param_offset[i]; + } +} + void fs_visitor::assign_curb_setup() { @@ -2575,12 +2673,15 @@ fs_visitor::assign_urb_setup() foreach_iter(exec_list_iterator, iter, this->instructions) { fs_inst *inst = (fs_inst *)iter.get(); - if (inst->opcode != FS_OPCODE_LINTERP) - continue; - - assert(inst->src[2].file == FIXED_HW_REG); + if (inst->opcode == FS_OPCODE_LINTERP) { + assert(inst->src[2].file == FIXED_HW_REG); + inst->src[2].fixed_hw_reg.nr += urb_start; + } - inst->src[2].fixed_hw_reg.nr += urb_start; + if (inst->opcode == FS_OPCODE_CINTERP) { + assert(inst->src[0].file == FIXED_HW_REG); + inst->src[0].fixed_hw_reg.nr += urb_start; + } } this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length; @@ -2628,10 +2729,7 @@ fs_visitor::split_virtual_grfs() fs_inst *inst = (fs_inst *)iter.get(); /* Texturing produces 4 contiguous registers, so no splitting. */ - if ((inst->opcode == FS_OPCODE_TEX || - inst->opcode == FS_OPCODE_TXB || - inst->opcode == FS_OPCODE_TXL) && - inst->dst.file == GRF) { + if (inst->is_tex()) { split_grf[inst->dst.reg] = false; } } @@ -2671,6 +2769,7 @@ fs_visitor::split_virtual_grfs() } } } + this->live_intervals_valid = false; } /** @@ -2739,14 +2838,17 @@ void fs_visitor::calculate_live_intervals() { int num_vars = this->virtual_grf_next; - int *def = talloc_array(mem_ctx, int, num_vars); - int *use = talloc_array(mem_ctx, int, num_vars); + int *def = ralloc_array(mem_ctx, int, num_vars); + int *use = ralloc_array(mem_ctx, int, num_vars); int loop_depth = 0; int loop_start = 0; int bb_header_ip = 0; + if (this->live_intervals_valid) + return; + for (int i = 0; i < num_vars; i++) { - def[i] = 1 << 30; + def[i] = MAX_INSTRUCTION; use[i] = -1; } @@ -2820,10 +2922,12 @@ fs_visitor::calculate_live_intervals() } } - talloc_free(this->virtual_grf_def); - talloc_free(this->virtual_grf_use); + ralloc_free(this->virtual_grf_def); + ralloc_free(this->virtual_grf_use); this->virtual_grf_def = def; this->virtual_grf_use = use; + + this->live_intervals_valid = true; } /** @@ -2839,6 +2943,8 @@ fs_visitor::propagate_constants() { bool progress = false; + calculate_live_intervals(); + foreach_iter(exec_list_iterator, iter, this->instructions) { fs_inst *inst = (fs_inst *)iter.get(); @@ -2896,6 +3002,7 @@ fs_visitor::propagate_constants() /* Fit this constant in by commuting the operands */ scan_inst->src[0] = scan_inst->src[1]; scan_inst->src[1] = inst->src[0]; + progress = true; } break; case BRW_OPCODE_CMP: @@ -2910,12 +3017,15 @@ fs_visitor::propagate_constants() if (scan_inst->dst.file == GRF && scan_inst->dst.reg == inst->dst.reg && (scan_inst->dst.reg_offset == inst->dst.reg_offset || - scan_inst->opcode == FS_OPCODE_TEX)) { + scan_inst->is_tex())) { break; } } } + if (progress) + this->live_intervals_valid = false; + return progress; } /** @@ -2930,6 +3040,8 @@ fs_visitor::dead_code_eliminate() bool progress = false; int pc = 0; + calculate_live_intervals(); + foreach_iter(exec_list_iterator, iter, this->instructions) { fs_inst *inst = (fs_inst *)iter.get(); @@ -2941,6 +3053,9 @@ fs_visitor::dead_code_eliminate() pc++; } + if (progress) + live_intervals_valid = false; + return progress; } @@ -2948,10 +3063,35 @@ bool fs_visitor::register_coalesce() { bool progress = false; + int if_depth = 0; + int loop_depth = 0; foreach_iter(exec_list_iterator, iter, this->instructions) { fs_inst *inst = (fs_inst *)iter.get(); + /* Make sure that we dominate the instructions we're going to + * scan for interfering with our coalescing, or we won't have + * scanned enough to see if anything interferes with our + * coalescing. We don't dominate the following instructions if + * we're in a loop or an if block. + */ + switch (inst->opcode) { + case BRW_OPCODE_DO: + loop_depth++; + break; + case BRW_OPCODE_WHILE: + loop_depth--; + break; + case BRW_OPCODE_IF: + if_depth++; + break; + case BRW_OPCODE_ENDIF: + if_depth--; + break; + } + if (loop_depth || if_depth) + continue; + if (inst->opcode != BRW_OPCODE_MOV || inst->predicated || inst->saturate || @@ -2959,6 +3099,8 @@ fs_visitor::register_coalesce() inst->dst.type != inst->src[0].type) continue; + bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate; + /* Found a move of a GRF to a GRF. Let's see if we can coalesce * them: check for no writes to either one until the exit of the * program. @@ -2969,37 +3111,33 @@ fs_visitor::register_coalesce() for (; scan_iter.has_next(); scan_iter.next()) { fs_inst *scan_inst = (fs_inst *)scan_iter.get(); - if (scan_inst->opcode == BRW_OPCODE_DO || - scan_inst->opcode == BRW_OPCODE_WHILE || - scan_inst->opcode == BRW_OPCODE_ENDIF) { - interfered = true; - iter = scan_iter; - break; - } - if (scan_inst->dst.file == GRF) { if (scan_inst->dst.reg == inst->dst.reg && (scan_inst->dst.reg_offset == inst->dst.reg_offset || - scan_inst->opcode == FS_OPCODE_TEX)) { + scan_inst->is_tex())) { interfered = true; break; } if (scan_inst->dst.reg == inst->src[0].reg && (scan_inst->dst.reg_offset == inst->src[0].reg_offset || - scan_inst->opcode == FS_OPCODE_TEX)) { + scan_inst->is_tex())) { interfered = true; break; } } + + /* The gen6 MATH instruction can't handle source modifiers, so avoid + * coalescing those for now. We should do something more specific. + */ + if (intel->gen == 6 && scan_inst->is_math() && has_source_modifiers) { + interfered = true; + break; + } } if (interfered) { continue; } - /* Update live interval so we don't have to recalculate. */ - this->virtual_grf_use[inst->src[0].reg] = MAX2(virtual_grf_use[inst->src[0].reg], - virtual_grf_use[inst->dst.reg]); - /* Rewrite the later usage to point at the source of the move to * be removed. */ @@ -3024,6 +3162,9 @@ fs_visitor::register_coalesce() progress = true; } + if (progress) + live_intervals_valid = false; + return progress; } @@ -3034,6 +3175,8 @@ fs_visitor::compute_to_mrf() bool progress = false; int next_ip = 0; + calculate_live_intervals(); + foreach_iter(exec_list_iterator, iter, this->instructions) { fs_inst *inst = (fs_inst *)iter.get(); @@ -3066,7 +3209,7 @@ fs_visitor::compute_to_mrf() * into a compute-to-MRF. */ - if (scan_inst->opcode == FS_OPCODE_TEX) { + if (scan_inst->is_tex()) { /* texturing writes several continuous regs, so we can't * compute-to-mrf that. */ @@ -3087,14 +3230,7 @@ fs_visitor::compute_to_mrf() /* gen6 math instructions must have the destination be * GRF, so no compute-to-MRF for them. */ - if (scan_inst->opcode == FS_OPCODE_RCP || - scan_inst->opcode == FS_OPCODE_RSQ || - scan_inst->opcode == FS_OPCODE_SQRT || - scan_inst->opcode == FS_OPCODE_EXP2 || - scan_inst->opcode == FS_OPCODE_LOG2 || - scan_inst->opcode == FS_OPCODE_SIN || - scan_inst->opcode == FS_OPCODE_COS || - scan_inst->opcode == FS_OPCODE_POW) { + if (scan_inst->is_math()) { break; } } @@ -3116,6 +3252,7 @@ fs_visitor::compute_to_mrf() */ if (scan_inst->opcode == BRW_OPCODE_DO || scan_inst->opcode == BRW_OPCODE_WHILE || + scan_inst->opcode == BRW_OPCODE_ELSE || scan_inst->opcode == BRW_OPCODE_ENDIF) { break; } @@ -3202,7 +3339,7 @@ fs_visitor::remove_duplicate_mrf_writes() } if (inst->mlen > 0) { - /* Found a SEND instruction, which will include two of fewer + /* Found a SEND instruction, which will include two or fewer * implied MRF writes. We could do better here. */ for (int i = 0; i < implied_mrf_writes(inst); i++) { @@ -3237,15 +3374,16 @@ fs_visitor::virtual_grf_interferes(int a, int b) int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]); int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]); - /* For dead code, just check if the def interferes with the other range. */ - if (this->virtual_grf_use[a] == -1) { - return (this->virtual_grf_def[a] >= this->virtual_grf_def[b] && - this->virtual_grf_def[a] < this->virtual_grf_use[b]); - } - if (this->virtual_grf_use[b] == -1) { - return (this->virtual_grf_def[b] >= this->virtual_grf_def[a] && - this->virtual_grf_def[b] < this->virtual_grf_use[a]); - } + /* We can't handle dead register writes here, without iterating + * over the whole instruction stream to find every single dead + * write to that register to compare to the live interval of the + * other register. Just assert that dead_code_eliminate() has been + * called. + */ + assert((this->virtual_grf_use[a] != -1 || + this->virtual_grf_def[a] == MAX_INSTRUCTION) && + (this->virtual_grf_use[b] != -1 || + this->virtual_grf_def[b] == MAX_INSTRUCTION)); return start < end; } @@ -3280,6 +3418,7 @@ static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg) break; default: assert(!"not reached"); + brw_reg = brw_null_reg(); break; } break; @@ -3294,6 +3433,10 @@ static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg) assert(!"not reached"); brw_reg = brw_null_reg(); break; + default: + assert(!"not reached"); + brw_reg = brw_null_reg(); + break; } if (reg->abs) brw_reg = brw_abs(brw_reg); @@ -3307,20 +3450,25 @@ void fs_visitor::generate_code() { int last_native_inst = 0; - struct brw_instruction *if_stack[16], *loop_stack[16]; - int if_stack_depth = 0, loop_stack_depth = 0; - int if_depth_in_loop[16]; const char *last_annotation_string = NULL; ir_instruction *last_annotation_ir = NULL; + int if_stack_array_size = 16; + int loop_stack_array_size = 16; + int if_stack_depth = 0, loop_stack_depth = 0; + brw_instruction **if_stack = + rzalloc_array(this->mem_ctx, brw_instruction *, if_stack_array_size); + brw_instruction **loop_stack = + rzalloc_array(this->mem_ctx, brw_instruction *, loop_stack_array_size); + int *if_depth_in_loop = + rzalloc_array(this->mem_ctx, int, loop_stack_array_size); + + if (unlikely(INTEL_DEBUG & DEBUG_WM)) { printf("Native code for fragment shader %d:\n", ctx->Shader.CurrentFragmentProgram->Name); } - if_depth_in_loop[loop_stack_depth] = 0; - - memset(&if_stack, 0, sizeof(if_stack)); foreach_iter(exec_list_iterator, iter, this->instructions) { fs_inst *inst = (fs_inst *)iter.get(); struct brw_reg src[3], dst; @@ -3404,7 +3552,6 @@ fs_visitor::generate_code() break; case BRW_OPCODE_IF: - assert(if_stack_depth < 16); if (inst->src[0].file != BAD_FILE) { assert(intel->gen >= 6); if_stack[if_stack_depth] = brw_IF_gen6(p, inst->conditional_mod, src[0], src[1]); @@ -3413,6 +3560,11 @@ fs_visitor::generate_code() } if_depth_in_loop[loop_stack_depth]++; if_stack_depth++; + if (if_stack_array_size <= if_stack_depth) { + if_stack_array_size *= 2; + if_stack = reralloc(this->mem_ctx, if_stack, brw_instruction *, + if_stack_array_size); + } break; case BRW_OPCODE_ELSE: @@ -3427,6 +3579,13 @@ fs_visitor::generate_code() case BRW_OPCODE_DO: loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8); + if (loop_stack_array_size <= loop_stack_depth) { + loop_stack_array_size *= 2; + loop_stack = reralloc(this->mem_ctx, loop_stack, brw_instruction *, + loop_stack_array_size); + if_depth_in_loop = reralloc(this->mem_ctx, if_depth_in_loop, int, + loop_stack_array_size); + } if_depth_in_loop[loop_stack_depth] = 0; break; @@ -3480,13 +3639,17 @@ fs_visitor::generate_code() case FS_OPCODE_COS: generate_math(inst, dst, src); break; + case FS_OPCODE_CINTERP: + brw_MOV(p, dst, src[0]); + break; case FS_OPCODE_LINTERP: generate_linterp(inst, dst, src); break; case FS_OPCODE_TEX: case FS_OPCODE_TXB: + case FS_OPCODE_TXD: case FS_OPCODE_TXL: - generate_tex(inst, dst); + generate_tex(inst, dst, src[0]); break; case FS_OPCODE_DISCARD_NOT: generate_discard_not(inst, dst); @@ -3542,6 +3705,10 @@ fs_visitor::generate_code() last_native_inst = p->nr_insn; } + ralloc_free(if_stack); + ralloc_free(loop_stack); + ralloc_free(if_depth_in_loop); + brw_set_uip_jip(p); /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS @@ -3617,10 +3784,9 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c) v.emit_fb_writes(); v.split_virtual_grfs(); - v.setup_pull_constants(); - v.assign_curb_setup(); - v.assign_urb_setup(); + v.setup_paramvalues_refs(); + v.setup_pull_constants(); bool progress; do { @@ -3628,20 +3794,23 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c) progress = v.remove_duplicate_mrf_writes() || progress; - v.calculate_live_intervals(); progress = v.propagate_constants() || progress; progress = v.register_coalesce() || progress; progress = v.compute_to_mrf() || progress; progress = v.dead_code_eliminate() || progress; } while (progress); + v.schedule_instructions(); + + v.assign_curb_setup(); + v.assign_urb_setup(); + if (0) { /* Debug of register spilling: Go spill everything. */ int virtual_grf_count = v.virtual_grf_next; for (int i = 1; i < virtual_grf_count; i++) { v.spill_reg(i); } - v.calculate_live_intervals(); } if (0) @@ -3650,8 +3819,6 @@ brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c) while (!v.assign_regs()) { if (v.fail) break; - - v.calculate_live_intervals(); } } } diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index de7b15312a..dc030ae5b5 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -41,7 +41,6 @@ extern "C" { #include "brw_context.h" #include "brw_eu.h" #include "brw_wm.h" -#include "talloc.h" } #include "../glsl/glsl_types.h" #include "../glsl/ir.h" @@ -68,9 +67,11 @@ enum fs_opcodes { FS_OPCODE_COS, FS_OPCODE_DDX, FS_OPCODE_DDY, + FS_OPCODE_CINTERP, FS_OPCODE_LINTERP, FS_OPCODE_TEX, FS_OPCODE_TXB, + FS_OPCODE_TXD, FS_OPCODE_TXL, FS_OPCODE_DISCARD_NOT, FS_OPCODE_DISCARD_AND, @@ -82,13 +83,13 @@ enum fs_opcodes { class fs_reg { public: - /* Callers of this talloc-based new need not call delete. It's - * easier to just talloc_free 'ctx' (or any of its ancestors). */ + /* Callers of this ralloc-based new need not call delete. It's + * easier to just ralloc_free 'ctx' (or any of its ancestors). */ static void* operator new(size_t size, void *ctx) { void *node; - node = talloc_size(ctx, size); + node = ralloc_size(ctx, size); assert(node != NULL); return node; @@ -192,13 +193,13 @@ static const fs_reg reg_null_d(ARF, BRW_ARF_NULL, BRW_REGISTER_TYPE_D); class fs_inst : public exec_node { public: - /* Callers of this talloc-based new need not call delete. It's - * easier to just talloc_free 'ctx' (or any of its ancestors). */ + /* Callers of this ralloc-based new need not call delete. It's + * easier to just ralloc_free 'ctx' (or any of its ancestors). */ static void* operator new(size_t size, void *ctx) { void *node; - node = talloc_zero_size(ctx, size); + node = rzalloc_size(ctx, size); assert(node != NULL); return node; @@ -305,6 +306,26 @@ public: offset == inst->offset); } + bool is_tex() + { + return (opcode == FS_OPCODE_TEX || + opcode == FS_OPCODE_TXB || + opcode == FS_OPCODE_TXD || + opcode == FS_OPCODE_TXL); + } + + bool is_math() + { + return (opcode == FS_OPCODE_RCP || + opcode == FS_OPCODE_RSQ || + opcode == FS_OPCODE_SQRT || + opcode == FS_OPCODE_EXP2 || + opcode == FS_OPCODE_LOG2 || + opcode == FS_OPCODE_SIN || + opcode == FS_OPCODE_COS || + opcode == FS_OPCODE_POW); + } + int opcode; /* BRW_OPCODE_* or FS_OPCODE_* */ fs_reg dst; fs_reg src[3]; @@ -341,13 +362,30 @@ public: this->fp = brw->fragment_program; this->intel = &brw->intel; this->ctx = &intel->ctx; - this->mem_ctx = talloc_new(NULL); + this->mem_ctx = ralloc_context(NULL); this->shader = shader; this->fail = false; this->variable_ht = hash_table_ctor(0, hash_table_pointer_hash, hash_table_pointer_compare); + /* There's a question that appears to be left open in the spec: + * How do implicit dst conversions interact with the CMP + * instruction or conditional mods? On gen6, the instruction: + * + * CMP null<d> src0<f> src1<f> + * + * will do src1 - src0 and compare that result as if it was an + * integer. On gen4, it will do src1 - src0 as float, convert + * the result to int, and compare as int. In between, it + * appears that it does src1 - src0 and does the compare in the + * execution type so dst type doesn't matter. + */ + if (this->intel->gen > 4) + this->reg_null_cmp = reg_null_d; + else + this->reg_null_cmp = reg_null_f; + this->frag_color = NULL; this->frag_data = NULL; this->frag_depth = NULL; @@ -361,13 +399,14 @@ public: this->virtual_grf_array_size = 0; this->virtual_grf_def = NULL; this->virtual_grf_use = NULL; + this->live_intervals_valid = false; this->kill_emitted = false; } ~fs_visitor() { - talloc_free(this->mem_ctx); + ralloc_free(this->mem_ctx); hash_table_dtor(this->variable_ht); } @@ -393,6 +432,7 @@ public: void visit(ir_function_signature *ir); fs_inst *emit(fs_inst inst); + void setup_paramvalues_refs(); void assign_curb_setup(); void calculate_urb_setup(); void assign_urb_setup(); @@ -409,11 +449,13 @@ public: bool dead_code_eliminate(); bool remove_duplicate_mrf_writes(); bool virtual_grf_interferes(int a, int b); + void schedule_instructions(); + void generate_code(); void generate_fb_write(fs_inst *inst); void generate_linterp(fs_inst *inst, struct brw_reg dst, struct brw_reg *src); - void generate_tex(fs_inst *inst, struct brw_reg dst); + void generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src); void generate_math(fs_inst *inst, struct brw_reg dst, struct brw_reg *src); void generate_discard_not(fs_inst *inst, struct brw_reg temp); void generate_discard_and(fs_inst *inst, struct brw_reg temp); @@ -457,11 +499,18 @@ public: void *mem_ctx; exec_list instructions; + /* Delayed setup of c->prog_data.params[] due to realloc of + * ParamValues[] during compile. + */ + int param_index[MAX_UNIFORMS * 4]; + int param_offset[MAX_UNIFORMS * 4]; + int *virtual_grf_sizes; int virtual_grf_next; int virtual_grf_array_size; int *virtual_grf_def; int *virtual_grf_use; + bool live_intervals_valid; struct hash_table *variable_ht; ir_variable *frag_color, *frag_data, *frag_depth; @@ -485,6 +534,7 @@ public: fs_reg pixel_w; fs_reg delta_x; fs_reg delta_y; + fs_reg reg_null_cmp; int grf_used; }; diff --git a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp index 20bfa4c3ea..7f3f52854d 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp @@ -141,7 +141,7 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir) return visit_continue; if (!this->mem_ctx) - this->mem_ctx = talloc_parent(ir); + this->mem_ctx = ralloc_parent(ir); for (i = 0; i < expr->get_num_operands(); i++) { if (expr->operands[i]->type->is_vector()) { diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp index bbb210cd44..f027742317 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp @@ -41,7 +41,6 @@ extern "C" { #include "brw_context.h" #include "brw_eu.h" #include "brw_wm.h" -#include "talloc.h" } #include "brw_fs.h" #include "../glsl/glsl_types.h" @@ -94,6 +93,8 @@ fs_visitor::assign_regs() int class_count = 0; int aligned_pair_class = -1; + calculate_live_intervals(); + /* Set up the register classes. * * The base registers store a scalar value. For texture samples, @@ -232,8 +233,8 @@ fs_visitor::assign_regs() } - talloc_free(g); - talloc_free(regs); + ralloc_free(g); + ralloc_free(regs); return false; } @@ -271,8 +272,8 @@ fs_visitor::assign_regs() this->grf_used = last_grf + 1; - talloc_free(g); - talloc_free(regs); + ralloc_free(g); + ralloc_free(regs); return true; } @@ -416,4 +417,6 @@ fs_visitor::spill_reg(int spill_reg) } } } + + this->live_intervals_valid = false; } diff --git a/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp new file mode 100644 index 0000000000..bff8f82f3f --- /dev/null +++ b/src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp @@ -0,0 +1,488 @@ +/* + * Copyright © 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt <eric@anholt.net> + * + */ + +extern "C" { + +#include <sys/types.h> + +#include "main/macros.h" +#include "main/shaderobj.h" +#include "main/uniforms.h" +#include "program/prog_optimize.h" +#include "program/register_allocate.h" +#include "program/sampler.h" +#include "program/hash_table.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_wm.h" +} +#include "brw_fs.h" +#include "../glsl/glsl_types.h" +#include "../glsl/ir_optimization.h" +#include "../glsl/ir_print_visitor.h" + +/** @file brw_fs_schedule_instructions.cpp + * + * List scheduling of FS instructions. + * + * The basic model of the list scheduler is to take a basic block, + * compute a DAG of the dependencies (RAW ordering with latency, WAW + * ordering, WAR ordering), and make a list of the DAG heads. + * Heuristically pick a DAG head, then put all the children that are + * now DAG heads into the list of things to schedule. + * + * The heuristic is the important part. We're trying to be cheap, + * since actually computing the optimal scheduling is NP complete. + * What we do is track a "current clock". When we schedule a node, we + * update the earliest-unblocked clock time of its children, and + * increment the clock. Then, when trying to schedule, we just pick + * the earliest-unblocked instruction to schedule. + * + * Note that often there will be many things which could execute + * immediately, and there are a range of heuristic options to choose + * from in picking among those. + */ + +class schedule_node : public exec_node +{ +public: + schedule_node(fs_inst *inst) + { + this->inst = inst; + this->child_array_size = 0; + this->children = NULL; + this->child_latency = NULL; + this->child_count = 0; + this->parent_count = 0; + this->unblocked_time = 0; + + int chans = 8; + int math_latency = 22; + + switch (inst->opcode) { + case FS_OPCODE_RCP: + this->latency = 1 * chans * math_latency; + break; + case FS_OPCODE_RSQ: + this->latency = 2 * chans * math_latency; + break; + case FS_OPCODE_SQRT: + case FS_OPCODE_LOG2: + /* full precision log. partial is 2. */ + this->latency = 3 * chans * math_latency; + break; + case FS_OPCODE_EXP2: + /* full precision. partial is 3, same throughput. */ + this->latency = 4 * chans * math_latency; + break; + case FS_OPCODE_POW: + this->latency = 8 * chans * math_latency; + break; + case FS_OPCODE_SIN: + case FS_OPCODE_COS: + /* minimum latency, max is 12 rounds. */ + this->latency = 5 * chans * math_latency; + break; + default: + this->latency = 2; + break; + } + } + + fs_inst *inst; + schedule_node **children; + int *child_latency; + int child_count; + int parent_count; + int child_array_size; + int unblocked_time; + int latency; +}; + +class instruction_scheduler { +public: + instruction_scheduler(fs_visitor *v, void *mem_ctx, int virtual_grf_count) + { + this->v = v; + this->mem_ctx = ralloc_context(mem_ctx); + this->virtual_grf_count = virtual_grf_count; + this->instructions.make_empty(); + this->instructions_to_schedule = 0; + } + + ~instruction_scheduler() + { + ralloc_free(this->mem_ctx); + } + void add_barrier_deps(schedule_node *n); + void add_dep(schedule_node *before, schedule_node *after, int latency); + + void add_inst(fs_inst *inst); + void calculate_deps(); + void schedule_instructions(fs_inst *next_block_header); + + void *mem_ctx; + + int instructions_to_schedule; + int virtual_grf_count; + exec_list instructions; + fs_visitor *v; +}; + +void +instruction_scheduler::add_inst(fs_inst *inst) +{ + schedule_node *n = new(mem_ctx) schedule_node(inst); + + assert(!inst->is_head_sentinel()); + assert(!inst->is_tail_sentinel()); + + this->instructions_to_schedule++; + + inst->remove(); + instructions.push_tail(n); +} + +/** + * Add a dependency between two instruction nodes. + * + * The @after node will be scheduled after @before. We will try to + * schedule it @latency cycles after @before, but no guarantees there. + */ +void +instruction_scheduler::add_dep(schedule_node *before, schedule_node *after, + int latency) +{ + if (!before || !after) + return; + + assert(before != after); + + for (int i = 0; i < before->child_count; i++) { + if (before->children[i] == after) { + before->child_latency[i] = MAX2(before->child_latency[i], latency); + return; + } + } + + if (before->child_array_size <= before->child_count) { + if (before->child_array_size < 16) + before->child_array_size = 16; + else + before->child_array_size *= 2; + + before->children = reralloc(mem_ctx, before->children, + schedule_node *, + before->child_array_size); + before->child_latency = reralloc(mem_ctx, before->child_latency, + int, before->child_array_size); + } + + before->children[before->child_count] = after; + before->child_latency[before->child_count] = latency; + before->child_count++; + after->parent_count++; +} + +/** + * Sometimes we really want this node to execute after everything that + * was before it and before everything that followed it. This adds + * the deps to do so. + */ +void +instruction_scheduler::add_barrier_deps(schedule_node *n) +{ + schedule_node *prev = (schedule_node *)n->prev; + schedule_node *next = (schedule_node *)n->next; + + if (prev) { + while (!prev->is_head_sentinel()) { + add_dep(prev, n, 0); + prev = (schedule_node *)prev->prev; + } + } + + if (next) { + while (!next->is_tail_sentinel()) { + add_dep(n, next, 0); + next = (schedule_node *)next->next; + } + } +} + +void +instruction_scheduler::calculate_deps() +{ + schedule_node *last_grf_write[virtual_grf_count]; + schedule_node *last_mrf_write[BRW_MAX_MRF]; + schedule_node *last_conditional_mod = NULL; + + /* The last instruction always needs to still be the last + * instruction. Either it's flow control (IF, ELSE, ENDIF, DO, + * WHILE) and scheduling other things after it would disturb the + * basic block, or it's FB_WRITE and we should do a better job at + * dead code elimination anyway. + */ + schedule_node *last = (schedule_node *)instructions.get_tail(); + add_barrier_deps(last); + + memset(last_grf_write, 0, sizeof(last_grf_write)); + memset(last_mrf_write, 0, sizeof(last_mrf_write)); + + /* top-to-bottom dependencies: RAW and WAW. */ + foreach_iter(exec_list_iterator, iter, instructions) { + schedule_node *n = (schedule_node *)iter.get(); + fs_inst *inst = n->inst; + + /* read-after-write deps. */ + for (int i = 0; i < 3; i++) { + if (inst->src[i].file == GRF) { + if (last_grf_write[inst->src[i].reg]) { + add_dep(last_grf_write[inst->src[i].reg], n, + last_grf_write[inst->src[i].reg]->latency); + } + } else if (inst->src[i].file != BAD_FILE && + inst->src[i].file != IMM && + inst->src[i].file != UNIFORM) { + assert(inst->src[i].file != MRF); + add_barrier_deps(n); + } + } + + for (int i = 0; i < inst->mlen; i++) { + /* It looks like the MRF regs are released in the send + * instruction once it's sent, not when the result comes + * back. + */ + if (last_mrf_write[inst->base_mrf + i]) { + add_dep(last_mrf_write[inst->base_mrf + i], n, + last_mrf_write[inst->base_mrf + i]->latency); + } + } + + if (inst->predicated) { + assert(last_conditional_mod); + add_dep(last_conditional_mod, n, last_conditional_mod->latency); + } + + /* write-after-write deps. */ + if (inst->dst.file == GRF) { + if (last_grf_write[inst->dst.reg]) { + add_dep(last_grf_write[inst->dst.reg], n, + last_grf_write[inst->dst.reg]->latency); + } + last_grf_write[inst->dst.reg] = n; + } else if (inst->dst.file == MRF) { + if (last_mrf_write[inst->dst.hw_reg]) { + add_dep(last_mrf_write[inst->dst.hw_reg], n, + last_mrf_write[inst->dst.hw_reg]->latency); + } + last_mrf_write[inst->dst.hw_reg] = n; + } else if (inst->dst.file != BAD_FILE) { + add_barrier_deps(n); + } + + if (inst->mlen > 0) { + for (int i = 0; i < v->implied_mrf_writes(inst); i++) { + if (last_mrf_write[inst->base_mrf + i]) { + add_dep(last_mrf_write[inst->base_mrf + i], n, + last_mrf_write[inst->base_mrf + i]->latency); + } + last_mrf_write[inst->base_mrf + i] = n; + } + } + + if (inst->conditional_mod) { + add_dep(last_conditional_mod, n, 0); + last_conditional_mod = n; + } + } + + /* bottom-to-top dependencies: WAR */ + memset(last_grf_write, 0, sizeof(last_grf_write)); + memset(last_mrf_write, 0, sizeof(last_mrf_write)); + last_conditional_mod = NULL; + + exec_node *node; + exec_node *prev; + for (node = instructions.get_tail(), prev = node->prev; + !node->is_head_sentinel(); + node = prev, prev = node->prev) { + schedule_node *n = (schedule_node *)node; + fs_inst *inst = n->inst; + + /* write-after-read deps. */ + for (int i = 0; i < 3; i++) { + if (inst->src[i].file == GRF) { + if (last_grf_write[inst->src[i].reg]) { + add_dep(n, last_grf_write[inst->src[i].reg], n->latency); + } + } else if (inst->src[i].file != BAD_FILE && + inst->src[i].file != IMM && + inst->src[i].file != UNIFORM) { + assert(inst->src[i].file != MRF); + add_barrier_deps(n); + } + } + + for (int i = 0; i < inst->mlen; i++) { + /* It looks like the MRF regs are released in the send + * instruction once it's sent, not when the result comes + * back. + */ + add_dep(n, last_mrf_write[inst->base_mrf + i], 2); + } + + if (inst->predicated) { + if (last_conditional_mod) { + add_dep(n, last_conditional_mod, n->latency); + } + } + + /* Update the things this instruction wrote, so earlier reads + * can mark this as WAR dependency. + */ + if (inst->dst.file == GRF) { + last_grf_write[inst->dst.reg] = n; + } else if (inst->dst.file == MRF) { + last_mrf_write[inst->dst.hw_reg] = n; + } else if (inst->dst.file != BAD_FILE) { + add_barrier_deps(n); + } + + if (inst->mlen > 0) { + for (int i = 0; i < v->implied_mrf_writes(inst); i++) { + last_mrf_write[inst->base_mrf + i] = n; + } + } + + if (inst->conditional_mod) + last_conditional_mod = n; + } +} + +void +instruction_scheduler::schedule_instructions(fs_inst *next_block_header) +{ + int time = 0; + + /* Remove non-DAG heads from the list. */ + foreach_iter(exec_list_iterator, iter, instructions) { + schedule_node *n = (schedule_node *)iter.get(); + if (n->parent_count != 0) + n->remove(); + } + + while (!instructions.is_empty()) { + schedule_node *chosen = NULL; + int chosen_time = 0; + + foreach_iter(exec_list_iterator, iter, instructions) { + schedule_node *n = (schedule_node *)iter.get(); + + if (!chosen || n->unblocked_time < chosen_time) { + chosen = n; + chosen_time = n->unblocked_time; + } + } + + /* Schedule this instruction. */ + assert(chosen); + chosen->remove(); + next_block_header->insert_before(chosen->inst); + instructions_to_schedule--; + + /* Bump the clock. If we expected a delay for scheduling, then + * bump the clock to reflect that. + */ + time = MAX2(time + 1, chosen_time); + + /* Now that we've scheduled a new instruction, some of its + * children can be promoted to the list of instructions ready to + * be scheduled. Update the children's unblocked time for this + * DAG edge as we do so. + */ + for (int i = 0; i < chosen->child_count; i++) { + schedule_node *child = chosen->children[i]; + + child->unblocked_time = MAX2(child->unblocked_time, + time + chosen->child_latency[i]); + + child->parent_count--; + if (child->parent_count == 0) { + instructions.push_tail(child); + } + } + + /* Shared resource: the mathbox. There's one per EU (on later + * generations, it's even more limited pre-gen6), so if we send + * something off to it then the next math isn't going to make + * progress until the first is done. + */ + if (chosen->inst->is_math()) { + foreach_iter(exec_list_iterator, iter, instructions) { + schedule_node *n = (schedule_node *)iter.get(); + + if (n->inst->is_math()) + n->unblocked_time = MAX2(n->unblocked_time, + time + chosen->latency); + } + } + } + + assert(instructions_to_schedule == 0); +} + +void +fs_visitor::schedule_instructions() +{ + fs_inst *next_block_header = (fs_inst *)instructions.head; + instruction_scheduler sched(this, mem_ctx, this->virtual_grf_next); + + while (!next_block_header->is_tail_sentinel()) { + /* Add things to be scheduled until we get to a new BB. */ + while (!next_block_header->is_tail_sentinel()) { + fs_inst *inst = next_block_header; + next_block_header = (fs_inst *)next_block_header->next; + + sched.add_inst(inst); + if (inst->opcode == BRW_OPCODE_IF || + inst->opcode == BRW_OPCODE_ELSE || + inst->opcode == BRW_OPCODE_ENDIF || + inst->opcode == BRW_OPCODE_DO || + inst->opcode == BRW_OPCODE_WHILE || + inst->opcode == BRW_OPCODE_BREAK || + inst->opcode == BRW_OPCODE_CONTINUE) { + break; + } + } + sched.calculate_deps(); + sched.schedule_instructions(next_block_header); + } + + this->live_intervals_valid = false; +} diff --git a/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp b/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp index 2be6b08b5c..530ffa2658 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_vector_splitting.cpp @@ -69,7 +69,7 @@ public: ir_variable *components[4]; - /** talloc_parent(this->var) -- the shader's talloc context. */ + /** ralloc_parent(this->var) -- the shader's ralloc context. */ void *mem_ctx; }; @@ -77,13 +77,13 @@ class ir_vector_reference_visitor : public ir_hierarchical_visitor { public: ir_vector_reference_visitor(void) { - this->mem_ctx = talloc_new(NULL); + this->mem_ctx = ralloc_context(NULL); this->variable_list.make_empty(); } ~ir_vector_reference_visitor(void) { - talloc_free(mem_ctx); + ralloc_free(mem_ctx); } virtual ir_visitor_status visit(ir_variable *); @@ -358,7 +358,7 @@ brw_do_vector_splitting(exec_list *instructions) if (refs.variable_list.is_empty()) return false; - void *mem_ctx = talloc_new(NULL); + void *mem_ctx = ralloc_context(NULL); /* Replace the decls of the vectors to be split with their split * components. @@ -368,10 +368,10 @@ brw_do_vector_splitting(exec_list *instructions) const struct glsl_type *type; type = glsl_type::get_instance(entry->var->type->base_type, 1, 1); - entry->mem_ctx = talloc_parent(entry->var); + entry->mem_ctx = ralloc_parent(entry->var); for (unsigned int i = 0; i < entry->var->type->vector_elements; i++) { - const char *name = talloc_asprintf(mem_ctx, "%s_%c", + const char *name = ralloc_asprintf(mem_ctx, "%s_%c", entry->var->name, "xyzw"[i]); @@ -386,7 +386,7 @@ brw_do_vector_splitting(exec_list *instructions) ir_vector_splitting_visitor split(&refs.variable_list); visit_list_elements(&split, instructions); - talloc_free(mem_ctx); + ralloc_free(mem_ctx); return true; } diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c index 73b41fdbce..70c451d071 100644 --- a/src/mesa/drivers/dri/i965/brw_gs.c +++ b/src/mesa/drivers/dri/i965/brw_gs.c @@ -96,6 +96,9 @@ static void compile_gs_prog( struct brw_context *brw, brw_gs_quad_strip( &c, key ); break; case GL_LINE_LOOP: + /* Gen6: LINELOOP is converted to LINESTRIP at the beginning of the 3D pipeline */ + if (intel->gen == 6) + return; brw_gs_lines( &c ); break; case GL_LINES: @@ -189,7 +192,7 @@ static void populate_key( struct brw_context *brw, } if (intel->gen == 6) - prim_gs_always = brw->primitive == GL_LINE_LOOP; + prim_gs_always = 0; else prim_gs_always = brw->primitive == GL_QUADS || brw->primitive == GL_QUAD_STRIP || diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c index a91b0528fa..c768be23fa 100644 --- a/src/mesa/drivers/dri/i965/brw_misc_state.c +++ b/src/mesa/drivers/dri/i965/brw_misc_state.c @@ -74,7 +74,7 @@ static void upload_binding_table_pointers(struct brw_context *brw) struct intel_context *intel = &brw->intel; BEGIN_BATCH(6); - OUT_BATCH(CMD_BINDING_TABLE_PTRS << 16 | (6 - 2)); + OUT_BATCH(_3DSTATE_BINDING_TABLE_POINTERS << 16 | (6 - 2)); OUT_BATCH(brw->vs.bind_bo_offset); OUT_BATCH(0); /* gs */ OUT_BATCH(0); /* clip */ @@ -104,7 +104,7 @@ static void upload_gen6_binding_table_pointers(struct brw_context *brw) struct intel_context *intel = &brw->intel; BEGIN_BATCH(4); - OUT_BATCH(CMD_BINDING_TABLE_PTRS << 16 | + OUT_BATCH(_3DSTATE_BINDING_TABLE_POINTERS << 16 | GEN6_BINDING_TABLE_MODIFY_VS | GEN6_BINDING_TABLE_MODIFY_GS | GEN6_BINDING_TABLE_MODIFY_PS | @@ -142,7 +142,7 @@ static void upload_pipelined_state_pointers(struct brw_context *brw ) } BEGIN_BATCH(7); - OUT_BATCH(CMD_PIPELINED_STATE_POINTERS << 16 | (7 - 2)); + OUT_BATCH(_3DSTATE_PIPELINED_POINTERS << 16 | (7 - 2)); OUT_RELOC(brw->vs.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); if (brw->gs.prog_active) OUT_RELOC(brw->gs.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1); @@ -151,7 +151,7 @@ static void upload_pipelined_state_pointers(struct brw_context *brw ) OUT_RELOC(brw->clip.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1); OUT_RELOC(brw->sf.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); OUT_RELOC(brw->wm.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); - OUT_RELOC(brw->cc.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, + OUT_RELOC(brw->intel.batch.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, brw->cc.state_offset); ADVANCE_BATCH(); @@ -214,7 +214,7 @@ static void emit_depthbuffer(struct brw_context *brw) if (region == NULL) { BEGIN_BATCH(len); - OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (len - 2)); + OUT_BATCH(_3DSTATE_DEPTH_BUFFER << 16 | (len - 2)); OUT_BATCH((BRW_DEPTHFORMAT_D32_FLOAT << 18) | (BRW_SURFACE_NULL << 29)); OUT_BATCH(0); @@ -251,7 +251,7 @@ static void emit_depthbuffer(struct brw_context *brw) assert(region->tiling != I915_TILING_NONE); BEGIN_BATCH(len); - OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (len - 2)); + OUT_BATCH(_3DSTATE_DEPTH_BUFFER << 16 | (len - 2)); OUT_BATCH(((region->pitch * region->cpp) - 1) | (format << 18) | (BRW_TILEWALK_YMAJOR << 26) | @@ -277,7 +277,7 @@ static void emit_depthbuffer(struct brw_context *brw) /* Initialize it for safety. */ if (intel->gen >= 6) { BEGIN_BATCH(2); - OUT_BATCH(CMD_3D_CLEAR_PARAMS << 16 | (2 - 2)); + OUT_BATCH(_3DSTATE_CLEAR_PARAMS << 16 | (2 - 2)); OUT_BATCH(0); ADVANCE_BATCH(); } @@ -301,16 +301,15 @@ const struct brw_tracked_state brw_depthbuffer = { static void upload_polygon_stipple(struct brw_context *brw) { + struct intel_context *intel = &brw->intel; struct gl_context *ctx = &brw->intel.ctx; - struct brw_polygon_stipple bps; GLuint i; if (!ctx->Polygon.StippleFlag) return; - memset(&bps, 0, sizeof(bps)); - bps.header.opcode = CMD_POLY_STIPPLE_PATTERN; - bps.header.length = sizeof(bps)/4-2; + BEGIN_BATCH(33); + OUT_BATCH(_3DSTATE_POLY_STIPPLE_PATTERN << 16 | (33 - 2)); /* Polygon stipple is provided in OpenGL order, i.e. bottom * row first. If we're rendering to a window (i.e. the @@ -321,14 +320,13 @@ static void upload_polygon_stipple(struct brw_context *brw) */ if (ctx->DrawBuffer->Name == 0) { for (i = 0; i < 32; i++) - bps.stipple[i] = ctx->PolygonStipple[31 - i]; /* invert */ + OUT_BATCH(ctx->PolygonStipple[31 - i]); /* invert */ } else { for (i = 0; i < 32; i++) - bps.stipple[i] = ctx->PolygonStipple[i]; /* don't invert */ + OUT_BATCH(ctx->PolygonStipple[i]); } - - BRW_CACHED_BATCH_STRUCT(brw, &bps); + CACHED_BATCH(); } const struct brw_tracked_state brw_polygon_stipple = { @@ -347,15 +345,14 @@ const struct brw_tracked_state brw_polygon_stipple = { static void upload_polygon_stipple_offset(struct brw_context *brw) { + struct intel_context *intel = &brw->intel; struct gl_context *ctx = &brw->intel.ctx; - struct brw_polygon_stipple_offset bpso; if (!ctx->Polygon.StippleFlag) return; - memset(&bpso, 0, sizeof(bpso)); - bpso.header.opcode = CMD_POLY_STIPPLE_OFFSET; - bpso.header.length = sizeof(bpso)/4-2; + BEGIN_BATCH(2); + OUT_BATCH(_3DSTATE_POLY_STIPPLE_OFFSET << 16 | (2-2)); /* If we're drawing to a system window (ctx->DrawBuffer->Name == 0), * we have to invert the Y axis in order to match the OpenGL @@ -365,16 +362,11 @@ static void upload_polygon_stipple_offset(struct brw_context *brw) * system works just fine, and there's no window system to * worry about. */ - if (brw->intel.ctx.DrawBuffer->Name == 0) { - bpso.bits0.x_offset = 0; - bpso.bits0.y_offset = (32 - (ctx->DrawBuffer->Height & 31)) & 31; - } - else { - bpso.bits0.y_offset = 0; - bpso.bits0.x_offset = 0; - } - - BRW_CACHED_BATCH_STRUCT(brw, &bpso); + if (brw->intel.ctx.DrawBuffer->Name == 0) + OUT_BATCH((32 - (ctx->DrawBuffer->Height & 31)) & 31); + else + OUT_BATCH(0); + CACHED_BATCH(); } #define _NEW_WINDOW_POS 0x40000000 @@ -393,18 +385,17 @@ const struct brw_tracked_state brw_polygon_stipple_offset = { */ static void upload_aa_line_parameters(struct brw_context *brw) { + struct intel_context *intel = &brw->intel; struct gl_context *ctx = &brw->intel.ctx; - struct brw_aa_line_parameters balp; if (!ctx->Line.SmoothFlag || !brw->has_aa_line_parameters) return; + OUT_BATCH(_3DSTATE_AA_LINE_PARAMETERS << 16 | (3 - 2)); /* use legacy aa line coverage computation */ - memset(&balp, 0, sizeof(balp)); - balp.header.opcode = CMD_AA_LINE_PARAMETERS; - balp.header.length = sizeof(balp) / 4 - 2; - - BRW_CACHED_BATCH_STRUCT(brw, &balp); + OUT_BATCH(0); + OUT_BATCH(0); + CACHED_BATCH(); } const struct brw_tracked_state brw_aa_line_parameters = { @@ -422,28 +413,21 @@ const struct brw_tracked_state brw_aa_line_parameters = { static void upload_line_stipple(struct brw_context *brw) { + struct intel_context *intel = &brw->intel; struct gl_context *ctx = &brw->intel.ctx; - struct brw_line_stipple bls; GLfloat tmp; GLint tmpi; if (!ctx->Line.StippleFlag) return; - memset(&bls, 0, sizeof(bls)); - bls.header.opcode = CMD_LINE_STIPPLE_PATTERN; - bls.header.length = sizeof(bls)/4 - 2; - - bls.bits0.pattern = ctx->Line.StipplePattern; - bls.bits1.repeat_count = ctx->Line.StippleFactor; - + BEGIN_BATCH(3); + OUT_BATCH(_3DSTATE_LINE_STIPPLE_PATTERN << 16 | (3 - 2)); + OUT_BATCH(ctx->Line.StipplePattern); tmp = 1.0 / (GLfloat) ctx->Line.StippleFactor; tmpi = tmp * (1<<13); - - - bls.bits1.inverse_repeat_count = tmpi; - - BRW_CACHED_BATCH_STRUCT(brw, &bls); + OUT_BATCH(tmpi << 16 | ctx->Line.StippleFactor); + CACHED_BATCH(); } const struct brw_tracked_state brw_line_stipple = { @@ -481,7 +465,7 @@ static void upload_invarient_state( struct brw_context *brw ) /* Disable depth offset clamping. */ - gdo.header.opcode = CMD_GLOBAL_DEPTH_OFFSET_CLAMP; + gdo.header.opcode = _3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP; gdo.header.length = sizeof(gdo)/4 - 2; gdo.depth_offset_clamp = 0.0; @@ -492,20 +476,20 @@ static void upload_invarient_state( struct brw_context *brw ) int i; BEGIN_BATCH(3); - OUT_BATCH(CMD_3D_MULTISAMPLE << 16 | (3 - 2)); + OUT_BATCH(_3DSTATE_MULTISAMPLE << 16 | (3 - 2)); OUT_BATCH(MS_PIXEL_LOCATION_CENTER | MS_NUMSAMPLES_1); OUT_BATCH(0); /* positions for 4/8-sample */ ADVANCE_BATCH(); BEGIN_BATCH(2); - OUT_BATCH(CMD_3D_SAMPLE_MASK << 16 | (2 - 2)); + OUT_BATCH(_3DSTATE_SAMPLE_MASK << 16 | (2 - 2)); OUT_BATCH(1); ADVANCE_BATCH(); for (i = 0; i < 4; i++) { BEGIN_BATCH(4); - OUT_BATCH(CMD_GS_SVB_INDEX << 16 | (4 - 2)); + OUT_BATCH(_3DSTATE_GS_SVB_INDEX << 16 | (4 - 2)); OUT_BATCH(i << SVB_INDEX_SHIFT); OUT_BATCH(0); OUT_BATCH(0xffffffff); @@ -565,7 +549,7 @@ static void upload_state_base_address( struct brw_context *brw ) BEGIN_BATCH(10); OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (10 - 2)); OUT_BATCH(1); /* General state base address */ - OUT_RELOC(intel->batch->buf, I915_GEM_DOMAIN_SAMPLER, 0, + OUT_RELOC(intel->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0, 1); /* Surface state base address */ OUT_BATCH(1); /* Dynamic state base address */ OUT_BATCH(1); /* Indirect object base address */ @@ -579,7 +563,7 @@ static void upload_state_base_address( struct brw_context *brw ) BEGIN_BATCH(8); OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (8 - 2)); OUT_BATCH(1); /* General state base address */ - OUT_RELOC(intel->batch->buf, I915_GEM_DOMAIN_SAMPLER, 0, + OUT_RELOC(intel->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0, 1); /* Surface state base address */ OUT_BATCH(1); /* Indirect object base address */ OUT_BATCH(1); /* Instruction base address */ @@ -591,7 +575,7 @@ static void upload_state_base_address( struct brw_context *brw ) BEGIN_BATCH(6); OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (6 - 2)); OUT_BATCH(1); /* General state base address */ - OUT_RELOC(intel->batch->buf, I915_GEM_DOMAIN_SAMPLER, 0, + OUT_RELOC(intel->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0, 1); /* Surface state base address */ OUT_BATCH(1); /* Indirect object base address */ OUT_BATCH(1); /* General state upper bound */ diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c index 94efa79109..7d653327e3 100644 --- a/src/mesa/drivers/dri/i965/brw_program.c +++ b/src/mesa/drivers/dri/i965/brw_program.c @@ -36,7 +36,7 @@ #include "program/program.h" #include "program/programopt.h" #include "tnl/tnl.h" -#include "talloc.h" +#include "../glsl/ralloc.h" #include "brw_context.h" #include "brw_wm.h" @@ -115,7 +115,7 @@ shader_error(struct gl_context *ctx, struct gl_program *prog, const char *msg) shader = _mesa_lookup_shader_program(ctx, prog->Id); if (shader) { - shader->InfoLog = talloc_strdup_append(shader->InfoLog, msg); + ralloc_strcat(&shader->InfoLog, msg); shader->LinkStatus = GL_FALSE; } } diff --git a/src/mesa/drivers/dri/i965/brw_queryobj.c b/src/mesa/drivers/dri/i965/brw_queryobj.c index f28f28663e..b41d05dd43 100644 --- a/src/mesa/drivers/dri/i965/brw_queryobj.c +++ b/src/mesa/drivers/dri/i965/brw_queryobj.c @@ -177,7 +177,7 @@ brw_end_query(struct gl_context *ctx, struct gl_query_object *q) ADVANCE_BATCH(); } - intel_batchbuffer_flush(intel->batch); + intel_batchbuffer_flush(intel); } else { /* Flush the batchbuffer in case it has writes to our query BO. * Have later queries write to a new query BO so that further rendering @@ -185,7 +185,7 @@ brw_end_query(struct gl_context *ctx, struct gl_query_object *q) */ if (query->bo) { brw_emit_query_end(brw); - intel_batchbuffer_flush(intel->batch); + intel_batchbuffer_flush(intel); drm_intel_bo_unreference(brw->query.bo); brw->query.bo = NULL; @@ -232,6 +232,12 @@ brw_prepare_query_begin(struct brw_context *brw) brw->query.bo = NULL; brw->query.bo = drm_intel_bo_alloc(intel->bufmgr, "query", 4096, 1); + + /* clear target buffer */ + drm_intel_bo_map(brw->query.bo, GL_TRUE); + memset((char *)brw->query.bo->virtual, 0, 4096); + drm_intel_bo_unmap(brw->query.bo); + brw->query.index = 0; } diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h index 3beed16945..86b0caa4a4 100644 --- a/src/mesa/drivers/dri/i965/brw_state.h +++ b/src/mesa/drivers/dri/i965/brw_state.h @@ -164,25 +164,18 @@ void brw_destroy_caches( struct brw_context *brw ); /*********************************************************************** * brw_state_batch.c */ -#define BRW_BATCH_STRUCT(brw, s) intel_batchbuffer_data( brw->intel.batch, (s), sizeof(*(s))) -#define BRW_CACHED_BATCH_STRUCT(brw, s) brw_cached_batch_struct( brw, (s), sizeof(*(s)) ) - -GLboolean brw_cached_batch_struct( struct brw_context *brw, - const void *data, - GLuint sz ); -void brw_destroy_batch_cache( struct brw_context *brw ); -void brw_clear_batch_cache( struct brw_context *brw ); +#define BRW_BATCH_STRUCT(brw, s) intel_batchbuffer_data(&brw->intel, (s), \ + sizeof(*(s)), false) + void *brw_state_batch(struct brw_context *brw, int size, int alignment, - drm_intel_bo **out_bo, uint32_t *out_offset); /* brw_wm_surface_state.c */ void brw_create_constant_surface(struct brw_context *brw, drm_intel_bo *bo, int width, - drm_intel_bo **out_bo, uint32_t *out_offset); #endif diff --git a/src/mesa/drivers/dri/i965/brw_state_batch.c b/src/mesa/drivers/dri/i965/brw_state_batch.c index be3989eb7d..213c7a38d8 100644 --- a/src/mesa/drivers/dri/i965/brw_state_batch.c +++ b/src/mesa/drivers/dri/i965/brw_state_batch.c @@ -29,75 +29,10 @@ * Keith Whitwell <keith@tungstengraphics.com> */ - - #include "brw_state.h" #include "intel_batchbuffer.h" #include "main/imports.h" - - -/* A facility similar to the data caching code above, which aims to - * prevent identical commands being issued repeatedly. - */ -GLboolean brw_cached_batch_struct( struct brw_context *brw, - const void *data, - GLuint sz ) -{ - struct brw_cached_batch_item *item = brw->cached_batch_items; - struct header *newheader = (struct header *)data; - - if (brw->emit_state_always) { - intel_batchbuffer_data(brw->intel.batch, data, sz); - return GL_TRUE; - } - - while (item) { - if (item->header->opcode == newheader->opcode) { - if (item->sz == sz && memcmp(item->header, newheader, sz) == 0) - return GL_FALSE; - if (item->sz != sz) { - free(item->header); - item->header = malloc(sz); - item->sz = sz; - } - goto emit; - } - item = item->next; - } - - assert(!item); - item = CALLOC_STRUCT(brw_cached_batch_item); - item->header = malloc(sz); - item->sz = sz; - item->next = brw->cached_batch_items; - brw->cached_batch_items = item; - - emit: - memcpy(item->header, newheader, sz); - intel_batchbuffer_data(brw->intel.batch, data, sz); - return GL_TRUE; -} - -void brw_clear_batch_cache( struct brw_context *brw ) -{ - struct brw_cached_batch_item *item = brw->cached_batch_items; - - while (item) { - struct brw_cached_batch_item *next = item->next; - free((void *)item->header); - free(item); - item = next; - } - - brw->cached_batch_items = NULL; -} - -void brw_destroy_batch_cache( struct brw_context *brw ) -{ - brw_clear_batch_cache(brw); -} - /** * Allocates a block of space in the batchbuffer for indirect state. * @@ -116,13 +51,12 @@ void * brw_state_batch(struct brw_context *brw, int size, int alignment, - drm_intel_bo **out_bo, uint32_t *out_offset) { - struct intel_batchbuffer *batch = brw->intel.batch; + struct intel_batchbuffer *batch = &brw->intel.batch; uint32_t offset; - assert(size < batch->buf->size); + assert(size < batch->bo->size); offset = ROUND_DOWN_TO(batch->state_batch_offset - size, alignment); /* If allocating from the top would wrap below the batchbuffer, or @@ -130,19 +64,13 @@ brw_state_batch(struct brw_context *brw, * space, then flush and try again. */ if (batch->state_batch_offset < size || - offset < batch->ptr - batch->map + batch->reserved_space) { - intel_batchbuffer_flush(batch); + offset < 4*batch->used + batch->reserved_space) { + intel_batchbuffer_flush(&brw->intel); offset = ROUND_DOWN_TO(batch->state_batch_offset - size, alignment); } batch->state_batch_offset = offset; - if (*out_bo != batch->buf) { - drm_intel_bo_unreference(*out_bo); - drm_intel_bo_reference(batch->buf); - *out_bo = batch->buf; - } - *out_offset = offset; - return batch->map + offset; + return batch->map + (offset>>2); } diff --git a/src/mesa/drivers/dri/i965/brw_state_cache.c b/src/mesa/drivers/dri/i965/brw_state_cache.c index 58ff528d44..01eeb19a68 100644 --- a/src/mesa/drivers/dri/i965/brw_state_cache.c +++ b/src/mesa/drivers/dri/i965/brw_state_cache.c @@ -58,8 +58,6 @@ #include "main/imports.h" #include "brw_state.h" -#include "intel_batchbuffer.h" -#include "brw_wm.h" #define FILE_DEBUG_FLAG DEBUG_STATE @@ -433,8 +431,6 @@ brw_clear_cache(struct brw_context *brw, struct brw_cache *cache) void brw_state_cache_check_size(struct brw_context *brw) { - DBG("%s (n_items=%d)\n", __FUNCTION__, brw->cache.n_items); - /* un-tuned guess. Each object is generally a page, so 1000 of them is 4 MB of * state cache. */ diff --git a/src/mesa/drivers/dri/i965/brw_state_dump.c b/src/mesa/drivers/dri/i965/brw_state_dump.c index e262887471..fdce79da2f 100644 --- a/src/mesa/drivers/dri/i965/brw_state_dump.c +++ b/src/mesa/drivers/dri/i965/brw_state_dump.c @@ -26,6 +26,7 @@ */ #include "main/mtypes.h" +#include "intel_batchbuffer.h" #include "brw_context.h" #include "brw_defines.h" @@ -54,7 +55,8 @@ state_out(const char *name, void *data, uint32_t hw_offset, int index, /** Generic, undecoded state buffer debug printout */ static void -state_struct_out(const char *name, drm_intel_bo *buffer, unsigned int state_size) +state_struct_out(const char *name, drm_intel_bo *buffer, + unsigned int offset, unsigned int size) { int i; @@ -62,8 +64,8 @@ state_struct_out(const char *name, drm_intel_bo *buffer, unsigned int state_size return; drm_intel_bo_map(buffer, GL_FALSE); - for (i = 0; i < state_size / 4; i++) { - state_out(name, buffer->virtual, buffer->offset, i, + for (i = 0; i < size / 4; i++) { + state_out(name, buffer->virtual + offset, buffer->offset + offset, i, "dword %d\n", i); } drm_intel_bo_unmap(buffer); @@ -98,21 +100,25 @@ get_965_surface_format(unsigned int surface_format) static void dump_wm_surface_state(struct brw_context *brw) { + dri_bo *bo; + GLubyte *base; int i; + bo = brw->intel.batch.bo; + drm_intel_bo_map(bo, GL_FALSE); + base = bo->virtual; + for (i = 0; i < brw->wm.nr_surfaces; i++) { - drm_intel_bo *surf_bo = brw->wm.surf_bo[i]; unsigned int surfoff; struct brw_surface_state *surf; char name[20]; - if (surf_bo == NULL) { + if (brw->wm.surf_offset[i] == 0) { fprintf(stderr, "WM SURF%d: NULL\n", i); continue; } - drm_intel_bo_map(surf_bo, GL_FALSE); - surfoff = surf_bo->offset + brw->wm.surf_offset[i]; - surf = (struct brw_surface_state *)(surf_bo->virtual + brw->wm.surf_offset[i]); + surfoff = bo->offset + brw->wm.surf_offset[i]; + surf = (struct brw_surface_state *)(base + brw->wm.surf_offset[i]); sprintf(name, "WM SURF%d", i); state_out(name, surf, surfoff, 0, "%s %s\n", @@ -127,9 +133,8 @@ static void dump_wm_surface_state(struct brw_context *brw) surf->ss4.min_lod); state_out(name, surf, surfoff, 5, "x,y offset: %d,%d\n", surf->ss5.x_offset, surf->ss5.y_offset); - - drm_intel_bo_unmap(surf_bo); } + drm_intel_bo_unmap(bo); } @@ -280,13 +285,14 @@ static void dump_cc_state(struct brw_context *brw) const char *name = "CC"; struct gen6_color_calc_state *cc; uint32_t cc_off; + dri_bo *bo = brw->intel.batch.bo; - if (brw->cc.state_bo == NULL) + if (brw->cc.state_offset == 0) return; - drm_intel_bo_map(brw->cc.state_bo, GL_FALSE); - cc = brw->cc.state_bo->virtual; - cc_off = brw->cc.state_bo->offset; + drm_intel_bo_map(bo, GL_FALSE); + cc = bo->virtual; + cc_off = bo->offset; state_out(name, cc, cc_off, 0, "alpha test format %s, round disable %d, stencil ref %d," "bf stencil ref %d\n", @@ -300,7 +306,7 @@ static void dump_cc_state(struct brw_context *brw) state_out(name, cc, cc_off, 4, "constant blue %f\n", cc->constant_b); state_out(name, cc, cc_off, 5, "constant alpha %f\n", cc->constant_a); - drm_intel_bo_unmap(brw->cc.state_bo); + drm_intel_bo_unmap(bo); } @@ -369,26 +375,29 @@ void brw_debug_batch(struct intel_context *intel) { struct brw_context *brw = brw_context(&intel->ctx); - state_struct_out("WM bind", brw->wm.bind_bo, 4 * brw->wm.nr_surfaces); + state_struct_out("WM bind", + brw->intel.batch.bo, + brw->wm.bind_bo_offset, + 4 * brw->wm.nr_surfaces); dump_wm_surface_state(brw); dump_wm_sampler_state(brw); if (intel->gen < 6) - state_struct_out("VS", brw->vs.state_bo, sizeof(struct brw_vs_unit_state)); + state_struct_out("VS", brw->vs.state_bo, 0, sizeof(struct brw_vs_unit_state)); brw_debug_prog("VS prog", brw->vs.prog_bo); if (intel->gen < 6) - state_struct_out("GS", brw->gs.state_bo, sizeof(struct brw_gs_unit_state)); + state_struct_out("GS", brw->gs.state_bo, 0, sizeof(struct brw_gs_unit_state)); brw_debug_prog("GS prog", brw->gs.prog_bo); if (intel->gen < 6) { - state_struct_out("SF", brw->sf.state_bo, sizeof(struct brw_sf_unit_state)); + state_struct_out("SF", brw->sf.state_bo, 0, sizeof(struct brw_sf_unit_state)); brw_debug_prog("SF prog", brw->sf.prog_bo); } dump_sf_viewport_state(brw); if (intel->gen < 6) - state_struct_out("WM", brw->wm.state_bo, sizeof(struct brw_wm_unit_state)); + state_struct_out("WM", brw->wm.state_bo, 0, sizeof(struct brw_wm_unit_state)); brw_debug_prog("WM prog", brw->wm.prog_bo); if (intel->gen >= 6) { diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c index eba4411ca7..6f521be659 100644 --- a/src/mesa/drivers/dri/i965/brw_state_upload.c +++ b/src/mesa/drivers/dri/i965/brw_state_upload.c @@ -104,7 +104,7 @@ static const struct brw_tracked_state *gen4_atoms[] = &brw_constant_buffer }; -const struct brw_tracked_state *gen6_atoms[] = +static const struct brw_tracked_state *gen6_atoms[] = { &brw_check_fallback, @@ -169,25 +169,49 @@ const struct brw_tracked_state *gen6_atoms[] = void brw_init_state( struct brw_context *brw ) { + const struct brw_tracked_state **atoms; + int num_atoms; + brw_init_caches(brw); + + if (brw->intel.gen >= 6) { + atoms = gen6_atoms; + num_atoms = ARRAY_SIZE(gen6_atoms); + } else { + atoms = gen4_atoms; + num_atoms = ARRAY_SIZE(gen4_atoms); + } + + while (num_atoms--) { + assert((*atoms)->dirty.mesa | + (*atoms)->dirty.brw | + (*atoms)->dirty.cache); + + if ((*atoms)->prepare) + brw->prepare_atoms[brw->num_prepare_atoms++] = **atoms; + if ((*atoms)->emit) + brw->emit_atoms[brw->num_emit_atoms++] = **atoms; + atoms++; + } + assert(brw->num_emit_atoms <= ARRAY_SIZE(brw->emit_atoms)); + assert(brw->num_prepare_atoms <= ARRAY_SIZE(brw->prepare_atoms)); } void brw_destroy_state( struct brw_context *brw ) { brw_destroy_caches(brw); - brw_destroy_batch_cache(brw); } /*********************************************************************** */ -static GLboolean check_state( const struct brw_state_flags *a, - const struct brw_state_flags *b ) +static GLuint check_state( const struct brw_state_flags *a, + const struct brw_state_flags *b ) { - return ((a->mesa & b->mesa) || - (a->brw & b->brw) || - (a->cache & b->cache)); + return ((a->mesa & b->mesa) | + (a->brw & b->brw) | + (a->cache & b->cache)) != 0; } static void accumulate_state( struct brw_state_flags *a, @@ -233,7 +257,6 @@ static struct dirty_bit_map mesa_bits[] = { DEFINE_BIT(_NEW_MODELVIEW), DEFINE_BIT(_NEW_PROJECTION), DEFINE_BIT(_NEW_TEXTURE_MATRIX), - DEFINE_BIT(_NEW_ACCUM), DEFINE_BIT(_NEW_COLOR), DEFINE_BIT(_NEW_DEPTH), DEFINE_BIT(_NEW_EVAL), @@ -279,6 +302,10 @@ static struct dirty_bit_map brw_bits[] = { DEFINE_BIT(BRW_NEW_VERTICES), DEFINE_BIT(BRW_NEW_BATCH), DEFINE_BIT(BRW_NEW_DEPTH_BUFFER), + DEFINE_BIT(BRW_NEW_NR_WM_SURFACES), + DEFINE_BIT(BRW_NEW_NR_VS_SURFACES), + DEFINE_BIT(BRW_NEW_VS_CONSTBUF), + DEFINE_BIT(BRW_NEW_WM_CONSTBUF), {0, 0, 0} }; @@ -340,24 +367,16 @@ void brw_validate_state( struct brw_context *brw ) struct gl_context *ctx = &brw->intel.ctx; struct intel_context *intel = &brw->intel; struct brw_state_flags *state = &brw->state.dirty; + const struct brw_tracked_state *atoms = brw->prepare_atoms; + int num_atoms = brw->num_prepare_atoms; GLuint i; - const struct brw_tracked_state **atoms; - int num_atoms; brw_clear_validated_bos(brw); state->mesa |= brw->intel.NewGLState; brw->intel.NewGLState = 0; - brw_add_validated_bo(brw, intel->batch->buf); - - if (intel->gen >= 6) { - atoms = gen6_atoms; - num_atoms = ARRAY_SIZE(gen6_atoms); - } else { - atoms = gen4_atoms; - num_atoms = ARRAY_SIZE(gen4_atoms); - } + brw_add_validated_bo(brw, intel->batch.bo); if (brw->emit_state_always) { state->mesa |= ~0; @@ -375,27 +394,20 @@ void brw_validate_state( struct brw_context *brw ) brw->state.dirty.brw |= BRW_NEW_VERTEX_PROGRAM; } - if (state->mesa == 0 && - state->cache == 0 && - state->brw == 0) + if ((state->mesa | state->cache | state->brw) == 0) return; - if (brw->state.dirty.brw & BRW_NEW_CONTEXT) - brw_clear_batch_cache(brw); - brw->intel.Fallback = GL_FALSE; /* boolean, not bitfield */ /* do prepare stage for all atoms */ for (i = 0; i < num_atoms; i++) { - const struct brw_tracked_state *atom = atoms[i]; - - if (brw->intel.Fallback) - break; + const struct brw_tracked_state *atom = &atoms[i]; if (check_state(state, &atom->dirty)) { - if (atom->prepare) { - atom->prepare(brw); - } + atom->prepare(brw); + + if (brw->intel.Fallback) + break; } } @@ -418,20 +430,11 @@ void brw_validate_state( struct brw_context *brw ) void brw_upload_state(struct brw_context *brw) { - struct intel_context *intel = &brw->intel; struct brw_state_flags *state = &brw->state.dirty; + const struct brw_tracked_state *atoms = brw->emit_atoms; + int num_atoms = brw->num_emit_atoms; int i; static int dirty_count = 0; - const struct brw_tracked_state **atoms; - int num_atoms; - - if (intel->gen >= 6) { - atoms = gen6_atoms; - num_atoms = ARRAY_SIZE(gen6_atoms); - } else { - atoms = gen4_atoms; - num_atoms = ARRAY_SIZE(gen4_atoms); - } brw_clear_validated_bos(brw); @@ -445,20 +448,14 @@ void brw_upload_state(struct brw_context *brw) prev = *state; for (i = 0; i < num_atoms; i++) { - const struct brw_tracked_state *atom = atoms[i]; + const struct brw_tracked_state *atom = &atoms[i]; struct brw_state_flags generated; - assert(atom->dirty.mesa || - atom->dirty.brw || - atom->dirty.cache); - if (brw->intel.Fallback) break; if (check_state(state, &atom->dirty)) { - if (atom->emit) { - atom->emit( brw ); - } + atom->emit(brw); } accumulate_state(&examined, &atom->dirty); @@ -474,15 +471,13 @@ void brw_upload_state(struct brw_context *brw) } else { for (i = 0; i < num_atoms; i++) { - const struct brw_tracked_state *atom = atoms[i]; + const struct brw_tracked_state *atom = &atoms[i]; if (brw->intel.Fallback) break; if (check_state(state, &atom->dirty)) { - if (atom->emit) { - atom->emit( brw ); - } + atom->emit(brw); } } } diff --git a/src/mesa/drivers/dri/i965/brw_structs.h b/src/mesa/drivers/dri/i965/brw_structs.h index 8f97bd136f..6687a89e80 100644 --- a/src/mesa/drivers/dri/i965/brw_structs.h +++ b/src/mesa/drivers/dri/i965/brw_structs.h @@ -1017,7 +1017,14 @@ struct brw_wm_unit_state GLuint enable_32_pix:1; GLuint enable_con_32_pix:1; GLuint enable_con_64_pix:1; - GLuint pad0:5; + GLuint pad0:1; + + /* These next four bits are for Ironlake+ */ + GLuint fast_span_coverage_enable:1; + GLuint depth_buffer_clear:1; + GLuint depth_buffer_resolve_enable:1; + GLuint hierarchical_depth_buffer_resolve_enable:1; + GLuint legacy_global_depth_bias:1; GLuint line_stipple:1; GLuint depth_offset:1; @@ -1064,6 +1071,15 @@ struct brw_sampler_default_color { GLfloat color[4]; }; +struct gen5_sampler_default_color { + uint8_t ub[4]; + float f[4]; + uint16_t hf[4]; + uint16_t us[4]; + int16_t s[4]; + uint8_t b[4]; +}; + struct brw_sampler_state { @@ -1169,7 +1185,12 @@ struct brw_surface_state GLuint cube_neg_y:1; GLuint cube_pos_x:1; GLuint cube_neg_x:1; - GLuint pad:4; + GLuint pad:2; + /* Required on gen6 for surfaces accessed through render cache messages. + */ + GLuint render_cache_read_write:1; + /* Ironlake and newer: instead of replicating one of the texels */ + GLuint cube_corner_average:1; GLuint mipmap_layout_mode:1; GLuint vert_line_stride_ofs:1; GLuint vert_line_stride:1; @@ -1651,6 +1672,18 @@ struct brw_instruction struct { GLuint binding_table_index:8; + GLuint msg_control:3; + GLuint msg_type:3; + GLuint target_cache:2; + GLuint response_length:4; + GLuint msg_length:4; + GLuint msg_target:4; + GLuint pad1:3; + GLuint end_of_thread:1; + } dp_read_g4x; + + struct { + GLuint binding_table_index:8; GLuint msg_control:3; GLuint msg_type:3; GLuint target_cache:2; diff --git a/src/mesa/drivers/dri/i965/brw_urb.c b/src/mesa/drivers/dri/i965/brw_urb.c index dfc1551aca..b0419d8a42 100644 --- a/src/mesa/drivers/dri/i965/brw_urb.c +++ b/src/mesa/drivers/dri/i965/brw_urb.c @@ -248,5 +248,13 @@ void brw_upload_urb_fence(struct brw_context *brw) uf.bits1.sf_fence = brw->urb.cs_start; uf.bits1.cs_fence = brw->urb.size; + /* erratum: URB_FENCE must not cross a 64byte cacheline */ + if ((brw->intel.batch.used & 15) > 12) { + int pad = 16 - (brw->intel.batch.used & 15); + do + brw->intel.batch.map[brw->intel.batch.used++] = MI_NOOP; + while (--pad); + } + BRW_BATCH_STRUCT(brw, &uf); } diff --git a/src/mesa/drivers/dri/i965/brw_util.c b/src/mesa/drivers/dri/i965/brw_util.c index e878da3850..d28d9abcb3 100644 --- a/src/mesa/drivers/dri/i965/brw_util.c +++ b/src/mesa/drivers/dri/i965/brw_util.c @@ -37,16 +37,6 @@ #include "brw_util.h" #include "brw_defines.h" -GLuint brw_count_bits(uint64_t val) -{ - GLuint i; - for (i = 0; val ; val >>= 1) - if (val & 1) - i++; - return i; -} - - GLuint brw_translate_blend_equation( GLenum mode ) { switch (mode) { diff --git a/src/mesa/drivers/dri/i965/brw_util.h b/src/mesa/drivers/dri/i965/brw_util.h index 04f3175d3e..940a871550 100644 --- a/src/mesa/drivers/dri/i965/brw_util.h +++ b/src/mesa/drivers/dri/i965/brw_util.h @@ -35,7 +35,14 @@ #include "main/mtypes.h" -extern GLuint brw_count_bits(uint64_t val); +#ifdef __GNUC__ +#define brw_count_bits(v) __builtin_popcount(v) +#else +static inline GLuint brw_count_bits(uint64_t v) +{ + return _mesa_popcount(v>>32) + _mesa_popcount(v&0xffffffff); +} +#endif extern GLuint brw_parameter_list_state_flags(struct gl_program_parameter_list *paramList); extern GLuint brw_translate_blend_factor( GLenum factor ); extern GLuint brw_translate_blend_equation( GLenum mode ); diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c index 59f270d675..6ae75d22c1 100644 --- a/src/mesa/drivers/dri/i965/brw_vs.c +++ b/src/mesa/drivers/dri/i965/brw_vs.c @@ -130,6 +130,7 @@ static void brw_upload_vs_prog(struct brw_context *brw) key.nr_userclip = brw_count_bits(ctx->Transform.ClipPlanesEnabled); key.copy_edgeflag = (ctx->Polygon.FrontMode != GL_FILL || ctx->Polygon.BackMode != GL_FILL); + key.two_side_color = (ctx->Light.Enabled && ctx->Light.Model.TwoSide); /* _NEW_POINT */ if (ctx->Point.PointSprite) { @@ -157,7 +158,7 @@ static void brw_upload_vs_prog(struct brw_context *brw) */ const struct brw_tracked_state brw_vs_prog = { .dirty = { - .mesa = _NEW_TRANSFORM | _NEW_POLYGON | _NEW_POINT, + .mesa = _NEW_TRANSFORM | _NEW_POLYGON | _NEW_POINT | _NEW_LIGHT, .brw = BRW_NEW_VERTEX_PROGRAM, .cache = 0 }, diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h index 9338a6b7db..0b88cc1ec7 100644 --- a/src/mesa/drivers/dri/i965/brw_vs.h +++ b/src/mesa/drivers/dri/i965/brw_vs.h @@ -44,6 +44,7 @@ struct brw_vs_prog_key { GLuint nr_userclip:4; GLuint copy_edgeflag:1; GLuint point_coord_replace:8; + GLuint two_side_color: 1; }; diff --git a/src/mesa/drivers/dri/i965/brw_vs_emit.c b/src/mesa/drivers/dri/i965/brw_vs_emit.c index e1a3f33393..6ec62554cc 100644 --- a/src/mesa/drivers/dri/i965/brw_vs_emit.c +++ b/src/mesa/drivers/dri/i965/brw_vs_emit.c @@ -140,11 +140,13 @@ clear_current_const(struct brw_vs_compile *c) static void brw_vs_alloc_regs( struct brw_vs_compile *c ) { struct intel_context *intel = &c->func.brw->intel; - GLuint i, reg = 0, mrf; + GLuint i, reg = 0, mrf, j; int attributes_in_vue; int first_reladdr_output; int max_constant; int constant = 0; + int vert_result_reoder[VERT_RESULT_MAX]; + int bfc = 0; /* Determine whether to use a real constant buffer or use a block * of GRF registers for constants. The later is faster but only @@ -254,7 +256,7 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c ) } reg += (constant + 1) / 2; c->prog_data.curb_read_length = reg - 1; - c->prog_data.nr_params = constant; + c->prog_data.nr_params = constant * 4; /* XXX 0 causes a bug elsewhere... */ if (intel->gen < 6 && c->prog_data.nr_params == 0) c->prog_data.nr_params = 4; @@ -291,7 +293,36 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c ) mrf = 4; first_reladdr_output = get_first_reladdr_output(&c->vp->program); - for (i = 0; i < VERT_RESULT_MAX; i++) { + + for (i = 0; i < VERT_RESULT_MAX; i++) + vert_result_reoder[i] = i; + + /* adjust attribute order in VUE for BFC0/BFC1 on Gen6+ */ + if (intel->gen >= 6 && c->key.two_side_color) { + if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_COL1)) && + (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC1))) { + assert(c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_COL0)); + assert(c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC0)); + bfc = 2; + } else if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_COL0)) && + (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC0))) + bfc = 1; + + if (bfc) { + for (i = 0; i < bfc; i++) { + vert_result_reoder[VERT_RESULT_COL0 + i * 2 + 0] = VERT_RESULT_COL0 + i; + vert_result_reoder[VERT_RESULT_COL0 + i * 2 + 1] = VERT_RESULT_BFC0 + i; + } + + for (i = VERT_RESULT_COL0 + bfc * 2; i < VERT_RESULT_BFC0 + bfc; i++) { + vert_result_reoder[i] = i - bfc; + } + } + } + + for (j = 0; j < VERT_RESULT_MAX; j++) { + i = vert_result_reoder[j]; + if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) { c->nr_outputs++; assert(i < Elements(c->regs[PROGRAM_OUTPUT])); @@ -627,6 +658,22 @@ static void emit_min( struct brw_compile *p, } } +static void emit_arl(struct brw_compile *p, + struct brw_reg dst, + struct brw_reg src) +{ + struct intel_context *intel = &p->brw->intel; + + if (intel->gen >= 6) { + struct brw_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F); + + brw_RNDD(p, dst_f, src); + brw_MOV(p, dst, dst_f); + } else { + brw_RNDD(p, dst, src); + } +} + static void emit_math1_gen4(struct brw_vs_compile *c, GLuint function, struct brw_reg dst, @@ -1072,8 +1119,6 @@ get_constant(struct brw_vs_compile *c, assert(argIndex < 3); - assert(c->func.brw->intel.gen < 6); /* FINISHME */ - if (c->current_const[argIndex].index != src->Index) { /* Keep track of the last constant loaded in this slot, for reuse. */ c->current_const[argIndex].index = src->Index; @@ -1091,7 +1136,7 @@ get_constant(struct brw_vs_compile *c, } /* replicate lower four floats into upper half (to get XYZWXYZW) */ - const_reg = stride(const_reg, 0, 4, 0); + const_reg = stride(const_reg, 0, 4, 1); const_reg.subnr = 0; return const_reg; @@ -1104,14 +1149,14 @@ get_reladdr_constant(struct brw_vs_compile *c, { const struct prog_src_register *src = &inst->SrcReg[argIndex]; struct brw_compile *p = &c->func; + struct brw_context *brw = p->brw; + struct intel_context *intel = &brw->intel; struct brw_reg const_reg = c->current_const[argIndex].reg; - struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0]; - struct brw_reg byte_addr_reg = retype(get_tmp(c), BRW_REGISTER_TYPE_D); + struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0]; + uint32_t offset; assert(argIndex < 3); - assert(c->func.brw->intel.gen < 6); /* FINISHME */ - /* Can't reuse a reladdr constant load. */ c->current_const[argIndex].index = -1; @@ -1120,15 +1165,21 @@ get_reladdr_constant(struct brw_vs_compile *c, src->Index, argIndex, c->current_const[argIndex].reg.nr); #endif - brw_MUL(p, byte_addr_reg, addrReg, brw_imm_ud(16)); + if (intel->gen >= 6) { + offset = src->Index; + } else { + struct brw_reg byte_addr_reg = retype(get_tmp(c), BRW_REGISTER_TYPE_D); + brw_MUL(p, byte_addr_reg, addr_reg, brw_imm_d(16)); + addr_reg = byte_addr_reg; + offset = 16 * src->Index; + } /* fetch the first vec4 */ brw_dp_READ_4_vs_relative(p, - const_reg, /* writeback dest */ - byte_addr_reg, /* address register */ - 16 * src->Index, /* byte offset */ - SURF_INDEX_VERT_CONST_BUFFER /* binding table index */ - ); + const_reg, + addr_reg, + offset, + SURF_INDEX_VERT_CONST_BUFFER); return const_reg; } @@ -1375,11 +1426,10 @@ static struct brw_reg get_arg( struct brw_vs_compile *c, GET_SWZ(src->Swizzle, 1), GET_SWZ(src->Swizzle, 2), GET_SWZ(src->Swizzle, 3)); - } - /* Note this is ok for non-swizzle instructions: - */ - reg.negate = src->Negate ? 1 : 0; + /* Note this is ok for non-swizzle ARB_vp instructions */ + reg.negate = src->Negate ? 1 : 0; + } return reg; } @@ -1511,6 +1561,7 @@ static void emit_vertex_write( struct brw_vs_compile *c) int eot; GLuint len_vertex_header = 2; int next_mrf, i; + int msg_len; if (c->key.copy_edgeflag) { brw_MOV(p, @@ -1677,13 +1728,20 @@ static void emit_vertex_write( struct brw_vs_compile *c) eot = (c->first_overflow_output == 0); + msg_len = c->nr_outputs + 2 + len_vertex_header; + if (intel->gen >= 6) { + /* interleaved urb write message length for gen6 should be multiple of 2 */ + if ((msg_len % 2) != 0) + msg_len++; + } + brw_urb_WRITE(p, brw_null_reg(), /* dest */ 0, /* starting mrf reg nr */ c->r0, /* src */ 0, /* allocate */ 1, /* used */ - MIN2(c->nr_outputs + 1 + len_vertex_header, (BRW_MAX_MRF-1)), /* msg len */ + MIN2(msg_len - 1, (BRW_MAX_MRF - 1)), /* msg len */ 0, /* response len */ eot, /* eot */ eot, /* writes complete */ @@ -1892,6 +1950,7 @@ void brw_vs_emit(struct brw_vs_compile *c ) switch (inst->Opcode) { case OPCODE_ABS: + args[0].negate = false; brw_MOV(p, dst, brw_abs(args[0])); break; case OPCODE_ADD: @@ -1928,7 +1987,7 @@ void brw_vs_emit(struct brw_vs_compile *c ) emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL); break; case OPCODE_ARL: - brw_RNDD(p, dst, args[0]); + emit_arl(p, dst, args[0]); break; case OPCODE_FLR: brw_RNDD(p, dst, args[0]); diff --git a/src/mesa/drivers/dri/i965/brw_vs_state.c b/src/mesa/drivers/dri/i965/brw_vs_state.c index be92313861..c3a7cc247c 100644 --- a/src/mesa/drivers/dri/i965/brw_vs_state.c +++ b/src/mesa/drivers/dri/i965/brw_vs_state.c @@ -96,7 +96,14 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key) * and those dwords will be written to the second URB handle when we * brw_urb_WRITE() results. */ - vs.thread1.single_program_flow = 0; + /* Disable single program flow on Ironlake. We cannot reliably get + * all applications working without it. See: + * https://bugs.freedesktop.org/show_bug.cgi?id=29172 + * + * The most notable and reliably failing application is the Humus + * demo "CelShading" + */ + vs.thread1.single_program_flow = (intel->gen == 5); if (intel->gen == 5) vs.thread1.binding_table_entry_count = 0; /* hardware requirement */ diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c index eabac51160..48cf265e51 100644 --- a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c @@ -82,6 +82,15 @@ prepare_vs_constants(struct brw_context *brw) params->ParameterValues[i], 4 * sizeof(float)); } + + if (0) { + for (i = 0; i < params->NumParameters; i++) { + float *row = (float *)brw->vs.const_bo->virtual + i * 4; + printf("vs const surface %3d: %4.3f %4.3f %4.3f %4.3f\n", + i, row[0], row[1], row[2], row[3]); + } + } + drm_intel_gem_bo_unmap_gtt(brw->vs.const_bo); brw->state.dirty.brw |= BRW_NEW_VS_CONSTBUF; } @@ -115,13 +124,11 @@ brw_update_vs_constant_surface( struct gl_context *ctx, * it. */ if (brw->vs.const_bo == NULL) { - drm_intel_bo_unreference(brw->vs.surf_bo[surf]); - brw->vs.surf_bo[surf] = NULL; + brw->vs.surf_offset[surf] = 0; return; } brw_create_constant_surface(brw, brw->vs.const_bo, params->NumParameters, - &brw->vs.surf_bo[surf], &brw->vs.surf_offset[surf]); } @@ -157,11 +164,10 @@ static void upload_vs_surfaces(struct brw_context *brw) /* BRW_NEW_NR_VS_SURFACES */ if (brw->vs.nr_surfaces == 0) { - if (brw->vs.bind_bo) { - drm_intel_bo_unreference(brw->vs.bind_bo); - brw->vs.bind_bo = NULL; + if (brw->vs.bind_bo_offset) { brw->state.dirty.brw |= BRW_NEW_BINDING_TABLE; } + brw->vs.bind_bo_offset = 0; return; } @@ -171,15 +177,11 @@ static void upload_vs_surfaces(struct brw_context *brw) * space for the binding table. (once we have vs samplers) */ bind = brw_state_batch(brw, sizeof(uint32_t) * BRW_VS_MAX_SURF, - 32, &brw->vs.bind_bo, &brw->vs.bind_bo_offset); + 32, &brw->vs.bind_bo_offset); for (i = 0; i < BRW_VS_MAX_SURF; i++) { /* BRW_NEW_VS_CONSTBUF */ - if (brw->vs.surf_bo[i]) { - bind[i] = brw->vs.surf_offset[i]; - } else { - bind[i] = 0; - } + bind[i] = brw->vs.surf_offset[i]; } brw->state.dirty.brw |= BRW_NEW_BINDING_TABLE; diff --git a/src/mesa/drivers/dri/i965/brw_vtbl.c b/src/mesa/drivers/dri/i965/brw_vtbl.c index 3d7a98c981..152ee14156 100644 --- a/src/mesa/drivers/dri/i965/brw_vtbl.c +++ b/src/mesa/drivers/dri/i965/brw_vtbl.c @@ -73,15 +73,11 @@ static void brw_destroy_context( struct intel_context *intel ) free(brw->wm.compile_data); } - for (i = 0; i < brw->state.nr_color_regions; i++) - intel_region_release(&brw->state.color_regions[i]); - brw->state.nr_color_regions = 0; intel_region_release(&brw->state.depth_region); dri_bo_release(&brw->curbe.curbe_bo); dri_bo_release(&brw->vs.prog_bo); dri_bo_release(&brw->vs.state_bo); - dri_bo_release(&brw->vs.bind_bo); dri_bo_release(&brw->vs.const_bo); dri_bo_release(&brw->gs.prog_bo); dri_bo_release(&brw->gs.state_bo); @@ -93,16 +89,12 @@ static void brw_destroy_context( struct intel_context *intel ) dri_bo_release(&brw->sf.vp_bo); for (i = 0; i < BRW_MAX_TEX_UNIT; i++) dri_bo_release(&brw->wm.sdc_bo[i]); - dri_bo_release(&brw->wm.bind_bo); - for (i = 0; i < BRW_WM_MAX_SURF; i++) - dri_bo_release(&brw->wm.surf_bo[i]); dri_bo_release(&brw->wm.sampler_bo); dri_bo_release(&brw->wm.prog_bo); dri_bo_release(&brw->wm.state_bo); dri_bo_release(&brw->wm.const_bo); dri_bo_release(&brw->wm.push_const_bo); dri_bo_release(&brw->cc.prog_bo); - dri_bo_release(&brw->cc.state_bo); dri_bo_release(&brw->cc.vp_bo); dri_bo_release(&brw->cc.blend_state_bo); dri_bo_release(&brw->cc.depth_stencil_state_bo); @@ -122,20 +114,14 @@ static void brw_set_draw_region( struct intel_context *intel, GLuint num_color_regions) { struct brw_context *brw = brw_context(&intel->ctx); - GLuint i; /* release old color/depth regions */ if (brw->state.depth_region != depth_region) brw->state.dirty.brw |= BRW_NEW_DEPTH_BUFFER; - for (i = 0; i < brw->state.nr_color_regions; i++) - intel_region_release(&brw->state.color_regions[i]); intel_region_release(&brw->state.depth_region); /* reference new color/depth regions */ - for (i = 0; i < num_color_regions; i++) - intel_region_reference(&brw->state.color_regions[i], color_regions[i]); intel_region_reference(&brw->state.depth_region, depth_region); - brw->state.nr_color_regions = num_color_regions; } @@ -173,14 +159,7 @@ static void brw_new_batch( struct intel_context *intel ) brw->state.dirty.brw |= ~0; brw->state.dirty.cache |= ~0; - /* Move to the end of the current upload buffer so that we'll force choosing - * a new buffer next time. - */ - if (brw->vb.upload.bo != NULL) { - drm_intel_bo_unreference(brw->vb.upload.bo); - brw->vb.upload.bo = NULL; - brw->vb.upload.offset = 0; - } + brw->vb.nr_current_buffers = 0; } static void brw_invalidate_state( struct intel_context *intel, GLuint new_state ) @@ -203,4 +182,5 @@ void brwInitVtbl( struct brw_context *brw ) brw->intel.vtbl.destroy = brw_destroy_context; brw->intel.vtbl.set_draw_region = brw_set_draw_region; brw->intel.vtbl.debug_batch = brw_debug_batch; + brw->intel.vtbl.render_target_supported = brw_render_target_supported; } diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c index e0aa3fd7f2..ca51d1599a 100644 --- a/src/mesa/drivers/dri/i965/brw_wm.c +++ b/src/mesa/drivers/dri/i965/brw_wm.c @@ -284,6 +284,7 @@ static void brw_wm_populate_key( struct brw_context *brw, /* Build the index for table lookup */ /* _NEW_COLOR */ + key->alpha_test = ctx->Color.AlphaEnabled; if (fp->program.UsesKill || ctx->Color.AlphaEnabled) lookup |= IZ_PS_KILL_ALPHATEST_BIT; @@ -364,8 +365,6 @@ static void brw_wm_populate_key( struct brw_context *brw, SWIZZLE_NIL }; - key->tex_swizzles[i] = SWIZZLE_NOOP; - /* GL_DEPTH_TEXTURE_MODE is normally handled through * brw_wm_surface_state, but it applies to shadow compares as * well and our shadow compares always return the result in @@ -378,6 +377,11 @@ static void brw_wm_populate_key( struct brw_context *brw, swizzles[2] = SWIZZLE_ZERO; } else if (t->DepthMode == GL_LUMINANCE) { swizzles[3] = SWIZZLE_ONE; + } else if (t->DepthMode == GL_RED) { + /* See table 3.23 of the GL 3.0 spec. */ + swizzles[1] = SWIZZLE_ZERO; + swizzles[2] = SWIZZLE_ZERO; + swizzles[3] = SWIZZLE_ONE; } } @@ -427,7 +431,8 @@ static void brw_wm_populate_key( struct brw_context *brw, key->render_to_fbo = ctx->DrawBuffer->Name != 0; } - key->nr_color_regions = brw->state.nr_color_regions; + /* _NEW_BUFFERS */ + key->nr_color_regions = ctx->DrawBuffer->_NumColorDrawBuffers; /* CACHE_NEW_VS_PROG */ key->vp_outputs_written = brw->vs.prog_data->outputs_written; diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h index e7f3cfbb75..90771e1f50 100644 --- a/src/mesa/drivers/dri/i965/brw_wm.h +++ b/src/mesa/drivers/dri/i965/brw_wm.h @@ -64,6 +64,7 @@ struct brw_wm_prog_key { GLuint linear_color:1; /**< linear interpolation vs perspective interp */ GLuint nr_color_regions:5; GLuint render_to_fbo:1; + GLuint alpha_test:1; GLbitfield proj_attrib_mask; /**< one bit per fragment program attribute */ GLuint shadowtex_mask:16; @@ -474,5 +475,6 @@ struct gl_shader *brw_new_shader(struct gl_context *ctx, GLuint name, GLuint typ struct gl_shader_program *brw_new_shader_program(struct gl_context *ctx, GLuint name); bool brw_color_buffer_write_enabled(struct brw_context *brw); +bool brw_render_target_supported(gl_format format); #endif diff --git a/src/mesa/drivers/dri/i965/brw_wm_emit.c b/src/mesa/drivers/dri/i965/brw_wm_emit.c index a0e86034e1..2336e27c1e 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_emit.c +++ b/src/mesa/drivers/dri/i965/brw_wm_emit.c @@ -219,43 +219,45 @@ void emit_wpos_xy(struct brw_wm_compile *c, const struct brw_reg *arg0) { struct brw_compile *p = &c->func; + struct intel_context *intel = &p->brw->intel; + struct brw_reg delta_x = retype(arg0[0], BRW_REGISTER_TYPE_W); + struct brw_reg delta_y = retype(arg0[1], BRW_REGISTER_TYPE_W); if (mask & WRITEMASK_X) { + if (intel->gen >= 6) { + struct brw_reg delta_x_f = retype(delta_x, BRW_REGISTER_TYPE_F); + brw_MOV(p, delta_x_f, delta_x); + delta_x = delta_x_f; + } + if (c->fp->program.PixelCenterInteger) { /* X' = X */ - brw_MOV(p, - dst[0], - retype(arg0[0], BRW_REGISTER_TYPE_W)); + brw_MOV(p, dst[0], delta_x); } else { /* X' = X + 0.5 */ - brw_ADD(p, - dst[0], - retype(arg0[0], BRW_REGISTER_TYPE_W), - brw_imm_f(0.5)); + brw_ADD(p, dst[0], delta_x, brw_imm_f(0.5)); } } if (mask & WRITEMASK_Y) { + if (intel->gen >= 6) { + struct brw_reg delta_y_f = retype(delta_y, BRW_REGISTER_TYPE_F); + brw_MOV(p, delta_y_f, delta_y); + delta_y = delta_y_f; + } + if (c->fp->program.OriginUpperLeft) { if (c->fp->program.PixelCenterInteger) { /* Y' = Y */ - brw_MOV(p, - dst[1], - retype(arg0[1], BRW_REGISTER_TYPE_W)); + brw_MOV(p, dst[1], delta_y); } else { - /* Y' = Y + 0.5 */ - brw_ADD(p, - dst[1], - retype(arg0[1], BRW_REGISTER_TYPE_W), - brw_imm_f(0.5)); + brw_ADD(p, dst[1], delta_y, brw_imm_f(0.5)); } } else { float center_offset = c->fp->program.PixelCenterInteger ? 0.0 : 0.5; /* Y' = (height - 1) - Y + center */ - brw_ADD(p, - dst[1], - negate(retype(arg0[1], BRW_REGISTER_TYPE_W)), + brw_ADD(p, dst[1], negate(delta_y), brw_imm_f(c->key.drawable_height - 1 + center_offset)); } } @@ -971,34 +973,23 @@ void emit_math2(struct brw_wm_compile *c, struct brw_reg temp_dst = dst[dst_chan]; if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0) { - if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) { - /* Both scalar arguments. Do scalar calc. */ - src0.hstride = BRW_HORIZONTAL_STRIDE_1; - src1.hstride = BRW_HORIZONTAL_STRIDE_1; - temp_dst.hstride = BRW_HORIZONTAL_STRIDE_1; - temp_dst.width = BRW_WIDTH_1; - - if (arg0[0].subnr != 0) { - brw_MOV(p, temp_dst, src0); - src0 = temp_dst; - - /* Ouch. We've used the temp as a dst, and we still - * need a temp to store arg1 in, because src and dst - * offsets have to be equal. Leaving this up to - * glsl2-965 to handle correctly. - */ - assert(arg1[0].subnr == 0); - } else if (arg1[0].subnr != 0) { - brw_MOV(p, temp_dst, src1); - src1 = temp_dst; - } - } else { - brw_MOV(p, temp_dst, src0); - src0 = temp_dst; - } - } else if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) { - brw_MOV(p, temp_dst, src1); - src1 = temp_dst; + brw_MOV(p, temp_dst, src0); + src0 = temp_dst; + } + + if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) { + /* This is a heinous hack to get a temporary register for use + * in case both arg0 and arg1 are constants. Why you're + * doing exponentiation on constant values in the shader, we + * don't know. + * + * max_wm_grf is almost surely less than the maximum GRF, and + * gen6 doesn't care about the number of GRFs used in a + * shader like pre-gen6 did. + */ + struct brw_reg temp = brw_vec8_grf(c->max_wm_grf, 0); + brw_MOV(p, temp, src1); + src1 = temp; } brw_set_saturate(p, (mask & SATURATE) ? 1 : 0); @@ -1016,14 +1007,6 @@ void emit_math2(struct brw_wm_compile *c, sechalf(src0), sechalf(src1)); } - - /* Splat a scalar result into all the channels. */ - if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 && - arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) { - temp_dst.hstride = BRW_HORIZONTAL_STRIDE_0; - temp_dst.vstride = BRW_VERTICAL_STRIDE_0; - brw_MOV(p, dst[dst_chan], temp_dst); - } } else { GLuint saturate = ((mask & SATURATE) ? BRW_MATH_SATURATE_SATURATE : @@ -1350,9 +1333,11 @@ static void fire_fb_write( struct brw_wm_compile *c, dst = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW); /* Pass through control information: + * + * Gen6 has done m1 mov in emit_fb_write() for current SIMD16 case. */ /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */ - if (intel->gen < 6) /* gen6, use headerless for fb write */ + if (intel->gen < 6) { brw_push_insn_state(p); brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */ @@ -1373,7 +1358,8 @@ static void fire_fb_write( struct brw_wm_compile *c, target, nr, 0, - eot); + eot, + GL_TRUE); } @@ -1518,7 +1504,8 @@ void emit_fb_write(struct brw_wm_compile *c, */ brw_push_insn_state(p); brw_set_mask_control(p, BRW_MASK_DISABLE); - brw_MOV(p, brw_message_reg(0), brw_vec8_grf(0, 0)); + brw_MOV(p, retype(brw_message_reg(0), BRW_REGISTER_TYPE_UD), + retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); brw_pop_insn_state(p); if (target != 0) { diff --git a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c index fea96d3538..30672b4251 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c @@ -69,12 +69,43 @@ static GLuint translate_wrap_mode( GLenum wrap ) static drm_intel_bo *upload_default_color( struct brw_context *brw, const GLfloat *color ) { - struct brw_sampler_default_color sdc; + struct intel_context *intel = &brw->intel; - COPY_4V(sdc.color, color); - - return brw_cache_data(&brw->cache, BRW_SAMPLER_DEFAULT_COLOR, - &sdc, sizeof(sdc)); + if (intel->gen >= 5) { + struct gen5_sampler_default_color sdc; + + memset(&sdc, 0, sizeof(sdc)); + + UNCLAMPED_FLOAT_TO_UBYTE(sdc.ub[0], color[0]); + UNCLAMPED_FLOAT_TO_UBYTE(sdc.ub[1], color[1]); + UNCLAMPED_FLOAT_TO_UBYTE(sdc.ub[2], color[2]); + UNCLAMPED_FLOAT_TO_UBYTE(sdc.ub[3], color[3]); + + UNCLAMPED_FLOAT_TO_USHORT(sdc.us[0], color[0]); + UNCLAMPED_FLOAT_TO_USHORT(sdc.us[1], color[1]); + UNCLAMPED_FLOAT_TO_USHORT(sdc.us[2], color[2]); + UNCLAMPED_FLOAT_TO_USHORT(sdc.us[3], color[3]); + + UNCLAMPED_FLOAT_TO_SHORT(sdc.s[0], color[0]); + UNCLAMPED_FLOAT_TO_SHORT(sdc.s[1], color[1]); + UNCLAMPED_FLOAT_TO_SHORT(sdc.s[2], color[2]); + UNCLAMPED_FLOAT_TO_SHORT(sdc.s[3], color[3]); + + /* XXX: Fill in half floats */ + /* XXX: Fill in signed bytes */ + + COPY_4V(sdc.f, color); + + return brw_cache_data(&brw->cache, BRW_SAMPLER_DEFAULT_COLOR, + &sdc, sizeof(sdc)); + } else { + struct brw_sampler_default_color sdc; + + COPY_4V(sdc.color, color); + + return brw_cache_data(&brw->cache, BRW_SAMPLER_DEFAULT_COLOR, + &sdc, sizeof(sdc)); + } } @@ -245,9 +276,8 @@ brw_wm_sampler_populate_key(struct brw_context *brw, struct wm_sampler_entry *entry = &key->sampler[unit]; struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit]; struct gl_texture_object *texObj = texUnit->_Current; - struct intel_texture_object *intelObj = intel_texture_object(texObj); struct gl_texture_image *firstImage = - texObj->Image[0][intelObj->firstLevel]; + texObj->Image[0][texObj->BaseLevel]; memset(last_entry_end, 0, (char*)entry - last_entry_end + sizeof(*entry)); diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c index 82835470a3..5b5afc4626 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_state.c @@ -47,7 +47,6 @@ struct brw_wm_unit_key { unsigned int dispatch_grf_start_reg; unsigned int curbe_offset; - unsigned int urb_size; unsigned int nr_surfaces, sampler_count; GLboolean uses_depth, computes_depth, uses_kill, is_glsl; @@ -87,7 +86,6 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key) { struct gl_context *ctx = &brw->intel.ctx; const struct gl_fragment_program *fp = brw->fragment_program; - const struct brw_fragment_program *bfp = (struct brw_fragment_program *) fp; struct intel_context *intel = &brw->intel; memset(key, 0, sizeof(*key)); @@ -99,9 +97,6 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key) key->dispatch_grf_start_reg = brw->wm.prog_data->first_curbe_grf; key->total_scratch = brw->wm.prog_data->total_scratch; - /* BRW_NEW_URB_FENCE */ - key->urb_size = brw->urb.vsize; - /* BRW_NEW_CURBE_OFFSETS */ key->curbe_offset = brw->curbe.wm_start; diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c index 76fc94df1f..e1f8f57a9d 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c @@ -42,7 +42,7 @@ #include "brw_context.h" #include "brw_state.h" #include "brw_defines.h" - +#include "brw_wm.h" static GLuint translate_tex_target( GLenum target ) { @@ -68,104 +68,74 @@ static GLuint translate_tex_target( GLenum target ) } } +static uint32_t brw_format_for_mesa_format[MESA_FORMAT_COUNT] = +{ + [MESA_FORMAT_L8] = BRW_SURFACEFORMAT_L8_UNORM, + [MESA_FORMAT_I8] = BRW_SURFACEFORMAT_I8_UNORM, + [MESA_FORMAT_A8] = BRW_SURFACEFORMAT_A8_UNORM, + [MESA_FORMAT_AL88] = BRW_SURFACEFORMAT_L8A8_UNORM, + [MESA_FORMAT_AL1616] = BRW_SURFACEFORMAT_L16A16_UNORM, + [MESA_FORMAT_R8] = BRW_SURFACEFORMAT_R8_UNORM, + [MESA_FORMAT_R16] = BRW_SURFACEFORMAT_R16_UNORM, + [MESA_FORMAT_RG88] = BRW_SURFACEFORMAT_R8G8_UNORM, + [MESA_FORMAT_RG1616] = BRW_SURFACEFORMAT_R16G16_UNORM, + [MESA_FORMAT_ARGB8888] = BRW_SURFACEFORMAT_B8G8R8A8_UNORM, + [MESA_FORMAT_XRGB8888] = BRW_SURFACEFORMAT_B8G8R8X8_UNORM, + [MESA_FORMAT_RGB565] = BRW_SURFACEFORMAT_B5G6R5_UNORM, + [MESA_FORMAT_ARGB1555] = BRW_SURFACEFORMAT_B5G5R5A1_UNORM, + [MESA_FORMAT_ARGB4444] = BRW_SURFACEFORMAT_B4G4R4A4_UNORM, + [MESA_FORMAT_YCBCR_REV] = BRW_SURFACEFORMAT_YCRCB_NORMAL, + [MESA_FORMAT_YCBCR] = BRW_SURFACEFORMAT_YCRCB_SWAPUVY, + [MESA_FORMAT_RGB_FXT1] = BRW_SURFACEFORMAT_FXT1, + [MESA_FORMAT_RGBA_FXT1] = BRW_SURFACEFORMAT_FXT1, + [MESA_FORMAT_RGB_DXT1] = BRW_SURFACEFORMAT_DXT1_RGB, + [MESA_FORMAT_RGBA_DXT1] = BRW_SURFACEFORMAT_BC1_UNORM, + [MESA_FORMAT_RGBA_DXT3] = BRW_SURFACEFORMAT_BC2_UNORM, + [MESA_FORMAT_RGBA_DXT5] = BRW_SURFACEFORMAT_BC3_UNORM, + [MESA_FORMAT_SRGB_DXT1] = BRW_SURFACEFORMAT_DXT1_RGB_SRGB, + [MESA_FORMAT_SRGBA_DXT1] = BRW_SURFACEFORMAT_BC1_UNORM_SRGB, + [MESA_FORMAT_SRGBA_DXT3] = BRW_SURFACEFORMAT_BC2_UNORM_SRGB, + [MESA_FORMAT_SRGBA_DXT5] = BRW_SURFACEFORMAT_BC3_UNORM_SRGB, + [MESA_FORMAT_SARGB8] = BRW_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB, + [MESA_FORMAT_SLA8] = BRW_SURFACEFORMAT_L8A8_UNORM_SRGB, + [MESA_FORMAT_SL8] = BRW_SURFACEFORMAT_L8_UNORM_SRGB, + [MESA_FORMAT_DUDV8] = BRW_SURFACEFORMAT_R8G8_SNORM, + [MESA_FORMAT_SIGNED_RGBA8888_REV] = BRW_SURFACEFORMAT_R8G8B8A8_SNORM, + [MESA_FORMAT_RGBA8888_REV] = BRW_SURFACEFORMAT_R8G8B8A8_UNORM, +}; + +bool +brw_render_target_supported(gl_format format) +{ + if (format == MESA_FORMAT_S8_Z24 || + format == MESA_FORMAT_X8_Z24 || + format == MESA_FORMAT_Z16) { + return true; + } + + /* Not exactly true, as some of those formats are not renderable. + * But at least we know how to translate them. + */ + return brw_format_for_mesa_format[format] != 0; +} static GLuint translate_tex_format( gl_format mesa_format, GLenum internal_format, - GLenum depth_mode ) + GLenum depth_mode, + GLenum srgb_decode ) { switch( mesa_format ) { - case MESA_FORMAT_L8: - return BRW_SURFACEFORMAT_L8_UNORM; - - case MESA_FORMAT_I8: - return BRW_SURFACEFORMAT_I8_UNORM; - - case MESA_FORMAT_A8: - return BRW_SURFACEFORMAT_A8_UNORM; - - case MESA_FORMAT_AL88: - return BRW_SURFACEFORMAT_L8A8_UNORM; - - case MESA_FORMAT_AL1616: - return BRW_SURFACEFORMAT_L16A16_UNORM; - - case MESA_FORMAT_R8: - return BRW_SURFACEFORMAT_R8_UNORM; - - case MESA_FORMAT_R16: - return BRW_SURFACEFORMAT_R16_UNORM; - - case MESA_FORMAT_RG88: - return BRW_SURFACEFORMAT_R8G8_UNORM; - - case MESA_FORMAT_RG1616: - return BRW_SURFACEFORMAT_R16G16_UNORM; - - case MESA_FORMAT_RGB888: - assert(0); /* not supported for sampling */ - return BRW_SURFACEFORMAT_R8G8B8_UNORM; - - case MESA_FORMAT_ARGB8888: - return BRW_SURFACEFORMAT_B8G8R8A8_UNORM; - - case MESA_FORMAT_XRGB8888: - return BRW_SURFACEFORMAT_B8G8R8X8_UNORM; - - case MESA_FORMAT_RGBA8888_REV: - _mesa_problem(NULL, "unexpected format in i965:translate_tex_format()"); - return BRW_SURFACEFORMAT_R8G8B8A8_UNORM; - - case MESA_FORMAT_RGB565: - return BRW_SURFACEFORMAT_B5G6R5_UNORM; - - case MESA_FORMAT_ARGB1555: - return BRW_SURFACEFORMAT_B5G5R5A1_UNORM; - - case MESA_FORMAT_ARGB4444: - return BRW_SURFACEFORMAT_B4G4R4A4_UNORM; - - case MESA_FORMAT_YCBCR_REV: - return BRW_SURFACEFORMAT_YCRCB_NORMAL; - - case MESA_FORMAT_YCBCR: - return BRW_SURFACEFORMAT_YCRCB_SWAPUVY; - - case MESA_FORMAT_RGB_FXT1: - case MESA_FORMAT_RGBA_FXT1: - return BRW_SURFACEFORMAT_FXT1; case MESA_FORMAT_Z16: if (depth_mode == GL_INTENSITY) return BRW_SURFACEFORMAT_I16_UNORM; else if (depth_mode == GL_ALPHA) return BRW_SURFACEFORMAT_A16_UNORM; + else if (depth_mode == GL_RED) + return BRW_SURFACEFORMAT_R16_UNORM; else return BRW_SURFACEFORMAT_L16_UNORM; - case MESA_FORMAT_RGB_DXT1: - return BRW_SURFACEFORMAT_DXT1_RGB; - - case MESA_FORMAT_RGBA_DXT1: - return BRW_SURFACEFORMAT_BC1_UNORM; - - case MESA_FORMAT_RGBA_DXT3: - return BRW_SURFACEFORMAT_BC2_UNORM; - - case MESA_FORMAT_RGBA_DXT5: - return BRW_SURFACEFORMAT_BC3_UNORM; - - case MESA_FORMAT_SARGB8: - return BRW_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB; - - case MESA_FORMAT_SLA8: - return BRW_SURFACEFORMAT_L8A8_UNORM_SRGB; - - case MESA_FORMAT_SL8: - return BRW_SURFACEFORMAT_L8_UNORM_SRGB; - - case MESA_FORMAT_SRGB_DXT1: - return BRW_SURFACEFORMAT_BC1_UNORM_SRGB; - case MESA_FORMAT_S8_Z24: /* XXX: these different surface formats don't seem to * make any difference for shadow sampler/compares. @@ -174,18 +144,21 @@ static GLuint translate_tex_format( gl_format mesa_format, return BRW_SURFACEFORMAT_I24X8_UNORM; else if (depth_mode == GL_ALPHA) return BRW_SURFACEFORMAT_A24X8_UNORM; + else if (depth_mode == GL_RED) + return BRW_SURFACEFORMAT_R24_UNORM_X8_TYPELESS; else return BRW_SURFACEFORMAT_L24X8_UNORM; - - case MESA_FORMAT_DUDV8: - return BRW_SURFACEFORMAT_R8G8_SNORM; - - case MESA_FORMAT_SIGNED_RGBA8888_REV: - return BRW_SURFACEFORMAT_R8G8B8A8_SNORM; - + + case MESA_FORMAT_SARGB8: + case MESA_FORMAT_SLA8: + case MESA_FORMAT_SL8: + if (srgb_decode == GL_DECODE_EXT) + return brw_format_for_mesa_format[mesa_format]; + else if (srgb_decode == GL_SKIP_DECODE_EXT) + return brw_format_for_mesa_format[_mesa_get_srgb_format_linear(mesa_format)]; default: - assert(0); - return 0; + assert(brw_format_for_mesa_format[mesa_format] != 0); + return brw_format_for_mesa_format[mesa_format]; } } @@ -214,49 +187,45 @@ brw_update_texture_surface( struct gl_context *ctx, GLuint unit ) struct brw_context *brw = brw_context(ctx); struct gl_texture_object *tObj = ctx->Texture.Unit[unit]._Current; struct intel_texture_object *intelObj = intel_texture_object(tObj); - struct gl_texture_image *firstImage = tObj->Image[0][intelObj->firstLevel]; + struct gl_texture_image *firstImage = tObj->Image[0][tObj->BaseLevel]; const GLuint surf_index = SURF_INDEX_TEXTURE(unit); - struct brw_surface_state surf; - void *map; + struct brw_surface_state *surf; - memset(&surf, 0, sizeof(surf)); + surf = brw_state_batch(brw, sizeof(*surf), 32, + &brw->wm.surf_offset[surf_index]); + memset(surf, 0, sizeof(*surf)); - surf.ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW; - surf.ss0.surface_type = translate_tex_target(tObj->Target); - surf.ss0.surface_format = translate_tex_format(firstImage->TexFormat, + surf->ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW; + surf->ss0.surface_type = translate_tex_target(tObj->Target); + surf->ss0.surface_format = translate_tex_format(firstImage->TexFormat, firstImage->InternalFormat, - tObj->DepthMode); + tObj->DepthMode, tObj->sRGBDecode); /* This is ok for all textures with channel width 8bit or less: */ -/* surf.ss0.data_return_format = BRW_SURFACERETURNFORMAT_S1; */ - surf.ss1.base_addr = intelObj->mt->region->buffer->offset; /* reloc */ +/* surf->ss0.data_return_format = BRW_SURFACERETURNFORMAT_S1; */ + surf->ss1.base_addr = intelObj->mt->region->buffer->offset; /* reloc */ - surf.ss2.mip_count = intelObj->lastLevel - intelObj->firstLevel; - surf.ss2.width = firstImage->Width - 1; - surf.ss2.height = firstImage->Height - 1; - brw_set_surface_tiling(&surf, intelObj->mt->region->tiling); - surf.ss3.pitch = (intelObj->mt->region->pitch * intelObj->mt->cpp) - 1; - surf.ss3.depth = firstImage->Depth - 1; + surf->ss2.mip_count = intelObj->_MaxLevel - tObj->BaseLevel; + surf->ss2.width = firstImage->Width - 1; + surf->ss2.height = firstImage->Height - 1; + brw_set_surface_tiling(surf, intelObj->mt->region->tiling); + surf->ss3.pitch = (intelObj->mt->region->pitch * intelObj->mt->cpp) - 1; + surf->ss3.depth = firstImage->Depth - 1; - surf.ss4.min_lod = 0; + surf->ss4.min_lod = 0; if (tObj->Target == GL_TEXTURE_CUBE_MAP) { - surf.ss0.cube_pos_x = 1; - surf.ss0.cube_pos_y = 1; - surf.ss0.cube_pos_z = 1; - surf.ss0.cube_neg_x = 1; - surf.ss0.cube_neg_y = 1; - surf.ss0.cube_neg_z = 1; + surf->ss0.cube_pos_x = 1; + surf->ss0.cube_pos_y = 1; + surf->ss0.cube_pos_z = 1; + surf->ss0.cube_neg_x = 1; + surf->ss0.cube_neg_y = 1; + surf->ss0.cube_neg_z = 1; } - map = brw_state_batch(brw, sizeof(surf), 32, - &brw->wm.surf_bo[surf_index], - &brw->wm.surf_offset[surf_index]); - memcpy(map, &surf, sizeof(surf)); - /* Emit relocation to surface contents */ - drm_intel_bo_emit_reloc(brw->wm.surf_bo[surf_index], + drm_intel_bo_emit_reloc(brw->intel.batch.bo, brw->wm.surf_offset[surf_index] + offsetof(struct brw_surface_state, ss1), intelObj->mt->region->buffer, 0, @@ -271,37 +240,38 @@ void brw_create_constant_surface(struct brw_context *brw, drm_intel_bo *bo, int width, - drm_intel_bo **out_bo, uint32_t *out_offset) { + struct intel_context *intel = &brw->intel; const GLint w = width - 1; - struct brw_surface_state surf; - void *map; + struct brw_surface_state *surf; - memset(&surf, 0, sizeof(surf)); + surf = brw_state_batch(brw, sizeof(*surf), 32, out_offset); + memset(surf, 0, sizeof(*surf)); - surf.ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW; - surf.ss0.surface_type = BRW_SURFACE_BUFFER; - surf.ss0.surface_format = BRW_SURFACEFORMAT_R32G32B32A32_FLOAT; + surf->ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW; + surf->ss0.surface_type = BRW_SURFACE_BUFFER; + surf->ss0.surface_format = BRW_SURFACEFORMAT_R32G32B32A32_FLOAT; - assert(bo); - surf.ss1.base_addr = bo->offset; /* reloc */ + if (intel->gen >= 6) + surf->ss0.render_cache_read_write = 1; - surf.ss2.width = w & 0x7f; /* bits 6:0 of size or width */ - surf.ss2.height = (w >> 7) & 0x1fff; /* bits 19:7 of size or width */ - surf.ss3.depth = (w >> 20) & 0x7f; /* bits 26:20 of size or width */ - surf.ss3.pitch = (width * 16) - 1; /* ignored?? */ - brw_set_surface_tiling(&surf, I915_TILING_NONE); /* tiling now allowed */ + assert(bo); + surf->ss1.base_addr = bo->offset; /* reloc */ - map = brw_state_batch(brw, sizeof(surf), 32, out_bo, out_offset); - memcpy(map, &surf, sizeof(surf)); + surf->ss2.width = w & 0x7f; /* bits 6:0 of size or width */ + surf->ss2.height = (w >> 7) & 0x1fff; /* bits 19:7 of size or width */ + surf->ss3.depth = (w >> 20) & 0x7f; /* bits 26:20 of size or width */ + surf->ss3.pitch = (width * 16) - 1; /* ignored?? */ + brw_set_surface_tiling(surf, I915_TILING_NONE); /* tiling now allowed */ /* Emit relocation to surface contents. Section 5.1.1 of the gen4 * bspec ("Data Cache") says that the data cache does not exist as * a separate cache and is just the sampler cache. */ - drm_intel_bo_emit_reloc(*out_bo, (*out_offset + - offsetof(struct brw_surface_state, ss1)), + drm_intel_bo_emit_reloc(brw->intel.batch.bo, + (*out_offset + + offsetof(struct brw_surface_state, ss1)), bo, 0, I915_GEM_DOMAIN_SAMPLER, 0); } @@ -380,16 +350,14 @@ static void upload_wm_constant_surface(struct brw_context *brw ) * it. */ if (brw->wm.const_bo == 0) { - if (brw->wm.surf_bo[surf] != NULL) { - drm_intel_bo_unreference(brw->wm.surf_bo[surf]); - brw->wm.surf_bo[surf] = NULL; + if (brw->wm.surf_offset[surf]) { brw->state.dirty.brw |= BRW_NEW_WM_SURFACES; + brw->wm.surf_offset[surf] = 0; } return; } brw_create_constant_surface(brw, brw->wm.const_bo, params->NumParameters, - &brw->wm.surf_bo[surf], &brw->wm.surf_offset[surf]); brw->state.dirty.brw |= BRW_NEW_WM_SURFACES; } @@ -404,6 +372,28 @@ const struct brw_tracked_state brw_wm_constant_surface = { .emit = upload_wm_constant_surface, }; +static void +brw_update_null_renderbuffer_surface(struct brw_context *brw, unsigned int unit) +{ + struct intel_context *intel = &brw->intel; + struct brw_surface_state *surf; + + surf = brw_state_batch(brw, sizeof(*surf), 32, + &brw->wm.surf_offset[unit]); + memset(surf, 0, sizeof(*surf)); + + surf->ss0.surface_type = BRW_SURFACE_NULL; + surf->ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM; + + if (intel->gen < 6) { + /* _NEW_COLOR */ + surf->ss0.color_blend = 0; + surf->ss0.writedisable_red = 1; + surf->ss0.writedisable_green = 1; + surf->ss0.writedisable_blue = 1; + surf->ss0.writedisable_alpha = 1; + } +} /** * Sets up a surface state structure to point at the given region. @@ -417,123 +407,57 @@ brw_update_renderbuffer_surface(struct brw_context *brw, { struct intel_context *intel = &brw->intel; struct gl_context *ctx = &intel->ctx; - drm_intel_bo *region_bo = NULL; struct intel_renderbuffer *irb = intel_renderbuffer(rb); - struct intel_region *region = irb ? irb->region : NULL; - struct { - unsigned int surface_type; - unsigned int surface_format; - unsigned int width, height, pitch, cpp; - GLubyte color_mask[4]; - GLboolean color_blend; - uint32_t tiling; - uint32_t draw_x; - uint32_t draw_y; - } key; - struct brw_surface_state surf; - void *map; - - memset(&key, 0, sizeof(key)); - - if (region != NULL) { - region_bo = region->buffer; - - key.surface_type = BRW_SURFACE_2D; - switch (irb->Base.Format) { - /* XRGB and ARGB are treated the same here because the chips in this - * family cannot render to XRGB targets. This means that we have to - * mask writes to alpha (ala glColorMask) and reconfigure the alpha - * blending hardware to use GL_ONE (or GL_ZERO) for cases where - * GL_DST_ALPHA (or GL_ONE_MINUS_DST_ALPHA) is used. - */ - case MESA_FORMAT_ARGB8888: - case MESA_FORMAT_XRGB8888: - key.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM; - break; - case MESA_FORMAT_SARGB8: - key.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB; - break; - case MESA_FORMAT_RGB565: - key.surface_format = BRW_SURFACEFORMAT_B5G6R5_UNORM; - break; - case MESA_FORMAT_ARGB1555: - key.surface_format = BRW_SURFACEFORMAT_B5G5R5A1_UNORM; - break; - case MESA_FORMAT_ARGB4444: - key.surface_format = BRW_SURFACEFORMAT_B4G4R4A4_UNORM; - break; - case MESA_FORMAT_A8: - key.surface_format = BRW_SURFACEFORMAT_A8_UNORM; - break; - case MESA_FORMAT_R8: - key.surface_format = BRW_SURFACEFORMAT_R8_UNORM; - break; - case MESA_FORMAT_R16: - key.surface_format = BRW_SURFACEFORMAT_R16_UNORM; - break; - case MESA_FORMAT_RG88: - key.surface_format = BRW_SURFACEFORMAT_R8G8_UNORM; - break; - case MESA_FORMAT_RG1616: - key.surface_format = BRW_SURFACEFORMAT_R16G16_UNORM; - break; - default: - _mesa_problem(ctx, "Bad renderbuffer format: %d\n", irb->Base.Format); - } - key.tiling = region->tiling; - key.width = rb->Width; - key.height = rb->Height; - key.pitch = region->pitch; - key.cpp = region->cpp; - key.draw_x = region->draw_x; - key.draw_y = region->draw_y; - } else { - key.surface_type = BRW_SURFACE_NULL; - key.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM; - key.tiling = I915_TILING_X; - key.width = 1; - key.height = 1; - key.cpp = 4; - key.draw_x = 0; - key.draw_y = 0; - } + struct intel_region *region = irb->region; + struct brw_surface_state *surf; - if (intel->gen < 6) { - /* _NEW_COLOR */ - memcpy(key.color_mask, ctx->Color.ColorMask[unit], - sizeof(key.color_mask)); + surf = brw_state_batch(brw, sizeof(*surf), 32, + &brw->wm.surf_offset[unit]); + memset(surf, 0, sizeof(*surf)); - /* As mentioned above, disable writes to the alpha component when the - * renderbuffer is XRGB. + switch (irb->Base.Format) { + case MESA_FORMAT_XRGB8888: + /* XRGB is handled as ARGB because the chips in this family + * cannot render to XRGB targets. This means that we have to + * mask writes to alpha (ala glColorMask) and reconfigure the + * alpha blending hardware to use GL_ONE (or GL_ZERO) for + * cases where GL_DST_ALPHA (or GL_ONE_MINUS_DST_ALPHA) is + * used. */ - if (ctx->DrawBuffer->Visual.alphaBits == 0) - key.color_mask[3] = GL_FALSE; - - key.color_blend = (!ctx->Color._LogicOpEnabled && - (ctx->Color.BlendEnabled & (1 << unit))); + surf->ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM; + break; + case MESA_FORMAT_SARGB8: + /* without GL_EXT_framebuffer_sRGB we shouldn't bind sRGB + surfaces to the blend/update as sRGB */ + if (ctx->Color.sRGBEnabled) + surf->ss0.surface_format = brw_format_for_mesa_format[irb->Base.Format]; + else + surf->ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM; + break; + default: + surf->ss0.surface_format = brw_format_for_mesa_format[irb->Base.Format]; + assert(surf->ss0.surface_format != 0); } - memset(&surf, 0, sizeof(surf)); - - surf.ss0.surface_format = key.surface_format; - surf.ss0.surface_type = key.surface_type; - if (key.tiling == I915_TILING_NONE) { - surf.ss1.base_addr = (key.draw_x + key.draw_y * key.pitch) * key.cpp; + surf->ss0.surface_type = BRW_SURFACE_2D; + if (region->tiling == I915_TILING_NONE) { + surf->ss1.base_addr = (region->draw_x + + region->draw_y * region->pitch) * region->cpp; } else { uint32_t tile_base, tile_x, tile_y; - uint32_t pitch = key.pitch * key.cpp; + uint32_t pitch = region->pitch * region->cpp; - if (key.tiling == I915_TILING_X) { - tile_x = key.draw_x % (512 / key.cpp); - tile_y = key.draw_y % 8; - tile_base = ((key.draw_y / 8) * (8 * pitch)); - tile_base += (key.draw_x - tile_x) / (512 / key.cpp) * 4096; + if (region->tiling == I915_TILING_X) { + tile_x = region->draw_x % (512 / region->cpp); + tile_y = region->draw_y % 8; + tile_base = ((region->draw_y / 8) * (8 * pitch)); + tile_base += (region->draw_x - tile_x) / (512 / region->cpp) * 4096; } else { /* Y */ - tile_x = key.draw_x % (128 / key.cpp); - tile_y = key.draw_y % 32; - tile_base = ((key.draw_y / 32) * (32 * pitch)); - tile_base += (key.draw_x - tile_x) / (128 / key.cpp) * 4096; + tile_x = region->draw_x % (128 / region->cpp); + tile_y = region->draw_y % 32; + tile_base = ((region->draw_y / 32) * (32 * pitch)); + tile_base += (region->draw_x - tile_x) / (128 / region->cpp) * 4096; } assert(brw->has_surface_tile_offset || (tile_x == 0 && tile_y == 0)); assert(tile_x % 4 == 0); @@ -541,41 +465,40 @@ brw_update_renderbuffer_surface(struct brw_context *brw, /* Note that the low bits of these fields are missing, so * there's the possibility of getting in trouble. */ - surf.ss1.base_addr = tile_base; - surf.ss5.x_offset = tile_x / 4; - surf.ss5.y_offset = tile_y / 2; + surf->ss1.base_addr = tile_base; + surf->ss5.x_offset = tile_x / 4; + surf->ss5.y_offset = tile_y / 2; } - if (region_bo != NULL) - surf.ss1.base_addr += region_bo->offset; /* reloc */ + surf->ss1.base_addr += region->buffer->offset; /* reloc */ - surf.ss2.width = key.width - 1; - surf.ss2.height = key.height - 1; - brw_set_surface_tiling(&surf, key.tiling); - surf.ss3.pitch = (key.pitch * key.cpp) - 1; + surf->ss2.width = rb->Width - 1; + surf->ss2.height = rb->Height - 1; + brw_set_surface_tiling(surf, region->tiling); + surf->ss3.pitch = (region->pitch * region->cpp) - 1; if (intel->gen < 6) { /* _NEW_COLOR */ - surf.ss0.color_blend = key.color_blend; - surf.ss0.writedisable_red = !key.color_mask[0]; - surf.ss0.writedisable_green = !key.color_mask[1]; - surf.ss0.writedisable_blue = !key.color_mask[2]; - surf.ss0.writedisable_alpha = !key.color_mask[3]; + surf->ss0.color_blend = (!ctx->Color._LogicOpEnabled && + (ctx->Color.BlendEnabled & (1 << unit))); + surf->ss0.writedisable_red = !ctx->Color.ColorMask[unit][0]; + surf->ss0.writedisable_green = !ctx->Color.ColorMask[unit][1]; + surf->ss0.writedisable_blue = !ctx->Color.ColorMask[unit][2]; + /* As mentioned above, disable writes to the alpha component when the + * renderbuffer is XRGB. + */ + if (ctx->DrawBuffer->Visual.alphaBits == 0) + surf->ss0.writedisable_alpha = 1; + else + surf->ss0.writedisable_alpha = !ctx->Color.ColorMask[unit][3]; } - map = brw_state_batch(brw, sizeof(surf), 32, - &brw->wm.surf_bo[unit], - &brw->wm.surf_offset[unit]); - memcpy(map, &surf, sizeof(surf)); - - if (region_bo != NULL) { - drm_intel_bo_emit_reloc(brw->wm.surf_bo[unit], - brw->wm.surf_offset[unit] + - offsetof(struct brw_surface_state, ss1), - region_bo, - surf.ss1.base_addr - region_bo->offset, - I915_GEM_DOMAIN_RENDER, - I915_GEM_DOMAIN_RENDER); - } + drm_intel_bo_emit_reloc(brw->intel.batch.bo, + brw->wm.surf_offset[unit] + + offsetof(struct brw_surface_state, ss1), + region->buffer, + surf->ss1.base_addr - region->buffer->offset, + I915_GEM_DOMAIN_RENDER, + I915_GEM_DOMAIN_RENDER); } static void @@ -591,6 +514,11 @@ prepare_wm_surfaces(struct brw_context *brw) struct intel_renderbuffer *irb = intel_renderbuffer(rb); struct intel_region *region = irb ? irb->region : NULL; + if (region == NULL || region->buffer == NULL) { + brw->intel.Fallback = GL_TRUE; /* boolean, not bitfield */ + return; + } + brw_add_validated_bo(brw, region->buffer); nr_surfaces = SURF_INDEX_DRAW(i) + 1; } @@ -635,12 +563,16 @@ upload_wm_surfaces(struct brw_context *brw) /* Update surfaces for drawing buffers */ if (ctx->DrawBuffer->_NumColorDrawBuffers >= 1) { for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) { - brw_update_renderbuffer_surface(brw, - ctx->DrawBuffer->_ColorDrawBuffers[i], - i); + if (intel_renderbuffer(ctx->DrawBuffer->_ColorDrawBuffers[i])) { + brw_update_renderbuffer_surface(brw, + ctx->DrawBuffer->_ColorDrawBuffers[i], + i); + } else { + brw_update_null_renderbuffer_surface(brw, i); + } } } else { - brw_update_renderbuffer_surface(brw, NULL, 0); + brw_update_null_renderbuffer_surface(brw, 0); } /* Update surfaces for textures */ @@ -652,8 +584,7 @@ upload_wm_surfaces(struct brw_context *brw) if (texUnit->_ReallyEnabled) { brw_update_texture_surface(ctx, i); } else { - drm_intel_bo_unreference(brw->wm.surf_bo[surf]); - brw->wm.surf_bo[surf] = NULL; + brw->wm.surf_offset[surf] = 0; } } @@ -686,16 +617,11 @@ brw_wm_upload_binding_table(struct brw_context *brw) * space for the binding table. */ bind = brw_state_batch(brw, sizeof(uint32_t) * BRW_WM_MAX_SURF, - 32, &brw->wm.bind_bo, &brw->wm.bind_bo_offset); + 32, &brw->wm.bind_bo_offset); for (i = 0; i < BRW_WM_MAX_SURF; i++) { /* BRW_NEW_WM_SURFACES */ bind[i] = brw->wm.surf_offset[i]; - if (brw->wm.surf_bo[i]) { - bind[i] = brw->wm.surf_offset[i]; - } else { - bind[i] = 0; - } } brw->state.dirty.brw |= BRW_NEW_BINDING_TABLE; diff --git a/src/mesa/drivers/dri/i965/gen6_cc.c b/src/mesa/drivers/dri/i965/gen6_cc.c index c2631a7b4d..d1648a102d 100644 --- a/src/mesa/drivers/dri/i965/gen6_cc.c +++ b/src/mesa/drivers/dri/i965/gen6_cc.c @@ -66,12 +66,12 @@ blend_state_populate_key(struct brw_context *brw, /* _NEW_COLOR */ key->color_blend = ctx->Color.BlendEnabled; if (key->color_blend) { - key->blend_eq_rgb = ctx->Color.BlendEquationRGB; - key->blend_eq_a = ctx->Color.BlendEquationA; - key->blend_src_rgb = ctx->Color.BlendSrcRGB; - key->blend_dst_rgb = ctx->Color.BlendDstRGB; - key->blend_src_a = ctx->Color.BlendSrcA; - key->blend_dst_a = ctx->Color.BlendDstA; + key->blend_eq_rgb = ctx->Color.Blend[0].EquationRGB; + key->blend_eq_a = ctx->Color.Blend[0].EquationA; + key->blend_src_rgb = ctx->Color.Blend[0].SrcRGB; + key->blend_dst_rgb = ctx->Color.Blend[0].DstRGB; + key->blend_src_a = ctx->Color.Blend[0].SrcA; + key->blend_dst_a = ctx->Color.Blend[0].DstA; } /* _NEW_COLOR */ @@ -254,14 +254,14 @@ prepare_color_calc_state(struct brw_context *brw) color_calc_state_populate_key(brw, &key); - drm_intel_bo_unreference(brw->cc.state_bo); - brw->cc.state_bo = brw_search_cache(&brw->cache, BRW_COLOR_CALC_STATE, + drm_intel_bo_unreference(brw->cc.color_calc_state_bo); + brw->cc.color_calc_state_bo = brw_search_cache(&brw->cache, BRW_COLOR_CALC_STATE, &key, sizeof(key), NULL, 0, NULL); - if (brw->cc.state_bo == NULL) - brw->cc.state_bo = color_calc_state_create_from_key(brw, &key); + if (brw->cc.color_calc_state_bo == NULL) + brw->cc.color_calc_state_bo = color_calc_state_create_from_key(brw, &key); } const struct brw_tracked_state gen6_color_calc_state = { @@ -278,17 +278,17 @@ static void upload_cc_state_pointers(struct brw_context *brw) struct intel_context *intel = &brw->intel; BEGIN_BATCH(4); - OUT_BATCH(CMD_3D_CC_STATE_POINTERS << 16 | (4 - 2)); + OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (4 - 2)); OUT_RELOC(brw->cc.blend_state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1); OUT_RELOC(brw->cc.depth_stencil_state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1); - OUT_RELOC(brw->cc.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1); + OUT_RELOC(brw->cc.color_calc_state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1); ADVANCE_BATCH(); } static void prepare_cc_state_pointers(struct brw_context *brw) { - brw_add_validated_bo(brw, brw->cc.state_bo); + brw_add_validated_bo(brw, brw->cc.color_calc_state_bo); brw_add_validated_bo(brw, brw->cc.blend_state_bo); brw_add_validated_bo(brw, brw->cc.depth_stencil_state_bo); } diff --git a/src/mesa/drivers/dri/i965/gen6_clip_state.c b/src/mesa/drivers/dri/i965/gen6_clip_state.c index c7c4eb1f27..d6c1f1c893 100644 --- a/src/mesa/drivers/dri/i965/gen6_clip_state.c +++ b/src/mesa/drivers/dri/i965/gen6_clip_state.c @@ -43,7 +43,10 @@ upload_clip_state(struct brw_context *brw) depth_clamp = GEN6_CLIP_Z_TEST; if (ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION) { - provoking = 0; + provoking = + (0 << GEN6_CLIP_TRI_PROVOKE_SHIFT) | + (1 << GEN6_CLIP_TRIFAN_PROVOKE_SHIFT) | + (0 << GEN6_CLIP_LINE_PROVOKE_SHIFT); } else { provoking = (2 << GEN6_CLIP_TRI_PROVOKE_SHIFT) | @@ -55,7 +58,7 @@ upload_clip_state(struct brw_context *brw) userclip = (1 << brw_count_bits(ctx->Transform.ClipPlanesEnabled)) - 1; BEGIN_BATCH(4); - OUT_BATCH(CMD_3D_CLIP_STATE << 16 | (4 - 2)); + OUT_BATCH(_3DSTATE_CLIP << 16 | (4 - 2)); OUT_BATCH(GEN6_CLIP_STATISTICS_ENABLE); OUT_BATCH(GEN6_CLIP_ENABLE | GEN6_CLIP_API_OGL | @@ -65,7 +68,7 @@ upload_clip_state(struct brw_context *brw) depth_clamp | provoking); OUT_BATCH(U_FIXED(0.125, 3) << GEN6_CLIP_MIN_POINT_WIDTH_SHIFT | - U_FIXED(225.875, 3) << GEN6_CLIP_MAX_POINT_WIDTH_SHIFT | + U_FIXED(255.875, 3) << GEN6_CLIP_MAX_POINT_WIDTH_SHIFT | GEN6_CLIP_FORCE_ZERO_RTAINDEX); ADVANCE_BATCH(); } diff --git a/src/mesa/drivers/dri/i965/gen6_gs_state.c b/src/mesa/drivers/dri/i965/gen6_gs_state.c index 6127b9197a..7296c7cd1b 100644 --- a/src/mesa/drivers/dri/i965/gen6_gs_state.c +++ b/src/mesa/drivers/dri/i965/gen6_gs_state.c @@ -37,7 +37,7 @@ upload_gs_state(struct brw_context *brw) /* Disable all the constant buffers. */ BEGIN_BATCH(5); - OUT_BATCH(CMD_3D_CONSTANT_GS_STATE << 16 | (5 - 2)); + OUT_BATCH(_3DSTATE_CONSTANT_GS << 16 | (5 - 2)); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); @@ -46,7 +46,7 @@ upload_gs_state(struct brw_context *brw) if (brw->gs.prog_bo) { BEGIN_BATCH(7); - OUT_BATCH(CMD_3D_GS_STATE << 16 | (7 - 2)); + OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2)); OUT_RELOC(brw->gs.prog_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); OUT_BATCH(GEN6_GS_SPF_MODE | (0 << GEN6_GS_SAMPLER_COUNT_SHIFT) | @@ -62,7 +62,7 @@ upload_gs_state(struct brw_context *brw) ADVANCE_BATCH(); } else { BEGIN_BATCH(7); - OUT_BATCH(CMD_3D_GS_STATE << 16 | (7 - 2)); + OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2)); OUT_BATCH(0); /* prog_bo */ OUT_BATCH((0 << GEN6_GS_SAMPLER_COUNT_SHIFT) | (0 << GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); diff --git a/src/mesa/drivers/dri/i965/gen6_sampler_state.c b/src/mesa/drivers/dri/i965/gen6_sampler_state.c index fc5d391c3c..f65c651bdf 100644 --- a/src/mesa/drivers/dri/i965/gen6_sampler_state.c +++ b/src/mesa/drivers/dri/i965/gen6_sampler_state.c @@ -36,7 +36,7 @@ upload_sampler_state_pointers(struct brw_context *brw) struct intel_context *intel = &brw->intel; BEGIN_BATCH(4); - OUT_BATCH(CMD_3D_SAMPLER_STATE_POINTERS << 16 | + OUT_BATCH(_3DSTATE_SAMPLER_STATE_POINTERS << 16 | VS_SAMPLER_STATE_CHANGE | GS_SAMPLER_STATE_CHANGE | PS_SAMPLER_STATE_CHANGE | diff --git a/src/mesa/drivers/dri/i965/gen6_scissor_state.c b/src/mesa/drivers/dri/i965/gen6_scissor_state.c index b57126c793..12b65826ae 100644 --- a/src/mesa/drivers/dri/i965/gen6_scissor_state.c +++ b/src/mesa/drivers/dri/i965/gen6_scissor_state.c @@ -92,7 +92,7 @@ static void upload_scissor_state_pointers(struct brw_context *brw) struct intel_context *intel = &brw->intel; BEGIN_BATCH(2); - OUT_BATCH(CMD_3D_SCISSOR_STATE_POINTERS << 16 | (2 - 2)); + OUT_BATCH(_3DSTATE_SCISSOR_STATE_POINTERS << 16 | (2 - 2)); OUT_RELOC(brw->sf.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); ADVANCE_BATCH(); diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c index 4cd2d69583..50a5ad38c6 100644 --- a/src/mesa/drivers/dri/i965/gen6_sf_state.c +++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c @@ -33,9 +33,10 @@ #include "intel_batchbuffer.h" static uint32_t -get_attr_override(struct brw_context *brw, int fs_attr) +get_attr_override(struct brw_context *brw, int fs_attr, int two_side_color) { int attr_index = 0, i, vs_attr; + int bfc = 0; if (fs_attr <= FRAG_ATTRIB_TEX7) vs_attr = fs_attr; @@ -53,10 +54,36 @@ get_attr_override(struct brw_context *brw, int fs_attr) * be FRAG_ATTRIB_*. */ for (i = 1; i < vs_attr; i++) { + if (i == VERT_RESULT_PSIZ) + continue; if (brw->vs.prog_data->outputs_written & BITFIELD64_BIT(i)) attr_index++; } + assert(attr_index < 32); + + if (two_side_color) { + if ((brw->vs.prog_data->outputs_written & BITFIELD64_BIT(VERT_RESULT_COL1)) && + (brw->vs.prog_data->outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC1))) { + assert(brw->vs.prog_data->outputs_written & BITFIELD64_BIT(VERT_RESULT_COL0)); + assert(brw->vs.prog_data->outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC0)); + bfc = 2; + } else if ((brw->vs.prog_data->outputs_written & BITFIELD64_BIT(VERT_RESULT_COL0)) && + (brw->vs.prog_data->outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC0))) + bfc = 1; + } + + if (bfc && (fs_attr <= FRAG_ATTRIB_TEX7 && fs_attr > FRAG_ATTRIB_WPOS)) { + if (fs_attr == FRAG_ATTRIB_COL0) + attr_index |= (ATTRIBUTE_SWIZZLE_INPUTATTR_FACING << ATTRIBUTE_SWIZZLE_SHIFT); + else if (fs_attr == FRAG_ATTRIB_COL1 && bfc == 2) { + attr_index++; + attr_index |= (ATTRIBUTE_SWIZZLE_INPUTATTR_FACING << ATTRIBUTE_SWIZZLE_SHIFT); + } else { + attr_index += bfc; + } + } + return attr_index; } @@ -75,6 +102,7 @@ upload_sf_state(struct brw_context *brw) GLboolean render_to_fbo = brw->intel.ctx.DrawBuffer->Name != 0; int attr = 0; int urb_start; + int two_side_color = (ctx->Light.Enabled && ctx->Light.Model.TwoSide); /* _NEW_TRANSFORM */ if (ctx->Transform.ClipPlanesEnabled) @@ -181,7 +209,7 @@ upload_sf_state(struct brw_context *brw) ctx->Point._Attenuated)) dw4 |= GEN6_SF_USE_STATE_POINT_WIDTH; - dw4 |= U_FIXED(CLAMP(ctx->Point.Size, 0.125, 225.875), 3) << + dw4 |= U_FIXED(CLAMP(ctx->Point.Size, 0.125, 255.875), 3) << GEN6_SF_POINT_WIDTH_SHIFT; if (ctx->Point.SpriteOrigin == GL_LOWER_LEFT) dw1 |= GEN6_SF_POINT_SPRITE_LOWERLEFT; @@ -211,7 +239,7 @@ upload_sf_state(struct brw_context *brw) } BEGIN_BATCH(20); - OUT_BATCH(CMD_3D_SF_STATE << 16 | (20 - 2)); + OUT_BATCH(_3DSTATE_SF << 16 | (20 - 2)); OUT_BATCH(dw1); OUT_BATCH(dw2); OUT_BATCH(dw3); @@ -224,7 +252,7 @@ upload_sf_state(struct brw_context *brw) for (; attr < 64; attr++) { if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(attr)) { - attr_overrides |= get_attr_override(brw, attr); + attr_overrides |= get_attr_override(brw, attr, two_side_color); attr++; break; } @@ -232,7 +260,7 @@ upload_sf_state(struct brw_context *brw) for (; attr < 64; attr++) { if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(attr)) { - attr_overrides |= get_attr_override(brw, attr) << 16; + attr_overrides |= get_attr_override(brw, attr, two_side_color) << 16; attr++; break; } diff --git a/src/mesa/drivers/dri/i965/gen6_urb.c b/src/mesa/drivers/dri/i965/gen6_urb.c index de97fd3783..c3819f9b36 100644 --- a/src/mesa/drivers/dri/i965/gen6_urb.c +++ b/src/mesa/drivers/dri/i965/gen6_urb.c @@ -34,19 +34,26 @@ static void prepare_urb( struct brw_context *brw ) { - brw->urb.nr_vs_entries = 24; - if (brw->gs.prog_bo) - brw->urb.nr_gs_entries = 4; - else - brw->urb.nr_gs_entries = 0; + int urb_size, max_urb_entry; + struct intel_context *intel = &brw->intel; + + if (IS_GT1(intel->intelScreen->deviceID)) { + urb_size = 32 * 1024; + max_urb_entry = 128; + } else { + urb_size = 64 * 1024; + max_urb_entry = 256; + } + + brw->urb.nr_vs_entries = max_urb_entry; + brw->urb.nr_gs_entries = max_urb_entry; + /* CACHE_NEW_VS_PROG */ brw->urb.vs_size = MAX2(brw->vs.prog_data->urb_entry_size, 1); - /* Check that the number of URB rows (8 floats each) allocated is less - * than the URB space. - */ - assert((brw->urb.nr_vs_entries + - brw->urb.nr_gs_entries) * brw->urb.vs_size * 8 < 64 * 1024); + if (2 * brw->urb.vs_size > urb_size) + brw->urb.nr_vs_entries = brw->urb.nr_gs_entries = + (urb_size ) / (2 * brw->urb.vs_size); } static void @@ -60,7 +67,7 @@ upload_urb(struct brw_context *brw) assert(!brw->gs.prog_bo || brw->urb.vs_size < 5); BEGIN_BATCH(3); - OUT_BATCH(CMD_URB << 16 | (3 - 2)); + OUT_BATCH(_3DSTATE_URB << 16 | (3 - 2)); OUT_BATCH(((brw->urb.vs_size - 1) << GEN6_URB_VS_SIZE_SHIFT) | ((brw->urb.nr_vs_entries) << GEN6_URB_VS_ENTRIES_SHIFT)); OUT_BATCH(((brw->urb.vs_size - 1) << GEN6_URB_GS_SIZE_SHIFT) | diff --git a/src/mesa/drivers/dri/i965/gen6_viewport_state.c b/src/mesa/drivers/dri/i965/gen6_viewport_state.c index d691bbebc8..cd7d209e3e 100644 --- a/src/mesa/drivers/dri/i965/gen6_viewport_state.c +++ b/src/mesa/drivers/dri/i965/gen6_viewport_state.c @@ -117,7 +117,7 @@ static void upload_viewport_state_pointers(struct brw_context *brw) struct intel_context *intel = &brw->intel; BEGIN_BATCH(4); - OUT_BATCH(CMD_VIEWPORT_STATE_POINTERS << 16 | (4 - 2) | + OUT_BATCH(_3DSTATE_VIEWPORT_STATE_POINTERS << 16 | (4 - 2) | GEN6_CC_VIEWPORT_MODIFY | GEN6_SF_VIEWPORT_MODIFY | GEN6_CLIP_VIEWPORT_MODIFY); diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c b/src/mesa/drivers/dri/i965/gen6_vs_state.c index 4ef9e2e607..ce0b8ea7ea 100644 --- a/src/mesa/drivers/dri/i965/gen6_vs_state.c +++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c @@ -47,7 +47,7 @@ upload_vs_state(struct brw_context *brw) if (brw->vs.prog_data->nr_params == 0 && !ctx->Transform.ClipPlanesEnabled) { /* Disable the push constant buffers. */ BEGIN_BATCH(5); - OUT_BATCH(CMD_3D_CONSTANT_VS_STATE << 16 | (5 - 2)); + OUT_BATCH(_3DSTATE_CONSTANT_VS << 16 | (5 - 2)); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); @@ -112,7 +112,7 @@ upload_vs_state(struct brw_context *brw) assert(param_regs <= 32); BEGIN_BATCH(5); - OUT_BATCH(CMD_3D_CONSTANT_VS_STATE << 16 | + OUT_BATCH(_3DSTATE_CONSTANT_VS << 16 | GEN6_CONSTANT_BUFFER_0_ENABLE | (5 - 2)); OUT_RELOC(constant_bo, @@ -127,15 +127,17 @@ upload_vs_state(struct brw_context *brw) } BEGIN_BATCH(6); - OUT_BATCH(CMD_3D_VS_STATE << 16 | (6 - 2)); + OUT_BATCH(_3DSTATE_VS << 16 | (6 - 2)); OUT_RELOC(brw->vs.prog_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); OUT_BATCH((0 << GEN6_VS_SAMPLER_COUNT_SHIFT) | + GEN6_VS_FLOATING_POINT_MODE_ALT | (brw->vs.nr_surfaces << GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); OUT_BATCH(0); /* scratch space base offset */ OUT_BATCH((1 << GEN6_VS_DISPATCH_START_GRF_SHIFT) | (brw->vs.prog_data->urb_read_length << GEN6_VS_URB_READ_LENGTH_SHIFT) | (0 << GEN6_VS_URB_ENTRY_READ_OFFSET_SHIFT)); - OUT_BATCH((0 << GEN6_VS_MAX_THREADS_SHIFT) | + + OUT_BATCH(((60 - 1) << GEN6_VS_MAX_THREADS_SHIFT) | /* max 60 threads for gen6 */ GEN6_VS_STATISTICS_ENABLE | GEN6_VS_ENABLE); ADVANCE_BATCH(); diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c index d80df4e254..78901ecac5 100644 --- a/src/mesa/drivers/dri/i965/gen6_wm_state.c +++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c @@ -107,7 +107,7 @@ upload_wm_state(struct brw_context *brw) if (brw->wm.prog_data->nr_params == 0) { /* Disable the push constant buffers. */ BEGIN_BATCH(5); - OUT_BATCH(CMD_3D_CONSTANT_PS_STATE << 16 | (5 - 2)); + OUT_BATCH(_3DSTATE_CONSTANT_PS << 16 | (5 - 2)); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); @@ -115,7 +115,7 @@ upload_wm_state(struct brw_context *brw) ADVANCE_BATCH(); } else { BEGIN_BATCH(5); - OUT_BATCH(CMD_3D_CONSTANT_PS_STATE << 16 | + OUT_BATCH(_3DSTATE_CONSTANT_PS << 16 | GEN6_CONSTANT_BUFFER_0_ENABLE | (5 - 2)); OUT_RELOC(brw->wm.push_const_bo, @@ -133,6 +133,9 @@ upload_wm_state(struct brw_context *brw) dw5 |= GEN6_WM_LINE_AA_WIDTH_1_0; dw5 |= GEN6_WM_LINE_END_CAP_AA_WIDTH_0_5; + /* OpenGL non-ieee floating point mode */ + dw2 |= GEN6_WM_FLOATING_POINT_MODE_ALT; + /* BRW_NEW_NR_WM_SURFACES */ dw2 |= brw->wm.nr_surfaces << GEN6_WM_BINDING_TABLE_ENTRY_COUNT_SHIFT; @@ -178,7 +181,7 @@ upload_wm_state(struct brw_context *brw) GEN6_WM_NUM_SF_OUTPUTS_SHIFT; BEGIN_BATCH(9); - OUT_BATCH(CMD_3D_WM_STATE << 16 | (9 - 2)); + OUT_BATCH(_3DSTATE_WM << 16 | (9 - 2)); OUT_RELOC(brw->wm.prog_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); OUT_BATCH(dw2); OUT_BATCH(0); /* scratch space base offset */ |