diff options
| author | Younes Manton <younes.m@gmail.com> | 2008-08-28 23:25:13 -0400 | 
|---|---|---|
| committer | Younes Manton <younes.m@gmail.com> | 2008-08-28 23:25:13 -0400 | 
| commit | 4d9d192672508eaa9b2a70f84e933f11108bf09f (patch) | |
| tree | 36fd68969b6bbd6857c190368be532625c6ab0e2 | |
| parent | 7f100d04ddacf9f6517c9aff1e2de5257eb77fb0 (diff) | |
g3dvl: Buffer the entire frame before rendering.
| -rw-r--r-- | src/gallium/state_trackers/g3dvl/Makefile | 2 | ||||
| -rw-r--r-- | src/gallium/state_trackers/g3dvl/vl_context.c | 4 | ||||
| -rw-r--r-- | src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.c | 45 | ||||
| -rw-r--r-- | src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.h | 4 | ||||
| -rw-r--r-- | src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf.c | 2341 | ||||
| -rw-r--r-- | src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf.h | 18 | ||||
| -rw-r--r-- | src/gallium/state_trackers/g3dvl/vl_render.h | 5 | ||||
| -rw-r--r-- | src/gallium/state_trackers/g3dvl/vl_surface.c | 2 | 
8 files changed, 2400 insertions, 21 deletions
| diff --git a/src/gallium/state_trackers/g3dvl/Makefile b/src/gallium/state_trackers/g3dvl/Makefile index bd77c62bc5..4f7a953484 100644 --- a/src/gallium/state_trackers/g3dvl/Makefile +++ b/src/gallium/state_trackers/g3dvl/Makefile @@ -1,6 +1,6 @@  TARGET		= libg3dvl.a  OBJECTS		= vl_display.o vl_screen.o vl_context.o vl_surface.o vl_shader_build.o vl_util.o vl_basic_csc.o	\ -		  vl_r16snorm_mc.o +		  vl_r16snorm_mc.o vl_r16snorm_mc_buf.o  GALLIUMDIR	= ../..  CFLAGS		+= -g -Wall -fPIC -I${GALLIUMDIR}/include -I${GALLIUMDIR}/auxiliary -I${GALLIUMDIR}/winsys/g3dvl diff --git a/src/gallium/state_trackers/g3dvl/vl_context.c b/src/gallium/state_trackers/g3dvl/vl_context.c index 56d360c05b..fe107e406d 100644 --- a/src/gallium/state_trackers/g3dvl/vl_context.c +++ b/src/gallium/state_trackers/g3dvl/vl_context.c @@ -6,6 +6,7 @@  #include <pipe/p_state.h>  #include "vl_render.h"  #include "vl_r16snorm_mc.h" +#include "vl_r16snorm_mc_buf.h"  #include "vl_csc.h"  #include "vl_basic_csc.h" @@ -126,7 +127,8 @@ int vlCreateContext  	vlInitCommon(ctx); -	vlCreateR16SNormMC(pipe, picture_width, picture_height, picture_format, &ctx->render); +	/*vlCreateR16SNormMC(pipe, picture_width, picture_height, picture_format, &ctx->render);*/ +	vlCreateR16SNormBufferedMC(pipe, picture_width, picture_height, picture_format, &ctx->render);  	vlCreateBasicCSC(pipe, &ctx->csc);  	*context = ctx; diff --git a/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.c b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.c index 80b09a6d1d..3272220ef8 100644 --- a/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.c +++ b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.c @@ -57,7 +57,7 @@ struct vlR16SnormMC  	struct pipe_constant_buffer		vs_const_buf, fs_const_buf;  }; -int vlBegin +static int vlBegin  (  	struct vlRender *render  ) @@ -312,7 +312,7 @@ static int vlGrabBlocks  	return 0;  } -int vlRenderIMacroBlock +static int vlRenderIMacroBlock  (  	struct vlR16SnormMC *mc,  	enum vlPictureType picture_type, @@ -370,13 +370,13 @@ int vlRenderIMacroBlock  	pipe->bind_fs_state(pipe, mc->i_fs);  	pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, 24); -	 +  	mc->cur_buf++;  	return 0;  } -int vlRenderPMacroBlock +static int vlRenderPMacroBlock  (  	struct vlR16SnormMC *mc,  	enum vlPictureType picture_type, @@ -468,13 +468,13 @@ int vlRenderPMacroBlock  	pipe->bind_sampler_states(pipe, 4, (void**)mc->samplers);  	pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, 24); -	 +  	mc->cur_buf++;  	return 0;  } -int vlRenderBMacroBlock +static int vlRenderBMacroBlock  (  	struct vlR16SnormMC *mc,  	enum vlPictureType picture_type, @@ -580,13 +580,13 @@ int vlRenderBMacroBlock  	pipe->bind_sampler_states(pipe, 5, (void**)mc->samplers);  	pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, 24); -	 +  	mc->cur_buf++;  	return 0;  } -int vlRenderMacroBlocksMpeg2R16Snorm +static int vlRenderMacroBlocksMpeg2R16Snorm  (  	struct vlRender *render,  	struct vlMpeg2MacroBlockBatch *batch, @@ -702,7 +702,17 @@ int vlRenderMacroBlocksMpeg2R16Snorm  	return 0;  } -int vlEnd +static int vlEnd +( +	struct vlRender *render +) +{ +	assert(render); + +	return 0; +} + +static int vlFlush  (  	struct vlRender *render  ) @@ -712,7 +722,7 @@ int vlEnd  	return 0;  } -int vlDestroy +static int vlDestroy  (  	struct vlRender *render  ) @@ -765,7 +775,7 @@ int vlDestroy   * Need to be scaled to cover mbW*mbH macroblock pixels and translated into   * position on target surface.   */ -const struct vlVertex2f macroblock_verts[24] = +static const struct vlVertex2f macroblock_verts[24] =  {  	{0.0f, 0.0f}, {0.0f, 0.5f}, {0.5f, 0.0f},  	{0.5f, 0.0f}, {0.0f, 0.5f}, {0.5f, 0.5f}, @@ -785,7 +795,7 @@ const struct vlVertex2f macroblock_verts[24] =   * in a bW*(bH*4) texture. First luma block located at 0,0->bW,bH; second at   * 0,bH->bW,2bH; third at 0,2bH->bW,3bH; fourth at 0,3bH->bW,4bH.   */ -const struct vlVertex2f macroblock_luma_texcoords[24] = +static const struct vlVertex2f macroblock_luma_texcoords[24] =  {  	{0.0f, 0.0f}, {0.0f, 0.25f}, {1.0f, 0.0f},  	{1.0f, 0.0f}, {0.0f, 0.25f}, {1.0f, 0.25f}, @@ -804,7 +814,7 @@ const struct vlVertex2f macroblock_luma_texcoords[24] =   * Represents texcoords for the above for rendering 1 chroma block.   * Straight forward 0,0->1,1 mapping so we can reuse the MB pos vectors.   */ -const struct vlVertex2f *macroblock_chroma_420_texcoords = macroblock_verts; +static const struct vlVertex2f *macroblock_chroma_420_texcoords = macroblock_verts;  /*   * Represents texcoords for the above for rendering 2 chroma blocks arranged @@ -812,13 +822,13 @@ const struct vlVertex2f *macroblock_chroma_420_texcoords = macroblock_verts;   * 0,bH->bW,2bH. We can render this with 0,0->1,1 mapping.   * Straight forward 0,0->1,1 mapping so we can reuse MB pos vectors.   */ -const struct vlVertex2f *macroblock_chroma_422_texcoords = macroblock_verts; +static const struct vlVertex2f *macroblock_chroma_422_texcoords = macroblock_verts;  /*   * Represents texcoords for the above for rendering 4 chroma blocks.   * Same case as 4 luma blocks.   */ -const struct vlVertex2f *macroblock_chroma_444_texcoords = macroblock_luma_texcoords; +static const struct vlVertex2f *macroblock_chroma_444_texcoords = macroblock_luma_texcoords;  /*   * Used when rendering P and B macroblocks, multiplier is applied to the A channel, @@ -826,7 +836,7 @@ const struct vlVertex2f *macroblock_chroma_444_texcoords = macroblock_luma_texco   * get back the differential. The differential is then added to the samples from the   * reference surface(s).   */ -const struct vlFragmentShaderConsts fs_consts = +static const struct vlFragmentShaderConsts fs_consts =  {  	{32767.0f / 255.0f, 32767.0f / 255.0f, 32767.0f / 255.0f, 0.0f},  	{0.5f, 2.0f, 0.0f, 0.0f} @@ -2093,7 +2103,7 @@ static int vlCreateFragmentShaderFieldBMB  	return 0;  } -int vlCreateDataBufs +static int vlCreateDataBufs  (  	struct vlR16SnormMC *mc  ) @@ -2319,6 +2329,7 @@ int vlCreateR16SNormMC  	mc->base.vlBegin = &vlBegin;  	mc->base.vlRenderMacroBlocksMpeg2 = &vlRenderMacroBlocksMpeg2R16Snorm;  	mc->base.vlEnd = &vlEnd; +	mc->base.vlFlush = &vlFlush;  	mc->base.vlDestroy = &vlDestroy;  	mc->pipe = pipe;  	mc->video_width = video_width; diff --git a/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.h b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.h index a6eecf05b6..9842926bf7 100644 --- a/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.h +++ b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.h @@ -1,5 +1,5 @@ -#ifndef vl_mc_h -#define vl_mc_h +#ifndef vl_r16snorm_mc_h +#define vl_r16snorm_mc_h  #include "vl_types.h" diff --git a/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf.c b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf.c new file mode 100644 index 0000000000..fc383cb8f6 --- /dev/null +++ b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf.c @@ -0,0 +1,2341 @@ +#define VL_INTERNAL +#include "vl_r16snorm_mc_buf.h" +#include <assert.h> +#include <stdlib.h> +#include <pipe/p_context.h> +#include <pipe/p_winsys.h> +#include <pipe/p_screen.h> +#include <pipe/p_state.h> +#include <pipe/p_util.h> +#include <pipe/p_inlines.h> +#include <tgsi/tgsi_parse.h> +#include <tgsi/tgsi_build.h> +#include "vl_render.h" +#include "vl_shader_build.h" +#include "vl_surface.h" +#include "vl_util.h" +#include "vl_types.h" +#include "vl_defs.h" + +#define NUM_BUF_SETS 4	/* Number of rotating buffer sets to use */ + +enum vlMacroBlockTypeEx +{ +	vlMacroBlockExTypeIntra, +	vlMacroBlockExTypeFwdPredictedFrame, +	vlMacroBlockExTypeFwdPredictedField, +	vlMacroBlockExTypeBkwdPredictedFrame, +	vlMacroBlockExTypeBkwdPredictedField, +	vlMacroBlockExTypeBiPredictedFrame, +	vlMacroBlockExTypeBiPredictedField, + +	vlNumMacroBlockExTypes +}; + +struct vlVertexShaderConsts +{ +	struct vlVertex4f denorm; +}; + +struct vlFragmentShaderConsts +{ +	struct vlVertex4f multiplier; +	struct vlVertex4f div; +}; + +struct vlR16SnormBufferedMC +{ +	struct vlRender				base; + +	unsigned int				video_width, video_height; +	enum vlFormat				video_format; + +	unsigned int				cur_buf; +	struct vlSurface			*buffered_surface; +	struct vlSurface			*past_surface, *future_surface; +	struct vlVertex2f			surface_tex_inv_size; +	unsigned int				num_macroblocks[vlNumMacroBlockExTypes]; +	unsigned int				total_num_macroblocks; + +	struct pipe_context			*pipe; +	struct pipe_viewport_state		viewport; +	struct pipe_framebuffer_state		render_target; +	struct pipe_sampler_state		*samplers[5]; +	struct pipe_texture			*textures[NUM_BUF_SETS][5]; +	void					*i_vs, *p_vs[2], *b_vs[2]; +	void					*i_fs, *p_fs[2], *b_fs[2]; +	struct pipe_vertex_buffer 		vertex_bufs[NUM_BUF_SETS][vlNumMacroBlockExTypes][3]; +	struct pipe_vertex_element		vertex_elems[5]; +	struct pipe_constant_buffer		vs_const_buf, fs_const_buf; +}; + +static int vlBegin +( +	struct vlRender *render +) +{ +	assert(render); + +	return 0; +} + +static int vlGrabFrameCodedBlock(short *src, short *dst, unsigned int dst_pitch) +{ +	unsigned int y; + +	for (y = 0; y < VL_BLOCK_HEIGHT; ++y) +		memcpy +		( +			dst + y * dst_pitch, +			src + y * VL_BLOCK_WIDTH, +			VL_BLOCK_WIDTH * 2 +		); + +	return 0; +} + +static int vlGrabFieldCodedBlock(short *src, short *dst, unsigned int dst_pitch) +{ +	unsigned int y; + +	for (y = 0; y < VL_BLOCK_HEIGHT / 2; ++y) +		memcpy +		( +			dst + y * dst_pitch * 2, +			src + y * VL_BLOCK_WIDTH, +			VL_BLOCK_WIDTH * 2 +		); + +	dst += VL_BLOCK_HEIGHT * dst_pitch; + +	for (; y < VL_BLOCK_HEIGHT; ++y) +		memcpy +		( +			dst + y * dst_pitch * 2, +			src + y * VL_BLOCK_WIDTH, +			VL_BLOCK_WIDTH * 2 +		); + +	return 0; +} + +static int vlGrabNoBlock(short *dst, unsigned int dst_pitch) +{ +	unsigned int y; + +	for (y = 0; y < VL_BLOCK_HEIGHT; ++y) +		memset +		( +			dst + y * dst_pitch, +			0, +			VL_BLOCK_WIDTH * 2 +		); + +	return 0; +} + +static int vlGrabBlocks +( +	struct vlR16SnormBufferedMC *mc, +	unsigned int mbx, +	unsigned int mby, +	enum vlDCTType dct_type, +	unsigned int coded_block_pattern, +	short *blocks +) +{ +	struct pipe_surface	*tex_surface; +	short			*texels; +	unsigned int		tex_pitch; +	unsigned int		x, y, tb = 0, sb = 0; +	unsigned int		mbpx = mbx * VL_MACROBLOCK_WIDTH, mbpy = mby * VL_MACROBLOCK_HEIGHT; + +	assert(mc); +	assert(blocks); + +	tex_surface = mc->pipe->screen->get_tex_surface +	( +		mc->pipe->screen, +		mc->textures[mc->cur_buf % NUM_BUF_SETS][0], +		0, 0, 0, PIPE_BUFFER_USAGE_CPU_WRITE +	); + +	texels = pipe_surface_map(tex_surface, PIPE_BUFFER_USAGE_CPU_WRITE); +	tex_pitch = tex_surface->stride / tex_surface->block.size; + +	texels += mbpy * tex_pitch + mbpx; + +	for (y = 0; y < 2; ++y) +	{ +		for (x = 0; x < 2; ++x, ++tb) +		{ +			if ((coded_block_pattern >> (5 - tb)) & 1) +			{ +				short *cur_block = blocks + sb * VL_BLOCK_WIDTH * VL_BLOCK_HEIGHT; + +				if (dct_type == vlDCTTypeFrameCoded) +				{ +					vlGrabFrameCodedBlock +					( +						cur_block, +						texels + y * tex_pitch * VL_BLOCK_HEIGHT + x * VL_BLOCK_WIDTH, +						tex_pitch +					); +				} +				else +				{ +					vlGrabFieldCodedBlock +					( +						cur_block, +						texels + y * tex_pitch + x * VL_BLOCK_WIDTH, +						tex_pitch +					); +				} + +				++sb; +			} +			else +				vlGrabNoBlock(texels + y * tex_pitch * VL_BLOCK_HEIGHT + x * VL_BLOCK_WIDTH, tex_pitch); +		} +	} + +	pipe_surface_unmap(tex_surface); + +	/* TODO: Implement 422, 444 */ +	mbpx >>= 1; +	mbpy >>= 1; + +	for (tb = 0; tb < 2; ++tb) +	{ +		tex_surface = mc->pipe->screen->get_tex_surface +		( +			mc->pipe->screen, +			mc->textures[mc->cur_buf % NUM_BUF_SETS][tb + 1], +			0, 0, 0, PIPE_BUFFER_USAGE_CPU_WRITE +		); + +		texels = pipe_surface_map(tex_surface, PIPE_BUFFER_USAGE_CPU_WRITE); +		tex_pitch = tex_surface->stride / tex_surface->block.size; + +		texels += mbpy * tex_pitch + mbpx; + +		if ((coded_block_pattern >> (1 - tb)) & 1) +		{ +			short *cur_block = blocks + sb * VL_BLOCK_WIDTH * VL_BLOCK_HEIGHT; + +			vlGrabFrameCodedBlock +			( +				cur_block, +				texels, +				tex_pitch +			); + +			++sb; +		} +		else +			vlGrabNoBlock(texels, tex_pitch); + +		pipe_surface_unmap(tex_surface); +	} + +	return 0; +} + +#if 0 +static int vlGrabMacroBlock +( +	struct vlR16SnormBufferedMC *mc, +	struct vlMpeg2MacroBlock *macroblock +) +{ +	const struct vlVertex2f	unit = +	{ +		mc->surface_tex_inv_size.x * VL_MACROBLOCK_WIDTH, +		mc->surface_tex_inv_size.y * VL_MACROBLOCK_HEIGHT +	}; +	const struct vlVertex2f half = +	{ +		mc->surface_tex_inv_size.x * (VL_MACROBLOCK_WIDTH / 2), +		mc->surface_tex_inv_size.y * (VL_MACROBLOCK_HEIGHT / 2) +	}; + +	struct vlVertex2f	*vb; +	enum vlMacroBlockTypeEx	mb_type_ex; +	struct vlVertex2f	mo_vec[2]; +	unsigned int		i; + +	assert(mc); +	assert(macroblock); + +	switch (macroblock->mb_type) +	{ +		case vlMacroBlockTypeIntra: +		{ +			mb_type_ex = vlMacroBlockExTypeIntra; +			break; +		} +		case vlMacroBlockTypeFwdPredicted: +		{ +			mb_type_ex = macroblock->mo_type == vlMotionTypeFrame ? +				vlMacroBlockExTypeFwdPredictedFrame : vlMacroBlockExTypeFwdPredictedField; +			break; +		} +		case vlMacroBlockTypeBkwdPredicted: +		{ +			mb_type_ex = macroblock->mo_type == vlMotionTypeFrame ? +				vlMacroBlockExTypeBkwdPredictedFrame : vlMacroBlockExTypeBkwdPredictedField; +			break; +		} +		case vlMacroBlockTypeBiPredicted: +		{ +			mb_type_ex = macroblock->mo_type == vlMotionTypeFrame ? +				vlMacroBlockExTypeBiPredictedFrame : vlMacroBlockExTypeBiPredictedField; +			break; +		} +		default: +			assert(0); +	} + +	switch (macroblock->mb_type) +	{ +		case vlMacroBlockTypeBiPredicted: +		{ +			vb = (struct vlVertex2f*)mc->pipe->winsys->buffer_map +			( +				mc->pipe->winsys, +				mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][mb_type_ex][2].buffer, +				PIPE_BUFFER_USAGE_CPU_WRITE +			) + mc->num_macroblocks[mb_type_ex] * 2 * 24; + +			mo_vec[0].x = macroblock->PMV[0][1][0] * 0.5f * mc->surface_tex_inv_size.x; +			mo_vec[0].y = macroblock->PMV[0][1][1] * 0.5f * mc->surface_tex_inv_size.y; + +			if (macroblock->mo_type == vlMotionTypeFrame) +			{ +				for (i = 0; i < 24 * 2; i += 2) +				{ +					vb[i].x = mo_vec[0].x; +					vb[i].y = mo_vec[0].y; +				} +			} +			else +			{ +				mo_vec[1].x = macroblock->PMV[1][1][0] * 0.5f * mc->surface_tex_inv_size.x; +				mo_vec[1].y = macroblock->PMV[1][1][1] * 0.5f * mc->surface_tex_inv_size.y; + +				for (i = 0; i < 24 * 2; i += 2) +				{ +					vb[i].x = mo_vec[0].x; +					vb[i].y = mo_vec[0].y; +					vb[i + 1].x = mo_vec[1].x; +					vb[i + 1].y = mo_vec[1].y; +				} +			} + +			mc->pipe->winsys->buffer_unmap(mc->pipe->winsys, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][mb_type_ex][2].buffer); + +			/* fall-through */ +		} +		case vlMacroBlockTypeFwdPredicted: +		case vlMacroBlockTypeBkwdPredicted: +		{ +			vb = (struct vlVertex2f*)mc->pipe->winsys->buffer_map +			( +				mc->pipe->winsys, +				mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][mb_type_ex][1].buffer, +				PIPE_BUFFER_USAGE_CPU_WRITE +			) + mc->num_macroblocks[mb_type_ex] * 2 * 24; + +			if (macroblock->mb_type == vlMacroBlockTypeBkwdPredicted) +			{ +				mo_vec[0].x = macroblock->PMV[0][1][0] * 0.5f * mc->surface_tex_inv_size.x; +				mo_vec[0].y = macroblock->PMV[0][1][1] * 0.5f * mc->surface_tex_inv_size.y; + +				if (macroblock->mo_type == vlMotionTypeField) +				{ +					mo_vec[1].x = macroblock->PMV[1][1][0] * 0.5f * mc->surface_tex_inv_size.x; +					mo_vec[1].y = macroblock->PMV[1][1][1] * 0.5f * mc->surface_tex_inv_size.y; +				} +			} +			else +			{ +				mo_vec[0].x = macroblock->PMV[0][0][0] * 0.5f * mc->surface_tex_inv_size.x; +				mo_vec[0].y = macroblock->PMV[0][0][1] * 0.5f * mc->surface_tex_inv_size.y; + +				if (macroblock->mo_type == vlMotionTypeField) +				{ +					mo_vec[1].x = macroblock->PMV[1][0][0] * 0.5f * mc->surface_tex_inv_size.x; +					mo_vec[1].y = macroblock->PMV[1][0][1] * 0.5f * mc->surface_tex_inv_size.y; +				} +			} + +			if (macroblock->mo_type == vlMotionTypeFrame) +			{ +				for (i = 0; i < 24 * 2; i += 2) +				{ +					vb[i].x = mo_vec[0].x; +					vb[i].y = mo_vec[0].y; +				} +			} +			else +			{ +				for (i = 0; i < 24 * 2; i += 2) +				{ +					vb[i].x = mo_vec[0].x; +					vb[i].y = mo_vec[0].y; +					vb[i + 1].x = mo_vec[1].x; +					vb[i + 1].y = mo_vec[1].y; +				} +			} + +			mc->pipe->winsys->buffer_unmap(mc->pipe->winsys, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][mb_type_ex][1].buffer); + +			/* fall-through */ +		} +		case vlMacroBlockTypeIntra: +		{ +			vb = (struct vlVertex2f*)mc->pipe->winsys->buffer_map +			( +				mc->pipe->winsys, +				mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][mb_type_ex][0].buffer, +				PIPE_BUFFER_USAGE_CPU_WRITE +			) + mc->num_macroblocks[mb_type_ex] * 24; + +			vb[0].x = macroblock->mbx * unit.x;		vb[0].y = macroblock->mby * unit.y; +			vb[1].x = macroblock->mbx * unit.x;		vb[1].y = macroblock->mby * unit.y + half.y; +			vb[2].x = macroblock->mbx * unit.x + half.x;	vb[2].y = macroblock->mby * unit.y; + +			vb[3].x = macroblock->mbx * unit.x + half.x;	vb[3].y = macroblock->mby * unit.y; +			vb[4].x = macroblock->mbx * unit.x;		vb[4].y = macroblock->mby * unit.y + half.y; +			vb[5].x = macroblock->mbx * unit.x + half.x;	vb[5].y = macroblock->mby * unit.y + half.y; + +			vb[6].x = macroblock->mbx * unit.x + half.x;	vb[6].y = macroblock->mby * unit.y; +			vb[7].x = macroblock->mbx * unit.x + half.x;	vb[7].y = macroblock->mby * unit.y + half.y; +			vb[8].x = macroblock->mbx * unit.x + unit.x;	vb[8].y = macroblock->mby * unit.y; + +			vb[9].x = macroblock->mbx * unit.x + unit.x;	vb[9].y = macroblock->mby * unit.y; +			vb[10].x = macroblock->mbx * unit.x + half.x;	vb[10].y = macroblock->mby * unit.y + half.y; +			vb[11].x = macroblock->mbx * unit.x + unit.x;	vb[11].y = macroblock->mby * unit.y + half.y; + +			vb[12].x = macroblock->mbx * unit.x;		vb[12].y = macroblock->mby * unit.y + half.y; +			vb[13].x = macroblock->mbx * unit.x;		vb[13].y = macroblock->mby * unit.y + unit.y; +			vb[14].x = macroblock->mbx * unit.x + half.x;	vb[14].y = macroblock->mby * unit.y + half.y; + +			vb[15].x = macroblock->mbx * unit.x + half.x;	vb[15].y = macroblock->mby * unit.y + half.y; +			vb[16].x = macroblock->mbx * unit.x;		vb[16].y = macroblock->mby * unit.y + unit.y; +			vb[17].x = macroblock->mbx * unit.x + half.x;	vb[17].y = macroblock->mby * unit.y + unit.y; + +			vb[18].x = macroblock->mbx * unit.x + half.x;	vb[18].y = macroblock->mby * unit.y + half.y; +			vb[19].x = macroblock->mbx * unit.x + half.x;	vb[19].y = macroblock->mby * unit.y + unit.y; +			vb[20].x = macroblock->mbx * unit.x + unit.x;	vb[20].y = macroblock->mby * unit.y + half.y; + +			vb[21].x = macroblock->mbx * unit.x + unit.x;	vb[21].y = macroblock->mby * unit.y + half.y; +			vb[22].x = macroblock->mbx * unit.x + half.x;	vb[22].y = macroblock->mby * unit.y + unit.y; +			vb[23].x = macroblock->mbx * unit.x + unit.x;	vb[23].y = macroblock->mby * unit.y + unit.y; + +			mc->pipe->winsys->buffer_unmap(mc->pipe->winsys, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][mb_type_ex][0].buffer); + +			break; +		} +		default: +			assert(0); +	} + +	vlGrabBlocks +	( +		mc, +		macroblock->mbx, +		macroblock->mby, +		macroblock->dct_type, +		macroblock->cbp, +		macroblock->blocks +	); + +	mc->num_macroblocks[mb_type_ex]++; +	mc->total_num_macroblocks++; + +	return 0; +} +#else +static int vlGrabMacroBlock +( +	struct vlR16SnormBufferedMC *mc, +	struct vlMpeg2MacroBlock *macroblock +) +{ +	const struct vlVertex2f	unit = +	{ +		mc->surface_tex_inv_size.x * VL_MACROBLOCK_WIDTH, +		mc->surface_tex_inv_size.y * VL_MACROBLOCK_HEIGHT +	}; +	const struct vlVertex2f half = +	{ +		mc->surface_tex_inv_size.x * (VL_MACROBLOCK_WIDTH / 2), +		mc->surface_tex_inv_size.y * (VL_MACROBLOCK_HEIGHT / 2) +	}; + +	struct vlVertex2f	*vb; +	unsigned int		mb_buf_id; +	struct vlVertex2f	mo_vec[2]; +	unsigned int		i; + +	assert(mc); +	assert(macroblock); + +	switch (macroblock->mb_type) +	{ +		case vlMacroBlockTypeIntra: +		{ +			mb_buf_id = vlMacroBlockExTypeIntra; +			break; +		} +		case vlMacroBlockTypeFwdPredicted: +		{ +			mb_buf_id = macroblock->mo_type == vlMotionTypeFrame ? +				vlMacroBlockExTypeFwdPredictedFrame : vlMacroBlockExTypeFwdPredictedField; +			break; +		} +		case vlMacroBlockTypeBkwdPredicted: +		{ +			mb_buf_id = macroblock->mo_type == vlMotionTypeFrame ? +				vlMacroBlockExTypeBkwdPredictedFrame : vlMacroBlockExTypeBkwdPredictedField; +			break; +		} +		case vlMacroBlockTypeBiPredicted: +		{ +			mb_buf_id = macroblock->mo_type == vlMotionTypeFrame ? +				vlMacroBlockExTypeBiPredictedFrame : vlMacroBlockExTypeBiPredictedField; +			break; +		} +		default: +			assert(0); +	} + +	vb = (struct vlVertex2f*)mc->pipe->winsys->buffer_map +	( +		mc->pipe->winsys, +		mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][mb_buf_id][0].buffer, +		PIPE_BUFFER_USAGE_CPU_WRITE +	) + mc->num_macroblocks[mb_buf_id] * 24; + +	vb[0].x = macroblock->mbx * unit.x;		vb[0].y = macroblock->mby * unit.y; +	vb[1].x = macroblock->mbx * unit.x;		vb[1].y = macroblock->mby * unit.y + half.y; +	vb[2].x = macroblock->mbx * unit.x + half.x;	vb[2].y = macroblock->mby * unit.y; + +	vb[3].x = macroblock->mbx * unit.x + half.x;	vb[3].y = macroblock->mby * unit.y; +	vb[4].x = macroblock->mbx * unit.x;		vb[4].y = macroblock->mby * unit.y + half.y; +	vb[5].x = macroblock->mbx * unit.x + half.x;	vb[5].y = macroblock->mby * unit.y + half.y; + +	vb[6].x = macroblock->mbx * unit.x + half.x;	vb[6].y = macroblock->mby * unit.y; +	vb[7].x = macroblock->mbx * unit.x + half.x;	vb[7].y = macroblock->mby * unit.y + half.y; +	vb[8].x = macroblock->mbx * unit.x + unit.x;	vb[8].y = macroblock->mby * unit.y; + +	vb[9].x = macroblock->mbx * unit.x + unit.x;	vb[9].y = macroblock->mby * unit.y; +	vb[10].x = macroblock->mbx * unit.x + half.x;	vb[10].y = macroblock->mby * unit.y + half.y; +	vb[11].x = macroblock->mbx * unit.x + unit.x;	vb[11].y = macroblock->mby * unit.y + half.y; + +	vb[12].x = macroblock->mbx * unit.x;		vb[12].y = macroblock->mby * unit.y + half.y; +	vb[13].x = macroblock->mbx * unit.x;		vb[13].y = macroblock->mby * unit.y + unit.y; +	vb[14].x = macroblock->mbx * unit.x + half.x;	vb[14].y = macroblock->mby * unit.y + half.y; + +	vb[15].x = macroblock->mbx * unit.x + half.x;	vb[15].y = macroblock->mby * unit.y + half.y; +	vb[16].x = macroblock->mbx * unit.x;		vb[16].y = macroblock->mby * unit.y + unit.y; +	vb[17].x = macroblock->mbx * unit.x + half.x;	vb[17].y = macroblock->mby * unit.y + unit.y; + +	vb[18].x = macroblock->mbx * unit.x + half.x;	vb[18].y = macroblock->mby * unit.y + half.y; +	vb[19].x = macroblock->mbx * unit.x + half.x;	vb[19].y = macroblock->mby * unit.y + unit.y; +	vb[20].x = macroblock->mbx * unit.x + unit.x;	vb[20].y = macroblock->mby * unit.y + half.y; + +	vb[21].x = macroblock->mbx * unit.x + unit.x;	vb[21].y = macroblock->mby * unit.y + half.y; +	vb[22].x = macroblock->mbx * unit.x + half.x;	vb[22].y = macroblock->mby * unit.y + unit.y; +	vb[23].x = macroblock->mbx * unit.x + unit.x;	vb[23].y = macroblock->mby * unit.y + unit.y; + +	mc->pipe->winsys->buffer_unmap(mc->pipe->winsys, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][mb_buf_id][0].buffer); + +	if (macroblock->mb_type == vlMacroBlockTypeIntra) +	{ +		vlGrabBlocks +		( +			mc, +			macroblock->mbx, +			macroblock->mby, +			macroblock->dct_type, +			macroblock->cbp, +			macroblock->blocks +		); + +		mc->num_macroblocks[mb_buf_id]++; +		mc->total_num_macroblocks++; +		return 0; +	} + +	vb = (struct vlVertex2f*)mc->pipe->winsys->buffer_map +	( +		mc->pipe->winsys, +		mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][mb_buf_id][1].buffer, +		PIPE_BUFFER_USAGE_CPU_WRITE +	) + mc->num_macroblocks[mb_buf_id] * 2 * 24; + +	if (macroblock->mb_type == vlMacroBlockTypeFwdPredicted || macroblock->mb_type == vlMacroBlockTypeBiPredicted) +	{ +		mo_vec[0].x = macroblock->PMV[0][0][0] * 0.5f * mc->surface_tex_inv_size.x; +		mo_vec[0].y = macroblock->PMV[0][0][1] * 0.5f * mc->surface_tex_inv_size.y; + +		if (macroblock->mo_type == vlMotionTypeField) +		{ +			mo_vec[1].x = macroblock->PMV[1][0][0] * 0.5f * mc->surface_tex_inv_size.x; +			mo_vec[1].y = macroblock->PMV[1][0][1] * 0.5f * mc->surface_tex_inv_size.y; +		} +	} +	else +	{ +		mo_vec[0].x = macroblock->PMV[0][1][0] * 0.5f * mc->surface_tex_inv_size.x; +		mo_vec[0].y = macroblock->PMV[0][1][1] * 0.5f * mc->surface_tex_inv_size.y; + +		if (macroblock->mo_type == vlMotionTypeField) +		{ +			mo_vec[1].x = macroblock->PMV[1][1][0] * 0.5f * mc->surface_tex_inv_size.x; +			mo_vec[1].y = macroblock->PMV[1][1][1] * 0.5f * mc->surface_tex_inv_size.y; +		} +	} + +	if (macroblock->mo_type == vlMotionTypeFrame) +	{ +		for (i = 0; i < 24 * 2; i += 2) +		{ +			vb[i].x = mo_vec[0].x; +			vb[i].y = mo_vec[0].y; +		} +	} +	else +	{ +		for (i = 0; i < 24 * 2; i += 2) +		{ +			vb[i].x = mo_vec[0].x; +			vb[i].y = mo_vec[0].y; +			vb[i + 1].x = mo_vec[1].x; +			vb[i + 1].y = mo_vec[1].y; +		} +	} + +	mc->pipe->winsys->buffer_unmap(mc->pipe->winsys, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][mb_buf_id][1].buffer); + +	if (macroblock->mb_type != vlMacroBlockTypeBiPredicted) +	{ +		vlGrabBlocks +		( +			mc, +			macroblock->mbx, +			macroblock->mby, +			macroblock->dct_type, +			macroblock->cbp, +			macroblock->blocks +		); + +		mc->num_macroblocks[mb_buf_id]++; +		mc->total_num_macroblocks++; +		return 0; +	} + +	vb = (struct vlVertex2f*)mc->pipe->winsys->buffer_map +	( +		mc->pipe->winsys, +		mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][mb_buf_id][2].buffer, +		PIPE_BUFFER_USAGE_CPU_WRITE +	) + mc->num_macroblocks[mb_buf_id] * 2 * 24; + +	mo_vec[0].x = macroblock->PMV[0][1][0] * 0.5f * mc->surface_tex_inv_size.x; +	mo_vec[0].y = macroblock->PMV[0][1][1] * 0.5f * mc->surface_tex_inv_size.y; + +	if (macroblock->mo_type == vlMotionTypeFrame) +	{ +		for (i = 0; i < 24 * 2; i += 2) +		{ +			vb[i].x = mo_vec[0].x; +			vb[i].y = mo_vec[0].y; +		} +	} +	else +	{ +		mo_vec[1].x = macroblock->PMV[1][1][0] * 0.5f * mc->surface_tex_inv_size.x; +		mo_vec[1].y = macroblock->PMV[1][1][1] * 0.5f * mc->surface_tex_inv_size.y; + +		for (i = 0; i < 24 * 2; i += 2) +		{ +			vb[i].x = mo_vec[0].x; +			vb[i].y = mo_vec[0].y; +			vb[i + 1].x = mo_vec[1].x; +			vb[i + 1].y = mo_vec[1].y; +		} +	} + +	mc->pipe->winsys->buffer_unmap(mc->pipe->winsys, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][mb_buf_id][2].buffer); + +	vlGrabBlocks +	( +		mc, +		macroblock->mbx, +		macroblock->mby, +		macroblock->dct_type, +		macroblock->cbp, +		macroblock->blocks +	); + +	mc->num_macroblocks[mb_buf_id]++; +	mc->total_num_macroblocks++; + +	return 0; +} +#endif + +static int vlFlush +( +	struct vlRender *render +) +{ +	struct vlR16SnormBufferedMC	*mc; +	struct pipe_context		*pipe; +	struct vlVertexShaderConsts	*vs_consts; + +	assert(mc); + +	mc = (struct vlR16SnormBufferedMC*)render; +	pipe = mc->pipe; + +	mc->render_target.cbufs[0] = pipe->screen->get_tex_surface +	( +		pipe->screen, +		mc->buffered_surface->texture, +		0, 0, 0, PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE +	); + +	pipe->set_framebuffer_state(pipe, &mc->render_target); +	pipe->set_viewport_state(pipe, &mc->viewport); +	vs_consts = pipe->winsys->buffer_map +	( +		pipe->winsys, +		mc->vs_const_buf.buffer, +		PIPE_BUFFER_USAGE_CPU_WRITE +	); + +	vs_consts->denorm.x = mc->buffered_surface->texture->width[0]; +	vs_consts->denorm.y = mc->buffered_surface->texture->height[0]; + +	pipe->winsys->buffer_unmap(pipe->winsys, mc->vs_const_buf.buffer); +	pipe->set_constant_buffer(pipe, PIPE_SHADER_FRAGMENT, 0, &mc->fs_const_buf); + +	if (mc->num_macroblocks[vlMacroBlockExTypeIntra] > 0) +	{ +		pipe->set_vertex_buffers(pipe, 1, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][vlMacroBlockExTypeIntra]); +		pipe->set_vertex_elements(pipe, 1, mc->vertex_elems); +		pipe->set_sampler_textures(pipe, 3, mc->textures[mc->cur_buf % NUM_BUF_SETS]); +		pipe->bind_sampler_states(pipe, 3, (void**)mc->samplers); +		pipe->bind_vs_state(pipe, mc->i_vs); +		pipe->bind_fs_state(pipe, mc->i_fs); + +		pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, mc->num_macroblocks[vlMacroBlockExTypeIntra] * 24); +	} + +	if (mc->num_macroblocks[vlMacroBlockExTypeFwdPredictedFrame] > 0) +	{ +		pipe->set_vertex_buffers(pipe, 2, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][vlMacroBlockExTypeFwdPredictedFrame]); +		pipe->set_vertex_elements(pipe, 3, mc->vertex_elems); +		mc->textures[mc->cur_buf % NUM_BUF_SETS][3] = mc->past_surface->texture; +		pipe->set_sampler_textures(pipe, 4, mc->textures[mc->cur_buf % NUM_BUF_SETS]); +		pipe->bind_sampler_states(pipe, 4, (void**)mc->samplers); +		pipe->bind_vs_state(pipe, mc->p_vs[0]); +		pipe->bind_fs_state(pipe, mc->p_fs[0]); + +		pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, mc->num_macroblocks[vlMacroBlockExTypeFwdPredictedFrame] * 24); +	} + +	if (mc->num_macroblocks[vlMacroBlockExTypeFwdPredictedField] > 0) +	{ +		pipe->set_vertex_buffers(pipe, 2, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][vlMacroBlockExTypeFwdPredictedField]); +		pipe->set_vertex_elements(pipe, 3, mc->vertex_elems); +		mc->textures[mc->cur_buf % NUM_BUF_SETS][3] = mc->past_surface->texture; +		pipe->set_sampler_textures(pipe, 4, mc->textures[mc->cur_buf % NUM_BUF_SETS]); +		pipe->bind_sampler_states(pipe, 4, (void**)mc->samplers); +		pipe->bind_vs_state(pipe, mc->p_vs[1]); +		pipe->bind_fs_state(pipe, mc->p_fs[1]); + +		pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, mc->num_macroblocks[vlMacroBlockExTypeFwdPredictedField] * 24); +	} + +	if (mc->num_macroblocks[vlMacroBlockExTypeBkwdPredictedFrame] > 0) +	{ +		pipe->set_vertex_buffers(pipe, 2, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][vlMacroBlockExTypeBkwdPredictedFrame]); +		pipe->set_vertex_elements(pipe, 3, mc->vertex_elems); +		mc->textures[mc->cur_buf % NUM_BUF_SETS][3] = mc->future_surface->texture; +		pipe->set_sampler_textures(pipe, 4, mc->textures[mc->cur_buf % NUM_BUF_SETS]); +		pipe->bind_sampler_states(pipe, 4, (void**)mc->samplers); +		pipe->bind_vs_state(pipe, mc->p_vs[0]); +		pipe->bind_fs_state(pipe, mc->p_fs[0]); + +		pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, mc->num_macroblocks[vlMacroBlockExTypeBkwdPredictedFrame] * 24); +	} + +	if (mc->num_macroblocks[vlMacroBlockExTypeBkwdPredictedField] > 0) +	{ +		pipe->set_vertex_buffers(pipe, 2, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][vlMacroBlockExTypeBkwdPredictedField]); +		pipe->set_vertex_elements(pipe, 3, mc->vertex_elems); +		mc->textures[mc->cur_buf % NUM_BUF_SETS][3] = mc->future_surface->texture; +		pipe->set_sampler_textures(pipe, 4, mc->textures[mc->cur_buf % NUM_BUF_SETS]); +		pipe->bind_sampler_states(pipe, 4, (void**)mc->samplers); +		pipe->bind_vs_state(pipe, mc->p_vs[1]); +		pipe->bind_fs_state(pipe, mc->p_fs[1]); + +		pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, mc->num_macroblocks[vlMacroBlockExTypeBkwdPredictedField] * 24); +	} + +	if (mc->num_macroblocks[vlMacroBlockExTypeBiPredictedFrame] > 0) +	{ +		pipe->set_vertex_buffers(pipe, 3, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][vlMacroBlockExTypeBiPredictedFrame]); +		pipe->set_vertex_elements(pipe, 5, mc->vertex_elems); +		mc->textures[mc->cur_buf % NUM_BUF_SETS][3] = mc->past_surface->texture; +		mc->textures[mc->cur_buf % NUM_BUF_SETS][4] = mc->future_surface->texture; +		pipe->set_sampler_textures(pipe, 5, mc->textures[mc->cur_buf % NUM_BUF_SETS]); +		pipe->bind_sampler_states(pipe, 5, (void**)mc->samplers); +		pipe->bind_vs_state(pipe, mc->b_vs[0]); +		pipe->bind_fs_state(pipe, mc->b_fs[0]); + +		pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, mc->num_macroblocks[vlMacroBlockExTypeBiPredictedFrame] * 24); +	} + +	if (mc->num_macroblocks[vlMacroBlockExTypeBiPredictedField] > 0) +	{ +		pipe->set_vertex_buffers(pipe, 3, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][vlMacroBlockExTypeBiPredictedField]); +		pipe->set_vertex_elements(pipe, 5, mc->vertex_elems); +		mc->textures[mc->cur_buf % NUM_BUF_SETS][3] = mc->past_surface->texture; +		mc->textures[mc->cur_buf % NUM_BUF_SETS][4] = mc->future_surface->texture; +		pipe->set_sampler_textures(pipe, 5, mc->textures[mc->cur_buf % NUM_BUF_SETS]); +		pipe->bind_sampler_states(pipe, 5, (void**)mc->samplers); +		pipe->bind_vs_state(pipe, mc->b_vs[1]); +		pipe->bind_fs_state(pipe, mc->b_fs[1]); + +		pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, mc->num_macroblocks[vlMacroBlockExTypeBiPredictedField] * 24); +	} + +	memset(mc->num_macroblocks, 0, sizeof(unsigned int) * 7); +	mc->total_num_macroblocks = 0; + +	return 0; +} + +static int vlRenderMacroBlocksMpeg2R16SnormBuffered +( +	struct vlRender *render, +	struct vlMpeg2MacroBlockBatch *batch, +	struct vlSurface *surface +) +{ +	struct vlR16SnormBufferedMC	*mc; +	unsigned int			i; + +	assert(render); + +	mc = (struct vlR16SnormBufferedMC*)render; + +	if (mc->buffered_surface) +	{ +		if +		( +			mc->buffered_surface != surface /*|| +			mc->past_surface != batch->past_surface || +			mc->future_surface != batch->future_surface*/ +		) +		{ +			vlFlush(&mc->base); +			mc->buffered_surface = surface; +			mc->past_surface = batch->past_surface; +			mc->future_surface = batch->future_surface; +			mc->surface_tex_inv_size.x = 1.0f / surface->texture->width[0]; +			mc->surface_tex_inv_size.y = 1.0f / surface->texture->height[0]; +		} +	} +	else +	{ +		mc->buffered_surface = surface; +		mc->past_surface = batch->past_surface; +		mc->future_surface = batch->future_surface; +		mc->surface_tex_inv_size.x = 1.0f / surface->texture->width[0]; +		mc->surface_tex_inv_size.y = 1.0f / surface->texture->height[0]; +	} + +	for (i = 0; i < batch->num_macroblocks; ++i) +		vlGrabMacroBlock(mc, &batch->macroblocks[i]); + +	return 0; +} + +static int vlEnd +( +	struct vlRender *render +) +{ +	assert(render); + +	return 0; +} + +static int vlDestroy +( +	struct vlRender *render +) +{ +	struct vlR16SnormBufferedMC	*mc; +	struct pipe_context		*pipe; +	unsigned int			g, h, i; + +	assert(render); + +	mc = (struct vlR16SnormBufferedMC*)render; +	pipe = mc->pipe; + +	for (i = 0; i < 5; ++i) +		pipe->delete_sampler_state(pipe, mc->samplers[i]); + +	for (g = 0; g < NUM_BUF_SETS; ++g) +		for (h = 0; h < 7; ++h) +			for (i = 0; i < 3; ++i) +				pipe->winsys->buffer_destroy(pipe->winsys, mc->vertex_bufs[g][h][i].buffer); + +	/* Textures 3 & 4 are not created directly, no need to release them here */ +	for (i = 0; i < NUM_BUF_SETS; ++i) +	{ +		pipe_texture_release(&mc->textures[i][0]); +		pipe_texture_release(&mc->textures[i][1]); +		pipe_texture_release(&mc->textures[i][2]); +	} + +	pipe->delete_vs_state(pipe, mc->i_vs); +	pipe->delete_fs_state(pipe, mc->i_fs); + +	for (i = 0; i < 2; ++i) +	{ +		pipe->delete_vs_state(pipe, mc->p_vs[i]); +		pipe->delete_fs_state(pipe, mc->p_fs[i]); +		pipe->delete_vs_state(pipe, mc->b_vs[i]); +		pipe->delete_fs_state(pipe, mc->b_fs[i]); +	} + +	pipe->winsys->buffer_destroy(pipe->winsys, mc->vs_const_buf.buffer); +	pipe->winsys->buffer_destroy(pipe->winsys, mc->fs_const_buf.buffer); + +	free(mc); + +	return 0; +} + +/* + * Muliplier renormalizes block samples from 16 bits to 12 bits. + * Divider is used when calculating Y % 2 for choosing top or bottom + * field for P or B macroblocks. + * TODO: Use immediates. + */ +static const struct vlFragmentShaderConsts fs_consts = +{ +	{32767.0f / 255.0f, 32767.0f / 255.0f, 32767.0f / 255.0f, 0.0f}, +	{0.5f, 2.0f, 0.0f, 0.0f} +}; + +static int vlCreateVertexShaderIMB +( +	struct vlR16SnormBufferedMC *mc +) +{ +	const unsigned int		max_tokens = 50; + +	struct pipe_context		*pipe; +	struct pipe_shader_state	vs; +	struct tgsi_token		*tokens; +	struct tgsi_header		*header; + +	struct tgsi_full_declaration	decl; +	struct tgsi_full_instruction	inst; + +	unsigned int			ti; +	unsigned int			i; + +	assert(mc); + +	pipe = mc->pipe; +	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token)); + +	/* Version */ +	*(struct tgsi_version*)&tokens[0] = tgsi_build_version(); +	/* Header */ +	header = (struct tgsi_header*)&tokens[1]; +	*header = tgsi_build_header(); +	/* Processor */ +	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header); + +	ti = 3; + +	/* +	 * decl i0		; Vertex pos, luma & chroma texcoords +	 */ +	for (i = 0; i < 3; i++) +	{ +		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); +		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); +	} + +	/* +	 * decl o0		; Vertex pos +	 * decl o1		; Luma/chroma texcoords +	 */ +	for (i = 0; i < 2; i++) +	{ +		decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); +		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); +	} + +	/* +	 * mov o0, i0		; Move input vertex pos to output +	 * mov o1, i0		; Move input luma/chroma texcoords to output +	 */ +	for (i = 0; i < 2; ++i) +	{ +		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, 0); +		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); +	} + +	/* end */ +	inst = vl_end(); +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	vs.tokens = tokens; +	mc->i_vs = pipe->create_vs_state(pipe, &vs); +	free(tokens); + +	return 0; +} + +static int vlCreateFragmentShaderIMB +( +	struct vlR16SnormBufferedMC *mc +) +{ +	const unsigned int		max_tokens = 100; + +	struct pipe_context		*pipe; +	struct pipe_shader_state	fs; +	struct tgsi_token		*tokens; +	struct tgsi_header		*header; + +	struct tgsi_full_declaration	decl; +	struct tgsi_full_instruction	inst; + +	unsigned int			ti; +	unsigned int			i; + +	assert(mc); + +	pipe = mc->pipe; +	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token)); + +	/* Version */ +	*(struct tgsi_version*)&tokens[0] = tgsi_build_version(); +	/* Header */ +	header = (struct tgsi_header*)&tokens[1]; +	*header = tgsi_build_header(); +	/* Processor */ +	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header); + +	ti = 3; + +	/* decl i0			; Luma/chroma texcoords */ +	decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, 1, 0, 0, TGSI_INTERPOLATE_LINEAR); +	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + +	/* decl c0			; Scaling factor, rescales 16-bit snorm to 9-bit snorm */ +	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0); +	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + +	/* decl o0			; Fragment color */ +	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0); +	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + +	/* decl t0, t1 */ +	decl = vl_decl_temps(0, 1); +	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + +	/* +	 * decl s0			; Sampler for luma texture +	 * decl s1			; Sampler for chroma Cb texture +	 * decl s2			; Sampler for chroma Cr texture +	 */ +	for (i = 0; i < 3; ++i) +	{ +		decl = vl_decl_samplers(i, i); +		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header,max_tokens - ti); +	} + +	/* +	 * tex2d t1, i0, s0		; Read texel from luma texture +	 * mov t0.x, t1.x		; Move luma sample into .x component +	 * tex2d t1, i0, s1		; Read texel from chroma Cb texture +	 * mov t0.y, t1.x		; Move Cb sample into .y component +	 * tex2d t1, i0, s2		; Read texel from chroma Cr texture +	 * mov t0.z, t1.x		; Move Cr sample into .z component +	 */ +	for (i = 0; i < 3; ++i) +	{ +		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, 0, TGSI_FILE_SAMPLER, i); +		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1); +		inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; +		inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; +		inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; +		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i; +		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	} + +	/* mul o0, t0, c0		; Rescale texel to correct range */ +	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0); +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	/* end */ +	inst = vl_end(); +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	fs.tokens = tokens; +	mc->i_fs = pipe->create_fs_state(pipe, &fs); +	free(tokens); + +	return 0; +} + +static int vlCreateVertexShaderFramePMB +( +	struct vlR16SnormBufferedMC *mc +) +{ +	const unsigned int		max_tokens = 100; + +	struct pipe_context		*pipe; +	struct pipe_shader_state	vs; +	struct tgsi_token		*tokens; +	struct tgsi_header		*header; + +	struct tgsi_full_declaration	decl; +	struct tgsi_full_instruction	inst; + +	unsigned int			ti; +	unsigned int			i; + +	assert(mc); + +	pipe = mc->pipe; +	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token)); + +	/* Version */ +	*(struct tgsi_version*)&tokens[0] = tgsi_build_version(); +	/* Header */ +	header = (struct tgsi_header*)&tokens[1]; +	*header = tgsi_build_header(); +	/* Processor */ +	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header); + +	ti = 3; + +	/* +	 * decl i0		; Vertex pos, luma/chroma texcoords +	 * decl i1		; Ref surface top field texcoords +	 * decl i2		; Ref surface bottom field texcoords (unused, packed in the same stream) +	 */ +	for (i = 0; i < 3; i++) +	{ +		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); +		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); +	} + +	/* +	 * decl o0		; Vertex pos +	 * decl o1		; Luma/chroma texcoords +	 * decl o2		; Ref macroblock texcoords +	 */ +	for (i = 0; i < 3; i++) +	{ +		decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); +		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); +	} + +	/* +	 * mov o0, i0		; Move input vertex pos to output +	 * mov o1, i0		; Move input luma/chroma texcoords to output +	 */ +	for (i = 0; i < 2; ++i) +	{ +		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, 0); +		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); +	} + +	/* add o2, i0, i1	; Translate vertex pos by motion vec to form ref macroblock texcoords */ +	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 2, TGSI_FILE_INPUT, 0, TGSI_FILE_INPUT, 1); +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	/* end */ +	inst = vl_end(); +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	vs.tokens = tokens; +	mc->p_vs[0] = pipe->create_vs_state(pipe, &vs); +	free(tokens); + +	return 0; +} + +static int vlCreateVertexShaderFieldPMB +( +	struct vlR16SnormBufferedMC *mc +) +{ +	const unsigned int		max_tokens = 100; + +	struct pipe_context		*pipe; +	struct pipe_shader_state	vs; +	struct tgsi_token		*tokens; +	struct tgsi_header		*header; + +	struct tgsi_full_declaration	decl; +	struct tgsi_full_instruction	inst; + +	unsigned int			ti; +	unsigned int			i; + +	assert(mc); + +	pipe = mc->pipe; +	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token)); + +	/* Version */ +	*(struct tgsi_version*)&tokens[0] = tgsi_build_version(); +	/* Header */ +	header = (struct tgsi_header*)&tokens[1]; +	*header = tgsi_build_header(); +	/* Processor */ +	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header); + +	ti = 3; + +	/* +	 * decl i0		; Vertex pos, luma/chroma texcoords +	 * decl i1              ; Ref surface top field texcoords +	 * decl i2              ; Ref surface bottom field texcoords +	 */ +	for (i = 0; i < 3; i++) +	{ +		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); +		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); +	} + +	/* decl c0		; Texcoord denorm coefficients */ +	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0); +	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + +	/* +	 * decl o0		; Vertex pos +	 * decl o1		; Luma/chroma texcoords +	 * decl o2		; Top field ref macroblock texcoords +	 * decl o3		; Bottom field ref macroblock texcoords +	 * decl o4		; Denormalized vertex pos +	 */ +	for (i = 0; i < 5; i++) +	{ +		decl = vl_decl_output((i == 0 || i == 5) ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); +		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); +	} + +	/* +	 * mov o0, i0		; Move input vertex pos to output +	 * mov o1, i0		; Move input luma/chroma texcoords to output +	 */ +	for (i = 0; i < 3; ++i) +	{ +		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i == 0 ? 0 : i - 1); +		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); +	} + +	/* +	 * add o2, i0, i1	; Translate vertex pos by motion vec to form top field macroblock texcoords +	 * add o3, i0, i2	; Translate vertex pos by motion vec to form bottom field macroblock texcoords +	 */ +	for (i = 0; i < 2; ++i) +	{ +		inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, i + 2, TGSI_FILE_INPUT, 0, TGSI_FILE_INPUT, i + 1); +		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); +	} + +	/* mul o4, i0, c0	; Denorm vertex pos */ +	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_OUTPUT, 5, TGSI_FILE_INPUT, 0, TGSI_FILE_CONSTANT, 0); +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	/* end */ +	inst = vl_end(); +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	vs.tokens = tokens; +	mc->p_vs[1] = pipe->create_vs_state(pipe, &vs); +	free(tokens); + +	return 0; +} + +static int vlCreateFragmentShaderFramePMB +( +	struct vlR16SnormBufferedMC *mc +) +{ +	const unsigned int		max_tokens = 100; + +	struct pipe_context		*pipe; +	struct pipe_shader_state	fs; +	struct tgsi_token		*tokens; +	struct tgsi_header		*header; + +	struct tgsi_full_declaration	decl; +	struct tgsi_full_instruction	inst; + +	unsigned int			ti; +	unsigned int			i; + +	assert(mc); + +	pipe = mc->pipe; +	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token)); + +	/* Version */ +	*(struct tgsi_version*)&tokens[0] = tgsi_build_version(); +	/* Header */ +	header = (struct tgsi_header*)&tokens[1]; +	*header = tgsi_build_header(); +	/* Processor */ +	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header); + +	ti = 3; + +	/* +	 * decl i0			; Texcoords for s0, s1, s2 +	 * decl i1			; Texcoords for s3 +	 */ +	for (i = 0; i < 2; ++i) +	{ +		decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR); +		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); +	} + +	/* decl c0			; Scaling factor, rescales 16-bit snorm to 9-bit snorm */ +	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0); +	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + +	/* decl o0			; Fragment color */ +	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0); +	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + +	/* decl t0, t1 */ +	decl = vl_decl_temps(0, 1); +	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + +	/* +	 * decl s0			; Sampler for luma texture +	 * decl s1			; Sampler for chroma Cb texture +	 * decl s2			; Sampler for chroma Cr texture +	 * decl s3			; Sampler for ref surface texture +	 */ +	for (i = 0; i < 4; ++i) +	{ +		decl = vl_decl_samplers(i, i); +		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); +	} + +	/* +	 * tex2d t1, i0, s0		; Read texel from luma texture +	 * mov t0.x, t1.x		; Move luma sample into .x component +	 * tex2d t1, i0, s1		; Read texel from chroma Cb texture +	 * mov t0.y, t1.x		; Move Cb sample into .y component +	 * tex2d t1, i0, s2		; Read texel from chroma Cr texture +	 * mov t0.z, t1.x		; Move Cr sample into .z component +	 */ +	for (i = 0; i < 3; ++i) +	{ +		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, 0, TGSI_FILE_SAMPLER, i); +		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1); +		inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; +		inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; +		inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; +		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i; +		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	} + +	/* mul t0, t0, c0		; Rescale texel to correct range */ +	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0); +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	/* tex2d t1, i1, s3		; Read texel from ref macroblock */ +	inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, 1, TGSI_FILE_SAMPLER, 3); +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	/* add o0, t0, t1		; Add ref and differential to form final output */ +	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1); +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	/* end */ +	inst = vl_end(); +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	fs.tokens = tokens; +	mc->p_fs[0] = pipe->create_fs_state(pipe, &fs); +	free(tokens); + +	return 0; +} + +static int vlCreateFragmentShaderFieldPMB +( +	struct vlR16SnormBufferedMC *mc +) +{ +	const unsigned int		max_tokens = 200; + +	struct pipe_context		*pipe; +	struct pipe_shader_state	fs; +	struct tgsi_token		*tokens; +	struct tgsi_header		*header; + +	struct tgsi_full_declaration	decl; +	struct tgsi_full_instruction	inst; + +	unsigned int			ti; +	unsigned int			i; + +	assert(mc); + +	pipe = mc->pipe; +	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token)); + +	/* Version */ +	*(struct tgsi_version*)&tokens[0] = tgsi_build_version(); +	/* Header */ +	header = (struct tgsi_header*)&tokens[1]; +	*header = tgsi_build_header(); +	/* Processor */ +	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header); + +	ti = 3; + +	/* +	 * decl i0			; Texcoords for s0, s1, s2 +	 * decl i1			; Texcoords for s3 +	 * decl i2			; Texcoords for s3 +	 * decl i3			; Denormalized vertex pos +	 */ +	for (i = 0; i < 4; ++i) +	{ +		decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR); +		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); +	} + +	/* +	 * decl c0			; Scaling factor, rescales 16-bit snorm to 9-bit snorm +	 * decl c1			; Constants 1/2 & 2 in .x, .y channels for Y-mod-2 top/bottom field selection +	 */ +	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1); +	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + +	/* decl o0			; Fragment color */ +	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0); +	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + +	/* decl t0-t4 */ +	decl = vl_decl_temps(0, 4); +	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + +	/* +	 * decl s0			; Sampler for luma texture +	 * decl s1			; Sampler for chroma Cb texture +	 * decl s2			; Sampler for chroma Cr texture +	 * decl s3			; Sampler for ref surface texture +	 */ +	for (i = 0; i < 4; ++i) +	{ +		decl = vl_decl_samplers(i, i); +		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); +	} + +	/* +	 * tex2d t1, i0, s0		; Read texel from luma texture +	 * mov t0.x, t1.x		; Move luma sample into .x component +	 * tex2d t1, i0, s1		; Read texel from chroma Cb texture +	 * mov t0.y, t1.x		; Move Cb sample into .y component +	 * tex2d t1, i0, s2		; Read texel from chroma Cr texture +	 * mov t0.z, t1.x		; Move Cr sample into .z component +	 */ +	for (i = 0; i < 3; ++i) +	{ +		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, 0, TGSI_FILE_SAMPLER, i); +		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1); +		inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; +		inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; +		inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; +		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i; +		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	} + +	/* mul t0, t0, c0		; Rescale texel to correct range */ +	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0); +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	/* +	 * tex2d t1, i1, s3		; Read texel from ref macroblock top field +	 * tex2d t2, i2, s3		; Read texel from ref macroblock bottom field +	 */ +	for (i = 0; i < 2; ++i) +	{ +		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 1, TGSI_FILE_INPUT, i + 1, TGSI_FILE_SAMPLER, 3); +		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); +	} + +	/* XXX: Pos values off by 0.5? */ +	/* sub t4, i3.y, c1.x		; Sub 0.5 from denormalized pos */ +	inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_INPUT, 3, TGSI_FILE_CONSTANT, 1); +	inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y; +	inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y; +	inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y; +	inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y; +	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; +	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; +	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; +	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X; +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	/* mul t3, t4, c1.x		; Multiply pos Y-coord by 1/2 */ +	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_CONSTANT, 1); +	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; +	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; +	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; +	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X; +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	/* floor t3, t3			; Get rid of fractional part */ +	inst = vl_inst2(TGSI_OPCODE_FLOOR, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3); +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	/* mul t3, t3, c1.y		; Multiply by 2 */ +	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_CONSTANT, 1); +	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y; +	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y; +	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y; +	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y; +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	/* sub t3, t4, t3		; Subtract from original Y to get Y % 2 */ +	inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_TEMPORARY, 3); +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	/* TODO: Move to conditional tex fetch on t3 instead of lerp */ +	/* lerp t1, t3, t1, t2		; Choose between top and bottom fields based on Y % 2 */ +	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2); +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	/* add o0, t0, t1		; Add ref and differential to form final output */ +	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1); +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	/* end */ +	inst = vl_end(); +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	fs.tokens = tokens; +	mc->p_fs[1] = pipe->create_fs_state(pipe, &fs); +	free(tokens); + +	return 0; +} + +static int vlCreateVertexShaderFrameBMB +( +	struct vlR16SnormBufferedMC *mc +) +{ +	const unsigned int		max_tokens = 100; + +	struct pipe_context		*pipe; +	struct pipe_shader_state	vs; +	struct tgsi_token		*tokens; +	struct tgsi_header		*header; + +	struct tgsi_full_declaration	decl; +	struct tgsi_full_instruction	inst; + +	unsigned int			ti; +	unsigned int			i; + +	assert(mc); + +	pipe = mc->pipe; +	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token)); + +	/* Version */ +	*(struct tgsi_version*)&tokens[0] = tgsi_build_version(); +	/* Header */ +	header = (struct tgsi_header*)&tokens[1]; +	*header = tgsi_build_header(); +	/* Processor */ +	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header); + +	ti = 3; + +	/* +	 * decl i0		; Vertex pos, luma/chroma texcoords +	 * decl i1              ; First ref surface top field texcoords +	 * decl i2              ; First ref surface bottom field texcoords (unused, packed in the same stream) +	 * decl i3		; Second ref surface top field texcoords +	 * decl i4		; Second ref surface bottom field texcoords (unused, packed in the same stream) +	 */ +	for (i = 0; i < 5; i++) +	{ +		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); +		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); +	} + +	/* +	 * decl o0		; Vertex pos +	 * decl o1		; Luma/chroma texcoords +	 * decl o2		; First ref macroblock texcoords +	 * decl o3		; Second ref macroblock texcoords +	 */ +	for (i = 0; i < 4; i++) +	{ +		decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); +		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); +	} + +	/* +	 * mov o0, i0		; Move input vertex pos to output +	 * mov o1, i0		; Move input luma/chroma texcoords to output +	 */ +	for (i = 0; i < 2; ++i) +	{ +		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, 0); +		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); +	} + +	/* +	 * add o2, i0, i1	; Translate vertex pos by motion vec to form first ref macroblock texcoords +	 * add o3, i0, i3	; Translate vertex pos by motion vec to form second ref macroblock texcoords +	 */ +	for (i = 0; i < 2; ++i) +	{ +		inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, i + 2, TGSI_FILE_INPUT, 0, TGSI_FILE_INPUT, i * 2 + 1); +		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); +	} + +	/* end */ +	inst = vl_end(); +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	vs.tokens = tokens; +	mc->b_vs[0] = pipe->create_vs_state(pipe, &vs); +	free(tokens); + +	return 0; +} + +static int vlCreateVertexShaderFieldBMB +( +	struct vlR16SnormBufferedMC *mc +) +{ +	const unsigned int		max_tokens = 100; + +	struct pipe_context		*pipe; +	struct pipe_shader_state	vs; +	struct tgsi_token		*tokens; +	struct tgsi_header		*header; + +	struct tgsi_full_declaration	decl; +	struct tgsi_full_instruction	inst; + +	unsigned int			ti; +	unsigned int			i; + +	assert(mc); + +	pipe = mc->pipe; +	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token)); + +	/* Version */ +	*(struct tgsi_version*)&tokens[0] = tgsi_build_version(); +	/* Header */ +	header = (struct tgsi_header*)&tokens[1]; +	*header = tgsi_build_header(); +	/* Processor */ +	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header); + +	ti = 3; + +	/* +	 * decl i0		; Vertex pos, Luma/chroma texcoords +	 * decl i1              ; First ref surface top field texcoords +	 * decl i2              ; First ref surface bottom field texcoords +	 * decl i3              ; Second ref surface top field texcoords +	 * decl i4              ; Second ref surface bottom field texcoords +	 */ +	for (i = 0; i < 5; i++) +	{ +		decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); +		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); +	} + +	/* decl c0		; Denorm coefficients */ +	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 6); +	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + +	/* +	 * decl o0		; Vertex pos +	 * decl o1		; Luma/chroma texcoords +	 * decl o2		; Top field past ref macroblock texcoords +	 * decl o3		; Bottom field past ref macroblock texcoords +	 * decl o4		; Top field future ref macroblock texcoords +	 * decl o5		; Bottom field future ref macroblock texcoords +	 * decl o6		; Denormalized vertex pos +	 */ +	for (i = 0; i < 7; i++) +	{ +		decl = vl_decl_output((i == 0 || i == 7) ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); +		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); +	} + +	/* decl t0, t1 */ +	decl = vl_decl_temps(0, 1); +	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + +	/* +	 * mov o0, i0		; Move input vertex pos to output +	 * mov o1, i0		; Move input luma/chroma texcoords to output +	 * mov o2, i1		; Move past top field texcoords to output +	 * mov o3, i2		; Move past bottom field texcoords to output +	 * mov o4, i3		; Move future top field texcoords to output +	 * mov o5, i4		; Move future bottom field texcoords to output +	 */ +	for (i = 0; i < 6; ++i) +	{ +		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, 0); +		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); +	} + +	/* +	 * add o2, i0, i1	; Translate vertex pos by motion vec to form first top field macroblock texcoords +	 * add o3, i0, i2	; Translate vertex pos by motion vec to form first bottom field macroblock texcoords +	 * add o4, i0, i3	; Translate vertex pos by motion vec to form second top field macroblock texcoords +	 * add o5, i0, i4	; Translate vertex pos by motion vec to form second bottom field macroblock texcoords +	 */ +	for (i = 0; i < 4; ++i) +	{ +		inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, i + 2, TGSI_FILE_INPUT, 0, TGSI_FILE_INPUT, i + 1); +		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); +	} + +	/* mul o6, i0, c0	; Denorm vertex pos */ +	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_OUTPUT, 6, TGSI_FILE_INPUT, 0, TGSI_FILE_CONSTANT, 0); +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	/* end */ +	inst = vl_end(); +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	vs.tokens = tokens; +	mc->b_vs[1] = pipe->create_vs_state(pipe, &vs); +	free(tokens); + +	return 0; +} + +static int vlCreateFragmentShaderFrameBMB +( +	struct vlR16SnormBufferedMC *mc +) +{ +	const unsigned int		max_tokens = 100; + +	struct pipe_context		*pipe; +	struct pipe_shader_state	fs; +	struct tgsi_token		*tokens; +	struct tgsi_header		*header; + +	struct tgsi_full_declaration	decl; +	struct tgsi_full_instruction	inst; + +	unsigned int			ti; +	unsigned int			i; + +	assert(mc); + +	pipe = mc->pipe; +	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token)); + +	/* Version */ +	*(struct tgsi_version*)&tokens[0] = tgsi_build_version(); +	/* Header */ +	header = (struct tgsi_header*)&tokens[1]; +	*header = tgsi_build_header(); +	/* Processor */ +	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header); + +	ti = 3; + +	/* +	 * decl i0			; Texcoords for s0, s1, s2 +	 * decl i1			; Texcoords for s3 +	 * decl i2			; Texcoords for s4 +	 */ +	for (i = 0; i < 3; ++i) +	{ +		decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR); +		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); +	} + +	/* +	 * decl c0			; Scaling factor, rescales 16-bit snorm to 9-bit snorm +	 * decl c1			; Constant 1/2 in .x channel to use as weight to blend past and future texels +	 */ +	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1); +	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + +	/* decl o0			; Fragment color */ +	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0); +	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + +	/* decl t0-t2 */ +	decl = vl_decl_temps(0, 2); +	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + +	/* +	 * decl s0			; Sampler for luma texture +	 * decl s1			; Sampler for chroma Cb texture +	 * decl s2			; Sampler for chroma Cr texture +	 * decl s3			; Sampler for past ref surface texture +	 * decl s4			; Sampler for future ref surface texture +	 */ +	for (i = 0; i < 5; ++i) +	{ +		decl = vl_decl_samplers(i, i); +		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); +	} + +	/* +	 * tex2d t1, i0, s0		; Read texel from luma texture +	 * mov t0.x, t1.x		; Move luma sample into .x component +	 * tex2d t1, i0, s1		; Read texel from chroma Cb texture +	 * mov t0.y, t1.x		; Move Cb sample into .y component +	 * tex2d t1, i0, s2		; Read texel from chroma Cr texture +	 * mov t0.z, t1.x		; Move Cr sample into .z component +	 */ +	for (i = 0; i < 3; ++i) +	{ +		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, 0, TGSI_FILE_SAMPLER, i); +		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1); +		inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; +		inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; +		inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; +		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i; +		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	} + +	/* mul t0, t0, c0		; Rescale texel to correct range */ +	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0); +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	/* +	 * tex2d t1, i1, s3		; Read texel from past ref macroblock +	 * tex2d t2, i2, s4		; Read texel from future ref macroblock +	 */ +	for (i = 0; i < 2; ++i) +	{ +		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 1, TGSI_FILE_INPUT, i + 1, TGSI_FILE_SAMPLER, i + 3); +		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); +	} + +	/* lerp t1, c1.x, t1, t2	; Blend past and future texels */ +	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 1, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2); +	inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; +	inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; +	inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; +	inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_X; +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	/* add o0, t0, t1		; Add past/future ref and differential to form final output */ +	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1); +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	/* end */ +	inst = vl_end(); +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	fs.tokens = tokens; +	mc->b_fs[0] = pipe->create_fs_state(pipe, &fs); +	free(tokens); + +	return 0; +} + +static int vlCreateFragmentShaderFieldBMB +( +	struct vlR16SnormBufferedMC *mc +) +{ +	const unsigned int		max_tokens = 200; + +	struct pipe_context		*pipe; +	struct pipe_shader_state	fs; +	struct tgsi_token		*tokens; +	struct tgsi_header		*header; + +	struct tgsi_full_declaration	decl; +	struct tgsi_full_instruction	inst; + +	unsigned int			ti; +	unsigned int			i; + +	assert(mc); + +	pipe = mc->pipe; +	tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token)); + +	/* Version */ +	*(struct tgsi_version*)&tokens[0] = tgsi_build_version(); +	/* Header */ +	header = (struct tgsi_header*)&tokens[1]; +	*header = tgsi_build_header(); +	/* Processor */ +	*(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header); + +	ti = 3; + +	/* +	 * decl i0			; Texcoords for s0, s1, s2 +	 * decl i1			; Texcoords for s3 +	 * decl i2			; Texcoords for s3 +	 * decl i3			; Texcoords for s4 +	 * decl i4			; Texcoords for s4 +	 * decl i5			; Denormalized vertex pos +	 */ +	for (i = 0; i < 6; ++i) +	{ +		decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR); +		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); +	} + +	/* +	 * decl c0			; Scaling factor, rescales 16-bit snorm to 9-bit snorm +	 * decl c1			; Constants 1/2 & 2 in .x, .y channels to use as weight to blend past and future texels +	 *				; and for Y-mod-2 top/bottom field selection +	 */ +	decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1); +	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + +	/* decl o0			; Fragment color */ +	decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0); +	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + +	/* decl t0-t5 */ +	decl = vl_decl_temps(0, 5); +	ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + +	/* +	 * decl s0			; Sampler for luma texture +	 * decl s1			; Sampler for chroma Cb texture +	 * decl s2			; Sampler for chroma Cr texture +	 * decl s3			; Sampler for past ref surface texture +	 * decl s4			; Sampler for future ref surface texture +	 */ +	for (i = 0; i < 5; ++i) +	{ +		decl = vl_decl_samplers(i, i); +		ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); +	} + +	/* +	 * tex2d t1, i0, s0		; Read texel from luma texture +	 * mov t0.x, t1.x		; Move luma sample into .x component +	 * tex2d t1, i0, s1		; Read texel from chroma Cb texture +	 * mov t0.y, t1.x		; Move Cb sample into .y component +	 * tex2d t1, i0, s2		; Read texel from chroma Cr texture +	 * mov t0.z, t1.x		; Move Cr sample into .z component +	 */ +	for (i = 0; i < 3; ++i) +	{ +		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, 0, TGSI_FILE_SAMPLER, i); +		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +		inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1); +		inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; +		inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; +		inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; +		inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i; +		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	} + +	/* mul t0, t0, c0		; Rescale texel to correct range */ +	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0); +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	/* XXX: Pos values off by 0.5? */ +	/* sub t4, i5.y, c1.x		; Sub 0.5 from denormalized pos */ +	inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_INPUT, 5, TGSI_FILE_CONSTANT, 1); +	inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y; +	inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y; +	inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y; +	inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y; +	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; +	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; +	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; +	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X; +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	/* mul t3, t4, c1.x		; Multiply pos Y-coord by 1/2 */ +	inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_CONSTANT, 1); +	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; +	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; +	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; +	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X; +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	/* floor t3, t3			; Get rid of fractional part */ +	inst = vl_inst2(TGSI_OPCODE_FLOOR, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3); +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	/* mul t3, t3, c1.y		; Multiply by 2 */ +	inst = vl_inst3( TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_CONSTANT, 1); +	inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y; +	inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y; +	inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y; +	inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y; +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	/* sub t3, t4, t3		; Subtract from original Y to get Y % 2 */ +	inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_TEMPORARY, 3); +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	/* +	 * tex2d t1, i1, s3		; Read texel from past ref macroblock top field +	 * tex2d t2, i2, s3		; Read texel from past ref macroblock bottom field +	 */ +	for (i = 0; i < 2; ++i) +	{ +		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 1, TGSI_FILE_INPUT, i + 1, TGSI_FILE_SAMPLER, 3); +		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); +	} + +	/* TODO: Move to conditional tex fetch on t3 instead of lerp */ +	/* lerp t1, t3, t1, t2		; Choose between top and bottom fields based on Y % 2 */ +	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2); +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	/* +	 * tex2d t4, i3, s4		; Read texel from future ref macroblock top field +	 * tex2d t5, i4, s4		; Read texel from future ref macroblock bottom field +	 */ +	for (i = 0; i < 2; ++i) +	{ +		inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 4, TGSI_FILE_INPUT, i + 3, TGSI_FILE_SAMPLER, 4); +		ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); +	} + +	/* TODO: Move to conditional tex fetch on t3 instead of lerp */ +	/* lerp t2, t3, t4, t5		; Choose between top and bottom fields based on Y % 2 */ +	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 2, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_TEMPORARY, 5); +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	/* lerp t1, c1.x, t1, t2	; Blend past and future texels */ +	inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 1, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2); +	inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; +	inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; +	inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; +	inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_X; +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	/* add o0, t0, t1		; Add past/future ref and differential to form final output */ +	inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1); +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	/* end */ +	inst = vl_end(); +	ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + +	fs.tokens = tokens; +	mc->b_fs[1] = pipe->create_fs_state(pipe, &fs); +	free(tokens); + +	return 0; +} + +static int vlCreateDataBufs +( +	struct vlR16SnormBufferedMC *mc +) +{ +	const unsigned int	mbw = align(mc->video_width, VL_MACROBLOCK_WIDTH) / VL_MACROBLOCK_WIDTH; +	const unsigned int	mbh = align(mc->video_height, VL_MACROBLOCK_HEIGHT) / VL_MACROBLOCK_HEIGHT; +	const unsigned int	num_mb_per_frame = mbw * mbh; + +	struct pipe_context	*pipe; +	unsigned int		g, h, i; + +	assert(mc); + +	pipe = mc->pipe; + +	for (g = 0; g < NUM_BUF_SETS; ++g) +	{ +		for (h = 0; h < 7; ++h) +		{ +			/* Create our vertex buffer and vertex buffer element */ +			mc->vertex_bufs[g][h][0].pitch = sizeof(struct vlVertex2f); +			mc->vertex_bufs[g][h][0].max_index = 24 * num_mb_per_frame - 1; +			mc->vertex_bufs[g][h][0].buffer_offset = 0; +			mc->vertex_bufs[g][h][0].buffer = pipe->winsys->buffer_create +			( +				pipe->winsys, +				1, +				PIPE_BUFFER_USAGE_VERTEX, +				sizeof(struct vlVertex2f) * 24 * num_mb_per_frame +			); +		} +	} + +	/* Position & block luma, block chroma texcoord element */ +	mc->vertex_elems[0].src_offset = 0; +	mc->vertex_elems[0].vertex_buffer_index = 0; +	mc->vertex_elems[0].nr_components = 2; +	mc->vertex_elems[0].src_format = PIPE_FORMAT_R32G32_FLOAT; + +	for (g = 0; g < NUM_BUF_SETS; ++g) +	{ +		for (h = 0; h < 7; ++h) +		{ +			for (i = 1; i < 3; ++i) +			{ +				mc->vertex_bufs[g][h][i].pitch = sizeof(struct vlVertex2f) * 2; +				mc->vertex_bufs[g][h][i].max_index = 24 * num_mb_per_frame - 1; +				mc->vertex_bufs[g][h][i].buffer_offset = 0; +				mc->vertex_bufs[g][h][i].buffer = pipe->winsys->buffer_create +				( +					pipe->winsys, +					1, +					PIPE_BUFFER_USAGE_VERTEX, +					sizeof(struct vlVertex2f) * 2 * 24 * num_mb_per_frame +				); +			} +		} +	} + +	/* First ref surface top field texcoord element */ +	mc->vertex_elems[1].src_offset = 0; +	mc->vertex_elems[1].vertex_buffer_index = 1; +	mc->vertex_elems[1].nr_components = 2; +	mc->vertex_elems[1].src_format = PIPE_FORMAT_R32G32_FLOAT; + +	/* First ref surface bottom field texcoord element */ +	mc->vertex_elems[2].src_offset = sizeof(struct vlVertex2f); +	mc->vertex_elems[2].vertex_buffer_index = 1; +	mc->vertex_elems[2].nr_components = 2; +	mc->vertex_elems[2].src_format = PIPE_FORMAT_R32G32_FLOAT; + +	/* Second ref surface top field texcoord element */ +	mc->vertex_elems[3].src_offset = 0; +	mc->vertex_elems[3].vertex_buffer_index = 2; +	mc->vertex_elems[3].nr_components = 2; +	mc->vertex_elems[3].src_format = PIPE_FORMAT_R32G32_FLOAT; + +	/* Second ref surface bottom field texcoord element */ +	mc->vertex_elems[4].src_offset = sizeof(struct vlVertex2f); +	mc->vertex_elems[4].vertex_buffer_index = 2; +	mc->vertex_elems[4].nr_components = 2; +	mc->vertex_elems[4].src_format = PIPE_FORMAT_R32G32_FLOAT; + +	/* Create our constant buffer */ +	mc->vs_const_buf.size = sizeof(struct vlVertexShaderConsts); +	mc->vs_const_buf.buffer = pipe->winsys->buffer_create +	( +		pipe->winsys, +		1, +		PIPE_BUFFER_USAGE_CONSTANT, +		mc->vs_const_buf.size +	); + +	mc->fs_const_buf.size = sizeof(struct vlFragmentShaderConsts); +	mc->fs_const_buf.buffer = pipe->winsys->buffer_create +	( +		pipe->winsys, +		1, +		PIPE_BUFFER_USAGE_CONSTANT, +		mc->fs_const_buf.size +	); + +	memcpy +	( +		pipe->winsys->buffer_map(pipe->winsys, mc->fs_const_buf.buffer, PIPE_BUFFER_USAGE_CPU_WRITE), +		&fs_consts, +		sizeof(struct vlFragmentShaderConsts) +	); + +	pipe->winsys->buffer_unmap(pipe->winsys, mc->fs_const_buf.buffer); + +	return 0; +} + +static int vlInit +( +	struct vlR16SnormBufferedMC *mc +) +{ +	struct pipe_context		*pipe; +	struct pipe_sampler_state	sampler; +	struct pipe_texture		template; +	unsigned int			filters[5]; +	unsigned int			i; + +	assert(mc); + +	pipe = mc->pipe; + +	/* For MC we render to textures, which are rounded up to nearest POT */ +	mc->viewport.scale[0] = vlRoundUpPOT(mc->video_width); +	mc->viewport.scale[1] = vlRoundUpPOT(mc->video_height); +	mc->viewport.scale[2] = 1; +	mc->viewport.scale[3] = 1; +	mc->viewport.translate[0] = 0; +	mc->viewport.translate[1] = 0; +	mc->viewport.translate[2] = 0; +	mc->viewport.translate[3] = 0; + +	mc->render_target.width = vlRoundUpPOT(mc->video_width); +	mc->render_target.height = vlRoundUpPOT(mc->video_height); +	mc->render_target.num_cbufs = 1; +	/* FB for MC stage is a vlSurface created by the user, set at render time */ +	mc->render_target.zsbuf = NULL; + +	filters[0] = PIPE_TEX_FILTER_NEAREST; +	filters[1] = mc->video_format == vlFormatYCbCr444 ? PIPE_TEX_FILTER_NEAREST : PIPE_TEX_FILTER_LINEAR; +	filters[2] = mc->video_format == vlFormatYCbCr444 ? PIPE_TEX_FILTER_NEAREST : PIPE_TEX_FILTER_LINEAR; +	filters[3] = PIPE_TEX_FILTER_LINEAR; +	filters[4] = PIPE_TEX_FILTER_LINEAR; + +	for (i = 0; i < 5; ++i) +	{ +		sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE; +		sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE; +		sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE; +		sampler.min_img_filter = filters[i]; +		sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE; +		sampler.mag_img_filter = filters[i]; +		sampler.compare_mode = PIPE_TEX_COMPARE_NONE; +		sampler.compare_func = PIPE_FUNC_ALWAYS; +		sampler.normalized_coords = 1; +		/*sampler.prefilter = ;*/ +		/*sampler.shadow_ambient = ;*/ +		/*sampler.lod_bias = ;*/ +		sampler.min_lod = 0; +		/*sampler.max_lod = ;*/ +		/*sampler.border_color[i] = ;*/ +		/*sampler.max_anisotropy = ;*/ +		mc->samplers[i] = pipe->create_sampler_state(pipe, &sampler); +	} + +	memset(&template, 0, sizeof(struct pipe_texture)); +	template.target = PIPE_TEXTURE_2D; +	template.format = PIPE_FORMAT_R16_SNORM; +	template.last_level = 0; +	template.width[0] = vlRoundUpPOT(mc->video_width); +	template.height[0] = vlRoundUpPOT(mc->video_height); +	template.depth[0] = 1; +	template.compressed = 0; +	pf_get_block(template.format, &template.block); + +	for (i = 0; i < NUM_BUF_SETS; ++i) +		mc->textures[i][0] = pipe->screen->texture_create(pipe->screen, &template); + +	if (mc->video_format == vlFormatYCbCr420) +	{ +		template.width[0] = vlRoundUpPOT(mc->video_width / 2); +		template.height[0] = vlRoundUpPOT(mc->video_height / 2); +	} +	else if (mc->video_format == vlFormatYCbCr422) +		template.height[0] = vlRoundUpPOT(mc->video_height / 2); + +	for (i = 0; i < NUM_BUF_SETS; ++i) +	{ +		mc->textures[i][1] = pipe->screen->texture_create(pipe->screen, &template); +		mc->textures[i][2] = pipe->screen->texture_create(pipe->screen, &template); +	} + +	/* textures[3] & textures[4] are assigned from vlSurfaces for P and B macroblocks at render time */ + +	vlCreateVertexShaderIMB(mc); +	vlCreateFragmentShaderIMB(mc); +	vlCreateVertexShaderFramePMB(mc); +	vlCreateVertexShaderFieldPMB(mc); +	vlCreateFragmentShaderFramePMB(mc); +	vlCreateFragmentShaderFieldPMB(mc); +	vlCreateVertexShaderFrameBMB(mc); +	vlCreateVertexShaderFieldBMB(mc); +	vlCreateFragmentShaderFrameBMB(mc); +	vlCreateFragmentShaderFieldBMB(mc); +	vlCreateDataBufs(mc); + +	return 0; +} + +int vlCreateR16SNormBufferedMC +( +	struct pipe_context *pipe, +	unsigned int video_width, +	unsigned int video_height, +	enum vlFormat video_format, +	struct vlRender **render +) +{ +	struct vlR16SnormBufferedMC *mc; + +	assert(pipe); +	assert(render); + +	mc = calloc(1, sizeof(struct vlR16SnormBufferedMC)); + +	mc->base.vlBegin = &vlBegin; +	mc->base.vlRenderMacroBlocksMpeg2 = &vlRenderMacroBlocksMpeg2R16SnormBuffered; +	mc->base.vlEnd = &vlEnd; +	mc->base.vlFlush = &vlFlush; +	mc->base.vlDestroy = &vlDestroy; +	mc->pipe = pipe; +	mc->video_width = video_width; +	mc->video_height = video_height; + +	mc->cur_buf = 0; +	mc->buffered_surface = NULL; +	mc->past_surface = NULL; +	mc->future_surface = NULL; +	memset(mc->num_macroblocks, 0, sizeof(unsigned int) * 7); +	mc->total_num_macroblocks = 0; + +	vlInit(mc); + +	*render = &mc->base; + +	return 0; +} diff --git a/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf.h b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf.h new file mode 100644 index 0000000000..30f67db3e7 --- /dev/null +++ b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf.h @@ -0,0 +1,18 @@ +#ifndef vl_r16snorm_mc_buf_h +#define vl_r16snorm_mc_buf_h + +#include "vl_types.h" + +struct pipe_context; +struct vlRender; + +int vlCreateR16SNormBufferedMC +( +	struct pipe_context *pipe, +	unsigned int video_width, +	unsigned int video_height, +	enum vlFormat video_format, +	struct vlRender **render +); + +#endif diff --git a/src/gallium/state_trackers/g3dvl/vl_render.h b/src/gallium/state_trackers/g3dvl/vl_render.h index 63016b5cbe..166030b498 100644 --- a/src/gallium/state_trackers/g3dvl/vl_render.h +++ b/src/gallium/state_trackers/g3dvl/vl_render.h @@ -24,6 +24,11 @@ struct vlRender  		struct vlRender *render  	); +	int (*vlFlush) +	( +		struct vlRender *render +	); +  	int (*vlDestroy)  	(  		struct vlRender *render diff --git a/src/gallium/state_trackers/g3dvl/vl_surface.c b/src/gallium/state_trackers/g3dvl/vl_surface.c index ffc8122172..687fd1ec29 100644 --- a/src/gallium/state_trackers/g3dvl/vl_surface.c +++ b/src/gallium/state_trackers/g3dvl/vl_surface.c @@ -113,6 +113,8 @@ int vlPutPicture  	assert(surface);  	assert(surface->context); +	surface->context->render->vlFlush(surface->context->render); +  	csc = surface->context->csc;  	pipe = surface->context->pipe; | 
