cfg-common.h | 4 + cfg-mencoder.h | 4 + cfg-mplayer.h | 4 + configure | 13 +- libaf/af_format.c | 7 + libavcodec/Makefile | 7 + libavcodec/avr32/dsputil_avr32.c | 2678 ++++++++++++++++++++++++++++++++++++++ libavcodec/avr32/fdct.S | 541 ++++++++ libavcodec/avr32/h264idct.S | 451 +++++++ libavcodec/avr32/idct.S | 829 ++++++++++++ libavcodec/avr32/mc.S | 434 ++++++ libavcodec/avr32/pico.h | 260 ++++ libavcodec/bitstream.h | 77 +- libavcodec/dsputil.c | 3 + libavcodec/h264.c | 15 + libavutil/common.h | 16 + libavutil/internal.h | 9 + libfaad2/common.h | 2 +- libmpcodecs/ad_libmad.c | 5 + libswscale/pico-avr32.h | 137 ++ libswscale/swscale_internal.h | 2 +- libswscale/yuv2rgb.c | 14 + libswscale/yuv2rgb_avr32.c | 416 ++++++ libvo/vo_fbdev2.c | 101 ++- version.sh | 2 +- 25 files changed, 6011 insertions(+), 20 deletions(-) create mode 100644 libavcodec/avr32/dsputil_avr32.c create mode 100644 libavcodec/avr32/fdct.S create mode 100644 libavcodec/avr32/h264idct.S create mode 100644 libavcodec/avr32/idct.S create mode 100644 libavcodec/avr32/mc.S create mode 100644 libavcodec/avr32/pico.h create mode 100644 libswscale/pico-avr32.h create mode 100644 libswscale/yuv2rgb_avr32.c diff --git a/cfg-common.h b/cfg-common.h index 780df38..7d878a8 100644 --- a/cfg-common.h +++ b/cfg-common.h @@ -235,6 +235,10 @@ {"tsprobe", &ts_probe, CONF_TYPE_POSITION, 0, 0, TS_MAX_PROBE_SIZE, NULL}, {"tskeepbroken", &ts_keep_broken, CONF_TYPE_FLAG, 0, 0, 1, NULL}, +#ifdef ARCH_AVR32 + {"use-pico", &avr32_use_pico, CONF_TYPE_FLAG, 0, 0, 1, NULL}, + {"nouse-pico", &avr32_use_pico, CONF_TYPE_FLAG, 0, 1, 0, NULL}, +#endif // draw by slices or whole frame (useful with libmpeg2/libavcodec) {"slices", &vd_use_slices, CONF_TYPE_FLAG, 0, 0, 1, NULL}, {"noslices", &vd_use_slices, CONF_TYPE_FLAG, 0, 1, 0, NULL}, diff --git a/cfg-mencoder.h b/cfg-mencoder.h index 411b748..addf791 100644 --- a/cfg-mencoder.h +++ b/cfg-mencoder.h @@ -5,6 +5,10 @@ #include "cfg-common.h" +#ifdef ARCH_AVR32 +extern int 
avr32_use_pico; +#endif + #ifdef USE_FAKE_MONO extern int fakemono; // defined in dec_audio.c #endif diff --git a/cfg-mplayer.h b/cfg-mplayer.h index 62b6eac..31499c2 100644 --- a/cfg-mplayer.h +++ b/cfg-mplayer.h @@ -4,6 +4,10 @@ #include "cfg-common.h" +#ifdef ARCH_AVR32 +extern int avr32_use_pico; +#endif + extern int noconsolecontrols; #if defined(HAVE_FBDEV)||defined(HAVE_VESA) diff --git a/configure b/configure index 29002c8..56c6fe4 100755 --- a/configure +++ b/configure @@ -1203,6 +1203,15 @@ EOF _optimizing="$proc" ;; + avr32) + _def_arch='#define ARCH_AVR32' + _target_arch='TARGET_ARCH_AVR32 = yes' + iproc='avr32' + proc='' + _march='' + _mcpu='' + _optimizing='' + ;; arm|armv4l|armv5tel) _def_arch='#define ARCH_ARMV4L 1' _target_arch='TARGET_ARCH_ARMV4L = yes' @@ -1533,7 +1542,7 @@ echores $_named_asm_args # Checking for CFLAGS _stripbinaries=yes if test "$_profile" != "" || test "$_debug" != "" ; then - CFLAGS="-W -Wall -O2 $_march $_mcpu $_debug $_profile" + CFLAGS="-W -Wall -O4 $_march $_mcpu $_debug $_profile" if test "$_cc_major" -ge "3" ; then CFLAGS=`echo "$CFLAGS" | sed -e 's/\(-Wall\)/\1 -Wno-unused-parameter/'` fi @@ -3794,7 +3803,7 @@ fi echocheck "X11 headers presence" - for I in `echo $_inc_extra | sed s/-I//g` /usr/X11/include /usr/X11R6/include /usr/include/X11R6 /usr/include /usr/openwin/include ; do + for I in `echo $_inc_extra | sed s/-I//g`; do if test -f "$I/X11/Xlib.h" ; then _inc_x11="-I$I" _x11_headers="yes" diff --git a/libaf/af_format.c b/libaf/af_format.c index e5b7cc9..5d7ea6d 100644 --- a/libaf/af_format.c +++ b/libaf/af_format.c @@ -20,7 +20,14 @@ // Integer to float conversion through lrintf() #ifdef HAVE_LRINTF #include <math.h> + +#ifdef ARCH_AVR32 +#define lrintf(x) ((long)rint(x)) /* rint() stand-in, cast so the result is an integer like the lrintf() prototype below; NOTE(review): presumably the AVR32 libm lacks lrintf()/llrint() - confirm */ +#define llrint(x) ((long long)rint(x)) /* fully parenthesized so the cast cannot bind to surrounding operators */ +#else long int lrintf(float); +#endif + #else #define lrintf(x) ((int)(x)) #endif diff --git a/libavcodec/Makefile b/libavcodec/Makefile index 17b6c45..8e1dc96 100644 --- a/libavcodec/Makefile +++
b/libavcodec/Makefile @@ -360,6 +360,12 @@ OBJS-$(TARGET_ARCH_SPARC) += sparc/dsputil_vis.o \ sparc/dsputil_vis.o: CFLAGS += -mcpu=ultrasparc -mtune=ultrasparc +# avr32 specific stuff +ifeq ($(TARGET_ARCH_AVR32),yes) +ASM_OBJS += avr32/idct.o avr32/fdct.o avr32/mc.o avr32/h264idct.o +OBJS += avr32/dsputil_avr32.o +endif + # sun mediaLib specific stuff OBJS-$(HAVE_MLIB) += mlib/dsputil_mlib.o \ @@ -419,6 +425,7 @@ tests: apiexample $(TESTS) clean:: rm -f \ i386/*.o i386/*~ \ + avr32/*.o avr32/*~ \ armv4l/*.o armv4l/*~ \ mlib/*.o mlib/*~ \ alpha/*.o alpha/*~ \ diff --git a/libavcodec/avr32/dsputil_avr32.c b/libavcodec/avr32/dsputil_avr32.c new file mode 100644 index 0000000..200284d --- /dev/null +++ b/libavcodec/avr32/dsputil_avr32.c @@ -0,0 +1,2678 @@ +/* + * Copyright (c) 2007 Atmel Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * 3. The name of ATMEL may not be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. 
IN NO EVENT SHALL ATMEL + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +#include "../dsputil.h" +#include "pico.h" + +int avr32_use_pico = 1; + +//#define CHECK_DSP_FUNCS_AGAINST_C + +#ifdef CHECK_DSP_FUNCS_AGAINST_C +#define DSP_FUNC_NAME(name) test_ ## name +#else +#define DSP_FUNC_NAME(name) name +#endif + +union doubleword { + int64_t doubleword; + struct { + int32_t top; + int32_t bottom; + } words; +}; + +#undef LD16 +#undef LD32 +#undef LD64 + +#define LD16(a) (*((uint16_t*)(a))) +#define LD32(a) (*((uint32_t*)(a))) +#define LD64(a) (*((uint64_t*)(a))) +#define LD64_UNALIGNED(a) \ + ({ union doubleword __tmp__; \ + __tmp__.words.top = LD32(a); \ + __tmp__.words.bottom = LD32((a) + 4); \ + __tmp__.doubleword; }) /* 64-bit load built from two 32-bit loads at a and (a) + 4, joined through union doubleword; a is evaluated twice, so it must have no side effects */ + +#undef ST32 +#undef ST16 + +#define ST16(a, b) (*((uint16_t*)(a)) = (b)) /* store b as a 16-bit value at address a */ +#define ST32(a, b) (*((uint32_t*)(a)) = (b)) /* store b as a 32-bit value at address a */ + +#undef rnd_avg32 +#define rnd_avg32(a, b) \ + ({ uint32_t __tmp__;\ + asm("pavg.ub\t%0, %1, %2" : "=r"(__tmp__) : "r"(a), "r"(b));\ + __tmp__;}) + +void idct_avr32(DCTELEM *data); +void fdct_avr32(DCTELEM *data); + +void idct_put_avr32(uint8_t *dest, int line_size, DCTELEM *data); +void idct_add_avr32(uint8_t *dest, int line_size, DCTELEM *data); + +void h264_idct_add_avr32(uint8_t *dest, DCTELEM *data, int stride); +void h264_idct8_add_avr32(uint8_t *dest, DCTELEM *data, int stride); + +#define extern_dspfunc(PFX, NUM) \ + void PFX ## _pixels ## NUM ## _avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \ + void PFX ## _pixels ## NUM ## _h_avr32(uint8_t *dst, const uint8_t
*pixels, int line_size, int h ); \ + void PFX ## _pixels ## NUM ## _v_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \ + void PFX ## _pixels ## NUM ## _hv_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ) + +extern_dspfunc(put, 8); +extern_dspfunc(put_no_rnd, 8); +extern_dspfunc(avg, 8); +extern_dspfunc(avg_no_rnd, 8); +#undef extern_dspfunc + +#ifdef CHECK_DSP_FUNCS_AGAINST_C +#define extern_dspfunc(PFX, NUM) \ + void PFX ## _pixels ## NUM ## _c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \ + void PFX ## _pixels ## NUM ## _x2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \ + void PFX ## _pixels ## NUM ## _y2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \ + void PFX ## _pixels ## NUM ## _xy2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ) + +extern_dspfunc(put, 4); +extern_dspfunc(put_no_rnd, 4); +extern_dspfunc(put, 8); +extern_dspfunc(put_no_rnd, 8); +extern_dspfunc(put, 16); +extern_dspfunc(put_no_rnd, 16); +extern_dspfunc(avg, 8); +extern_dspfunc(avg_no_rnd, 8); +extern_dspfunc(avg, 16); +extern_dspfunc(avg_no_rnd, 16); + + +#undef extern_dspfunc +#define extern_dspfunc(PFX, NUM) \ +void PFX ## NUM ## _mc00_c(uint8_t *dst, uint8_t *src, int stride); \ +void PFX ## NUM ## _mc10_c(uint8_t *dst, uint8_t *src, int stride); \ +void PFX ## NUM ## _mc20_c(uint8_t *dst, uint8_t *src, int stride); \ +void PFX ## NUM ## _mc30_c(uint8_t *dst, uint8_t *src, int stride); \ +void PFX ## NUM ## _mc01_c(uint8_t *dst, uint8_t *src, int stride); \ +void PFX ## NUM ## _mc11_c(uint8_t *dst, uint8_t *src, int stride); \ +void PFX ## NUM ## _mc21_c(uint8_t *dst, uint8_t *src, int stride); \ +void PFX ## NUM ## _mc31_c(uint8_t *dst, uint8_t *src, int stride); \ +void PFX ## NUM ## _mc02_c(uint8_t *dst, uint8_t *src, int stride); \ +void PFX ## NUM ## _mc12_c(uint8_t *dst, uint8_t *src, int stride); \ +void PFX ## NUM ## _mc22_c(uint8_t *dst, uint8_t *src, int stride); \ +void PFX ## 
NUM ## _mc32_c(uint8_t *dst, uint8_t *src, int stride); \ +void PFX ## NUM ## _mc03_c(uint8_t *dst, uint8_t *src, int stride); \ +void PFX ## NUM ## _mc13_c(uint8_t *dst, uint8_t *src, int stride); \ +void PFX ## NUM ## _mc23_c(uint8_t *dst, uint8_t *src, int stride); \ +void PFX ## NUM ## _mc33_c(uint8_t *dst, uint8_t *src, int stride); \ + +extern_dspfunc(put_h264_qpel, 16); +extern_dspfunc(put_h264_qpel, 8); +extern_dspfunc(put_h264_qpel, 4); +extern_dspfunc(avg_h264_qpel, 16); +extern_dspfunc(avg_h264_qpel, 8); +extern_dspfunc(avg_h264_qpel, 4); + +#undef extern_dspfunc + +void put_h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y); +void put_h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y); +void put_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y); + +void avg_h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y); +void avg_h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y); +void avg_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y); + + +void dump_block8(uint8_t *block, int line_size, int h); +void dump_block4(uint8_t *block, int line_size, int h); +void dump_block(uint8_t *block, int line_size, int h, int w); + +void check_block8(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct, + int h, char *name, int max_dev); +void check_block4(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct, + int h, char *name, int max_dev); +void check_block(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct, + int h, int width, char *name, int max_dev); + +#define PIXOP2( OPNAME, OP ) \ +void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ + int i;\ + for(i=0; i<h; i++){\ + 
OP(*((uint32_t*)(block )), LD32(pixels ));\ + pixels+=line_size;\ + block +=line_size;\ + }\ +}\ +void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ + int src_stride1, int src_stride2, int h){\ + int i;\ + for(i=0; i<h; i++){\ + uint32_t a,b;\ + a= LD32(&src1[i*src_stride1 ]);\ + b= LD32(&src2[i*src_stride2 ]);\ + OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\ + a= LD32(&src1[i*src_stride1+4]);\ + b= LD32(&src2[i*src_stride2+4]);\ + OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\ + }\ +}\ +\ +void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ + int src_stride1, int src_stride2, int h){\ + int i;\ + for(i=0; i<h; i++){\ + uint32_t a,b;\ + a= LD32(&src1[i*src_stride1 ]);\ + b= LD32(&src2[i*src_stride2 ]);\ + OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\ + }\ +}\ +\ +void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ + int src_stride1, int src_stride2, int h){\ + OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\ + OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\ +}\ + +#else +#define PIXOP2( OPNAME, OP ) \ +static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ + int i;\ + for(i=0; i<h; i++){\ + OP(*((uint32_t*)(block )), LD32(pixels ));\ + pixels+=line_size;\ + block +=line_size;\ + }\ +}\ +static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ + int i;\ + for(i=0; i<h; i++){\ + OP(*((uint32_t*)(block )), LD32(pixels ));\ + OP(*((uint32_t*)(block+4)), LD32(pixels+4));\ + pixels+=line_size;\ + block +=line_size;\ + }\ +}\ +static void OPNAME ## _pixels16_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ + int i;\ + for(i=0; i<h; i++){\ + OP(*((uint32_t*)(block )), LD32(pixels ));\ + OP(*((uint32_t*)(block+4)), 
LD32(pixels+4));\ + OP(*((uint32_t*)(block+8)), LD32(pixels+8));\ + OP(*((uint32_t*)(block+12)), LD32(pixels+12));\ + pixels+=line_size;\ + block +=line_size;\ + }\ +}\ +static void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ + int src_stride1, int src_stride2, int h){\ + int i;\ + for(i=0; i<h; i++){\ + uint32_t a,b;\ + a= LD32(&src1[i*src_stride1 ]);\ + b= LD32(&src2[i*src_stride2 ]);\ + OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\ + a= LD32(&src1[i*src_stride1+4]);\ + b= LD32(&src2[i*src_stride2+4]);\ + OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\ + }\ +}\ +\ +static void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ + int src_stride1, int src_stride2, int h){\ + int i;\ + for(i=0; i<h; i++){\ + uint32_t a,b;\ + a= LD32(&src1[i*src_stride1 ]);\ + b= LD32(&src2[i*src_stride2 ]);\ + OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\ + }\ +}\ +\ +static void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \ + int src_stride1, int src_stride2, int h){\ + OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\ + OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\ +}\ + +#endif + +#define op_avg(a, b) a = rnd_avg32(a, b) +#define op_put(a, b) a = b + +PIXOP2(avg, op_avg) +PIXOP2(put, op_put) +#undef op_avg +#undef op_put + + + +static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h) +{ + int i; + for(i=0; i<h; i++) + { + ST32(dst , LD32(src )); + dst+=dstStride; + src+=srcStride; + } +} + +static void clear_blocks_avr32(DCTELEM *blocks) +{ /* Zero-fills the memory just below blocks + 6*64 with stm stores; NOTE(review): presumably six 64-coefficient blocks - confirm against sizeof(DCTELEM) */ + int n = 12; /* loop count; each pass issues four stm stores */ + uint64_t tmp1, tmp2; /* scratch operands, set to 0 by the mov instructions */ + blocks += 6*64; /* start past the region; "stm --%3" walks the pointer downward */ + asm volatile ( "mov\t%1, 0\n" + "mov\t%m1, 0\n" + "mov\t%2, 0\n" + "mov\t%m2, 0\n" + "0:\n" + "stm\t--%3, %1, %m1, %2, %m2\n" + "stm\t--%3, %1, %m1, %2, %m2\n" + "stm\t--%3, %1, %m1, %2, %m2\n" +
"stm\t--%3, %1, %m1, %2, %m2\n" + "sub\t%0, 1\n" + "brne\t0b\n" + : "+r"(n), "=&r"(tmp1), "=&r"(tmp2), + "+r"(blocks) : /* no inputs */ : "cc", "memory"); /* sub changes the condition flags; stm writes memory that no operand describes, so the compiler must not cache or reorder accesses to it */ +} + + +static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h) +{ + int i; + for(i=0; i<h; i++) + { + ST32(dst , LD32(src )); + ST32(dst+4 , LD32(src+4 )); + dst+=dstStride; + src+=srcStride; + } +} + +static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h) +{ + int i; + for(i=0; i<h; i++) + { + ST32(dst , LD32(src )); + ST32(dst+4 , LD32(src+4 )); + ST32(dst+8 , LD32(src+8 )); + ST32(dst+12, LD32(src+12)); + dst+=dstStride; + src+=srcStride; + } +} + + +static void put_h264_chroma_mc2_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){ + const int A=(8-x)*(8-y); + const int B=( x)*(8-y); + const int C=(8-x)*( y); + const int D=( x)*( y); + int i; + + PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF)); + PICO_PUT_W(PICO_COEFF0_B, 32); + PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF)); + PICO_PUT_W(PICO_COEFF1_B, 0); + PICO_PUT_W(PICO_COEFF2_A, 0); + PICO_PUT_W(PICO_COEFF2_B, 0); + PICO_PUT_W(PICO_CONFIG, + PICO_OUTPUT_MODE(PICO_PLANAR_MODE) + | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE) + | PICO_COEFF_FRAC_BITS(6) + | PICO_OFFSET_FRAC_BITS(6)); + + for(i=0; i<h; i++) + { + + int src0 = LD32(src); + int src1 = LD32(src + stride); + + PICO_MVRC_W(PICO_INPIX0, src0); + PICO_MVRC_W(PICO_INPIX1, src1); + PICO_OP(PICO_SINGLE_VECTOR, 2, 0, 4, 0); + PICO_OP(PICO_SINGLE_VECTOR, 3, 1, 5, 0); + src += stride; + ST16(dst,(short)PICO_GET_W(PICO_OUTPIX0)); + dst += stride; + } +} + + +static void put_h264_chroma_mc4_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){ + const int A=(8-x)*(8-y);\ + const int B=( x)*(8-y); + const int C=(8-x)*( y); + const int D=( x)*( y); + int i; + + PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF)); + PICO_PUT_W(PICO_COEFF0_B, 32); + PICO_PUT_W(PICO_COEFF1_A, (C << 16) |
(D & 0xFFFF)); + PICO_PUT_W(PICO_COEFF1_B, 0); + PICO_PUT_W(PICO_COEFF2_A, 0); + PICO_PUT_W(PICO_COEFF2_B, 0); + PICO_PUT_W(PICO_CONFIG, + PICO_OUTPUT_MODE(PICO_PLANAR_MODE) + | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE) + | PICO_COEFF_FRAC_BITS(6) + | PICO_OFFSET_FRAC_BITS(6)); + + for(i=0; i<h; i++) + { + /* + OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1])); + OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2])); + OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3])); + OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4])); + dst+= stride; + src+= stride; + */ + + int src0 = LD32(src); + int src1 = (((int)src[4] << 24) | (int)src[stride]); + int src2 = LD32(src + stride + 1); + + PICO_MVRC_W(PICO_INPIX0, src0); + PICO_MVRC_W(PICO_INPIX1, src1); + PICO_MVRC_W(PICO_INPIX2, src2); + PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0); + PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0); + PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0); + PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0); + src += stride; + ST32(dst, PICO_GET_W(PICO_OUTPIX0)); + + dst += stride; + } +} + +static void put_h264_chroma_mc8_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){ + const int A=(8-x)*(8-y); + const int B=( x)*(8-y); + const int C=(8-x)*( y); + const int D=( x)*( y); + int i; + + PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF)); + PICO_PUT_W(PICO_COEFF0_B, 32); + PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF)); + PICO_PUT_W(PICO_COEFF1_B, 0); + PICO_PUT_W(PICO_COEFF2_A, 0); + PICO_PUT_W(PICO_COEFF2_B, 0); + PICO_PUT_W(PICO_CONFIG, + PICO_OUTPUT_MODE(PICO_PLANAR_MODE) + | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE) + | PICO_COEFF_FRAC_BITS(6) + | PICO_OFFSET_FRAC_BITS(6)); + + for(i=0; i<h; i++) + { + /* + OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1])); + OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2])); + OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + 
D*src[stride+3])); + OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4])); + OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5])); + OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6])); + OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7])); + OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8])); + dst+= stride; + src+= stride; + */ + int src0 = LD32(src); + int src1 = (((int)src[4] << 24) | (int)src[stride]); + int src2 = LD32(src + stride + 1); + + PICO_MVRC_W(PICO_INPIX0, src0); + PICO_MVRC_W(PICO_INPIX1, src1); + PICO_MVRC_W(PICO_INPIX2, src2); + PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0); + PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0); + PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0); + PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0); + ST32(dst, PICO_GET_W(PICO_OUTPIX0)); + + src0 = LD32(src + 4); + src1 = (src[8] << 24) | src[stride + 4]; + src2 = LD32(src + stride + 5); + + PICO_MVRC_W(PICO_INPIX0, src0); + PICO_MVRC_W(PICO_INPIX1, src1); + PICO_MVRC_W(PICO_INPIX2, src2); + PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0); + PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0); + PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0); + PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0); + src += stride; + ST32(dst + 4, PICO_GET_W(PICO_OUTPIX0)); + + dst += stride; + } +} + + +static void avg_h264_chroma_mc2_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){ + const int A=(8-x)*(8-y); + const int B=( x)*(8-y); + const int C=(8-x)*( y); + const int D=( x)*( y); + int i; + + PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF)); + PICO_PUT_W(PICO_COEFF0_B, 32); + PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF)); + PICO_PUT_W(PICO_COEFF1_B, 0); + PICO_PUT_W(PICO_COEFF2_A, 0); + PICO_PUT_W(PICO_COEFF2_B, 0); + PICO_PUT_W(PICO_CONFIG, + PICO_OUTPUT_MODE(PICO_PLANAR_MODE) + | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE) + | PICO_COEFF_FRAC_BITS(6) + | PICO_OFFSET_FRAC_BITS(6)); + + for(i=0; i<h; i++) + { + int src0 
= LD32(src); + int src1 = LD32(src + stride); + + PICO_MVRC_W(PICO_INPIX0, src0); + PICO_MVRC_W(PICO_INPIX1, src1); + PICO_OP(PICO_SINGLE_VECTOR, 2, 0, 4, 0); + PICO_OP(PICO_SINGLE_VECTOR, 3, 1, 5, 0); + src += stride; + ST16(dst, rnd_avg32(LD16(dst), PICO_GET_W(PICO_OUTPIX0))); + dst += stride; + } +} + + +static void avg_h264_chroma_mc4_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){ + const int A=(8-x)*(8-y);\ + const int B=( x)*(8-y); + const int C=(8-x)*( y); + const int D=( x)*( y); + int i; + + PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF)); + PICO_PUT_W(PICO_COEFF0_B, 32); + PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF)); + PICO_PUT_W(PICO_COEFF1_B, 0); + PICO_PUT_W(PICO_COEFF2_A, 0); + PICO_PUT_W(PICO_COEFF2_B, 0); + PICO_PUT_W(PICO_CONFIG, + PICO_OUTPUT_MODE(PICO_PLANAR_MODE) + | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE) + | PICO_COEFF_FRAC_BITS(6) + | PICO_OFFSET_FRAC_BITS(6)); + + for(i=0; i<h; i++) + { + /* + OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1])); + OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2])); + OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3])); + OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4])); + dst+= stride; + src+= stride; + */ + + int src0 = *((int *)src); + int src1 = (int)((src[4] << 24) | src[stride]); + int src2 = *((int *)(src + stride + 1)); + + PICO_MVRC_W(PICO_INPIX0, src0); + PICO_MVRC_W(PICO_INPIX1, src1); + PICO_MVRC_W(PICO_INPIX2, src2); + PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0); + PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0); + PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0); + PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0); + src += stride; + ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0))); + dst += stride; + } +} + +static void avg_h264_chroma_mc8_pico(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){ + const int A=(8-x)*(8-y); + const int B=( x)*(8-y); + const 
int C=(8-x)*( y); + const int D=( x)*( y); + int i; + + PICO_PUT_W(PICO_COEFF0_A, (A << 16) | (B & 0xFFFF)); + PICO_PUT_W(PICO_COEFF0_B, 32); + PICO_PUT_W(PICO_COEFF1_A, (C << 16) | (D & 0xFFFF)); + PICO_PUT_W(PICO_COEFF1_B, 0); + PICO_PUT_W(PICO_COEFF2_A, 0); + PICO_PUT_W(PICO_COEFF2_B, 0); + PICO_PUT_W(PICO_CONFIG, + PICO_OUTPUT_MODE(PICO_PLANAR_MODE) + | PICO_INPUT_MODE(PICO_HOR_FILTER_MODE) + | PICO_COEFF_FRAC_BITS(6) + | PICO_OFFSET_FRAC_BITS(6)); + + for(i=0; i<h; i++) + { + /* + OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1])); + OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2])); + OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3])); + OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4])); + OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5])); + OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6])); + OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7])); + OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8])); + dst+= stride; + src+= stride; + */ + int src0 = *((int *)src); + int src1 = (volatile int)((src[4] << 24) | src[stride]); + int src2 = *((int *)(src + stride + 1)); + + PICO_MVRC_W(PICO_INPIX0, src0); + PICO_MVRC_W(PICO_INPIX1, src1); + PICO_MVRC_W(PICO_INPIX2, src2); + PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0); + PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0); + PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0); + PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0); + ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0))); + + src0 = *((int *)(src + 4)); + src1 = (int)((src[8] << 24) | src[stride + 4]); + src2 = *((int *)(src + stride + 5)); + + PICO_MVRC_W(PICO_INPIX0, src0); + PICO_MVRC_W(PICO_INPIX1, src1); + PICO_MVRC_W(PICO_INPIX2, src2); + PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 7, 0); + PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 8, 0); + PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 9, 0); + PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 10, 0); + src += stride; + 
ST32(dst + 4, rnd_avg32(LD32(dst + 4), PICO_GET_W(PICO_OUTPIX0))); + dst += stride; + } +} + +static struct pico_config_t h264_qpel4_h_lowpass_config = { + .input_mode = PICO_HOR_FILTER_MODE, + .output_mode = PICO_PLANAR_MODE, + .coeff_frac_bits = 5, + .offset_frac_bits = 5, + .coeff0_0 = 1, + .coeff0_1 = -5, + .coeff0_2 = 20, + .coeff0_3 = 16, + .coeff1_0 = 20, + .coeff1_1 = -5, + .coeff1_2 = 1, + .coeff1_3 = 0, + .coeff2_0 = 0, + .coeff2_1 = 0, + .coeff2_2 = 0, + .coeff2_3 = 0 +}; + + + +static void put_h264_qpel4_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ + const int h=4; + int i; + + set_pico_config(&h264_qpel4_h_lowpass_config); + + for(i=0; i<h; i++){ + + /* + OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\ + OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\ + OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\ + OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\ + dst+=dstStride;\ + src+=srcStride;\ */ + PICO_MVRC_W(PICO_INPIX0, LD32(src - 2)); + PICO_MVRC_D(PICO_INPIX2, LD64_UNALIGNED(src + 2)); + PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6); + PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7); + PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8); + PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9); + src += srcStride; + ST32(dst, PICO_GET_W(PICO_OUTPIX0)); + dst += dstStride; + } +} + +static void avg_h264_qpel4_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ + const int h=4; + int i; + + set_pico_config(&h264_qpel4_h_lowpass_config); + + for(i=0; i<h; i++){ + + /* + OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\ + OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\ + OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\ + OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\ + dst+=dstStride;\ + src+=srcStride;\ */ + + PICO_MVRC_W(PICO_INPIX0, LD32(src - 2)); + 
PICO_MVRC_D(PICO_INPIX2, LD64_UNALIGNED(src + 2)); + PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6); + PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7); + PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8); + PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9); + src += srcStride; + ST32(dst, rnd_avg32(LD32(dst), PICO_GET_W(PICO_OUTPIX0))); + dst += dstStride; + } +} + +static struct pico_config_t h264_qpel4_v_lowpass_config1 = { + .input_mode = PICO_VERT_FILTER_MODE, + .output_mode = PICO_PACKED_MODE, + .coeff_frac_bits = 5, + .offset_frac_bits = 5, + .coeff0_0 = 1, + .coeff0_1 = -5, + .coeff0_2 = 20, + .coeff0_3 = 16, + .coeff1_0 = 1, + .coeff1_1 = -5, + .coeff1_2 = 20, + .coeff1_3 = 16, + .coeff2_0 = 1, + .coeff2_1 = -5, + .coeff2_2 = 20, + .coeff2_3 = 16 +}; + + + +static struct pico_config_t h264_qpel4_v_lowpass_config2 = { + .input_mode = PICO_VERT_FILTER_MODE, + .output_mode = PICO_PLANAR_MODE, + .coeff_frac_bits = 5, + .offset_frac_bits = 5, + .coeff0_0 = 1, + .coeff0_1 = -5, + .coeff0_2 = 20, + .coeff0_3 = 16, + .coeff1_0 = 20, + .coeff1_1 = -5, + .coeff1_2 = 1, + .coeff1_3 = 0, + .coeff2_0 = 0, + .coeff2_1 = 0, + .coeff2_2 = 0, + .coeff2_3 = 0 +}; + +static void put_h264_qpel4_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ + + /* + const int w=4; + uint8_t *cm = cropTbl + MAX_NEG_CROP; + int i; + for(i=0; i<w; i++) + { + const int srcB= src[-2*srcStride];\ + const int srcA= src[-1*srcStride];\ + const int src0= src[0 *srcStride];\ + const int src1= src[1 *srcStride];\ + const int src2= src[2 *srcStride];\ + const int src3= src[3 *srcStride];\ + const int src4= src[4 *srcStride];\ + const int src5= src[5 *srcStride];\ + const int src6= src[6 *srcStride];\ + OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\ + OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\ + OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\ + OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\ + dst++;\ + src++;\ + */ + + 
set_pico_config(&h264_qpel4_v_lowpass_config1); + + { + int srcB= LD32(src - 2*srcStride); + int srcA= LD32(src - 1*srcStride); + int src0= LD32(src + 0 *srcStride); + int src1= LD32(src + 1 *srcStride); + int src2= LD32(src + 2 *srcStride); + int src3= LD32(src + 3 *srcStride); + int src4= LD32(src + 4 *srcStride); + int src5= LD32(src + 5 *srcStride); + int src6= LD32(src + 6 *srcStride); + + /* First compute the leftmost three colums */ + PICO_MVRC_W(PICO_INPIX0, srcB); + PICO_MVRC_W(PICO_INPIX1, srcA); + PICO_MVRC_W(PICO_INPIX2, src0); + PICO_OP(0, 0, 0, 3, 6); + PICO_MVRC_W(PICO_INPIX2, src1); + PICO_MVRC_W(PICO_INPIX1, src2); + PICO_MVRC_W(PICO_INPIX0, src3); + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6); + ST32(dst, PICO_GET_W(PICO_OUTPIX0)); + dst += dstStride; + PICO_MVRC_W(PICO_INPIX0, srcA); + PICO_MVRC_W(PICO_INPIX1, src0); + PICO_MVRC_W(PICO_INPIX2, src1); + PICO_OP(0, 0, 0, 3, 6); + PICO_MVRC_W(PICO_INPIX2, src2); + PICO_MVRC_W(PICO_INPIX1, src3); + PICO_MVRC_W(PICO_INPIX0, src4); + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6); + ST32(dst, PICO_GET_W(PICO_OUTPIX0)); + dst += dstStride; + PICO_MVRC_W(PICO_INPIX0, src0); + PICO_MVRC_W(PICO_INPIX1, src1); + PICO_MVRC_W(PICO_INPIX2, src2); + PICO_OP(0, 0, 0, 3, 6); + PICO_MVRC_W(PICO_INPIX2, src3); + PICO_MVRC_W(PICO_INPIX1, src4); + PICO_MVRC_W(PICO_INPIX0, src5); + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6); + ST32(dst, PICO_GET_W(PICO_OUTPIX0)); + dst += dstStride; + PICO_MVRC_W(PICO_INPIX0, src1); + PICO_MVRC_W(PICO_INPIX1, src2); + PICO_MVRC_W(PICO_INPIX2, src3); + PICO_OP(0, 0, 0, 3, 6); + PICO_MVRC_W(PICO_INPIX2, src4); + PICO_MVRC_W(PICO_INPIX1, src5); + PICO_MVRC_W(PICO_INPIX0, src6); + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6); + ST32(dst, PICO_GET_W(PICO_OUTPIX0)); + /* Now compute the last column */ + + union wordbytes { + int word; + struct { + unsigned int t:8; + unsigned int u:8; + unsigned int l:8; + unsigned int b:8; + } bytes; } tmp1, tmp2, tmp3; + + + tmp1.bytes.t = srcB; + tmp1.bytes.u = src1; + tmp1.bytes.l = src4; + + 
tmp2.bytes.t = srcA; + tmp2.bytes.u = src2; + tmp2.bytes.l = src5; + + tmp3.bytes.t = src0; + tmp3.bytes.u = src3; + tmp3.bytes.l = src6; + + PICO_MVRC_W(PICO_INPIX0, tmp1.word); + PICO_MVRC_W(PICO_INPIX1, tmp2.word); + PICO_MVRC_W(PICO_INPIX2, tmp3.word); + set_pico_config(&h264_qpel4_v_lowpass_config2); + + + PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6); + PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7); + PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8); + PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9); + + PICO_MVCR_W(tmp1.word, PICO_OUTPIX0); + dst[3] = (char)(tmp1.bytes.b); + dst[3 - dstStride] = (char)(tmp1.bytes.l); + dst[3 - 2*dstStride] = (char)(tmp1.bytes.u); + dst[3 - 3*dstStride] = (char)(tmp1.bytes.t); + + } + /*} + + + }*/ +} + +static void avg_h264_qpel4_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ + + /* + const int w=4; + uint8_t *cm = cropTbl + MAX_NEG_CROP; + int i; + for(i=0; i<w; i++) + { + const int srcB= src[-2*srcStride];\ + const int srcA= src[-1*srcStride];\ + const int src0= src[0 *srcStride];\ + const int src1= src[1 *srcStride];\ + const int src2= src[2 *srcStride];\ + const int src3= src[3 *srcStride];\ + const int src4= src[4 *srcStride];\ + const int src5= src[5 *srcStride];\ + const int src6= src[6 *srcStride];\ + OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\ + OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\ + OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\ + OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\ + dst++;\ + src++;\ + */ + uint8_t tmp_block[4*4]; + + set_pico_config(&h264_qpel4_v_lowpass_config1); + + { + int srcB= LD32(src - 2*srcStride); + int srcA= LD32(src - 1*srcStride); + int src0= LD32(src + 0 *srcStride); + int src1= LD32(src + 1 *srcStride); + int src2= LD32(src + 2 *srcStride); + int src3= LD32(src + 3 *srcStride); + int src4= LD32(src + 4 *srcStride); + int src5= LD32(src + 5 *srcStride); + int src6= LD32(src + 6 *srcStride); 
+ + /* First compute the leftmost three columns */ + PICO_MVRC_W(PICO_INPIX0, srcB); + PICO_MVRC_W(PICO_INPIX1, srcA); + PICO_MVRC_W(PICO_INPIX2, src0); + PICO_OP(0, 0, 0, 3, 6); + PICO_MVRC_W(PICO_INPIX2, src1); + PICO_MVRC_W(PICO_INPIX1, src2); + PICO_MVRC_W(PICO_INPIX0, src3); + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6); + ST32(tmp_block, PICO_GET_W(PICO_OUTPIX0)); + PICO_MVRC_W(PICO_INPIX0, srcA); + PICO_MVRC_W(PICO_INPIX1, src0); + PICO_MVRC_W(PICO_INPIX2, src1); + PICO_OP(0, 0, 0, 3, 6); + PICO_MVRC_W(PICO_INPIX2, src2); + PICO_MVRC_W(PICO_INPIX1, src3); + PICO_MVRC_W(PICO_INPIX0, src4); + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6); + ST32(tmp_block + 4, PICO_GET_W(PICO_OUTPIX0)); + PICO_MVRC_W(PICO_INPIX0, src0); + PICO_MVRC_W(PICO_INPIX1, src1); + PICO_MVRC_W(PICO_INPIX2, src2); + PICO_OP(0, 0, 0, 3, 6); + PICO_MVRC_W(PICO_INPIX2, src3); + PICO_MVRC_W(PICO_INPIX1, src4); + PICO_MVRC_W(PICO_INPIX0, src5); + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6); + ST32(tmp_block + 8, PICO_GET_W(PICO_OUTPIX0)); + PICO_MVRC_W(PICO_INPIX0, src1); + PICO_MVRC_W(PICO_INPIX1, src2); + PICO_MVRC_W(PICO_INPIX2, src3); + PICO_OP(0, 0, 0, 3, 6); + PICO_MVRC_W(PICO_INPIX2, src4); + PICO_MVRC_W(PICO_INPIX1, src5); + PICO_MVRC_W(PICO_INPIX0, src6); + PICO_OP(PICO_USE_ACC, 0, 0, 3, 6); + ST32(tmp_block + 12, PICO_GET_W(PICO_OUTPIX0)); + /* Now compute the last column */ + + union wordbytes { + int word; + struct { + unsigned int t:8; + unsigned int u:8; + unsigned int l:8; + unsigned int b:8; + } bytes; } tmp1, tmp2, tmp3; + /* NOTE(review): only the t/u/l fields are assigned below; the b byte of each word is still unset when .word is moved into PICO - presumably ignored by the single-vector ops, confirm. */ + + tmp1.bytes.t = srcB; + tmp1.bytes.u = src1; + tmp1.bytes.l = src4; + + tmp2.bytes.t = srcA; + tmp2.bytes.u = src2; + tmp2.bytes.l = src5; + + tmp3.bytes.t = src0; + tmp3.bytes.u = src3; + tmp3.bytes.l = src6; + + PICO_MVRC_W(PICO_INPIX0, tmp1.word); + PICO_MVRC_W(PICO_INPIX1, tmp2.word); + PICO_MVRC_W(PICO_INPIX2, tmp3.word); + set_pico_config(&h264_qpel4_v_lowpass_config2); + + + PICO_OP(PICO_SINGLE_VECTOR, 0, 0, 3, 6); + PICO_OP(PICO_SINGLE_VECTOR, 1, 1, 4, 7); +
PICO_OP(PICO_SINGLE_VECTOR, 2, 2, 5, 8); + PICO_OP(PICO_SINGLE_VECTOR, 3, 3, 6, 9); + + PICO_MVCR_W(tmp1.word, PICO_OUTPIX0); + tmp_block[3 + 3*4] = (char)(tmp1.bytes.b); + tmp_block[3 + 2*4] = (char)(tmp1.bytes.l); + tmp_block[3 + 1*4] = (char)(tmp1.bytes.u); + tmp_block[3] = (char)(tmp1.bytes.t); + + /* Compute the average */ + srcB= LD32(dst); + srcA= LD32(dst + dstStride); + src0= LD32(dst + dstStride*2); + src1= LD32(dst + dstStride*3); + + src2= LD32(tmp_block); + src3= LD32(tmp_block + 4); + src4= LD32(tmp_block + 8); + src5= LD32(tmp_block + 12); + + ST32(dst, rnd_avg32(srcB, src2)); + ST32(dst + dstStride, rnd_avg32(srcA, src3)); + ST32(dst + 2*dstStride, rnd_avg32(src0, src4)); + ST32(dst + 3*dstStride, rnd_avg32(src1, src5)); + } +} + +static struct pico_config_t h264_qpel4_hv_lowpass_config = { + .input_mode = PICO_HOR_FILTER_MODE, + .output_mode = PICO_PACKED_MODE, + .coeff_frac_bits = 10, + .offset_frac_bits = 10, + .coeff0_0 = 1, + .coeff0_1 = -5, + .coeff0_2 = 20, + .coeff0_3 = 512, + .coeff1_0 = -5, + .coeff1_1 = 25, + .coeff1_2 = -100, + .coeff1_3 = 0, + .coeff2_0 = 20, + .coeff2_1 = -100, + .coeff2_2 = 400, + .coeff2_3 = 0 +}; + +static void put_h264_qpel4_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ + + int32_t tmp_block[48]; + int32_t *tmp = tmp_block; + int i; + + set_pico_config(&h264_qpel4_hv_lowpass_config); + + src -= 2; + for ( i = 0; i < 2; i++ ){ + int srcB= LD32(src - 2*srcStride); + int srcA= LD32(src - 1*srcStride); + int src0= LD32(src + 0 *srcStride); + int src1= LD32(src + 1 *srcStride); + int src2= LD32(src + 2 *srcStride); + int src3= LD32(src + 3 *srcStride); + int src4= LD32(src + 4 *srcStride); + int src5= LD32(src + 5 *srcStride); + int src6= LD32(src + 6 *srcStride); + + PICO_MVRC_W(PICO_INPIX0, srcB); + PICO_MVRC_W(PICO_INPIX1, srcA); + PICO_MVRC_W(PICO_INPIX2, src0); + PICO_OP(0, 0, 0, 4, 8); + PICO_MVRC_W(PICO_INPIX2, src1); + PICO_MVRC_W(PICO_INPIX1, src2); + PICO_MVRC_W(PICO_INPIX0, src3); 
+ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8); + PICO_STCM_W(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + tmp += 3; + + PICO_OP(0, 0, 1, 5, 9); + PICO_MVRC_W(PICO_INPIX0, srcB); + PICO_MVRC_W(PICO_INPIX1, srcA); + PICO_MVRC_W(PICO_INPIX2, src0); + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9); + PICO_STCM_W(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + tmp += 3; + + PICO_MVRC_W(PICO_INPIX0, src1); + PICO_OP(0, 0, 4, 8, 0); + PICO_MVRC_W(PICO_INPIX2, src2); + PICO_MVRC_W(PICO_INPIX1, src3); + PICO_MVRC_W(PICO_INPIX0, src4); + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8); + PICO_STCM_W(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + tmp += 3; + + PICO_OP(0, 0, 1, 5, 9); + PICO_MVRC_W(PICO_INPIX0, srcA); + PICO_MVRC_W(PICO_INPIX1, src0); + PICO_MVRC_W(PICO_INPIX2, src1); + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9); + PICO_STCM_W(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + tmp += 3; + + PICO_MVRC_W(PICO_INPIX0, src2); + PICO_OP(0, 0, 4, 8, 0); + PICO_MVRC_W(PICO_INPIX2, src3); + PICO_MVRC_W(PICO_INPIX1, src4); + PICO_MVRC_W(PICO_INPIX0, src5); + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8); + PICO_STCM_W(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + tmp += 3; + + PICO_OP(0, 0, 1, 5, 9); + PICO_MVRC_W(PICO_INPIX0, src0); + PICO_MVRC_W(PICO_INPIX1, src1); + PICO_MVRC_W(PICO_INPIX2, src2); + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9); + PICO_STCM_W(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + tmp += 3; + + PICO_MVRC_W(PICO_INPIX0, src3); + PICO_OP(0, 0, 4, 8, 0); + PICO_MVRC_W(PICO_INPIX2, src4); + PICO_MVRC_W(PICO_INPIX1, src5); + PICO_MVRC_W(PICO_INPIX0, src6); + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8); + PICO_STCM_W(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + tmp += 3; + + PICO_OP(0, 0, 1, 5, 9); + PICO_MVRC_W(PICO_INPIX0, src1); + PICO_MVRC_W(PICO_INPIX1, src2); + 
PICO_MVRC_W(PICO_INPIX2, src3); + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9); + PICO_STCM_W(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + tmp += 3; + src += 2; + } + + src -= 1; + tmp -= 48; + + + PICO_PUT_W(PICO_CONFIG, + PICO_OUTPUT_MODE(PICO_PLANAR_MODE) + | PICO_INPUT_MODE(PICO_VERT_FILTER_MODE) + | PICO_COEFF_FRAC_BITS(10) + | PICO_OFFSET_FRAC_BITS(10)); + + for ( i = 0; i < 2; i++ ){ + int srcB= LD32(src - 2*srcStride); + int srcA= LD32(src - 1*srcStride); + int src0= LD32(src + 0 *srcStride); + int src1= LD32(src + 1 *srcStride); + int src2= LD32(src + 2 *srcStride); + int src3= LD32(src + 3 *srcStride); + int src4= LD32(src + 4 *srcStride); + int src5= LD32(src + 5 *srcStride); + int src6= LD32(src + 6 *srcStride); + + + PICO_LDCM_W_INC(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + PICO_MVRC_W(PICO_INPIX0, srcB); + PICO_MVRC_W(PICO_INPIX1, srcA); + PICO_MVRC_W(PICO_INPIX2, src0); + PICO_OP(PICO_USE_ACC, 0, 6, 3, 0); + PICO_MVRC_W(PICO_INPIX2, src1); + PICO_MVRC_W(PICO_INPIX1, src2); + PICO_MVRC_W(PICO_INPIX0, src3); + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 0, 6, 3, 0); + + PICO_LDCM_W_INC(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + PICO_OP(PICO_USE_ACC, 1, 9, 6, 3); + PICO_MVRC_W(PICO_INPIX0, srcB); + PICO_MVRC_W(PICO_INPIX1, srcA); + PICO_MVRC_W(PICO_INPIX2, src0); + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 1, 9, 6, 3); + + PICO_LDCM_W_INC(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + PICO_MVRC_W(PICO_INPIX0, srcA); + PICO_MVRC_W(PICO_INPIX1, src0); + PICO_MVRC_W(PICO_INPIX2, src1); + PICO_OP(PICO_USE_ACC, 2, 6, 3, 0); + PICO_MVRC_W(PICO_INPIX2, src2); + PICO_MVRC_W(PICO_INPIX1, src3); + PICO_MVRC_W(PICO_INPIX0, src4); + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 2, 6, 3, 0); + + PICO_LDCM_W_INC(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + PICO_OP(PICO_USE_ACC, 3, 9, 6, 3); 
+ PICO_MVRC_W(PICO_INPIX0, srcA); + PICO_MVRC_W(PICO_INPIX1, src0); + PICO_MVRC_W(PICO_INPIX2, src1); + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 3, 9, 6, 3); + + ST16(dst + 0*dstStride, (short)(PICO_GET_W(PICO_OUTPIX0) >> 16)); + ST16(dst + 1*dstStride, (short)PICO_GET_W(PICO_OUTPIX0)); + + + PICO_LDCM_W_INC(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + PICO_MVRC_W(PICO_INPIX0, src0); + PICO_MVRC_W(PICO_INPIX1, src1); + PICO_MVRC_W(PICO_INPIX2, src2); + PICO_OP(PICO_USE_ACC, 0, 6, 3, 0); + PICO_MVRC_W(PICO_INPIX2, src3); + PICO_MVRC_W(PICO_INPIX1, src4); + PICO_MVRC_W(PICO_INPIX0, src5); + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 0, 6, 3, 0); + + PICO_LDCM_W_INC(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + PICO_OP(PICO_USE_ACC, 1, 9, 6, 3); + PICO_MVRC_W(PICO_INPIX0, src0); + PICO_MVRC_W(PICO_INPIX1, src1); + PICO_MVRC_W(PICO_INPIX2, src2); + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 1, 9, 6, 3); + + PICO_LDCM_W_INC(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + PICO_MVRC_W(PICO_INPIX0, src1); + PICO_MVRC_W(PICO_INPIX1, src2); + PICO_MVRC_W(PICO_INPIX2, src3); + PICO_OP(PICO_USE_ACC, 2, 6, 3, 0); + PICO_MVRC_W(PICO_INPIX2, src4); + PICO_MVRC_W(PICO_INPIX1, src5); + PICO_MVRC_W(PICO_INPIX0, src6); + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 2, 6, 3, 0); + + PICO_LDCM_W_INC(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + PICO_OP(PICO_USE_ACC, 3, 9, 6, 3); + PICO_MVRC_W(PICO_INPIX0, src1); + PICO_MVRC_W(PICO_INPIX1, src2); + PICO_MVRC_W(PICO_INPIX2, src3); + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 3, 9, 6, 3); + + ST16(dst + 2*dstStride, (short)(PICO_GET_W(PICO_OUTPIX0) >> 16)); + ST16(dst + 3*dstStride, (short)PICO_GET_W(PICO_OUTPIX0)); + + dst += 2; + src += 2; + } +} + + + + +static void avg_h264_qpel4_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ + + int32_t tmp_block[48]; + int32_t *tmp 
= tmp_block; + int i; + + set_pico_config(&h264_qpel4_hv_lowpass_config); + + src -= 2; + for ( i = 0; i < 2; i++ ){ + int srcB= LD32(src - 2*srcStride); + int srcA= LD32(src - 1*srcStride); + int src0= LD32(src + 0 *srcStride); + int src1= LD32(src + 1 *srcStride); + int src2= LD32(src + 2 *srcStride); + int src3= LD32(src + 3 *srcStride); + int src4= LD32(src + 4 *srcStride); + int src5= LD32(src + 5 *srcStride); + int src6= LD32(src + 6 *srcStride); + + PICO_MVRC_W(PICO_INPIX0, srcB); + PICO_MVRC_W(PICO_INPIX1, srcA); + PICO_MVRC_W(PICO_INPIX2, src0); + PICO_OP(0, 0, 0, 4, 8); + PICO_MVRC_W(PICO_INPIX2, src1); + PICO_MVRC_W(PICO_INPIX1, src2); + PICO_MVRC_W(PICO_INPIX0, src3); + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8); + PICO_STCM_W(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + tmp += 3; + + PICO_OP(0, 0, 1, 5, 9); + PICO_MVRC_W(PICO_INPIX0, srcB); + PICO_MVRC_W(PICO_INPIX1, srcA); + PICO_MVRC_W(PICO_INPIX2, src0); + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9); + PICO_STCM_W(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + tmp += 3; + + PICO_MVRC_W(PICO_INPIX0, src1); + PICO_OP(0, 0, 4, 8, 0); + PICO_MVRC_W(PICO_INPIX2, src2); + PICO_MVRC_W(PICO_INPIX1, src3); + PICO_MVRC_W(PICO_INPIX0, src4); + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8); + PICO_STCM_W(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + tmp += 3; + + PICO_OP(0, 0, 1, 5, 9); + PICO_MVRC_W(PICO_INPIX0, srcA); + PICO_MVRC_W(PICO_INPIX1, src0); + PICO_MVRC_W(PICO_INPIX2, src1); + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9); + PICO_STCM_W(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + tmp += 3; + + PICO_MVRC_W(PICO_INPIX0, src2); + PICO_OP(0, 0, 4, 8, 0); + PICO_MVRC_W(PICO_INPIX2, src3); + PICO_MVRC_W(PICO_INPIX1, src4); + PICO_MVRC_W(PICO_INPIX0, src5); + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8); + PICO_STCM_W(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + tmp += 3; + 
+ PICO_OP(0, 0, 1, 5, 9); + PICO_MVRC_W(PICO_INPIX0, src0); + PICO_MVRC_W(PICO_INPIX1, src1); + PICO_MVRC_W(PICO_INPIX2, src2); + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9); + PICO_STCM_W(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + tmp += 3; + + PICO_MVRC_W(PICO_INPIX0, src3); + PICO_OP(0, 0, 4, 8, 0); + PICO_MVRC_W(PICO_INPIX2, src4); + PICO_MVRC_W(PICO_INPIX1, src5); + PICO_MVRC_W(PICO_INPIX0, src6); + PICO_OP(PICO_USE_ACC, 0, 0, 4, 8); + PICO_STCM_W(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + tmp += 3; + + PICO_OP(0, 0, 1, 5, 9); + PICO_MVRC_W(PICO_INPIX0, src1); + PICO_MVRC_W(PICO_INPIX1, src2); + PICO_MVRC_W(PICO_INPIX2, src3); + PICO_OP(PICO_USE_ACC, 0, 1, 5, 9); + PICO_STCM_W(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + tmp += 3; + src += 2; + } + + src -= 1; + tmp -= 48; + + + PICO_PUT_W(PICO_CONFIG, + PICO_OUTPUT_MODE(PICO_PLANAR_MODE) + | PICO_INPUT_MODE(PICO_VERT_FILTER_MODE) + | PICO_COEFF_FRAC_BITS(10) + | PICO_OFFSET_FRAC_BITS(10)); + + for ( i = 0; i < 2; i++ ){ + int srcB= LD32(src - 2*srcStride); + int srcA= LD32(src - 1*srcStride); + int src0= LD32(src + 0 *srcStride); + int src1= LD32(src + 1 *srcStride); + int src2= LD32(src + 2 *srcStride); + int src3= LD32(src + 3 *srcStride); + int src4= LD32(src + 4 *srcStride); + int src5= LD32(src + 5 *srcStride); + int src6= LD32(src + 6 *srcStride); + + PICO_LDCM_W_INC(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + PICO_MVRC_W(PICO_INPIX0, srcB); + PICO_MVRC_W(PICO_INPIX1, srcA); + PICO_MVRC_W(PICO_INPIX2, src0); + PICO_OP(PICO_USE_ACC, 0, 6, 3, 0); + PICO_MVRC_W(PICO_INPIX2, src1); + PICO_MVRC_W(PICO_INPIX1, src2); + PICO_MVRC_W(PICO_INPIX0, src3); + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 0, 6, 3, 0); + + PICO_LDCM_W_INC(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + PICO_OP(PICO_USE_ACC, 1, 9, 6, 3); + 
PICO_MVRC_W(PICO_INPIX0, srcB); + PICO_MVRC_W(PICO_INPIX1, srcA); + PICO_MVRC_W(PICO_INPIX2, src0); + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 1, 9, 6, 3); + + PICO_LDCM_W_INC(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + PICO_MVRC_W(PICO_INPIX0, srcA); + PICO_MVRC_W(PICO_INPIX1, src0); + PICO_MVRC_W(PICO_INPIX2, src1); + PICO_OP(PICO_USE_ACC, 2, 6, 3, 0); + PICO_MVRC_W(PICO_INPIX2, src2); + PICO_MVRC_W(PICO_INPIX1, src3); + PICO_MVRC_W(PICO_INPIX0, src4); + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 2, 6, 3, 0); + + PICO_LDCM_W_INC(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + PICO_OP(PICO_USE_ACC, 3, 9, 6, 3); + PICO_MVRC_W(PICO_INPIX0, srcA); + PICO_MVRC_W(PICO_INPIX1, src0); + PICO_MVRC_W(PICO_INPIX2, src1); + PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 3, 9, 6, 3); + + ST16(dst + 0*dstStride, rnd_avg32(LD16(dst + 0*dstStride), PICO_GET_W(PICO_OUTPIX0) >> 16)); + ST16(dst + 1*dstStride, rnd_avg32(LD16(dst + 1*dstStride), PICO_GET_W(PICO_OUTPIX0))); + + + PICO_LDCM_W_INC(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + PICO_MVRC_W(PICO_INPIX0, src0); + PICO_MVRC_W(PICO_INPIX1, src1); + PICO_MVRC_W(PICO_INPIX2, src2); + PICO_OP(PICO_USE_ACC, 0, 6, 3, 0); + PICO_MVRC_W(PICO_INPIX2, src3); + PICO_MVRC_W(PICO_INPIX1, src4); + PICO_MVRC_W(PICO_INPIX0, src5); + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 0, 6, 3, 0); + + PICO_LDCM_W_INC(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + PICO_OP(PICO_USE_ACC, 1, 9, 6, 3); + PICO_MVRC_W(PICO_INPIX0, src0); + PICO_MVRC_W(PICO_INPIX1, src1); + PICO_MVRC_W(PICO_INPIX2, src2); + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 1, 9, 6, 3); + + PICO_LDCM_W_INC(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + PICO_MVRC_W(PICO_INPIX0, src1); + PICO_MVRC_W(PICO_INPIX1, src2); + PICO_MVRC_W(PICO_INPIX2, src3); + PICO_OP(PICO_USE_ACC, 2, 6, 3, 0); + 
PICO_MVRC_W(PICO_INPIX2, src4); + PICO_MVRC_W(PICO_INPIX1, src5); + PICO_MVRC_W(PICO_INPIX0, src6); + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 2, 6, 3, 0); + + PICO_LDCM_W_INC(tmp, + PICO_REGVECT_VMU0_OUT, + PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT); + PICO_OP(PICO_USE_ACC, 3, 9, 6, 3); + PICO_MVRC_W(PICO_INPIX0, src1); + PICO_MVRC_W(PICO_INPIX1, src2); + PICO_MVRC_W(PICO_INPIX2, src3); + PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 3, 9, 6, 3); + + ST16(dst + 2*dstStride, rnd_avg32(LD16(dst + 2*dstStride), PICO_GET_W(PICO_OUTPIX0) >> 16)); + ST16(dst + 3*dstStride, rnd_avg32(LD16(dst + 3*dstStride), PICO_GET_W(PICO_OUTPIX0))); + + dst += 2; + src += 2; + } +} + + +static void put_h264_qpel8_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ + put_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride); + put_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride); + src += 4*srcStride; + dst += 4*dstStride; + put_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride); + put_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride); +} + +static void avg_h264_qpel8_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ + avg_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride); + avg_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride); + src += 4*srcStride; + dst += 4*dstStride; + avg_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride); + avg_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride); +} + +static void put_h264_qpel8_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ + put_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride); + put_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, srcStride); + src += 4*srcStride; + dst += 4*dstStride; + put_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride); + put_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, srcStride); +} + +static void avg_h264_qpel8_h_lowpass_pico(uint8_t 
*dst, uint8_t *src, int dstStride, int srcStride){ + avg_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride); + avg_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, srcStride); + src += 4*srcStride; + dst += 4*dstStride; + avg_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride); + avg_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, srcStride); +} + +static void put_h264_qpel8_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ + put_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride); + put_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride); + src += 4*srcStride; + dst += 4*dstStride; + put_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride); + put_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride); +} + +static void avg_h264_qpel8_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ + avg_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride); + avg_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride); + src += 4*srcStride; + dst += 4*dstStride; + avg_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride); + avg_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride); +} + +static void put_h264_qpel16_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ + put_h264_qpel8_v_lowpass_pico(dst , src , dstStride, srcStride); + put_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride); + src += 8*srcStride; + dst += 8*dstStride; + put_h264_qpel8_v_lowpass_pico(dst , src , dstStride, srcStride); + put_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride); +} + +static void avg_h264_qpel16_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ + avg_h264_qpel8_v_lowpass_pico(dst , src , dstStride, srcStride); + avg_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride); + src += 8*srcStride; + dst += 8*dstStride; + avg_h264_qpel8_v_lowpass_pico(dst , src , dstStride, 
srcStride); + avg_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride); +} + +static void put_h264_qpel16_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ + put_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride); + put_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride); + src += 8*srcStride; + dst += 8*dstStride; + put_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride); + put_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride); +} + +static void avg_h264_qpel16_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ + avg_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride); + avg_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride); + src += 8*srcStride; + dst += 8*dstStride; + avg_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride); + avg_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride); +} + +static void put_h264_qpel16_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ + put_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride); + put_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride); + src += 8*srcStride; + dst += 8*dstStride; + put_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride); + put_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride); +} + +static void avg_h264_qpel16_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ + avg_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride); + avg_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride); + src += 8*srcStride; + dst += 8*dstStride; + avg_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride); + avg_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride); +} + + +#define H264_MC(OPNAME, SIZE) \ +static void OPNAME ## h264_qpel ## SIZE ## _mc00_pico (uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\ +}\ +\ +static 
void OPNAME ## h264_qpel ## SIZE ## _mc10_pico(uint8_t *dst, uint8_t *src, int stride){\ + uint8_t half[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _h_lowpass_pico(half, src, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc20_pico(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_pico(dst, src, stride, stride);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc30_pico(uint8_t *dst, uint8_t *src, int stride){\ + uint8_t half[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _h_lowpass_pico(half, src, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc01_pico(uint8_t *dst, uint8_t *src, int stride){\ + uint8_t full[SIZE*(SIZE+5)];\ + uint8_t * const full_mid= full + SIZE*2;\ + uint8_t half[SIZE*SIZE];\ + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ + put_h264_qpel ## SIZE ## _v_lowpass_pico(half, full_mid, SIZE, SIZE);\ + OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc02_pico(uint8_t *dst, uint8_t *src, int stride){\ + uint8_t full[SIZE*(SIZE+5)];\ + uint8_t * const full_mid= full + SIZE*2;\ + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ + OPNAME ## h264_qpel ## SIZE ## _v_lowpass_pico(dst, full_mid, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc03_pico(uint8_t *dst, uint8_t *src, int stride){\ + uint8_t full[SIZE*(SIZE+5)];\ + uint8_t * const full_mid= full + SIZE*2;\ + uint8_t half[SIZE*SIZE];\ + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ + put_h264_qpel ## SIZE ## _v_lowpass_pico(half, full_mid, SIZE, SIZE);\ + OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## 
_mc11_pico(uint8_t *dst, uint8_t *src, int stride){\ + uint8_t full[SIZE*(SIZE+5)];\ + uint8_t * const full_mid= full + SIZE*2;\ + uint8_t halfH[SIZE*SIZE];\ + uint8_t halfV[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\ + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ + put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\ + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc31_pico(uint8_t *dst, uint8_t *src, int stride){\ + uint8_t full[SIZE*(SIZE+5)];\ + uint8_t * const full_mid= full + SIZE*2;\ + uint8_t halfH[SIZE*SIZE];\ + uint8_t halfV[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\ + copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\ + put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\ + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc13_pico(uint8_t *dst, uint8_t *src, int stride){\ + uint8_t full[SIZE*(SIZE+5)];\ + uint8_t * const full_mid= full + SIZE*2;\ + uint8_t halfH[SIZE*SIZE];\ + uint8_t halfV[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\ + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ + put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\ + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc33_pico(uint8_t *dst, uint8_t *src, int stride){\ + uint8_t full[SIZE*(SIZE+5)];\ + uint8_t * const full_mid= full + SIZE*2;\ + uint8_t halfH[SIZE*SIZE];\ + uint8_t halfV[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\ + copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\ + put_h264_qpel ## SIZE ## 
_v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\ + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc22_pico(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_pico(dst, src, stride, stride);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc21_pico(uint8_t *dst, uint8_t *src, int stride){\ + uint8_t halfH[SIZE*SIZE];\ + uint8_t halfHV[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\ + put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc23_pico(uint8_t *dst, uint8_t *src, int stride){\ + uint8_t halfH[SIZE*SIZE];\ + uint8_t halfHV[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\ + put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc12_pico(uint8_t *dst, uint8_t *src, int stride){\ + uint8_t full[SIZE*(SIZE+5)];\ + uint8_t * const full_mid= full + SIZE*2;\ + uint8_t halfV[SIZE*SIZE];\ + uint8_t halfHV[SIZE*SIZE];\ + copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ + put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\ + put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc32_pico(uint8_t *dst, uint8_t *src, int stride){\ + uint8_t full[SIZE*(SIZE+5)];\ + uint8_t * const full_mid= full + SIZE*2;\ + uint8_t halfV[SIZE*SIZE];\ + uint8_t halfHV[SIZE*SIZE];\ + copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\ + put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, 
full_mid, SIZE, SIZE);\ + put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\ +}\ + +H264_MC(put_, 4) +H264_MC(put_, 8) +H264_MC(put_, 16) +H264_MC(avg_, 4) +H264_MC(avg_, 8) +H264_MC(avg_, 16) + + + +#define dspfunc16(PFX) \ + void PFX ## _pixels16_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \ + PFX ## _pixels8_avr32(dst, pixels, line_size, h);\ + PFX ## _pixels8_avr32(dst + 8, pixels + 8, line_size, h);\ + }\ + void PFX ## _pixels16_h_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \ + PFX ## _pixels8_h_avr32(dst, pixels, line_size, h);\ + PFX ## _pixels8_h_avr32(dst + 8, pixels + 8, line_size, h);\ + }\ + void PFX ## _pixels16_v_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \ + PFX ## _pixels8_v_avr32(dst, pixels, line_size, h);\ + PFX ## _pixels8_v_avr32(dst + 8, pixels + 8, line_size, h);\ + }\ + void PFX ## _pixels16_hv_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \ + PFX ## _pixels8_hv_avr32(dst, pixels, line_size, h);\ + PFX ## _pixels8_hv_avr32(dst + 8, pixels + 8, line_size, h);\ + }\ + + +dspfunc16(put) +dspfunc16(put_no_rnd) +dspfunc16(avg) +dspfunc16(avg_no_rnd) +#undef dspfunc16 +/* Returns the sum of the 16x16 block of bytes at pix, rows being line_size bytes apart. Each of the 16 iterations loads four words (16 bytes) and accumulates them through punpckub.h/padd.h into s, which is used as two packed halfword lanes; the final addhh.w adds the top and bottom halfwords of s together. */ +static int pix_sum_avr32(uint8_t * pix, int line_size) +{ + int s, i; + + s = 0; + for (i = 0; i < 16; i++) { + int tmp1,tmp2,tmp3,tmp4,tmp5; + __asm__ volatile ( "ld.w\t%0, %6[0]\n\t" + "ld.w\t%1, %6[4]\n\t" + "ld.w\t%2, %6[8]\n\t" + "ld.w\t%3, %6[12]\n\t" + "punpckub.h\t%4, %0:t\n\t" + "padd.h\t%5, %5, %4\n\t" + "punpckub.h\t%4, %0:b\n\t" + "padd.h\t%5, %5, %4\n\t" + "punpckub.h\t%4, %1:t\n\t" + "padd.h\t%5, %5, %4\n\t" + "punpckub.h\t%4, %1:b\n\t" + "padd.h\t%5, %5, %4\n\t" + "punpckub.h\t%4, %2:t\n\t" + "padd.h\t%5, %5, %4\n\t" + "punpckub.h\t%4, %2:b\n\t" + "padd.h\t%5, %5, %4\n\t" + "punpckub.h\t%4, %3:t\n\t" + "padd.h\t%5, %5, %4\n\t" + "punpckub.h\t%4, %3:b\n\t" + "padd.h\t%5, %5,
%4\n\t" + : "=&r"(tmp1),"=&r"(tmp2),"=&r"(tmp3),"=&r"(tmp4),"=&r"(tmp5),"+r"(s) /* s is read-modify-written by every padd.h, so it must be an in/out ("+") operand; a write-only output would let the compiler drop s = 0 and the running sum */ + : "r"(pix)); + pix += line_size; + } + __asm__ volatile ( "addhh.w\t%0, %0:t, %0:b" : "+r" (s) ); /* both halfwords of s are read before it is overwritten, hence in/out */ + + return s; +} + + +//#define op_scale1(x) block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom ) +//#define op_scale2(x) dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1)) +//#define H264_WEIGHT(W,H) \ +//static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \ +// int attribute_unused x, y; \ +// offset <<= log2_denom; \ +// if(log2_denom) offset += 1<<(log2_denom-1); \ +// for(y=0; y<H; y++, block += stride){ \ +// uint32_t tmp0, tmp1; +// if(W==2) { \ +// asm volatile ( "ld.ub\t%[tmp0], %[block][0]\n" \ +// "ld.ub\t%[tmp1], %[block][1]\n" \ +// "mulhh.w\t%[tmp0], %[tmp0]:b, %[weight]:b\n" \ +// "mulhh.w\t%[tmp1], %[tmp1]:b, %[weight]:b\n" \ +// "asr\t%[tmp0], %[log2_denom]\n" \ +// "asr\t%[tmp1], %[log2_denom]\n" \ +// "satu\t%[tmp0] >> 0, 8\n" \ +// "satu\t%[tmp1] >> 0, 8\n" \ +// "st.b\t%[block][0], %[tmp0]\n" \ +// "st.b\t%[block][1], %[tmp1]\n" \ +// : [tmp0] "=&r"(tmp0), [tmp1] "=&r"(tmp1) \ +// : [block] "r"(block), [weight]"r"(weight), [log2_denom]"r"(log2denom) ); \ +// } else if ( W==4 ) { \ +// asm volatile ( "ld.w\t%[tmp0], %[block][0]\n" \ +// "punpckub.h\t%[tmp1], %[tmp0]:t\n" \ +// "punpckub.h\t%[tmp0], %[tmp0]:b\n" \ +// "mulhh.w\t%[tmp2], %[tmp1]:t, %[weight]:b\n" \ +// "mulhh.w\t%[tmp1], %[tmp1]:b, %[weight]:b\n" \ +// "asr\t%[tmp0], %[log2_denom]\n" \ +// "asr\t%[tmp1], %[log2_denom]\n" \ +// "satu\t%[tmp0] >> 0, 8\n" \ +// "satu\t%[tmp1] >> 0, 8\n" \ +// "st.b\t%[block][0], %[tmp0]\n" \ +// "st.b\t%[block][1], %[tmp1]\n" \ +// : [tmp0] "=&r"(tmp0), [tmp1] "=&r"(tmp1) \ +// : [block] "r"(block), [weight]"r"(weight), [log2_denom]"r"(log2denom) ); \ +// +// +// +// if(W==4) continue; \ +// op_scale1(4); \ +// op_scale1(5); \ +// op_scale1(6); \ +//
op_scale1(7); \ +// if(W==8) continue; \ +// op_scale1(8); \ +// op_scale1(9); \ +// op_scale1(10); \ +// op_scale1(11); \ +// op_scale1(12); \ +// op_scale1(13); \ +// op_scale1(14); \ +// op_scale1(15); \ +// } \ +//} \ +//static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets){ \ +// int attribute_unused x, y; \ +// int offset = (offsets + offsetd + 1) >> 1; \ +// offset = ((offset << 1) + 1) << log2_denom; \ +// for(y=0; y<H; y++, dst += stride, src += stride){ \ +// op_scale2(0); \ +// op_scale2(1); \ +// if(W==2) continue; \ +// op_scale2(2); \ +// op_scale2(3); \ +// if(W==4) continue; \ +// op_scale2(4); \ +// op_scale2(5); \ +// op_scale2(6); \ +// op_scale2(7); \ +// if(W==8) continue; \ +// op_scale2(8); \ +// op_scale2(9); \ +// op_scale2(10); \ +// op_scale2(11); \ +// op_scale2(12); \ +// op_scale2(13); \ +// op_scale2(14); \ +// op_scale2(15); \ +// } \ +//} + + + +/* Returns zero in each byte where the absolute difference between <a> and <b> + is not less than <compare> */ +#define PABS_DIFF_LESS_THAN( a, b, compare) \ + ({ uint32_t __tmp__, __tmp2__, __mask__; \ + asm ( \ + /* Check ABS( a - b ) < compare */ \ + "psubs.ub\t%[tmp], %[opa], %[opb]\n" \ + "psubs.ub\t%[tmp2], %[opb], %[opa]\n" \ + "or\t%[tmp], %[tmp2]\n" /* ABS ( a - b ) */ \ + /* This produces 0 for all bytes where the comparison is not true */ \ + "psubs.ub\t%[mask], %[cmp], %[tmp]\n" \ + : [tmp] "=&r"(__tmp__), [tmp2] "=&r"(__tmp2__), [mask] "=&r"(__mask__) \ + : [opa] "r"(a), [opb] "r"(b), [cmp] "r"(compare) ); \ + __mask__; }) + +/* + Set all bytes containing zero in <value> to 255 and the rest to zero. + + Add with saturation 254 to all bytes making all bytes different from + zero become 255. Then add one without saturation to make all bytes + originally containing zero 255 and the rest 0. 
*/ +#define SET_ALL_BITS_IN_ZERO_BYTES(value) \ + ({ uint32_t __tmp__; \ + asm ( \ + "padds.ub\t%[tmp], %[val], %[max_minus_one]\n" \ + "padd.b\t%[tmp], %[tmp], %[all_ones]\n" \ + : [tmp] "=r"(__tmp__) \ + : [val] "r"(value), [max_minus_one] "r"(0xFEFEFEFE), [all_ones] "r"(0x01010101) ); \ + __tmp__; }) +/* Pack the words <upper> and <lower> into the two signed halfwords of one register (packw.sh). */ +#define PACKW_SH(upper, lower) \ + ({ uint32_t __tmp__; \ + asm ( \ + "packw.sh\t%[tmp], %[u], %[l]\n" \ + : [tmp] "=r"(__tmp__) \ + : [u] "r"(upper), [l] "r"(lower) ); \ + __tmp__; }) +/* Pack the four signed halfwords of <upper> and <lower> into unsigned, saturated bytes (packsh.ub); alpha/beta are unsigned thresholds compared with psubs.ub, so a signed-byte pack would clamp them at 127. */ +#define PACKSH_UB(upper, lower) \ + ({ uint32_t __tmp__; \ + asm ( \ + "packsh.ub\t%[tmp], %[u], %[l]\n" \ + : [tmp] "=r"(__tmp__) \ + : [u] "r"(upper), [l] "r"(lower) ); \ + __tmp__; }) +/* Luma loop filter working on the rows at pix - 3*stride .. pix + 2*stride, four pixel columns per 32-bit load; groups with tc0[i] < 0 are skipped. NOTE(review): dsputil_init_avr32 leaves this function commented out, and inside the loop pix is advanced only on the tc0[i] < 0 path -- verify before enabling. */ +static void h264_v_loop_filter_luma_avr32(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +{ + int i; + + if ( alpha == 0 ) + return; + + alpha = PACKW_SH(alpha, alpha); + alpha = PACKSH_UB(alpha, alpha); + beta = PACKW_SH(beta, beta); + beta = PACKSH_UB(beta, beta); + + for( i = 0; i < 4; i++ ) { + uint32_t p0, p1, p2, q0, q1, q2; + uint32_t mask, mask2; + uint32_t tmp, tmp2, tmp3, tmp4; + + if( tc0[i] < 0 ) { + pix += 4; + continue; + } + +/* for( d = 0; d < 4; d++ ) { + const int p0 = pix[-1*stride]; + const int p1 = pix[-2*stride]; + const int p2 = pix[-3*stride]; + const int q0 = pix[0]; + const int q1 = pix[1*stride]; + const int q2 = pix[2*stride]; + + if( ABS( p0 - q0 ) < alpha && + ABS( p1 - p0 ) < beta && + ABS( q1 - q0 ) < beta ) { */ + + p0 = LD32(pix - stride); + p1 = LD32(pix - 2*stride); + q0 = LD32(pix); + q1 = LD32(pix + stride); + + /* Check which of the columns should be filtered, if any.
*/ + mask = PABS_DIFF_LESS_THAN(p0, q0, alpha); + mask |= PABS_DIFF_LESS_THAN(p1, p0, beta); + mask |= PABS_DIFF_LESS_THAN(q1, q0, beta); + + if ( !mask ) + continue; + + mask = SET_ALL_BITS_IN_ZERO_BYTES(mask); + + + int tc = PACKW_SH(tc0[i], tc0[i]); + int tc0_p = tc; + int tc0_m = PACKW_SH(-tc0[i], -tc0[i]); + + /* + int i_delta; + if( ABS( p2 - p0 ) < beta ) { + pix[-2*stride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] ); + tc++; + }*/ + + p2 = LD32(pix - 3*stride); + mask2 = PABS_DIFF_LESS_THAN(p2, p0, beta) & ~mask; + + if ( mask2 ){ + mask2 = SET_ALL_BITS_IN_ZERO_BYTES(mask2); + asm ("pavg.ub\t%[tmp], %[p0], %[q0]\n" + "paddh.ub\t%[tmp], %[tmp], %[p2]\n" + "punpckub.h\t%[tmp2], %[tmp]:t\n" + "punpckub.h\t%[tmp], %[tmp]:b\n" + "punpckub.h\t%[tmp3], %[p1]:t\n" + "punpckub.h\t%[tmp4], %[p1]:b\n" + "psub.h\t%[tmp2], %[tmp2], %[tmp3]\n" + "psub.h\t%[tmp], %[tmp], %[tmp4]\n" + "pmin.sh\t%[tmp2], %[tmp2], %[tc0_p]\n" + "pmin.sh\t%[tmp], %[tmp], %[tc0_p]\n" + "pmax.sh\t%[tmp2], %[tmp2], %[tc0_m]\n" + "pmax.sh\t%[tmp], %[tmp], %[tc0_m]\n" + "padd.h\t%[tmp2], %[tmp2], %[tmp3]\n" + "padd.h\t%[tmp], %[tmp], %[tmp4]\n" + "packsh.ub\t%[tmp], %[tmp2], %[tmp]\n" + "andn\t%[tmp], %[mask2]\n" + "and\t%[tmp2], %[q1], %[mask2]\n" + "or\t%[tmp], %[tmp2]\n" + : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3), + [tmp4]"=&r"(tmp4) + : [q0]"r"(q0), [p2]"r"(p2), [p1]"r"(p1), [p0]"r"(p0), [q1]"r"(q1), [tc0_p]"r"(tc0_p), + [tc0_m]"r"(tc0_m), [mask2]"r"(mask2)); + ST32(pix - 2*stride, tmp); + tc += 0x00010001; + } + + + q2 = LD32(pix + 2*stride); + + /* + if( ABS( q2 - q0 ) < beta ) { + pix[ stride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] ); + tc++; + } + */ + mask2 = PABS_DIFF_LESS_THAN(q2, q0, beta) & ~mask; + + if ( mask2 ){ + mask2 = SET_ALL_BITS_IN_ZERO_BYTES(mask2); + asm ("pavg.ub\t%[tmp], %[p0], %[q0]\n" + "paddh.ub\t%[tmp], %[tmp], %[q2]\n" + "punpckub.h\t%[tmp2], %[tmp]:t\n" + "punpckub.h\t%[tmp], 
%[tmp]:b\n" + "punpckub.h\t%[tmp3], %[q1]:t\n" + "punpckub.h\t%[tmp4], %[q1]:b\n" + "psub.h\t%[tmp2], %[tmp2], %[tmp3]\n" + "psub.h\t%[tmp], %[tmp], %[tmp4]\n" + "pmin.sh\t%[tmp2], %[tmp2], %[tc0_p]\n" + "pmin.sh\t%[tmp], %[tmp], %[tc0_p]\n" + "pmax.sh\t%[tmp2], %[tmp2], %[tc0_m]\n" + "pmax.sh\t%[tmp], %[tmp], %[tc0_m]\n" + "padd.h\t%[tmp2], %[tmp2], %[tmp3]\n" + "padd.h\t%[tmp], %[tmp], %[tmp4]\n" + "packsh.ub\t%[tmp], %[tmp2], %[tmp]\n" + "andn\t%[tmp], %[mask2]\n" + "and\t%[tmp2], %[q1], %[mask2]\n" + "or\t%[tmp], %[tmp2]\n" + : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3), + [tmp4]"=&r"(tmp4) + : [q0]"r"(q0), [q2]"r"(q2), [q1]"r"(q1), [p0]"r"(p0), [tc0_p]"r"(tc0_p), + [tc0_m]"r"(tc0_m), [mask2]"r"(mask2)); + ST32(pix + stride, tmp); + tc += 0x00010001; + } + + uint32_t old_p0 = p0; + uint32_t old_q0 = q0; + + /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); + pix[-stride] = clip_uint8( p0 + i_delta ); + pix[0] = clip_uint8( q0 - i_delta ); */ + + asm ( + /* Check if the two upper pixels should be filtered */ + "lsr\t%[tmp], %[inv_mask], 16\n" + "breq\t0f\n" + + "punpckub.h\t%[tmp], %[p1]:t\n" + "punpckub.h\t%[tmp2], %[q1]:t\n" + + /* p1 - q1 */ + "psub.h\t%[tmp], %[tmp], %[tmp2]\n" + + "punpckub.h\t%[tmp3], %[q0]:t\n" + "punpckub.h\t%[tmp4], %[p0]:t\n" + + /* q0 - p0 */ + "psub.h\t%[tmp2], %[tmp3], %[tmp4]\n" + + /* (q0 - p0) << 2 */ + "plsl.h\t%[tmp2], %[tmp2], 2\n" + + /* ((q0 - p0) << 2) + (p1 - q1) */ + "padd.h\t%[tmp2], %[tmp2], %[tmp]\n" + + "mov\t%[tmp], 0x00040004\n" + /* ((q0 - p0) << 2) + (p1 - q1) + 4*/ + "padd.h\t%[tmp2], %[tmp2], %[tmp]\n" + + /* (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3*/ + "pasr.h\t%[tmp2], %[tmp2], 3\n" + + "mov\t%[tmp], 0\n" + "psub.h\t%[tmp], %[tmp], %[tc]\n" + + /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); */ + "pmin.sh\t%[tmp2], %[tmp2], %[tc]\n" + "pmax.sh\t%[tmp2], %[tmp2], %[tmp]\n" + + + /* pix[-stride] = clip_uint8( p0 + i_delta ); */ + "padd.h\t%[tmp4], %[tmp4], 
%[tmp2]\n" + + + /* pix[0] = clip_uint8( q0 - i_delta ); */ + "psub.h\t%[tmp3], %[tmp3], %[tmp2]\n" + + /* Check if the two lower pixels should be filtered */ + "lsl\t%[tmp2], %[inv_mask], 16\n" + "breq\t1f\n" + + "0:\n" + "punpckub.h\t%[p1], %[p1]:b\n" + "punpckub.h\t%[q1], %[q1]:b\n" + + /* p1 - q1 */ + "psub.h\t%[p1], %[p1], %[q1]\n" + + "punpckub.h\t%[q0], %[q0]:b\n" + "punpckub.h\t%[p0], %[p0]:b\n" + + /* q0 - p0 */ + "psub.h\t%[tmp2], %[q0], %[p0]\n" + + /* (q0 - p0) << 2 */ + "plsl.h\t%[tmp2], %[tmp2], 2\n" + + /* ((q0 - p0) << 2) + (p1 - q1) */ + "padd.h\t%[tmp2], %[tmp2], %[p1]\n" + + "mov\t%[q1], 0x00040004\n" + /* ((q0 - p0) << 2) + (p1 - q1) + 4*/ + "padd.h\t%[tmp2], %[tmp2], %[q1]\n" + + /* (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3*/ + "pasr.h\t%[tmp2], %[tmp2], 3\n" + + /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); */ + "pmin.sh\t%[tmp2], %[tmp2], %[tc]\n" + "pmax.sh\t%[tmp2], %[tmp2], %[tmp]\n" + + /* pix[-stride] = clip_uint8( p0 + i_delta ); */ + "padd.h\t%[p0], %[p0], %[tmp2]\n" + + /* pix[0] = clip_uint8( q0 - i_delta ); */ + "psub.h\t%[q0], %[q0], %[tmp2]\n" + + "1:\n" + "packsh.ub\t%[p0], %[tmp4], %[p0]\n" + "packsh.ub\t%[q0], %[tmp3], %[tmp4]\n" + + : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3), + [tmp4]"=&r"(tmp4), [q0]"=&r"(q0), [q1]"=&r"(q1), [p0]"=&r"(p0), [p1]"=&r"(p1) + : [tc]"r"(tc), [inv_mask]"r"(~mask)); + + ST32(pix - stride, (mask & old_p0) | (p0 & ~mask)); + ST32(pix, (mask & old_q0) | (q0 & ~mask)); + + } + pix += 1; +} + + + + +#ifdef CHECK_DSP_FUNCS_AGAINST_C + +void dump_block8(uint8_t *block, int line_size, int h){ + int i, j; + + for ( i = 0; i < h ; i++ ){ + av_log(NULL, AV_LOG_ERROR, "\t"); + for ( j = 0; j < 8 ; j++ ){ + av_log(NULL, AV_LOG_ERROR, "%d ", block[j + i*line_size]); + } + av_log(NULL, AV_LOG_ERROR, "\n"); + } +} + +void dump_block4(uint8_t *block, int line_size, int h){ + int i, j; + + for ( i = 0; i < h ; i++ ){ + av_log(NULL, AV_LOG_ERROR, "\t"); + for ( j = 0; j < 4 ; j++ 
){ + av_log(NULL, AV_LOG_ERROR, "%d ", block[j + i*line_size]); + } + av_log(NULL, AV_LOG_ERROR, "\n"); + } +} + +void dump_block(uint8_t *block, int line_size, int h, int w){ + int i, j; + + for ( i = 0; i < h ; i++ ){ + av_log(NULL, AV_LOG_ERROR, "\t"); + for ( j = 0; j < w ; j++ ){ + av_log(NULL, AV_LOG_ERROR, "%d ", block[j + i*line_size]); + } + av_log(NULL, AV_LOG_ERROR, "\n"); + } +} + +void check_block8(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct, + int h, char *name, int max_dev){ + int i,j; + for ( i = 0; i < 8 ; i++ ){ + for ( j = 0; j < h ; j++ ){ + int diff = test[i + line_size_test*j] - correct[i + line_size_correct*j]; + diff = diff < 0 ? -diff : diff; + if ( diff > max_dev ){ + av_log(NULL, AV_LOG_ERROR, "Error pixel x=%i, y=%i differs. Is 0x%x should be 0x%x\n", + i, j, test[i + line_size_test*j], correct[i + j*line_size_correct]); + av_log(NULL, AV_LOG_ERROR, "Error resulting block from %s is:\n", name); + dump_block8(test, line_size_test, h); + av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n"); + dump_block8(correct, line_size_correct, h); + exit(1); + } + } + } +} + +void check_block4(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct, + int h, char *name, int max_dev){ + int i,j; + for ( i = 0; i < 4 ; i++ ){ + for ( j = 0; j < h ; j++ ){ + int diff = test[i + line_size_test*j] - correct[i + line_size_correct*j]; + diff = diff < 0 ? -diff : diff; + if ( diff > max_dev ){ + av_log(NULL, AV_LOG_ERROR, "Error pixel x=%i, y=%i differs. 
Is 0x%x should be 0x%x\n", + i, j, test[i + line_size_test*j], correct[i + j*line_size_correct]); + av_log(NULL, AV_LOG_ERROR, "Error resulting block from %s is:\n", name); + dump_block4(test, line_size_test, h); + av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n"); + dump_block4(correct, line_size_correct, h); + exit(1); + } + } + } +} +/* Compare a width x h region of <test> against <correct>; on the first pixel differing by more than max_dev, log both blocks and exit(1). */ +void check_block(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct, + int h, int width, char *name, int max_dev){ + int i,j; + for ( i = 0; i < width ; i++ ){ + for ( j = 0; j < h ; j++ ){ + int diff = test[i + line_size_test*j] - correct[i + line_size_correct*j]; + diff = diff < 0 ? -diff : diff; + if ( diff > max_dev ){ + av_log(NULL, AV_LOG_ERROR, "Error pixel x=%i, y=%i differs. Is 0x%x should be 0x%x\n", + i, j, test[i + line_size_test*j], correct[i + j*line_size_correct]); + av_log(NULL, AV_LOG_ERROR, "Error resulting block from %s is:\n", name); + dump_block(test, line_size_test, h, width); + av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n"); + dump_block(correct, line_size_correct, h, width); + exit(1); + } + } + } +} +/* Log an 8x8 coefficient block in hex, one row per line. */ +void dump_dct_block(DCTELEM *block){ + int i, j; + + for ( i = 0; i < 8 ; i++ ){ + av_log(NULL, AV_LOG_ERROR, "\t"); + for ( j = 0; j < 8 ; j++ ){ + av_log(NULL, AV_LOG_ERROR, "0x%x ", block[j + i*8]); + } + av_log(NULL, AV_LOG_ERROR, "\n"); + } +} +/* Run idct_avr32 on <block> and simple_idct on its transpose; log both blocks and exit(1) if any coefficient differs. */ +void test_idct_avr32(DCTELEM *block){ + DCTELEM testBlock[64]; + int i, j; + + /* Copy transposed block to testBlock */ + for ( i = 0; i < 8 ; i++ ){ + for ( j = 0; j < 8 ; j++ ){ + testBlock[i + 8*j] = block[j + i*8]; + } + } + + idct_avr32(block); + simple_idct(testBlock); + + for ( i = 0; i < 64 ; i++ ){ + if ( block[i] != testBlock[i] ){ + av_log(NULL, AV_LOG_ERROR, "Error resulting block from idct is:\n"); + dump_dct_block(block); + av_log(NULL, AV_LOG_ERROR, "But should be equal to the transposed of:\n"); + dump_dct_block(testBlock); + exit(1); + } + } +} +/* Check idct_put_avr32 against simple_idct_put fed the transposed block, allowing a deviation of 1 per pixel. */ +void test_idct_put_avr32(uint8_t *dest, int line_size, 
DCTELEM *block){ + uint8_t testBlock[64]; + DCTELEM blockCopy[64]; + int i, j; + + /* Copy transposed block to blockCopy */ + for ( i = 0; i < 8 ; i++ ){ + for ( j = 0; j < 8 ; j++ ){ + blockCopy[i + 8*j] = block[j + i*8]; + } + } + + idct_put_avr32(dest, line_size, block); + simple_idct_put(testBlock, 8, blockCopy); + + check_block8(dest, testBlock, line_size, 8, 8, "idct_put", 1); +} + +/* Check idct_add_avr32 against simple_idct_add run on a copy of <dest> with the transposed block, allowing a deviation of 1 per pixel. */ +void test_idct_add_avr32(uint8_t *dest, int line_size, DCTELEM *block){ + uint8_t testBlock[64]; + DCTELEM blockCopy[64]; + int i, j; + + /* Copy dest to testBlock */ + for ( i = 0; i < 8 ; i++ ){ + for ( j = 0; j < 8 ; j++ ){ + testBlock[i + 8*j] = dest[i + j*line_size]; + } + } + + /* Copy transposed block to blockCopy */ + for ( i = 0; i < 8 ; i++ ){ + for ( j = 0; j < 8 ; j++ ){ + blockCopy[i + 8*j] = block[j + i*8]; + } + } + + idct_add_avr32(dest, line_size, block); + simple_idct_add(testBlock, 8, blockCopy); + + check_block8(dest, testBlock, line_size, 8, 8, "idct_add", 1); +} +/* Run ff_h264_idct_add_c on <dest> and h264_idct_add_avr32 on 4x4 copies of the same inputs; the two results must match exactly. */ +void test_h264_idct_add_avr32(uint8_t *dest, DCTELEM *block, int stride){ + uint8_t testBlock[16]; + DCTELEM blockCopy[16]; + int i, j; + + /* Copy dest to testBlock */ + for ( i = 0; i < 4 ; i++ ){ + for ( j = 0; j < 4 ; j++ ){ + testBlock[i + 4*j] = dest[i + j*stride]; + } + } + + /* Copy source block to blockCopy */ + for ( i = 0; i < 16 ; i++ ){ + blockCopy[i] = block[i]; + } + + ff_h264_idct_add_c(dest, block, stride); + + h264_idct_add_avr32(testBlock, blockCopy, 4); + + check_block(dest, testBlock, stride, 4, 4, 4, "h264_idct_add", 0); +} +/* Run ff_h264_idct8_add_c on <dest> and h264_idct8_add_avr32 on 8x8 copies of the same inputs; the two results must match exactly. */ +void test_h264_idct8_add_avr32(uint8_t *dest, DCTELEM *block, int stride){ + uint8_t testBlock[8*8]; + DCTELEM blockCopy[8*8]; + int i, j; + + /* Copy dest to testBlock */ + for ( i = 0; i < 8 ; i++ ){ + for ( j = 0; j < 8 ; j++ ){ + testBlock[i + 8*j] = dest[i + j*stride]; + } + } + + /* Copy source block to blockCopy */ + for ( i = 0; i < 8*8 ; i++ ){ + blockCopy[i] = block[i]; + } + + ff_h264_idct8_add_c(dest, block, stride); +
h264_idct8_add_avr32(testBlock, blockCopy, 8); + + check_block(dest, testBlock, stride, 8, 8, 8, "h264_idct8_add", 0); +} + +void test_put_pixels_funcs8(op_pixels_func test, op_pixels_func correct, uint8_t *block, + const uint8_t *pixels, int line_size, int h, char *name, int in_h_size, int in_v_size){ + uint8_t *testBlock, *testBlock2; + int i, j; + int input_v_size = h + in_v_size; + int input_h_size = 8 + in_h_size; + + testBlock = alloca(input_h_size*input_v_size); + testBlock2 = alloca(input_h_size*input_v_size); + + for ( i = 0; i < input_h_size ; i++ ){ + for ( j = 0; j < input_v_size ; j++ ){ + testBlock[i + input_h_size*j] = pixels[i + j*line_size]; + } + } + + test(block, pixels, line_size, h); + correct(testBlock2, testBlock, input_h_size, h); + + check_block8(block, testBlock2, line_size, input_h_size, h, name, 0); + +} + +void test_h264_chroma_mc_funcs(h264_chroma_mc_func test, h264_chroma_mc_func correct, uint8_t *dst, + uint8_t *src, int stride, int h, int w, int x, int y, char *name){ + uint8_t *testBlock, *testBlock2; + int i, j; + int input_v_size = h + 1; + int input_h_size = ((w + 1) + 3) & ~3; + + testBlock = alloca(input_h_size*input_v_size); + testBlock2 = alloca(input_h_size*input_v_size); + + for ( i = 0; i < w + 1 ; i++ ){ + for ( j = 0; j < h + 1 ; j++ ){ + testBlock[i + input_h_size*j] = src[i + j*stride]; + } + } + + for ( i = 0; i < w ; i++ ){ + for ( j = 0; j < h ; j++ ){ + testBlock2[i + input_h_size*j] = dst[i + j*stride]; + } + } + + test(dst, src, stride, h, x, y); + correct(testBlock2, testBlock, input_h_size, h, x, y); + + check_block(dst, testBlock2, stride, input_h_size, h, w, name, 0); + +} + +void test_qpel_mc_funcs(qpel_mc_func test, qpel_mc_func correct, uint8_t *dst, + uint8_t *src, int stride, int size, char *name){ + uint8_t *testBlock, *testBlock2; + int i, j; + int test_stride = size + 8; + + testBlock = alloca(test_stride*(size+8)) + 4 + test_stride*4; + testBlock2 = alloca(test_stride*size); + + for ( i = -4; i < 
size+4 ; i++ ){ + for ( j = -4; j < size+4 ; j++ ){ + testBlock[i + test_stride*j] = src[i + j*stride]; + } + } + + for ( i = 0; i < size ; i++ ){ + for ( j = 0; j < size ; j++ ){ + testBlock2[i + test_stride*j] = dst[i + j*stride]; + } + } + + correct(dst, src, stride); + test(testBlock2, testBlock, test_stride); + + check_block(testBlock2, dst, test_stride, stride, size, size, name, 0); + +} + + +#define test_pixels_funcs(PFX, NUM ) \ +void test_ ## PFX ## _pixels ## NUM ## _avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \ + test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _avr32, PFX ## _pixels ## NUM ## _c, \ + block, pixels, line_size, h, "test_" #PFX "_pixels", 0, 0); } \ +void test_ ## PFX ## _pixels ## NUM ## _h_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \ + test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _h_avr32, PFX ## _pixels ## NUM ## _x2_c, \ + block, pixels, line_size, h, "test_" #PFX "_pixels_h", 1, 0); } \ +void test_ ## PFX ## _pixels ## NUM ## _v_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \ + test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _v_avr32, PFX ## _pixels ## NUM ## _y2_c, \ + block, pixels, line_size, h, "test_" #PFX "_pixels_v", 0, 1); } \ +void test_ ## PFX ## _pixels ## NUM ## _hv_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \ + test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _hv_avr32, PFX ## _pixels ## NUM ## _xy2_c, \ + block, pixels, line_size, h, "test_" #PFX "_pixels_hv", 1, 1); } + +test_pixels_funcs(put, 8); +test_pixels_funcs(put_no_rnd, 8); +test_pixels_funcs(put, 16); +test_pixels_funcs(put_no_rnd, 16); + +test_pixels_funcs(avg, 8); +test_pixels_funcs(avg_no_rnd, 8); +test_pixels_funcs(avg, 16); +test_pixels_funcs(avg_no_rnd, 16); + +#define test_h264_chroma_mc_funcs(PFX, NUM ) \ +void test_ ## PFX ## _h264_chroma_mc ## NUM ## _pico( uint8_t *dst, uint8_t *src, int stride, int h, int x, int y){ \ + test_h264_chroma_mc_funcs(PFX 
## _h264_chroma_mc ## NUM ## _pico, PFX ## _h264_chroma_mc ## NUM ## _c, \ + dst, src, stride, h, NUM, x, y, "test_" #PFX "_h264_chroma_mc" #NUM "_pico"); } \ + +test_h264_chroma_mc_funcs(put, 2); +test_h264_chroma_mc_funcs(put, 4); +test_h264_chroma_mc_funcs(put, 8); +test_h264_chroma_mc_funcs(avg, 2); +test_h264_chroma_mc_funcs(avg, 4); +test_h264_chroma_mc_funcs(avg, 8); + +#define test_qpel_mc_funcs_type(PFX, NUM, TYPE ) \ +void test_ ## PFX ## NUM ## _ ## TYPE ## _pico( uint8_t *dst, uint8_t *src, int stride){ \ + test_qpel_mc_funcs(PFX ## NUM ## _ ## TYPE ## _pico, PFX ## NUM ## _ ## TYPE ## _c, \ + dst, src, stride, NUM, "test_" #PFX #NUM "_" #TYPE "_pico"); } + +#define test_qpel_mc_funcs(PFX, NUM) \ + test_qpel_mc_funcs_type(PFX, NUM, mc00);\ + test_qpel_mc_funcs_type(PFX, NUM, mc10);\ + test_qpel_mc_funcs_type(PFX, NUM, mc20);\ + test_qpel_mc_funcs_type(PFX, NUM, mc30);\ + test_qpel_mc_funcs_type(PFX, NUM, mc01);\ + test_qpel_mc_funcs_type(PFX, NUM, mc11);\ + test_qpel_mc_funcs_type(PFX, NUM, mc21);\ + test_qpel_mc_funcs_type(PFX, NUM, mc31);\ + test_qpel_mc_funcs_type(PFX, NUM, mc02);\ + test_qpel_mc_funcs_type(PFX, NUM, mc12);\ + test_qpel_mc_funcs_type(PFX, NUM, mc22);\ + test_qpel_mc_funcs_type(PFX, NUM, mc32);\ + test_qpel_mc_funcs_type(PFX, NUM, mc03);\ + test_qpel_mc_funcs_type(PFX, NUM, mc13);\ + test_qpel_mc_funcs_type(PFX, NUM, mc23);\ + test_qpel_mc_funcs_type(PFX, NUM, mc33) + +test_qpel_mc_funcs(put_h264_qpel, 4); +test_qpel_mc_funcs(put_h264_qpel, 8); +test_qpel_mc_funcs(put_h264_qpel, 16); +test_qpel_mc_funcs(avg_h264_qpel, 4); +test_qpel_mc_funcs(avg_h264_qpel, 8); +test_qpel_mc_funcs(avg_h264_qpel, 16); + + +#define dspfunc(PFX, IDX, NUM) \ + c->PFX ## _pixels_tab[IDX][ 0] = DSP_FUNC_NAME( PFX ## NUM ## _mc00_pico ); \ + c->PFX ## _pixels_tab[IDX][ 1] = DSP_FUNC_NAME( PFX ## NUM ## _mc10_pico ); \ + c->PFX ## _pixels_tab[IDX][ 2] = DSP_FUNC_NAME( PFX ## NUM ## _mc20_pico ); \ + c->PFX ## _pixels_tab[IDX][ 3] = DSP_FUNC_NAME( PFX ## NUM ## 
_mc30_pico ); \ + c->PFX ## _pixels_tab[IDX][ 4] = DSP_FUNC_NAME( PFX ## NUM ## _mc01_pico ); \ + c->PFX ## _pixels_tab[IDX][ 5] = DSP_FUNC_NAME( PFX ## NUM ## _mc11_pico ); \ + c->PFX ## _pixels_tab[IDX][ 6] = DSP_FUNC_NAME( PFX ## NUM ## _mc21_pico ); \ + c->PFX ## _pixels_tab[IDX][ 7] = DSP_FUNC_NAME( PFX ## NUM ## _mc31_pico ); \ + c->PFX ## _pixels_tab[IDX][ 8] = DSP_FUNC_NAME( PFX ## NUM ## _mc02_pico ); \ + c->PFX ## _pixels_tab[IDX][ 9] = DSP_FUNC_NAME( PFX ## NUM ## _mc12_pico ); \ + c->PFX ## _pixels_tab[IDX][10] = DSP_FUNC_NAME( PFX ## NUM ## _mc22_pico ); \ + c->PFX ## _pixels_tab[IDX][11] = DSP_FUNC_NAME( PFX ## NUM ## _mc32_pico ); \ + c->PFX ## _pixels_tab[IDX][12] = DSP_FUNC_NAME( PFX ## NUM ## _mc03_pico ); \ + c->PFX ## _pixels_tab[IDX][13] = DSP_FUNC_NAME( PFX ## NUM ## _mc13_pico ); \ + c->PFX ## _pixels_tab[IDX][14] = DSP_FUNC_NAME( PFX ## NUM ## _mc23_pico ); \ + c->PFX ## _pixels_tab[IDX][15] = DSP_FUNC_NAME( PFX ## NUM ## _mc33_pico ) + +#endif + +void dsputil_init_avr32(DSPContext* c, AVCodecContext *avctx) +{ + + /* H264 */ + + if ( 0 /*avr32_use_pico*/ ){ + c->put_h264_chroma_pixels_tab[0]= DSP_FUNC_NAME(put_h264_chroma_mc8_pico); + c->put_h264_chroma_pixels_tab[1]= DSP_FUNC_NAME(put_h264_chroma_mc4_pico); + c->put_h264_chroma_pixels_tab[2]= DSP_FUNC_NAME(put_h264_chroma_mc2_pico); + + c->avg_h264_chroma_pixels_tab[0]= DSP_FUNC_NAME(avg_h264_chroma_mc8_pico); + c->avg_h264_chroma_pixels_tab[1]= DSP_FUNC_NAME(avg_h264_chroma_mc4_pico); + c->avg_h264_chroma_pixels_tab[2]= DSP_FUNC_NAME(avg_h264_chroma_mc2_pico); + } + +#define dspfunc(PFX, IDX, NUM) \ + c->PFX ## _pixels_tab[IDX][ 0] = DSP_FUNC_NAME( PFX ## NUM ## _mc00_pico ); \ + c->PFX ## _pixels_tab[IDX][ 1] = DSP_FUNC_NAME( PFX ## NUM ## _mc10_pico ); \ + c->PFX ## _pixels_tab[IDX][ 2] = DSP_FUNC_NAME( PFX ## NUM ## _mc20_pico ); \ + c->PFX ## _pixels_tab[IDX][ 3] = DSP_FUNC_NAME( PFX ## NUM ## _mc30_pico ); \ + c->PFX ## _pixels_tab[IDX][ 4] = DSP_FUNC_NAME( PFX ## NUM ## _mc01_pico 
); \ + c->PFX ## _pixels_tab[IDX][ 5] = DSP_FUNC_NAME( PFX ## NUM ## _mc11_pico ); \ + c->PFX ## _pixels_tab[IDX][ 6] = DSP_FUNC_NAME( PFX ## NUM ## _mc21_pico ); \ + c->PFX ## _pixels_tab[IDX][ 7] = DSP_FUNC_NAME( PFX ## NUM ## _mc31_pico ); \ + c->PFX ## _pixels_tab[IDX][ 8] = DSP_FUNC_NAME( PFX ## NUM ## _mc02_pico ); \ + c->PFX ## _pixels_tab[IDX][ 9] = DSP_FUNC_NAME( PFX ## NUM ## _mc12_pico ); \ + c->PFX ## _pixels_tab[IDX][10] = DSP_FUNC_NAME( PFX ## NUM ## _mc22_pico ); \ + c->PFX ## _pixels_tab[IDX][11] = DSP_FUNC_NAME( PFX ## NUM ## _mc32_pico ); \ + c->PFX ## _pixels_tab[IDX][12] = DSP_FUNC_NAME( PFX ## NUM ## _mc03_pico ); \ + c->PFX ## _pixels_tab[IDX][13] = DSP_FUNC_NAME( PFX ## NUM ## _mc13_pico ); \ + c->PFX ## _pixels_tab[IDX][14] = DSP_FUNC_NAME( PFX ## NUM ## _mc23_pico ); \ + c->PFX ## _pixels_tab[IDX][15] = DSP_FUNC_NAME( PFX ## NUM ## _mc33_pico ) + + if ( avr32_use_pico ){ + dspfunc(put_h264_qpel, 0, 16); + dspfunc(put_h264_qpel, 1, 8); + dspfunc(put_h264_qpel, 2, 4); + dspfunc(avg_h264_qpel, 0, 16); + dspfunc(avg_h264_qpel, 1, 8); + dspfunc(avg_h264_qpel, 2, 4); + } + + c->idct_put= DSP_FUNC_NAME(idct_put_avr32); + c->idct_add= DSP_FUNC_NAME(idct_add_avr32); + c->idct = DSP_FUNC_NAME(idct_avr32); + c->h264_idct_add = DSP_FUNC_NAME(h264_idct_add_avr32); + c->h264_idct8_add = DSP_FUNC_NAME(h264_idct8_add_avr32); + + /*c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_avr32;*/ + + c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM; + + c->fdct = fdct_avr32; + + c->clear_blocks = clear_blocks_avr32; + +#undef dspfunc +#define dspfunc(PFX, IDX, NUM) \ + c->PFX ## _pixels_tab[IDX][0] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _avr32 ); \ + c->PFX ## _pixels_tab[IDX][1] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _h_avr32); \ + c->PFX ## _pixels_tab[IDX][2] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _v_avr32); \ + c->PFX ## _pixels_tab[IDX][3] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _hv_avr32) + + dspfunc(put, 0, 16); + dspfunc(put_no_rnd, 0, 16); + 
dspfunc(put, 1, 8); + dspfunc(put_no_rnd, 1, 8); + + dspfunc(avg, 1, 8); + dspfunc(avg_no_rnd, 1, 8); + dspfunc(avg, 0, 16); + dspfunc(avg_no_rnd, 0, 16); +#undef dspfunc + +} + + + +#if 0 +int main(int argc, char *argv[]){ + + +} +#endif + diff --git a/libavcodec/avr32/fdct.S b/libavcodec/avr32/fdct.S new file mode 100644 index 0000000..be45b86 --- /dev/null +++ b/libavcodec/avr32/fdct.S @@ -0,0 +1,541 @@ +/* + * Copyright (c) 2007 Atmel Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * 3. The name of ATMEL may not be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
+ */ + +//********************************************************** +//* 2-D fDCT, Based on: * +//* C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical * +//* Fast 1-D DCT Algorithms with 11 Multiplications", * +//* Proc. Int'l. Conf. on Acoustics, Speech, and Signal * +//* Processing 1989 (ICASSP '89), pp. 988-991. * +//* * +//* Fixed point implementation optimized for the AVR-II * +//* instruction set. If a table is used for the * +//* coefficients we can load two of them at a time from * +//* the table, reducing the number of load instructions. * +//* * +//* * +//********************************************************** + + +/* This routine is a slow-but-accurate integer implementation of the + * forward DCT (Discrete Cosine Transform). Taken from the IJG software. + * + * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT + * on each column. Direct algorithms are also available, but they are + * much more complex and seem not to be any faster when reduced to code. + * + * This implementation is based on an algorithm described in + * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT + * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics, + * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991. + * The primary algorithm described there uses 11 multiplies and 29 adds. + * We use their alternate method with 12 multiplies and 32 adds. + * The advantage of this method is that no data path contains more than one + * multiplication; this allows a very simple and accurate implementation in + * scaled fixed-point arithmetic, with a minimal number of shifts. + * + * The poop on this scaling stuff is as follows: + * + * Each 1-D DCT step produces outputs which are a factor of sqrt(N) + * larger than the true DCT outputs. The final outputs are therefore + * a factor of N larger than desired; since N=8 this can be cured by + * a simple right shift at the end of the algorithm.
The advantage of + * this arrangement is that we save two multiplications per 1-D DCT, + * because the y0 and y4 outputs need not be divided by sqrt(N). + * In the IJG code, this factor of 8 is removed by the quantization step + * (in jcdctmgr.c), here it is removed. + * + * We have to do addition and subtraction of the integer inputs, which + * is no problem, and multiplication by fractional constants, which is + * a problem to do in integer arithmetic. We multiply all the constants + * by CONST_SCALE and convert them to integer constants (thus retaining + * CONST_BITS bits of precision in the constants). After doing a + * multiplication we have to divide the product by CONST_SCALE, with proper + * rounding, to produce the correct output. This division can be done + * cheaply as a right shift of CONST_BITS bits. We postpone shifting + * as long as possible so that partial sums can be added together with + * full fractional precision. + * + * The outputs of the first pass are scaled up by PASS1_BITS bits so that + * they are represented to better-than-integral precision. These outputs + * require 8 + PASS1_BITS + 3 bits; this fits in a 16-bit word + * with the recommended scaling. (For 12-bit sample data, the intermediate + * array is INT32 anyway.) + * + * To avoid overflow of the 32-bit intermediate results in pass 2, we must + * have 8 + CONST_BITS + PASS1_BITS <= 26. Error analysis + * shows that the values given below are the most effective. + * + * We can gain a little more speed, with a further compromise in accuracy, + * by omitting the addition in a descaling shift. This yields an incorrectly + * rounded result half the time... 
+ */ + + .global fdct_avr32 + + + +#define CONST_BITS 13 +#define PASS1_BITS 2 + +#define FIX_0_298631336 2446 /* FIX(0.298631336) */ +#define FIX_0_390180644 3196 /* FIX(0.390180644) */ +#define FIX_0_541196100 4433 /* FIX(0.541196100) */ +#define FIX_0_765366865 6270 /* FIX(0.765366865) */ +#define FIX_0_899976223 7373 /* FIX(0.899976223) */ +#define FIX_1_175875602 9633 /* FIX(1.175875602) */ +#define FIX_1_501321110 12299 /* FIX(1.501321110) */ +#define FIX_1_847759065 15137 /* FIX(1.847759065) */ +#define FIX_1_961570560 16069 /* FIX(1.961570560) */ +#define FIX_2_053119869 16819 /* FIX(2.053119869) */ +#define FIX_2_562915447 20995 /* FIX(2.562915447) */ +#define FIX_3_072711026 25172 /* FIX(3.072711026) */ + + +/* + * Perform an integer forward DCT on one block of samples. + */ + +//void +//fdct_int32(short *const block) +//{ +// int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; +// int tmp10, tmp11, tmp12, tmp13; +// int z1, z2, z3, z4, z5; +// short *blkptr; +// int *dataptr; +// int data[64]; +// int i; +// +// /* Pass 1: process rows. */ +// /* Note results are scaled up by sqrt(8) compared to a true DCT; */ +// /* furthermore, we scale the results by 2**PASS1_BITS. 
*/ +// +// dataptr = data; +// blkptr = block; + + .text +fdct_avr32: /* 8x8 forward DCT done in place on the 16-bit block at blkptr (r12): a row pass (ROW_LOOP) followed by a column pass (COLOUMN_LOOP) */ + pushm r0-r3, r4-r7, lr /* undone by the popm at the end of the routine */ +#define loop_ctr r0 +#define blkptr r12 +#define x0 r1 +#define x1 r2 +#define x2 r3 +#define x3 r4 +#define x4 r5 +#define x5 r6 +#define x6 r7 +#define x7 r8 +#define tmp0 r5 +#define tmp7 r2 +#define tmp1 r3 +#define tmp6 r4 +#define tmp2 r9 +#define tmp5 r8 +#define tmp3 r7 +#define tmp4 r6 +/* NOTE: of the register aliases defined in this routine only loop_ctr and blkptr are used by instructions; the xN, tmpN and zN names appear in comments only. */ + + mov loop_ctr, 8 +// for (i = 0; i < 8; i++) { +ROW_LOOP: + + ldm blkptr, r1, r2, r3, r4 /* one row: four words, i.e. eight 16-bit values */ + +// tmp2 = blkptr[2] + blkptr[5]; +// tmp3 = blkptr[3] + blkptr[4]; + paddx.h r5, r3, r2 +// tmp5 = blkptr[2] - blkptr[5]; +// tmp4 = blkptr[3] - blkptr[4]; + psubx.h r6, r3, r2 +// tmp0 = blkptr[0] + blkptr[7]; +// tmp1 = blkptr[1] + blkptr[6]; + paddx.h r2, r4, r1 +// tmp7 = blkptr[0] - blkptr[7]; +// tmp6 = blkptr[1] - blkptr[6]; + psubx.h r3, r4, r1 + +// /* Even part per LL&M figure 1 --- note that published figure is faulty; +// * rotator "sqrt(2)*c1" should be "sqrt(2)*c6". +// */ + +#define tmp10 r1 +#define tmp13 r5 +#define tmp11 r7 +#define tmp12 r3 +#define z1 r9 + +// tmp10 = tmp0 + tmp3; +// tmp13 = tmp0 - tmp3; + paddsub.h r1, r2:t, r5:b +// tmp11 = tmp1 + tmp2; +// tmp12 = tmp1 - tmp2; + paddsub.h r4, r2:b, r5:t + + +// dataptr[0] = (tmp10 + tmp11) << PASS1_BITS; +// dataptr[4] = (tmp10 - tmp11) << PASS1_BITS; + paddsub.h r7, r1:t, r4:t + ld.w r10, pc[const_table - .] + plsl.h r7, r7, PASS1_BITS + +// z1 = (tmp12 + tmp13) * FIX_0_541196100; + addhh.w r8, r4:b, r1:b + mulhh.w r8, r8:b, r10:t + +// dataptr[2] = +// DESCALE(z1 + tmp13 * FIX_0_765366865, CONST_BITS - PASS1_BITS); +// dataptr[6] = +// DESCALE(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS - PASS1_BITS); + mulhh.w r9, r1:b, r10:b + ld.w r10, pc[const_table - . + 4] + add r1, r8, r9 + satrnds r1 >> (CONST_BITS - PASS1_BITS), 31 + + mulhh.w r9, r4:b, r10:t + add r4, r8, r9 + satrnds r4 >> (CONST_BITS - PASS1_BITS), 31 + + +// /* Odd part per figure 8 --- note paper omits factor of sqrt(2). 
+// * cK represents cos(K*pi/16). +// * i0..i3 in the paper are tmp4..tmp7 here. +// */ + +#define z2 r5 +#define z3 r6 +#define z4 r7 +#define z5 r8 + +// z4 = tmp5 + tmp7; +// z3 = tmp4 + tmp6; + padd.h r2, r6, r3 +// z2 = tmp5 + tmp6; +// z1 = tmp4 + tmp7; + paddx.h r5, r6, r3 + + lddpc r9, pc[const_table - . + 8] +// z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */ + addhh.w r8, r2:t, r2:b + mulhh.w r8, r8:b, r10:b + lddpc r10, pc[const_table - . + 12] + + +// tmp4 *= FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */ + mulhh.w r11, r6:b, r9:t + +// tmp5 *= FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */ + mulhh.w r6, r6:t, r9:b + +// tmp6 *= FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */ + lddpc r9, pc[const_table - . + 20] + mulhh.w lr, r3:b, r10:t + +// tmp7 *= FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */ + mulhh.w r3, r3:t, r10:b + +// z3 *= -FIX_1_961570560; /* sqrt(2) * (-c3-c5) */ + mulhh.w r10, r2:b, r9:t + +// z4 *= -FIX_0_390180644; /* sqrt(2) * (c5-c3) */ + mulhh.w r2, r2:t, r9:b + lddpc r9, pc[const_table - . 
+ 16] +// z3 += z5; +// z4 += z5; + add r10, r8 + add r2, r8 + +// z1 *= -FIX_0_899976223; /* sqrt(2) * (c7-c3) */ + mulhh.w r8, r5:b, r9:t + +// z2 *= -FIX_2_562915447; /* sqrt(2) * (-c1-c3) */ + mulhh.w r5, r5:t, r9:b + +// dataptr[7] = DESCALE(tmp4 + z1 + z3, CONST_BITS - PASS1_BITS); + add r11, r8 + add r11, r10 + satrnds r11 >> (CONST_BITS - PASS1_BITS), 31 + +// dataptr[5] = DESCALE(tmp5 + z2 + z4, CONST_BITS - PASS1_BITS); + add r6, r5 + + sthh.w blkptr[6*2], r4:b, r11:b + add r6, r2 + satrnds r6 >> (CONST_BITS - PASS1_BITS), 31 + +// dataptr[3] = DESCALE(tmp6 + z2 + z3, CONST_BITS - PASS1_BITS); + add lr, r5 + sthh.w blkptr[4*2], r7:b, r6:b + add lr, r10 + satrnds lr >> (CONST_BITS - PASS1_BITS), 31 + +// dataptr[1] = DESCALE(tmp7 + z1 + z4, CONST_BITS - PASS1_BITS); + add r3, r8 + sthh.w blkptr[2*2], r1:b, lr:b + add r3, r2 + satrnds r3 >> (CONST_BITS - PASS1_BITS), 31 + + + +// dataptr += 8; /* advance pointer to next row */ +// blkptr += 8; + sthh.w blkptr[0], r7:t, r3:b + sub blkptr, -16 + sub loop_ctr, 1 + brne ROW_LOOP + +// } + + /* Pass 2: process columns. + * We remove the PASS1_BITS scaling, but leave the results scaled up + * by an overall factor of 8. NOTE: every shift in this pass adds 3, so that factor of 8 is in fact removed here as well. 
+ */ + +// dataptr = data; + sub blkptr, 128 /* rewind the 8 * 16 bytes the row pass advanced */ + + mov loop_ctr, 4 +// for (i = 0; i < 8; i++) { -- here: 4 iterations, each handling two adjacent columns packed in one word +COLOUMN_LOOP: + ld.w r1, blkptr[0] + ld.w r2, blkptr[1*8*2] + ld.w r3, blkptr[2*8*2] + ld.w r4, blkptr[3*8*2] + ld.w r5, blkptr[4*8*2] + ld.w r6, blkptr[5*8*2] + ld.w r7, blkptr[6*8*2] + ld.w r8, blkptr[7*8*2] + +// tmp0 = blkptr[0] + blkptr[7*8]; + padds.sh r9, r1, r8 +// tmp7 = blkptr[0] - blkptr[7*8]; + psubs.sh r1, r1, r8 +// tmp1 = blkptr[1*8] + blkptr[6*8]; + padds.sh r8, r2, r7 +// tmp6 = blkptr[1*8] - blkptr[6*8]; + psubs.sh r2, r2, r7 +// tmp2 = blkptr[2*8] + blkptr[5*8]; + padds.sh r7, r3, r6 +// tmp5 = blkptr[2*8] - blkptr[5*8]; + psubs.sh r3, r3, r6 +// tmp3 = blkptr[3*8] + blkptr[4*8]; + padds.sh r6, r4, r5 +// tmp4 = blkptr[3*8] - blkptr[4*8]; + psubs.sh r4, r4, r5 + +// /* even part per ll&m figure 1 --- note that published figure is faulty; +// * rotator "sqrt(2)*c1" should be "sqrt(2)*c6". +// */ +// +// tmp10 = tmp0 + tmp3; + padds.sh r5, r9, r6 +// tmp13 = tmp0 - tmp3; + psubs.sh r9, r9, r6 +// tmp11 = tmp1 + tmp2; + padds.sh r6, r8, r7 +// tmp12 = tmp1 - tmp2; + psubs.sh r8, r8, r7 + +// dataptr[0] = DESCALE(tmp10 + tmp11, PASS1_BITS); +// dataptr[32] = DESCALE(tmp10 - tmp11, PASS1_BITS); +//Might get an overflow here + padds.sh r7, r5, r6 + psubs.sh r5, r5, r6 + + //Rounding: 1 << (PASS1_BITS + 2) is added to both halfwords before the >> (PASS1_BITS + 3) below + mov lr, (1 << (PASS1_BITS + 2)) + orh lr, hi(1 << (16 + PASS1_BITS + 2)) + padds.sh r7, r7, lr + padds.sh r5, r5, lr + + pasr.h r7, r7, PASS1_BITS + 3 + pasr.h r5, r5, PASS1_BITS + 3 + st.w r12[0], r7 + st.w r12[4*8*2], r5 + + lddpc r10, const_table2 + + +// z1 = (tmp12 + tmp13) * FIX_0_541196100; + padds.sh r5, r8, r9 + mulhh.w r6, r5:t, r10:t + mulhh.w r7, r5:b, r10:t + +// dataptr[16] = +// DESCALE(z1 + tmp13 * FIX_0_765366865, CONST_BITS + PASS1_BITS); + lddpc r11, const_table2 + 4 + mulhh.w lr, r9:t, r10:b + mulhh.w r9, r9:b, r10:b + add lr, r6 + add r9, r7 + satrnds lr >> (CONST_BITS + PASS1_BITS + 3), 31 + satrnds r9 >> (CONST_BITS + PASS1_BITS + 3), 31 + sthh.w 
r12[2*8*2], lr:b, r9:b + +// dataptr[48] = +// DESCALE(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS + PASS1_BITS); + mulhh.w lr, r8:t, r11:t + mulhh.w r8, r8:b, r11:t + add lr, r6 + add r8, r7 + satrnds lr >> (CONST_BITS + PASS1_BITS + 3), 31 + satrnds r8 >> (CONST_BITS + PASS1_BITS + 3), 31 + sthh.w r12[6*8*2], lr:b, r8:b + +// /* Odd part per figure 8 --- note paper omits factor of sqrt(2). +// * cK represents cos(K*pi/16). +// * i0..i3 in the paper are tmp4..tmp7 here. +// */ +// +// z2 = tmp5 + tmp6; +// z3 = tmp4 + tmp6; +// z4 = tmp5 + tmp7; + padds.sh r5, r3, r2 + padds.sh r6, r4, r2 + padds.sh r7, r3, r1 + +// z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */ + padds.sh r8, r6, r7 + mulhh.w r9, r8:t, r11:b + mulhh.w r8, r8:b, r11:b + +// z3 *= -FIX_1_961570560; /* sqrt(2) * (-c3-c5) */ +// z3 += z5; + lddpc r11, const_table2 + 8 + mulhh.w r10, r6:t, r11:t + mulhh.w r6, r6:b, r11:t + add r10, r9 + add r6, r8 + +// z4 *= -FIX_0_390180644; /* sqrt(2) * (c5-c3) */ +// z4 += z5; + mulhh.w lr, r7:t, r11:b + mulhh.w r7, r7:b, r11:b + lddpc r11, const_table2 + 12 + st.w --sp,r0 /* spill loop_ctr: r0 is used as a temporary from here on */ + add lr, r9 + add r7, r8 + +// tmp6 *= FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */ + mulhh.w r0, r2:t, r11:t + machh.w r0, r5:t, r11:b + mulhh.w r2, r2:b, r11:t + machh.w r2, r5:b, r11:b + +// z2 *= -FIX_2_562915447; /* sqrt(2) * (-c1-c3) */ +// dataptr[24] = DESCALE(tmp6 + z2 + z3, CONST_BITS + PASS1_BITS); + add r0, r10 + lddpc r11, const_table2 + 16 + add r2, r6 + satrnds r0 >> (CONST_BITS + PASS1_BITS + 3), 31 + satrnds r2 >> (CONST_BITS + PASS1_BITS + 3), 31 + sthh.w r12[3*8*2], r0:b, r2:b +// tmp5 *= FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */ + mulhh.w r0, r3:t, r11:t + machh.w r0, r5:t, r11:b + mulhh.w r2, r3:b, r11:t + machh.w r2, r5:b, r11:b + add r0, lr + lddpc r11, const_table2 + 20 + add r2, r7 + +// dataptr[40] = DESCALE(tmp5 + z2 + z4, CONST_BITS + PASS1_BITS); + satrnds r0 >> (CONST_BITS + PASS1_BITS + 3), 31 + satrnds r2 >> (CONST_BITS + PASS1_BITS + 3), 31 + 
sthh.w r12[5*8*2], r0:b, r2:b + + +// z1 = tmp4 + tmp7; + padds.sh r2, r4, r1 + +// tmp4 *= FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */ + mulhh.w r3, r4:t, r11:t + machh.w r3, r2:t, r11:b + mulhh.w r4, r4:b, r11:t + machh.w r4, r2:b, r11:b + add r3, r10 + lddpc r11, const_table2 + 24 + add r4, r6 + +// z1 *= -FIX_0_899976223; /* sqrt(2) * (c7-c3) */ +// dataptr[56] = DESCALE(tmp4 + z1 + z3, CONST_BITS + PASS1_BITS); + satrnds r3 >> (CONST_BITS + PASS1_BITS + 3), 31 + satrnds r4 >> (CONST_BITS + PASS1_BITS + 3), 31 + sthh.w r12[7*8*2], r3:b, r4:b + + +// tmp7 *= FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */ + mulhh.w r3, r1:t, r11:t + machh.w r3, r2:t, r11:b + mulhh.w r4, r1:b, r11:t + machh.w r4, r2:b, r11:b + add r3, lr + add r4, r7 + +// dataptr[8] = DESCALE(tmp7 + z1 + z4, CONST_BITS + PASS1_BITS); + satrnds r3 >> (CONST_BITS + PASS1_BITS + 3), 31 + satrnds r4 >> (CONST_BITS + PASS1_BITS + 3), 31 + sthh.w r12[1*8*2], r3:b, r4:b + ld.w r0, sp++ /* restore loop_ctr */ + +// dataptr++; /* advance pointer to next column */ + sub blkptr, -4 /* one word forward, i.e. two columns */ + sub loop_ctr, 1 + brne COLOUMN_LOOP + +// } + + popm r0-r3, r4-r7, pc + +// /* descale */ -- no separate loop here: the "+ 3" in the column-pass shifts above already applies it +// for (i = 0; i < 64; i++) +// block[i] = (short int) DESCALE(data[i], 3); + + +//} + +/* const_table feeds the row pass, const_table2 the column pass; both are loaded a word at a time and used as pairs of 16-bit constants. */ + + .align 2 +const_table: .short FIX_0_541196100, FIX_0_765366865, -FIX_1_847759065, FIX_1_175875602 + .short FIX_0_298631336, FIX_2_053119869, FIX_3_072711026, FIX_1_501321110 + .short -FIX_0_899976223,-FIX_2_562915447, -FIX_1_961570560, -FIX_0_390180644 + +const_table2: .short FIX_0_541196100, FIX_0_765366865, -FIX_1_847759065, FIX_1_175875602 + .short -FIX_1_961570560, -FIX_0_390180644, FIX_3_072711026, -FIX_2_562915447 + .short FIX_2_053119869, -FIX_2_562915447, FIX_0_298631336, -FIX_0_899976223 + .short FIX_1_501321110, -FIX_0_899976223 + + + + diff --git a/libavcodec/avr32/h264idct.S b/libavcodec/avr32/h264idct.S new file mode 100644 index 0000000..4b23e2d --- /dev/null +++ b/libavcodec/avr32/h264idct.S @@ -0,0 +1,451 @@ +/* + * Copyright (c) 2007 Atmel 
Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * 3. The name of ATMEL may not be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + + .global h264_idct_add_avr32 + + /* Macro for performing the 1-D transform on one row line. + + The register 'w01' should contain the first two pixels, + and the register 'w23' should contain the last two pixels + in the line. The resulting line is placed back in w01 and w23 + so that { w01, w23 } = { x0, x1, x3, x2 }. + 'tmp' and 'tmp2' should be scratchpad registers. 
*/ + .macro transform_row w01, w23, tmp, tmp2 + add \tmp, \w23, \w01 << 1 /* tmp = { xxxx, 2*w1 + w3 } */ + sub \tmp2, \w01, \w23 << 1 /* tmp2 = { xxxx, w1 - 2*w3 } */ + bfins \tmp2, \tmp, 16, 16 /* tmp2 = { 2*w1 + w3, w1 - 2*w3 } */ + pasr.h \tmp2, \tmp2, 1 /* tmp2 = { w1 + w3/2, w1/2 - w3 } */ + paddsub.h \tmp, \w01:t, \w23:t /* tmp = { w0 + w2, w0 - w2 } */ + padd.h \w01, \tmp, \tmp2 /* w01 = { w0 + w2 + w1 + w3/2, w0 - w2 + w1/2 - w3 } */ + psub.h \w23, \tmp, \tmp2 /* w23 = { w0 + w2 - w1 - w3/2, w0 - w2 - w1/2 + w3 } */ + .endm + + /* Macro for performing the 1-D transform on two columns. + + The registers w0, w1, w2, w3 should each contain two + packed samples from the two columns to transform. + tmp and tmp2 are scratchpad registers. + + The resulting transformed columns are placed in the + same positions as the input columns. + */ + .macro transform_2columns w0, w1, w2, w3, tmp, tmp2 + padd.h \tmp, \w0, \w2 /* tmp = z0 = w0 + w2 */ + psub.h \w0, \w0, \w2 /* w0 = z1 = w0 - w2 */ + pasr.h \w2, \w1, 1 /* w2 = w1/2 */ + pasr.h \tmp2, \w3, 1 /* tmp2 = w3/2 */ + psub.h \w3, \w2, \w3 /* w3 = z2 = w1/2 - w3 */ + padd.h \tmp2, \w1, \tmp2/* tmp2 = z3 = w1 + w3/2 */ + padd.h \w1, \w0, \w3 /* w1 = x1 = z1 + z2 */ + psub.h \w2, \w0, \w3 /* w2 = x2 = z1 - z2 */ + padd.h \w0, \tmp, \tmp2/* w0 = x0 = z0 + z3 */ + psub.h \w3, \tmp, \tmp2/* w3 = x3 = z0 - z3 */ + /* Scale down result. */ + pasr.h \w0, \w0, 6 + pasr.h \w1, \w1, 6 + pasr.h \w2, \w2, 6 + pasr.h \w3, \w3, 6 + .endm + +/*void h264_idct_add_avr32(uint8_t *dst, DCTELEM *block, int stride) -- registers as used below: r12 = dst, r11 = block, r10 = stride. Handles a 4x4 block: four rows of four 16-bit coefficients. */ + +h264_idct_add_avr32: + + stm --sp,r0-r3,r4-r7, lr + + /* Setup rounding factor: 32 placed in the top halfword of r0. */ + mov r0, (1 << 5) + lsl r0, 16 + + /* Load block */ + ldm r11,r2-r9 + /* r9 = { w00, w01 }, + r8 = { w02, w03 }, + r7 = { w10, w11 }, + r6 = { w12, w13 }, + r5 = { w20, w21 }, + r4 = { w22, w23 }, + r3 = { w30, w31 }, + r2 = { w32, w33 } */ + + + /* Add the rounding factor to w00. 
*/ + add r9, r0 + + /* Transform rows */ + transform_row r9, r8, r0, r1 + transform_row r7, r6, r0, r1 + transform_row r5, r4, r0, r1 + transform_row r3, r2, r0, r1 + + /* Transform columns */ + transform_2columns r9, r7, r5, r3, r0, r1 + transform_2columns r8, r6, r4, r2, r0, r1 + + /* Load predicted pixels.*/ + ld.w lr, r12[0] + ld.w r11, r12[r10] + + /* Unpack to halfwords. */ + punpckub.h r0, lr:t + punpckub.h r1, lr:b + + /* Add with transformed row. */ + padd.h r0, r0, r9 + paddx.h r1, r1, r8 + /* Pack and saturate back to 8-bit pixels. */ + packsh.ub r0, r0, r1 + + /* Unpack to halfwords. */ + punpckub.h lr, r11:t + punpckub.h r11, r11:b + + /* Add with transformed row. */ + padd.h lr, lr, r7 + paddx.h r11, r11, r6 + /* Pack and saturate back to 8-bit pixels. */ + packsh.ub r1, lr, r11 + + /* Store back to frame. */ + st.w r12[0], r0 + st.w r12[r10], r1 + + add r12, r12, r10 << 1 /* dst += 2 * stride */ + + /* Load predicted pixels.*/ + ld.w lr, r12[0] + ld.w r11, r12[r10] + + /* Unpack to halfwords. */ + punpckub.h r0, lr:t + punpckub.h r1, lr:b + + /* Add with transformed row. */ + padd.h r0, r0, r5 + paddx.h r1, r1, r4 + /* Pack and saturate back to 8-bit pixels. */ + packsh.ub r0, r0, r1 + + /* Unpack to halfwords. */ + punpckub.h lr, r11:t + punpckub.h r11, r11:b + + /* Add with transformed row. */ + padd.h lr, lr, r3 + paddx.h r11, r11, r2 + /* Pack and saturate back to 8-bit pixels. */ + packsh.ub r1, lr, r11 + + /* Store back to frame. 
*/ + st.w r12[0], r0 + st.w r12[r10], r1 + + ldm sp++,r0-r3,r4-r7, pc + + + .global h264_idct8_add_avr32 +//void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride){ + +h264_idct8_add_avr32: + stm --sp,r0-r3,r4-r7, lr + + /* Push dst (r12) and stride (r10) on stack; r10 is reused as a temporary in the first loop */ + stm --sp,r10,r12 + +// int i; +// DCTELEM (*src)[8] = (DCTELEM(*)[8])block; +// uint8_t *cm = cropTbl + MAX_NEG_CROP; + +// block[0] += 32; -- NOTE(review): no explicit add is visible below; presumably the rounding of the final satrnds >> 6 stands in for it -- confirm + + +// for( i = 0; i < 8; i++ ) +// { + mov lr, 4 /* 4 passes: each ld.w below fetches two 16-bit columns at once */ +0: + ld.w r7, r11[0*(8*2)] + ld.w r6, r11[1*(8*2)] + ld.w r5, r11[2*(8*2)] + ld.w r4, r11[3*(8*2)] + ld.w r3, r11[4*(8*2)] + ld.w r2, r11[5*(8*2)] + ld.w r1, r11[6*(8*2)] + ld.w r0, r11[7*(8*2)] + +/* + + const int a0 = src[0][i] + src[4][i]; + const int a2 = src[0][i] - src[4][i]; + const int a4 = (src[2][i]>>1) - src[6][i]; + const int a6 = (src[6][i]>>1) + src[2][i]; +*/ + padd.h r8, r7, r3 /* r8 = a0 */ + psub.h r7, r7, r3 /* r7 = a2 */ + pasr.h r3, r5, 1 /* r3 = src[2][i] >> 1 */ + pasr.h r9, r1, 1 /* r9 = src[6][i] >> 1 */ + psub.h r3, r3, r1 /* r3 = a4 */ + padd.h r9, r9, r5 /* r9 = a6 */ + +/* + const int b0 = a0 + a6; + const int b2 = a2 + a4; + const int b4 = a2 - a4; + const int b6 = a0 - a6; +*/ + padd.h r1, r8, r9 /* r1 = b0 */ + psub.h r8, r8, r9 /* r8 = b6 */ + padd.h r5, r7, r3 /* r5 = b2 */ + psub.h r7, r7, r3 /* r7 = b4 */ + +/* + const int a1 = -src[3][i] + src[5][i] - src[7][i] - (src[7][i]>>1); + const int a3 = src[1][i] + src[7][i] - src[3][i] - (src[3][i]>>1); + const int a5 = -src[1][i] + src[7][i] + src[5][i] + (src[5][i]>>1); + const int a7 = src[3][i] + src[5][i] + src[1][i] + (src[1][i]>>1); +*/ + pasr.h r3, r0, 1 + padd.h r3, r3, r0 + psub.h r3, r2, r3 + psub.h r3, r3, r4 /* r3 = a1 */ + + pasr.h r9, r4, 1 + padd.h r9, r9, r4 + psub.h r9, r0, r9 + padd.h r9, r6, r9 /* r9 = a3 */ + + pasr.h r10, r2, 1 + padd.h r10, r10, r2 + padd.h r10, r10, r0 + psub.h r10, r10, r6 /* r10 = a5 */ + + pasr.h r0, r6, 1 + padd.h r0, r0, r6 + padd.h r0, r0, r2 + padd.h r0, r0, r4 /* r0 = a7 */ +/* + const 
int b1 = (a7>>2) + a1; + const int b3 = a3 + (a5>>2); + const int b5 = (a3>>2) - a5; + const int b7 = a7 - (a1>>2); +*/ + pasr.h r2, r0, 2 + padd.h r2, r2, r3 /* r2 = b1 */ + pasr.h r3, r3, 2 + psub.h r3, r0, r3 /* r3 = b7 */ + + pasr.h r0, r10, 2 + padd.h r0, r0, r9 /* r0 = b3 */ + pasr.h r9, r9, 2 + psub.h r9, r9, r10 /* r9 = b5 */ + + +/* + src[0][i] = b0 + b7; + src[7][i] = b0 - b7; + src[1][i] = b2 + b5; + src[6][i] = b2 - b5; + src[2][i] = b4 + b3; + src[5][i] = b4 - b3; + src[3][i] = b6 + b1; + src[4][i] = b6 - b1; */ + + padd.h r4, r1, r3 + psub.h r1, r1, r3 + st.w r11[0*(8*2)], r4 + st.w r11[7*(8*2)], r1 + + padd.h r3, r5, r9 + psub.h r5, r5, r9 + st.w r11[1*(8*2)], r3 + st.w r11[6*(8*2)], r5 + + padd.h r9, r7, r0 + psub.h r7, r7, r0 + st.w r11[2*(8*2)], r9 + st.w r11[5*(8*2)], r7 + + padd.h r0, r8, r2 + psub.h r8, r8, r2 + st.w r11[3*(8*2)], r0 + st.w r11[4*(8*2)], r8 + + sub r11, -4 + sub lr, 1 + brne 0b + +// } + + lddsp r12, sp[0] /* r12 = dst */ + sub r11, 4*4 /* undo the 4 * 4 bytes the column loop advanced r11 */ + ldm r11++, r4-r7 /* first row of eight coefficients, laid out as described below */ + mov lr, 8 + /* dst and stride were pushed at entry: dst is reloaded above, stride is reloaded inside the loop */ + +1: +// for( i = 0; i < 8; i++ ) +// { + + /* r7 = {src[i][0], src[i][1]} + r6 = {src[i][2], src[i][3]} + r5 = {src[i][4], src[i][5]} + r4 = {src[i][6], src[i][7]} */ + +/* + const int a0 = src[i][0] + src[i][4]; + const int a2 = src[i][0] - src[i][4]; + const int a4 = (src[i][2]>>1) - src[i][6]; + const int a6 = (src[i][6]>>1) + src[i][2]; +*/ + pasr.h r8, r6, 1 + pasr.h r9, r4, 1 + addhh.w r0, r7:t, r5:t /* r0 = a0 */ + subhh.w r1, r7:t, r5:t /* r1 = a2 */ + subhh.w r2, r8:t, r4:t /* r2 = a4 */ + addhh.w r3, r9:t, r6:t /* r3 = a6 */ + +/* + const int b0 = a0 + a6; + const int b2 = a2 + a4; + const int b4 = a2 - a4; + const int b6 = a0 - a6; +*/ + add r10, r0, r3 /* r10 = b0 */ + sub r0, r3 /* r0 = b6 */ + add r3, r1, r2 /* r3 = b2 */ + sub r1, r2 /* r1 = b4 */ +/* + + + const int a7 = src[i][5] + src[i][3] + src[i][1] + (src[i][1]>>1); + const int a1 = src[i][5] - src[i][3] - src[i][7] - (src[i][7]>>1); + const int a3 = 
src[i][7] + src[i][1] - src[i][3] - (src[i][3]>>1); + const int a5 = src[i][7] - src[i][1] + src[i][5] + (src[i][5]>>1); */ + addhh.w r8, r8:b, r6:b + addhh.w r2, r4:b, r7:b + sub r2, r8 /* r2 = a3 */ + + addhh.w r9, r9:b, r4:b + subhh.w r8, r5:b, r6:b + sub r8, r9 /* r8 = a1 */ + + pasr.h r9, r7, 1 + addhh.w r9, r9:b, r7:b + addhh.w r6, r5:b, r6:b + add r6, r9 /* r6 = a7 */ + + pasr.h r9, r5, 1 + addhh.w r9, r9:b, r5:b + subhh.w r5, r4:b, r7:b + add r5, r9 /* r5 = a5 */ + +/* const int b1 = (a7>>2) + a1; + const int b3 = (a5>>2) + a3; + const int b5 = (a3>>2) - a5; + const int b7 = -(a1>>2) + a7 ; */ + asr r4, r6, 2 + add r4, r8 /* r4 = b1 */ + asr r8, 2 + rsub r8, r6 /* r8 = b7 */ + + asr r6, r5, 2 + add r6, r2 /* r6 = b3 */ + asr r2, 2 + sub r2, r5 /* r2 = b5 */ + +/* + dst[i*stride + 0] = cm[ dst[i*stride + 0] + ((b0 + b7) >> 6) ]; + dst[i*stride + 1] = cm[ dst[i*stride + 1] + ((b2 + b5) >> 6) ]; + dst[i*stride + 2] = cm[ dst[i*stride + 2] + ((b4 + b3) >> 6) ]; + dst[i*stride + 3] = cm[ dst[i*stride + 3] + ((b6 + b1) >> 6) ]; + dst[i*stride + 4] = cm[ dst[i*stride + 4] + ((b6 - b1) >> 6) ]; + dst[i*stride + 5] = cm[ dst[i*stride + 5] + ((b4 - b3) >> 6) ]; + dst[i*stride + 6] = cm[ dst[i*stride + 6] + ((b2 - b5) >> 6) ]; + dst[i*stride + 7] = cm[ dst[i*stride + 7] + ((b0 - b7) >> 6) ]; +*/ + add r5, r10, r8 + satrnds r5 >> 6, 0 /* r5 = (b0 + b7) >> 6 */ + sub r10, r8 + satrnds r10 >> 6, 0 /* r10 = (b0 - b7) >> 6 */ + add r8, r3, r2 + satrnds r8 >> 6, 0 /* r8 = (b2 + b5) >> 6 */ + sub r3, r2 + satrnds r3 >> 6, 0 /* r3 = (b2 - b5) >> 6 */ + + add r2, r1, r6 + satrnds r2 >> 6, 0 /* r2 = (b4 + b3) >> 6 */ + sub r1, r6 + satrnds r1 >> 6, 0 /* r1 = (b4 - b3) >> 6 */ + + add r6, r0, r4 + satrnds r6 >> 6, 0 /* r6 = (b6 + b1) >> 6 */ + sub r0, r4 + satrnds r0 >> 6, 0 /* r0 = (b6 - b1) >> 6 */ + + ld.w r4, r12[0] + + packw.sh r8, r5, r8 + packw.sh r7, r2, r6 + ld.w r9, r12[4] + packw.sh r6, r0, r1 + packw.sh r5, r3, r10 + + punpckub.h r10, r4:t + punpckub.h r4, r4:b + 
punpckub.h r3, r9:t + punpckub.h r9, r9:b + + padd.h r8, r8, r10 + padd.h r7, r7, r4 + padd.h r6, r6, r3 + padd.h r5, r5, r9 + + lddsp r10, sp[4] /* r10 = stride */ + packsh.ub r0, r8, r7 + packsh.ub r1, r6, r5 + + st.w r12[0], r0 + st.w r12[4], r1 + + ldm r11++, r4-r7 /* preload the next row; NOTE(review): on the 8th pass this reads the 16 bytes following the 8x8 block */ + add r12, r10 /* dst += stride */ + + sub lr, 1 + brne 1b + + sub sp, -8 /* discard the dst/stride slots pushed at entry */ + ldm sp++,r0-r3,r4-r7, pc + + + +// } +//} diff --git a/libavcodec/avr32/idct.S b/libavcodec/avr32/idct.S new file mode 100644 index 0000000..e7551ec --- /dev/null +++ b/libavcodec/avr32/idct.S @@ -0,0 +1,829 @@ +/* + * Copyright (c) 2007 Atmel Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * 3. The name of ATMEL may not be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. 
IN NO EVENT SHALL ATMEL + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + + .global idct_add_avr32 + .global idct_put_avr32 + .global idct_avr32 + + +#define CONST_BITS 13 +#define PASS1_BITS 2 + +#define ONE ((INT32) 1) + +#define CONST_SCALE (ONE << CONST_BITS) + +#define LINE_SIZE 32 + +#define FIX_0_298631336 (2446) /* FIX(0.298631336) */ +#define FIX_0_390180644 (3196) /* FIX(0.390180644) */ +#define FIX_0_541196100 (4433) /* FIX(0.541196100) */ +#define FIX_0_765366865 (6270) /* FIX(0.765366865) */ +#define FIX_0_899976223 (7373) /* FIX(0.899976223) */ +#define FIX_1_175875602 (9633) /* FIX(1.175875602) */ +#define FIX_1_501321110 (12299)/* FIX(1.501321110) */ +#define FIX_1_847759065 (15137)/* FIX(1.847759065) */ +#define FIX_1_961570560 (16069)/* FIX(1.961570560) */ +#define FIX_2_053119869 (16819)/* FIX(2.053119869) */ +#define FIX_2_562915447 (20995)/* FIX(2.562915447) */ +#define FIX_3_072711026 (25172)/* FIX(3.072711026) */ + + +#define loop_cnt r11 + + .text +/* idct_add_avr32: 8x8 inverse DCT of the 16-bit block at r10; each result line of 8 values is added to the 8 pixels at rfp (r12) and stored back, then iinc (r11) is added to rfp. */ +idct_add_avr32: + pushm r0-r3, r4-r7, lr //Free up registers to use for local variables + + // Give room for some variables on the stack + sub sp, 8 + stdsp SP[0], r12 // rfp + stdsp SP[4], r11 // iinc (saved because r11 doubles as loop_cnt) + + mov loop_cnt, 8 //Initialize loop counter + +FOR_ROW: + + ldm r10, r0, r1, r2, r3 //Load 8 DCT-coefficients from the current row in the DCT-block + mov r6, 0 +#ifdef USE_PREFETCH + pref r10[LINE_SIZE] //Prefetch next line +#endif + or r4, r2, r3 << 16 + or r4, r1 //Check if all DCT-coefficients except the DC are zero + or r4, r0 + brne AC_ROW //If there 
are non-zero AC coefficients perform row-transform + + paddsub.h r5, r3:t, r6:b //Copy the DC-coeff (r3:t) into both halves of r5 (r6 is zero) + plsl.h r5, r5, PASS1_BITS + mov r4, r5 + st.d r10++, r4 + st.d r10++, r4 + + sub loop_cnt, 1 //Decrement loop counter + brne FOR_ROW //Perform loop one more time if loop_cnt is not zero + + bral COLOUMN_TRANSFORM //Perform column transform after row transform is computed + + +AC_ROW: + + + ld.w r12, pc[coef_table - .] + ld.w r9, pc[coef_table - . + 4] + + padd.h r4, r2, r0 // r4:t = dataptr[2] + dataptr[6],r4:b = dataptr[3] + dataptr[7] + mulhh.w r5, r4:t, r12:t + mulhh.w r6, r0:t, r12:b + ld.w r12, pc[coef_table - . + 8] + mulhh.w r7, r2:t, r9:t + add r6, r5 // tmp2 + satrnds r6 >> (CONST_BITS - PASS1_BITS), 31 + add r7, r5 // tmp3 + satrnds r7 >> (CONST_BITS - PASS1_BITS), 31 + + paddsub.h r5, r3:t, r1:t + plsl.h r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1 + + paddsub.h r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13 + paddsub.h r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12 + + + addhh.w lr, r3:b, r1:b // lr = z4 + addhh.w r5, r4:b, lr:b + mulhh.w r5, r5:b, r9:b // r5 = z5 + + ld.w r9, pc[coef_table - . + 12] + mulhh.w r4, r4:b, r12:t // r4 = z3 + mulhh.w lr, lr:b, r12:b // lr = z4 + + add r4, r5 + add lr, r5 + + addhh.w r5, r2:b, r1:b // r5 = z2 + addhh.w r8, r3:b, r0:b // r8 = z1 + + + mulhh.w r0, r0:b, r9:t // r0 = tmp0 + ld.w r12, pc[coef_table - . + 16] + mulhh.w r1, r1:b, r9:b // r1 = tmp1 + ld.w r9, pc[coef_table - . 
+ 20] + mulhh.w r2, r2:b, r12:t // r2 = tmp2 + mulhh.w r3, r3:b, r12:b // r3 = tmp3 + mulhh.w r8, r8:b, r9:t // r8 = z1 + mulhh.w r5, r5:b, r9:b // r5 = z2 + + + add r0, r8 + add r0, r4 + add r1, r5 + add r1, lr + add r2, r5 + add r2, r4 + add r3, r8 + add r3, lr + + satrnds r0 >> (CONST_BITS - PASS1_BITS), 31 + satrnds r1 >> (CONST_BITS - PASS1_BITS), 31 + satrnds r2 >> (CONST_BITS - PASS1_BITS), 31 + satrnds r3 >> (CONST_BITS - PASS1_BITS), 31 + + paddsub.h r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6] + paddsub.h r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7] + paddsub.h r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5] + paddsub.h r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4] + + sthh.w r10[0], r4:t, r5:t + sthh.w r10[4], r3:t, r2:t + sthh.w r10[8], r2:b, r3:b + sthh.w r10[12], r5:b, r4:b + + + + sub r10, -16 + sub loop_cnt, 1 + brne FOR_ROW, e + +COLOUMN_TRANSFORM: + + sub r10, 128 //Set pointer to start of DCT block + + + mov loop_cnt, 8 +FOR_COLOUMN: + ldins.h r3:t,r10[0] // r3:t = dataptr[0] + ldins.h r1:t,r10[1*8*2]// r1:t = dataptr[1] + ldins.h r2:t,r10[2*8*2]// r2:t = dataptr[2] + ldins.h r0:t,r10[5*8*2]// r0:t = dataptr[5] + ldins.h r3:b,r10[4*8*2]// r3:b = dataptr[4] + ldins.h r1:b,r10[3*8*2]// r1:b = dataptr[3] + ldins.h r2:b,r10[6*8*2]// r2:b = dataptr[6] + ldins.h r0:b,r10[7*8*2]// r0:b = dataptr[7] + + or r4, r1, r3 << 16 + or r4, r2 + or r4, r0 + brne AC_COLOUMN //If there are non-zero AC coefficients perform column-transform + + lddsp r12, SP[0] // rfp + lddsp r9, SP[4] // iinc + satrnds r3 >> ( PASS1_BITS + 3 + 16 ), 9 + ld.d r0, r12[0] + sub r10, -2 // Increment the dataptr + bfins r3, r3, 16, 16 + punpckub.h r2, r1:t + padd.h r2, r2, r3 + punpckub.h r1, r1:b + padd.h r1, r1, r3 + packsh.ub r1, r2, r1 + punpckub.h r2, r0:t + padd.h r2, r2, r3 + punpckub.h r0, r0:b + padd.h r0, r0, r3 + packsh.ub r0, r2, r0 + st.d r12[0], r0 + add r12, r9 // increment rfp + stdsp SP[0], r12 + + sub loop_cnt, 1//Decrement loop counter 
+ brne FOR_COLOUMN//Perform loop one more time if loop_cnt is not zero + + sub sp, -8 /* release the rfp/iinc stack slots */ + popm r0-r3, r4-r7, pc//Pop back registers and PC + +AC_COLOUMN: + + ld.w r12, pc[coef_table - .] + ld.w r9, pc[coef_table - . + 4] + + addhh.w r4, r2:t, r2:b + mulhh.w r4, r4:b, r12:t // r4 = z1 + mulhh.w r5, r2:b, r12:b + ld.w r12, pc[coef_table - . + 8] + mulhh.w r6, r2:t, r9:t + add r5, r4 // r5 = tmp2 + add r6, r4 // r6 = tmp3 + + addhh.w r7, r3:t, r3:b + subhh.w r8, r3:t, r3:b + + lsl r7, CONST_BITS + lsl r8, CONST_BITS + + add r2, r7, r6 // r2 = tmp10 + sub r3, r7, r6 // r3 = tmp13 + add r4, r8, r5 // r4 = tmp11 + sub r5, r8, r5 // r5 = tmp12 + + padd.h r6, r0, r1 // r6:t = z4, r6:b = z3 + addhh.w r7, r6:t, r6:b + mulhh.w r7, r7:b, r9:b // r7 = z5 + + ld.w r9, pc[coef_table - . + 12] + mulhh.w r8, r6:b, r12:t // r8 = z3 + mulhh.w r6, r6:t, r12:b // r6 = z4 + + add r8, r7 + add r6, r7 + + paddx.h r7, r0, r1 // r7:t = z2, r7:b = z1 + + mulhh.w r12, r0:b, r9:t // r12 = tmp0 + mulhh.w r0, r0:t, r9:b // r0 = tmp1 + ld.w r9, pc[coef_table - . + 16] + add r12, r8 + add r0, r6 + + ld.w lr, pc[coef_table - . 
+ 20] + machh.w r8, r1:b, r9:t // r8 = tmp2 + machh.w r6, r1:t, r9:b // r6 = tmp3 + mulhh.w r9, r7:b, lr:t // r9 = z1 + mulhh.w r7, r7:t, lr:b // r7 = z2 + + + add r12, r9 + add r0, r7 + add r8, r7 + add r6, r9 + + add r1, r2, r6 // r1 = dataptr[DCTSIZE*0] + sub r2, r2, r6 // r2 = dataptr[DCTSIZE*7] + add r6, r4, r8 // r6 = dataptr[DCTSIZE*1] + sub r4, r4, r8 // r4 = dataptr[DCTSIZE*6] + add r8, r5, r0 // r8 = dataptr[DCTSIZE*2] + sub r5, r5, r0 // r5 = dataptr[DCTSIZE*5] + add r0, r3, r12 // r0 = dataptr[DCTSIZE*3] + sub r3, r3, r12 // r3 = dataptr[DCTSIZE*4] + + satrnds r1 >> (CONST_BITS+PASS1_BITS+3), 9 + satrnds r2 >> (CONST_BITS+PASS1_BITS+3), 9 + satrnds r6 >> (CONST_BITS+PASS1_BITS+3), 9 + satrnds r4 >> (CONST_BITS+PASS1_BITS+3), 9 + satrnds r8 >> (CONST_BITS+PASS1_BITS+3), 9 + satrnds r5 >> (CONST_BITS+PASS1_BITS+3), 9 + satrnds r0 >> (CONST_BITS+PASS1_BITS+3), 9 + satrnds r3 >> (CONST_BITS+PASS1_BITS+3), 9 + + packw.sh r1, r1, r6 + packw.sh r8, r8, r0 + packw.sh r3, r3, r5 + packw.sh r4, r4, r2 + + lddsp r12, SP[0] // rfp + lddsp r9, SP[4] // iinc + ld.d r6, r12[0] + sub r10, -2 // Increment the dataptr + punpckub.h r0, r7:t + padd.h r1, r1, r0 + punpckub.h r0, r7:b + padd.h r8, r8, r0 + packsh.ub r7, r1, r8 + punpckub.h r0, r6:t + padd.h r3, r3, r0 + punpckub.h r0, r6:b + padd.h r4, r4, r0 + packsh.ub r6, r3, r4 + st.d r12[0], r6 + add r12, r9 // increment rfp + stdsp SP[0], r12 + + sub loop_cnt, 1 //Decrement loop counter + brne FOR_COLOUMN //Perform loop one more time if loop_cnt is not zero + + sub sp, -8 /* release the rfp/iinc stack slots */ + popm r0-r3, r4-r7, pc //Pop back registers and PC + + + +//Coefficient Table: + .align 2 +coef_table: + .short FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602 + .short - FIX_1_961570560, - FIX_0_390180644, FIX_0_298631336, FIX_2_053119869 + .short FIX_3_072711026, FIX_1_501321110, - FIX_0_899976223, - FIX_2_562915447 + + +idct_put_avr32: + pushm r0-r3, r4-r7, lr //Free up registers to use for local variables + + //; Give room for 
some variables on the stack + sub sp, 8 + stdsp SP[0], r12 // rfp + stdsp SP[4], r11 // iinc + + mov loop_cnt, 8 //Initialize loop counter + +0: + + ldm r10, r0, r1, r2, r3 //Load 8 DCT-coeffisients from the current row in the DCT-block + mov r6, 0 +#ifdef USE_PREFETCH + pref r10[LINE_SIZE] //Prefetch next line +#endif + or r4, r2, r3 << 16 + or r4, r1 //Check if all DCT-coeffisients except the DC is zero + or r4, r0 + brne 1f //If there are non-zero AC coeffisients perform row-transform + + paddsub.h r5, r3:t, r6:b //Extract the DC-coeff from r5 + plsl.h r5, r5, PASS1_BITS + mov r4, r5 + st.d r10++, r4 + st.d r10++, r4 + + sub loop_cnt, 1 //Decrement loop counter + brne 0b //Perform loop one more time if loop_cnt is not zero + + bral 2f //Perform coloumn transform after row transform is computed + +1: + + ld.w r12, pc[coef_table_copy - .] + ld.w r9, pc[coef_table_copy - . + 4] + + padd.h r4, r2, r0 // r4:t = dataptr[2] + dataptr[6],r4:b = dataptr[3] + dataptr[7] + mulhh.w r5, r4:t, r12:t + mulhh.w r6, r0:t, r12:b + ld.w r12, pc[coef_table_copy - . + 8] + mulhh.w r7, r2:t, r9:t + add r6, r5 // tmp2 + satrnds r6 >> (CONST_BITS - PASS1_BITS), 31 + add r7, r5 // tmp3 + satrnds r7 >> (CONST_BITS - PASS1_BITS), 31 + + paddsub.h r5, r3:t, r1:t + plsl.h r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1 + + paddsub.h r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13 + paddsub.h r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12 + + + + addhh.w lr, r3:b, r1:b // lr = z4 + addhh.w r5, r4:b, lr:b + mulhh.w r5, r5:b, r9:b // r5 = z5 + + ld.w r9, pc[coef_table_copy - . + 12] + mulhh.w r4, r4:b, r12:t // r4 = z3 + mulhh.w lr, lr:b, r12:b // lr = z4 + + add r4, r5 + add lr, r5 + + addhh.w r5, r2:b, r1:b // r5 = z2 + addhh.w r8, r3:b, r0:b // r8 = z1 + + + mulhh.w r0, r0:b, r9:t // r0 = tmp0 + ld.w r12, pc[coef_table_copy - . + 16] + mulhh.w r1, r1:b, r9:b // r1 = tmp1 + ld.w r9, pc[coef_table_copy - . 
+ 20] + mulhh.w r2, r2:b, r12:t // r2 = tmp2 + mulhh.w r3, r3:b, r12:b // r3 = tmp3 + mulhh.w r8, r8:b, r9:t // r8 = z1 + mulhh.w r5, r5:b, r9:b // r5 = z2 + + + add r0, r8 + add r0, r4 + add r1, r5 + add r1, lr + add r2, r5 + add r2, r4 + add r3, r8 + add r3, lr + + satrnds r0 >> (CONST_BITS - PASS1_BITS), 31 + satrnds r1 >> (CONST_BITS - PASS1_BITS), 31 + satrnds r2 >> (CONST_BITS - PASS1_BITS), 31 + satrnds r3 >> (CONST_BITS - PASS1_BITS), 31 + + paddsub.h r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6] + paddsub.h r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7] + paddsub.h r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5] + paddsub.h r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4] + + sthh.w r10[0], r4:t, r5:t + sthh.w r10[4], r3:t, r2:t + sthh.w r10[8], r2:b, r3:b + sthh.w r10[12], r5:b, r4:b + + + + sub r10, -16 + sub loop_cnt, 1 + brne 0b + +2: + + sub r10, 128 //Set pointer to start of DCT block + + mov loop_cnt, 8 + +0: + ldins.h r3:t,r10[0] // r3:t = dataptr[0] + ldins.h r1:t,r10[1*8*2]// r1:t = dataptr[1] + ldins.h r2:t,r10[2*8*2]// r2:t = dataptr[2] + ldins.h r0:t,r10[5*8*2]// r0:t = dataptr[5] + ldins.h r3:b,r10[4*8*2]// r3:b = dataptr[4] + ldins.h r1:b,r10[3*8*2]// r1:b = dataptr[3] + ldins.h r2:b,r10[6*8*2]// r2:b = dataptr[6] + ldins.h r0:b,r10[7*8*2]// r0:b = dataptr[7] + + or r4, r1, r3 << 16 + or r4, r2 + or r4, r0 + brne 1f //If there are non-zero AC coeffisients perform row-transform + + lddsp r12, SP[0] // rfp + lddsp r9, SP[4] // iinc + satrnds r3 >> ( PASS1_BITS + 3 + 16 ), 31 + packw.sh r3, r3, r3 + packsh.ub r3, r3, r3 + mov r2, r3 + st.d r12[0], r2 + add r12, r9 // increment rfp + sub r10, -2 // Increment the dataptr + stdsp SP[0], r12 + + sub loop_cnt, 1//Decrement loop counter + brne 0b //Perform loop one more time if loop_cnt is not zero + + sub sp, -8 + popm r0-r3, r4-r7, pc//Pop back registers and PC + +1: + + ld.w r12, pc[coef_table_copy - .] + ld.w r9, pc[coef_table_copy - . 
+ 4] + + addhh.w r4, r2:t, r2:b + mulhh.w r4, r4:b, r12:t // r4 = z1 + mulhh.w r5, r2:b, r12:b + ld.w r12, pc[coef_table_copy - . + 8] + mulhh.w r6, r2:t, r9:t + add r5, r4 // r5 = tmp2 + add r6, r4 // r6 = tmp3 + + addhh.w r7, r3:t, r3:b + subhh.w r8, r3:t, r3:b + + lsl r7, CONST_BITS + lsl r8, CONST_BITS + + add r2, r7, r6 // r2 = tmp10 + sub r3, r7, r6 // r3 = tmp13 + add r4, r8, r5 // r4 = tmp11 + sub r5, r8, r5 // r5 = tmp12 + + + padd.h r6, r0, r1 // r6:t = z4, r6:b = z3 + addhh.w r7, r6:t, r6:b + mulhh.w r7, r7:b, r9:b // r7 = z5 + + ld.w r9, pc[coef_table_copy - . + 12] + mulhh.w r8, r6:b, r12:t // r8 = z3 + mulhh.w r6, r6:t, r12:b // r6 = z4 + + add r8, r7 + add r6, r7 + + paddx.h r7, r0, r1 // r7:t = z2, r7:b = z1 + + mulhh.w r12, r0:b, r9:t // r12 = tmp0 + mulhh.w r0, r0:t, r9:b // r0 = tmp1 + ld.w r9, pc[coef_table_copy - . + 16] + add r12, r8 + add r0, r6 + + ld.w lr, pc[coef_table_copy - . + 20] + machh.w r8, r1:b, r9:t // r8 = tmp2 + machh.w r6, r1:t, r9:b // r6 = tmp3 + mulhh.w r9, r7:b, lr:t // r9 = z1 + mulhh.w r7, r7:t, lr:b // r7 = z2 + + + add r12, r9 + add r0, r7 + add r8, r7 + add r6, r9 + + add r1, r2, r6 // r1 = dataptr[DCTSIZE*0] + sub r2, r2, r6 // r2 = dataptr[DCTSIZE*7] + add r6, r4, r8 // r6 = dataptr[DCTSIZE*1] + sub r4, r4, r8 // r4 = dataptr[DCTSIZE*6] + add r8, r5, r0 // r8 = dataptr[DCTSIZE*2] + sub r5, r5, r0 // r5 = dataptr[DCTSIZE*5] + add r0, r3, r12 // r0 = dataptr[DCTSIZE*3] + sub r3, r3, r12 // r3 = dataptr[DCTSIZE*4] + + satrnds r1 >> (CONST_BITS+PASS1_BITS+3), 9 + satrnds r2 >> (CONST_BITS+PASS1_BITS+3), 9 + satrnds r6 >> (CONST_BITS+PASS1_BITS+3), 9 + satrnds r4 >> (CONST_BITS+PASS1_BITS+3), 9 + satrnds r8 >> (CONST_BITS+PASS1_BITS+3), 9 + satrnds r5 >> (CONST_BITS+PASS1_BITS+3), 9 + satrnds r0 >> (CONST_BITS+PASS1_BITS+3), 9 + satrnds r3 >> (CONST_BITS+PASS1_BITS+3), 9 + + packw.sh r1, r1, r6 + packw.sh r8, r8, r0 + packw.sh r3, r3, r5 + packw.sh r4, r4, r2 + + packsh.ub r1, r1, r8 + packsh.ub r0, r3, r4 + lddsp r12, 
SP[0] // rfp + lddsp r9, SP[4] // iinc + st.d r12[0], r0 + sub r10, -2 // Increment the dataptr + add r12, r9 // increment rfp + stdsp SP[0], r12 + + sub loop_cnt, 1 //Decrement loop counter + brne 0b //Perform loop one more time if loop_cnt is not zero + + sub sp, -8 + popm r0-r3, r4-r7, pc //Pop back registers and PC + + + + .align 2 +coef_table_copy: + .short FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602 + .short - FIX_1_961570560, - FIX_0_390180644, FIX_0_298631336, FIX_2_053119869 + .short FIX_3_072711026, FIX_1_501321110, - FIX_0_899976223, - FIX_2_562915447 + + +idct_avr32: // r12 = 8x8 block of 16-bit DCT coefficients; the result is written back to the same block + pushm r0-r3, r4-r7, lr //Free up registers to use for local variables + + //; Give room for a temporary block on the stack + sub sp, 8*8*2 + + mov loop_cnt, 8 //Initialize loop counter + +0: + + ldm r12++, r0, r1, r2, r3 //Load 8 DCT-coefficients from the current row in the DCT-block + mov r6, 0 +#ifdef USE_PREFETCH + pref r12[LINE_SIZE] //Prefetch next line +#endif + or r4, r2, r3 << 16 + or r4, r1 //Check if all DCT-coefficients except the DC are zero + or r4, r0 + brne 1f //If there are non-zero AC coefficients perform row-transform + + paddsub.h r5, r3:t, r6:b //r6 is zero, so both halves of r5 get the DC-coeff (r3:t) + plsl.h r5, r5, PASS1_BITS + mov r4, r5 + st.d sp++, r4 + st.d sp++, r4 + + sub loop_cnt, 1 //Decrement loop counter + brne 0b //Perform loop one more time if loop_cnt is not zero + + bral 2f //Perform column transform after row transform is computed + +1: + + ld.w r10, pc[coef_table_idct - .] + ld.w r9, pc[coef_table_idct - . + 4] + + padd.h r4, r2, r0 // r4:t = dataptr[2] + dataptr[6],r4:b = dataptr[3] + dataptr[7] + mulhh.w r5, r4:t, r10:t + mulhh.w r6, r0:t, r10:b + ld.w r10, pc[coef_table_idct - . 
+ 8] + mulhh.w r7, r2:t, r9:t + add r6, r5 // tmp2 + satrnds r6 >> (CONST_BITS - PASS1_BITS), 31 + add r7, r5 // tmp3 + satrnds r7 >> (CONST_BITS - PASS1_BITS), 31 + + paddsub.h r5, r3:t, r1:t + plsl.h r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1 + + paddsub.h r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13 + paddsub.h r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12 + + + + addhh.w lr, r3:b, r1:b // lr = z4 + addhh.w r5, r4:b, lr:b + mulhh.w r5, r5:b, r9:b // r5 = z5 + + ld.w r9, pc[coef_table_idct - . + 12] + mulhh.w r4, r4:b, r10:t // r4 = z3 + mulhh.w lr, lr:b, r10:b // lr = z4 + + add r4, r5 + add lr, r5 + + addhh.w r5, r2:b, r1:b // r5 = z2 + addhh.w r8, r3:b, r0:b // r8 = z1 + + + mulhh.w r0, r0:b, r9:t // r0 = tmp0 + ld.w r10, pc[coef_table_idct - . + 16] + mulhh.w r1, r1:b, r9:b // r1 = tmp1 + ld.w r9, pc[coef_table_idct - . + 20] + mulhh.w r2, r2:b, r10:t // r2 = tmp2 + mulhh.w r3, r3:b, r10:b // r3 = tmp3 + mulhh.w r8, r8:b, r9:t // r8 = z1 + mulhh.w r5, r5:b, r9:b // r5 = z2 + + + add r0, r8 + add r0, r4 + add r1, r5 + add r1, lr + add r2, r5 + add r2, r4 + add r3, r8 + add r3, lr + + satrnds r0 >> (CONST_BITS - PASS1_BITS), 31 + satrnds r1 >> (CONST_BITS - PASS1_BITS), 31 + satrnds r2 >> (CONST_BITS - PASS1_BITS), 31 + satrnds r3 >> (CONST_BITS - PASS1_BITS), 31 + + paddsub.h r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6] + paddsub.h r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7] + paddsub.h r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5] + paddsub.h r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4] + + sthh.w sp[0], r4:t, r5:t + sthh.w sp[4], r3:t, r2:t + sthh.w sp[8], r2:b, r3:b + sthh.w sp[12], r5:b, r4:b + + + + sub sp, -16 + sub loop_cnt, 1 + brne 0b + +2: + + sub sp, 8*8*2 //Set pointer to start of DCT block + sub r12, 8*8*2 //Set pointer to start of DCT block + + mov loop_cnt, 8 + +0: + ldins.h r3:t,sp[0] // r3:t = dataptr[0] + ldins.h r1:t,sp[1*8*2]// r1:t = dataptr[1] + ldins.h r2:t,sp[2*8*2]// r2:t = dataptr[2] + 
ldins.h r0:t,sp[5*8*2]// r0:t = dataptr[5] + ldins.h r3:b,sp[4*8*2]// r3:b = dataptr[4] + ldins.h r1:b,sp[3*8*2]// r1:b = dataptr[3] + ldins.h r2:b,sp[6*8*2]// r2:b = dataptr[6] + ldins.h r0:b,sp[7*8*2]// r0:b = dataptr[7] + + or r4, r1, r3 << 16 + or r4, r2 + or r4, r0 + brne 1f //If there are non-zero AC coefficients perform column-transform + + satrnds r3 >> ( PASS1_BITS + 3 + 16 ), 31 + packw.sh r3, r3, r3 + mov r2, r3 + st.d r12++, r2 + st.d r12++, r2 + sub sp, -2 // Increment the dataptr + + sub loop_cnt, 1//Decrement loop counter + brne 0b //Perform loop one more time if loop_cnt is not zero + + sub sp, -(8*8*2 - 8*2) //Free the temporary block; the column loop already advanced sp by 8*2 bytes + popm r0-r3, r4-r7, pc//Pop back registers and PC + +1: + + ld.w r10, pc[coef_table_idct - .] + ld.w r9, pc[coef_table_idct - . + 4] + + addhh.w r4, r2:t, r2:b + mulhh.w r4, r4:b, r10:t // r4 = z1 + mulhh.w r5, r2:b, r10:b + ld.w r10, pc[coef_table_idct - . + 8] + mulhh.w r6, r2:t, r9:t + add r5, r4 // r5 = tmp2 + add r6, r4 // r6 = tmp3 + + addhh.w r7, r3:t, r3:b + subhh.w r8, r3:t, r3:b + + lsl r7, CONST_BITS + lsl r8, CONST_BITS + + add r2, r7, r6 // r2 = tmp10 + sub r3, r7, r6 // r3 = tmp13 + add r4, r8, r5 // r4 = tmp11 + sub r5, r8, r5 // r5 = tmp12 + + + padd.h r6, r0, r1 // r6:t = z4, r6:b = z3 + addhh.w r7, r6:t, r6:b + mulhh.w r7, r7:b, r9:b // r7 = z5 + + ld.w r9, pc[coef_table_idct - . + 12] + mulhh.w r8, r6:b, r10:t // r8 = z3 + mulhh.w r6, r6:t, r10:b // r6 = z4 + + add r8, r7 + add r6, r7 + + paddx.h r7, r0, r1 // r7:t = z2, r7:b = z1 + + mulhh.w r10, r0:b, r9:t // r10 = tmp0 + mulhh.w r0, r0:t, r9:b // r0 = tmp1 + ld.w r9, pc[coef_table_idct - . + 16] + add r10, r8 + add r0, r6 + + ld.w lr, pc[coef_table_idct - . 
+ 20] + machh.w r8, r1:b, r9:t // r8 = tmp2 + machh.w r6, r1:t, r9:b // r6 = tmp3 + mulhh.w r9, r7:b, lr:t // r9 = z1 + mulhh.w r7, r7:t, lr:b // r7 = z2 + + + add r10, r9 + add r0, r7 + add r8, r7 + add r6, r9 + + add r1, r2, r6 // r1 = dataptr[DCTSIZE*0] + sub r2, r2, r6 // r2 = dataptr[DCTSIZE*7] + add r6, r4, r8 // r6 = dataptr[DCTSIZE*1] + sub r4, r4, r8 // r4 = dataptr[DCTSIZE*6] + add r8, r5, r0 // r8 = dataptr[DCTSIZE*2] + sub r5, r5, r0 // r5 = dataptr[DCTSIZE*5] + add r0, r3, r10 // r0 = dataptr[DCTSIZE*3] + sub r3, r3, r10 // r3 = dataptr[DCTSIZE*4] + + satrnds r1 >> (CONST_BITS+PASS1_BITS+3), 9 + satrnds r2 >> (CONST_BITS+PASS1_BITS+3), 9 + satrnds r6 >> (CONST_BITS+PASS1_BITS+3), 9 + satrnds r4 >> (CONST_BITS+PASS1_BITS+3), 9 + satrnds r8 >> (CONST_BITS+PASS1_BITS+3), 9 + satrnds r5 >> (CONST_BITS+PASS1_BITS+3), 9 + satrnds r0 >> (CONST_BITS+PASS1_BITS+3), 9 + satrnds r3 >> (CONST_BITS+PASS1_BITS+3), 9 + + packw.sh r7, r1, r6 + packw.sh r6, r8, r0 + packw.sh r5, r3, r5 + packw.sh r4, r4, r2 + + stm r12, r4-r7 + sub sp, -2 // Increment the dataptr + sub r12, -16 + + sub loop_cnt, 1 //Decrement loop counter + brne 0b //Perform loop one more time if loop_cnt is not zero + + sub sp, -(8*8*2 - 8*2) //Free the temporary block; the column loop already advanced sp by 8*2 bytes + popm r0-r3, r4-r7, pc //Pop back registers and PC + + + + .align 2 +coef_table_idct: + .short FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602 + .short - FIX_1_961570560, - FIX_0_390180644, FIX_0_298631336, FIX_2_053119869 + .short FIX_3_072711026, FIX_1_501321110, - FIX_0_899976223, - FIX_2_562915447 + diff --git a/libavcodec/avr32/mc.S b/libavcodec/avr32/mc.S new file mode 100644 index 0000000..07a002d --- /dev/null +++ b/libavcodec/avr32/mc.S @@ -0,0 +1,434 @@ +/* + * Copyright (c) 2007 Atmel Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * 3. The name of ATMEL may not be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
+ */ + + + /* Macro for masking the lowest bit of each byte in a + packed word */ + .macro packedmask1 reg, round + .if \round + and \reg, \reg, r8 >> 1 + .else + and \reg, r8 + .endif + .endm + + /* Macro for 8 pixel wide horizontal and vertical interpolation functions */ + .macro pixels8_hv round, put + + + pushm r0-r7, lr + + /* R12 = uint8_t *block, R11 = uint8_t *pixels, R10 = int line_size, R9 = int h */ + + /* Rounding immediate */ + .if \round + mov r8, lo(0x02020202) + orh r8, hi(0x02020202) + .else + mov r8, lo(0x01010101) + orh r8, hi(0x01010101) + .endif + mov r7, 2 // two passes, each covering a 4 pixel wide column + + /* Pixel naming convention : + + |-----------------------------------------------------| + | s00 | s01 | s02 | s03 | s04 | s05 | s06 | s07 | s08 | + |----d00---d01---d02---d03---d04---d05---d06---d07----| + | s10 | s11 | s12 | s13 | s14 | s15 | s16 | s17 | s18 | + |-----------------------------------------------------| + */ +1: + ld.w r0, r11[0] // r0 = { s00, s01, s02, s03 } + ld.w r1, r11[1] // r1 = { s01, s02, s03, s04 } + mov lr, r9 + eor r2, r0, r1 + packedmask1 r2, \round + add r2, r8 + + paddh.ub r0, r0, r1 // r0 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2} + + add r11, r10 // pixels += line_size + ld.w r1, r11[0] // r1 = { s10, s11, s12, s13 } + ld.w r3, r11[1] // r3 = { s11, s12, s13, s14 } +0: + eor r5, r1, r3 + packedmask1 r5, \round + add r2, r5 + + paddh.ub r1, r1, r3 // r1 = {(s10+s11)/2,(s11+s12)/2,(s12+s13)/2,(s13+s14)/2} + eor r6, r0, r1 + packedmask1 r6, \round + add r2, r2, r6 << 1 + + ld.w r3, r11[r10] // r3 = { s00, s01, s02, s03 } + add r11, r10 // pixels += line_size + ld.w r4, r11[1] // r4 = { s01, s02, s03, s04 } + + paddh.ub r0, r0, r1 + plsr.b r2, r2, 2 + padd.b r0, r0, r2 // r0 = { d00, d01, d02, d03 } + + /* Next row */ + .if \put + eor r2, r3, r4 + packedmask1 r2, \round + add r2, r8 + .else + ld.w r6, r12[0] + eor r2, r3, r4 + packedmask1 r2, \round + add r2, r8 + pavg.ub r0, r0, r6 + .endif + st.w r12[0], r0 // Put data into the block + + add r5, r2 
+ paddh.ub r0, r3, r4 // r0 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2} + + eor r6, r0, r1 + packedmask1 r6, \round + add r5, r5, r6 << 1 + + .if \put + paddh.ub r1, r0, r1 + plsr.b r5, r5, 2 + padd.b r1, r1, r5 // r1 = { d10, d11, d12, d13 } + .else + ld.w r3, r12[r10] + paddh.ub r1, r0, r1 + plsr.b r5, r5, 2 + padd.b r1, r1, r5 // r1 = { d10, d11, d12, d13 } + pavg.ub r1, r1, r3 + .endif + + st.w r12[r10], r1 // Put data into the block + + + ld.w r1, r11[r10] // r1 = { s10, s11, s12, s13 } + add r11, r10 // pixels += line_size + ld.w r3, r11[1] // r3 = { s11, s12, s13, s14 } + add r12, r12, r10 << 1 // block += 2*line_size + sub lr, 2 + brne 0b + + mul r0, r10, r9 // r0 = line_size * h + rsub r0, r0, 4 // r0 = 4 - (line_size * h) + add r11, r0 + sub r11, r10 // pixels += 4 - (line_size * (h+1)) + add r12, r0 // block += 4 - (line_size * h) + sub r7, 1 + brne 1b + + popm r0-r7, pc + .endm + + + /* Macro for 8 pixel wide vertical interpolation functions */ + + .macro pixels8_v round, put + pushm r4-r7,lr + /* R12 = uint8_t *block, R11 = uint8_t *pixels, R10 = int line_size, R9 = int h */ + + /* + Pixel Naming Convention : + |-----------------------------------------------| + | s00 | s01 | s02 | s03 | s04 | s05 | s06 | s07 | + |-d00---d01---d02---d03---d04---d05---d06---d07-| + | s10 | s11 | s12 | s13 | s14 | s15 | s16 | s17 | + |-----------------------------------------------| + */ + ld.w r8, r11[r10] // r8 = { s10, s11, s12, s13 } + ld.w lr, r11++ // lr = { s00, s01, s02, s03 }, src += 4 + ld.w r7, r11[0] // r7 = { s04, s05, s06, s07 } + ld.w r6, r11[r10] // r6 = { s14, s15, s16, s17 } + sub r10, 4 // stride -= 4 + add r11, r11, r10 << 1 // src += 2*stride + sub r11, -4 // src += 4 + +0: + .if \round + pavg.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2} + pavg.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2} + .else + paddh.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2} + paddh.ub 
r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2} + .endif + + .if \put + st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 } + ld.w lr, r11++ // lr = { s10, s11, s12, s13 }, src += 4 + st.w r12[0], r4 // *dst = { d04, d05, d06, d07 } + ld.w r7, r11[0] // r7 = { s14, s15, s16, s17 } + .else + ld.w lr, r12[0] + ld.w r7, r12[4] + pavg.ub r5, r5, lr + pavg.ub r4, r4, r7 + st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 } + ld.w lr, r11++ // lr = { s10, s11, s12, s13 }, src += 4 + st.w r12[0], r4 // *dst = { d04, d05, d06, d07 } + ld.w r7, r11[0] // r7 = { s14, s15, s16, s17 } + .endif + add r11, r10 // src += stride +#ifdef USE_PREFETCH + pref r11[0] +#endif + add r12, r10 // dst += stride + + .if \round + pavg.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2} + pavg.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2} + .else + paddh.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2} + paddh.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2} + .endif + .if \put + st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 } + ld.w r8, r11++ // r8 = { s10, s11, s12, s13 }, src += 4 + st.w r12[0], r4 // *dst = { d04, d05, d06, d07 } + ld.w r6, r11[0] // r6 = { s14, s15, s16, s17 } + .else + ld.w r8, r12[0] + ld.w r6, r12[4] + pavg.ub r5, r5, r8 + pavg.ub r4, r4, r6 + st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 } + ld.w r8, r11++ // r8 = { s10, s11, s12, s13 }, src += 4 + st.w r12[0], r4 // *dst = { d04, d05, d06, d07 } + ld.w r6, r11[0] // r6 = { s14, s15, s16, s17 } + .endif + + add r11, r10 // src += stride +#ifdef USE_PREFETCH + pref r11[0] +#endif + add r12, r10 // dst += stride + sub r9, 2 // h -= 2: two rows are written per loop iteration + brne 0b + + popm r4-r7,pc + .endm + + /* Macro for 8 pixel wide horizontal interpolation functions */ + + .macro pixels8_h round, put + pushm r4-r7, lr + + /* R12 = uint8_t *block, R11 = uint8_t *pixels, R10 = int line_size, R9 = int h */ + /* + Pixel Naming Convention: + 
|--------------------------------------------------------------------| + | s00 d00 s01 d01 s02 d02 s03 d03 s04 d04 s05 d05 s06 d06 s07 d07 s08| + |------|-------|-------|-------|-------|-------|-------|-------|-----| + | s10 d10 s11 d11 s12 d12 s13 d13 s14 d14 s15 d15 s16 d16 s17 d17 s18| + |--------------------------------------------------------------------| + */ + + ld.w lr, r11[0] // lr = { s00, s01, s02, s03 } + ld.w r8, r11[1] // r8 = { s01, s02, s03, s04 } + ld.w r7, r11[4] // r7 = { s04, s05, s06, s07 } + ld.w r6, r11[5] // r6 = { s05, s06, s07, s08 } + add r11, r10 // src += stride + +0: + .if \round + pavg.ub lr, r8, lr // lr = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2} + pavg.ub r7, r6, r7 // r7 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2} + .else + paddh.ub lr, r8, lr // lr = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2} + paddh.ub r7, r6, r7 // r7 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2} + .endif + .if \put + ld.w r5, r11[0] // r5 = { s00, s01, s02, s03 } + ld.w r4, r11[1] // r4 = { s01, s02, s03, s04 } + .else + ld.w r8, r12[0] + ld.w r6, r12[4] + ld.w r5, r11[0] // r5 = { s00, s01, s02, s03 } + ld.w r4, r11[1] // r4 = { s01, s02, s03, s04 } + pavg.ub lr, lr, r8 + pavg.ub r7, r7, r6 + .endif + st.w r12[0], lr // dst = { d00, d01, d02, d03 } + st.w r12[4], r7 // dst = { d04, d05, d06, d07 } + ld.w r8, r11[4] // r8 = { s04, s05, s06, s07 } + ld.w r6, r11[5] // r6 = { s05, s06, s07, s08 } + add r11, r10 // src += stride +#ifdef USE_PREFETCH + pref r11[0] +#endif + add r12, r10 // dst += stride + + .if \round + pavg.ub r5, r4, r5 // r5 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2} + pavg.ub r4, r6, r8 // r4 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2} + .else + paddh.ub r5, r4, r5 // r5 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2} + paddh.ub r4, r6, r8 // r4 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2} + .endif + .if \put + ld.w lr, r11[0] // lr = { s00, s01, s02, s03 } + ld.w r8, r11[1] // r8 = { 
s01, s02, s03, s04 } + .else + ld.w r7, r12[0] + ld.w r6, r12[4] + ld.w lr, r11[0] // lr = { s00, s01, s02, s03 } + ld.w r8, r11[1] // r8 = { s01, s02, s03, s04 } + pavg.ub r5, r5, r7 + pavg.ub r4, r4, r6 + .endif + st.w r12[0], r5 // dst = { d00, d01, d02, d03 } + st.w r12[4], r4 // dst = { d04, d05, d06, d07 } + ld.w r7, r11[4] // r7 = { s04, s05, s06, s07 } + ld.w r6, r11[5] // r6 = { s05, s06, s07, s08 } + add r11, r10 // src += stride +#ifdef USE_PREFETCH + pref r11[0] +#endif + add r12, r10 // dst += stride + sub r9, 2 // h -= 2: two rows are written per loop iteration + brne 0b + + popm r4-r7, pc + .endm + + /* Macro for 8 pixel wide copy functions */ + .macro pixels8 put + stm --sp, r3-r7,lr + /* R12 = uint8_t *block, R11 = uint8_t *pixels, R10 = int line_size, R9 = int h */ + mov lr, r9 + sub r3, r10, 2 // stride2 = stride - 2 +0: + .if \put + ld.w r9, r11[r10] // r9 = { s10, s11, s12, s13 } + ld.w r7, r11++ // r7 = { s00, s01, s02, s03 }, src += 4 + ld.w r6, r11[0] // r6 = { s04, s05, s06, s07 } + ld.w r8, r11[r10] // r8 = { s14, s15, s16, s17 } + .else + ld.w r9, r11[r10] // r9 = { s10, s11, s12, s13 } + ld.d r4, r12[0] + ld.w r7, r11++ // r7 = { s00, s01, s02, s03 }, src += 4 + ld.w r6, r11[0] // r6 = { s04, s05, s06, s07 } + ld.w r8, r11[r10] // r8 = { s14, s15, s16, s17 } + pavg.ub r6, r6, r4 + pavg.ub r7, r7, r5 + ld.d r4, r12[r10] + .endif + st.d r12, r6 // *dst = { s00, s01, s02, s03, s04, s05, s06, s07 } + add r11, r11, r3 << 1 // src += stride2 * 2 + .ifeq \put + pavg.ub r8, r8, r4 + pavg.ub r9, r9, r5 + .endif + st.d r12[r10 << 0], r8 // *(dst + stride) = { s10, s11, s12, s13, s14, s15, s16, s17 } + add r12, r12, r10 << 1 // dst += 2*stride + sub lr, 2 // two rows are written per loop iteration + brne 0b + ldm sp++, r3-r7,pc + + .endm + + .global put_no_rnd_pixels8_hv_avr32 + .text +put_no_rnd_pixels8_hv_avr32: + pixels8_hv 0, 1 /* round=0, put=1 */ + + .global put_pixels8_hv_avr32 + .text +put_pixels8_hv_avr32: + pixels8_hv 1, 1 /* round=1, put=1 */ + + .global avg_no_rnd_pixels8_hv_avr32 + .text +avg_no_rnd_pixels8_hv_avr32: + pixels8_hv 0, 0 /* round=0, put=0 */ + + .global avg_pixels8_hv_avr32 + 
.text +avg_pixels8_hv_avr32: + pixels8_hv 1, 0 /* round=1, put=0 */ + + .global put_no_rnd_pixels8_v_avr32 + .text +put_no_rnd_pixels8_v_avr32: + pixels8_v 0, 1 /* round=0, put=1 */ + + .global put_pixels8_v_avr32 + .text +put_pixels8_v_avr32: + pixels8_v 1, 1 /* round=1, put=1 */ + + .global avg_no_rnd_pixels8_v_avr32 + .text +avg_no_rnd_pixels8_v_avr32: + pixels8_v 0, 0 /* round=0, put=0 */ + + .global avg_pixels8_v_avr32 + .text +avg_pixels8_v_avr32: + pixels8_v 1, 0 /* round=1, put=0 */ + + .global put_no_rnd_pixels8_h_avr32 + .text +put_no_rnd_pixels8_h_avr32: + pixels8_h 0, 1 /* round=0, put=1 */ + + .global put_pixels8_h_avr32 + .text +put_pixels8_h_avr32: + pixels8_h 1, 1 /* round=1, put=1 */ + + .global avg_no_rnd_pixels8_h_avr32 + .text +avg_no_rnd_pixels8_h_avr32: + pixels8_h 0, 0 /* round=0, put=0 */ + + .global avg_pixels8_h_avr32 + .text +avg_pixels8_h_avr32: + pixels8_h 1, 0 /* round=1, put=0 */ + + .global put_pixels8_avr32 + .global put_no_rnd_pixels8_avr32 + .text +put_pixels8_avr32: +put_no_rnd_pixels8_avr32: + pixels8 1 /* put=1; both labels share this code */ + + .global avg_no_rnd_pixels8_avr32 + .global avg_pixels8_avr32 + .text +avg_pixels8_avr32: +avg_no_rnd_pixels8_avr32: + pixels8 0 /* put=0: averaged with the destination; both labels share this code */ diff --git a/libavcodec/avr32/pico.h b/libavcodec/avr32/pico.h new file mode 100644 index 0000000..32201ba --- /dev/null +++ b/libavcodec/avr32/pico.h @@ -0,0 +1,260 @@ +/* + * Copyright (c) 2007 Atmel Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * 3. The name of ATMEL may not be used to endorse or promote products + * derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ +#ifndef __PICO_H__ +#define __PICO_H__ + + + +/* Coprocessor Number */ +#define PICO_CPNO 1 + +/* Pixel Coprocessor Register file */ +#define PICO_REGVECT_INPIX2 cr0 +#define PICO_REGVECT_INPIX1 cr1 +#define PICO_REGVECT_INPIX0 cr2 +#define PICO_REGVECT_OUTPIX2 cr3 +#define PICO_REGVECT_OUTPIX1 cr4 +#define PICO_REGVECT_OUTPIX0 cr5 +#define PICO_REGVECT_COEFF0_A cr6 +#define PICO_REGVECT_COEFF0_B cr7 +#define PICO_REGVECT_COEFF1_A cr8 +#define PICO_REGVECT_COEFF1_B cr9 +#define PICO_REGVECT_COEFF2_A cr10 +#define PICO_REGVECT_COEFF2_B cr11 +#define PICO_REGVECT_VMU0_OUT cr12 +#define PICO_REGVECT_VMU1_OUT cr13 +#define PICO_REGVECT_VMU2_OUT cr14 +#define PICO_REGVECT_CONFIG cr15 + +#define PICO_INPIX2 0 +#define PICO_INPIX1 1 +#define PICO_INPIX0 2 +#define PICO_OUTPIX2 3 +#define PICO_OUTPIX1 4 +#define PICO_OUTPIX0 5 +#define PICO_COEFF0_A 6 +#define PICO_COEFF0_B 7 +#define PICO_COEFF1_A 8 +#define PICO_COEFF1_B 9 +#define PICO_COEFF2_A 10 +#define PICO_COEFF2_B 11 +#define PICO_VMU0_OUT 12 +#define PICO_VMU1_OUT 13 +#define PICO_VMU2_OUT 14 +#define PICO_CONFIG 15 + +/* Config Register */ +#define PICO_COEFF_FRAC_BITS_OFFSET 0 +#define PICO_COEFF_FRAC_BITS_SIZE 4 +#define 
PICO_OFFSET_FRAC_BITS_OFFSET 4 +#define PICO_OFFSET_FRAC_BITS_SIZE 4 +#define PICO_INPUT_MODE_OFFSET 8 +#define PICO_INPUT_MODE_SIZE 2 +#define PICO_OUTPUT_MODE_OFFSET 10 +#define PICO_OUTPUT_MODE_SIZE 1 + +struct pico_config_t { /* register image transferred by set_pico_config()/get_pico_config() */ + unsigned int : 32 - PICO_OUTPUT_MODE_OFFSET - PICO_OUTPUT_MODE_SIZE; + unsigned int output_mode : PICO_OUTPUT_MODE_SIZE; + unsigned int input_mode : PICO_INPUT_MODE_SIZE; + unsigned int offset_frac_bits : PICO_OFFSET_FRAC_BITS_SIZE; + unsigned int coeff_frac_bits : PICO_COEFF_FRAC_BITS_SIZE; + int vmu2_out; + int vmu1_out; + int vmu0_out; + short coeff2_2; + short coeff2_3; + short coeff2_0; + short coeff2_1; + short coeff1_2; + short coeff1_3; + short coeff1_0; + short coeff1_1; + short coeff0_2; + short coeff0_3; + short coeff0_0; + short coeff0_1; +}; + + +#define PICO_COEFF_FRAC_BITS(x) ((x) << PICO_COEFF_FRAC_BITS_OFFSET) /* place a field value at its bit offset in the config word */ +#define PICO_OFFSET_FRAC_BITS(x) ((x) << PICO_OFFSET_FRAC_BITS_OFFSET) +#define PICO_INPUT_MODE(x) ((x) << PICO_INPUT_MODE_OFFSET) +#define PICO_OUTPUT_MODE(x) ((x) << PICO_OUTPUT_MODE_OFFSET) + +#define GET_PICO_COEFF_FRAC_BITS(x) (((x) >> PICO_COEFF_FRAC_BITS_OFFSET)&((1 << PICO_COEFF_FRAC_BITS_SIZE)-1)) /* extract a field from a config word */ +#define GET_PICO_OFFSET_FRAC_BITS(x) (((x) >> PICO_OFFSET_FRAC_BITS_OFFSET)&((1 << PICO_OFFSET_FRAC_BITS_SIZE)-1)) +#define GET_PICO_INPUT_MODE(x) (((x) >> PICO_INPUT_MODE_OFFSET)&((1 << PICO_INPUT_MODE_SIZE)-1)) +#define GET_PICO_OUTPUT_MODE(x) (((x) >> PICO_OUTPUT_MODE_OFFSET)&((1 << PICO_OUTPUT_MODE_SIZE)-1)) + +enum pico_input_mode { PICO_TRANSFORMATION_MODE, + PICO_HOR_FILTER_MODE, + PICO_VERT_FILTER_MODE }; + +enum pico_output_mode { PICO_PACKED_MODE, + PICO_PLANAR_MODE }; + +/* Bits in coefficients */ +#define PICO_COEFF_BITS 12 + +/* Operation bits */ +#define PICO_MATRIX (0) +#define PICO_USE_ACC (1 << 2) +#define PICO_SINGLE_VECTOR (1 << 3) + + +#define __str(x...) #x +#define __xstr(x...) 
__str(x) + +#define PICO_PUT_W(pico_reg, x) \ + __builtin_mvrc_w(PICO_CPNO, pico_reg, x); +#define PICO_GET_W(pico_reg) \ + __builtin_mvcr_w(PICO_CPNO, pico_reg) + +#define PICO_MVCR_W(x, pico_reg) \ + asm ("mvcr.w\tcp" __xstr(PICO_CPNO) ", %0, cr" __xstr(pico_reg) : "=r"(x)); + +#define PICO_MVRC_W(pico_reg, x) \ + asm ("mvrc.w\tcp" __xstr(PICO_CPNO) ", cr" __xstr(pico_reg) ", %0" :: "r"(x)); + +#define PICO_PUT_D(pico_reg, x) \ + __builtin_mvrc_d(PICO_CPNO, pico_reg, x); +#define PICO_GET_D(pico_reg) \ + __builtin_mvcr_d(PICO_CPNO, pico_reg) + +#define PICO_MVCR_D(x, pico_reg) \ + asm volatile ("mvcr.d\tcp" __xstr(PICO_CPNO) ", %0, cr" __xstr(pico_reg) : "=r"(x)); +#define PICO_MVRC_D(pico_reg, x) \ + asm volatile ("mvrc.d\tcp" __xstr(PICO_CPNO) ", cr" __xstr(pico_reg) ", %0" :: "r"(x)); + +#define PICO_STCM_W(ptr, pico_regs...) \ + asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr) : "memory"); /* "memory": the asm stores through ptr */ +#define PICO_STCM_D(ptr, pico_regs...) \ + asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr) : "memory"); /* "memory": the asm stores through ptr */ + +#define PICO_STCM_W_DEC(ptr, pico_regs...) \ + asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr) :: "memory"); /* "memory": the asm stores through ptr */ +#define PICO_STCM_D_DEC(ptr, pico_regs...) \ + asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr) :: "memory"); /* "memory": the asm stores through ptr */ + +#define PICO_LDCM_W(ptr, pico_regs...) \ + asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr) : "memory"); /* "memory": the asm loads through ptr */ +#define PICO_LDCM_D(ptr, pico_regs...) \ + asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr) : "memory"); /* "memory": the asm loads through ptr */ + +#define PICO_LDCM_W_INC(ptr, pico_regs...) \ + asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr) :: "memory"); /* "memory": the asm loads through ptr */ +#define PICO_LDCM_D_INC(ptr, pico_regs...) 
\ + asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr) :: "memory"); /* "memory": the asm loads through ptr */ + +#define PICO_OP(op, dst_addr, addr0, addr1, addr2) \ + __builtin_cop(PICO_CPNO, addr0, addr1, addr2, (op) | (dst_addr)); + +static inline void set_pico_config(struct pico_config_t *config){ /* ldcm.d: load the coefficient, VMU-out and config registers from *config */ + PICO_LDCM_D(config, + PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B, + PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B, + PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B, + PICO_REGVECT_VMU0_OUT, PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT, PICO_REGVECT_CONFIG); +} + +static inline void get_pico_config(struct pico_config_t *config){ /* stcm.d: store the coefficient, VMU-out and config registers into *config */ + PICO_STCM_D(config, + PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B, + PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B, + PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B, + PICO_REGVECT_VMU0_OUT, PICO_REGVECT_VMU1_OUT, + PICO_REGVECT_VMU2_OUT, PICO_REGVECT_CONFIG); +} + +static inline void dump_pico_config(void){ /* read the PICO registers back and log them with av_log */ + struct pico_config_t pico_config; + const char *input_mode, *output_mode; + get_pico_config(&pico_config); + + + av_log(NULL, AV_LOG_INFO, "Dumping pico configuration:\n\n"); + av_log(NULL, AV_LOG_INFO, "\tcoeff_frac_bits = %d\n", pico_config.coeff_frac_bits); + av_log(NULL, AV_LOG_INFO, "\toffset_frac_bits = %d\n", pico_config.offset_frac_bits); + + switch ( pico_config.input_mode ){ + case PICO_TRANSFORMATION_MODE: + input_mode = "Transformation Mode"; + break; + case PICO_HOR_FILTER_MODE: + input_mode = "Horisontal Filter Mode"; + break; + case PICO_VERT_FILTER_MODE: + input_mode = "Vertical Filter Mode"; + break; + default: + input_mode = "Unknown Mode!!"; + break; + } + av_log(NULL, AV_LOG_INFO, "\tinput_mode = %s\n", input_mode); + + switch ( pico_config.output_mode ){ + case PICO_PLANAR_MODE: + output_mode = "Planar Mode"; + break; + case PICO_PACKED_MODE: + output_mode = "Packed Mode"; + break; + default: + output_mode = "Unknown Mode!!"; + break; + } + + av_log(NULL, AV_LOG_INFO, "\toutput_mode = %s\n", output_mode); + + av_log(NULL, AV_LOG_INFO, "\tCoeff0_0 
= %f\n", (float)pico_config.coeff0_0/(float)(1 << pico_config.coeff_frac_bits)); + av_log(NULL, AV_LOG_INFO, "\tCoeff0_1 = %f\n", (float)pico_config.coeff0_1/(float)(1 << pico_config.coeff_frac_bits)); + av_log(NULL, AV_LOG_INFO, "\tCoeff0_2 = %f\n", (float)pico_config.coeff0_2/(float)(1 << pico_config.coeff_frac_bits)); + av_log(NULL, AV_LOG_INFO, "\tCoeff0_3 = %f\n", (float)pico_config.coeff0_3/(float)(1 << pico_config.offset_frac_bits)); + + av_log(NULL, AV_LOG_INFO, "\tCoeff1_0 = %f\n", (float)pico_config.coeff1_0/(float)(1 << pico_config.coeff_frac_bits)); + av_log(NULL, AV_LOG_INFO, "\tCoeff1_1 = %f\n", (float)pico_config.coeff1_1/(float)(1 << pico_config.coeff_frac_bits)); + av_log(NULL, AV_LOG_INFO, "\tCoeff1_2 = %f\n", (float)pico_config.coeff1_2/(float)(1 << pico_config.coeff_frac_bits)); + av_log(NULL, AV_LOG_INFO, "\tCoeff1_3 = %f\n", (float)pico_config.coeff1_3/(float)(1 << pico_config.offset_frac_bits)); + + av_log(NULL, AV_LOG_INFO, "\tCoeff2_0 = %f\n", (float)pico_config.coeff2_0/(float)(1 << pico_config.coeff_frac_bits)); + av_log(NULL, AV_LOG_INFO, "\tCoeff2_1 = %f\n", (float)pico_config.coeff2_1/(float)(1 << pico_config.coeff_frac_bits)); + av_log(NULL, AV_LOG_INFO, "\tCoeff2_2 = %f\n", (float)pico_config.coeff2_2/(float)(1 << pico_config.coeff_frac_bits)); + av_log(NULL, AV_LOG_INFO, "\tCoeff2_3 = %f\n", (float)pico_config.coeff2_3/(float)(1 << pico_config.offset_frac_bits)); +} + + + +#endif + diff --git a/libavcodec/bitstream.h b/libavcodec/bitstream.h index 26b4f8d..1f8fabf 100644 --- a/libavcodec/bitstream.h +++ b/libavcodec/bitstream.h @@ -171,7 +171,7 @@ typedef struct RL_VLC_ELEM { #endif /* used to avoid missaligned exceptions on some archs (alpha, ...) 
*/ -#if defined(ARCH_X86) || defined(ARCH_X86_64) +#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_AVR32) # define unaligned16(a) (*(const uint16_t*)(a)) # define unaligned32(a) (*(const uint32_t*)(a)) # define unaligned64(a) (*(const uint64_t*)(a)) @@ -813,6 +813,44 @@ void free_vlc(VLC *vlc); * if the vlc code is invalid and max_depth>1 than the number of bits removed * is undefined */ + +#if defined(ARCH_AVR32) +#define GET_VLC(code, name, gb, table, bits, max_depth)\ +{\ + int n, index, nb_bits;\ + union { VLC_TYPE vlc[2];\ + uint32_t u32; } table_elem;\ +\ + index= SHOW_UBITS(name, gb, bits);\ + table_elem.u32 = unaligned32(&table[index]); \ + code = table_elem.vlc[0];\ + n = table_elem.vlc[1];\ +\ + if(max_depth > 1 && n < 0 ){\ + LAST_SKIP_BITS(name, gb, bits)\ + UPDATE_CACHE(name, gb)\ +\ + nb_bits = -n;\ +\ + index= SHOW_UBITS(name, gb, nb_bits) + code;\ + table_elem.u32 = unaligned32(&table[index]); \ + code = table_elem.vlc[0];\ + n = table_elem.vlc[1];\ + if(max_depth > 2 && n < 0){\ + LAST_SKIP_BITS(name, gb, nb_bits)\ + UPDATE_CACHE(name, gb)\ +\ + nb_bits = -n;\ +\ + index= SHOW_UBITS(name, gb, nb_bits) + code;\ + code = table[index][0];\ + n = table[index][1];\ + }\ + }\ + SKIP_BITS(name, gb, n)\ +} + +#else #define GET_VLC(code, name, gb, table, bits, max_depth)\ {\ int n, index, nb_bits;\ @@ -821,7 +859,7 @@ void free_vlc(VLC *vlc); code = table[index][0];\ n = table[index][1];\ \ - if(max_depth > 1 && n < 0){\ + if(max_depth > 1 && n < 0 ){\ LAST_SKIP_BITS(name, gb, bits)\ UPDATE_CACHE(name, gb)\ \ @@ -843,7 +881,38 @@ void free_vlc(VLC *vlc); }\ SKIP_BITS(name, gb, n)\ } +#endif +#if defined(ARCH_AVR32) +#define GET_RL_VLC(level, run, name, gb, table, bits, max_depth, need_update)\ +{\ + int n, index, nb_bits;\ + union { RL_VLC_ELEM vlc;\ + uint32_t u32; } table_elem;\ +\ + index= SHOW_UBITS(name, gb, bits);\ + table_elem.u32 = unaligned32(&table[index]); \ + level = table_elem.vlc.level;\ + n = table_elem.vlc.len;\ +\ + 
if(max_depth > 1 && n < 0 ){\ + SKIP_BITS(name, gb, bits)\ + if(need_update){\ + UPDATE_CACHE(name, gb)\ + }\ +\ + nb_bits = -n;\ +\ + index= SHOW_UBITS(name, gb, nb_bits) + level;\ + table_elem.u32 = unaligned32(&table[index]); \ + level = table_elem.vlc.level;\ + n = table_elem.vlc.len;\ + }\ + run= table_elem.vlc.run;\ + SKIP_BITS(name, gb, n)\ +} + +#else #define GET_RL_VLC(level, run, name, gb, table, bits, max_depth, need_update)\ {\ int n, index, nb_bits;\ @@ -852,7 +921,7 @@ void free_vlc(VLC *vlc); level = table[index].level;\ n = table[index].len;\ \ - if(max_depth > 1 && n < 0){\ + if(max_depth > 1 && n < 0 ){\ SKIP_BITS(name, gb, bits)\ if(need_update){\ UPDATE_CACHE(name, gb)\ @@ -867,7 +936,7 @@ void free_vlc(VLC *vlc); run= table[index].run;\ SKIP_BITS(name, gb, n)\ } - +#endif /** * parses a vlc code, faster then get_vlc() diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c index 56c42b9..8fc10c6 100644 --- a/libavcodec/dsputil.c +++ b/libavcodec/dsputil.c @@ -4197,6 +4197,9 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx) #ifdef ARCH_BFIN dsputil_init_bfin(c,avctx); #endif +#ifdef ARCH_AVR32 + dsputil_init_avr32(c,avctx); +#endif for(i=0; i<64; i++){ if(!c->put_2tap_qpel_pixels_tab[0][i]) diff --git a/libavcodec/h264.c b/libavcodec/h264.c index 865e80a..8f7c3f1 100644 --- a/libavcodec/h264.c +++ b/libavcodec/h264.c @@ -3258,7 +3258,12 @@ static void free_tables(H264Context *h){ static void init_dequant8_coeff_table(H264Context *h){ int i,q,x; +#ifdef ARCH_AVR32 + const int transpose = 0; +#else const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly +#endif + h->dequant8_coeff[0] = h->dequant8_buffer[0]; h->dequant8_coeff[1] = h->dequant8_buffer[1]; @@ -3281,7 +3286,13 @@ static void init_dequant8_coeff_table(H264Context *h){ static void init_dequant4_coeff_table(H264Context *h){ int i,j,q,x; + // Yes this is ugly as hell.... 
+#ifdef ARCH_AVR32 + const int transpose = 0; +#else const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly +#endif + for(i=0; i<6; i++ ){ h->dequant4_coeff[i] = h->dequant4_buffer[i]; for(j=0; j<i; j++){ @@ -4663,7 +4674,11 @@ static int decode_slice_header(H264Context *h){ if (MPV_common_init(s) < 0) return -1; +#ifdef ARCH_AVR32 + if ( 1 ){ +#else if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly +#endif memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t)); memcpy(h-> field_scan, field_scan, 16*sizeof(uint8_t)); }else{ diff --git a/libavutil/common.h b/libavutil/common.h index 3ae5971..7e52b90 100644 --- a/libavutil/common.h +++ b/libavutil/common.h @@ -283,23 +283,39 @@ static inline int mid_pred(int a, int b, int c) * @param amax maximum value of the clip range * @return cliped value */ +#if defined(ARCH_AVR32) +#define clip(a, amin, amax) \ + ({ int __tmp__; \ + asm ("min\t%0, %1, %2\n" \ + "max\t%0, %0, %3\n" \ + : "=&r"(__tmp__) : "r"(a), "r"(amax), "r"(amin)); \ + __tmp__; }) +#else static inline int clip(int a, int amin, int amax) { if (a < amin) return amin; else if (a > amax) return amax; else return a; } +#endif /** * clip a signed integer value into the 0-255 range * @param a value to clip * @return cliped value */ +#if defined(ARCH_AVR32) +#define clip_uint8(a) \ + ({ int __tmp__ = a; \ + asm ("satu\t%0 >> 0, 8" : "+r"(__tmp__)); \ + __tmp__; }) +#else static inline uint8_t clip_uint8(int a) { if (a&(~255)) return (-a)>>31; else return a; } +#endif /* math */ int64_t ff_gcd(int64_t a, int64_t b); diff --git a/libavutil/internal.h b/libavutil/internal.h index 285d304..a8b0718 100644 --- a/libavutil/internal.h +++ b/libavutil/internal.h @@ -210,6 +210,15 @@ if((y)<(x)){\ }\ } +/* XXX: Hack for uclibc which declares lrintf but does not implement it... 
*/ +#ifdef ARCH_AVR32 +#undef HAVE_LRINTF +#define HAVE_LRINTF 1 +#define lrintf(x) rint(x) +#define llrint(x) (long long)rint(x) +#endif + + #ifndef HAVE_LRINTF /* XXX: add ISOC specific test to avoid specific BSD testing. */ /* better than nothing implementation. */ diff --git a/libfaad2/common.h b/libfaad2/common.h index f809042..6c5fb21 100644 --- a/libfaad2/common.h +++ b/libfaad2/common.h @@ -67,7 +67,7 @@ extern "C" { /* Use if target platform has address generators with autoincrement */ //#define PREFER_POINTERS -#if defined(_WIN32_WCE) || defined(__arm__) +#if defined(_WIN32_WCE) || defined(__arm__) || defined(__avr32__) #define FIXED_POINT #endif diff --git a/libmpcodecs/ad_libmad.c b/libmpcodecs/ad_libmad.c index 076359a..51b77fe 100644 --- a/libmpcodecs/ad_libmad.c +++ b/libmpcodecs/ad_libmad.c @@ -86,6 +86,11 @@ static int init(sh_audio_t *sh){ sh->channels=(this->frame.header.mode == MAD_MODE_SINGLE_CHANNEL) ? 1 : 2; sh->samplerate=this->frame.header.samplerate; sh->i_bps=this->frame.header.bitrate/8; +#ifdef WORDS_BIGENDIAN + sh->sample_format = AF_FORMAT_S16_BE; +#else + sh->sample_format = AF_FORMAT_S16_LE; +#endif sh->samplesize=2; return 1; diff --git a/libswscale/pico-avr32.h b/libswscale/pico-avr32.h new file mode 100644 index 0000000..7ac6200 --- /dev/null +++ b/libswscale/pico-avr32.h @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2007 Atmel Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * 3. 
The name of ATMEL may not be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ +#ifndef __PICO_H__ +#define __PICO_H__ + +/* Coprocessor Number */ +#define PICO_CPNO 1 + +/* Pixel Coprocessor Register file */ +#define PICO_REGVECT_INPIX2 cr0 +#define PICO_REGVECT_INPIX1 cr1 +#define PICO_REGVECT_INPIX0 cr2 +#define PICO_REGVECT_OUTPIX2 cr3 +#define PICO_REGVECT_OUTPIX1 cr4 +#define PICO_REGVECT_OUTPIX0 cr5 +#define PICO_REGVECT_COEFF0_A cr6 +#define PICO_REGVECT_COEFF0_B cr7 +#define PICO_REGVECT_COEFF1_A cr8 +#define PICO_REGVECT_COEFF1_B cr9 +#define PICO_REGVECT_COEFF2_A cr10 +#define PICO_REGVECT_COEFF2_B cr11 +#define PICO_REGVECT_VMU0_OUT cr12 +#define PICO_REGVECT_VMU1_OUT cr13 +#define PICO_REGVECT_VMU2_OUT cr14 +#define PICO_REGVECT_CONFIG cr15 + +#define PICO_INPIX2 0 +#define PICO_INPIX1 1 +#define PICO_INPIX0 2 +#define PICO_OUTPIX2 3 +#define PICO_OUTPIX1 4 +#define PICO_OUTPIX0 5 +#define PICO_COEFF0_A 6 +#define PICO_COEFF0_B 7 +#define PICO_COEFF1_A 8 +#define PICO_COEFF1_B 9 +#define PICO_COEFF2_A 10 +#define PICO_COEFF2_B 11 +#define PICO_VMU0_OUT 12 +#define PICO_VMU1_OUT 13 +#define PICO_VMU2_OUT 14 +#define PICO_CONFIG 
15 + +/* Config Register */ +#define PICO_COEFF_FRAC_BITS 0 +#define PICO_COEFF_FRAC_BITS_WIDTH 4 +#define PICO_OFFSET_FRAC_BITS 4 +#define PICO_OFFSET_FRAC_BITS_WIDTH 4 +#define PICO_INPUT_MODE 8 +#define PICO_INPUT_MODE_WIDTH 2 +#define PICO_OUTPUT_MODE 10 + +#define PICO_TRANSFORMATION_MODE 0 +#define PICO_HOR_FILTER_MODE 1 +#define PICO_VERT_FILTER_MODE 2 + +#define PICO_PLANAR_MODE 1 +#define PICO_PACKED_MODE 0 + +/* Bits in coefficients */ +#define PICO_COEFF_BITS 12 + +/* Operation bits */ +#define PICO_USE_ACC (1 << 2) +#define PICO_SINGLE_VECTOR (1 << 3) + + +#define __str(x...) #x +#define __xstr(x...) __str(x) + +#define PICO_PUT_W(pico_reg, x) \ + __builtin_mvrc_w(PICO_CPNO, pico_reg, x); +#define PICO_GET_W(pico_reg) \ + __builtin_mvcr_w(PICO_CPNO, pico_reg) + +#define PICO_PUT_D(pico_reg, x) \ + __builtin_mvrc_d(PICO_CPNO, pico_reg, x); +#define PICO_GET_D(pico_reg) \ + __builtin_mvcr_d(PICO_CPNO, pico_reg) + + +#define PICO_STCM_W(ptr, pico_regs...) \ + asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr)); +#define PICO_STCM_D(ptr, pico_regs...) \ + asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr)); + +#define PICO_STCM_W_DEC(ptr, pico_regs...) \ + asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr)); +#define PICO_STCM_D_DEC(ptr, pico_regs...) \ + asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr)); + +#define PICO_LDCM_W(ptr, pico_regs...) \ + asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr)); +#define PICO_LDCM_D(ptr, pico_regs...) \ + asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr)); + +#define PICO_LDCM_W_INC(ptr, pico_regs...) \ + asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr)); +#define PICO_LDCM_D_INC(ptr, pico_regs...) 
\ + asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr)); + +#define PICO_OP(op, dst_addr, addr0, addr1, addr2) \ + __builtin_cop(PICO_CPNO, addr0, addr1, addr2, op | dst_addr); + + +#endif + diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h index ecd28f5..3221d0c 100644 --- a/libswscale/swscale_internal.h +++ b/libswscale/swscale_internal.h @@ -173,7 +173,7 @@ typedef struct SwsContext{ SwsFunc yuv2rgb_get_func_ptr (SwsContext *c); int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation); -char *sws_format_name(int format); +char *sws_format_name(enum PixelFormat format); //FIXME replace this with something faster #define isPlanarYUV(x) ((x)==PIX_FMT_YUV410P || (x)==PIX_FMT_YUV420P \ diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c index 71759bc..fa83985 100644 --- a/libswscale/yuv2rgb.c +++ b/libswscale/yuv2rgb.c @@ -44,6 +44,10 @@ #include "yuv2rgb_mlib.c" #endif +#ifdef ARCH_AVR32 +#include "yuv2rgb_avr32.c" +#endif + #define DITHER1XBPP // only for mmx const uint8_t __attribute__((aligned(8))) dither_2x2_4[2][8]={ @@ -601,6 +605,12 @@ SwsFunc yuv2rgb_get_func_ptr (SwsContext *c) if(t) return t; } #endif +#ifdef ARCH_AVR32 + { + SwsFunc t= yuv2rgb_init_avr32(c); + if(t) return t; + } +#endif #ifdef HAVE_ALTIVEC if (c->flags & SWS_CPU_CAPS_ALTIVEC) { @@ -678,6 +688,10 @@ int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, //printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv); oy -= 256*brightness; +#ifdef ARCH_AVR32 + yuv2rgb_c_init_tables_avr32 (c, inv_table, fullRange, brightness, contrast, saturation); +#endif + for (i = 0; i < 1024; i++) { int j; diff --git a/libswscale/yuv2rgb_avr32.c b/libswscale/yuv2rgb_avr32.c new file mode 100644 index 0000000..4a8341e --- /dev/null +++ b/libswscale/yuv2rgb_avr32.c @@ -0,0 +1,416 @@ +/* + * Copyright (c) 2007 Atmel Corporation. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * 3. The name of ATMEL may not be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, + * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
+ */ +#include "pico-avr32.h" + + +#define RGB(uv_part) \ + __asm__ volatile ( \ + "ld.w\t%0, %3[%7:" uv_part " << 2]\n\t" /* tmp = c->table_gV[V] */ \ + "ld.w\t%1, %4[%8:" uv_part " << 2]\n\t" /* g = c->table_gU[U] */ \ + "ld.w\t%2, %5[%8:" uv_part " << 2]\n\t" /* b = c->table_bU[U] */ \ + "add\t%1, %0\n\t" /* g += tmp */\ + "ld.w\t%0, %6[%7:" uv_part " << 2]" /* r = c->table_rV[V] */ \ + : "=&r" (r), "=&r" (g), "=&r" (b) \ + : "r" (&c->table_gV[0]), "r" (&c->table_gU[0]),"r" (&c->table_bU[0]), \ + "r" (&c->table_rV[0]), "r" (V), "r" (U)); + + +#undef YUV2RGB1 +#define YUV2RGB1(dst, src, y, idx) \ + { int tmp2; __asm__ volatile ( \ + "ld.ub\t%0, %3[2*%8]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \ + "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 24) & 0xFF] */ \ + "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 24) & 0xFF] */ \ + "st.b\t%7[6*%8 + 0], %1\n\t" /* dst_1[2] = tmp; */ \ + "st.b\t%7[6*%8 + 1], %2\n\t" /* dst_1[1] = tmp; */ \ + "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 24) & 0xFF] */ \ + "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \ + "st.b\t%7[6*%8 + 2], %1\n\t" /* dst_1[0] = tmp; */ \ + "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 16) & 0xFF] */ \ + "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 16) & 0xFF] */ \ + "st.b\t%7[6*%8 + 3], %1\n\t" /* dst_1[5] = tmp; */ \ + "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 16) & 0xFF] */ \ + "st.b\t%7[6*%8 + 4], %2\n\t" /* dst_1[4] = tmp; */ \ + "st.b\t%7[6*%8 + 5], %1" /* dst_1[3] = tmp; */ \ + : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \ + : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); } + +#undef YUV2RGB2 +#define YUV2RGB2(dst, src, y, idx) \ + { int tmp2; __asm__ volatile ( \ + "ld.ub\t%0, %3[2*%8]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \ + "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 24) & 0xFF] */ \ + "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 24) & 0xFF] */ \ + "st.b\t%7[6*%8 + 0], %1\n\t" /* dst_1[2] = tmp; */ \ + "st.b\t%7[6*%8 + 1], %2\n\t" /* dst_1[1] = tmp; */ \ + "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 24) & 
0xFF] */ \ + "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \ + "st.b\t%7[6*%8 + 2], %1\n\t" /* dst_1[0] = tmp; */ \ + "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 16) & 0xFF] */ \ + "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 16) & 0xFF] */ \ + "st.b\t%7[6*%8 + 3], %1\n\t" /* dst_1[5] = tmp; */ \ + "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 16) & 0xFF] */ \ + "st.b\t%7[6*%8 + 4], %2\n\t" /* dst_1[4] = tmp; */ \ + "st.b\t%7[6*%8 + 5], %1" /* dst_1[3] = tmp; */ \ + : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \ + : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); } + + +#undef YUV2BGR1 +#define YUV2BGR1(dst, src, y, idx) \ + { int tmp2; __asm__ volatile ( \ + "ld.ub\t%0, %3[2*%8]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \ + "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 24) & 0xFF] */ \ + "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 24) & 0xFF] */ \ + "st.b\t%7[6*%8 + 2], %1\n\t" /* dst_1[2] = tmp; */ \ + "st.b\t%7[6*%8 + 1], %2\n\t" /* dst_1[1] = tmp; */ \ + "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 24) & 0xFF] */ \ + "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \ + "st.b\t%7[6*%8 + 0], %1\n\t" /* dst_1[0] = tmp; */ \ + "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 16) & 0xFF] */ \ + "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 16) & 0xFF] */ \ + "st.b\t%7[6*%8 + 5], %1\n\t" /* dst_1[5] = tmp; */ \ + "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 16) & 0xFF] */ \ + "st.b\t%7[6*%8 + 4], %2\n\t" /* dst_1[4] = tmp; */ \ + "st.b\t%7[6*%8 + 3], %1" /* dst_1[3] = tmp; */ \ + : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \ + : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); } + +#undef YUV2BGR2 +#define YUV2BGR2(dst, src, y, idx) \ + { int tmp2; __asm__ volatile ( \ + "ld.ub\t%0, %3[2*%8]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \ + "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 24) & 0xFF] */ \ + "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 24) & 0xFF] */ \ + "st.b\t%7[6*%8 + 2], %1\n\t" /* dst_1[2] = tmp; */ \ + "st.b\t%7[6*%8 + 1], %2\n\t" /* dst_1[1] = tmp; */ \ + "ld.ub\t%1, 
%6[%0]\n\t" /* tmp = b[(Y >> 24) & 0xFF] */ \ + "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \ + "st.b\t%7[6*%8 + 0], %1\n\t" /* dst_1[0] = tmp; */ \ + "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 16) & 0xFF] */ \ + "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 16) & 0xFF] */ \ + "st.b\t%7[6*%8 + 5], %1\n\t" /* dst_1[5] = tmp; */ \ + "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 16) & 0xFF] */ \ + "st.b\t%7[6*%8 + 4], %2\n\t" /* dst_1[4] = tmp; */ \ + "st.b\t%7[6*%8 + 3], %1" /* dst_1[3] = tmp; */ \ + : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \ + : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); } + + + +int yuv2bgr24_avr32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, + int srcSliceH, uint8_t* dst[], int dstStride[]){ + int y; + + if(c->srcFormat == PIX_FMT_YUV422P){ + srcStride[1] *= 2; + srcStride[2] *= 2; + } + + + for(y=0; y<srcSliceH; y+=2){ + uint8_t *dst_1= (uint8_t*)(dst[0] + (y+srcSliceY )*dstStride[0]); + uint8_t *dst_2= (uint8_t*)(dst[0] + (y+srcSliceY+1)*dstStride[0]); + uint32_t *r, *g, *b; + uint8_t *py_1= src[0] + y*srcStride[0]; + uint8_t *py_2= py_1 + srcStride[0]; + uint8_t *pu= src[1] + (y>>1)*srcStride[1]; + uint8_t *pv= src[2] + (y>>1)*srcStride[2]; + unsigned int h_size= c->dstW>>3; + while (h_size--) { + uint32_t U, V, Y1, Y2, tmp; + U = ((uint32_t*)pu)[0]; + V = ((uint32_t*)pv)[0]; + + RGB("t") + YUV2BGR1(dst_1, py_1, Y1, 0) + YUV2BGR1(dst_2, py_2, Y2, 0) + + RGB("u") + YUV2BGR2(dst_1, py_1, Y1, 1) + YUV2BGR2(dst_2, py_2, Y2, 1) + + RGB("l") + YUV2BGR1(dst_1, py_1, Y1, 2) + YUV2BGR1(dst_2, py_2, Y2, 2) + + RGB("b") + YUV2BGR2(dst_1, py_1, Y1, 3) + YUV2BGR2(dst_2, py_2, Y2, 3) + + + + pu += 4; + pv += 4; + py_1 += 8; + py_2 += 8; + dst_1 += 24; + dst_2 += 24; + } + } + return srcSliceH; +} + + + +static int yuv2rgb24_avr32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, + int srcSliceH, uint8_t* dst[], int dstStride[]){ + int y; + + if(c->srcFormat == PIX_FMT_YUV422P){ + srcStride[1] *= 2; + 
srcStride[2] *= 2; + } + for(y=0; y<srcSliceH; y+=2){ + uint8_t *dst_1= (uint8_t*)(dst[0] + (y+srcSliceY )*dstStride[0]); + uint8_t *dst_2= (uint8_t*)(dst[0] + (y+srcSliceY+1)*dstStride[0]); + uint8_t *r, *g, *b; + uint8_t *py_1= src[0] + y*srcStride[0]; + uint8_t *py_2= py_1 + srcStride[0]; + uint8_t *pu= src[1] + (y>>1)*srcStride[1]; + uint8_t *pv= src[2] + (y>>1)*srcStride[2]; + unsigned int h_size= c->dstW>>3; + while (h_size--) { + uint32_t U, V, Y1, Y2, tmp; + U = ((uint32_t*)pu)[0]; + V = ((uint32_t*)pv)[0]; + + RGB("t") + YUV2RGB1(dst_1, py_1, Y1, 0) + YUV2RGB1(dst_2, py_2, Y2, 0) + + RGB("u") + YUV2RGB2(dst_1, py_1, Y1, 1) + YUV2RGB2(dst_2, py_2, Y2, 1) + + RGB("l") + YUV2RGB1(dst_1, py_1, Y1, 2) + YUV2RGB1(dst_2, py_2, Y2, 2) + + RGB("b") + YUV2RGB2(dst_1, py_1, Y1, 3) + YUV2RGB2(dst_2, py_2, Y2, 3) + + pu += 4; + pv += 4; + py_1 += 8; + py_2 += 8; + dst_1 += 24; + dst_2 += 24; + } + } + return srcSliceH; +} + +#define SCALE(x, bits) (((x) + ( 1 << (bits - 1))) >> bits) +#define COEFF_FRAC_BITS 9 +#define OFFSET_FRAC_BITS 2 + +/* Coefficients used in the pico */ +static struct { + short coeff2_2; + short coeff2_3; + short coeff2_0; + short coeff2_1; + short coeff1_2; + short coeff1_3; + short coeff1_0; + short coeff1_1; + short coeff0_2; + short coeff0_3; + short coeff0_0; + short coeff0_1; +} pico_coeff; + + +static int yuv2bgr24_avr32_pico(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, + int srcSliceH, uint8_t* dst[], int dstStride[]){ + int y; + static int first_time = 1; + + /* Initialize pico */ + PICO_LDCM_D(&pico_coeff, + PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B, + PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B, + PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B); + + PICO_PUT_W(PICO_CONFIG, + (PICO_PACKED_MODE << PICO_OUTPUT_MODE + | PICO_TRANSFORMATION_MODE << PICO_INPUT_MODE + | OFFSET_FRAC_BITS << PICO_OFFSET_FRAC_BITS + | COEFF_FRAC_BITS << PICO_COEFF_FRAC_BITS)); + + + if(c->srcFormat == PIX_FMT_YUV422P){ + srcStride[1] *= 2; + 
srcStride[2] *= 2; + } + + for(y=0; y<srcSliceH; y+=2){ + uint8_t *dst_1= (uint8_t*)(dst[0] + (y+srcSliceY )*dstStride[0]); + uint8_t *dst_2= (uint8_t*)(dst[0] + (y+srcSliceY+1)*dstStride[0]); + uint8_t *r, *g, *b; + uint8_t *py_1= src[0] + y*srcStride[0]; + uint8_t *py_2= py_1 + srcStride[0]; + uint8_t *pu= src[1] + (y>>1)*srcStride[1]; + uint8_t *pv= src[2] + (y>>1)*srcStride[2]; + unsigned int h_size= c->dstW>>3; + int *py_1_int = (int *)py_1; + int *py_2_int = (int *)py_2; + int *pu_int = (int *)pu; + int *pv_int = (int *)pv; + while (h_size--) { + PICO_PUT_W(PICO_INPIX0, *py_1_int++); + PICO_PUT_W(PICO_INPIX1, *pu_int++); + PICO_PUT_W(PICO_INPIX2, *pv_int++); + PICO_OP(0, 0, 0, 4, 8); + PICO_OP(0, 1, 1, 4, 8); + PICO_OP(0, 2, 2, 5, 9); + PICO_OP(0, 3, 3, 5, 9); + PICO_PUT_W(PICO_INPIX0, *py_1_int++); + PICO_STCM_W(dst_1, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0); + PICO_OP(0, 0, 0, 6, 10); + PICO_OP(0, 1, 1, 6, 10); + PICO_OP(0, 2, 2, 7, 11); + PICO_OP(0, 3, 3, 7, 11); + PICO_PUT_W(PICO_INPIX0, *py_2_int++); + PICO_STCM_W(dst_1 + 12, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0); + + PICO_OP(0, 0, 0, 4, 8); + PICO_OP(0, 1, 1, 4, 8); + PICO_OP(0, 2, 2, 5, 9); + PICO_OP(0, 3, 3, 5, 9); + PICO_PUT_W(PICO_INPIX0, *py_2_int++); + PICO_STCM_W(dst_2, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0); + PICO_OP(0, 0, 0, 6, 10); + PICO_OP(0, 1, 1, 6, 10); + PICO_OP(0, 2, 2, 7, 11); + PICO_OP(0, 3, 3, 7, 11); + PICO_STCM_W(dst_2 + 12, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0); + + dst_1 += 24; + dst_2 += 24; + } + } + return srcSliceH; +} + +extern int avr32_use_pico; + +SwsFunc yuv2rgb_init_avr32 (SwsContext *c){ + switch(c->dstFormat){ + case PIX_FMT_BGR24: + { + if ( avr32_use_pico ){ + MSG_ERR("AVR32 BGR24: Using PICO for color space conversion\n"); + return yuv2bgr24_avr32_pico; + } else { + MSG_ERR("AVR32 BGR24: Using optimized color space conversion\n"); + return yuv2bgr24_avr32; 
+ } + } + break; + case PIX_FMT_RGB24: + { + if ( avr32_use_pico ){ + MSG_ERR("AVR32 RGB24: Using PICO for color space conversion\n"); + return yuv2bgr24_avr32_pico; + } else { + MSG_ERR("AVR32 RGB24: Using optimized color space conversion\n"); + return yuv2rgb24_avr32; + } + } + } + return NULL; +} + + +int yuv2rgb_c_init_tables_avr32 (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation){ + const int isRgb = (c->dstFormat == PIX_FMT_RGB24); + + int64_t crv = inv_table[0]; + int64_t cbu = inv_table[1]; + int64_t cgu = -inv_table[2]; + int64_t cgv = -inv_table[3]; + int64_t cy = 1<<16; + int64_t oy = 0; + + if(!fullRange){ + cy= (cy*255) / 219; + oy= 16<<16; + } + + cy = (cy *contrast )>>16; + crv= (crv*contrast * saturation)>>32; + cbu= (cbu*contrast * saturation)>>32; + cgu= (cgu*contrast * saturation)>>32; + cgv= (cgv*contrast * saturation)>>32; + + oy -= 256*brightness; + + pico_coeff.coeff1_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* G <- Y */ + pico_coeff.coeff1_1 = SCALE(cgu, 16 - COEFF_FRAC_BITS); /* G <- U */ + pico_coeff.coeff1_2 = SCALE(cgv, 16 - COEFF_FRAC_BITS); /* G <- V */ + pico_coeff.coeff1_3 = (SCALE(-128*cgu - 128*cgv - 16*cy, 16 - OFFSET_FRAC_BITS) + + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* G offset */ + + if ( isRgb ){ + pico_coeff.coeff0_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* R <- Y */ + pico_coeff.coeff0_1 = 0; /* R <- U */ + pico_coeff.coeff0_2 = SCALE(crv, 16 - COEFF_FRAC_BITS); /* R <- V */ + pico_coeff.coeff0_3 = (SCALE(-128*crv - 16*cy, 16 - OFFSET_FRAC_BITS) + + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* R offset */ + + pico_coeff.coeff2_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* B <- Y */ + pico_coeff.coeff2_1 = SCALE(cbu, 16 - COEFF_FRAC_BITS); /* B <- U */ + pico_coeff.coeff2_2 = 0; /* B <- V */ + pico_coeff.coeff2_3 = (SCALE(-128*cbu - 16*cy, 16 - OFFSET_FRAC_BITS) + + /*0.5*/(1 << (OFFSET_FRAC_BITS-1)));/* B offset */ + } else { + pico_coeff.coeff2_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* R <- Y 
*/ + pico_coeff.coeff2_1 = 0; /* R <- U */ + pico_coeff.coeff2_2 = SCALE(crv, 16 - COEFF_FRAC_BITS); /* R <- V */ + pico_coeff.coeff2_3 = (SCALE(-128*crv - 16*cy, 16 - OFFSET_FRAC_BITS) + + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* R offset */ + + pico_coeff.coeff0_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* B <- Y */ + pico_coeff.coeff0_1 = SCALE(cbu, 16 - COEFF_FRAC_BITS); /* B <- U */ + pico_coeff.coeff0_2 = 0; /* B <- V */ + pico_coeff.coeff0_3 = (SCALE(-128*cbu - 16*cy, 16 - OFFSET_FRAC_BITS) + + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* B offset */ + } + return 0; /* declared int: return a defined value instead of falling off the end */ +} + + +#undef RGB diff --git a/libvo/vo_fbdev2.c b/libvo/vo_fbdev2.c index 053c193..7017770 100644 --- a/libvo/vo_fbdev2.c +++ b/libvo/vo_fbdev2.c @@ -22,6 +22,9 @@ #include "sub.h" #include "mp_msg.h" +/* Draw directly to framebuffer */ +#define USE_CONVERT2FB + static vo_info_t info = { "Framebuffer Device", "fbdev2", @@ -178,6 +181,15 @@ static int fb_preinit(int reset) } fb_orig_vinfo = fb_vinfo; + /* Reset panning offset */ + fb_vinfo.yoffset = 0; + if (ioctl(fb_dev_fd, FBIOPAN_DISPLAY, &fb_vinfo)) { + mp_msg(MSGT_VO, MSGL_ERR, + "[fbdev2] FBIOPAN_DISPLAY failed: %s\n", + strerror(errno)); + return 0; /* NOTE(review): query_format() bails out on a non-zero fb_preinit() result, so 0 here looks like success after a failed pan - confirm intent */ + } + fb_bpp = fb_vinfo.bits_per_pixel; /* 16 and 15 bpp is reported as 16 bpp */ @@ -289,6 +301,10 @@ static int config(uint32_t width, uint32_t height, uint32_t d_width, mp_msg(MSGT_VO, MSGL_ERR, "[fbdev2] Can't malloc next_frame: %s\n", strerror(errno)); return 1; } +#else + if ((fb_line_len * fb_vinfo.yres) <= (fb_finfo.smem_len / 2) + && fb_vinfo.yoffset == 0) + center += fb_line_len * fb_vinfo.yres; #endif if (fs) memset(frame_buffer, '\0', fb_line_len * fb_vinfo.yres); @@ -299,14 +315,22 @@ static int query_format(uint32_t format) { // open the device, etc. 
if (fb_preinit(0)) return 0; - if ((format & IMGFMT_BGR_MASK) == IMGFMT_BGR) { + if ((format & IMGFMT_RGB_MASK) == IMGFMT_RGB) { int fb_target_bpp = format & 0xff; set_bpp(&fb_vinfo, fb_target_bpp); fb_vinfo.xres_virtual = fb_vinfo.xres; - fb_vinfo.yres_virtual = fb_vinfo.yres; + fb_vinfo.yres_virtual = fb_vinfo.yres * 2; if (ioctl(fb_dev_fd, FBIOPUT_VSCREENINFO, &fb_vinfo)) { - mp_msg(MSGT_VO, MSGL_ERR, "[fbdev2] Can't put VSCREENINFO: %s\n", strerror(errno)); - return 0; + mp_msg(MSGT_VO, MSGL_WARN, + "[fbdev2] Can't double virtual y resolution: %s\n", + strerror(errno)); + fb_vinfo.yres_virtual = fb_vinfo.yres; + if (ioctl(fb_dev_fd, FBIOPUT_VSCREENINFO, &fb_vinfo)) { + mp_msg(MSGT_VO, MSGL_ERR, + "[fbdev2] Can't put VSCREENINFO: %s\n", + strerror(errno)); + return 0; /* 0 like the other failure exits of query_format; the code this replaces returned 0 here too */ + } } fb_pixel_size = fb_vinfo.bits_per_pixel / 8; fb_bpp = fb_vinfo.red.length + fb_vinfo.green.length + @@ -367,16 +391,67 @@ static void check_events(void) static void flip_page(void) { -#ifndef USE_CONVERT2FB int i, out_offset = 0, in_offset = 0; - for (i = 0; i < in_height; i++) { - memcpy(center + out_offset, next_frame + in_offset, - in_width * fb_pixel_size); - out_offset += fb_line_len; - in_offset += in_width * fb_pixel_size; - } +#ifndef USE_CONVERT2FB + if (1) { +#else + if (fb_vinfo.yres_virtual == fb_vinfo.yres) { #endif + for (i = 0; i < in_height; i++) { + memcpy(center + out_offset, next_frame + in_offset, + in_width * fb_pixel_size); + out_offset += fb_line_len; + in_offset += in_width * fb_pixel_size; + } + } else { + if (fb_vinfo.yoffset == 0) { + fb_vinfo.yoffset += fb_vinfo.yres; + center -= fb_line_len * fb_vinfo.yres; + } else { + fb_vinfo.yoffset = 0; + center += fb_line_len * fb_vinfo.yres; + } + + if (ioctl(fb_dev_fd, FBIOPAN_DISPLAY, &fb_vinfo)) { + mp_msg(MSGT_VO, MSGL_ERR, + "[fbdev2] Can't FBIOPAN_DISPLAY: %s\n", + strerror(errno)); + } + } +} + +static uint32_t get_image(mp_image_t *mpi) +{ + if(mpi->flags&MP_IMGFLAG_READABLE) + return VO_FALSE; // slow video ram + 
if(mpi->type==MP_IMGTYPE_STATIC) + return VO_FALSE; // it is not static + + if (mpi->flags & (MP_IMGFLAG_ACCEPT_STRIDE | MP_IMGFLAG_ACCEPT_WIDTH)) { + // we're lucky or codec accepts stride => ok, let's go! + + //YUY2 and RGB formats + mpi->planes[0] = center; + mpi->width = in_width; + mpi->stride[0] = fb_line_len; + + // center image + + mpi->flags |= MP_IMGFLAG_DIRECT; + + return VO_TRUE; + } + + return VO_FALSE; +} + +static uint32_t put_image(mp_image_t *mpi) +{ + // already out? + if ((mpi->flags & (MP_IMGFLAG_DIRECT | MP_IMGFLAG_DRAW_CALLBACK))) + return VO_TRUE; + return VO_FALSE; } static void uninit(void) @@ -403,6 +478,10 @@ static int control(uint32_t request, void *data, ...) switch (request) { case VOCTRL_QUERY_FORMAT: return query_format(*((uint32_t*)data)); + case VOCTRL_GET_IMAGE: + return get_image(data); + case VOCTRL_DRAW_IMAGE: + return put_image(data); } return VO_NOTIMPL; } diff --git a/version.sh b/version.sh index 44b5c5d..cf22a68 100755 --- a/version.sh +++ b/version.sh @@ -1,2 +1,2 @@ #!/bin/sh -echo "#define VERSION \"1.0rc1-$1\"" > version.h +echo "#define VERSION \"1.0rc1.atmel.2-$1\"" > version.h