From c02a6d9f5fd4feaaeac817d6941776ba051853d0 Mon Sep 17 00:00:00 2001 From: Ulf Samuelsson Date: Thu, 20 Sep 2007 21:52:23 +0000 Subject: Fix bad patch in MPlayer --- package/mplayer/mplayer-1.0rc1-atmel.2.patch | 6445 -------------------- package/mplayer/mplayer-1.0rc1-atmel.3.patch | 6444 +++++++++++++++++++ ...move-configuration-x11-header-search-path.patch | 11 - 3 files changed, 6444 insertions(+), 6456 deletions(-) delete mode 100644 package/mplayer/mplayer-1.0rc1-atmel.2.patch create mode 100644 package/mplayer/mplayer-1.0rc1-atmel.3.patch delete mode 100644 package/mplayer/mplayer-1.0rc1-remove-configuration-x11-header-search-path.patch diff --git a/package/mplayer/mplayer-1.0rc1-atmel.2.patch b/package/mplayer/mplayer-1.0rc1-atmel.2.patch deleted file mode 100644 index fc9f7b53c..000000000 --- a/package/mplayer/mplayer-1.0rc1-atmel.2.patch +++ /dev/null @@ -1,6445 +0,0 @@ - cfg-common.h | 4 + - cfg-mencoder.h | 4 + - cfg-mplayer.h | 4 + - configure | 13 +- - libaf/af_format.c | 7 + - libavcodec/Makefile | 7 + - libavcodec/avr32/dsputil_avr32.c | 2678 ++++++++++++++++++++++++++++++++++++++ - libavcodec/avr32/fdct.S | 541 ++++++++ - libavcodec/avr32/h264idct.S | 451 +++++++ - libavcodec/avr32/idct.S | 829 ++++++++++++ - libavcodec/avr32/mc.S | 434 ++++++ - libavcodec/avr32/pico.h | 260 ++++ - libavcodec/bitstream.h | 77 +- - libavcodec/dsputil.c | 3 + - libavcodec/h264.c | 15 + - libavutil/common.h | 16 + - libavutil/internal.h | 9 + - libfaad2/common.h | 2 +- - libmpcodecs/ad_libmad.c | 5 + - libswscale/pico-avr32.h | 137 ++ - libswscale/swscale_internal.h | 2 +- - libswscale/yuv2rgb.c | 14 + - libswscale/yuv2rgb_avr32.c | 416 ++++++ - libvo/vo_fbdev2.c | 101 ++- - version.sh | 2 +- - 25 files changed, 6011 insertions(+), 20 deletions(-) - create mode 100644 libavcodec/avr32/dsputil_avr32.c - create mode 100644 libavcodec/avr32/fdct.S - create mode 100644 libavcodec/avr32/h264idct.S - create mode 100644 libavcodec/avr32/idct.S - create mode 100644 
libavcodec/avr32/mc.S - create mode 100644 libavcodec/avr32/pico.h - create mode 100644 libswscale/pico-avr32.h - create mode 100644 libswscale/yuv2rgb_avr32.c - -diff --git a/cfg-common.h b/cfg-common.h -index 780df38..7d878a8 100644 ---- a/cfg-common.h -+++ b/cfg-common.h -@@ -235,6 +235,10 @@ - {"tsprobe", &ts_probe, CONF_TYPE_POSITION, 0, 0, TS_MAX_PROBE_SIZE, NULL}, - {"tskeepbroken", &ts_keep_broken, CONF_TYPE_FLAG, 0, 0, 1, NULL}, - -+#ifdef ARCH_AVR32 -+ {"use-pico", &avr32_use_pico, CONF_TYPE_FLAG, 0, 0, 1, NULL}, -+ {"nouse-pico", &avr32_use_pico, CONF_TYPE_FLAG, 0, 1, 0, NULL}, -+#endif - // draw by slices or whole frame (useful with libmpeg2/libavcodec) - {"slices", &vd_use_slices, CONF_TYPE_FLAG, 0, 0, 1, NULL}, - {"noslices", &vd_use_slices, CONF_TYPE_FLAG, 0, 1, 0, NULL}, -diff --git a/cfg-mencoder.h b/cfg-mencoder.h -index 411b748..addf791 100644 ---- a/cfg-mencoder.h -+++ b/cfg-mencoder.h -@@ -5,6 +5,10 @@ - - #include "cfg-common.h" - -+#ifdef ARCH_AVR32 -+extern int avr32_use_pico; -+#endif -+ - #ifdef USE_FAKE_MONO - extern int fakemono; // defined in dec_audio.c - #endif -diff --git a/cfg-mplayer.h b/cfg-mplayer.h -index 62b6eac..31499c2 100644 ---- a/cfg-mplayer.h -+++ b/cfg-mplayer.h -@@ -4,6 +4,10 @@ - - #include "cfg-common.h" - -+#ifdef ARCH_AVR32 -+extern int avr32_use_pico; -+#endif -+ - extern int noconsolecontrols; - - #if defined(HAVE_FBDEV)||defined(HAVE_VESA) -diff --git a/configure b/configure -index 29002c8..56c6fe4 100755 ---- a/configure -+++ b/configure -@@ -1203,6 +1203,15 @@ EOF - _optimizing="$proc" - ;; - -+ avr32) -+ _def_arch='#define ARCH_AVR32' -+ _target_arch='TARGET_ARCH_AVR32 = yes' -+ iproc='avr32' -+ proc='' -+ _march='' -+ _mcpu='' -+ _optimizing='' -+ ;; - arm|armv4l|armv5tel) - _def_arch='#define ARCH_ARMV4L 1' - _target_arch='TARGET_ARCH_ARMV4L = yes' -@@ -1533,7 +1542,7 @@ echores $_named_asm_args - # Checking for CFLAGS - _stripbinaries=yes - if test "$_profile" != "" || test "$_debug" != "" ; then -- 
CFLAGS="-W -Wall -O2 $_march $_mcpu $_debug $_profile" -+ CFLAGS="-W -Wall -O4 $_march $_mcpu $_debug $_profile" - if test "$_cc_major" -ge "3" ; then - CFLAGS=`echo "$CFLAGS" | sed -e 's/\(-Wall\)/\1 -Wno-unused-parameter/'` - fi -@@ -3794,7 +3803,7 @@ fi - - - echocheck "X11 headers presence" -Note that this hunk is a dulicate between the other patch -- for I in `echo $_inc_extra | sed s/-I//g` /usr/X11/include /usr/X11R6/include /usr/include/X11R6 /usr/include /usr/openwin/include ; do -+ for I in `echo $_inc_extra | sed s/-I//g`; do - if test -f "$I/X11/Xlib.h" ; then - _inc_x11="-I$I" - _x11_headers="yes" -diff --git a/libaf/af_format.c b/libaf/af_format.c -index e5b7cc9..5d7ea6d 100644 ---- a/libaf/af_format.c -+++ b/libaf/af_format.c -@@ -20,7 +20,14 @@ - // Integer to float conversion through lrintf() - #ifdef HAVE_LRINTF - #include -+ -+#ifdef ARCH_AVR32 -+#define lrintf(x) rint(x) -+#define llrint(x) (long long)rint(x) -+#else - long int lrintf(float); -+#endif -+ - #else - #define lrintf(x) ((int)(x)) - #endif -diff --git a/libavcodec/Makefile b/libavcodec/Makefile -index 17b6c45..8e1dc96 100644 ---- a/libavcodec/Makefile -+++ b/libavcodec/Makefile -@@ -360,6 +360,12 @@ OBJS-$(TARGET_ARCH_SPARC) += sparc/dsputil_vis.o \ - - sparc/dsputil_vis.o: CFLAGS += -mcpu=ultrasparc -mtune=ultrasparc - -+# avr32 specific stuff -+ifeq ($(TARGET_ARCH_AVR32),yes) -+ASM_OBJS += avr32/idct.o avr32/fdct.o avr32/mc.o avr32/h264idct.o -+OBJS += avr32/dsputil_avr32.o -+endif -+ - # sun mediaLib specific stuff - OBJS-$(HAVE_MLIB) += mlib/dsputil_mlib.o \ - -@@ -419,6 +425,7 @@ tests: apiexample $(TESTS) - clean:: - rm -f \ - i386/*.o i386/*~ \ -+ avr32/*.o avr32/*~ \ - armv4l/*.o armv4l/*~ \ - mlib/*.o mlib/*~ \ - alpha/*.o alpha/*~ \ -diff --git a/libavcodec/avr32/dsputil_avr32.c b/libavcodec/avr32/dsputil_avr32.c -new file mode 100644 -index 0000000..200284d ---- /dev/null -+++ b/libavcodec/avr32/dsputil_avr32.c -@@ -0,0 +1,2678 @@ -+/* -+ * Copyright (c) 2007 Atmel 
Corporation. All rights reserved. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * -+ * 2. Redistributions in binary form must reproduce the above -+ * copyright notice, this list of conditions and the following -+ * disclaimer in the documentation and/or other materials provided -+ * with the distribution. -+ * -+ * 3. The name of ATMEL may not be used to endorse or promote products -+ * derived from this software without specific prior written -+ * permission. -+ * -+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR -+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL -+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, -+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH -+ * DAMAGE. 
-+ */ -+ -+#include "../dsputil.h" -+#include "pico.h" -+ -+int avr32_use_pico = 1; -+ -+//#define CHECK_DSP_FUNCS_AGAINST_C -+ -+#ifdef CHECK_DSP_FUNCS_AGAINST_C -+#define DSP_FUNC_NAME(name) test_ ## name -+#else -+#define DSP_FUNC_NAME(name) name -+#endif -+ -+union doubleword { -+ int64_t doubleword; -+ struct { -+ int32_t top; -+ int32_t bottom; -+ } words; -+}; -+ -+#undef LD16 -+#undef LD32 -+#undef LD64 -+ -+#define LD16(a) (*((uint16_t*)(a))) -+#define LD32(a) (*((uint32_t*)(a))) -+#define LD64(a) (*((uint64_t*)(a))) -+#define LD64_UNALIGNED(a) \ -+ ({ union doubleword __tmp__; \ -+ __tmp__.words.top = LD32(a); \ -+ __tmp__.words.bottom = LD32(a + 4); \ -+ __tmp__.doubleword; }) -+ -+#undef ST32 -+#undef ST16 -+ -+#define ST16(a, b) *((uint16_t*)(a)) = (b) -+#define ST32(a, b) *((uint32_t*)(a)) = (b) -+ -+#undef rnd_avg32 -+#define rnd_avg32(a, b) \ -+ ({ uint32_t __tmp__;\ -+ asm("pavg.ub\t%0, %1, %2" : "=r"(__tmp__) : "r"(a), "r"(b));\ -+ __tmp__;}) -+ -+void idct_avr32(DCTELEM *data); -+void fdct_avr32(DCTELEM *data); -+ -+void idct_put_avr32(uint8_t *dest, int line_size, DCTELEM *data); -+void idct_add_avr32(uint8_t *dest, int line_size, DCTELEM *data); -+ -+void h264_idct_add_avr32(uint8_t *dest, DCTELEM *data, int stride); -+void h264_idct8_add_avr32(uint8_t *dest, DCTELEM *data, int stride); -+ -+#define extern_dspfunc(PFX, NUM) \ -+ void PFX ## _pixels ## NUM ## _avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \ -+ void PFX ## _pixels ## NUM ## _h_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \ -+ void PFX ## _pixels ## NUM ## _v_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \ -+ void PFX ## _pixels ## NUM ## _hv_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ) -+ -+extern_dspfunc(put, 8); -+extern_dspfunc(put_no_rnd, 8); -+extern_dspfunc(avg, 8); -+extern_dspfunc(avg_no_rnd, 8); -+#undef extern_dspfunc -+ -+#ifdef CHECK_DSP_FUNCS_AGAINST_C -+#define extern_dspfunc(PFX, 
NUM) \ -+ void PFX ## _pixels ## NUM ## _c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \ -+ void PFX ## _pixels ## NUM ## _x2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \ -+ void PFX ## _pixels ## NUM ## _y2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \ -+ void PFX ## _pixels ## NUM ## _xy2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ) -+ -+extern_dspfunc(put, 4); -+extern_dspfunc(put_no_rnd, 4); -+extern_dspfunc(put, 8); -+extern_dspfunc(put_no_rnd, 8); -+extern_dspfunc(put, 16); -+extern_dspfunc(put_no_rnd, 16); -+extern_dspfunc(avg, 8); -+extern_dspfunc(avg_no_rnd, 8); -+extern_dspfunc(avg, 16); -+extern_dspfunc(avg_no_rnd, 16); -+ -+ -+#undef extern_dspfunc -+#define extern_dspfunc(PFX, NUM) \ -+void PFX ## NUM ## _mc00_c(uint8_t *dst, uint8_t *src, int stride); \ -+void PFX ## NUM ## _mc10_c(uint8_t *dst, uint8_t *src, int stride); \ -+void PFX ## NUM ## _mc20_c(uint8_t *dst, uint8_t *src, int stride); \ -+void PFX ## NUM ## _mc30_c(uint8_t *dst, uint8_t *src, int stride); \ -+void PFX ## NUM ## _mc01_c(uint8_t *dst, uint8_t *src, int stride); \ -+void PFX ## NUM ## _mc11_c(uint8_t *dst, uint8_t *src, int stride); \ -+void PFX ## NUM ## _mc21_c(uint8_t *dst, uint8_t *src, int stride); \ -+void PFX ## NUM ## _mc31_c(uint8_t *dst, uint8_t *src, int stride); \ -+void PFX ## NUM ## _mc02_c(uint8_t *dst, uint8_t *src, int stride); \ -+void PFX ## NUM ## _mc12_c(uint8_t *dst, uint8_t *src, int stride); \ -+void PFX ## NUM ## _mc22_c(uint8_t *dst, uint8_t *src, int stride); \ -+void PFX ## NUM ## _mc32_c(uint8_t *dst, uint8_t *src, int stride); \ -+void PFX ## NUM ## _mc03_c(uint8_t *dst, uint8_t *src, int stride); \ -+void PFX ## NUM ## _mc13_c(uint8_t *dst, uint8_t *src, int stride); \ -+void PFX ## NUM ## _mc23_c(uint8_t *dst, uint8_t *src, int stride); \ -+void PFX ## NUM ## _mc33_c(uint8_t *dst, uint8_t *src, int stride); \ -+ -+extern_dspfunc(put_h264_qpel, 16); 
-+extern_dspfunc(put_h264_qpel, 8); -+extern_dspfunc(put_h264_qpel, 4); -+extern_dspfunc(avg_h264_qpel, 16); -+extern_dspfunc(avg_h264_qpel, 8); -+extern_dspfunc(avg_h264_qpel, 4); -+ -+#undef extern_dspfunc -+ -+void put_h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y); -+void put_h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y); -+void put_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y); -+ -+void avg_h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y); -+void avg_h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y); -+void avg_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y); -+ -+ -+void dump_block8(uint8_t *block, int line_size, int h); -+void dump_block4(uint8_t *block, int line_size, int h); -+void dump_block(uint8_t *block, int line_size, int h, int w); -+ -+void check_block8(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct, -+ int h, char *name, int max_dev); -+void check_block4(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct, -+ int h, char *name, int max_dev); -+void check_block(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct, -+ int h, int width, char *name, int max_dev); -+ -+#define PIXOP2( OPNAME, OP ) \ -+void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ -+ int i;\ -+ for(i=0; i> 16)); -+ ST16(dst + 1*dstStride, (short)PICO_GET_W(PICO_OUTPIX0)); -+ -+ -+ PICO_LDCM_W_INC(tmp, -+ PICO_REGVECT_VMU0_OUT, -+ PICO_REGVECT_VMU1_OUT, -+ PICO_REGVECT_VMU2_OUT); -+ PICO_MVRC_W(PICO_INPIX0, src0); -+ PICO_MVRC_W(PICO_INPIX1, src1); -+ PICO_MVRC_W(PICO_INPIX2, src2); -+ PICO_OP(PICO_USE_ACC, 0, 6, 3, 0); -+ PICO_MVRC_W(PICO_INPIX2, src3); -+ 
PICO_MVRC_W(PICO_INPIX1, src4); -+ PICO_MVRC_W(PICO_INPIX0, src5); -+ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 0, 6, 3, 0); -+ -+ PICO_LDCM_W_INC(tmp, -+ PICO_REGVECT_VMU0_OUT, -+ PICO_REGVECT_VMU1_OUT, -+ PICO_REGVECT_VMU2_OUT); -+ PICO_OP(PICO_USE_ACC, 1, 9, 6, 3); -+ PICO_MVRC_W(PICO_INPIX0, src0); -+ PICO_MVRC_W(PICO_INPIX1, src1); -+ PICO_MVRC_W(PICO_INPIX2, src2); -+ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 1, 9, 6, 3); -+ -+ PICO_LDCM_W_INC(tmp, -+ PICO_REGVECT_VMU0_OUT, -+ PICO_REGVECT_VMU1_OUT, -+ PICO_REGVECT_VMU2_OUT); -+ PICO_MVRC_W(PICO_INPIX0, src1); -+ PICO_MVRC_W(PICO_INPIX1, src2); -+ PICO_MVRC_W(PICO_INPIX2, src3); -+ PICO_OP(PICO_USE_ACC, 2, 6, 3, 0); -+ PICO_MVRC_W(PICO_INPIX2, src4); -+ PICO_MVRC_W(PICO_INPIX1, src5); -+ PICO_MVRC_W(PICO_INPIX0, src6); -+ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 2, 6, 3, 0); -+ -+ PICO_LDCM_W_INC(tmp, -+ PICO_REGVECT_VMU0_OUT, -+ PICO_REGVECT_VMU1_OUT, -+ PICO_REGVECT_VMU2_OUT); -+ PICO_OP(PICO_USE_ACC, 3, 9, 6, 3); -+ PICO_MVRC_W(PICO_INPIX0, src1); -+ PICO_MVRC_W(PICO_INPIX1, src2); -+ PICO_MVRC_W(PICO_INPIX2, src3); -+ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 3, 9, 6, 3); -+ -+ ST16(dst + 2*dstStride, (short)(PICO_GET_W(PICO_OUTPIX0) >> 16)); -+ ST16(dst + 3*dstStride, (short)PICO_GET_W(PICO_OUTPIX0)); -+ -+ dst += 2; -+ src += 2; -+ } -+} -+ -+ -+ -+ -+static void avg_h264_qpel4_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ -+ -+ int32_t tmp_block[48]; -+ int32_t *tmp = tmp_block; -+ int i; -+ -+ set_pico_config(&h264_qpel4_hv_lowpass_config); -+ -+ src -= 2; -+ for ( i = 0; i < 2; i++ ){ -+ int srcB= LD32(src - 2*srcStride); -+ int srcA= LD32(src - 1*srcStride); -+ int src0= LD32(src + 0 *srcStride); -+ int src1= LD32(src + 1 *srcStride); -+ int src2= LD32(src + 2 *srcStride); -+ int src3= LD32(src + 3 *srcStride); -+ int src4= LD32(src + 4 *srcStride); -+ int src5= LD32(src + 5 *srcStride); -+ int src6= LD32(src + 6 *srcStride); -+ -+ PICO_MVRC_W(PICO_INPIX0, srcB); -+ 
PICO_MVRC_W(PICO_INPIX1, srcA); -+ PICO_MVRC_W(PICO_INPIX2, src0); -+ PICO_OP(0, 0, 0, 4, 8); -+ PICO_MVRC_W(PICO_INPIX2, src1); -+ PICO_MVRC_W(PICO_INPIX1, src2); -+ PICO_MVRC_W(PICO_INPIX0, src3); -+ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8); -+ PICO_STCM_W(tmp, -+ PICO_REGVECT_VMU0_OUT, -+ PICO_REGVECT_VMU1_OUT, -+ PICO_REGVECT_VMU2_OUT); -+ tmp += 3; -+ -+ PICO_OP(0, 0, 1, 5, 9); -+ PICO_MVRC_W(PICO_INPIX0, srcB); -+ PICO_MVRC_W(PICO_INPIX1, srcA); -+ PICO_MVRC_W(PICO_INPIX2, src0); -+ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9); -+ PICO_STCM_W(tmp, -+ PICO_REGVECT_VMU0_OUT, -+ PICO_REGVECT_VMU1_OUT, -+ PICO_REGVECT_VMU2_OUT); -+ tmp += 3; -+ -+ PICO_MVRC_W(PICO_INPIX0, src1); -+ PICO_OP(0, 0, 4, 8, 0); -+ PICO_MVRC_W(PICO_INPIX2, src2); -+ PICO_MVRC_W(PICO_INPIX1, src3); -+ PICO_MVRC_W(PICO_INPIX0, src4); -+ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8); -+ PICO_STCM_W(tmp, -+ PICO_REGVECT_VMU0_OUT, -+ PICO_REGVECT_VMU1_OUT, -+ PICO_REGVECT_VMU2_OUT); -+ tmp += 3; -+ -+ PICO_OP(0, 0, 1, 5, 9); -+ PICO_MVRC_W(PICO_INPIX0, srcA); -+ PICO_MVRC_W(PICO_INPIX1, src0); -+ PICO_MVRC_W(PICO_INPIX2, src1); -+ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9); -+ PICO_STCM_W(tmp, -+ PICO_REGVECT_VMU0_OUT, -+ PICO_REGVECT_VMU1_OUT, -+ PICO_REGVECT_VMU2_OUT); -+ tmp += 3; -+ -+ PICO_MVRC_W(PICO_INPIX0, src2); -+ PICO_OP(0, 0, 4, 8, 0); -+ PICO_MVRC_W(PICO_INPIX2, src3); -+ PICO_MVRC_W(PICO_INPIX1, src4); -+ PICO_MVRC_W(PICO_INPIX0, src5); -+ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8); -+ PICO_STCM_W(tmp, -+ PICO_REGVECT_VMU0_OUT, -+ PICO_REGVECT_VMU1_OUT, -+ PICO_REGVECT_VMU2_OUT); -+ tmp += 3; -+ -+ PICO_OP(0, 0, 1, 5, 9); -+ PICO_MVRC_W(PICO_INPIX0, src0); -+ PICO_MVRC_W(PICO_INPIX1, src1); -+ PICO_MVRC_W(PICO_INPIX2, src2); -+ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9); -+ PICO_STCM_W(tmp, -+ PICO_REGVECT_VMU0_OUT, -+ PICO_REGVECT_VMU1_OUT, -+ PICO_REGVECT_VMU2_OUT); -+ tmp += 3; -+ -+ PICO_MVRC_W(PICO_INPIX0, src3); -+ PICO_OP(0, 0, 4, 8, 0); -+ PICO_MVRC_W(PICO_INPIX2, src4); -+ PICO_MVRC_W(PICO_INPIX1, src5); -+ 
PICO_MVRC_W(PICO_INPIX0, src6); -+ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8); -+ PICO_STCM_W(tmp, -+ PICO_REGVECT_VMU0_OUT, -+ PICO_REGVECT_VMU1_OUT, -+ PICO_REGVECT_VMU2_OUT); -+ tmp += 3; -+ -+ PICO_OP(0, 0, 1, 5, 9); -+ PICO_MVRC_W(PICO_INPIX0, src1); -+ PICO_MVRC_W(PICO_INPIX1, src2); -+ PICO_MVRC_W(PICO_INPIX2, src3); -+ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9); -+ PICO_STCM_W(tmp, -+ PICO_REGVECT_VMU0_OUT, -+ PICO_REGVECT_VMU1_OUT, -+ PICO_REGVECT_VMU2_OUT); -+ tmp += 3; -+ src += 2; -+ } -+ -+ src -= 1; -+ tmp -= 48; -+ -+ -+ PICO_PUT_W(PICO_CONFIG, -+ PICO_OUTPUT_MODE(PICO_PLANAR_MODE) -+ | PICO_INPUT_MODE(PICO_VERT_FILTER_MODE) -+ | PICO_COEFF_FRAC_BITS(10) -+ | PICO_OFFSET_FRAC_BITS(10)); -+ -+ for ( i = 0; i < 2; i++ ){ -+ int srcB= LD32(src - 2*srcStride); -+ int srcA= LD32(src - 1*srcStride); -+ int src0= LD32(src + 0 *srcStride); -+ int src1= LD32(src + 1 *srcStride); -+ int src2= LD32(src + 2 *srcStride); -+ int src3= LD32(src + 3 *srcStride); -+ int src4= LD32(src + 4 *srcStride); -+ int src5= LD32(src + 5 *srcStride); -+ int src6= LD32(src + 6 *srcStride); -+ -+ PICO_LDCM_W_INC(tmp, -+ PICO_REGVECT_VMU0_OUT, -+ PICO_REGVECT_VMU1_OUT, -+ PICO_REGVECT_VMU2_OUT); -+ PICO_MVRC_W(PICO_INPIX0, srcB); -+ PICO_MVRC_W(PICO_INPIX1, srcA); -+ PICO_MVRC_W(PICO_INPIX2, src0); -+ PICO_OP(PICO_USE_ACC, 0, 6, 3, 0); -+ PICO_MVRC_W(PICO_INPIX2, src1); -+ PICO_MVRC_W(PICO_INPIX1, src2); -+ PICO_MVRC_W(PICO_INPIX0, src3); -+ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 0, 6, 3, 0); -+ -+ PICO_LDCM_W_INC(tmp, -+ PICO_REGVECT_VMU0_OUT, -+ PICO_REGVECT_VMU1_OUT, -+ PICO_REGVECT_VMU2_OUT); -+ PICO_OP(PICO_USE_ACC, 1, 9, 6, 3); -+ PICO_MVRC_W(PICO_INPIX0, srcB); -+ PICO_MVRC_W(PICO_INPIX1, srcA); -+ PICO_MVRC_W(PICO_INPIX2, src0); -+ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 1, 9, 6, 3); -+ -+ PICO_LDCM_W_INC(tmp, -+ PICO_REGVECT_VMU0_OUT, -+ PICO_REGVECT_VMU1_OUT, -+ PICO_REGVECT_VMU2_OUT); -+ PICO_MVRC_W(PICO_INPIX0, srcA); -+ PICO_MVRC_W(PICO_INPIX1, src0); -+ PICO_MVRC_W(PICO_INPIX2, 
src1); -+ PICO_OP(PICO_USE_ACC, 2, 6, 3, 0); -+ PICO_MVRC_W(PICO_INPIX2, src2); -+ PICO_MVRC_W(PICO_INPIX1, src3); -+ PICO_MVRC_W(PICO_INPIX0, src4); -+ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 2, 6, 3, 0); -+ -+ PICO_LDCM_W_INC(tmp, -+ PICO_REGVECT_VMU0_OUT, -+ PICO_REGVECT_VMU1_OUT, -+ PICO_REGVECT_VMU2_OUT); -+ PICO_OP(PICO_USE_ACC, 3, 9, 6, 3); -+ PICO_MVRC_W(PICO_INPIX0, srcA); -+ PICO_MVRC_W(PICO_INPIX1, src0); -+ PICO_MVRC_W(PICO_INPIX2, src1); -+ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 3, 9, 6, 3); -+ -+ ST16(dst + 0*dstStride, rnd_avg32(LD16(dst + 0*dstStride), PICO_GET_W(PICO_OUTPIX0) >> 16)); -+ ST16(dst + 1*dstStride, rnd_avg32(LD16(dst + 1*dstStride), PICO_GET_W(PICO_OUTPIX0))); -+ -+ -+ PICO_LDCM_W_INC(tmp, -+ PICO_REGVECT_VMU0_OUT, -+ PICO_REGVECT_VMU1_OUT, -+ PICO_REGVECT_VMU2_OUT); -+ PICO_MVRC_W(PICO_INPIX0, src0); -+ PICO_MVRC_W(PICO_INPIX1, src1); -+ PICO_MVRC_W(PICO_INPIX2, src2); -+ PICO_OP(PICO_USE_ACC, 0, 6, 3, 0); -+ PICO_MVRC_W(PICO_INPIX2, src3); -+ PICO_MVRC_W(PICO_INPIX1, src4); -+ PICO_MVRC_W(PICO_INPIX0, src5); -+ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 0, 6, 3, 0); -+ -+ PICO_LDCM_W_INC(tmp, -+ PICO_REGVECT_VMU0_OUT, -+ PICO_REGVECT_VMU1_OUT, -+ PICO_REGVECT_VMU2_OUT); -+ PICO_OP(PICO_USE_ACC, 1, 9, 6, 3); -+ PICO_MVRC_W(PICO_INPIX0, src0); -+ PICO_MVRC_W(PICO_INPIX1, src1); -+ PICO_MVRC_W(PICO_INPIX2, src2); -+ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 1, 9, 6, 3); -+ -+ PICO_LDCM_W_INC(tmp, -+ PICO_REGVECT_VMU0_OUT, -+ PICO_REGVECT_VMU1_OUT, -+ PICO_REGVECT_VMU2_OUT); -+ PICO_MVRC_W(PICO_INPIX0, src1); -+ PICO_MVRC_W(PICO_INPIX1, src2); -+ PICO_MVRC_W(PICO_INPIX2, src3); -+ PICO_OP(PICO_USE_ACC, 2, 6, 3, 0); -+ PICO_MVRC_W(PICO_INPIX2, src4); -+ PICO_MVRC_W(PICO_INPIX1, src5); -+ PICO_MVRC_W(PICO_INPIX0, src6); -+ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 2, 6, 3, 0); -+ -+ PICO_LDCM_W_INC(tmp, -+ PICO_REGVECT_VMU0_OUT, -+ PICO_REGVECT_VMU1_OUT, -+ PICO_REGVECT_VMU2_OUT); -+ PICO_OP(PICO_USE_ACC, 3, 9, 6, 3); -+ 
PICO_MVRC_W(PICO_INPIX0, src1); -+ PICO_MVRC_W(PICO_INPIX1, src2); -+ PICO_MVRC_W(PICO_INPIX2, src3); -+ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 3, 9, 6, 3); -+ -+ ST16(dst + 2*dstStride, rnd_avg32(LD16(dst + 2*dstStride), PICO_GET_W(PICO_OUTPIX0) >> 16)); -+ ST16(dst + 3*dstStride, rnd_avg32(LD16(dst + 3*dstStride), PICO_GET_W(PICO_OUTPIX0))); -+ -+ dst += 2; -+ src += 2; -+ } -+} -+ -+ -+static void put_h264_qpel8_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ -+ put_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride); -+ put_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride); -+ src += 4*srcStride; -+ dst += 4*dstStride; -+ put_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride); -+ put_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride); -+} -+ -+static void avg_h264_qpel8_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ -+ avg_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride); -+ avg_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride); -+ src += 4*srcStride; -+ dst += 4*dstStride; -+ avg_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride); -+ avg_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride); -+} -+ -+static void put_h264_qpel8_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ -+ put_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride); -+ put_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, srcStride); -+ src += 4*srcStride; -+ dst += 4*dstStride; -+ put_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride); -+ put_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, srcStride); -+} -+ -+static void avg_h264_qpel8_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ -+ avg_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride); -+ avg_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, srcStride); -+ src += 4*srcStride; -+ dst += 4*dstStride; -+ 
avg_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride); -+ avg_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, srcStride); -+} -+ -+static void put_h264_qpel8_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ -+ put_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride); -+ put_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride); -+ src += 4*srcStride; -+ dst += 4*dstStride; -+ put_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride); -+ put_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride); -+} -+ -+static void avg_h264_qpel8_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ -+ avg_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride); -+ avg_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride); -+ src += 4*srcStride; -+ dst += 4*dstStride; -+ avg_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride); -+ avg_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride); -+} -+ -+static void put_h264_qpel16_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ -+ put_h264_qpel8_v_lowpass_pico(dst , src , dstStride, srcStride); -+ put_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride); -+ src += 8*srcStride; -+ dst += 8*dstStride; -+ put_h264_qpel8_v_lowpass_pico(dst , src , dstStride, srcStride); -+ put_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride); -+} -+ -+static void avg_h264_qpel16_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ -+ avg_h264_qpel8_v_lowpass_pico(dst , src , dstStride, srcStride); -+ avg_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride); -+ src += 8*srcStride; -+ dst += 8*dstStride; -+ avg_h264_qpel8_v_lowpass_pico(dst , src , dstStride, srcStride); -+ avg_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride); -+} -+ -+static void put_h264_qpel16_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ -+ 
put_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride); -+ put_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride); -+ src += 8*srcStride; -+ dst += 8*dstStride; -+ put_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride); -+ put_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride); -+} -+ -+static void avg_h264_qpel16_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ -+ avg_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride); -+ avg_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride); -+ src += 8*srcStride; -+ dst += 8*dstStride; -+ avg_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride); -+ avg_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride); -+} -+ -+static void put_h264_qpel16_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ -+ put_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride); -+ put_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride); -+ src += 8*srcStride; -+ dst += 8*dstStride; -+ put_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride); -+ put_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride); -+} -+ -+static void avg_h264_qpel16_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ -+ avg_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride); -+ avg_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride); -+ src += 8*srcStride; -+ dst += 8*dstStride; -+ avg_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride); -+ avg_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride); -+} -+ -+ -+#define H264_MC(OPNAME, SIZE) \ -+static void OPNAME ## h264_qpel ## SIZE ## _mc00_pico (uint8_t *dst, uint8_t *src, int stride){\ -+ OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\ -+}\ -+\ -+static void OPNAME ## h264_qpel ## SIZE ## _mc10_pico(uint8_t *dst, uint8_t *src, int stride){\ -+ uint8_t half[SIZE*SIZE];\ -+ put_h264_qpel ## SIZE ## 
_h_lowpass_pico(half, src, SIZE, stride);\ -+ OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\ -+}\ -+\ -+static void OPNAME ## h264_qpel ## SIZE ## _mc20_pico(uint8_t *dst, uint8_t *src, int stride){\ -+ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_pico(dst, src, stride, stride);\ -+}\ -+\ -+static void OPNAME ## h264_qpel ## SIZE ## _mc30_pico(uint8_t *dst, uint8_t *src, int stride){\ -+ uint8_t half[SIZE*SIZE];\ -+ put_h264_qpel ## SIZE ## _h_lowpass_pico(half, src, SIZE, stride);\ -+ OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\ -+}\ -+\ -+static void OPNAME ## h264_qpel ## SIZE ## _mc01_pico(uint8_t *dst, uint8_t *src, int stride){\ -+ uint8_t full[SIZE*(SIZE+5)];\ -+ uint8_t * const full_mid= full + SIZE*2;\ -+ uint8_t half[SIZE*SIZE];\ -+ copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ -+ put_h264_qpel ## SIZE ## _v_lowpass_pico(half, full_mid, SIZE, SIZE);\ -+ OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\ -+}\ -+\ -+static void OPNAME ## h264_qpel ## SIZE ## _mc02_pico(uint8_t *dst, uint8_t *src, int stride){\ -+ uint8_t full[SIZE*(SIZE+5)];\ -+ uint8_t * const full_mid= full + SIZE*2;\ -+ copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ -+ OPNAME ## h264_qpel ## SIZE ## _v_lowpass_pico(dst, full_mid, stride, SIZE);\ -+}\ -+\ -+static void OPNAME ## h264_qpel ## SIZE ## _mc03_pico(uint8_t *dst, uint8_t *src, int stride){\ -+ uint8_t full[SIZE*(SIZE+5)];\ -+ uint8_t * const full_mid= full + SIZE*2;\ -+ uint8_t half[SIZE*SIZE];\ -+ copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ -+ put_h264_qpel ## SIZE ## _v_lowpass_pico(half, full_mid, SIZE, SIZE);\ -+ OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\ -+}\ -+\ -+static void OPNAME ## h264_qpel ## SIZE ## _mc11_pico(uint8_t *dst, uint8_t *src, int stride){\ -+ uint8_t full[SIZE*(SIZE+5)];\ -+ uint8_t * const full_mid= 
full + SIZE*2;\ -+ uint8_t halfH[SIZE*SIZE];\ -+ uint8_t halfV[SIZE*SIZE];\ -+ put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\ -+ copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ -+ put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\ -+ OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\ -+}\ -+\ -+static void OPNAME ## h264_qpel ## SIZE ## _mc31_pico(uint8_t *dst, uint8_t *src, int stride){\ -+ uint8_t full[SIZE*(SIZE+5)];\ -+ uint8_t * const full_mid= full + SIZE*2;\ -+ uint8_t halfH[SIZE*SIZE];\ -+ uint8_t halfV[SIZE*SIZE];\ -+ put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\ -+ copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\ -+ put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\ -+ OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\ -+}\ -+\ -+static void OPNAME ## h264_qpel ## SIZE ## _mc13_pico(uint8_t *dst, uint8_t *src, int stride){\ -+ uint8_t full[SIZE*(SIZE+5)];\ -+ uint8_t * const full_mid= full + SIZE*2;\ -+ uint8_t halfH[SIZE*SIZE];\ -+ uint8_t halfV[SIZE*SIZE];\ -+ put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\ -+ copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ -+ put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\ -+ OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\ -+}\ -+\ -+static void OPNAME ## h264_qpel ## SIZE ## _mc33_pico(uint8_t *dst, uint8_t *src, int stride){\ -+ uint8_t full[SIZE*(SIZE+5)];\ -+ uint8_t * const full_mid= full + SIZE*2;\ -+ uint8_t halfH[SIZE*SIZE];\ -+ uint8_t halfV[SIZE*SIZE];\ -+ put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\ -+ copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\ -+ put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\ -+ OPNAME ## pixels ## SIZE ## _l2(dst, 
halfH, halfV, stride, SIZE, SIZE, SIZE);\ -+}\ -+\ -+static void OPNAME ## h264_qpel ## SIZE ## _mc22_pico(uint8_t *dst, uint8_t *src, int stride){\ -+ OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_pico(dst, src, stride, stride);\ -+}\ -+\ -+static void OPNAME ## h264_qpel ## SIZE ## _mc21_pico(uint8_t *dst, uint8_t *src, int stride){\ -+ uint8_t halfH[SIZE*SIZE];\ -+ uint8_t halfHV[SIZE*SIZE];\ -+ put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\ -+ put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\ -+ OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\ -+}\ -+\ -+static void OPNAME ## h264_qpel ## SIZE ## _mc23_pico(uint8_t *dst, uint8_t *src, int stride){\ -+ uint8_t halfH[SIZE*SIZE];\ -+ uint8_t halfHV[SIZE*SIZE];\ -+ put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\ -+ put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\ -+ OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\ -+}\ -+\ -+static void OPNAME ## h264_qpel ## SIZE ## _mc12_pico(uint8_t *dst, uint8_t *src, int stride){\ -+ uint8_t full[SIZE*(SIZE+5)];\ -+ uint8_t * const full_mid= full + SIZE*2;\ -+ uint8_t halfV[SIZE*SIZE];\ -+ uint8_t halfHV[SIZE*SIZE];\ -+ copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ -+ put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\ -+ put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\ -+ OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\ -+}\ -+\ -+static void OPNAME ## h264_qpel ## SIZE ## _mc32_pico(uint8_t *dst, uint8_t *src, int stride){\ -+ uint8_t full[SIZE*(SIZE+5)];\ -+ uint8_t * const full_mid= full + SIZE*2;\ -+ uint8_t halfV[SIZE*SIZE];\ -+ uint8_t halfHV[SIZE*SIZE];\ -+ copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\ -+ put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\ -+ put_h264_qpel ## 
SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\ -+ OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\ -+}\ -+ -+H264_MC(put_, 4) -+H264_MC(put_, 8) -+H264_MC(put_, 16) -+H264_MC(avg_, 4) -+H264_MC(avg_, 8) -+H264_MC(avg_, 16) -+ -+ -+ -+#define dspfunc16(PFX) \ -+ void PFX ## _pixels16_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \ -+ PFX ## _pixels8_avr32(dst, pixels, line_size, h);\ -+ PFX ## _pixels8_avr32(dst + 8, pixels + 8, line_size, h);\ -+ }\ -+ void PFX ## _pixels16_h_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \ -+ PFX ## _pixels8_h_avr32(dst, pixels, line_size, h);\ -+ PFX ## _pixels8_h_avr32(dst + 8, pixels + 8, line_size, h);\ -+ }\ -+ void PFX ## _pixels16_v_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \ -+ PFX ## _pixels8_v_avr32(dst, pixels, line_size, h);\ -+ PFX ## _pixels8_v_avr32(dst + 8, pixels + 8, line_size, h);\ -+ }\ -+ void PFX ## _pixels16_hv_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \ -+ PFX ## _pixels8_hv_avr32(dst, pixels, line_size, h);\ -+ PFX ## _pixels8_hv_avr32(dst + 8, pixels + 8, line_size, h);\ -+ }\ -+ -+ -+dspfunc16(put) -+dspfunc16(put_no_rnd) -+dspfunc16(avg) -+dspfunc16(avg_no_rnd) -+#undef dspfunc16 -+ -+static int pix_sum_avr32(uint8_t * pix, int line_size) -+{ -+ int s, i; -+ -+ s = 0; -+ for (i = 0; i < 16; i++) { -+ int tmp1,tmp2,tmp3,tmp4,tmp5; -+ __asm__ volatile ( "ld.w\t%0, %6[0]\n\t" -+ "ld.w\t%1, %6[4]\n\t" -+ "ld.w\t%2, %6[8]\n\t" -+ "ld.w\t%3, %6[12]\n\t" -+ "punpckub.h\t%4, %0:t\n\t" -+ "padd.h\t%5, %5, %4\n\t" -+ "punpckub.h\t%4, %0:b\n\t" -+ "padd.h\t%5, %5, %4\n\t" -+ "punpckub.h\t%4, %1:t\n\t" -+ "padd.h\t%5, %5, %4\n\t" -+ "punpckub.h\t%4, %1:b\n\t" -+ "padd.h\t%5, %5, %4\n\t" -+ "punpckub.h\t%4, %2:t\n\t" -+ "padd.h\t%5, %5, %4\n\t" -+ "punpckub.h\t%4, %2:b\n\t" -+ "padd.h\t%5, %5, %4\n\t" -+ "punpckub.h\t%4, %3:t\n\t" -+ "padd.h\t%5, %5, %4\n\t" -+ "punpckub.h\t%4, %3:b\n\t" 
-+ "padd.h\t%5, %5, %4\n\t" -+ : "=&r"(tmp1),"=&r"(tmp2),"=&r"(tmp3),"=&r"(tmp4),"=&r"(tmp5),"=&r"(s) -+ : "r"(pix)); -+ pix += line_size; -+ } -+ __asm__ volatile ( "addhh.w\t%0, %0:t, %0:b" : "=&r" (s) ); -+ -+ return s; -+} -+ -+ -+//#define op_scale1(x) block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom ) -+//#define op_scale2(x) dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1)) -+//#define H264_WEIGHT(W,H) \ -+//static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \ -+// int attribute_unused x, y; \ -+// offset <<= log2_denom; \ -+// if(log2_denom) offset += 1<<(log2_denom-1); \ -+// for(y=0; y> 0, 8\n" \ -+// "satu\t%[tmp1] >> 0, 8\n" \ -+// "st.b\t%[block][0], %[tmp0]\n" \ -+// "st.b\t%[block][1], %[tmp1]\n" \ -+// : [tmp0] "=&r"(tmp0), [tmp1] "=&r"(tmp1) \ -+// : [block] "r"(block), [weight]"r"(weight), [log2_denom]"r"(log2denom) ); \ -+// } else if ( W==4 ) { \ -+// asm volatile ( "ld.w\t%[tmp0], %[block][0]\n" \ -+// "punpckub.h\t%[tmp1], %[tmp0]:t\n" \ -+// "punpckub.h\t%[tmp0], %[tmp0]:b\n" \ -+// "mulhh.w\t%[tmp2], %[tmp1]:t, %[weight]:b\n" \ -+// "mulhh.w\t%[tmp1], %[tmp1]:b, %[weight]:b\n" \ -+// "asr\t%[tmp0], %[log2_denom]\n" \ -+// "asr\t%[tmp1], %[log2_denom]\n" \ -+// "satu\t%[tmp0] >> 0, 8\n" \ -+// "satu\t%[tmp1] >> 0, 8\n" \ -+// "st.b\t%[block][0], %[tmp0]\n" \ -+// "st.b\t%[block][1], %[tmp1]\n" \ -+// : [tmp0] "=&r"(tmp0), [tmp1] "=&r"(tmp1) \ -+// : [block] "r"(block), [weight]"r"(weight), [log2_denom]"r"(log2denom) ); \ -+// -+// -+// -+// if(W==4) continue; \ -+// op_scale1(4); \ -+// op_scale1(5); \ -+// op_scale1(6); \ -+// op_scale1(7); \ -+// if(W==8) continue; \ -+// op_scale1(8); \ -+// op_scale1(9); \ -+// op_scale1(10); \ -+// op_scale1(11); \ -+// op_scale1(12); \ -+// op_scale1(13); \ -+// op_scale1(14); \ -+// op_scale1(15); \ -+// } \ -+//} \ -+//static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, 
uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets){ \ -+// int attribute_unused x, y; \ -+// int offset = (offsets + offsetd + 1) >> 1; \ -+// offset = ((offset << 1) + 1) << log2_denom; \ -+// for(y=0; y and -+ is not less than */ -+#define PABS_DIFF_LESS_THAN( a, b, compare) \ -+ ({ uint32_t __tmp__, __tmp2__, __mask__; \ -+ asm ( \ -+ /* Check ABS( a - b ) < compare */ \ -+ "psubs.ub\t%[tmp], %[opa], %[opb]\n" \ -+ "psubs.ub\t%[tmp2], %[opb], %[opa]\n" \ -+ "or\t%[tmp], %[tmp2]\n" /* ABS ( a - b ) */ \ -+ /* This produces 0 for all bytes where the comparison is not true */ \ -+ "psubs.ub\t%[mask], %[cmp], %[tmp]\n" \ -+ : [tmp] "=&r"(__tmp__), [tmp2] "=&r"(__tmp2__), [mask] "=&r"(__mask__) \ -+ : [opa] "r"(a), [opb] "r"(b), [cmp] "r"(compare) ); \ -+ __mask__; }) -+ -+/* -+ Set all bytes containing zero in to 255 and the rest to zero. -+ -+ Add with saturation 254 to all bytes making all bytes different from -+ zero become 255. Then add one without saturation to make all bytes -+ originally containing zero 255 and the rest 0. 
*/ -+#define SET_ALL_BITS_IN_ZERO_BYTES(value) \ -+ ({ uint32_t __tmp__; \ -+ asm ( \ -+ "padds.ub\t%[tmp], %[val], %[max_minus_one]\n" \ -+ "padd.b\t%[tmp], %[tmp], %[all_ones]\n" \ -+ : [tmp] "=r"(__tmp__) \ -+ : [val] "r"(value), [max_minus_one] "r"(0xFEFEFEFE), [all_ones] "r"(0x01010101) ); \ -+ __tmp__; }) -+ -+#define PACKW_SH(upper, lower) \ -+ ({ uint32_t __tmp__; \ -+ asm ( \ -+ "packw.sh\t%[tmp], %[u], %[l]\n" \ -+ : [tmp] "=r"(__tmp__) \ -+ : [u] "r"(upper), [l] "r"(lower) ); \ -+ __tmp__; }) -+ -+#define PACKSH_UB(upper, lower) \ -+ ({ uint32_t __tmp__; \ -+ asm ( \ -+ "packsh.sb\t%[tmp], %[u], %[l]\n" \ -+ : [tmp] "=r"(__tmp__) \ -+ : [u] "r"(upper), [l] "r"(lower) ); \ -+ __tmp__; }) -+ -+static void h264_v_loop_filter_luma_avr32(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) -+{ -+ int i; -+ -+ if ( alpha == 0 ) -+ return; -+ -+ alpha = PACKW_SH(alpha, alpha); -+ alpha = PACKSH_UB(alpha, alpha); -+ beta = PACKW_SH(beta, beta); -+ beta = PACKSH_UB(beta, beta); -+ -+ for( i = 0; i < 4; i++ ) { -+ uint32_t p0, p1, p2, q0, q1, q2; -+ uint32_t mask, mask2; -+ uint32_t tmp, tmp2, tmp3, tmp4; -+ -+ if( tc0[i] < 0 ) { -+ pix += 4; -+ continue; -+ } -+ -+/* for( d = 0; d < 4; d++ ) { -+ const int p0 = pix[-1*stride]; -+ const int p1 = pix[-2*stride]; -+ const int p2 = pix[-3*stride]; -+ const int q0 = pix[0]; -+ const int q1 = pix[1*stride]; -+ const int q2 = pix[2*stride]; -+ -+ if( ABS( p0 - q0 ) < alpha && -+ ABS( p1 - p0 ) < beta && -+ ABS( q1 - q0 ) < beta ) { */ -+ -+ p0 = LD32(pix - stride); -+ p1 = LD32(pix - 2*stride); -+ q0 = LD32(pix); -+ q1 = LD32(pix + stride); -+ -+ /* Check which of the columns should be filtered, if any. 
*/ -+ mask = PABS_DIFF_LESS_THAN(p0, q0, alpha); -+ mask |= PABS_DIFF_LESS_THAN(p1, p0, beta); -+ mask |= PABS_DIFF_LESS_THAN(q1, q0, beta); -+ -+ if ( !mask ) -+ continue; -+ -+ mask = SET_ALL_BITS_IN_ZERO_BYTES(mask); -+ -+ -+ int tc = PACKW_SH(tc0[i], tc0[i]); -+ int tc0_p = tc; -+ int tc0_m = PACKW_SH(-tc0[i], -tc0[i]); -+ -+ /* -+ int i_delta; -+ if( ABS( p2 - p0 ) < beta ) { -+ pix[-2*stride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] ); -+ tc++; -+ }*/ -+ -+ p2 = LD32(pix - 3*stride); -+ mask2 = PABS_DIFF_LESS_THAN(p2, p0, beta) & ~mask; -+ -+ if ( mask2 ){ -+ mask2 = SET_ALL_BITS_IN_ZERO_BYTES(mask2); -+ asm ("pavg.ub\t%[tmp], %[p0], %[q0]\n" -+ "paddh.ub\t%[tmp], %[tmp], %[p2]\n" -+ "punpckub.h\t%[tmp2], %[tmp]:t\n" -+ "punpckub.h\t%[tmp], %[tmp]:b\n" -+ "punpckub.h\t%[tmp3], %[p1]:t\n" -+ "punpckub.h\t%[tmp4], %[p1]:b\n" -+ "psub.h\t%[tmp2], %[tmp2], %[tmp3]\n" -+ "psub.h\t%[tmp], %[tmp], %[tmp4]\n" -+ "pmin.sh\t%[tmp2], %[tmp2], %[tc0_p]\n" -+ "pmin.sh\t%[tmp], %[tmp], %[tc0_p]\n" -+ "pmax.sh\t%[tmp2], %[tmp2], %[tc0_m]\n" -+ "pmax.sh\t%[tmp], %[tmp], %[tc0_m]\n" -+ "padd.h\t%[tmp2], %[tmp2], %[tmp3]\n" -+ "padd.h\t%[tmp], %[tmp], %[tmp4]\n" -+ "packsh.ub\t%[tmp], %[tmp2], %[tmp]\n" -+ "andn\t%[tmp], %[mask2]\n" -+ "and\t%[tmp2], %[q1], %[mask2]\n" -+ "or\t%[tmp], %[tmp2]\n" -+ : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3), -+ [tmp4]"=&r"(tmp4) -+ : [q0]"r"(q0), [p2]"r"(p2), [p1]"r"(p1), [p0]"r"(p0), [q1]"r"(q1), [tc0_p]"r"(tc0_p), -+ [tc0_m]"r"(tc0_m), [mask2]"r"(mask2)); -+ ST32(pix - 2*stride, tmp); -+ tc += 0x00010001; -+ } -+ -+ -+ q2 = LD32(pix + 2*stride); -+ -+ /* -+ if( ABS( q2 - q0 ) < beta ) { -+ pix[ stride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] ); -+ tc++; -+ } -+ */ -+ mask2 = PABS_DIFF_LESS_THAN(q2, q0, beta) & ~mask; -+ -+ if ( mask2 ){ -+ mask2 = SET_ALL_BITS_IN_ZERO_BYTES(mask2); -+ asm ("pavg.ub\t%[tmp], %[p0], %[q0]\n" -+ "paddh.ub\t%[tmp], %[tmp], 
%[q2]\n" -+ "punpckub.h\t%[tmp2], %[tmp]:t\n" -+ "punpckub.h\t%[tmp], %[tmp]:b\n" -+ "punpckub.h\t%[tmp3], %[q1]:t\n" -+ "punpckub.h\t%[tmp4], %[q1]:b\n" -+ "psub.h\t%[tmp2], %[tmp2], %[tmp3]\n" -+ "psub.h\t%[tmp], %[tmp], %[tmp4]\n" -+ "pmin.sh\t%[tmp2], %[tmp2], %[tc0_p]\n" -+ "pmin.sh\t%[tmp], %[tmp], %[tc0_p]\n" -+ "pmax.sh\t%[tmp2], %[tmp2], %[tc0_m]\n" -+ "pmax.sh\t%[tmp], %[tmp], %[tc0_m]\n" -+ "padd.h\t%[tmp2], %[tmp2], %[tmp3]\n" -+ "padd.h\t%[tmp], %[tmp], %[tmp4]\n" -+ "packsh.ub\t%[tmp], %[tmp2], %[tmp]\n" -+ "andn\t%[tmp], %[mask2]\n" -+ "and\t%[tmp2], %[q1], %[mask2]\n" -+ "or\t%[tmp], %[tmp2]\n" -+ : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3), -+ [tmp4]"=&r"(tmp4) -+ : [q0]"r"(q0), [q2]"r"(q2), [q1]"r"(q1), [p0]"r"(p0), [tc0_p]"r"(tc0_p), -+ [tc0_m]"r"(tc0_m), [mask2]"r"(mask2)); -+ ST32(pix + stride, tmp); -+ tc += 0x00010001; -+ } -+ -+ uint32_t old_p0 = p0; -+ uint32_t old_q0 = q0; -+ -+ /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); -+ pix[-stride] = clip_uint8( p0 + i_delta ); -+ pix[0] = clip_uint8( q0 - i_delta ); */ -+ -+ asm ( -+ /* Check if the two upper pixels should be filtered */ -+ "lsr\t%[tmp], %[inv_mask], 16\n" -+ "breq\t0f\n" -+ -+ "punpckub.h\t%[tmp], %[p1]:t\n" -+ "punpckub.h\t%[tmp2], %[q1]:t\n" -+ -+ /* p1 - q1 */ -+ "psub.h\t%[tmp], %[tmp], %[tmp2]\n" -+ -+ "punpckub.h\t%[tmp3], %[q0]:t\n" -+ "punpckub.h\t%[tmp4], %[p0]:t\n" -+ -+ /* q0 - p0 */ -+ "psub.h\t%[tmp2], %[tmp3], %[tmp4]\n" -+ -+ /* (q0 - p0) << 2 */ -+ "plsl.h\t%[tmp2], %[tmp2], 2\n" -+ -+ /* ((q0 - p0) << 2) + (p1 - q1) */ -+ "padd.h\t%[tmp2], %[tmp2], %[tmp]\n" -+ -+ "mov\t%[tmp], 0x00040004\n" -+ /* ((q0 - p0) << 2) + (p1 - q1) + 4*/ -+ "padd.h\t%[tmp2], %[tmp2], %[tmp]\n" -+ -+ /* (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3*/ -+ "pasr.h\t%[tmp2], %[tmp2], 3\n" -+ -+ "mov\t%[tmp], 0\n" -+ "psub.h\t%[tmp], %[tmp], %[tc]\n" -+ -+ /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); */ -+ "pmin.sh\t%[tmp2], %[tmp2], 
%[tc]\n" -+ "pmax.sh\t%[tmp2], %[tmp2], %[tmp]\n" -+ -+ -+ /* pix[-stride] = clip_uint8( p0 + i_delta ); */ -+ "padd.h\t%[tmp4], %[tmp4], %[tmp2]\n" -+ -+ -+ /* pix[0] = clip_uint8( q0 - i_delta ); */ -+ "psub.h\t%[tmp3], %[tmp3], %[tmp2]\n" -+ -+ /* Check if the two lower pixels should be filtered */ -+ "lsl\t%[tmp2], %[inv_mask], 16\n" -+ "breq\t1f\n" -+ -+ "0:\n" -+ "punpckub.h\t%[p1], %[p1]:b\n" -+ "punpckub.h\t%[q1], %[q1]:b\n" -+ -+ /* p1 - q1 */ -+ "psub.h\t%[p1], %[p1], %[q1]\n" -+ -+ "punpckub.h\t%[q0], %[q0]:b\n" -+ "punpckub.h\t%[p0], %[p0]:b\n" -+ -+ /* q0 - p0 */ -+ "psub.h\t%[tmp2], %[q0], %[p0]\n" -+ -+ /* (q0 - p0) << 2 */ -+ "plsl.h\t%[tmp2], %[tmp2], 2\n" -+ -+ /* ((q0 - p0) << 2) + (p1 - q1) */ -+ "padd.h\t%[tmp2], %[tmp2], %[p1]\n" -+ -+ "mov\t%[q1], 0x00040004\n" -+ /* ((q0 - p0) << 2) + (p1 - q1) + 4*/ -+ "padd.h\t%[tmp2], %[tmp2], %[q1]\n" -+ -+ /* (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3*/ -+ "pasr.h\t%[tmp2], %[tmp2], 3\n" -+ -+ /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); */ -+ "pmin.sh\t%[tmp2], %[tmp2], %[tc]\n" -+ "pmax.sh\t%[tmp2], %[tmp2], %[tmp]\n" -+ -+ /* pix[-stride] = clip_uint8( p0 + i_delta ); */ -+ "padd.h\t%[p0], %[p0], %[tmp2]\n" -+ -+ /* pix[0] = clip_uint8( q0 - i_delta ); */ -+ "psub.h\t%[q0], %[q0], %[tmp2]\n" -+ -+ "1:\n" -+ "packsh.ub\t%[p0], %[tmp4], %[p0]\n" -+ "packsh.ub\t%[q0], %[tmp3], %[tmp4]\n" -+ -+ : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3), -+ [tmp4]"=&r"(tmp4), [q0]"=&r"(q0), [q1]"=&r"(q1), [p0]"=&r"(p0), [p1]"=&r"(p1) -+ : [tc]"r"(tc), [inv_mask]"r"(~mask)); -+ -+ ST32(pix - stride, (mask & old_p0) | (p0 & ~mask)); -+ ST32(pix, (mask & old_q0) | (q0 & ~mask)); -+ -+ } -+ pix += 1; -+} -+ -+ -+ -+ -+#ifdef CHECK_DSP_FUNCS_AGAINST_C -+ -+void dump_block8(uint8_t *block, int line_size, int h){ -+ int i, j; -+ -+ for ( i = 0; i < h ; i++ ){ -+ av_log(NULL, AV_LOG_ERROR, "\t"); -+ for ( j = 0; j < 8 ; j++ ){ -+ av_log(NULL, AV_LOG_ERROR, "%d ", block[j + i*line_size]); -+ } 
-+ av_log(NULL, AV_LOG_ERROR, "\n"); -+ } -+} -+ -+void dump_block4(uint8_t *block, int line_size, int h){ -+ int i, j; -+ -+ for ( i = 0; i < h ; i++ ){ -+ av_log(NULL, AV_LOG_ERROR, "\t"); -+ for ( j = 0; j < 4 ; j++ ){ -+ av_log(NULL, AV_LOG_ERROR, "%d ", block[j + i*line_size]); -+ } -+ av_log(NULL, AV_LOG_ERROR, "\n"); -+ } -+} -+ -+void dump_block(uint8_t *block, int line_size, int h, int w){ -+ int i, j; -+ -+ for ( i = 0; i < h ; i++ ){ -+ av_log(NULL, AV_LOG_ERROR, "\t"); -+ for ( j = 0; j < w ; j++ ){ -+ av_log(NULL, AV_LOG_ERROR, "%d ", block[j + i*line_size]); -+ } -+ av_log(NULL, AV_LOG_ERROR, "\n"); -+ } -+} -+ -+void check_block8(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct, -+ int h, char *name, int max_dev){ -+ int i,j; -+ for ( i = 0; i < 8 ; i++ ){ -+ for ( j = 0; j < h ; j++ ){ -+ int diff = test[i + line_size_test*j] - correct[i + line_size_correct*j]; -+ diff = diff < 0 ? -diff : diff; -+ if ( diff > max_dev ){ -+ av_log(NULL, AV_LOG_ERROR, "Error pixel x=%i, y=%i differs. Is 0x%x should be 0x%x\n", -+ i, j, test[i + line_size_test*j], correct[i + j*line_size_correct]); -+ av_log(NULL, AV_LOG_ERROR, "Error resulting block from %s is:\n", name); -+ dump_block8(test, line_size_test, h); -+ av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n"); -+ dump_block8(correct, line_size_correct, h); -+ exit(1); -+ } -+ } -+ } -+} -+ -+void check_block4(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct, -+ int h, char *name, int max_dev){ -+ int i,j; -+ for ( i = 0; i < 4 ; i++ ){ -+ for ( j = 0; j < h ; j++ ){ -+ int diff = test[i + line_size_test*j] - correct[i + line_size_correct*j]; -+ diff = diff < 0 ? -diff : diff; -+ if ( diff > max_dev ){ -+ av_log(NULL, AV_LOG_ERROR, "Error pixel x=%i, y=%i differs. 
Is 0x%x should be 0x%x\n", -+ i, j, test[i + line_size_test*j], correct[i + j*line_size_correct]); -+ av_log(NULL, AV_LOG_ERROR, "Error resulting block from %s is:\n", name); -+ dump_block8(test, line_size_test, h); -+ av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n"); -+ dump_block4(correct, line_size_correct, h); -+ exit(1); -+ } -+ } -+ } -+} -+ -+void check_block(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct, -+ int h, int width, char *name, int max_dev){ -+ int i,j; -+ for ( i = 0; i < width ; i++ ){ -+ for ( j = 0; j < h ; j++ ){ -+ int diff = test[i + line_size_test*j] - correct[i + line_size_correct*j]; -+ diff = diff < 0 ? -diff : diff; -+ if ( diff > max_dev ){ -+ av_log(NULL, AV_LOG_ERROR, "Error pixel x=%i, y=%i differs. Is 0x%x should be 0x%x\n", -+ i, j, test[i + line_size_test*j], correct[i + j*line_size_correct]); -+ av_log(NULL, AV_LOG_ERROR, "Error resulting block from %s is:\n", name); -+ dump_block(test, line_size_test, h, width); -+ av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n"); -+ dump_block(correct, line_size_correct, h, width); -+ exit(1); -+ } -+ } -+ } -+} -+ -+void dump_dct_block(DCTELEM *block){ -+ int i, j; -+ -+ for ( i = 0; i < 8 ; i++ ){ -+ av_log(NULL, AV_LOG_ERROR, "\t"); -+ for ( j = 0; j < 8 ; j++ ){ -+ av_log(NULL, AV_LOG_ERROR, "0x%x ", block[j + i*8]); -+ } -+ av_log(NULL, AV_LOG_ERROR, "\n"); -+ } -+} -+ -+void test_idct_avr32(DCTELEM *block){ -+ DCTELEM testBlock[64]; -+ int i, j; -+ -+ /* Copy transposed block to testBlock */ -+ for ( i = 0; i < 8 ; i++ ){ -+ for ( j = 0; j < 8 ; j++ ){ -+ testBlock[i + 8*j] = block[j + i*8]; -+ } -+ } -+ -+ idct_avr32(block); -+ simple_idct(&testBlock); -+ -+ for ( i = 0; i < 64 ; i++ ){ -+ if ( block[i] != testBlock[i] ){ -+ av_log(NULL, AV_LOG_ERROR, "Error resulting block from idct is:\n"); -+ dump_dct_block(block); -+ av_log(NULL, AV_LOG_ERROR, "But should be equal to the transposed of:\n"); -+ dump_dct_block(testBlock); -+ exit(1); -+ } -+ 
} -+} -+ -+void test_idct_put_avr32(uint8_t *dest, int line_size, DCTELEM *block){ -+ uint8_t testBlock[64]; -+ DCTELEM blockCopy[64]; -+ int i, j; -+ -+ /* Copy transposed block to blockCopy */ -+ for ( i = 0; i < 8 ; i++ ){ -+ for ( j = 0; j < 8 ; j++ ){ -+ blockCopy[i + 8*j] = block[j + i*8]; -+ } -+ } -+ -+ idct_put_avr32(dest, line_size, block); -+ simple_idct_put(&testBlock, 8, blockCopy); -+ -+ check_block8(dest, testBlock, line_size, 8, 8, "idct_put", 1); -+} -+ -+ -+void test_idct_add_avr32(uint8_t *dest, int line_size, DCTELEM *block){ -+ uint8_t testBlock[64]; -+ DCTELEM blockCopy[64]; -+ int i, j; -+ -+ /* Copy dest to testBlock */ -+ for ( i = 0; i < 8 ; i++ ){ -+ for ( j = 0; j < 8 ; j++ ){ -+ testBlock[i + 8*j] = dest[i + j*line_size]; -+ } -+ } -+ -+ /* Copy transposed block to blockCopy */ -+ for ( i = 0; i < 8 ; i++ ){ -+ for ( j = 0; j < 8 ; j++ ){ -+ blockCopy[i + 8*j] = block[j + i*8]; -+ } -+ } -+ -+ idct_add_avr32(dest, line_size, block); -+ simple_idct_add(&testBlock, 8, blockCopy); -+ -+ check_block8(dest, testBlock, line_size, 8, 8, "idct_add", 1); -+} -+ -+void test_h264_idct_add_avr32(uint8_t *dest, DCTELEM *block, int stride){ -+ uint8_t testBlock[16]; -+ DCTELEM blockCopy[16]; -+ int i, j; -+ -+ /* Copy dest to testBlock */ -+ for ( i = 0; i < 4 ; i++ ){ -+ for ( j = 0; j < 4 ; j++ ){ -+ testBlock[i + 4*j] = dest[i + j*stride]; -+ } -+ } -+ -+ /* Copy transposed block to blockCopy */ -+ for ( i = 0; i < 16 ; i++ ){ -+ blockCopy[i] = block[i]; -+ } -+ -+ ff_h264_idct_add_c(dest, block, stride); -+ -+ h264_idct_add_avr32(testBlock, blockCopy, 4); -+ -+ check_block(dest, testBlock, stride, 4, 4, 4, "h264_idct_add", 0); -+} -+ -+void test_h264_idct8_add_avr32(uint8_t *dest, DCTELEM *block, int stride){ -+ uint8_t testBlock[8*8]; -+ DCTELEM blockCopy[8*8]; -+ int i, j; -+ -+ /* Copy dest to testBlock */ -+ for ( i = 0; i < 8 ; i++ ){ -+ for ( j = 0; j < 8 ; j++ ){ -+ testBlock[i + 8*j] = dest[i + j*stride]; -+ } -+ } -+ -+ /* Copy source 
block to blockCopy */ -+ for ( i = 0; i < 8*8 ; i++ ){ -+ blockCopy[i] = block[i]; -+ } -+ -+ ff_h264_idct8_add_c(dest, block, stride); -+ h264_idct8_add_avr32(testBlock, blockCopy, 8); -+ -+ check_block(dest, testBlock, stride, 8, 8, 8, "h264_idct8_add", 0); -+} -+ -+void test_put_pixels_funcs8(op_pixels_func test, op_pixels_func correct, uint8_t *block, -+ const uint8_t *pixels, int line_size, int h, char *name, int in_h_size, int in_v_size){ -+ uint8_t *testBlock, *testBlock2; -+ int i, j; -+ int input_v_size = h + in_v_size; -+ int input_h_size = 8 + in_h_size; -+ -+ testBlock = alloca(input_h_size*input_v_size); -+ testBlock2 = alloca(input_h_size*input_v_size); -+ -+ for ( i = 0; i < input_h_size ; i++ ){ -+ for ( j = 0; j < input_v_size ; j++ ){ -+ testBlock[i + input_h_size*j] = pixels[i + j*line_size]; -+ } -+ } -+ -+ test(block, pixels, line_size, h); -+ correct(testBlock2, testBlock, input_h_size, h); -+ -+ check_block8(block, testBlock2, line_size, input_h_size, h, name, 0); -+ -+} -+ -+void test_h264_chroma_mc_funcs(h264_chroma_mc_func test, h264_chroma_mc_func correct, uint8_t *dst, -+ uint8_t *src, int stride, int h, int w, int x, int y, char *name){ -+ uint8_t *testBlock, *testBlock2; -+ int i, j; -+ int input_v_size = h + 1; -+ int input_h_size = ((w + 1) + 3) & ~3; -+ -+ testBlock = alloca(input_h_size*input_v_size); -+ testBlock2 = alloca(input_h_size*input_v_size); -+ -+ for ( i = 0; i < w + 1 ; i++ ){ -+ for ( j = 0; j < h + 1 ; j++ ){ -+ testBlock[i + input_h_size*j] = src[i + j*stride]; -+ } -+ } -+ -+ for ( i = 0; i < w ; i++ ){ -+ for ( j = 0; j < h ; j++ ){ -+ testBlock2[i + input_h_size*j] = dst[i + j*stride]; -+ } -+ } -+ -+ test(dst, src, stride, h, x, y); -+ correct(testBlock2, testBlock, input_h_size, h, x, y); -+ -+ check_block(dst, testBlock2, stride, input_h_size, h, w, name, 0); -+ -+} -+ -+void test_qpel_mc_funcs(qpel_mc_func test, qpel_mc_func correct, uint8_t *dst, -+ uint8_t *src, int stride, int size, char *name){ -+ uint8_t 
*testBlock, *testBlock2; -+ int i, j; -+ int test_stride = size + 8; -+ -+ testBlock = alloca(test_stride*(size+8)) + 4 + test_stride*4; -+ testBlock2 = alloca(test_stride*size); -+ -+ for ( i = -4; i < size+4 ; i++ ){ -+ for ( j = -4; j < size+4 ; j++ ){ -+ testBlock[i + test_stride*j] = src[i + j*stride]; -+ } -+ } -+ -+ for ( i = 0; i < size ; i++ ){ -+ for ( j = 0; j < size ; j++ ){ -+ testBlock2[i + test_stride*j] = dst[i + j*stride]; -+ } -+ } -+ -+ correct(dst, src, stride); -+ test(testBlock2, testBlock, test_stride); -+ -+ check_block(testBlock2, dst, test_stride, stride, size, size, name, 0); -+ -+} -+ -+ -+#define test_pixels_funcs(PFX, NUM ) \ -+void test_ ## PFX ## _pixels ## NUM ## _avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \ -+ test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _avr32, PFX ## _pixels ## NUM ## _c, \ -+ block, pixels, line_size, h, "test_" #PFX "_pixels", 0, 0); } \ -+void test_ ## PFX ## _pixels ## NUM ## _h_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \ -+ test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _h_avr32, PFX ## _pixels ## NUM ## _x2_c, \ -+ block, pixels, line_size, h, "test_" #PFX "_pixels_h", 1, 0); } \ -+void test_ ## PFX ## _pixels ## NUM ## _v_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \ -+ test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _v_avr32, PFX ## _pixels ## NUM ## _y2_c, \ -+ block, pixels, line_size, h, "test_" #PFX "_pixels_v", 0, 1); } \ -+void test_ ## PFX ## _pixels ## NUM ## _hv_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \ -+ test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _hv_avr32, PFX ## _pixels ## NUM ## _xy2_c, \ -+ block, pixels, line_size, h, "test_" #PFX "_pixels_hv", 1, 1); } -+ -+test_pixels_funcs(put, 8); -+test_pixels_funcs(put_no_rnd, 8); -+test_pixels_funcs(put, 16); -+test_pixels_funcs(put_no_rnd, 16); -+ -+test_pixels_funcs(avg, 8); -+test_pixels_funcs(avg_no_rnd, 8); 
-+test_pixels_funcs(avg, 16); -+test_pixels_funcs(avg_no_rnd, 16); -+ -+#define test_h264_chroma_mc_funcs(PFX, NUM ) \ -+void test_ ## PFX ## _h264_chroma_mc ## NUM ## _pico( uint8_t *dst, uint8_t *src, int stride, int h, int x, int y){ \ -+ test_h264_chroma_mc_funcs(PFX ## _h264_chroma_mc ## NUM ## _pico, PFX ## _h264_chroma_mc ## NUM ## _c, \ -+ dst, src, stride, h, NUM, x, y, "test_" #PFX "_h264_chroma_mc" #NUM "_pico"); } \ -+ -+test_h264_chroma_mc_funcs(put, 2); -+test_h264_chroma_mc_funcs(put, 4); -+test_h264_chroma_mc_funcs(put, 8); -+test_h264_chroma_mc_funcs(avg, 2); -+test_h264_chroma_mc_funcs(avg, 4); -+test_h264_chroma_mc_funcs(avg, 8); -+ -+#define test_qpel_mc_funcs_type(PFX, NUM, TYPE ) \ -+void test_ ## PFX ## NUM ## _ ## TYPE ## _pico( uint8_t *dst, uint8_t *src, int stride){ \ -+ test_qpel_mc_funcs(PFX ## NUM ## _ ## TYPE ## _pico, PFX ## NUM ## _ ## TYPE ## _c, \ -+ dst, src, stride, NUM, "test_" #PFX #NUM "_" #TYPE "_pico"); } -+ -+#define test_qpel_mc_funcs(PFX, NUM) \ -+ test_qpel_mc_funcs_type(PFX, NUM, mc00);\ -+ test_qpel_mc_funcs_type(PFX, NUM, mc10);\ -+ test_qpel_mc_funcs_type(PFX, NUM, mc20);\ -+ test_qpel_mc_funcs_type(PFX, NUM, mc30);\ -+ test_qpel_mc_funcs_type(PFX, NUM, mc01);\ -+ test_qpel_mc_funcs_type(PFX, NUM, mc11);\ -+ test_qpel_mc_funcs_type(PFX, NUM, mc21);\ -+ test_qpel_mc_funcs_type(PFX, NUM, mc31);\ -+ test_qpel_mc_funcs_type(PFX, NUM, mc02);\ -+ test_qpel_mc_funcs_type(PFX, NUM, mc12);\ -+ test_qpel_mc_funcs_type(PFX, NUM, mc22);\ -+ test_qpel_mc_funcs_type(PFX, NUM, mc32);\ -+ test_qpel_mc_funcs_type(PFX, NUM, mc03);\ -+ test_qpel_mc_funcs_type(PFX, NUM, mc13);\ -+ test_qpel_mc_funcs_type(PFX, NUM, mc23);\ -+ test_qpel_mc_funcs_type(PFX, NUM, mc33) -+ -+test_qpel_mc_funcs(put_h264_qpel, 4); -+test_qpel_mc_funcs(put_h264_qpel, 8); -+test_qpel_mc_funcs(put_h264_qpel, 16); -+test_qpel_mc_funcs(avg_h264_qpel, 4); -+test_qpel_mc_funcs(avg_h264_qpel, 8); -+test_qpel_mc_funcs(avg_h264_qpel, 16); -+ -+ -+#define dspfunc(PFX, 
IDX, NUM) \ -+ c->PFX ## _pixels_tab[IDX][ 0] = DSP_FUNC_NAME( PFX ## NUM ## _mc00_pico ); \ -+ c->PFX ## _pixels_tab[IDX][ 1] = DSP_FUNC_NAME( PFX ## NUM ## _mc10_pico ); \ -+ c->PFX ## _pixels_tab[IDX][ 2] = DSP_FUNC_NAME( PFX ## NUM ## _mc20_pico ); \ -+ c->PFX ## _pixels_tab[IDX][ 3] = DSP_FUNC_NAME( PFX ## NUM ## _mc30_pico ); \ -+ c->PFX ## _pixels_tab[IDX][ 4] = DSP_FUNC_NAME( PFX ## NUM ## _mc01_pico ); \ -+ c->PFX ## _pixels_tab[IDX][ 5] = DSP_FUNC_NAME( PFX ## NUM ## _mc11_pico ); \ -+ c->PFX ## _pixels_tab[IDX][ 6] = DSP_FUNC_NAME( PFX ## NUM ## _mc21_pico ); \ -+ c->PFX ## _pixels_tab[IDX][ 7] = DSP_FUNC_NAME( PFX ## NUM ## _mc31_pico ); \ -+ c->PFX ## _pixels_tab[IDX][ 8] = DSP_FUNC_NAME( PFX ## NUM ## _mc02_pico ); \ -+ c->PFX ## _pixels_tab[IDX][ 9] = DSP_FUNC_NAME( PFX ## NUM ## _mc12_pico ); \ -+ c->PFX ## _pixels_tab[IDX][10] = DSP_FUNC_NAME( PFX ## NUM ## _mc22_pico ); \ -+ c->PFX ## _pixels_tab[IDX][11] = DSP_FUNC_NAME( PFX ## NUM ## _mc32_pico ); \ -+ c->PFX ## _pixels_tab[IDX][12] = DSP_FUNC_NAME( PFX ## NUM ## _mc03_pico ); \ -+ c->PFX ## _pixels_tab[IDX][13] = DSP_FUNC_NAME( PFX ## NUM ## _mc13_pico ); \ -+ c->PFX ## _pixels_tab[IDX][14] = DSP_FUNC_NAME( PFX ## NUM ## _mc23_pico ); \ -+ c->PFX ## _pixels_tab[IDX][15] = DSP_FUNC_NAME( PFX ## NUM ## _mc33_pico ) -+ -+#endif -+ -+void dsputil_init_avr32(DSPContext* c, AVCodecContext *avctx) -+{ -+ -+ /* H264 */ -+ -+ if ( 0 /*avr32_use_pico*/ ){ -+ c->put_h264_chroma_pixels_tab[0]= DSP_FUNC_NAME(put_h264_chroma_mc8_pico); -+ c->put_h264_chroma_pixels_tab[1]= DSP_FUNC_NAME(put_h264_chroma_mc4_pico); -+ c->put_h264_chroma_pixels_tab[2]= DSP_FUNC_NAME(put_h264_chroma_mc2_pico); -+ -+ c->avg_h264_chroma_pixels_tab[0]= DSP_FUNC_NAME(avg_h264_chroma_mc8_pico); -+ c->avg_h264_chroma_pixels_tab[1]= DSP_FUNC_NAME(avg_h264_chroma_mc4_pico); -+ c->avg_h264_chroma_pixels_tab[2]= DSP_FUNC_NAME(avg_h264_chroma_mc2_pico); -+ } -+ -+#define dspfunc(PFX, IDX, NUM) \ -+ c->PFX ## _pixels_tab[IDX][ 0] = 
DSP_FUNC_NAME( PFX ## NUM ## _mc00_pico ); \ -+ c->PFX ## _pixels_tab[IDX][ 1] = DSP_FUNC_NAME( PFX ## NUM ## _mc10_pico ); \ -+ c->PFX ## _pixels_tab[IDX][ 2] = DSP_FUNC_NAME( PFX ## NUM ## _mc20_pico ); \ -+ c->PFX ## _pixels_tab[IDX][ 3] = DSP_FUNC_NAME( PFX ## NUM ## _mc30_pico ); \ -+ c->PFX ## _pixels_tab[IDX][ 4] = DSP_FUNC_NAME( PFX ## NUM ## _mc01_pico ); \ -+ c->PFX ## _pixels_tab[IDX][ 5] = DSP_FUNC_NAME( PFX ## NUM ## _mc11_pico ); \ -+ c->PFX ## _pixels_tab[IDX][ 6] = DSP_FUNC_NAME( PFX ## NUM ## _mc21_pico ); \ -+ c->PFX ## _pixels_tab[IDX][ 7] = DSP_FUNC_NAME( PFX ## NUM ## _mc31_pico ); \ -+ c->PFX ## _pixels_tab[IDX][ 8] = DSP_FUNC_NAME( PFX ## NUM ## _mc02_pico ); \ -+ c->PFX ## _pixels_tab[IDX][ 9] = DSP_FUNC_NAME( PFX ## NUM ## _mc12_pico ); \ -+ c->PFX ## _pixels_tab[IDX][10] = DSP_FUNC_NAME( PFX ## NUM ## _mc22_pico ); \ -+ c->PFX ## _pixels_tab[IDX][11] = DSP_FUNC_NAME( PFX ## NUM ## _mc32_pico ); \ -+ c->PFX ## _pixels_tab[IDX][12] = DSP_FUNC_NAME( PFX ## NUM ## _mc03_pico ); \ -+ c->PFX ## _pixels_tab[IDX][13] = DSP_FUNC_NAME( PFX ## NUM ## _mc13_pico ); \ -+ c->PFX ## _pixels_tab[IDX][14] = DSP_FUNC_NAME( PFX ## NUM ## _mc23_pico ); \ -+ c->PFX ## _pixels_tab[IDX][15] = DSP_FUNC_NAME( PFX ## NUM ## _mc33_pico ) -+ -+ if ( avr32_use_pico ){ -+ dspfunc(put_h264_qpel, 0, 16); -+ dspfunc(put_h264_qpel, 1, 8); -+ dspfunc(put_h264_qpel, 2, 4); -+ dspfunc(avg_h264_qpel, 0, 16); -+ dspfunc(avg_h264_qpel, 1, 8); -+ dspfunc(avg_h264_qpel, 2, 4); -+ } -+ -+ c->idct_put= DSP_FUNC_NAME(idct_put_avr32); -+ c->idct_add= DSP_FUNC_NAME(idct_add_avr32); -+ c->idct = DSP_FUNC_NAME(idct_avr32); -+ c->h264_idct_add = DSP_FUNC_NAME(h264_idct_add_avr32); -+ c->h264_idct8_add = DSP_FUNC_NAME(h264_idct8_add_avr32); -+ -+ /*c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_avr32;*/ -+ -+ c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM; -+ -+ c->fdct = fdct_avr32; -+ -+ c->clear_blocks = clear_blocks_avr32; -+ -+#undef dspfunc -+#define dspfunc(PFX, IDX, NUM) \ 
-+ c->PFX ## _pixels_tab[IDX][0] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _avr32 ); \ -+ c->PFX ## _pixels_tab[IDX][1] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _h_avr32); \ -+ c->PFX ## _pixels_tab[IDX][2] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _v_avr32); \ -+ c->PFX ## _pixels_tab[IDX][3] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _hv_avr32) -+ -+ dspfunc(put, 0, 16); -+ dspfunc(put_no_rnd, 0, 16); -+ dspfunc(put, 1, 8); -+ dspfunc(put_no_rnd, 1, 8); -+ -+ dspfunc(avg, 1, 8); -+ dspfunc(avg_no_rnd, 1, 8); -+ dspfunc(avg, 0, 16); -+ dspfunc(avg_no_rnd, 0, 16); -+#undef dspfunc -+ -+} -+ -+ -+ -+#if 0 -+int main(int argc, char *argv[]){ -+ -+ -+} -+#endif -+ -diff --git a/libavcodec/avr32/fdct.S b/libavcodec/avr32/fdct.S -new file mode 100644 -index 0000000..be45b86 ---- /dev/null -+++ b/libavcodec/avr32/fdct.S -@@ -0,0 +1,541 @@ -+/* -+ * Copyright (c) 2007 Atmel Corporation. All rights reserved. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * -+ * 2. Redistributions in binary form must reproduce the above -+ * copyright notice, this list of conditions and the following -+ * disclaimer in the documentation and/or other materials provided -+ * with the distribution. -+ * -+ * 3. The name of ATMEL may not be used to endorse or promote products -+ * derived from this software without specific prior written -+ * permission. -+ * -+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR -+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. 
IN NO EVENT SHALL ATMEL -+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, -+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH -+ * DAMAGE. -+ */ -+ -+//********************************************************** -+//* 2-D fDCT, Based on: * -+//* C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical * -+//* Fast 1-D DCT Algorithms with 11 Multiplications", * -+//* Proc. Int'l. Conf. on Acoustics, Speech, and Signal * -+//* Processing 1989 (ICASSP '89), pp. 988-991. * -+//* * -+//* Fixed point implementation optimized for the AVR-II * -+//* instruction set. If a table is used for the * -+//* coeffisients we can load two and two of them from * -+//* This will give a reduction of -+//* * -+//* * -+//********************************************************** -+ -+ -+/* This routine is a slow-but-accurate integer implementation of the -+ * forward DCT (Discrete Cosine Transform). Taken from the IJG software -+ * -+ * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT -+ * on each column. Direct algorithms are also available, but they are -+ * much more complex and seem not to be any faster when reduced to code. -+ * -+ * This implementation is based on an algorithm described in -+ * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT -+ * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics, -+ * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991. -+ * The primary algorithm described there uses 11 multiplies and 29 adds. -+ * We use their alternate method with 12 multiplies and 32 adds. 
-+ * The advantage of this method is that no data path contains more than one -+ * multiplication; this allows a very simple and accurate implementation in -+ * scaled fixed-point arithmetic, with a minimal number of shifts. -+ * -+ * The poop on this scaling stuff is as follows: -+ * -+ * Each 1-D DCT step produces outputs which are a factor of sqrt(N) -+ * larger than the true DCT outputs. The final outputs are therefore -+ * a factor of N larger than desired; since N=8 this can be cured by -+ * a simple right shift at the end of the algorithm. The advantage of -+ * this arrangement is that we save two multiplications per 1-D DCT, -+ * because the y0 and y4 outputs need not be divided by sqrt(N). -+ * In the IJG code, this factor of 8 is removed by the quantization step -+ * (in jcdctmgr.c), here it is removed. -+ * -+ * We have to do addition and subtraction of the integer inputs, which -+ * is no problem, and multiplication by fractional constants, which is -+ * a problem to do in integer arithmetic. We multiply all the constants -+ * by CONST_SCALE and convert them to integer constants (thus retaining -+ * CONST_BITS bits of precision in the constants). After doing a -+ * multiplication we have to divide the product by CONST_SCALE, with proper -+ * rounding, to produce the correct output. This division can be done -+ * cheaply as a right shift of CONST_BITS bits. We postpone shifting -+ * as long as possible so that partial sums can be added together with -+ * full fractional precision. -+ * -+ * The outputs of the first pass are scaled up by PASS1_BITS bits so that -+ * they are represented to better-than-integral precision. These outputs -+ * require 8 + PASS1_BITS + 3 bits; this fits in a 16-bit word -+ * with the recommended scaling. (For 12-bit sample data, the intermediate -+ * array is INT32 anyway.) -+ * -+ * To avoid overflow of the 32-bit intermediate results in pass 2, we must -+ * have 8 + CONST_BITS + PASS1_BITS <= 26. 
Error analysis -+ * shows that the values given below are the most effective. -+ * -+ * We can gain a little more speed, with a further compromise in accuracy, -+ * by omitting the addition in a descaling shift. This yields an incorrectly -+ * rounded result half the time... -+ */ -+ -+ .global fdct_avr32 -+ -+ -+ -+#define CONST_BITS 13 -+#define PASS1_BITS 2 -+ -+#define FIX_0_298631336 2446 /* FIX(0.298631336) */ -+#define FIX_0_390180644 3196 /* FIX(0.390180644) */ -+#define FIX_0_541196100 4433 /* FIX(0.541196100) */ -+#define FIX_0_765366865 6270 /* FIX(0.765366865) */ -+#define FIX_0_899976223 7373 /* FIX(0.899976223) */ -+#define FIX_1_175875602 9633 /* FIX(1.175875602) */ -+#define FIX_1_501321110 12299 /* FIX(1.501321110) */ -+#define FIX_1_847759065 15137 /* FIX(1.847759065) */ -+#define FIX_1_961570560 16069 /* FIX(1.961570560) */ -+#define FIX_2_053119869 16819 /* FIX(2.053119869) */ -+#define FIX_2_562915447 20995 /* FIX(2.562915447) */ -+#define FIX_3_072711026 25172 /* FIX(3.072711026) */ -+ -+ -+/* -+ * Perform an integer forward DCT on one block of samples. -+ */ -+ -+//void -+//fdct_int32(short *const block) -+//{ -+// int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; -+// int tmp10, tmp11, tmp12, tmp13; -+// int z1, z2, z3, z4, z5; -+// short *blkptr; -+// int *dataptr; -+// int data[64]; -+// int i; -+// -+// /* Pass 1: process rows. */ -+// /* Note results are scaled up by sqrt(8) compared to a true DCT; */ -+// /* furthermore, we scale the results by 2**PASS1_BITS. 
*/ -+// -+// dataptr = data; -+// blkptr = block; -+ -+ .text -+fdct_avr32: -+ pushm r0-r3, r4-r7, lr -+#define loop_ctr r0 -+#define blkptr r12 -+#define x0 r1 -+#define x1 r2 -+#define x2 r3 -+#define x3 r4 -+#define x4 r5 -+#define x5 r6 -+#define x6 r7 -+#define x7 r8 -+#define tmp0 r5 -+#define tmp7 r2 -+#define tmp1 r3 -+#define tmp6 r4 -+#define tmp2 r9 -+#define tmp5 r8 -+#define tmp3 r7 -+#define tmp4 r6 -+ -+ -+ mov loop_ctr, 8 -+// for (i = 0; i < 8; i++) { -+ROW_LOOP: -+ -+ ldm blkptr, r1, r2, r3, r4 -+ -+// tmp2 = blkptr[2] + blkptr[5]; -+// tmp3 = blkptr[3] + blkptr[4]; -+ paddx.h r5, r3, r2 -+// tmp5 = blkptr[2] - blkptr[5]; -+// tmp4 = blkptr[3] - blkptr[4]; -+ psubx.h r6, r3, r2 -+// tmp0 = blkptr[0] + blkptr[7]; -+// tmp1 = blkptr[1] + blkptr[6]; -+ paddx.h r2, r4, r1 -+// tmp7 = blkptr[0] - blkptr[7]; -+// tmp6 = blkptr[1] - blkptr[6]; -+ psubx.h r3, r4, r1 -+ -+// /* Even part per LL&M figure 1 --- note that published figure is faulty; -+// * rotator "sqrt(2)*c1" should be "sqrt(2)*c6". -+// */ -+ -+#define tmp10 r1 -+#define tmp13 r5 -+#define tmp11 r7 -+#define tmp12 r3 -+#define z1 r9 -+ -+// tmp10 = tmp0 + tmp3; -+// tmp13 = tmp0 - tmp3; -+ paddsub.h r1, r2:t, r5:b -+// tmp11 = tmp1 + tmp2; -+// tmp12 = tmp1 - tmp2; -+ paddsub.h r4, r2:b, r5:t -+ -+ -+// dataptr[0] = (tmp10 + tmp11) << PASS1_BITS; -+// dataptr[4] = (tmp10 - tmp11) << PASS1_BITS; -+ paddsub.h r7, r1:t, r4:t -+ ld.w r10, pc[const_table - .] -+ plsl.h r7, r7, PASS1_BITS -+ -+// z1 = (tmp12 + tmp13) * FIX_0_541196100; -+ addhh.w r8, r4:b, r1:b -+ mulhh.w r8, r8:b, r10:t -+ -+// dataptr[2] = -+// DESCALE(z1 + tmp13 * FIX_0_765366865, CONST_BITS - PASS1_BITS); -+// dataptr[6] = -+// DESCALE(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS - PASS1_BITS); -+ mulhh.w r9, r1:b, r10:b -+ ld.w r10, pc[const_table - . 
+ 4] -+ add r1, r8, r9 -+ satrnds r1 >> (CONST_BITS - PASS1_BITS), 31 -+ -+ mulhh.w r9, r4:b, r10:t -+ add r4, r8, r9 -+ satrnds r4 >> (CONST_BITS - PASS1_BITS), 31 -+ -+ -+// /* Odd part per figure 8 --- note paper omits factor of sqrt(2). -+// * cK represents cos(K*pi/16). -+// * i0..i3 in the paper are tmp4..tmp7 here. -+// */ -+ -+#define z2 r5 -+#define z3 r6 -+#define z4 r7 -+#define z5 r8 -+ -+// z4 = tmp5 + tmp7; -+// z3 = tmp4 + tmp6; -+ padd.h r2, r6, r3 -+// z2 = tmp5 + tmp6; -+// z1 = tmp4 + tmp7; -+ paddx.h r5, r6, r3 -+ -+ lddpc r9, pc[const_table - . + 8] -+// z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */ -+ addhh.w r8, r2:t, r2:b -+ mulhh.w r8, r8:b, r10:b -+ lddpc r10, pc[const_table - . + 12] -+ -+ -+// tmp4 *= FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */ -+ mulhh.w r11, r6:b, r9:t -+ -+// tmp5 *= FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */ -+ mulhh.w r6, r6:t, r9:b -+ -+// tmp6 *= FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */ -+ lddpc r9, pc[const_table - . + 20] -+ mulhh.w lr, r3:b, r10:t -+ -+// tmp7 *= FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */ -+ mulhh.w r3, r3:t, r10:b -+ -+// z3 *= -FIX_1_961570560; /* sqrt(2) * (-c3-c5) */ -+ mulhh.w r10, r2:b, r9:t -+ -+// z4 *= -FIX_0_390180644; /* sqrt(2) * (c5-c3) */ -+ mulhh.w r2, r2:t, r9:b -+ lddpc r9, pc[const_table - . 
+ 16] -+// z3 += z5; -+// z4 += z5; -+ add r10, r8 -+ add r2, r8 -+ -+// z1 *= -FIX_0_899976223; /* sqrt(2) * (c7-c3) */ -+ mulhh.w r8, r5:b, r9:t -+ -+// z2 *= -FIX_2_562915447; /* sqrt(2) * (-c1-c3) */ -+ mulhh.w r5, r5:t, r9:b -+ -+// dataptr[7] = DESCALE(tmp4 + z1 + z3, CONST_BITS - PASS1_BITS); -+ add r11, r8 -+ add r11, r10 -+ satrnds r11 >> (CONST_BITS - PASS1_BITS), 31 -+ -+// dataptr[5] = DESCALE(tmp5 + z2 + z4, CONST_BITS - PASS1_BITS); -+ add r6, r5 -+ -+ sthh.w blkptr[6*2], r4:b, r11:b -+ add r6, r2 -+ satrnds r6 >> (CONST_BITS - PASS1_BITS), 31 -+ -+// dataptr[3] = DESCALE(tmp6 + z2 + z3, CONST_BITS - PASS1_BITS); -+ add lr, r5 -+ sthh.w blkptr[4*2], r7:b, r6:b -+ add lr, r10 -+ satrnds lr >> (CONST_BITS - PASS1_BITS), 31 -+ -+// dataptr[1] = DESCALE(tmp7 + z1 + z4, CONST_BITS - PASS1_BITS); -+ add r3, r8 -+ sthh.w blkptr[2*2], r1:b, lr:b -+ add r3, r2 -+ satrnds r3 >> (CONST_BITS - PASS1_BITS), 31 -+ -+ -+ -+// dataptr += 8; /* advance pointer to next row */ -+// blkptr += 8; -+ sthh.w blkptr[0], r7:t, r3:b -+ sub blkptr, -16 -+ sub loop_ctr, 1 -+ brne ROW_LOOP -+ -+// } -+ -+ /* Pass 2: process columns. -+ * We remove the PASS1_BITS scaling, but leave the results scaled up -+ * by an overall factor of 8. 
-+ */ -+ -+// dataptr = data; -+ sub blkptr, 128 -+ -+ mov loop_ctr, 4 -+// for (i = 0; i < 8; i++) { -+COLOUMN_LOOP: -+ ld.w r1, blkptr[0] -+ ld.w r2, blkptr[1*8*2] -+ ld.w r3, blkptr[2*8*2] -+ ld.w r4, blkptr[3*8*2] -+ ld.w r5, blkptr[4*8*2] -+ ld.w r6, blkptr[5*8*2] -+ ld.w r7, blkptr[6*8*2] -+ ld.w r8, blkptr[7*8*2] -+ -+// tmp0 = blkptr[0] + blkptr[7*8]; -+ padds.sh r9, r1, r8 -+// tmp7 = blkptr[0] - blkptr[7*8]; -+ psubs.sh r1, r1, r8 -+// tmp1 = blkptr[1*8] + blkptr[6*8]; -+ padds.sh r8, r2, r7 -+// tmp6 = blkptr[1*8] - blkptr[6*8]; -+ psubs.sh r2, r2, r7 -+// tmp2 = blkptr[2*8] + blkptr[5*8]; -+ padds.sh r7, r3, r6 -+// tmp5 = blkptr[2*8] - blkptr[5*8]; -+ psubs.sh r3, r3, r6 -+// tmp3 = blkptr[3*8] + blkptr[4*8]; -+ padds.sh r6, r4, r5 -+// tmp4 = blkptr[3*8] - blkptr[4*8]; -+ psubs.sh r4, r4, r5 -+ -+// /* even part per ll&m figure 1 --- note that published figure is faulty; -+// * rotator "sqrt(2)*c1" should be "sqrt(2)*c6". -+// */ -+// -+// tmp10 = tmp0 + tmp3; -+ padds.sh r5, r9, r6 -+// tmp13 = tmp0 - tmp3; -+ psubs.sh r9, r9, r6 -+// tmp11 = tmp1 + tmp2; -+ padds.sh r6, r8, r7 -+// tmp12 = tmp1 - tmp2; -+ psubs.sh r8, r8, r7 -+ -+// dataptr[0] = DESCALE(tmp10 + tmp11, PASS1_BITS); -+// dataptr[32] = DESCALE(tmp10 - tmp11, PASS1_BITS); -+//Might get an overflow here -+ padds.sh r7, r5, r6 -+ psubs.sh r5, r5, r6 -+ -+ //Rounding -+ mov lr, (1 << (PASS1_BITS + 2)) -+ orh lr, hi(1 << (16 + PASS1_BITS + 2)) -+ padds.sh r7, r7, lr -+ padds.sh r5, r5, lr -+ -+ pasr.h r7, r7, PASS1_BITS + 3 -+ pasr.h r5, r5, PASS1_BITS + 3 -+ st.w r12[0], r7 -+ st.w r12[4*8*2], r5 -+ -+ lddpc r10, const_table2 -+ -+ -+// z1 = (tmp12 + tmp13) * FIX_0_541196100; -+ padds.sh r5, r8, r9 -+ mulhh.w r6, r5:t, r10:t -+ mulhh.w r7, r5:b, r10:t -+ -+// dataptr[16] = -+// DESCALE(z1 + tmp13 * FIX_0_765366865, CONST_BITS + PASS1_BITS); -+ lddpc r11, const_table2 + 4 -+ mulhh.w lr, r9:t, r10:b -+ mulhh.w r9, r9:b, r10:b -+ add lr, r6 -+ add r9, r7 -+ satrnds lr >> (CONST_BITS + 
PASS1_BITS + 3), 31 -+ satrnds r9 >> (CONST_BITS + PASS1_BITS + 3), 31 -+ sthh.w r12[2*8*2], lr:b, r9:b -+ -+// dataptr[48] = -+// DESCALE(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS + PASS1_BITS); -+ mulhh.w lr, r8:t, r11:t -+ mulhh.w r8, r8:b, r11:t -+ add lr, r6 -+ add r8, r7 -+ satrnds lr >> (CONST_BITS + PASS1_BITS + 3), 31 -+ satrnds r8 >> (CONST_BITS + PASS1_BITS + 3), 31 -+ sthh.w r12[6*8*2], lr:b, r8:b -+ -+// /* Odd part per figure 8 --- note paper omits factor of sqrt(2). -+// * cK represents cos(K*pi/16). -+// * i0..i3 in the paper are tmp4..tmp7 here. -+// */ -+// -+// z2 = tmp5 + tmp6; -+// z3 = tmp4 + tmp6; -+// z4 = tmp5 + tmp7; -+ padds.sh r5, r3, r2 -+ padds.sh r6, r4, r2 -+ padds.sh r7, r3, r1 -+ -+// z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */ -+ padds.sh r8, r6, r7 -+ mulhh.w r9, r8:t, r11:b -+ mulhh.w r8, r8:b, r11:b -+ -+// z3 *= -FIX_1_961570560; /* sqrt(2) * (-c3-c5) */ -+// z3 += z5; -+ lddpc r11, const_table2 + 8 -+ mulhh.w r10, r6:t, r11:t -+ mulhh.w r6, r6:b, r11:t -+ add r10, r9 -+ add r6, r8 -+ -+// z4 *= -FIX_0_390180644; /* sqrt(2) * (c5-c3) */ -+// z4 += z5; -+ mulhh.w lr, r7:t, r11:b -+ mulhh.w r7, r7:b, r11:b -+ lddpc r11, const_table2 + 12 -+ st.w --sp,r0 -+ add lr, r9 -+ add r7, r8 -+ -+// tmp6 *= FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */ -+ mulhh.w r0, r2:t, r11:t -+ machh.w r0, r5:t, r11:b -+ mulhh.w r2, r2:b, r11:t -+ machh.w r2, r5:b, r11:b -+ -+// z2 *= -FIX_2_562915447; /* sqrt(2) * (-c1-c3) */ -+// dataptr[24] = DESCALE(tmp6 + z2 + z3, CONST_BITS + PASS1_BITS); -+ add r0, r10 -+ lddpc r11, const_table2 + 16 -+ add r2, r6 -+ satrnds r0 >> (CONST_BITS + PASS1_BITS + 3), 31 -+ satrnds r2 >> (CONST_BITS + PASS1_BITS + 3), 31 -+ sthh.w r12[3*8*2], r0:b, r2:b -+// tmp5 *= FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */ -+ mulhh.w r0, r3:t, r11:t -+ machh.w r0, r5:t, r11:b -+ mulhh.w r2, r3:b, r11:t -+ machh.w r2, r5:b, r11:b -+ add r0, lr -+ lddpc r11, const_table2 + 20 -+ add r2, r7 -+ -+// dataptr[40] = 
DESCALE(tmp5 + z2 + z4, CONST_BITS + PASS1_BITS); -+ satrnds r0 >> (CONST_BITS + PASS1_BITS + 3), 31 -+ satrnds r2 >> (CONST_BITS + PASS1_BITS + 3), 31 -+ sthh.w r12[5*8*2], r0:b, r2:b -+ -+ -+// z1 = tmp4 + tmp7; -+ padds.sh r2, r4, r1 -+ -+// tmp4 *= FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */ -+ mulhh.w r3, r4:t, r11:t -+ machh.w r3, r2:t, r11:b -+ mulhh.w r4, r4:b, r11:t -+ machh.w r4, r2:b, r11:b -+ add r3, r10 -+ lddpc r11, const_table2 + 24 -+ add r4, r6 -+ -+// z1 *= -FIX_0_899976223; /* sqrt(2) * (c7-c3) */ -+// dataptr[56] = DESCALE(tmp4 + z1 + z3, CONST_BITS + PASS1_BITS); -+ satrnds r3 >> (CONST_BITS + PASS1_BITS + 3), 31 -+ satrnds r4 >> (CONST_BITS + PASS1_BITS + 3), 31 -+ sthh.w r12[7*8*2], r3:b, r4:b -+ -+ -+// tmp7 *= FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */ -+ mulhh.w r3, r1:t, r11:t -+ machh.w r3, r2:t, r11:b -+ mulhh.w r4, r1:b, r11:t -+ machh.w r4, r2:b, r11:b -+ add r3, lr -+ add r4, r7 -+ -+// dataptr[8] = DESCALE(tmp7 + z1 + z4, CONST_BITS + PASS1_BITS); -+ satrnds r3 >> (CONST_BITS + PASS1_BITS + 3), 31 -+ satrnds r4 >> (CONST_BITS + PASS1_BITS + 3), 31 -+ sthh.w r12[1*8*2], r3:b, r4:b -+ ld.w r0, sp++ -+ -+// dataptr++; /* advance pointer to next column */ -+ sub blkptr, -4 -+ sub loop_ctr, 1 -+ brne COLOUMN_LOOP -+ -+// } -+ -+ popm r0-r3, r4-r7, pc -+ -+// /* descale */ -+// for (i = 0; i < 64; i++) -+// block[i] = (short int) DESCALE(data[i], 3); -+ -+ -+//} -+ -+ -+ .align 2 -+const_table: .short FIX_0_541196100, FIX_0_765366865, -FIX_1_847759065, FIX_1_175875602 -+ .short FIX_0_298631336, FIX_2_053119869, FIX_3_072711026, FIX_1_501321110 -+ .short -FIX_0_899976223,-FIX_2_562915447, -FIX_1_961570560, -FIX_0_390180644 -+ -+const_table2: .short FIX_0_541196100, FIX_0_765366865, -FIX_1_847759065, FIX_1_175875602 -+ .short -FIX_1_961570560, -FIX_0_390180644, FIX_3_072711026, -FIX_2_562915447 -+ .short FIX_2_053119869, -FIX_2_562915447, FIX_0_298631336, -FIX_0_899976223 -+ .short FIX_1_501321110, -FIX_0_899976223 -+ -+ -+ -+ 
-diff --git a/libavcodec/avr32/h264idct.S b/libavcodec/avr32/h264idct.S -new file mode 100644 -index 0000000..4b23e2d ---- /dev/null -+++ b/libavcodec/avr32/h264idct.S -@@ -0,0 +1,451 @@ -+/* -+ * Copyright (c) 2007 Atmel Corporation. All rights reserved. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * -+ * 2. Redistributions in binary form must reproduce the above -+ * copyright notice, this list of conditions and the following -+ * disclaimer in the documentation and/or other materials provided -+ * with the distribution. -+ * -+ * 3. The name of ATMEL may not be used to endorse or promote products -+ * derived from this software without specific prior written -+ * permission. -+ * -+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR -+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL -+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, -+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH -+ * DAMAGE. -+ */ -+ -+ .global h264_idct_add_avr32 -+ -+ /* Macro for performing the 1-D transform on one row line. -+ -+ The register 'w01' should contain the first two pixels, -+ and the register 'w23' should contain the last two pixels -+ in the line. 
The resulting line is placed in p01 and p23 -+ so that { w01, w23 } = { x0, x1, x3, x2 }. -+ 'tmp' and 'tmp2' should be scratchpad registers. */ -+ .macro transform_row w01, w23, tmp, tmp2 -+ add \tmp, \w23, \w01 << 1 /* tmp = { xxxx, 2*w1 + w3 } */ -+ sub \tmp2, \w01, \w23 << 1 /* tmp2 = { xxxx, w1 - 2*w3 } */ -+ bfins \tmp2, \tmp, 16, 16 /* tmp2 = { 2*w1 + w3, w1 - 2*w3 } */ -+ pasr.h \tmp2, \tmp2, 1 /* tmp2 = { w1 + w3/2, w1/2 - w3 } */ -+ paddsub.h \tmp, \w01:t, \w23:t /* tmp = { w0 + w2, w0 - w2 } */ -+ padd.h \w01, \tmp, \tmp2 /* w01 = { w0 + w2 + w1 + w3/2, w0 - w2 + w1/2 - w3 } */ -+ psub.h \w23, \tmp, \tmp2 /* w23 = { w0 + w2 - w1 - w3/2, w0 - w2 - w1/2 + w3 } */ -+ .endm -+ -+ /* Macro for performing the 1-D transform on two columns. -+ -+ The registers w0, w1, w2, w3 should each contain two -+ packed samples from the two colomns to transform. -+ tmp and tmp2 are scratchpad registers. -+ -+ The resulting transformed columns are placed in the -+ same positions as the input columns. -+ */ -+ .macro transform_2columns w0, w1, w2, w3, tmp, tmp2 -+ padd.h \tmp, \w0, \w2 /* tmp = z0 = w0 + w2 */ -+ psub.h \w0, \w0, \w2 /* w0 = z1 = w0 - w2 */ -+ pasr.h \w2, \w1, 1 /* w2 = w1/2 */ -+ pasr.h \tmp2, \w3, 1 /* tmp2 = w3/2 */ -+ psub.h \w3, \w2, \w3 /* w3 = z2 = w1/2 - w3 */ -+ padd.h \tmp2, \w1, \tmp2/* tmp2 = z3 = w1 + w3/2 */ -+ padd.h \w1, \w0, \w3 /* w1 = x1 = z1 + z2 */ -+ psub.h \w2, \w0, \w3 /* w2 = x2 = z1 - z2 */ -+ padd.h \w0, \tmp, \tmp2/* w0 = x0 = z0 + z3 */ -+ psub.h \w3, \tmp, \tmp2/* w3 = x3 = z0 - z3 */ -+ /* Scale down result. */ -+ pasr.h \w0, \w0, 6 -+ pasr.h \w1, \w1, 6 -+ pasr.h \w2, \w2, 6 -+ pasr.h \w3, \w3, 6 -+ .endm -+ -+/*void h264_idct_add_avr32(uint8_t *dst, DCTELEM *block, int stride)*/ -+ -+h264_idct_add_avr32: -+ -+ stm --sp,r0-r3,r4-r7, lr -+ -+ /* Setup rounding factor. 
*/ -+ mov r0, (1 << 5) -+ lsl r0, 16 -+ -+ /* Load block */ -+ ldm r11,r2-r9 -+ /* r9 = { w00, w01 }, -+ r8 = { w02, w03 }, -+ r7 = { w10, w11 }, -+ r6 = { w12, w13 }, -+ r5 = { w20, w21 }, -+ r4 = { w22, w23 }, -+ r3 = { w30, w31 }, -+ r2 = { w32, w33 } */ -+ -+ -+ /* Add the rounding factor to w00. */ -+ add r9, r0 -+ -+ /* Transform rows */ -+ transform_row r9, r8, r0, r1 -+ transform_row r7, r6, r0, r1 -+ transform_row r5, r4, r0, r1 -+ transform_row r3, r2, r0, r1 -+ -+ /* Transform columns */ -+ transform_2columns r9, r7, r5, r3, r0, r1 -+ transform_2columns r8, r6, r4, r2, r0, r1 -+ -+ /* Load predicted pixels.*/ -+ ld.w lr, r12[0] -+ ld.w r11, r12[r10] -+ -+ /* Unpack to halwords. */ -+ punpckub.h r0, lr:t -+ punpckub.h r1, lr:b -+ -+ /* Add with transformed row. */ -+ padd.h r0, r0, r9 -+ paddx.h r1, r1, r8 -+ /* Pack and saturate back to 8-bit pixels. */ -+ packsh.ub r0, r0, r1 -+ -+ /* Unpack to halwords. */ -+ punpckub.h lr, r11:t -+ punpckub.h r11, r11:b -+ -+ /* Add with transformed row. */ -+ padd.h lr, lr, r7 -+ paddx.h r11, r11, r6 -+ /* Pack and saturate back to 8-bit pixels. */ -+ packsh.ub r1, lr, r11 -+ -+ /* Store back to frame. */ -+ st.w r12[0], r0 -+ st.w r12[r10], r1 -+ -+ add r12, r12, r10 << 1 -+ -+ /* Load predicted pixels.*/ -+ ld.w lr, r12[0] -+ ld.w r11, r12[r10] -+ -+ /* Unpack to halwords. */ -+ punpckub.h r0, lr:t -+ punpckub.h r1, lr:b -+ -+ /* Add with transformed row. */ -+ padd.h r0, r0, r5 -+ paddx.h r1, r1, r4 -+ /* Pack and saturate back to 8-bit pixels. */ -+ packsh.ub r0, r0, r1 -+ -+ /* Unpack to halwords. */ -+ punpckub.h lr, r11:t -+ punpckub.h r11, r11:b -+ -+ /* Add with transformed row. */ -+ padd.h lr, lr, r3 -+ paddx.h r11, r11, r2 -+ /* Pack and saturate back to 8-bit pixels. */ -+ packsh.ub r1, lr, r11 -+ -+ /* Store back to frame. 
*/ -+ st.w r12[0], r0 -+ st.w r12[r10], r1 -+ -+ ldm sp++,r0-r3,r4-r7, pc -+ -+ -+ .global h264_idct8_add_avr32 -+//void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride){ -+ -+h264_idct8_add_avr32: -+ stm --sp,r0-r3,r4-r7, lr -+ -+ /* Push dst and stride on stack */ -+ stm --sp,r10,r12 -+ -+// int i; -+// DCTELEM (*src)[8] = (DCTELEM(*)[8])block; -+// uint8_t *cm = cropTbl + MAX_NEG_CROP; -+ -+// block[0] += 32; -+ -+ -+// for( i = 0; i < 8; i++ ) -+// { -+ mov lr, 4 -+0: -+ ld.w r7, r11[0*(8*2)] -+ ld.w r6, r11[1*(8*2)] -+ ld.w r5, r11[2*(8*2)] -+ ld.w r4, r11[3*(8*2)] -+ ld.w r3, r11[4*(8*2)] -+ ld.w r2, r11[5*(8*2)] -+ ld.w r1, r11[6*(8*2)] -+ ld.w r0, r11[7*(8*2)] -+ -+/* -+ -+ const int a0 = src[0][i] + src[4][i]; -+ const int a2 = src[0][i] - src[4][i]; -+ const int a4 = (src[2][i]>>1) - src[6][i]; -+ const int a6 = (src[6][i]>>1) + src[2][i]; -+*/ -+ padd.h r8, r7, r3 /* r8 = a0 */ -+ psub.h r7, r7, r3 /* r7 = a2 */ -+ pasr.h r3, r5, 1 /* r3 = src[2][i] >> 1 */ -+ pasr.h r9, r1, 1 /* r9 = src[6][i] >> 1 */ -+ psub.h r3, r3, r1 /* r3 = a4 */ -+ padd.h r9, r9, r5 /* r9 = a6 */ -+ -+/* -+ const int b0 = a0 + a6; -+ const int b2 = a2 + a4; -+ const int b4 = a2 - a4; -+ const int b6 = a0 - a6; -+*/ -+ padd.h r1, r8, r9 /* r1 = b0 */ -+ psub.h r8, r8, r9 /* r8 = b6 */ -+ padd.h r5, r7, r3 /* r5 = b2 */ -+ psub.h r7, r7, r3 /* r7 = b4 */ -+ -+/* -+ const int a1 = -src[3][i] + src[5][i] - src[7][i] - (src[7][i]>>1); -+ const int a3 = src[1][i] + src[7][i] - src[3][i] - (src[3][i]>>1); -+ const int a5 = -src[1][i] + src[7][i] + src[5][i] + (src[5][i]>>1); -+ const int a7 = src[3][i] + src[5][i] + src[1][i] + (src[1][i]>>1); -+*/ -+ pasr.h r3, r0, 1 -+ padd.h r3, r3, r0 -+ psub.h r3, r2, r3 -+ psub.h r3, r3, r4 /* r3 = a1 */ -+ -+ pasr.h r9, r4, 1 -+ padd.h r9, r9, r4 -+ psub.h r9, r0, r9 -+ padd.h r9, r6, r9 /* r9 = a3 */ -+ -+ pasr.h r10, r2, 1 -+ padd.h r10, r10, r2 -+ padd.h r10, r10, r0 -+ psub.h r10, r10, r6 /* r10 = a5 */ -+ -+ pasr.h r0, r6, 1 -+ 
padd.h r0, r0, r6 -+ padd.h r0, r0, r2 -+ padd.h r0, r0, r4 /* r0 = a7 */ -+/* -+ const int b1 = (a7>>2) + a1; -+ const int b3 = a3 + (a5>>2); -+ const int b5 = (a3>>2) - a5; -+ const int b7 = a7 - (a1>>2); -+*/ -+ pasr.h r2, r0, 2 -+ padd.h r2, r2, r3 /* r2 = b1 */ -+ pasr.h r3, r3, 2 -+ psub.h r3, r0, r3 /* r3 = b7 */ -+ -+ pasr.h r0, r10, 2 -+ padd.h r0, r0, r9 /* r0 = b3 */ -+ pasr.h r9, r9, 2 -+ psub.h r9, r9, r10 /* r9 = b5 */ -+ -+ -+/* -+ src[0][i] = b0 + b7; -+ src[7][i] = b0 - b7; -+ src[1][i] = b2 + b5; -+ src[6][i] = b2 - b5; -+ src[2][i] = b4 + b3; -+ src[5][i] = b4 - b3; -+ src[3][i] = b6 + b1; -+ src[4][i] = b6 - b1; */ -+ -+ padd.h r4, r1, r3 -+ psub.h r1, r1, r3 -+ st.w r11[0*(8*2)], r4 -+ st.w r11[7*(8*2)], r1 -+ -+ padd.h r3, r5, r9 -+ psub.h r5, r5, r9 -+ st.w r11[1*(8*2)], r3 -+ st.w r11[6*(8*2)], r5 -+ -+ padd.h r9, r7, r0 -+ psub.h r7, r7, r0 -+ st.w r11[2*(8*2)], r9 -+ st.w r11[5*(8*2)], r7 -+ -+ padd.h r0, r8, r2 -+ psub.h r8, r8, r2 -+ st.w r11[3*(8*2)], r0 -+ st.w r11[4*(8*2)], r8 -+ -+ sub r11, -4 -+ sub lr, 1 -+ brne 0b -+ -+// } -+ -+ lddsp r12, sp[0] /* r12 = dst */ -+ sub r11, 4*4 -+ ldm r11++, r4-r7 -+ mov lr, 8 -+ /* Push dst and stride on stack */ -+ -+1: -+// for( i = 0; i < 8; i++ ) -+// { -+ -+ /* r7 = {src[i][0], src[i][1]} -+ r6 = {src[i][2], src[i][3]} -+ r5 = {src[i][4], src[i][5]} -+ r4 = {src[i][6], src[i][7]} */ -+ -+/* -+ const int a0 = src[i][0] + src[i][4]; -+ const int a2 = src[i][0] - src[i][4]; -+ const int a4 = (src[i][2]>>1) - src[i][6]; -+ const int a6 = (src[i][6]>>1) + src[i][2]; -+*/ -+ pasr.h r8, r6, 1 -+ pasr.h r9, r4, 1 -+ addhh.w r0, r7:t, r5:t /* r0 = a0 */ -+ subhh.w r1, r7:t, r5:t /* r1 = a2 */ -+ subhh.w r2, r8:t, r4:t /* r2 = a4 */ -+ addhh.w r3, r9:t, r6:t /* r3 = a6 */ -+ -+/* -+ const int b0 = a0 + a6; -+ const int b2 = a2 + a4; -+ const int b4 = a2 - a4; -+ const int b6 = a0 - a6; -+*/ -+ add r10, r0, r3 /* r10 = b0 */ -+ sub r0, r3 /* r0 = b6 */ -+ add r3, r1, r2 /* r3 = b2 */ -+ sub r1, r2 /* 
r1 = b4 */ -+/* -+ -+ -+ const int a7 = src[i][5] + src[i][3] + src[i][1] + (src[i][1]>>1); -+ const int a1 = src[i][5] - src[i][3] - src[i][7] - (src[i][7]>>1); -+ const int a3 = src[i][7] + src[i][1] - src[i][3] - (src[i][3]>>1); -+ const int a5 = src[i][7] - src[i][1] + src[i][5] + (src[i][5]>>1); */ -+ addhh.w r8, r8:b, r6:b -+ addhh.w r2, r4:b, r7:b -+ sub r2, r8 /* r2 = a3 */ -+ -+ addhh.w r9, r9:b, r4:b -+ subhh.w r8, r5:b, r6:b -+ sub r8, r9 /* r8 = a1 */ -+ -+ pasr.h r9, r7, 1 -+ addhh.w r9, r9:b, r7:b -+ addhh.w r6, r5:b, r6:b -+ add r6, r9 /* r6 = a7 */ -+ -+ pasr.h r9, r5, 1 -+ addhh.w r9, r9:b, r5:b -+ subhh.w r5, r4:b, r7:b -+ add r5, r9 /* r5 = a5 */ -+ -+/* const int b1 = (a7>>2) + a1; -+ const int b3 = (a5>>2) + a3; -+ const int b5 = (a3>>2) - a5; -+ const int b7 = -(a1>>2) + a7 ; */ -+ asr r4, r6, 2 -+ add r4, r8 /* r4 = b1 */ -+ asr r8, 2 -+ rsub r8, r6 /* r8 = b7 */ -+ -+ asr r6, r5, 2 -+ add r6, r2 /* r6 = b3 */ -+ asr r2, 2 -+ sub r2, r5 /* r2 = b5 */ -+ -+/* -+ dst[i*stride + 0] = cm[ dst[i*stride + 0] + ((b0 + b7) >> 6) ]; -+ dst[i*stride + 1] = cm[ dst[i*stride + 1] + ((b2 + b5) >> 6) ]; -+ dst[i*stride + 2] = cm[ dst[i*stride + 2] + ((b4 + b3) >> 6) ]; -+ dst[i*stride + 3] = cm[ dst[i*stride + 3] + ((b6 + b1) >> 6) ]; -+ dst[i*stride + 4] = cm[ dst[i*stride + 4] + ((b6 - b1) >> 6) ]; -+ dst[i*stride + 5] = cm[ dst[i*stride + 5] + ((b4 - b3) >> 6) ]; -+ dst[i*stride + 6] = cm[ dst[i*stride + 6] + ((b2 - b5) >> 6) ]; -+ dst[i*stride + 7] = cm[ dst[i*stride + 7] + ((b0 - b7) >> 6) ]; -+*/ -+ add r5, r10, r8 -+ satrnds r5 >> 6, 0 /* r5 = (b0 + b7) >> 6 */ -+ sub r10, r8 -+ satrnds r10 >> 6, 0 /* r10 = (b0 - b7) >> 6 */ -+ add r8, r3, r2 -+ satrnds r8 >> 6, 0 /* r8 = (b2 + b5) >> 6 */ -+ sub r3, r2 -+ satrnds r3 >> 6, 0 /* r3 = (b2 - b5) >> 6 */ -+ -+ add r2, r1, r6 -+ satrnds r2 >> 6, 0 /* r2 = (b4 + b3) >> 6 */ -+ sub r1, r6 -+ satrnds r1 >> 6, 0 /* r1 = (b4 - b3) >> 6 */ -+ -+ add r6, r0, r4 -+ satrnds r6 >> 6, 0 /* r6 = (b6 + b1) >> 6 */ -+ 
sub r0, r4 -+ satrnds r0 >> 6, 0 /* r0 = (b6 - b1) >> 6 */ -+ -+ ld.w r4, r12[0] -+ -+ packw.sh r8, r5, r8 -+ packw.sh r7, r2, r6 -+ ld.w r9, r12[4] -+ packw.sh r6, r0, r1 -+ packw.sh r5, r3, r10 -+ -+ punpckub.h r10, r4:t -+ punpckub.h r4, r4:b -+ punpckub.h r3, r9:t -+ punpckub.h r9, r9:b -+ -+ padd.h r8, r8, r10 -+ padd.h r7, r7, r4 -+ padd.h r6, r6, r3 -+ padd.h r5, r5, r9 -+ -+ lddsp r10, sp[4] /* r10 = stride */ -+ packsh.ub r0, r8, r7 -+ packsh.ub r1, r6, r5 -+ -+ st.w r12[0], r0 -+ st.w r12[4], r1 -+ -+ ldm r11++, r4-r7 -+ add r12, r10 /* dst += stride */ -+ -+ sub lr, 1 -+ brne 1b -+ -+ sub sp, -8 -+ ldm sp++,r0-r3,r4-r7, pc -+ -+ -+ -+// } -+//} -diff --git a/libavcodec/avr32/idct.S b/libavcodec/avr32/idct.S -new file mode 100644 -index 0000000..e7551ec ---- /dev/null -+++ b/libavcodec/avr32/idct.S -@@ -0,0 +1,829 @@ -+/* -+ * Copyright (c) 2007 Atmel Corporation. All rights reserved. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * -+ * 2. Redistributions in binary form must reproduce the above -+ * copyright notice, this list of conditions and the following -+ * disclaimer in the documentation and/or other materials provided -+ * with the distribution. -+ * -+ * 3. The name of ATMEL may not be used to endorse or promote products -+ * derived from this software without specific prior written -+ * permission. -+ * -+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR -+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. 
IN NO EVENT SHALL ATMEL -+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, -+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH -+ * DAMAGE. -+ */ -+ -+ .global idct_add_avr32 -+ .global idct_put_avr32 -+ .global idct_avr32 -+ -+ -+#define CONST_BITS 13 -+#define PASS1_BITS 2 -+ -+#define ONE ((INT32) 1) -+ -+#define CONST_SCALE (ONE << CONST_BITS) -+ -+#define LINE_SIZE 32 -+ -+#define FIX_0_298631336 (2446) /* FIX(0.298631336) */ -+#define FIX_0_390180644 (3196) /* FIX(0.390180644) */ -+#define FIX_0_541196100 (4433) /* FIX(0.541196100) */ -+#define FIX_0_765366865 (6270) /* FIX(0.765366865) */ -+#define FIX_0_899976223 (7373) /* FIX(0.899976223) */ -+#define FIX_1_175875602 (9633) /* FIX(1.175875602) */ -+#define FIX_1_501321110 (12299)/* FIX(1.501321110) */ -+#define FIX_1_847759065 (15137)/* FIX(1.847759065) */ -+#define FIX_1_961570560 (16069)/* FIX(1.961570560) */ -+#define FIX_2_053119869 (16819)/* FIX(2.053119869) */ -+#define FIX_2_562915447 (20995)/* FIX(2.562915447) */ -+#define FIX_3_072711026 (25172)/* FIX(3.072711026) */ -+ -+ -+#define loop_cnt r11 -+ -+ .text -+ -+idct_add_avr32: -+ pushm r0-r3, r4-r7, lr //Free up registers to use for local variables -+ -+ // Give room for some variables on the stack -+ sub sp, 8 -+ stdsp SP[0], r12 // rfp -+ stdsp SP[4], r11 // iinc -+ -+ mov loop_cnt, 8 //Initialize loop counter -+ -+FOR_ROW: -+ -+ ldm r10, r0, r1, r2, r3 //Load 8 DCT-coeffisients from the current row in the DCT-block -+ mov r6, 0 -+#ifdef USE_PREFETCH -+ pref r10[LINE_SIZE] //Prefetch next line -+#endif -+ or r4, r2, r3 << 16 -+ or r4, r1 //Check if all 
DCT-coeffisients except the DC is zero -+ or r4, r0 -+ brne AC_ROW //If there are non-zero AC coeffisients perform row-transform -+ -+ paddsub.h r5, r3:t, r6:b //Extract the DC-coeff from r5 -+ plsl.h r5, r5, PASS1_BITS -+ mov r4, r5 -+ st.d r10++, r4 -+ st.d r10++, r4 -+ -+ sub loop_cnt, 1 //Decrement loop counter -+ brne FOR_ROW //Perform loop one more time if loop_cnt is not zero -+ -+ bral COLOUMN_TRANSFORM //Perform coloumn transform after row transform is computed -+ -+ -+AC_ROW: -+ -+ -+ ld.w r12, pc[coef_table - .] -+ ld.w r9, pc[coef_table - . + 4] -+ -+ padd.h r4, r2, r0 // r4:t = dataptr[2] + dataptr[6],r4:b = dataptr[3] + dataptr[7] -+ mulhh.w r5, r4:t, r12:t -+ mulhh.w r6, r0:t, r12:b -+ ld.w r12, pc[coef_table - . + 8] -+ mulhh.w r7, r2:t, r9:t -+ add r6, r5 // tmp2 -+ satrnds r6 >> (CONST_BITS - PASS1_BITS), 31 -+ add r7, r5 // tmp3 -+ satrnds r7 >> (CONST_BITS - PASS1_BITS), 31 -+ -+ paddsub.h r5, r3:t, r1:t -+ plsl.h r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1 -+ -+ paddsub.h r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13 -+ paddsub.h r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12 -+ -+ -+ addhh.w lr, r3:b, r1:b // lr = z4 -+ addhh.w r5, r4:b, lr:b -+ mulhh.w r5, r5:b, r9:b // r5 = z5 -+ -+ ld.w r9, pc[coef_table - . + 12] -+ mulhh.w r4, r4:b, r12:t // r4 = z3 -+ mulhh.w lr, lr:b, r12:b // lr = z4 -+ -+ add r4, r5 -+ add lr, r5 -+ -+ addhh.w r5, r2:b, r1:b // r5 = z2 -+ addhh.w r8, r3:b, r0:b // r8 = z1 -+ -+ -+ mulhh.w r0, r0:b, r9:t // r0 = tmp0 -+ ld.w r12, pc[coef_table - . + 16] -+ mulhh.w r1, r1:b, r9:b // r1 = tmp1 -+ ld.w r9, pc[coef_table - . 
+ 20] -+ mulhh.w r2, r2:b, r12:t // r2 = tmp2 -+ mulhh.w r3, r3:b, r12:b // r3 = tmp3 -+ mulhh.w r8, r8:b, r9:t // r8 = z1 -+ mulhh.w r5, r5:b, r9:b // r5 = z2 -+ -+ -+ add r0, r8 -+ add r0, r4 -+ add r1, r5 -+ add r1, lr -+ add r2, r5 -+ add r2, r4 -+ add r3, r8 -+ add r3, lr -+ -+ satrnds r0 >> (CONST_BITS - PASS1_BITS), 31 -+ satrnds r1 >> (CONST_BITS - PASS1_BITS), 31 -+ satrnds r2 >> (CONST_BITS - PASS1_BITS), 31 -+ satrnds r3 >> (CONST_BITS - PASS1_BITS), 31 -+ -+ paddsub.h r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6] -+ paddsub.h r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7] -+ paddsub.h r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5] -+ paddsub.h r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4] -+ -+ sthh.w r10[0], r4:t, r5:t -+ sthh.w r10[4], r3:t, r2:t -+ sthh.w r10[8], r2:b, r3:b -+ sthh.w r10[12], r5:b, r4:b -+ -+ -+ -+ sub r10, -16 -+ sub loop_cnt, 1 -+ brne FOR_ROW, e -+ -+COLOUMN_TRANSFORM: -+ -+ sub r10, 128 //Set pointer to start of DCT block -+ -+ -+ mov loop_cnt, 8 -+FOR_COLOUMN: -+ ldins.h r3:t,r10[0] // r3:t = dataptr[0] -+ ldins.h r1:t,r10[1*8*2]// r1:t = dataptr[1] -+ ldins.h r2:t,r10[2*8*2]// r2:t = dataptr[2] -+ ldins.h r0:t,r10[5*8*2]// r0:t = dataptr[5] -+ ldins.h r3:b,r10[4*8*2]// r3:b = dataptr[4] -+ ldins.h r1:b,r10[3*8*2]// r1:b = dataptr[3] -+ ldins.h r2:b,r10[6*8*2]// r2:b = dataptr[6] -+ ldins.h r0:b,r10[7*8*2]// r0:b = dataptr[7] -+ -+ or r4, r1, r3 << 16 -+ or r4, r2 -+ or r4, r0 -+ brne AC_COLOUMN //If there are non-zero AC coeffisients perform row-transform -+ -+ lddsp r12, SP[0] // rfp -+ lddsp r9, SP[4] // iinc -+ satrnds r3 >> ( PASS1_BITS + 3 + 16 ), 9 -+ ld.d r0, r12[0] -+ sub r10, -2 // Increment the dataptr -+ bfins r3, r3, 16, 16 -+ punpckub.h r2, r1:t -+ padd.h r2, r2, r3 -+ punpckub.h r1, r1:b -+ padd.h r1, r1, r3 -+ packsh.ub r1, r2, r1 -+ punpckub.h r2, r0:t -+ padd.h r2, r2, r3 -+ punpckub.h r0, r0:b -+ padd.h r0, r0, r3 -+ packsh.ub r0, r2, r0 -+ st.d r12[0], r0 -+ add r12, r9 // 
increment rfp -+ stdsp SP[0], r12 -+ -+ sub loop_cnt, 1//Decrement loop counter -+ brne FOR_COLOUMN//Perform loop one more time if loop_cnt is not zero -+ -+ sub sp, -8 -+ popm r0-r3, r4-r7, pc//Pop back registers and PC -+ -+AC_COLOUMN: -+ -+ ld.w r12, pc[coef_table - .] -+ ld.w r9, pc[coef_table - . + 4] -+ -+ addhh.w r4, r2:t, r2:b -+ mulhh.w r4, r4:b, r12:t // r4 = z1 -+ mulhh.w r5, r2:b, r12:b -+ ld.w r12, pc[coef_table - . + 8] -+ mulhh.w r6, r2:t, r9:t -+ add r5, r4 // r5 = tmp2 -+ add r6, r4 // r6 = tmp3 -+ -+ addhh.w r7, r3:t, r3:b -+ subhh.w r8, r3:t, r3:b -+ -+ lsl r7, CONST_BITS -+ lsl r8, CONST_BITS -+ -+ add r2, r7, r6 // r2 = tmp10 -+ sub r3, r7, r6 // r3 = tmp13 -+ add r4, r8, r5 // r4 = tmp11 -+ sub r5, r8, r5 // r5 = tmp12 -+ -+ padd.h r6, r0, r1 // r6:t = z4, r6:b = z3 -+ addhh.w r7, r6:t, r6:b -+ mulhh.w r7, r7:b, r9:b // r7 = z5 -+ -+ ld.w r9, pc[coef_table - . + 12] -+ mulhh.w r8, r6:b, r12:t // r8 = z3 -+ mulhh.w r6, r6:t, r12:b // r6 = z4 -+ -+ add r8, r7 -+ add r6, r7 -+ -+ paddx.h r7, r0, r1 // r7:t = z2, r7:b = z1 -+ -+ mulhh.w r12, r0:b, r9:t // r12 = tmp0 -+ mulhh.w r0, r0:t, r9:b // r0 = tmp1 -+ ld.w r9, pc[coef_table - . + 16] -+ add r12, r8 -+ add r0, r6 -+ -+ ld.w lr, pc[coef_table - . 
+ 20] -+ machh.w r8, r1:b, r9:t // r8 = tmp2 -+ machh.w r6, r1:t, r9:b // r6 = tmp3 -+ mulhh.w r9, r7:b, lr:t // r9 = z1 -+ mulhh.w r7, r7:t, lr:b // r7 = z2 -+ -+ -+ add r12, r9 -+ add r0, r7 -+ add r8, r7 -+ add r6, r9 -+ -+ add r1, r2, r6 // r1 = dataptr[DCTSIZE*0] -+ sub r2, r2, r6 // r2 = dataptr[DCTSIZE*7] -+ add r6, r4, r8 // r6 = dataptr[DCTSIZE*1] -+ sub r4, r4, r8 // r4 = dataptr[DCTSIZE*6] -+ add r8, r5, r0 // r8 = dataptr[DCTSIZE*2] -+ sub r5, r5, r0 // r5 = dataptr[DCTSIZE*5] -+ add r0, r3, r12 // r0 = dataptr[DCTSIZE*3] -+ sub r3, r3, r12 // r3 = dataptr[DCTSIZE*4] -+ -+ satrnds r1 >> (CONST_BITS+PASS1_BITS+3), 9 -+ satrnds r2 >> (CONST_BITS+PASS1_BITS+3), 9 -+ satrnds r6 >> (CONST_BITS+PASS1_BITS+3), 9 -+ satrnds r4 >> (CONST_BITS+PASS1_BITS+3), 9 -+ satrnds r8 >> (CONST_BITS+PASS1_BITS+3), 9 -+ satrnds r5 >> (CONST_BITS+PASS1_BITS+3), 9 -+ satrnds r0 >> (CONST_BITS+PASS1_BITS+3), 9 -+ satrnds r3 >> (CONST_BITS+PASS1_BITS+3), 9 -+ -+ packw.sh r1, r1, r6 -+ packw.sh r8, r8, r0 -+ packw.sh r3, r3, r5 -+ packw.sh r4, r4, r2 -+ -+ lddsp r12, SP[0] // rfp -+ lddsp r9, SP[4] // iinc -+ ld.d r6, r12[0] -+ sub r10, -2 // Increment the dataptr -+ punpckub.h r0, r7:t -+ padd.h r1, r1, r0 -+ punpckub.h r0, r7:b -+ padd.h r8, r8, r0 -+ packsh.ub r7, r1, r8 -+ punpckub.h r0, r6:t -+ padd.h r3, r3, r0 -+ punpckub.h r0, r6:b -+ padd.h r4, r4, r0 -+ packsh.ub r6, r3, r4 -+ st.d r12[0], r6 -+ add r12, r9 // increment rfp -+ stdsp SP[0], r12 -+ -+ sub loop_cnt, 1 //Decrement loop counter -+ brne FOR_COLOUMN //Perform loop one more time if loop_cnt is not zero -+ -+ sub sp, -8 -+ popm r0-r3, r4-r7, pc //Pop back registers and PC -+ -+ -+ -+//Coeffisient Table: -+ .align 2 -+coef_table: -+ .short FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602 -+ .short - FIX_1_961570560, - FIX_0_390180644, FIX_0_298631336, FIX_2_053119869 -+ .short FIX_3_072711026, FIX_1_501321110, - FIX_0_899976223, - FIX_2_562915447 -+ -+ -+idct_put_avr32: -+ pushm r0-r3, r4-r7, 
lr //Free up registers to use for local variables -+ -+ //; Give room for some variables on the stack -+ sub sp, 8 -+ stdsp SP[0], r12 // rfp -+ stdsp SP[4], r11 // iinc -+ -+ mov loop_cnt, 8 //Initialize loop counter -+ -+0: -+ -+ ldm r10, r0, r1, r2, r3 //Load 8 DCT-coeffisients from the current row in the DCT-block -+ mov r6, 0 -+#ifdef USE_PREFETCH -+ pref r10[LINE_SIZE] //Prefetch next line -+#endif -+ or r4, r2, r3 << 16 -+ or r4, r1 //Check if all DCT-coeffisients except the DC is zero -+ or r4, r0 -+ brne 1f //If there are non-zero AC coeffisients perform row-transform -+ -+ paddsub.h r5, r3:t, r6:b //Extract the DC-coeff from r5 -+ plsl.h r5, r5, PASS1_BITS -+ mov r4, r5 -+ st.d r10++, r4 -+ st.d r10++, r4 -+ -+ sub loop_cnt, 1 //Decrement loop counter -+ brne 0b //Perform loop one more time if loop_cnt is not zero -+ -+ bral 2f //Perform coloumn transform after row transform is computed -+ -+1: -+ -+ ld.w r12, pc[coef_table_copy - .] -+ ld.w r9, pc[coef_table_copy - . + 4] -+ -+ padd.h r4, r2, r0 // r4:t = dataptr[2] + dataptr[6],r4:b = dataptr[3] + dataptr[7] -+ mulhh.w r5, r4:t, r12:t -+ mulhh.w r6, r0:t, r12:b -+ ld.w r12, pc[coef_table_copy - . + 8] -+ mulhh.w r7, r2:t, r9:t -+ add r6, r5 // tmp2 -+ satrnds r6 >> (CONST_BITS - PASS1_BITS), 31 -+ add r7, r5 // tmp3 -+ satrnds r7 >> (CONST_BITS - PASS1_BITS), 31 -+ -+ paddsub.h r5, r3:t, r1:t -+ plsl.h r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1 -+ -+ paddsub.h r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13 -+ paddsub.h r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12 -+ -+ -+ -+ addhh.w lr, r3:b, r1:b // lr = z4 -+ addhh.w r5, r4:b, lr:b -+ mulhh.w r5, r5:b, r9:b // r5 = z5 -+ -+ ld.w r9, pc[coef_table_copy - . + 12] -+ mulhh.w r4, r4:b, r12:t // r4 = z3 -+ mulhh.w lr, lr:b, r12:b // lr = z4 -+ -+ add r4, r5 -+ add lr, r5 -+ -+ addhh.w r5, r2:b, r1:b // r5 = z2 -+ addhh.w r8, r3:b, r0:b // r8 = z1 -+ -+ -+ mulhh.w r0, r0:b, r9:t // r0 = tmp0 -+ ld.w r12, pc[coef_table_copy - . 
+ 16] -+ mulhh.w r1, r1:b, r9:b // r1 = tmp1 -+ ld.w r9, pc[coef_table_copy - . + 20] -+ mulhh.w r2, r2:b, r12:t // r2 = tmp2 -+ mulhh.w r3, r3:b, r12:b // r3 = tmp3 -+ mulhh.w r8, r8:b, r9:t // r8 = z1 -+ mulhh.w r5, r5:b, r9:b // r5 = z2 -+ -+ -+ add r0, r8 -+ add r0, r4 -+ add r1, r5 -+ add r1, lr -+ add r2, r5 -+ add r2, r4 -+ add r3, r8 -+ add r3, lr -+ -+ satrnds r0 >> (CONST_BITS - PASS1_BITS), 31 -+ satrnds r1 >> (CONST_BITS - PASS1_BITS), 31 -+ satrnds r2 >> (CONST_BITS - PASS1_BITS), 31 -+ satrnds r3 >> (CONST_BITS - PASS1_BITS), 31 -+ -+ paddsub.h r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6] -+ paddsub.h r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7] -+ paddsub.h r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5] -+ paddsub.h r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4] -+ -+ sthh.w r10[0], r4:t, r5:t -+ sthh.w r10[4], r3:t, r2:t -+ sthh.w r10[8], r2:b, r3:b -+ sthh.w r10[12], r5:b, r4:b -+ -+ -+ -+ sub r10, -16 -+ sub loop_cnt, 1 -+ brne 0b -+ -+2: -+ -+ sub r10, 128 //Set pointer to start of DCT block -+ -+ mov loop_cnt, 8 -+ -+0: -+ ldins.h r3:t,r10[0] // r3:t = dataptr[0] -+ ldins.h r1:t,r10[1*8*2]// r1:t = dataptr[1] -+ ldins.h r2:t,r10[2*8*2]// r2:t = dataptr[2] -+ ldins.h r0:t,r10[5*8*2]// r0:t = dataptr[5] -+ ldins.h r3:b,r10[4*8*2]// r3:b = dataptr[4] -+ ldins.h r1:b,r10[3*8*2]// r1:b = dataptr[3] -+ ldins.h r2:b,r10[6*8*2]// r2:b = dataptr[6] -+ ldins.h r0:b,r10[7*8*2]// r0:b = dataptr[7] -+ -+ or r4, r1, r3 << 16 -+ or r4, r2 -+ or r4, r0 -+ brne 1f //If there are non-zero AC coeffisients perform row-transform -+ -+ lddsp r12, SP[0] // rfp -+ lddsp r9, SP[4] // iinc -+ satrnds r3 >> ( PASS1_BITS + 3 + 16 ), 31 -+ packw.sh r3, r3, r3 -+ packsh.ub r3, r3, r3 -+ mov r2, r3 -+ st.d r12[0], r2 -+ add r12, r9 // increment rfp -+ sub r10, -2 // Increment the dataptr -+ stdsp SP[0], r12 -+ -+ sub loop_cnt, 1//Decrement loop counter -+ brne 0b //Perform loop one more time if loop_cnt is not zero -+ -+ sub sp, -8 -+ popm 
r0-r3, r4-r7, pc//Pop back registers and PC -+ -+1: -+ -+ ld.w r12, pc[coef_table_copy - .] -+ ld.w r9, pc[coef_table_copy - . + 4] -+ -+ addhh.w r4, r2:t, r2:b -+ mulhh.w r4, r4:b, r12:t // r4 = z1 -+ mulhh.w r5, r2:b, r12:b -+ ld.w r12, pc[coef_table_copy - . + 8] -+ mulhh.w r6, r2:t, r9:t -+ add r5, r4 // r5 = tmp2 -+ add r6, r4 // r6 = tmp3 -+ -+ addhh.w r7, r3:t, r3:b -+ subhh.w r8, r3:t, r3:b -+ -+ lsl r7, CONST_BITS -+ lsl r8, CONST_BITS -+ -+ add r2, r7, r6 // r2 = tmp10 -+ sub r3, r7, r6 // r3 = tmp13 -+ add r4, r8, r5 // r4 = tmp11 -+ sub r5, r8, r5 // r5 = tmp12 -+ -+ -+ padd.h r6, r0, r1 // r6:t = z4, r6:b = z3 -+ addhh.w r7, r6:t, r6:b -+ mulhh.w r7, r7:b, r9:b // r7 = z5 -+ -+ ld.w r9, pc[coef_table_copy - . + 12] -+ mulhh.w r8, r6:b, r12:t // r8 = z3 -+ mulhh.w r6, r6:t, r12:b // r6 = z4 -+ -+ add r8, r7 -+ add r6, r7 -+ -+ paddx.h r7, r0, r1 // r7:t = z2, r7:b = z1 -+ -+ mulhh.w r12, r0:b, r9:t // r12 = tmp0 -+ mulhh.w r0, r0:t, r9:b // r0 = tmp1 -+ ld.w r9, pc[coef_table_copy - . + 16] -+ add r12, r8 -+ add r0, r6 -+ -+ ld.w lr, pc[coef_table_copy - . 
+ 20] -+ machh.w r8, r1:b, r9:t // r8 = tmp2 -+ machh.w r6, r1:t, r9:b // r6 = tmp3 -+ mulhh.w r9, r7:b, lr:t // r9 = z1 -+ mulhh.w r7, r7:t, lr:b // r7 = z2 -+ -+ -+ add r12, r9 -+ add r0, r7 -+ add r8, r7 -+ add r6, r9 -+ -+ add r1, r2, r6 // r1 = dataptr[DCTSIZE*0] -+ sub r2, r2, r6 // r2 = dataptr[DCTSIZE*7] -+ add r6, r4, r8 // r6 = dataptr[DCTSIZE*1] -+ sub r4, r4, r8 // r4 = dataptr[DCTSIZE*6] -+ add r8, r5, r0 // r8 = dataptr[DCTSIZE*2] -+ sub r5, r5, r0 // r5 = dataptr[DCTSIZE*5] -+ add r0, r3, r12 // r0 = dataptr[DCTSIZE*3] -+ sub r3, r3, r12 // r3 = dataptr[DCTSIZE*4] -+ -+ satrnds r1 >> (CONST_BITS+PASS1_BITS+3), 9 -+ satrnds r2 >> (CONST_BITS+PASS1_BITS+3), 9 -+ satrnds r6 >> (CONST_BITS+PASS1_BITS+3), 9 -+ satrnds r4 >> (CONST_BITS+PASS1_BITS+3), 9 -+ satrnds r8 >> (CONST_BITS+PASS1_BITS+3), 9 -+ satrnds r5 >> (CONST_BITS+PASS1_BITS+3), 9 -+ satrnds r0 >> (CONST_BITS+PASS1_BITS+3), 9 -+ satrnds r3 >> (CONST_BITS+PASS1_BITS+3), 9 -+ -+ packw.sh r1, r1, r6 -+ packw.sh r8, r8, r0 -+ packw.sh r3, r3, r5 -+ packw.sh r4, r4, r2 -+ -+ packsh.ub r1, r1, r8 -+ packsh.ub r0, r3, r4 -+ lddsp r12, SP[0] // rfp -+ lddsp r9, SP[4] // iinc -+ st.d r12[0], r0 -+ sub r10, -2 // Increment the dataptr -+ add r12, r9 // increment rfp -+ stdsp SP[0], r12 -+ -+ sub loop_cnt, 1 //Decrement loop counter -+ brne 0b //Perform loop one more time if loop_cnt is not zero -+ -+ sub sp, -8 -+ popm r0-r3, r4-r7, pc //Pop back registers and PC -+ -+ -+ -+ .align 2 -+coef_table_copy: -+ .short FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602 -+ .short - FIX_1_961570560, - FIX_0_390180644, FIX_0_298631336, FIX_2_053119869 -+ .short FIX_3_072711026, FIX_1_501321110, - FIX_0_899976223, - FIX_2_562915447 -+ -+ -+idct_avr32: -+ pushm r0-r3, r4-r7, lr //Free up registers to use for local variables -+ -+ //; Give room for a temporary block on the stack -+ sub sp, 8*8*2 -+ -+ mov loop_cnt, 8 //Initialize loop counter -+ -+0: -+ -+ ldm r12++, r0, r1, r2, r3 //Load 8 
DCT-coeffisients from the current row in the DCT-block -+ mov r6, 0 -+#ifdef USE_PREFETCH -+ pref r12[LINE_SIZE] //Prefetch next line -+#endif -+ or r4, r2, r3 << 16 -+ or r4, r1 //Check if all DCT-coeffisients except the DC is zero -+ or r4, r0 -+ brne 1f //If there are non-zero AC coeffisients perform row-transform -+ -+ paddsub.h r5, r3:t, r6:b //Extract the DC-coeff from r5 -+ plsl.h r5, r5, PASS1_BITS -+ mov r4, r5 -+ st.d sp++, r4 -+ st.d sp++, r4 -+ -+ sub loop_cnt, 1 //Decrement loop counter -+ brne 0b //Perform loop one more time if loop_cnt is not zero -+ -+ bral 2f //Perform coloumn transform after row transform is computed -+ -+1: -+ -+ ld.w r10, pc[coef_table_idct - .] -+ ld.w r9, pc[coef_table_idct - . + 4] -+ -+ padd.h r4, r2, r0 // r4:t = dataptr[2] + dataptr[6],r4:b = dataptr[3] + dataptr[7] -+ mulhh.w r5, r4:t, r10:t -+ mulhh.w r6, r0:t, r10:b -+ ld.w r10, pc[coef_table_idct - . + 8] -+ mulhh.w r7, r2:t, r9:t -+ add r6, r5 // tmp2 -+ satrnds r6 >> (CONST_BITS - PASS1_BITS), 31 -+ add r7, r5 // tmp3 -+ satrnds r7 >> (CONST_BITS - PASS1_BITS), 31 -+ -+ paddsub.h r5, r3:t, r1:t -+ plsl.h r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1 -+ -+ paddsub.h r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13 -+ paddsub.h r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12 -+ -+ -+ -+ addhh.w lr, r3:b, r1:b // lr = z4 -+ addhh.w r5, r4:b, lr:b -+ mulhh.w r5, r5:b, r9:b // r5 = z5 -+ -+ ld.w r9, pc[coef_table_idct - . + 12] -+ mulhh.w r4, r4:b, r10:t // r4 = z3 -+ mulhh.w lr, lr:b, r10:b // lr = z4 -+ -+ add r4, r5 -+ add lr, r5 -+ -+ addhh.w r5, r2:b, r1:b // r5 = z2 -+ addhh.w r8, r3:b, r0:b // r8 = z1 -+ -+ -+ mulhh.w r0, r0:b, r9:t // r0 = tmp0 -+ ld.w r10, pc[coef_table_idct - . + 16] -+ mulhh.w r1, r1:b, r9:b // r1 = tmp1 -+ ld.w r9, pc[coef_table_idct - . 
+ 20] -+ mulhh.w r2, r2:b, r10:t // r2 = tmp2 -+ mulhh.w r3, r3:b, r10:b // r3 = tmp3 -+ mulhh.w r8, r8:b, r9:t // r8 = z1 -+ mulhh.w r5, r5:b, r9:b // r5 = z2 -+ -+ -+ add r0, r8 -+ add r0, r4 -+ add r1, r5 -+ add r1, lr -+ add r2, r5 -+ add r2, r4 -+ add r3, r8 -+ add r3, lr -+ -+ satrnds r0 >> (CONST_BITS - PASS1_BITS), 31 -+ satrnds r1 >> (CONST_BITS - PASS1_BITS), 31 -+ satrnds r2 >> (CONST_BITS - PASS1_BITS), 31 -+ satrnds r3 >> (CONST_BITS - PASS1_BITS), 31 -+ -+ paddsub.h r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6] -+ paddsub.h r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7] -+ paddsub.h r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5] -+ paddsub.h r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4] -+ -+ sthh.w sp[0], r4:t, r5:t -+ sthh.w sp[4], r3:t, r2:t -+ sthh.w sp[8], r2:b, r3:b -+ sthh.w sp[12], r5:b, r4:b -+ -+ -+ -+ sub sp, -16 -+ sub loop_cnt, 1 -+ brne 0b -+ -+2: -+ -+ sub sp, 8*8*2 //Set pointer to start of DCT block -+ sub r12, 8*8*2 //Set pointer to start of DCT block -+ -+ mov loop_cnt, 8 -+ -+0: -+ ldins.h r3:t,sp[0] // r3:t = dataptr[0] -+ ldins.h r1:t,sp[1*8*2]// r1:t = dataptr[1] -+ ldins.h r2:t,sp[2*8*2]// r2:t = dataptr[2] -+ ldins.h r0:t,sp[5*8*2]// r0:t = dataptr[5] -+ ldins.h r3:b,sp[4*8*2]// r3:b = dataptr[4] -+ ldins.h r1:b,sp[3*8*2]// r1:b = dataptr[3] -+ ldins.h r2:b,sp[6*8*2]// r2:b = dataptr[6] -+ ldins.h r0:b,sp[7*8*2]// r0:b = dataptr[7] -+ -+ or r4, r1, r3 << 16 -+ or r4, r2 -+ or r4, r0 -+ brne 1f //If there are non-zero AC coeffisients perform row-transform -+ -+ satrnds r3 >> ( PASS1_BITS + 3 + 16 ), 31 -+ packw.sh r3, r3, r3 -+ mov r2, r3 -+ st.d r12++, r2 -+ st.d r12++, r2 -+ sub sp, -2 // Increment the dataptr -+ -+ sub loop_cnt, 1//Decrement loop counter -+ brne 0b //Perform loop one more time if loop_cnt is not zero -+ -+ sub sp, -(8*8*2 - 8) -+ popm r0-r3, r4-r7, pc//Pop back registers and PC -+ -+1: -+ -+ ld.w r10, pc[coef_table_idct - .] -+ ld.w r9, pc[coef_table_idct - . 
+ 4] -+ -+ addhh.w r4, r2:t, r2:b -+ mulhh.w r4, r4:b, r10:t // r4 = z1 -+ mulhh.w r5, r2:b, r10:b -+ ld.w r10, pc[coef_table_idct - . + 8] -+ mulhh.w r6, r2:t, r9:t -+ add r5, r4 // r5 = tmp2 -+ add r6, r4 // r6 = tmp3 -+ -+ addhh.w r7, r3:t, r3:b -+ subhh.w r8, r3:t, r3:b -+ -+ lsl r7, CONST_BITS -+ lsl r8, CONST_BITS -+ -+ add r2, r7, r6 // r2 = tmp10 -+ sub r3, r7, r6 // r3 = tmp13 -+ add r4, r8, r5 // r4 = tmp11 -+ sub r5, r8, r5 // r5 = tmp12 -+ -+ -+ padd.h r6, r0, r1 // r6:t = z4, r6:b = z3 -+ addhh.w r7, r6:t, r6:b -+ mulhh.w r7, r7:b, r9:b // r7 = z5 -+ -+ ld.w r9, pc[coef_table_idct - . + 12] -+ mulhh.w r8, r6:b, r10:t // r8 = z3 -+ mulhh.w r6, r6:t, r10:b // r6 = z4 -+ -+ add r8, r7 -+ add r6, r7 -+ -+ paddx.h r7, r0, r1 // r7:t = z2, r7:b = z1 -+ -+ mulhh.w r10, r0:b, r9:t // r10 = tmp0 -+ mulhh.w r0, r0:t, r9:b // r0 = tmp1 -+ ld.w r9, pc[coef_table_idct - . + 16] -+ add r10, r8 -+ add r0, r6 -+ -+ ld.w lr, pc[coef_table_idct - . + 20] -+ machh.w r8, r1:b, r9:t // r8 = tmp2 -+ machh.w r6, r1:t, r9:b // r6 = tmp3 -+ mulhh.w r9, r7:b, lr:t // r9 = z1 -+ mulhh.w r7, r7:t, lr:b // r7 = z2 -+ -+ -+ add r10, r9 -+ add r0, r7 -+ add r8, r7 -+ add r6, r9 -+ -+ add r1, r2, r6 // r1 = dataptr[DCTSIZE*0] -+ sub r2, r2, r6 // r2 = dataptr[DCTSIZE*7] -+ add r6, r4, r8 // r6 = dataptr[DCTSIZE*1] -+ sub r4, r4, r8 // r4 = dataptr[DCTSIZE*6] -+ add r8, r5, r0 // r8 = dataptr[DCTSIZE*2] -+ sub r5, r5, r0 // r5 = dataptr[DCTSIZE*5] -+ add r0, r3, r10 // r0 = dataptr[DCTSIZE*3] -+ sub r3, r3, r10 // r3 = dataptr[DCTSIZE*4] -+ -+ satrnds r1 >> (CONST_BITS+PASS1_BITS+3), 9 -+ satrnds r2 >> (CONST_BITS+PASS1_BITS+3), 9 -+ satrnds r6 >> (CONST_BITS+PASS1_BITS+3), 9 -+ satrnds r4 >> (CONST_BITS+PASS1_BITS+3), 9 -+ satrnds r8 >> (CONST_BITS+PASS1_BITS+3), 9 -+ satrnds r5 >> (CONST_BITS+PASS1_BITS+3), 9 -+ satrnds r0 >> (CONST_BITS+PASS1_BITS+3), 9 -+ satrnds r3 >> (CONST_BITS+PASS1_BITS+3), 9 -+ -+ packw.sh r7, r1, r6 -+ packw.sh r6, r8, r0 -+ packw.sh r5, r3, r5 -+ packw.sh 
r4, r4, r2 -+ -+ stm r12, r4-r7 -+ sub sp, -2 // Increment the dataptr -+ sub r12, -16 -+ -+ sub loop_cnt, 1 //Decrement loop counter -+ brne 0b //Perform loop one more time if loop_cnt is not zero -+ -+ sub sp, -(8*8*2 - 8) -+ popm r0-r3, r4-r7, pc //Pop back registers and PC -+ -+ -+ -+ .align 2 -+coef_table_idct: -+ .short FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602 -+ .short - FIX_1_961570560, - FIX_0_390180644, FIX_0_298631336, FIX_2_053119869 -+ .short FIX_3_072711026, FIX_1_501321110, - FIX_0_899976223, - FIX_2_562915447 -+ -diff --git a/libavcodec/avr32/mc.S b/libavcodec/avr32/mc.S -new file mode 100644 -index 0000000..07a002d ---- /dev/null -+++ b/libavcodec/avr32/mc.S -@@ -0,0 +1,434 @@ -+/* -+ * Copyright (c) 2007 Atmel Corporation. All rights reserved. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * -+ * 2. Redistributions in binary form must reproduce the above -+ * copyright notice, this list of conditions and the following -+ * disclaimer in the documentation and/or other materials provided -+ * with the distribution. -+ * -+ * 3. The name of ATMEL may not be used to endorse or promote products -+ * derived from this software without specific prior written -+ * permission. -+ * -+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR -+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. 
IN NO EVENT SHALL ATMEL -+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, -+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH -+ * DAMAGE. -+ */ -+ -+ -+ /* Macro for masking the lowest bit of each byte in a -+ packed word */ -+ .macro packedmask1 reg, round -+ .if \round -+ and \reg, \reg, r8 >> 1 -+ .else -+ and \reg, r8 -+ .endif -+ .endm -+ -+ /* Macro for 8 pixel wide horizontal and vertical interpolation functions */ -+ .macro pixels8_hv round, put -+ -+ -+ pushm r0-r7, lr -+ -+ /* R12 = uint8_t *block, R11 = uint8_t pixels, R10 = int line_size, R9 = int h */ -+ -+ /* Rounding immediate */ -+ .if \round -+ mov r8, lo(0x02020202) -+ orh r8, hi(0x02020202) -+ .else -+ mov r8, lo(0x01010101) -+ orh r8, hi(0x01010101) -+ .endif -+ mov r7, 2 -+ -+ /* Pixel naming convention : -+ -+ |-----------------------------------------------------| -+ | s00 | s01 | s02 | s03 | s04 | s05 | s06 | s07 | s08 | -+ |----d00---d01---d02---d03---d04---d05---d06---d07----| -+ | s10 | s11 | s12 | s13 | s14 | s15 | s16 | s17 | s18 | -+ |-----------------------------------------------------| -+ */ -+1: -+ ld.w r0, r11[0] // r0 = { s00, s01, s02, s03 } -+ ld.w r1, r11[1] // r1 = { s01, s02, s03, s04 } -+ mov lr, r9 -+ eor r2, r0, r1 -+ packedmask1 r2, \round -+ add r2, r8 -+ -+ paddh.ub r0, r0, r1 // r0 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2} -+ -+ add r11, r10 // pixels += line_size -+ ld.w r1, r11[0] // r1 = { s10, s11, s12, s13 } -+ ld.w r3, r11[1] // r3 = { s11, s12, s13, s14 } -+0: -+ eor r5, r1, r3 -+ packedmask1 r5, \round -+ add r2, r5 -+ -+ paddh.ub r1, r1, r3 // r1 = 
{(s10+s11)/2,(s11+s12)/2,(s12+s13)/2,(s13+s14)/2} -+ eor r6, r0, r1 -+ packedmask1 r6, \round -+ add r2, r2, r6 << 1 -+ -+ ld.w r3, r11[r10] // r3 = { s00, s01, s02, s03 } -+ add r11, r10 // pixels += line_size -+ ld.w r4, r11[1] // r4 = { s01, s02, s03, s04 } -+ -+ paddh.ub r0, r0, r1 -+ plsr.b r2, r2, 2 -+ padd.b r0, r0, r2 // r0 = { d00, d01, d02, d03 } -+ -+ /* Next row */ -+ .if \put -+ eor r2, r3, r4 -+ packedmask1 r2, \round -+ add r2, r8 -+ .else -+ ld.w r6, r12[0] -+ eor r2, r3, r4 -+ packedmask1 r2, \round -+ add r2, r8 -+ pavg.ub r0, r0, r6 -+ .endif -+ st.w r12[0], r0 // Put data into the block -+ -+ add r5, r2 -+ paddh.ub r0, r3, r4 // r0 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2} -+ -+ eor r6, r0, r1 -+ packedmask1 r6, \round -+ add r5, r5, r6 << 1 -+ -+ .if \put -+ paddh.ub r1, r0, r1 -+ plsr.b r5, r5, 2 -+ padd.b r1, r1, r5 // r1 = { d10, d11, d12, d13 } -+ .else -+ ld.w r3, r12[r10] -+ paddh.ub r1, r0, r1 -+ plsr.b r5, r5, 2 -+ padd.b r1, r1, r5 // r1 = { d10, d11, d12, d13 } -+ pavg.ub r1, r1, r3 -+ .endif -+ -+ st.w r12[r10], r1 // Put data into the block -+ -+ -+ ld.w r1, r11[r10] // r1 = { s10, s11, s12, s13 } -+ add r11, r10 // pixels += line_size -+ ld.w r3, r11[1] // r3 = { s11, s12, s13, s14 } -+ add r12, r12, r10 << 1 // block += 2*line_size -+ sub lr, 2 -+ brne 0b -+ -+ mul r0, r10, r9 // r0 = line_size * h -+ rsub r0, r0, 4 // r0 = 4 - (line_size * h) -+ add r11, r0 -+ sub r11, r10 // pixels += 4 - (line_size * (h+1)) -+ add r12, r0 // pixels += 4 - (line_size * (h)) -+ sub r7, 1 -+ brne 1b -+ -+ popm r0-r7, pc -+ .endm -+ -+ -+ /* Macro for 8 pixel wide vertical interpolation functions */ -+ -+ .macro pixels8_v round, put -+ pushm r4-r7,lr -+ /* R12 = uint8_t *block, R11 = uint8_t pixels, R10 = int line_size, R9 = int h */ -+ -+ /* -+ Pixel Naming Convention : -+ |-----------------------------------------------| -+ | s00 | s01 | s02 | s03 | s04 | s05 | s06 | s07 | -+ |-d00---d01---d02---d03---d04---d05---d06---d07-| -+ | s10 | 
s11 | s12 | s13 | s14 | s15 | s16 | s17 | -+ |-----------------------------------------------| -+ */ -+ ld.w r8, r11[r10] // r8 = { s10, s11, s12, s13 } -+ ld.w lr, r11++ // lr = { s00, s01, s02, s03 }, src += 4 -+ ld.w r7, r11[0] // r7 = { s04, s05, s06, s07 } -+ ld.w r6, r11[r10] // r6 = { s14, s15, s16, s17 } -+ sub r10, 4 // stride -= 4 -+ add r11, r11, r10 << 1 // src += 2*stride -+ sub r11, -4 // src += 4 -+ -+0: -+ .if \round -+ pavg.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2} -+ pavg.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2} -+ .else -+ paddh.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2} -+ paddh.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2} -+ .endif -+ -+ .if \put -+ st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 } -+ ld.w lr, r11++ // lr = { s10, s11, s12, s13 }, src += 4 -+ st.w r12[0], r4 // *dst = { d04, d05, d06, d07 } -+ ld.w r7, r11[0] // r7 = { s14, s15, s16, s17 } -+ .else -+ ld.w lr, r12[0] -+ ld.w r7, r12[4] -+ pavg.ub r5, r5, lr -+ pavg.ub r4, r4, r7 -+ st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 } -+ ld.w lr, r11++ // lr = { s10, s11, s12, s13 }, src += 4 -+ st.w r12[0], r4 // *dst = { d04, d05, d06, d07 } -+ ld.w r7, r11[0] // r7 = { s14, s15, s16, s17 } -+ .endif -+ add r11, r10 // src += stride -+#ifdef USE_PREFETCH -+ pref r11[0] -+#endif -+ add r12, r10 // dst += stride -+ -+ .if \round -+ pavg.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2} -+ pavg.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2} -+ .else -+ paddh.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2} -+ paddh.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2} -+ .endif -+ .if \put -+ st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 } -+ ld.w r8, r11++ // r8 = { s10, s11, s12, s13 }, src += 4 -+ st.w r12[0], r4 // *dst = { d04, d05, d06, d07 } -+ ld.w r6, r11[0] // r6 = { 
s14, s15, s16, s17 } -+ .else -+ ld.w r8, r12[0] -+ ld.w r6, r12[4] -+ pavg.ub r5, r5, r8 -+ pavg.ub r4, r4, r6 -+ st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 } -+ ld.w r8, r11++ // r8 = { s10, s11, s12, s13 }, src += 4 -+ st.w r12[0], r4 // *dst = { d04, d05, d06, d07 } -+ ld.w r6, r11[0] // r6 = { s14, s15, s16, s17 } -+ .endif -+ -+ add r11, r10 // src += stride -+#ifdef USE_PREFETCH -+ pref r11[0] -+#endif -+ add r12, r10 // dst += stride -+ sub r9, 2 -+ brne 0b -+ -+ popm r4-r7,pc -+ .endm -+ -+ /* Macro for 8 pixel wide horizontal interpolation functions */ -+ -+ .macro pixels8_h round, put -+ pushm r4-r7, lr -+ -+ /* R12 = uint8_t *block, R11 = uint8_t pixels, R10 = int line_size, R9 = int h */ -+ /* -+ Pixel Naming Convention: -+ |--------------------------------------------------------------------| -+ | s00 d00 s01 d01 s02 d02 s03 d03 s04 d04 s05 d05 s06 d06 s07 d07 s08| -+ |------|-------|-------|-------|-------|-------|-------|-------|-----| -+ | s10 d10 s11 d11 s12 d12 s13 d13 s14 d14 s15 d15 s16 d16 s17 d17 s18| -+ |--------------------------------------------------------------------| -+ */ -+ -+ ld.w lr, r11[0] // lr = { s00, s01, s02, s03 } -+ ld.w r8, r11[1] // r8 = { s01, s02, s03, s04 } -+ ld.w r7, r11[4] // r7 = { s04, s05, s06, s07 } -+ ld.w r6, r11[5] // r6 = { s05, s06, s07, s08 } -+ add r11, r10 // src += stride -+ -+0: -+ .if \round -+ pavg.ub lr, r8, lr // lr = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2} -+ pavg.ub r7, r6, r7 // r7 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2} -+ .else -+ paddh.ub lr, r8, lr // lr = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2} -+ paddh.ub r7, r6, r7 // r7 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2} -+ .endif -+ .if \put -+ ld.w r5, r11[0] // r5 = { s00, s01, s02, s03 } -+ ld.w r4, r11[1] // r4 = { s01, s02, s03, s04 } -+ .else -+ ld.w r8, r12[0] -+ ld.w r6, r12[4] -+ ld.w r5, r11[0] // r5 = { s00, s01, s02, s03 } -+ ld.w r4, r11[1] // r4 = { s01, s02, s03, s04 } -+ pavg.ub lr, 
lr, r8 -+ pavg.ub r7, r7, r6 -+ .endif -+ st.w r12[0], lr // dst = { d00, d01, d02, d03 } -+ st.w r12[4], r7 // dst = { d04, d05, d06, d07 } -+ ld.w r8, r11[4] // r8 = { s04, s05, s06, s07 } -+ ld.w r6, r11[5] // r6 = { s05, s06, s07, s08 } -+ add r11, r10 // src += stride -+#ifdef USE_PREFETCH -+ pref r11[0] -+#endif -+ add r12, r10 // dst += stride -+ -+ .if \round -+ pavg.ub r5, r4, r5 // r5 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2} -+ pavg.ub r4, r6, r8 // r4 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2} -+ .else -+ paddh.ub r5, r4, r5 // r5 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2} -+ paddh.ub r4, r6, r8 // r4 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2} -+ .endif -+ .if \put -+ ld.w lr, r11[0] // lr = { s00, s01, s02, s03 } -+ ld.w r8, r11[1] // r8 = { s01, s02, s03, s04 } -+ .else -+ ld.w r7, r12[0] -+ ld.w r6, r12[4] -+ ld.w lr, r11[0] // lr = { s00, s01, s02, s03 } -+ ld.w r8, r11[1] // r8 = { s01, s02, s03, s04 } -+ pavg.ub r5, r5, r7 -+ pavg.ub r4, r4, r6 -+ .endif -+ st.w r12[0], r5 // dst = { d00, d01, d02, d03 } -+ st.w r12[4], r4 // dst = { d04, d05, d06, d07 } -+ ld.w r7, r11[4] // r7 = { s04, s05, s06, s07 } -+ ld.w r6, r11[5] // r6 = { s05, s06, s07, s08 } -+ add r11, r10 // src += stride -+#ifdef USE_PREFETCH -+ pref r11[0] -+#endif -+ add r12, r10 // dst += stride -+ sub r9, 2 -+ brne 0b -+ -+ popm r4-r7, pc -+ .endm -+ -+ /* Macro for 8 pixel wide copy functions */ -+ .macro pixels8 put -+ stm --sp, r3-r7,lr -+ /* R12 = uint8_t *block, R11 = uint8_t pixels, R10 = int line_size, R9 = int h */ -+ mov lr, r9 -+ sub r3, r10, 2 // stride2 = stride - 2 -+0: -+ .if \put -+ ld.w r9, r11[r10] // r9 = { s10, s11, s12, s13 } -+ ld.w r7, r11++ // r7 = { s00, s01, s02, s03 }, src += 4 -+ ld.w r6, r11[0] // r6 = { s04, s05, s06, s07 } -+ ld.w r8, r11[r10] // r8 = { s14, s15, s16, s17 } -+ .else -+ ld.w r9, r11[r10] // r9 = { s10, s11, s12, s13 } -+ ld.d r4, r12[0] -+ ld.w r7, r11++ // r7 = { s00, s01, s02, s03 }, src += 4 -+ 
ld.w r6, r11[0] // r6 = { s04, s05, s06, s07 } -+ ld.w r8, r11[r10] // r8 = { s14, s15, s16, s17 } -+ pavg.ub r6, r6, r4 -+ pavg.ub r7, r7, r5 -+ ld.d r4, r12[r10] -+ .endif -+ st.d r12, r6 // *dst = { s00, s01, s02, s03, s04, s05, s06, s07 } -+ add r11, r11, r3 << 1 // src += stride2 * 2 -+ .ifeq \put -+ pavg.ub r8, r8, r4 -+ pavg.ub r9, r9, r5 -+ .endif -+ st.d r12[r10 << 0], r8 // *(dst + stride) = { s10, s11, s12, s13, s14, s15, s16, s17 } -+ add r12, r12, r10 << 1 // dst += 2*stride -+ sub lr, 2 -+ brne 0b -+ ldm sp++, r3-r7,pc -+ -+ .endm -+ -+ .global put_no_rnd_pixels8_hv_avr32 -+ .text -+put_no_rnd_pixels8_hv_avr32: -+ pixels8_hv 0, 1 -+ -+ .global put_pixels8_hv_avr32 -+ .text -+put_pixels8_hv_avr32: -+ pixels8_hv 1, 1 -+ -+ .global avg_no_rnd_pixels8_hv_avr32 -+ .text -+avg_no_rnd_pixels8_hv_avr32: -+ pixels8_hv 0, 0 -+ -+ .global avg_pixels8_hv_avr32 -+ .text -+avg_pixels8_hv_avr32: -+ pixels8_hv 1, 0 -+ -+ .global put_no_rnd_pixels8_v_avr32 -+ .text -+put_no_rnd_pixels8_v_avr32: -+ pixels8_v 0, 1 -+ -+ .global put_pixels8_v_avr32 -+ .text -+put_pixels8_v_avr32: -+ pixels8_v 1, 1 -+ -+ .global avg_no_rnd_pixels8_v_avr32 -+ .text -+avg_no_rnd_pixels8_v_avr32: -+ pixels8_v 0, 0 -+ -+ .global avg_pixels8_v_avr32 -+ .text -+avg_pixels8_v_avr32: -+ pixels8_v 1, 0 -+ -+ .global put_no_rnd_pixels8_h_avr32 -+ .text -+put_no_rnd_pixels8_h_avr32: -+ pixels8_h 0, 1 -+ -+ .global put_pixels8_h_avr32 -+ .text -+put_pixels8_h_avr32: -+ pixels8_h 1, 1 -+ -+ .global avg_no_rnd_pixels8_h_avr32 -+ .text -+avg_no_rnd_pixels8_h_avr32: -+ pixels8_h 0, 0 -+ -+ .global avg_pixels8_h_avr32 -+ .text -+avg_pixels8_h_avr32: -+ pixels8_h 1, 0 -+ -+ .global put_pixels8_avr32 -+ .global put_no_rnd_pixels8_avr32 -+ .text -+put_pixels8_avr32: -+put_no_rnd_pixels8_avr32: -+ pixels8 1 -+ -+ .global avg_no_rnd_pixels8_avr32 -+ .global avg_pixels8_avr32 -+ .text -+avg_pixels8_avr32: -+avg_no_rnd_pixels8_avr32: -+ pixels8 0 -diff --git a/libavcodec/avr32/pico.h b/libavcodec/avr32/pico.h 
-new file mode 100644 -index 0000000..32201ba ---- /dev/null -+++ b/libavcodec/avr32/pico.h -@@ -0,0 +1,260 @@ -+/* -+ * Copyright (c) 2007 Atmel Corporation. All rights reserved. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * -+ * 2. Redistributions in binary form must reproduce the above -+ * copyright notice, this list of conditions and the following -+ * disclaimer in the documentation and/or other materials provided -+ * with the distribution. -+ * -+ * 3. The name of ATMEL may not be used to endorse or promote products -+ * derived from this software without specific prior written -+ * permission. -+ * -+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR -+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL -+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, -+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH -+ * DAMAGE. 
-+ */ -+#ifndef __PICO_H__ -+#define __PICO_H__ -+ -+ -+ -+/* Coprocessor Number */ -+#define PICO_CPNO 1 -+ -+/* Pixel Coprocessor Register file */ -+#define PICO_REGVECT_INPIX2 cr0 -+#define PICO_REGVECT_INPIX1 cr1 -+#define PICO_REGVECT_INPIX0 cr2 -+#define PICO_REGVECT_OUTPIX2 cr3 -+#define PICO_REGVECT_OUTPIX1 cr4 -+#define PICO_REGVECT_OUTPIX0 cr5 -+#define PICO_REGVECT_COEFF0_A cr6 -+#define PICO_REGVECT_COEFF0_B cr7 -+#define PICO_REGVECT_COEFF1_A cr8 -+#define PICO_REGVECT_COEFF1_B cr9 -+#define PICO_REGVECT_COEFF2_A cr10 -+#define PICO_REGVECT_COEFF2_B cr11 -+#define PICO_REGVECT_VMU0_OUT cr12 -+#define PICO_REGVECT_VMU1_OUT cr13 -+#define PICO_REGVECT_VMU2_OUT cr14 -+#define PICO_REGVECT_CONFIG cr15 -+ -+#define PICO_INPIX2 0 -+#define PICO_INPIX1 1 -+#define PICO_INPIX0 2 -+#define PICO_OUTPIX2 3 -+#define PICO_OUTPIX1 4 -+#define PICO_OUTPIX0 5 -+#define PICO_COEFF0_A 6 -+#define PICO_COEFF0_B 7 -+#define PICO_COEFF1_A 8 -+#define PICO_COEFF1_B 9 -+#define PICO_COEFF2_A 10 -+#define PICO_COEFF2_B 11 -+#define PICO_VMU0_OUT 12 -+#define PICO_VMU1_OUT 13 -+#define PICO_VMU2_OUT 14 -+#define PICO_CONFIG 15 -+ -+/* Config Register */ -+#define PICO_COEFF_FRAC_BITS_OFFSET 0 -+#define PICO_COEFF_FRAC_BITS_SIZE 4 -+#define PICO_OFFSET_FRAC_BITS_OFFSET 4 -+#define PICO_OFFSET_FRAC_BITS_SIZE 4 -+#define PICO_INPUT_MODE_OFFSET 8 -+#define PICO_INPUT_MODE_SIZE 2 -+#define PICO_OUTPUT_MODE_OFFSET 10 -+#define PICO_OUTPUT_MODE_SIZE 1 -+ -+struct pico_config_t { -+ unsigned int : 32 - PICO_OUTPUT_MODE_OFFSET - PICO_OUTPUT_MODE_SIZE; -+ unsigned int output_mode : PICO_OUTPUT_MODE_SIZE; -+ unsigned int input_mode : PICO_INPUT_MODE_SIZE; -+ unsigned int offset_frac_bits : PICO_OFFSET_FRAC_BITS_SIZE; -+ unsigned int coeff_frac_bits : PICO_COEFF_FRAC_BITS_SIZE; -+ int vmu2_out; -+ int vmu1_out; -+ int vmu0_out; -+ short coeff2_2; -+ short coeff2_3; -+ short coeff2_0; -+ short coeff2_1; -+ short coeff1_2; -+ short coeff1_3; -+ short coeff1_0; -+ short coeff1_1; -+ short 
coeff0_2; -+ short coeff0_3; -+ short coeff0_0; -+ short coeff0_1; -+}; -+ -+ -+#define PICO_COEFF_FRAC_BITS(x) (x << PICO_COEFF_FRAC_BITS_OFFSET) -+#define PICO_OFFSET_FRAC_BITS(x) (x << PICO_OFFSET_FRAC_BITS_OFFSET) -+#define PICO_INPUT_MODE(x) (x << PICO_INPUT_MODE_OFFSET) -+#define PICO_OUTPUT_MODE(x) (x << PICO_OUTPUT_MODE_OFFSET) -+ -+#define GET_PICO_COEFF_FRAC_BITS(x) ((x >> PICO_COEFF_FRAC_BITS_OFFSET)&((1 << PICO_COEFF_FRAC_BITS_SIZE)-1)) -+#define GET_PICO_OFFSET_FRAC_BITS(x) ((x >> PICO_OFFSET_FRAC_BITS_OFFSET)&((1 << PICO_OFFSET_FRAC_BITS_SIZE)-1)) -+#define GET_PICO_INPUT_MODE(x) ((x >> PICO_INPUT_MODE_OFFSET)&((1 << PICO_INPUT_MODE_SIZE)-1)) -+#define GET_PICO_OUTPUT_MODE(x) ((x >> PICO_OUTPUT_MODE_OFFSET)&((1 << PICO_OUTPUT_MODE_SIZE)-1)) -+ -+enum pico_input_mode { PICO_TRANSFORMATION_MODE, -+ PICO_HOR_FILTER_MODE, -+ PICO_VERT_FILTER_MODE }; -+ -+enum pico_output_mode { PICO_PACKED_MODE, -+ PICO_PLANAR_MODE }; -+ -+/* Bits in coefficients */ -+#define PICO_COEFF_BITS 12 -+ -+/* Operation bits */ -+#define PICO_MATRIX (0) -+#define PICO_USE_ACC (1 << 2) -+#define PICO_SINGLE_VECTOR (1 << 3) -+ -+ -+#define __str(x...) #x -+#define __xstr(x...) 
__str(x) -+ -+#define PICO_PUT_W(pico_reg, x) \ -+ __builtin_mvrc_w(PICO_CPNO, pico_reg, x); -+#define PICO_GET_W(pico_reg) \ -+ __builtin_mvcr_w(PICO_CPNO, pico_reg) -+ -+#define PICO_MVCR_W(x, pico_reg) \ -+ asm ("mvcr.w\tcp" __xstr(PICO_CPNO) ", %0, cr" __xstr(pico_reg) : "=r"(x)); -+ -+#define PICO_MVRC_W(pico_reg, x) \ -+ asm ("mvrc.w\tcp" __xstr(PICO_CPNO) ", cr" __xstr(pico_reg) ", %0" :: "r"(x)); -+ -+#define PICO_PUT_D(pico_reg, x) \ -+ __builtin_mvrc_d(PICO_CPNO, pico_reg, x); -+#define PICO_GET_D(pico_reg) \ -+ __builtin_mvcr_d(PICO_CPNO, pico_reg) -+ -+#define PICO_MVCR_D(x, pico_reg) \ -+ asm volatile ("mvcr.d\tcp" __xstr(PICO_CPNO) ", %0, cr" __xstr(pico_reg) : "=r"(x)); -+#define PICO_MVRC_D(pico_reg, x) \ -+ asm volatile ("mvrc.d\tcp" __xstr(PICO_CPNO) ", cr" __xstr(pico_reg) ", %0" :: "r"(x)); -+ -+#define PICO_STCM_W(ptr, pico_regs...) \ -+ asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr)); -+#define PICO_STCM_D(ptr, pico_regs...) \ -+ asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr)); -+ -+#define PICO_STCM_W_DEC(ptr, pico_regs...) \ -+ asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr)); -+#define PICO_STCM_D_DEC(ptr, pico_regs...) \ -+ asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr)); -+ -+#define PICO_LDCM_W(ptr, pico_regs...) \ -+ asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr)); -+#define PICO_LDCM_D(ptr, pico_regs...) \ -+ asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr)); -+ -+#define PICO_LDCM_W_INC(ptr, pico_regs...) \ -+ asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr)); -+#define PICO_LDCM_D_INC(ptr, pico_regs...) 
\ -+ asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr)); -+ -+#define PICO_OP(op, dst_addr, addr0, addr1, addr2) \ -+ __builtin_cop(PICO_CPNO, addr0, addr1, addr2, op | dst_addr); -+ -+static inline void set_pico_config(struct pico_config_t *config){ -+ PICO_LDCM_D(config, -+ PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B, -+ PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B, -+ PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B, -+ PICO_REGVECT_VMU0_OUT, PICO_REGVECT_VMU1_OUT, -+ PICO_REGVECT_VMU2_OUT, PICO_REGVECT_CONFIG); -+} -+ -+static inline void get_pico_config(struct pico_config_t *config){ -+ PICO_STCM_D(config, -+ PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B, -+ PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B, -+ PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B, -+ PICO_REGVECT_VMU0_OUT, PICO_REGVECT_VMU1_OUT, -+ PICO_REGVECT_VMU2_OUT, PICO_REGVECT_CONFIG); -+} -+ -+static inline void dump_pico_config(){ -+ struct pico_config_t pico_config; -+ char *input_mode, *output_mode; -+ get_pico_config(&pico_config); -+ -+ -+ av_log(NULL, AV_LOG_INFO, "Dumping pico configuration:\n\n"); -+ av_log(NULL, AV_LOG_INFO, "\tcoeff_frac_bits = %d\n", pico_config.coeff_frac_bits); -+ av_log(NULL, AV_LOG_INFO, "\toffset_frac_bits = %d\n", pico_config.offset_frac_bits); -+ -+ switch ( pico_config.input_mode ){ -+ case PICO_TRANSFORMATION_MODE: -+ input_mode = "Transformation Mode"; -+ break; -+ case PICO_HOR_FILTER_MODE: -+ input_mode = "Horisontal Filter Mode"; -+ break; -+ case PICO_VERT_FILTER_MODE: -+ input_mode = "Vertical Filter Mode"; -+ break; -+ default: -+ input_mode = "Unknown Mode!!"; -+ break; -+ } -+ av_log(NULL, AV_LOG_INFO, "\tinput_mode = %s\n", input_mode); -+ -+ switch ( pico_config.output_mode ){ -+ case PICO_PLANAR_MODE: -+ output_mode = "Planar Mode"; -+ break; -+ case PICO_PACKED_MODE: -+ output_mode = "Packed Mode"; -+ break; -+ default: -+ output_mode = "Unknown Mode!!"; -+ break; -+ } -+ -+ av_log(NULL, AV_LOG_INFO, "\toutput_mode = 
%s\n", output_mode); -+ -+ av_log(NULL, AV_LOG_INFO, "\tCoeff0_0 = %f\n", (float)pico_config.coeff0_0/(float)(1 << pico_config.coeff_frac_bits)); -+ av_log(NULL, AV_LOG_INFO, "\tCoeff0_1 = %f\n", (float)pico_config.coeff0_1/(float)(1 << pico_config.coeff_frac_bits)); -+ av_log(NULL, AV_LOG_INFO, "\tCoeff0_2 = %f\n", (float)pico_config.coeff0_2/(float)(1 << pico_config.coeff_frac_bits)); -+ av_log(NULL, AV_LOG_INFO, "\tCoeff0_3 = %f\n", (float)pico_config.coeff0_3/(float)(1 << pico_config.offset_frac_bits)); -+ -+ av_log(NULL, AV_LOG_INFO, "\tCoeff1_0 = %f\n", (float)pico_config.coeff1_0/(float)(1 << pico_config.coeff_frac_bits)); -+ av_log(NULL, AV_LOG_INFO, "\tCoeff1_1 = %f\n", (float)pico_config.coeff1_1/(float)(1 << pico_config.coeff_frac_bits)); -+ av_log(NULL, AV_LOG_INFO, "\tCoeff1_2 = %f\n", (float)pico_config.coeff1_2/(float)(1 << pico_config.coeff_frac_bits)); -+ av_log(NULL, AV_LOG_INFO, "\tCoeff1_3 = %f\n", (float)pico_config.coeff1_3/(float)(1 << pico_config.offset_frac_bits)); -+ -+ av_log(NULL, AV_LOG_INFO, "\tCoeff2_0 = %f\n", (float)pico_config.coeff2_0/(float)(1 << pico_config.coeff_frac_bits)); -+ av_log(NULL, AV_LOG_INFO, "\tCoeff2_1 = %f\n", (float)pico_config.coeff2_1/(float)(1 << pico_config.coeff_frac_bits)); -+ av_log(NULL, AV_LOG_INFO, "\tCoeff2_2 = %f\n", (float)pico_config.coeff2_2/(float)(1 << pico_config.coeff_frac_bits)); -+ av_log(NULL, AV_LOG_INFO, "\tCoeff2_3 = %f\n", (float)pico_config.coeff2_3/(float)(1 << pico_config.offset_frac_bits)); -+} -+ -+ -+ -+#endif -+ -diff --git a/libavcodec/bitstream.h b/libavcodec/bitstream.h -index 26b4f8d..1f8fabf 100644 ---- a/libavcodec/bitstream.h -+++ b/libavcodec/bitstream.h -@@ -171,7 +171,7 @@ typedef struct RL_VLC_ELEM { - #endif - - /* used to avoid missaligned exceptions on some archs (alpha, ...) 
*/ --#if defined(ARCH_X86) || defined(ARCH_X86_64) -+#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_AVR32) - # define unaligned16(a) (*(const uint16_t*)(a)) - # define unaligned32(a) (*(const uint32_t*)(a)) - # define unaligned64(a) (*(const uint64_t*)(a)) -@@ -813,6 +813,44 @@ void free_vlc(VLC *vlc); - * if the vlc code is invalid and max_depth>1 than the number of bits removed - * is undefined - */ -+ -+#if defined(ARCH_AVR32) -+#define GET_VLC(code, name, gb, table, bits, max_depth)\ -+{\ -+ int n, index, nb_bits;\ -+ union { VLC_TYPE vlc[2];\ -+ uint32_t u32; } table_elem;\ -+\ -+ index= SHOW_UBITS(name, gb, bits);\ -+ table_elem.u32 = unaligned32(&table[index]); \ -+ code = table_elem.vlc[0];\ -+ n = table_elem.vlc[1];\ -+\ -+ if(max_depth > 1 && n < 0 ){\ -+ LAST_SKIP_BITS(name, gb, bits)\ -+ UPDATE_CACHE(name, gb)\ -+\ -+ nb_bits = -n;\ -+\ -+ index= SHOW_UBITS(name, gb, nb_bits) + code;\ -+ table_elem.u32 = unaligned32(&table[index]); \ -+ code = table_elem.vlc[0];\ -+ n = table_elem.vlc[1];\ -+ if(max_depth > 2 && n < 0){\ -+ LAST_SKIP_BITS(name, gb, nb_bits)\ -+ UPDATE_CACHE(name, gb)\ -+\ -+ nb_bits = -n;\ -+\ -+ index= SHOW_UBITS(name, gb, nb_bits) + code;\ -+ code = table[index][0];\ -+ n = table[index][1];\ -+ }\ -+ }\ -+ SKIP_BITS(name, gb, n)\ -+} -+ -+#else - #define GET_VLC(code, name, gb, table, bits, max_depth)\ - {\ - int n, index, nb_bits;\ -@@ -821,7 +859,7 @@ void free_vlc(VLC *vlc); - code = table[index][0];\ - n = table[index][1];\ - \ -- if(max_depth > 1 && n < 0){\ -+ if(max_depth > 1 && n < 0 ){\ - LAST_SKIP_BITS(name, gb, bits)\ - UPDATE_CACHE(name, gb)\ - \ -@@ -843,7 +881,38 @@ void free_vlc(VLC *vlc); - }\ - SKIP_BITS(name, gb, n)\ - } -+#endif - -+#if defined(ARCH_AVR32) -+#define GET_RL_VLC(level, run, name, gb, table, bits, max_depth, need_update)\ -+{\ -+ int n, index, nb_bits;\ -+ union { RL_VLC_ELEM vlc;\ -+ uint32_t u32; } table_elem;\ -+\ -+ index= SHOW_UBITS(name, gb, bits);\ -+ table_elem.u32 = 
unaligned32(&table[index]); \ -+ level = table_elem.vlc.level;\ -+ n = table_elem.vlc.len;\ -+\ -+ if(max_depth > 1 && n < 0 ){\ -+ SKIP_BITS(name, gb, bits)\ -+ if(need_update){\ -+ UPDATE_CACHE(name, gb)\ -+ }\ -+\ -+ nb_bits = -n;\ -+\ -+ index= SHOW_UBITS(name, gb, nb_bits) + level;\ -+ table_elem.u32 = unaligned32(&table[index]); \ -+ level = table_elem.vlc.level;\ -+ n = table_elem.vlc.len;\ -+ }\ -+ run= table_elem.vlc.run;\ -+ SKIP_BITS(name, gb, n)\ -+} -+ -+#else - #define GET_RL_VLC(level, run, name, gb, table, bits, max_depth, need_update)\ - {\ - int n, index, nb_bits;\ -@@ -852,7 +921,7 @@ void free_vlc(VLC *vlc); - level = table[index].level;\ - n = table[index].len;\ - \ -- if(max_depth > 1 && n < 0){\ -+ if(max_depth > 1 && n < 0 ){\ - SKIP_BITS(name, gb, bits)\ - if(need_update){\ - UPDATE_CACHE(name, gb)\ -@@ -867,7 +936,7 @@ void free_vlc(VLC *vlc); - run= table[index].run;\ - SKIP_BITS(name, gb, n)\ - } -- -+#endif - - /** - * parses a vlc code, faster then get_vlc() -diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c -index 56c42b9..8fc10c6 100644 ---- a/libavcodec/dsputil.c -+++ b/libavcodec/dsputil.c -@@ -4197,6 +4197,9 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx) - #ifdef ARCH_BFIN - dsputil_init_bfin(c,avctx); - #endif -+#ifdef ARCH_AVR32 -+ dsputil_init_avr32(c,avctx); -+#endif - - for(i=0; i<64; i++){ - if(!c->put_2tap_qpel_pixels_tab[0][i]) -diff --git a/libavcodec/h264.c b/libavcodec/h264.c -index 865e80a..8f7c3f1 100644 ---- a/libavcodec/h264.c -+++ b/libavcodec/h264.c -@@ -3258,7 +3258,12 @@ static void free_tables(H264Context *h){ - - static void init_dequant8_coeff_table(H264Context *h){ - int i,q,x; -+#ifdef ARCH_AVR32 -+ const int transpose = 0; -+#else - const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly -+#endif -+ - h->dequant8_coeff[0] = h->dequant8_buffer[0]; - h->dequant8_coeff[1] = h->dequant8_buffer[1]; - -@@ -3281,7 +3286,13 @@ static void 
init_dequant8_coeff_table(H264Context *h){ - - static void init_dequant4_coeff_table(H264Context *h){ - int i,j,q,x; -+ // Yes this is ugly as hell.... -+#ifdef ARCH_AVR32 -+ const int transpose = 0; -+#else - const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly -+#endif -+ - for(i=0; i<6; i++ ){ - h->dequant4_coeff[i] = h->dequant4_buffer[i]; - for(j=0; jdsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly -+#endif - memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t)); - memcpy(h-> field_scan, field_scan, 16*sizeof(uint8_t)); - }else{ -diff --git a/libavutil/common.h b/libavutil/common.h -index 3ae5971..7e52b90 100644 ---- a/libavutil/common.h -+++ b/libavutil/common.h -@@ -283,23 +283,39 @@ static inline int mid_pred(int a, int b, int c) - * @param amax maximum value of the clip range - * @return cliped value - */ -+#if defined(ARCH_AVR32) -+#define clip(a, amin, amax) \ -+ ({ int __tmp__; \ -+ asm ("min\t%0, %1, %2\n" \ -+ "max\t%0, %0, %3\n" \ -+ : "=&r"(__tmp__) : "r"(a), "r"(amax), "r"(amin)); \ -+ __tmp__; }) -+#else - static inline int clip(int a, int amin, int amax) - { - if (a < amin) return amin; - else if (a > amax) return amax; - else return a; - } -+#endif - - /** - * clip a signed integer value into the 0-255 range - * @param a value to clip - * @return cliped value - */ -+#if defined(ARCH_AVR32) -+#define clip_uint8(a) \ -+ ({ int __tmp__ = a; \ -+ asm ("satu\t%0 >> 0, 8" : "+r"(__tmp__)); \ -+ __tmp__; }) -+#else - static inline uint8_t clip_uint8(int a) - { - if (a&(~255)) return (-a)>>31; - else return a; - } -+#endif - - /* math */ - int64_t ff_gcd(int64_t a, int64_t b); -diff --git a/libavutil/internal.h b/libavutil/internal.h -index 285d304..a8b0718 100644 ---- a/libavutil/internal.h -+++ b/libavutil/internal.h -@@ -210,6 +210,15 @@ if((y)<(x)){\ - }\ - } - -+/* XXX: Hack for uclibc which declares lrintf but does not implement it... 
*/ -+#ifdef ARCH_AVR32 -+#undef HAVE_LRINTF -+#define HAVE_LRINTF 1 -+#define lrintf(x) rint(x) -+#define llrint(x) (long long)rint(x) -+#endif -+ -+ - #ifndef HAVE_LRINTF - /* XXX: add ISOC specific test to avoid specific BSD testing. */ - /* better than nothing implementation. */ -diff --git a/libfaad2/common.h b/libfaad2/common.h -index f809042..6c5fb21 100644 ---- a/libfaad2/common.h -+++ b/libfaad2/common.h -@@ -67,7 +67,7 @@ extern "C" { - /* Use if target platform has address generators with autoincrement */ - //#define PREFER_POINTERS - --#if defined(_WIN32_WCE) || defined(__arm__) -+#if defined(_WIN32_WCE) || defined(__arm__) || defined(__avr32__) - #define FIXED_POINT - #endif - -diff --git a/libmpcodecs/ad_libmad.c b/libmpcodecs/ad_libmad.c -index 076359a..51b77fe 100644 ---- a/libmpcodecs/ad_libmad.c -+++ b/libmpcodecs/ad_libmad.c -@@ -86,6 +86,11 @@ static int init(sh_audio_t *sh){ - sh->channels=(this->frame.header.mode == MAD_MODE_SINGLE_CHANNEL) ? 1 : 2; - sh->samplerate=this->frame.header.samplerate; - sh->i_bps=this->frame.header.bitrate/8; -+#ifdef WORDS_BIGENDIAN -+ sh->sample_format = AF_FORMAT_S16_BE; -+#else -+ sh->sample_format = AF_FORMAT_S16_LE; -+#endif - sh->samplesize=2; - - return 1; -diff --git a/libswscale/pico-avr32.h b/libswscale/pico-avr32.h -new file mode 100644 -index 0000000..7ac6200 ---- /dev/null -+++ b/libswscale/pico-avr32.h -@@ -0,0 +1,137 @@ -+/* -+ * Copyright (c) 2007 Atmel Corporation. All rights reserved. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * -+ * 2. 
Redistributions in binary form must reproduce the above -+ * copyright notice, this list of conditions and the following -+ * disclaimer in the documentation and/or other materials provided -+ * with the distribution. -+ * -+ * 3. The name of ATMEL may not be used to endorse or promote products -+ * derived from this software without specific prior written -+ * permission. -+ * -+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR -+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL -+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, -+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH -+ * DAMAGE. 
-+ */ -+#ifndef __PICO_H__ -+#define __PICO_H__ -+ -+/* Coprocessor Number */ -+#define PICO_CPNO 1 -+ -+/* Pixel Coprocessor Register file */ -+#define PICO_REGVECT_INPIX2 cr0 -+#define PICO_REGVECT_INPIX1 cr1 -+#define PICO_REGVECT_INPIX0 cr2 -+#define PICO_REGVECT_OUTPIX2 cr3 -+#define PICO_REGVECT_OUTPIX1 cr4 -+#define PICO_REGVECT_OUTPIX0 cr5 -+#define PICO_REGVECT_COEFF0_A cr6 -+#define PICO_REGVECT_COEFF0_B cr7 -+#define PICO_REGVECT_COEFF1_A cr8 -+#define PICO_REGVECT_COEFF1_B cr9 -+#define PICO_REGVECT_COEFF2_A cr10 -+#define PICO_REGVECT_COEFF2_B cr11 -+#define PICO_REGVECT_VMU0_OUT cr12 -+#define PICO_REGVECT_VMU1_OUT cr13 -+#define PICO_REGVECT_VMU2_OUT cr14 -+#define PICO_REGVECT_CONFIG cr15 -+ -+#define PICO_INPIX2 0 -+#define PICO_INPIX1 1 -+#define PICO_INPIX0 2 -+#define PICO_OUTPIX2 3 -+#define PICO_OUTPIX1 4 -+#define PICO_OUTPIX0 5 -+#define PICO_COEFF0_A 6 -+#define PICO_COEFF0_B 7 -+#define PICO_COEFF1_A 8 -+#define PICO_COEFF1_B 9 -+#define PICO_COEFF2_A 10 -+#define PICO_COEFF2_B 11 -+#define PICO_VMU0_OUT 12 -+#define PICO_VMU1_OUT 13 -+#define PICO_VMU2_OUT 14 -+#define PICO_CONFIG 15 -+ -+/* Config Register */ -+#define PICO_COEFF_FRAC_BITS 0 -+#define PICO_COEFF_FRAC_BITS_WIDTH 4 -+#define PICO_OFFSET_FRAC_BITS 4 -+#define PICO_OFFSET_FRAC_BITS_WIDTH 4 -+#define PICO_INPUT_MODE 8 -+#define PICO_INPUT_MODE_WIDTH 2 -+#define PICO_OUTPUT_MODE 10 -+ -+#define PICO_TRANSFORMATION_MODE 0 -+#define PICO_HOR_FILTER_MODE 1 -+#define PICO_VERT_FILTER_MODE 2 -+ -+#define PICO_PLANAR_MODE 1 -+#define PICO_PACKED_MODE 0 -+ -+/* Bits in coefficients */ -+#define PICO_COEFF_BITS 12 -+ -+/* Operation bits */ -+#define PICO_USE_ACC (1 << 2) -+#define PICO_SINGLE_VECTOR (1 << 3) -+ -+ -+#define __str(x...) #x -+#define __xstr(x...) 
__str(x) -+ -+#define PICO_PUT_W(pico_reg, x) \ -+ __builtin_mvrc_w(PICO_CPNO, pico_reg, x); -+#define PICO_GET_W(pico_reg) \ -+ __builtin_mvcr_w(PICO_CPNO, pico_reg) -+ -+#define PICO_PUT_D(pico_reg, x) \ -+ __builtin_mvrc_d(PICO_CPNO, pico_reg, x); -+#define PICO_GET_D(pico_reg) \ -+ __builtin_mvcr_d(PICO_CPNO, pico_reg) -+ -+ -+#define PICO_STCM_W(ptr, pico_regs...) \ -+ asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr)); -+#define PICO_STCM_D(ptr, pico_regs...) \ -+ asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr)); -+ -+#define PICO_STCM_W_DEC(ptr, pico_regs...) \ -+ asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr)); -+#define PICO_STCM_D_DEC(ptr, pico_regs...) \ -+ asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr)); -+ -+#define PICO_LDCM_W(ptr, pico_regs...) \ -+ asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr)); -+#define PICO_LDCM_D(ptr, pico_regs...) \ -+ asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr)); -+ -+#define PICO_LDCM_W_INC(ptr, pico_regs...) \ -+ asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr)); -+#define PICO_LDCM_D_INC(ptr, pico_regs...) 
\ -+ asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr)); -+ -+#define PICO_OP(op, dst_addr, addr0, addr1, addr2) \ -+ __builtin_cop(PICO_CPNO, addr0, addr1, addr2, op | dst_addr); -+ -+ -+#endif -+ -diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h -index ecd28f5..3221d0c 100644 ---- a/libswscale/swscale_internal.h -+++ b/libswscale/swscale_internal.h -@@ -173,7 +173,7 @@ typedef struct SwsContext{ - SwsFunc yuv2rgb_get_func_ptr (SwsContext *c); - int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation); - --char *sws_format_name(int format); -+char *sws_format_name(enum PixelFormat format); - - //FIXME replace this with something faster - #define isPlanarYUV(x) ((x)==PIX_FMT_YUV410P || (x)==PIX_FMT_YUV420P \ -diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c -index 71759bc..fa83985 100644 ---- a/libswscale/yuv2rgb.c -+++ b/libswscale/yuv2rgb.c -@@ -44,6 +44,10 @@ - #include "yuv2rgb_mlib.c" - #endif - -+#ifdef ARCH_AVR32 -+#include "yuv2rgb_avr32.c" -+#endif -+ - #define DITHER1XBPP // only for mmx - - const uint8_t __attribute__((aligned(8))) dither_2x2_4[2][8]={ -@@ -601,6 +605,12 @@ SwsFunc yuv2rgb_get_func_ptr (SwsContext *c) - if(t) return t; - } - #endif -+#ifdef ARCH_AVR32 -+ { -+ SwsFunc t= yuv2rgb_init_avr32(c); -+ if(t) return t; -+ } -+#endif - #ifdef HAVE_ALTIVEC - if (c->flags & SWS_CPU_CAPS_ALTIVEC) - { -@@ -678,6 +688,10 @@ int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, - //printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv); - oy -= 256*brightness; - -+#ifdef ARCH_AVR32 -+ yuv2rgb_c_init_tables_avr32 (c, inv_table, fullRange, brightness, contrast, saturation); -+#endif -+ - for (i = 0; i < 1024; i++) { - int j; - -diff --git a/libswscale/yuv2rgb_avr32.c b/libswscale/yuv2rgb_avr32.c -new file mode 100644 -index 0000000..4a8341e ---- /dev/null -+++ 
b/libswscale/yuv2rgb_avr32.c -@@ -0,0 +1,416 @@ -+/* -+ * Copyright (c) 2007 Atmel Corporation. All rights reserved. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * -+ * 2. Redistributions in binary form must reproduce the above -+ * copyright notice, this list of conditions and the following -+ * disclaimer in the documentation and/or other materials provided -+ * with the distribution. -+ * -+ * 3. The name of ATMEL may not be used to endorse or promote products -+ * derived from this software without specific prior written -+ * permission. -+ * -+ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR -+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL -+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, -+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH -+ * DAMAGE. 
-+ */ -+#include "pico-avr32.h" -+ -+ -+#define RGB(uv_part) \ -+ __asm__ volatile ( \ -+ "ld.w\t%0, %3[%7:" uv_part " << 2]\n\t" /* tmp = c->table_gV[V] */ \ -+ "ld.w\t%1, %4[%8:" uv_part " << 2]\n\t" /* g = c->table_gU[U] */ \ -+ "ld.w\t%2, %5[%8:" uv_part " << 2]\n\t" /* b = c->table_bU[U] */ \ -+ "add\t%1, %0\n\t" /* g += tmp */\ -+ "ld.w\t%0, %6[%7:" uv_part " << 2]" /* r = c->table_rV[V] */ \ -+ : "=&r" (r), "=&r" (g), "=&r" (b) \ -+ : "r" (&c->table_gV[0]), "r" (&c->table_gU[0]),"r" (&c->table_bU[0]), \ -+ "r" (&c->table_rV[0]), "r" (V), "r" (U)); -+ -+ -+#undef YUV2RGB1 -+#define YUV2RGB1(dst, src, y, idx) \ -+ { int tmp2; __asm__ volatile ( \ -+ "ld.ub\t%0, %3[2*%8]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \ -+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 24) & 0xFF] */ \ -+ "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 24) & 0xFF] */ \ -+ "st.b\t%7[6*%8 + 0], %1\n\t" /* dst_1[2] = tmp; */ \ -+ "st.b\t%7[6*%8 + 1], %2\n\t" /* dst_1[1] = tmp; */ \ -+ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 24) & 0xFF] */ \ -+ "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \ -+ "st.b\t%7[6*%8 + 2], %1\n\t" /* dst_1[0] = tmp; */ \ -+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 16) & 0xFF] */ \ -+ "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 16) & 0xFF] */ \ -+ "st.b\t%7[6*%8 + 3], %1\n\t" /* dst_1[5] = tmp; */ \ -+ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 16) & 0xFF] */ \ -+ "st.b\t%7[6*%8 + 4], %2\n\t" /* dst_1[4] = tmp; */ \ -+ "st.b\t%7[6*%8 + 5], %1" /* dst_1[3] = tmp; */ \ -+ : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \ -+ : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); } -+ -+#undef YUV2RGB2 -+#define YUV2RGB2(dst, src, y, idx) \ -+ { int tmp2; __asm__ volatile ( \ -+ "ld.ub\t%0, %3[2*%8]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \ -+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 24) & 0xFF] */ \ -+ "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 24) & 0xFF] */ \ -+ "st.b\t%7[6*%8 + 0], %1\n\t" /* dst_1[2] = tmp; */ \ -+ "st.b\t%7[6*%8 + 1], %2\n\t" /* dst_1[1] = tmp; */ \ -+ 
"ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 24) & 0xFF] */ \ -+ "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \ -+ "st.b\t%7[6*%8 + 2], %1\n\t" /* dst_1[0] = tmp; */ \ -+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 16) & 0xFF] */ \ -+ "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 16) & 0xFF] */ \ -+ "st.b\t%7[6*%8 + 3], %1\n\t" /* dst_1[5] = tmp; */ \ -+ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 16) & 0xFF] */ \ -+ "st.b\t%7[6*%8 + 4], %2\n\t" /* dst_1[4] = tmp; */ \ -+ "st.b\t%7[6*%8 + 5], %1" /* dst_1[3] = tmp; */ \ -+ : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \ -+ : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); } -+ -+ -+#undef YUV2BGR1 -+#define YUV2BGR1(dst, src, y, idx) \ -+ { int tmp2; __asm__ volatile ( \ -+ "ld.ub\t%0, %3[2*%8]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \ -+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 24) & 0xFF] */ \ -+ "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 24) & 0xFF] */ \ -+ "st.b\t%7[6*%8 + 2], %1\n\t" /* dst_1[2] = tmp; */ \ -+ "st.b\t%7[6*%8 + 1], %2\n\t" /* dst_1[1] = tmp; */ \ -+ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 24) & 0xFF] */ \ -+ "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \ -+ "st.b\t%7[6*%8 + 0], %1\n\t" /* dst_1[0] = tmp; */ \ -+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 16) & 0xFF] */ \ -+ "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 16) & 0xFF] */ \ -+ "st.b\t%7[6*%8 + 5], %1\n\t" /* dst_1[5] = tmp; */ \ -+ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 16) & 0xFF] */ \ -+ "st.b\t%7[6*%8 + 4], %2\n\t" /* dst_1[4] = tmp; */ \ -+ "st.b\t%7[6*%8 + 3], %1" /* dst_1[3] = tmp; */ \ -+ : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \ -+ : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); } -+ -+#undef YUV2BGR2 -+#define YUV2BGR2(dst, src, y, idx) \ -+ { int tmp2; __asm__ volatile ( \ -+ "ld.ub\t%0, %3[2*%8]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \ -+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 24) & 0xFF] */ \ -+ "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 24) & 0xFF] */ \ -+ "st.b\t%7[6*%8 + 2], %1\n\t" /* dst_1[2] 
= tmp; */ \ -+ "st.b\t%7[6*%8 + 1], %2\n\t" /* dst_1[1] = tmp; */ \ -+ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 24) & 0xFF] */ \ -+ "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* Y = ((uint32_t*)py_1)[0] */ \ -+ "st.b\t%7[6*%8 + 0], %1\n\t" /* dst_1[0] = tmp; */ \ -+ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[(Y >> 16) & 0xFF] */ \ -+ "ld.ub\t%2, %5[%0]\n\t" /* tmp = g[(Y >> 16) & 0xFF] */ \ -+ "st.b\t%7[6*%8 + 5], %1\n\t" /* dst_1[5] = tmp; */ \ -+ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[(Y >> 16) & 0xFF] */ \ -+ "st.b\t%7[6*%8 + 4], %2\n\t" /* dst_1[4] = tmp; */ \ -+ "st.b\t%7[6*%8 + 3], %1" /* dst_1[3] = tmp; */ \ -+ : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \ -+ : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); } -+ -+ -+ -+int yuv2bgr24_avr32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, -+ int srcSliceH, uint8_t* dst[], int dstStride[]){ -+ int y; -+ -+ if(c->srcFormat == PIX_FMT_YUV422P){ -+ srcStride[1] *= 2; -+ srcStride[2] *= 2; -+ } -+ -+ -+ for(y=0; y>1)*srcStride[1]; -+ uint8_t *pv= src[2] + (y>>1)*srcStride[2]; -+ unsigned int h_size= c->dstW>>3; -+ while (h_size--) { -+ uint32_t U, V, Y1, Y2, tmp; -+ U = ((uint32_t*)pu)[0]; -+ V = ((uint32_t*)pv)[0]; -+ -+ RGB("t") -+ YUV2BGR1(dst_1, py_1, Y1, 0) -+ YUV2BGR1(dst_2, py_2, Y2, 0) -+ -+ RGB("u") -+ YUV2BGR2(dst_1, py_1, Y1, 1) -+ YUV2BGR2(dst_2, py_2, Y2, 1) -+ -+ RGB("l") -+ YUV2BGR1(dst_1, py_1, Y1, 2) -+ YUV2BGR1(dst_2, py_2, Y2, 2) -+ -+ RGB("b") -+ YUV2BGR2(dst_1, py_1, Y1, 3) -+ YUV2BGR2(dst_2, py_2, Y2, 3) -+ -+ -+ -+ pu += 4; -+ pv += 4; -+ py_1 += 8; -+ py_2 += 8; -+ dst_1 += 24; -+ dst_2 += 24; -+ } -+ } -+ return srcSliceH; -+} -+ -+ -+ -+static int yuv2rgb24_avr32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, -+ int srcSliceH, uint8_t* dst[], int dstStride[]){ -+ int y; -+ -+ if(c->srcFormat == PIX_FMT_YUV422P){ -+ srcStride[1] *= 2; -+ srcStride[2] *= 2; -+ } -+ for(y=0; y>1)*srcStride[1]; -+ uint8_t *pv= src[2] + (y>>1)*srcStride[2]; -+ unsigned int h_size= c->dstW>>3; -+ 
while (h_size--) { -+ uint32_t U, V, Y1, Y2, tmp; -+ U = ((uint32_t*)pu)[0]; -+ V = ((uint32_t*)pv)[0]; -+ -+ RGB("t") -+ YUV2RGB1(dst_1, py_1, Y1, 0) -+ YUV2RGB1(dst_2, py_2, Y2, 0) -+ -+ RGB("u") -+ YUV2RGB2(dst_1, py_1, Y1, 1) -+ YUV2RGB2(dst_2, py_2, Y2, 1) -+ -+ RGB("l") -+ YUV2RGB1(dst_1, py_1, Y1, 2) -+ YUV2RGB1(dst_2, py_2, Y2, 2) -+ -+ RGB("b") -+ YUV2RGB2(dst_1, py_1, Y1, 3) -+ YUV2RGB2(dst_2, py_2, Y2, 3) -+ -+ pu += 4; -+ pv += 4; -+ py_1 += 8; -+ py_2 += 8; -+ dst_1 += 24; -+ dst_2 += 24; -+ } -+ } -+ return srcSliceH; -+} -+ -+#define SCALE(x, bits) (((x) + ( 1 << (bits - 1))) >> bits) -+#define COEFF_FRAC_BITS 9 -+#define OFFSET_FRAC_BITS 2 -+ -+/* Coefficients used in the pico */ -+static struct { -+ short coeff2_2; -+ short coeff2_3; -+ short coeff2_0; -+ short coeff2_1; -+ short coeff1_2; -+ short coeff1_3; -+ short coeff1_0; -+ short coeff1_1; -+ short coeff0_2; -+ short coeff0_3; -+ short coeff0_0; -+ short coeff0_1; -+} pico_coeff; -+ -+ -+static int yuv2bgr24_avr32_pico(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, -+ int srcSliceH, uint8_t* dst[], int dstStride[]){ -+ int y; -+ static int first_time = 1; -+ -+ /* Initialize pico */ -+ PICO_LDCM_D(&pico_coeff, -+ PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B, -+ PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B, -+ PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B); -+ -+ PICO_PUT_W(PICO_CONFIG, -+ (PICO_PACKED_MODE << PICO_OUTPUT_MODE -+ | PICO_TRANSFORMATION_MODE << PICO_INPUT_MODE -+ | OFFSET_FRAC_BITS << PICO_OFFSET_FRAC_BITS -+ | COEFF_FRAC_BITS << PICO_COEFF_FRAC_BITS)); -+ -+ -+ if(c->srcFormat == PIX_FMT_YUV422P){ -+ srcStride[1] *= 2; -+ srcStride[2] *= 2; -+ } -+ -+ for(y=0; y>1)*srcStride[1]; -+ uint8_t *pv= src[2] + (y>>1)*srcStride[2]; -+ unsigned int h_size= c->dstW>>3; -+ int *py_1_int = (int *)py_1; -+ int *py_2_int = (int *)py_2; -+ int *pu_int = (int *)pu; -+ int *pv_int = (int *)pv; -+ while (h_size--) { -+ PICO_PUT_W(PICO_INPIX0, *py_1_int++); -+ 
PICO_PUT_W(PICO_INPIX1, *pu_int++); -+ PICO_PUT_W(PICO_INPIX2, *pv_int++); -+ PICO_OP(0, 0, 0, 4, 8); -+ PICO_OP(0, 1, 1, 4, 8); -+ PICO_OP(0, 2, 2, 5, 9); -+ PICO_OP(0, 3, 3, 5, 9); -+ PICO_PUT_W(PICO_INPIX0, *py_1_int++); -+ PICO_STCM_W(dst_1, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0); -+ PICO_OP(0, 0, 0, 6, 10); -+ PICO_OP(0, 1, 1, 6, 10); -+ PICO_OP(0, 2, 2, 7, 11); -+ PICO_OP(0, 3, 3, 7, 11); -+ PICO_PUT_W(PICO_INPIX0, *py_2_int++); -+ PICO_STCM_W(dst_1 + 12, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0); -+ -+ PICO_OP(0, 0, 0, 4, 8); -+ PICO_OP(0, 1, 1, 4, 8); -+ PICO_OP(0, 2, 2, 5, 9); -+ PICO_OP(0, 3, 3, 5, 9); -+ PICO_PUT_W(PICO_INPIX0, *py_2_int++); -+ PICO_STCM_W(dst_2, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0); -+ PICO_OP(0, 0, 0, 6, 10); -+ PICO_OP(0, 1, 1, 6, 10); -+ PICO_OP(0, 2, 2, 7, 11); -+ PICO_OP(0, 3, 3, 7, 11); -+ PICO_STCM_W(dst_2 + 12, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0); -+ -+ dst_1 += 24; -+ dst_2 += 24; -+ } -+ } -+ return srcSliceH; -+} -+ -+extern int avr32_use_pico; -+ -+SwsFunc yuv2rgb_init_avr32 (SwsContext *c){ -+ switch(c->dstFormat){ -+ case PIX_FMT_BGR24: -+ { -+ if ( avr32_use_pico ){ -+ MSG_ERR("AVR32 BGR24: Using PICO for color space conversion\n"); -+ return yuv2bgr24_avr32_pico; -+ } else { -+ MSG_ERR("AVR32 BGR24: Using optimized color space conversion\n"); -+ return yuv2bgr24_avr32; -+ } -+ } -+ break; -+ case PIX_FMT_RGB24: -+ { -+ if ( avr32_use_pico ){ -+ MSG_ERR("AVR32 RGB24: Using PICO for color space conversion\n"); -+ return yuv2bgr24_avr32_pico; -+ } else { -+ MSG_ERR("AVR32 RGB24: Using optimized color space conversion\n"); -+ return yuv2rgb24_avr32; -+ } -+ } -+ } -+ return NULL; -+} -+ -+ -+int yuv2rgb_c_init_tables_avr32 (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation){ -+ const int isRgb = (c->dstFormat == PIX_FMT_RGB24); -+ -+ int64_t crv = inv_table[0]; -+ 
int64_t cbu = inv_table[1]; -+ int64_t cgu = -inv_table[2]; -+ int64_t cgv = -inv_table[3]; -+ int64_t cy = 1<<16; -+ int64_t oy = 0; -+ -+ if(!fullRange){ -+ cy= (cy*255) / 219; -+ oy= 16<<16; -+ } -+ -+ cy = (cy *contrast )>>16; -+ crv= (crv*contrast * saturation)>>32; -+ cbu= (cbu*contrast * saturation)>>32; -+ cgu= (cgu*contrast * saturation)>>32; -+ cgv= (cgv*contrast * saturation)>>32; -+ -+ oy -= 256*brightness; -+ -+ pico_coeff.coeff1_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* G <- Y */ -+ pico_coeff.coeff1_1 = SCALE(cgu, 16 - COEFF_FRAC_BITS); /* G <- U */ -+ pico_coeff.coeff1_2 = SCALE(cgv, 16 - COEFF_FRAC_BITS); /* G <- V */ -+ pico_coeff.coeff1_3 = (SCALE(-128*cgu - 128*cgv - 16*cy, 16 - OFFSET_FRAC_BITS) -+ + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* G offset */ -+ -+ if ( isRgb ){ -+ pico_coeff.coeff0_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* R <- Y */ -+ pico_coeff.coeff0_1 = 0; /* R <- U */ -+ pico_coeff.coeff0_2 = SCALE(crv, 16 - COEFF_FRAC_BITS); /* R <- V */ -+ pico_coeff.coeff0_3 = (SCALE(-128*crv - 16*cy, 16 - OFFSET_FRAC_BITS) -+ + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* R offset */ -+ -+ pico_coeff.coeff2_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* B <- Y */ -+ pico_coeff.coeff2_1 = SCALE(cbu, 16 - COEFF_FRAC_BITS); /* B <- U */ -+ pico_coeff.coeff2_2 = 0; /* B <- V */ -+ pico_coeff.coeff2_3 = (SCALE(-128*cbu - 16*cy, 16 - OFFSET_FRAC_BITS) -+ + /*0.5*/(1 << (OFFSET_FRAC_BITS-1)));/* B offset */ -+ } else { -+ pico_coeff.coeff2_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* R <- Y */ -+ pico_coeff.coeff2_1 = 0; /* R <- U */ -+ pico_coeff.coeff2_2 = SCALE(crv, 16 - COEFF_FRAC_BITS); /* R <- V */ -+ pico_coeff.coeff2_3 = (SCALE(-128*crv - 16*cy, 16 - OFFSET_FRAC_BITS) -+ + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* R offset */ -+ -+ pico_coeff.coeff0_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* B <- Y */ -+ pico_coeff.coeff0_1 = SCALE(cbu, 16 - COEFF_FRAC_BITS); /* B <- U */ -+ pico_coeff.coeff0_2 = 0; /* B <- V */ -+ pico_coeff.coeff0_3 = (SCALE(-128*cbu - 16*cy, 16 - 
OFFSET_FRAC_BITS) -+ + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* B offset */ -+ } -+ -+} -+ -+ -+#undef RGB -diff --git a/libvo/vo_fbdev2.c b/libvo/vo_fbdev2.c -index 053c193..7017770 100644 ---- a/libvo/vo_fbdev2.c -+++ b/libvo/vo_fbdev2.c -@@ -22,6 +22,9 @@ - #include "sub.h" - #include "mp_msg.h" - -+/* Draw directly to framebuffer */ -+#define USE_CONVERT2FB -+ - static vo_info_t info = { - "Framebuffer Device", - "fbdev2", -@@ -178,6 +181,15 @@ static int fb_preinit(int reset) - } - fb_orig_vinfo = fb_vinfo; - -+ /* Reset panning offset */ -+ fb_vinfo.yoffset = 0; -+ if (ioctl(fb_dev_fd, FBIOPAN_DISPLAY, &fb_vinfo)) { -+ mp_msg(MSGT_VO, MSGL_ERR, -+ "[fbdev2] FBIOPAN_DISPLAY failed: %s\n", -+ strerror(errno)); -+ return 0; -+ } -+ - fb_bpp = fb_vinfo.bits_per_pixel; - - /* 16 and 15 bpp is reported as 16 bpp */ -@@ -289,6 +301,10 @@ static int config(uint32_t width, uint32_t height, uint32_t d_width, - mp_msg(MSGT_VO, MSGL_ERR, "[fbdev2] Can't malloc next_frame: %s\n", strerror(errno)); - return 1; - } -+#else -+ if ((fb_line_len * fb_vinfo.yres) <= (fb_finfo.smem_len / 2) -+ && fb_vinfo.yoffset == 0) -+ center += fb_line_len * fb_vinfo.yres; - #endif - if (fs) memset(frame_buffer, '\0', fb_line_len * fb_vinfo.yres); - -@@ -299,14 +315,22 @@ static int query_format(uint32_t format) - { - // open the device, etc. 
- if (fb_preinit(0)) return 0; -- if ((format & IMGFMT_BGR_MASK) == IMGFMT_BGR) { -+ if ((format & IMGFMT_RGB_MASK) == IMGFMT_RGB) { - int fb_target_bpp = format & 0xff; - set_bpp(&fb_vinfo, fb_target_bpp); - fb_vinfo.xres_virtual = fb_vinfo.xres; -- fb_vinfo.yres_virtual = fb_vinfo.yres; -+ fb_vinfo.yres_virtual = fb_vinfo.yres * 2; - if (ioctl(fb_dev_fd, FBIOPUT_VSCREENINFO, &fb_vinfo)) { -- mp_msg(MSGT_VO, MSGL_ERR, "[fbdev2] Can't put VSCREENINFO: %s\n", strerror(errno)); -- return 0; -+ mp_msg(MSGT_VO, MSGL_WARN, -+ "[fbdev2] Can't double virtual y resolution: %s\n", -+ strerror(errno)); -+ fb_vinfo.yres_virtual = fb_vinfo.yres; -+ if (ioctl(fb_dev_fd, FBIOPUT_VSCREENINFO, &fb_vinfo)) { -+ mp_msg(MSGT_VO, MSGL_ERR, -+ "[fbdev2] Can't put VSCREENINFO: %s\n", -+ strerror(errno)); -+ return -1; -+ } - } - fb_pixel_size = fb_vinfo.bits_per_pixel / 8; - fb_bpp = fb_vinfo.red.length + fb_vinfo.green.length + -@@ -367,16 +391,67 @@ static void check_events(void) - - static void flip_page(void) - { --#ifndef USE_CONVERT2FB - int i, out_offset = 0, in_offset = 0; - -- for (i = 0; i < in_height; i++) { -- memcpy(center + out_offset, next_frame + in_offset, -- in_width * fb_pixel_size); -- out_offset += fb_line_len; -- in_offset += in_width * fb_pixel_size; -- } -+#ifndef USE_CONVERT2FB -+ if (1) { -+#else -+ if (fb_vinfo.yres_virtual == fb_vinfo.yres) { - #endif -+ for (i = 0; i < in_height; i++) { -+ memcpy(center + out_offset, next_frame + in_offset, -+ in_width * fb_pixel_size); -+ out_offset += fb_line_len; -+ in_offset += in_width * fb_pixel_size; -+ } -+ } else { -+ if (fb_vinfo.yoffset == 0) { -+ fb_vinfo.yoffset += fb_vinfo.yres; -+ center -= fb_line_len * fb_vinfo.yres; -+ } else { -+ fb_vinfo.yoffset = 0; -+ center += fb_line_len * fb_vinfo.yres; -+ } -+ -+ if (ioctl(fb_dev_fd, FBIOPAN_DISPLAY, &fb_vinfo)) { -+ mp_msg(MSGT_VO, MSGL_ERR, -+ "[fbdev2] Can't FBIOPAN_DISPLAY: %s\n", -+ strerror(errno)); -+ } -+ } -+} -+ -+static uint32_t get_image(mp_image_t *mpi) 
-+{ -+ if(mpi->flags&MP_IMGFLAG_READABLE) -+ return VO_FALSE; // slow video ram -+ if(mpi->type==MP_IMGTYPE_STATIC) -+ return VO_FALSE; // it is not static -+ -+ if (mpi->flags & (MP_IMGFLAG_ACCEPT_STRIDE | MP_IMGFLAG_ACCEPT_WIDTH)) { -+ // we're lucky or codec accepts stride => ok, let's go! -+ -+ //YUY2 and RGB formats -+ mpi->planes[0] = center; -+ mpi->width = in_width; -+ mpi->stride[0] = fb_line_len; -+ -+ // center image -+ -+ mpi->flags |= MP_IMGFLAG_DIRECT; -+ -+ return VO_TRUE; -+ } -+ -+ return VO_FALSE; -+} -+ -+static uint32_t put_image(mp_image_t *mpi) -+{ -+ // already out? -+ if ((mpi->flags & (MP_IMGFLAG_DIRECT | MP_IMGFLAG_DRAW_CALLBACK))) -+ return VO_TRUE; -+ return VO_FALSE; - } - - static void uninit(void) -@@ -403,6 +478,10 @@ static int control(uint32_t request, void *data, ...) - switch (request) { - case VOCTRL_QUERY_FORMAT: - return query_format(*((uint32_t*)data)); -+ case VOCTRL_GET_IMAGE: -+ return get_image(data); -+ case VOCTRL_DRAW_IMAGE: -+ return put_image(data); - } - return VO_NOTIMPL; - } -diff --git a/version.sh b/version.sh -index 44b5c5d..cf22a68 100755 ---- a/version.sh -+++ b/version.sh -@@ -1,2 +1,2 @@ - #!/bin/sh --echo "#define VERSION \"1.0rc1-$1\"" > version.h -+echo "#define VERSION \"1.0rc1.atmel.2-$1\"" > version.h diff --git a/package/mplayer/mplayer-1.0rc1-atmel.3.patch b/package/mplayer/mplayer-1.0rc1-atmel.3.patch new file mode 100644 index 000000000..800f43e8e --- /dev/null +++ b/package/mplayer/mplayer-1.0rc1-atmel.3.patch @@ -0,0 +1,6444 @@ + cfg-common.h | 4 + + cfg-mencoder.h | 4 + + cfg-mplayer.h | 4 + + configure | 13 +- + libaf/af_format.c | 7 + + libavcodec/Makefile | 7 + + libavcodec/avr32/dsputil_avr32.c | 2678 ++++++++++++++++++++++++++++++++++++++ + libavcodec/avr32/fdct.S | 541 ++++++++ + libavcodec/avr32/h264idct.S | 451 +++++++ + libavcodec/avr32/idct.S | 829 ++++++++++++ + libavcodec/avr32/mc.S | 434 ++++++ + libavcodec/avr32/pico.h | 260 ++++ + libavcodec/bitstream.h | 77 +- + 
libavcodec/dsputil.c | 3 + + libavcodec/h264.c | 15 + + libavutil/common.h | 16 + + libavutil/internal.h | 9 + + libfaad2/common.h | 2 +- + libmpcodecs/ad_libmad.c | 5 + + libswscale/pico-avr32.h | 137 ++ + libswscale/swscale_internal.h | 2 +- + libswscale/yuv2rgb.c | 14 + + libswscale/yuv2rgb_avr32.c | 416 ++++++ + libvo/vo_fbdev2.c | 101 ++- + version.sh | 2 +- + 25 files changed, 6011 insertions(+), 20 deletions(-) + create mode 100644 libavcodec/avr32/dsputil_avr32.c + create mode 100644 libavcodec/avr32/fdct.S + create mode 100644 libavcodec/avr32/h264idct.S + create mode 100644 libavcodec/avr32/idct.S + create mode 100644 libavcodec/avr32/mc.S + create mode 100644 libavcodec/avr32/pico.h + create mode 100644 libswscale/pico-avr32.h + create mode 100644 libswscale/yuv2rgb_avr32.c + +diff --git a/cfg-common.h b/cfg-common.h +index 780df38..7d878a8 100644 +--- a/cfg-common.h ++++ b/cfg-common.h +@@ -235,6 +235,10 @@ + {"tsprobe", &ts_probe, CONF_TYPE_POSITION, 0, 0, TS_MAX_PROBE_SIZE, NULL}, + {"tskeepbroken", &ts_keep_broken, CONF_TYPE_FLAG, 0, 0, 1, NULL}, + ++#ifdef ARCH_AVR32 ++ {"use-pico", &avr32_use_pico, CONF_TYPE_FLAG, 0, 0, 1, NULL}, ++ {"nouse-pico", &avr32_use_pico, CONF_TYPE_FLAG, 0, 1, 0, NULL}, ++#endif + // draw by slices or whole frame (useful with libmpeg2/libavcodec) + {"slices", &vd_use_slices, CONF_TYPE_FLAG, 0, 0, 1, NULL}, + {"noslices", &vd_use_slices, CONF_TYPE_FLAG, 0, 1, 0, NULL}, +diff --git a/cfg-mencoder.h b/cfg-mencoder.h +index 411b748..addf791 100644 +--- a/cfg-mencoder.h ++++ b/cfg-mencoder.h +@@ -5,6 +5,10 @@ + + #include "cfg-common.h" + ++#ifdef ARCH_AVR32 ++extern int avr32_use_pico; ++#endif ++ + #ifdef USE_FAKE_MONO + extern int fakemono; // defined in dec_audio.c + #endif +diff --git a/cfg-mplayer.h b/cfg-mplayer.h +index 62b6eac..31499c2 100644 +--- a/cfg-mplayer.h ++++ b/cfg-mplayer.h +@@ -4,6 +4,10 @@ + + #include "cfg-common.h" + ++#ifdef ARCH_AVR32 ++extern int avr32_use_pico; ++#endif ++ + extern int 
noconsolecontrols; + + #if defined(HAVE_FBDEV)||defined(HAVE_VESA) +diff --git a/configure b/configure +index 29002c8..56c6fe4 100755 +--- a/configure ++++ b/configure +@@ -1203,6 +1203,15 @@ EOF + _optimizing="$proc" + ;; + ++ avr32) ++ _def_arch='#define ARCH_AVR32' ++ _target_arch='TARGET_ARCH_AVR32 = yes' ++ iproc='avr32' ++ proc='' ++ _march='' ++ _mcpu='' ++ _optimizing='' ++ ;; + arm|armv4l|armv5tel) + _def_arch='#define ARCH_ARMV4L 1' + _target_arch='TARGET_ARCH_ARMV4L = yes' +@@ -1533,7 +1542,7 @@ echores $_named_asm_args + # Checking for CFLAGS + _stripbinaries=yes + if test "$_profile" != "" || test "$_debug" != "" ; then +- CFLAGS="-W -Wall -O2 $_march $_mcpu $_debug $_profile" ++ CFLAGS="-W -Wall -O4 $_march $_mcpu $_debug $_profile" + if test "$_cc_major" -ge "3" ; then + CFLAGS=`echo "$CFLAGS" | sed -e 's/\(-Wall\)/\1 -Wno-unused-parameter/'` + fi +@@ -3794,7 +3803,7 @@ fi + + + echocheck "X11 headers presence" +- for I in `echo $_inc_extra | sed s/-I//g` /usr/X11/include /usr/X11R6/include /usr/include/X11R6 /usr/include /usr/openwin/include ; do ++ for I in `echo $_inc_extra | sed s/-I//g`; do + if test -f "$I/X11/Xlib.h" ; then + _inc_x11="-I$I" + _x11_headers="yes" +diff --git a/libaf/af_format.c b/libaf/af_format.c +index e5b7cc9..5d7ea6d 100644 +--- a/libaf/af_format.c ++++ b/libaf/af_format.c +@@ -20,7 +20,14 @@ + // Integer to float conversion through lrintf() + #ifdef HAVE_LRINTF + #include ++ ++#ifdef ARCH_AVR32 ++#define lrintf(x) rint(x) ++#define llrint(x) (long long)rint(x) ++#else + long int lrintf(float); ++#endif ++ + #else + #define lrintf(x) ((int)(x)) + #endif +diff --git a/libavcodec/Makefile b/libavcodec/Makefile +index 17b6c45..8e1dc96 100644 +--- a/libavcodec/Makefile ++++ b/libavcodec/Makefile +@@ -360,6 +360,12 @@ OBJS-$(TARGET_ARCH_SPARC) += sparc/dsputil_vis.o \ + + sparc/dsputil_vis.o: CFLAGS += -mcpu=ultrasparc -mtune=ultrasparc + ++# avr32 specific stuff ++ifeq ($(TARGET_ARCH_AVR32),yes) ++ASM_OBJS += avr32/idct.o 
avr32/fdct.o avr32/mc.o avr32/h264idct.o ++OBJS += avr32/dsputil_avr32.o ++endif ++ + # sun mediaLib specific stuff + OBJS-$(HAVE_MLIB) += mlib/dsputil_mlib.o \ + +@@ -419,6 +425,7 @@ tests: apiexample $(TESTS) + clean:: + rm -f \ + i386/*.o i386/*~ \ ++ avr32/*.o avr32/*~ \ + armv4l/*.o armv4l/*~ \ + mlib/*.o mlib/*~ \ + alpha/*.o alpha/*~ \ +diff --git a/libavcodec/avr32/dsputil_avr32.c b/libavcodec/avr32/dsputil_avr32.c +new file mode 100644 +index 0000000..200284d +--- /dev/null ++++ b/libavcodec/avr32/dsputil_avr32.c +@@ -0,0 +1,2678 @@ ++/* ++ * Copyright (c) 2007 Atmel Corporation. All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * 2. Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials provided ++ * with the distribution. ++ * ++ * 3. The name of ATMEL may not be used to endorse or promote products ++ * derived from this software without specific prior written ++ * permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR ++ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. 
IN NO EVENT SHALL ATMEL ++ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, ++ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY ++ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH ++ * DAMAGE. ++ */ ++ ++#include "../dsputil.h" ++#include "pico.h" ++ ++int avr32_use_pico = 1; ++ ++//#define CHECK_DSP_FUNCS_AGAINST_C ++ ++#ifdef CHECK_DSP_FUNCS_AGAINST_C ++#define DSP_FUNC_NAME(name) test_ ## name ++#else ++#define DSP_FUNC_NAME(name) name ++#endif ++ ++union doubleword { ++ int64_t doubleword; ++ struct { ++ int32_t top; ++ int32_t bottom; ++ } words; ++}; ++ ++#undef LD16 ++#undef LD32 ++#undef LD64 ++ ++#define LD16(a) (*((uint16_t*)(a))) ++#define LD32(a) (*((uint32_t*)(a))) ++#define LD64(a) (*((uint64_t*)(a))) ++#define LD64_UNALIGNED(a) \ ++ ({ union doubleword __tmp__; \ ++ __tmp__.words.top = LD32(a); \ ++ __tmp__.words.bottom = LD32(a + 4); \ ++ __tmp__.doubleword; }) ++ ++#undef ST32 ++#undef ST16 ++ ++#define ST16(a, b) *((uint16_t*)(a)) = (b) ++#define ST32(a, b) *((uint32_t*)(a)) = (b) ++ ++#undef rnd_avg32 ++#define rnd_avg32(a, b) \ ++ ({ uint32_t __tmp__;\ ++ asm("pavg.ub\t%0, %1, %2" : "=r"(__tmp__) : "r"(a), "r"(b));\ ++ __tmp__;}) ++ ++void idct_avr32(DCTELEM *data); ++void fdct_avr32(DCTELEM *data); ++ ++void idct_put_avr32(uint8_t *dest, int line_size, DCTELEM *data); ++void idct_add_avr32(uint8_t *dest, int line_size, DCTELEM *data); ++ ++void h264_idct_add_avr32(uint8_t *dest, DCTELEM *data, int stride); ++void h264_idct8_add_avr32(uint8_t *dest, DCTELEM *data, int stride); ++ ++#define extern_dspfunc(PFX, NUM) \ ++ void PFX ## _pixels ## NUM ## _avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \ ++ 
void PFX ## _pixels ## NUM ## _h_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \ ++ void PFX ## _pixels ## NUM ## _v_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \ ++ void PFX ## _pixels ## NUM ## _hv_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ) ++ ++extern_dspfunc(put, 8); ++extern_dspfunc(put_no_rnd, 8); ++extern_dspfunc(avg, 8); ++extern_dspfunc(avg_no_rnd, 8); ++#undef extern_dspfunc ++ ++#ifdef CHECK_DSP_FUNCS_AGAINST_C ++#define extern_dspfunc(PFX, NUM) \ ++ void PFX ## _pixels ## NUM ## _c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \ ++ void PFX ## _pixels ## NUM ## _x2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \ ++ void PFX ## _pixels ## NUM ## _y2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ); \ ++ void PFX ## _pixels ## NUM ## _xy2_c(uint8_t *dst, const uint8_t *pixels, int line_size, int h ) ++ ++extern_dspfunc(put, 4); ++extern_dspfunc(put_no_rnd, 4); ++extern_dspfunc(put, 8); ++extern_dspfunc(put_no_rnd, 8); ++extern_dspfunc(put, 16); ++extern_dspfunc(put_no_rnd, 16); ++extern_dspfunc(avg, 8); ++extern_dspfunc(avg_no_rnd, 8); ++extern_dspfunc(avg, 16); ++extern_dspfunc(avg_no_rnd, 16); ++ ++ ++#undef extern_dspfunc ++#define extern_dspfunc(PFX, NUM) \ ++void PFX ## NUM ## _mc00_c(uint8_t *dst, uint8_t *src, int stride); \ ++void PFX ## NUM ## _mc10_c(uint8_t *dst, uint8_t *src, int stride); \ ++void PFX ## NUM ## _mc20_c(uint8_t *dst, uint8_t *src, int stride); \ ++void PFX ## NUM ## _mc30_c(uint8_t *dst, uint8_t *src, int stride); \ ++void PFX ## NUM ## _mc01_c(uint8_t *dst, uint8_t *src, int stride); \ ++void PFX ## NUM ## _mc11_c(uint8_t *dst, uint8_t *src, int stride); \ ++void PFX ## NUM ## _mc21_c(uint8_t *dst, uint8_t *src, int stride); \ ++void PFX ## NUM ## _mc31_c(uint8_t *dst, uint8_t *src, int stride); \ ++void PFX ## NUM ## _mc02_c(uint8_t *dst, uint8_t *src, int stride); \ ++void PFX ## NUM ## _mc12_c(uint8_t *dst, 
uint8_t *src, int stride); \ ++void PFX ## NUM ## _mc22_c(uint8_t *dst, uint8_t *src, int stride); \ ++void PFX ## NUM ## _mc32_c(uint8_t *dst, uint8_t *src, int stride); \ ++void PFX ## NUM ## _mc03_c(uint8_t *dst, uint8_t *src, int stride); \ ++void PFX ## NUM ## _mc13_c(uint8_t *dst, uint8_t *src, int stride); \ ++void PFX ## NUM ## _mc23_c(uint8_t *dst, uint8_t *src, int stride); \ ++void PFX ## NUM ## _mc33_c(uint8_t *dst, uint8_t *src, int stride); \ ++ ++extern_dspfunc(put_h264_qpel, 16); ++extern_dspfunc(put_h264_qpel, 8); ++extern_dspfunc(put_h264_qpel, 4); ++extern_dspfunc(avg_h264_qpel, 16); ++extern_dspfunc(avg_h264_qpel, 8); ++extern_dspfunc(avg_h264_qpel, 4); ++ ++#undef extern_dspfunc ++ ++void put_h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y); ++void put_h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y); ++void put_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y); ++ ++void avg_h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y); ++void avg_h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y); ++void avg_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y); ++ ++ ++void dump_block8(uint8_t *block, int line_size, int h); ++void dump_block4(uint8_t *block, int line_size, int h); ++void dump_block(uint8_t *block, int line_size, int h, int w); ++ ++void check_block8(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct, ++ int h, char *name, int max_dev); ++void check_block4(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct, ++ int h, char *name, int max_dev); ++void check_block(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct, ++ int h, int width, char *name, int max_dev); ++ 
++#define PIXOP2( OPNAME, OP ) \ ++void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\ ++ int i;\ ++ for(i=0; i> 16)); ++ ST16(dst + 1*dstStride, (short)PICO_GET_W(PICO_OUTPIX0)); ++ ++ ++ PICO_LDCM_W_INC(tmp, ++ PICO_REGVECT_VMU0_OUT, ++ PICO_REGVECT_VMU1_OUT, ++ PICO_REGVECT_VMU2_OUT); ++ PICO_MVRC_W(PICO_INPIX0, src0); ++ PICO_MVRC_W(PICO_INPIX1, src1); ++ PICO_MVRC_W(PICO_INPIX2, src2); ++ PICO_OP(PICO_USE_ACC, 0, 6, 3, 0); ++ PICO_MVRC_W(PICO_INPIX2, src3); ++ PICO_MVRC_W(PICO_INPIX1, src4); ++ PICO_MVRC_W(PICO_INPIX0, src5); ++ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 0, 6, 3, 0); ++ ++ PICO_LDCM_W_INC(tmp, ++ PICO_REGVECT_VMU0_OUT, ++ PICO_REGVECT_VMU1_OUT, ++ PICO_REGVECT_VMU2_OUT); ++ PICO_OP(PICO_USE_ACC, 1, 9, 6, 3); ++ PICO_MVRC_W(PICO_INPIX0, src0); ++ PICO_MVRC_W(PICO_INPIX1, src1); ++ PICO_MVRC_W(PICO_INPIX2, src2); ++ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 1, 9, 6, 3); ++ ++ PICO_LDCM_W_INC(tmp, ++ PICO_REGVECT_VMU0_OUT, ++ PICO_REGVECT_VMU1_OUT, ++ PICO_REGVECT_VMU2_OUT); ++ PICO_MVRC_W(PICO_INPIX0, src1); ++ PICO_MVRC_W(PICO_INPIX1, src2); ++ PICO_MVRC_W(PICO_INPIX2, src3); ++ PICO_OP(PICO_USE_ACC, 2, 6, 3, 0); ++ PICO_MVRC_W(PICO_INPIX2, src4); ++ PICO_MVRC_W(PICO_INPIX1, src5); ++ PICO_MVRC_W(PICO_INPIX0, src6); ++ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 2, 6, 3, 0); ++ ++ PICO_LDCM_W_INC(tmp, ++ PICO_REGVECT_VMU0_OUT, ++ PICO_REGVECT_VMU1_OUT, ++ PICO_REGVECT_VMU2_OUT); ++ PICO_OP(PICO_USE_ACC, 3, 9, 6, 3); ++ PICO_MVRC_W(PICO_INPIX0, src1); ++ PICO_MVRC_W(PICO_INPIX1, src2); ++ PICO_MVRC_W(PICO_INPIX2, src3); ++ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 3, 9, 6, 3); ++ ++ ST16(dst + 2*dstStride, (short)(PICO_GET_W(PICO_OUTPIX0) >> 16)); ++ ST16(dst + 3*dstStride, (short)PICO_GET_W(PICO_OUTPIX0)); ++ ++ dst += 2; ++ src += 2; ++ } ++} ++ ++ ++ ++ ++static void avg_h264_qpel4_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ ++ ++ int32_t tmp_block[48]; ++ int32_t *tmp = 
tmp_block; ++ int i; ++ ++ set_pico_config(&h264_qpel4_hv_lowpass_config); ++ ++ src -= 2; ++ for ( i = 0; i < 2; i++ ){ ++ int srcB= LD32(src - 2*srcStride); ++ int srcA= LD32(src - 1*srcStride); ++ int src0= LD32(src + 0 *srcStride); ++ int src1= LD32(src + 1 *srcStride); ++ int src2= LD32(src + 2 *srcStride); ++ int src3= LD32(src + 3 *srcStride); ++ int src4= LD32(src + 4 *srcStride); ++ int src5= LD32(src + 5 *srcStride); ++ int src6= LD32(src + 6 *srcStride); ++ ++ PICO_MVRC_W(PICO_INPIX0, srcB); ++ PICO_MVRC_W(PICO_INPIX1, srcA); ++ PICO_MVRC_W(PICO_INPIX2, src0); ++ PICO_OP(0, 0, 0, 4, 8); ++ PICO_MVRC_W(PICO_INPIX2, src1); ++ PICO_MVRC_W(PICO_INPIX1, src2); ++ PICO_MVRC_W(PICO_INPIX0, src3); ++ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8); ++ PICO_STCM_W(tmp, ++ PICO_REGVECT_VMU0_OUT, ++ PICO_REGVECT_VMU1_OUT, ++ PICO_REGVECT_VMU2_OUT); ++ tmp += 3; ++ ++ PICO_OP(0, 0, 1, 5, 9); ++ PICO_MVRC_W(PICO_INPIX0, srcB); ++ PICO_MVRC_W(PICO_INPIX1, srcA); ++ PICO_MVRC_W(PICO_INPIX2, src0); ++ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9); ++ PICO_STCM_W(tmp, ++ PICO_REGVECT_VMU0_OUT, ++ PICO_REGVECT_VMU1_OUT, ++ PICO_REGVECT_VMU2_OUT); ++ tmp += 3; ++ ++ PICO_MVRC_W(PICO_INPIX0, src1); ++ PICO_OP(0, 0, 4, 8, 0); ++ PICO_MVRC_W(PICO_INPIX2, src2); ++ PICO_MVRC_W(PICO_INPIX1, src3); ++ PICO_MVRC_W(PICO_INPIX0, src4); ++ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8); ++ PICO_STCM_W(tmp, ++ PICO_REGVECT_VMU0_OUT, ++ PICO_REGVECT_VMU1_OUT, ++ PICO_REGVECT_VMU2_OUT); ++ tmp += 3; ++ ++ PICO_OP(0, 0, 1, 5, 9); ++ PICO_MVRC_W(PICO_INPIX0, srcA); ++ PICO_MVRC_W(PICO_INPIX1, src0); ++ PICO_MVRC_W(PICO_INPIX2, src1); ++ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9); ++ PICO_STCM_W(tmp, ++ PICO_REGVECT_VMU0_OUT, ++ PICO_REGVECT_VMU1_OUT, ++ PICO_REGVECT_VMU2_OUT); ++ tmp += 3; ++ ++ PICO_MVRC_W(PICO_INPIX0, src2); ++ PICO_OP(0, 0, 4, 8, 0); ++ PICO_MVRC_W(PICO_INPIX2, src3); ++ PICO_MVRC_W(PICO_INPIX1, src4); ++ PICO_MVRC_W(PICO_INPIX0, src5); ++ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8); ++ PICO_STCM_W(tmp, ++ 
PICO_REGVECT_VMU0_OUT, ++ PICO_REGVECT_VMU1_OUT, ++ PICO_REGVECT_VMU2_OUT); ++ tmp += 3; ++ ++ PICO_OP(0, 0, 1, 5, 9); ++ PICO_MVRC_W(PICO_INPIX0, src0); ++ PICO_MVRC_W(PICO_INPIX1, src1); ++ PICO_MVRC_W(PICO_INPIX2, src2); ++ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9); ++ PICO_STCM_W(tmp, ++ PICO_REGVECT_VMU0_OUT, ++ PICO_REGVECT_VMU1_OUT, ++ PICO_REGVECT_VMU2_OUT); ++ tmp += 3; ++ ++ PICO_MVRC_W(PICO_INPIX0, src3); ++ PICO_OP(0, 0, 4, 8, 0); ++ PICO_MVRC_W(PICO_INPIX2, src4); ++ PICO_MVRC_W(PICO_INPIX1, src5); ++ PICO_MVRC_W(PICO_INPIX0, src6); ++ PICO_OP(PICO_USE_ACC, 0, 0, 4, 8); ++ PICO_STCM_W(tmp, ++ PICO_REGVECT_VMU0_OUT, ++ PICO_REGVECT_VMU1_OUT, ++ PICO_REGVECT_VMU2_OUT); ++ tmp += 3; ++ ++ PICO_OP(0, 0, 1, 5, 9); ++ PICO_MVRC_W(PICO_INPIX0, src1); ++ PICO_MVRC_W(PICO_INPIX1, src2); ++ PICO_MVRC_W(PICO_INPIX2, src3); ++ PICO_OP(PICO_USE_ACC, 0, 1, 5, 9); ++ PICO_STCM_W(tmp, ++ PICO_REGVECT_VMU0_OUT, ++ PICO_REGVECT_VMU1_OUT, ++ PICO_REGVECT_VMU2_OUT); ++ tmp += 3; ++ src += 2; ++ } ++ ++ src -= 1; ++ tmp -= 48; ++ ++ ++ PICO_PUT_W(PICO_CONFIG, ++ PICO_OUTPUT_MODE(PICO_PLANAR_MODE) ++ | PICO_INPUT_MODE(PICO_VERT_FILTER_MODE) ++ | PICO_COEFF_FRAC_BITS(10) ++ | PICO_OFFSET_FRAC_BITS(10)); ++ ++ for ( i = 0; i < 2; i++ ){ ++ int srcB= LD32(src - 2*srcStride); ++ int srcA= LD32(src - 1*srcStride); ++ int src0= LD32(src + 0 *srcStride); ++ int src1= LD32(src + 1 *srcStride); ++ int src2= LD32(src + 2 *srcStride); ++ int src3= LD32(src + 3 *srcStride); ++ int src4= LD32(src + 4 *srcStride); ++ int src5= LD32(src + 5 *srcStride); ++ int src6= LD32(src + 6 *srcStride); ++ ++ PICO_LDCM_W_INC(tmp, ++ PICO_REGVECT_VMU0_OUT, ++ PICO_REGVECT_VMU1_OUT, ++ PICO_REGVECT_VMU2_OUT); ++ PICO_MVRC_W(PICO_INPIX0, srcB); ++ PICO_MVRC_W(PICO_INPIX1, srcA); ++ PICO_MVRC_W(PICO_INPIX2, src0); ++ PICO_OP(PICO_USE_ACC, 0, 6, 3, 0); ++ PICO_MVRC_W(PICO_INPIX2, src1); ++ PICO_MVRC_W(PICO_INPIX1, src2); ++ PICO_MVRC_W(PICO_INPIX0, src3); ++ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 0, 6, 3, 0); 
++ ++ PICO_LDCM_W_INC(tmp, ++ PICO_REGVECT_VMU0_OUT, ++ PICO_REGVECT_VMU1_OUT, ++ PICO_REGVECT_VMU2_OUT); ++ PICO_OP(PICO_USE_ACC, 1, 9, 6, 3); ++ PICO_MVRC_W(PICO_INPIX0, srcB); ++ PICO_MVRC_W(PICO_INPIX1, srcA); ++ PICO_MVRC_W(PICO_INPIX2, src0); ++ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 1, 9, 6, 3); ++ ++ PICO_LDCM_W_INC(tmp, ++ PICO_REGVECT_VMU0_OUT, ++ PICO_REGVECT_VMU1_OUT, ++ PICO_REGVECT_VMU2_OUT); ++ PICO_MVRC_W(PICO_INPIX0, srcA); ++ PICO_MVRC_W(PICO_INPIX1, src0); ++ PICO_MVRC_W(PICO_INPIX2, src1); ++ PICO_OP(PICO_USE_ACC, 2, 6, 3, 0); ++ PICO_MVRC_W(PICO_INPIX2, src2); ++ PICO_MVRC_W(PICO_INPIX1, src3); ++ PICO_MVRC_W(PICO_INPIX0, src4); ++ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 2, 6, 3, 0); ++ ++ PICO_LDCM_W_INC(tmp, ++ PICO_REGVECT_VMU0_OUT, ++ PICO_REGVECT_VMU1_OUT, ++ PICO_REGVECT_VMU2_OUT); ++ PICO_OP(PICO_USE_ACC, 3, 9, 6, 3); ++ PICO_MVRC_W(PICO_INPIX0, srcA); ++ PICO_MVRC_W(PICO_INPIX1, src0); ++ PICO_MVRC_W(PICO_INPIX2, src1); ++ PICO_OP(PICO_USE_ACC | PICO_SINGLE_VECTOR, 3, 9, 6, 3); ++ ++ ST16(dst + 0*dstStride, rnd_avg32(LD16(dst + 0*dstStride), PICO_GET_W(PICO_OUTPIX0) >> 16)); ++ ST16(dst + 1*dstStride, rnd_avg32(LD16(dst + 1*dstStride), PICO_GET_W(PICO_OUTPIX0))); ++ ++ ++ PICO_LDCM_W_INC(tmp, ++ PICO_REGVECT_VMU0_OUT, ++ PICO_REGVECT_VMU1_OUT, ++ PICO_REGVECT_VMU2_OUT); ++ PICO_MVRC_W(PICO_INPIX0, src0); ++ PICO_MVRC_W(PICO_INPIX1, src1); ++ PICO_MVRC_W(PICO_INPIX2, src2); ++ PICO_OP(PICO_USE_ACC, 0, 6, 3, 0); ++ PICO_MVRC_W(PICO_INPIX2, src3); ++ PICO_MVRC_W(PICO_INPIX1, src4); ++ PICO_MVRC_W(PICO_INPIX0, src5); ++ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 0, 6, 3, 0); ++ ++ PICO_LDCM_W_INC(tmp, ++ PICO_REGVECT_VMU0_OUT, ++ PICO_REGVECT_VMU1_OUT, ++ PICO_REGVECT_VMU2_OUT); ++ PICO_OP(PICO_USE_ACC, 1, 9, 6, 3); ++ PICO_MVRC_W(PICO_INPIX0, src0); ++ PICO_MVRC_W(PICO_INPIX1, src1); ++ PICO_MVRC_W(PICO_INPIX2, src2); ++ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 1, 9, 6, 3); ++ ++ PICO_LDCM_W_INC(tmp, ++ PICO_REGVECT_VMU0_OUT, ++ 
PICO_REGVECT_VMU1_OUT, ++ PICO_REGVECT_VMU2_OUT); ++ PICO_MVRC_W(PICO_INPIX0, src1); ++ PICO_MVRC_W(PICO_INPIX1, src2); ++ PICO_MVRC_W(PICO_INPIX2, src3); ++ PICO_OP(PICO_USE_ACC, 2, 6, 3, 0); ++ PICO_MVRC_W(PICO_INPIX2, src4); ++ PICO_MVRC_W(PICO_INPIX1, src5); ++ PICO_MVRC_W(PICO_INPIX0, src6); ++ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 2, 6, 3, 0); ++ ++ PICO_LDCM_W_INC(tmp, ++ PICO_REGVECT_VMU0_OUT, ++ PICO_REGVECT_VMU1_OUT, ++ PICO_REGVECT_VMU2_OUT); ++ PICO_OP(PICO_USE_ACC, 3, 9, 6, 3); ++ PICO_MVRC_W(PICO_INPIX0, src1); ++ PICO_MVRC_W(PICO_INPIX1, src2); ++ PICO_MVRC_W(PICO_INPIX2, src3); ++ PICO_OP(PICO_USE_ACC| PICO_SINGLE_VECTOR, 3, 9, 6, 3); ++ ++ ST16(dst + 2*dstStride, rnd_avg32(LD16(dst + 2*dstStride), PICO_GET_W(PICO_OUTPIX0) >> 16)); ++ ST16(dst + 3*dstStride, rnd_avg32(LD16(dst + 3*dstStride), PICO_GET_W(PICO_OUTPIX0))); ++ ++ dst += 2; ++ src += 2; ++ } ++} ++ ++ ++static void put_h264_qpel8_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ ++ put_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride); ++ put_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride); ++ src += 4*srcStride; ++ dst += 4*dstStride; ++ put_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride); ++ put_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride); ++} ++ ++static void avg_h264_qpel8_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ ++ avg_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride); ++ avg_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride); ++ src += 4*srcStride; ++ dst += 4*dstStride; ++ avg_h264_qpel4_v_lowpass_pico(dst , src , dstStride, srcStride); ++ avg_h264_qpel4_v_lowpass_pico(dst+4, src+4, dstStride, srcStride); ++} ++ ++static void put_h264_qpel8_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ ++ put_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride); ++ put_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, 
srcStride); ++ src += 4*srcStride; ++ dst += 4*dstStride; ++ put_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride); ++ put_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, srcStride); ++} ++ ++static void avg_h264_qpel8_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ ++ avg_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride); ++ avg_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, srcStride); ++ src += 4*srcStride; ++ dst += 4*dstStride; ++ avg_h264_qpel4_h_lowpass_pico(dst , src , dstStride, srcStride); ++ avg_h264_qpel4_h_lowpass_pico(dst+4, src+4, dstStride, srcStride); ++} ++ ++static void put_h264_qpel8_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ ++ put_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride); ++ put_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride); ++ src += 4*srcStride; ++ dst += 4*dstStride; ++ put_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride); ++ put_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride); ++} ++ ++static void avg_h264_qpel8_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ ++ avg_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride); ++ avg_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride); ++ src += 4*srcStride; ++ dst += 4*dstStride; ++ avg_h264_qpel4_hv_lowpass_pico(dst , src , dstStride, srcStride); ++ avg_h264_qpel4_hv_lowpass_pico(dst+4, src+4, dstStride, srcStride); ++} ++ ++static void put_h264_qpel16_v_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ ++ put_h264_qpel8_v_lowpass_pico(dst , src , dstStride, srcStride); ++ put_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride); ++ src += 8*srcStride; ++ dst += 8*dstStride; ++ put_h264_qpel8_v_lowpass_pico(dst , src , dstStride, srcStride); ++ put_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride); ++} ++ ++static void avg_h264_qpel16_v_lowpass_pico(uint8_t 
*dst, uint8_t *src, int dstStride, int srcStride){ ++ avg_h264_qpel8_v_lowpass_pico(dst , src , dstStride, srcStride); ++ avg_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride); ++ src += 8*srcStride; ++ dst += 8*dstStride; ++ avg_h264_qpel8_v_lowpass_pico(dst , src , dstStride, srcStride); ++ avg_h264_qpel8_v_lowpass_pico(dst+8, src+8, dstStride, srcStride); ++} ++ ++static void put_h264_qpel16_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ ++ put_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride); ++ put_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride); ++ src += 8*srcStride; ++ dst += 8*dstStride; ++ put_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride); ++ put_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride); ++} ++ ++static void avg_h264_qpel16_h_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ ++ avg_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride); ++ avg_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride); ++ src += 8*srcStride; ++ dst += 8*dstStride; ++ avg_h264_qpel8_h_lowpass_pico(dst , src , dstStride, srcStride); ++ avg_h264_qpel8_h_lowpass_pico(dst+8, src+8, dstStride, srcStride); ++} ++ ++static void put_h264_qpel16_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ ++ put_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride); ++ put_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride); ++ src += 8*srcStride; ++ dst += 8*dstStride; ++ put_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride); ++ put_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride); ++} ++ ++static void avg_h264_qpel16_hv_lowpass_pico(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ ++ avg_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride); ++ avg_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride); ++ src += 8*srcStride; ++ dst += 8*dstStride; ++ 
avg_h264_qpel8_hv_lowpass_pico(dst , src , dstStride, srcStride); ++ avg_h264_qpel8_hv_lowpass_pico(dst+8, src+8, dstStride, srcStride); ++} ++ ++ ++#define H264_MC(OPNAME, SIZE) \ ++static void OPNAME ## h264_qpel ## SIZE ## _mc00_pico (uint8_t *dst, uint8_t *src, int stride){\ ++ OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\ ++}\ ++\ ++static void OPNAME ## h264_qpel ## SIZE ## _mc10_pico(uint8_t *dst, uint8_t *src, int stride){\ ++ uint8_t half[SIZE*SIZE];\ ++ put_h264_qpel ## SIZE ## _h_lowpass_pico(half, src, SIZE, stride);\ ++ OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\ ++}\ ++\ ++static void OPNAME ## h264_qpel ## SIZE ## _mc20_pico(uint8_t *dst, uint8_t *src, int stride){\ ++ OPNAME ## h264_qpel ## SIZE ## _h_lowpass_pico(dst, src, stride, stride);\ ++}\ ++\ ++static void OPNAME ## h264_qpel ## SIZE ## _mc30_pico(uint8_t *dst, uint8_t *src, int stride){\ ++ uint8_t half[SIZE*SIZE];\ ++ put_h264_qpel ## SIZE ## _h_lowpass_pico(half, src, SIZE, stride);\ ++ OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\ ++}\ ++\ ++static void OPNAME ## h264_qpel ## SIZE ## _mc01_pico(uint8_t *dst, uint8_t *src, int stride){\ ++ uint8_t full[SIZE*(SIZE+5)];\ ++ uint8_t * const full_mid= full + SIZE*2;\ ++ uint8_t half[SIZE*SIZE];\ ++ copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ ++ put_h264_qpel ## SIZE ## _v_lowpass_pico(half, full_mid, SIZE, SIZE);\ ++ OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\ ++}\ ++\ ++static void OPNAME ## h264_qpel ## SIZE ## _mc02_pico(uint8_t *dst, uint8_t *src, int stride){\ ++ uint8_t full[SIZE*(SIZE+5)];\ ++ uint8_t * const full_mid= full + SIZE*2;\ ++ copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ ++ OPNAME ## h264_qpel ## SIZE ## _v_lowpass_pico(dst, full_mid, stride, SIZE);\ ++}\ ++\ ++static void OPNAME ## h264_qpel ## SIZE ## _mc03_pico(uint8_t *dst, uint8_t *src, int stride){\ ++ 
uint8_t full[SIZE*(SIZE+5)];\ ++ uint8_t * const full_mid= full + SIZE*2;\ ++ uint8_t half[SIZE*SIZE];\ ++ copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ ++ put_h264_qpel ## SIZE ## _v_lowpass_pico(half, full_mid, SIZE, SIZE);\ ++ OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\ ++}\ ++\ ++static void OPNAME ## h264_qpel ## SIZE ## _mc11_pico(uint8_t *dst, uint8_t *src, int stride){\ ++ uint8_t full[SIZE*(SIZE+5)];\ ++ uint8_t * const full_mid= full + SIZE*2;\ ++ uint8_t halfH[SIZE*SIZE];\ ++ uint8_t halfV[SIZE*SIZE];\ ++ put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\ ++ copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ ++ put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\ ++ OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\ ++}\ ++\ ++static void OPNAME ## h264_qpel ## SIZE ## _mc31_pico(uint8_t *dst, uint8_t *src, int stride){\ ++ uint8_t full[SIZE*(SIZE+5)];\ ++ uint8_t * const full_mid= full + SIZE*2;\ ++ uint8_t halfH[SIZE*SIZE];\ ++ uint8_t halfV[SIZE*SIZE];\ ++ put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\ ++ copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\ ++ put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\ ++ OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\ ++}\ ++\ ++static void OPNAME ## h264_qpel ## SIZE ## _mc13_pico(uint8_t *dst, uint8_t *src, int stride){\ ++ uint8_t full[SIZE*(SIZE+5)];\ ++ uint8_t * const full_mid= full + SIZE*2;\ ++ uint8_t halfH[SIZE*SIZE];\ ++ uint8_t halfV[SIZE*SIZE];\ ++ put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\ ++ copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ ++ put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\ ++ OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\ ++}\ ++\ 
++static void OPNAME ## h264_qpel ## SIZE ## _mc33_pico(uint8_t *dst, uint8_t *src, int stride){\ ++ uint8_t full[SIZE*(SIZE+5)];\ ++ uint8_t * const full_mid= full + SIZE*2;\ ++ uint8_t halfH[SIZE*SIZE];\ ++ uint8_t halfV[SIZE*SIZE];\ ++ put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\ ++ copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\ ++ put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\ ++ OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\ ++}\ ++\ ++static void OPNAME ## h264_qpel ## SIZE ## _mc22_pico(uint8_t *dst, uint8_t *src, int stride){\ ++ OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_pico(dst, src, stride, stride);\ ++}\ ++\ ++static void OPNAME ## h264_qpel ## SIZE ## _mc21_pico(uint8_t *dst, uint8_t *src, int stride){\ ++ uint8_t halfH[SIZE*SIZE];\ ++ uint8_t halfHV[SIZE*SIZE];\ ++ put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src, SIZE, stride);\ ++ put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\ ++ OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\ ++}\ ++\ ++static void OPNAME ## h264_qpel ## SIZE ## _mc23_pico(uint8_t *dst, uint8_t *src, int stride){\ ++ uint8_t halfH[SIZE*SIZE];\ ++ uint8_t halfHV[SIZE*SIZE];\ ++ put_h264_qpel ## SIZE ## _h_lowpass_pico(halfH, src + stride, SIZE, stride);\ ++ put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\ ++ OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\ ++}\ ++\ ++static void OPNAME ## h264_qpel ## SIZE ## _mc12_pico(uint8_t *dst, uint8_t *src, int stride){\ ++ uint8_t full[SIZE*(SIZE+5)];\ ++ uint8_t * const full_mid= full + SIZE*2;\ ++ uint8_t halfV[SIZE*SIZE];\ ++ uint8_t halfHV[SIZE*SIZE];\ ++ copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\ ++ put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\ ++ put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, 
stride);\ ++ OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\ ++}\ ++\ ++static void OPNAME ## h264_qpel ## SIZE ## _mc32_pico(uint8_t *dst, uint8_t *src, int stride){\ ++ uint8_t full[SIZE*(SIZE+5)];\ ++ uint8_t * const full_mid= full + SIZE*2;\ ++ uint8_t halfV[SIZE*SIZE];\ ++ uint8_t halfHV[SIZE*SIZE];\ ++ copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\ ++ put_h264_qpel ## SIZE ## _v_lowpass_pico(halfV, full_mid, SIZE, SIZE);\ ++ put_h264_qpel ## SIZE ## _hv_lowpass_pico(halfHV, src, SIZE, stride);\ ++ OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\ ++}\ ++ ++H264_MC(put_, 4) ++H264_MC(put_, 8) ++H264_MC(put_, 16) ++H264_MC(avg_, 4) ++H264_MC(avg_, 8) ++H264_MC(avg_, 16) ++ ++ ++ ++#define dspfunc16(PFX) \ ++ void PFX ## _pixels16_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \ ++ PFX ## _pixels8_avr32(dst, pixels, line_size, h);\ ++ PFX ## _pixels8_avr32(dst + 8, pixels + 8, line_size, h);\ ++ }\ ++ void PFX ## _pixels16_h_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \ ++ PFX ## _pixels8_h_avr32(dst, pixels, line_size, h);\ ++ PFX ## _pixels8_h_avr32(dst + 8, pixels + 8, line_size, h);\ ++ }\ ++ void PFX ## _pixels16_v_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \ ++ PFX ## _pixels8_v_avr32(dst, pixels, line_size, h);\ ++ PFX ## _pixels8_v_avr32(dst + 8, pixels + 8, line_size, h);\ ++ }\ ++ void PFX ## _pixels16_hv_avr32(uint8_t *dst, const uint8_t *pixels, int line_size, int h ){ \ ++ PFX ## _pixels8_hv_avr32(dst, pixels, line_size, h);\ ++ PFX ## _pixels8_hv_avr32(dst + 8, pixels + 8, line_size, h);\ ++ }\ ++ ++ ++dspfunc16(put) ++dspfunc16(put_no_rnd) ++dspfunc16(avg) ++dspfunc16(avg_no_rnd) ++#undef dspfunc16 ++ ++static int pix_sum_avr32(uint8_t * pix, int line_size) ++{ ++ int s, i; ++ ++ s = 0; ++ for (i = 0; i < 16; i++) { ++ int tmp1,tmp2,tmp3,tmp4,tmp5; ++ __asm__ volatile ( "ld.w\t%0, %6[0]\n\t" ++ 
"ld.w\t%1, %6[4]\n\t" ++ "ld.w\t%2, %6[8]\n\t" ++ "ld.w\t%3, %6[12]\n\t" ++ "punpckub.h\t%4, %0:t\n\t" ++ "padd.h\t%5, %5, %4\n\t" ++ "punpckub.h\t%4, %0:b\n\t" ++ "padd.h\t%5, %5, %4\n\t" ++ "punpckub.h\t%4, %1:t\n\t" ++ "padd.h\t%5, %5, %4\n\t" ++ "punpckub.h\t%4, %1:b\n\t" ++ "padd.h\t%5, %5, %4\n\t" ++ "punpckub.h\t%4, %2:t\n\t" ++ "padd.h\t%5, %5, %4\n\t" ++ "punpckub.h\t%4, %2:b\n\t" ++ "padd.h\t%5, %5, %4\n\t" ++ "punpckub.h\t%4, %3:t\n\t" ++ "padd.h\t%5, %5, %4\n\t" ++ "punpckub.h\t%4, %3:b\n\t" ++ "padd.h\t%5, %5, %4\n\t" ++ : "=&r"(tmp1),"=&r"(tmp2),"=&r"(tmp3),"=&r"(tmp4),"=&r"(tmp5),"+r"(s) ++ : "r"(pix)); ++ pix += line_size; ++ } ++ __asm__ volatile ( "addhh.w\t%0, %0:t, %0:b" : "+r" (s) ); ++ ++ return s; ++} ++ ++ ++//#define op_scale1(x) block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom ) ++//#define op_scale2(x) dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1)) ++//#define H264_WEIGHT(W,H) \ ++//static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \ ++// int attribute_unused x, y; \ ++// offset <<= log2_denom; \ ++// if(log2_denom) offset += 1<<(log2_denom-1); \ ++// for(y=0; y> 0, 8\n" \ ++// "satu\t%[tmp1] >> 0, 8\n" \ ++// "st.b\t%[block][0], %[tmp0]\n" \ ++// "st.b\t%[block][1], %[tmp1]\n" \ ++// : [tmp0] "=&r"(tmp0), [tmp1] "=&r"(tmp1) \ ++// : [block] "r"(block), [weight]"r"(weight), [log2_denom]"r"(log2denom) ); \ ++// } else if ( W==4 ) { \ ++// asm volatile ( "ld.w\t%[tmp0], %[block][0]\n" \ ++// "punpckub.h\t%[tmp1], %[tmp0]:t\n" \ ++// "punpckub.h\t%[tmp0], %[tmp0]:b\n" \ ++// "mulhh.w\t%[tmp2], %[tmp1]:t, %[weight]:b\n" \ ++// "mulhh.w\t%[tmp1], %[tmp1]:b, %[weight]:b\n" \ ++// "asr\t%[tmp0], %[log2_denom]\n" \ ++// "asr\t%[tmp1], %[log2_denom]\n" \ ++// "satu\t%[tmp0] >> 0, 8\n" \ ++// "satu\t%[tmp1] >> 0, 8\n" \ ++// "st.b\t%[block][0], %[tmp0]\n" \ ++// "st.b\t%[block][1], %[tmp1]\n" \ ++// : [tmp0] "=&r"(tmp0), 
[tmp1] "=&r"(tmp1) \ ++// : [block] "r"(block), [weight]"r"(weight), [log2_denom]"r"(log2denom) ); \ ++// ++// ++// ++// if(W==4) continue; \ ++// op_scale1(4); \ ++// op_scale1(5); \ ++// op_scale1(6); \ ++// op_scale1(7); \ ++// if(W==8) continue; \ ++// op_scale1(8); \ ++// op_scale1(9); \ ++// op_scale1(10); \ ++// op_scale1(11); \ ++// op_scale1(12); \ ++// op_scale1(13); \ ++// op_scale1(14); \ ++// op_scale1(15); \ ++// } \ ++//} \ ++//static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets){ \ ++// int attribute_unused x, y; \ ++// int offset = (offsets + offsetd + 1) >> 1; \ ++// offset = ((offset << 1) + 1) << log2_denom; \ ++// for(y=0; y and ++ is not less than */ ++#define PABS_DIFF_LESS_THAN( a, b, compare) \ ++ ({ uint32_t __tmp__, __tmp2__, __mask__; \ ++ asm ( \ ++ /* Check ABS( a - b ) < compare */ \ ++ "psubs.ub\t%[tmp], %[opa], %[opb]\n" \ ++ "psubs.ub\t%[tmp2], %[opb], %[opa]\n" \ ++ "or\t%[tmp], %[tmp2]\n" /* ABS ( a - b ) */ \ ++ /* This produces 0 for all bytes where the comparison is not true */ \ ++ "psubs.ub\t%[mask], %[cmp], %[tmp]\n" \ ++ : [tmp] "=&r"(__tmp__), [tmp2] "=&r"(__tmp2__), [mask] "=&r"(__mask__) \ ++ : [opa] "r"(a), [opb] "r"(b), [cmp] "r"(compare) ); \ ++ __mask__; }) ++ ++/* ++ Set all bytes containing zero in to 255 and the rest to zero. ++ ++ Add with saturation 254 to all bytes making all bytes different from ++ zero become 255. Then add one without saturation to make all bytes ++ originally containing zero 255 and the rest 0. 
*/ ++#define SET_ALL_BITS_IN_ZERO_BYTES(value) \ ++ ({ uint32_t __tmp__; \ ++ asm ( \ ++ "padds.ub\t%[tmp], %[val], %[max_minus_one]\n" \ ++ "padd.b\t%[tmp], %[tmp], %[all_ones]\n" \ ++ : [tmp] "=r"(__tmp__) \ ++ : [val] "r"(value), [max_minus_one] "r"(0xFEFEFEFE), [all_ones] "r"(0x01010101) ); \ ++ __tmp__; }) ++ ++#define PACKW_SH(upper, lower) \ ++ ({ uint32_t __tmp__; \ ++ asm ( \ ++ "packw.sh\t%[tmp], %[u], %[l]\n" \ ++ : [tmp] "=r"(__tmp__) \ ++ : [u] "r"(upper), [l] "r"(lower) ); \ ++ __tmp__; }) ++ ++#define PACKSH_UB(upper, lower) \ ++ ({ uint32_t __tmp__; \ ++ asm ( \ ++ "packsh.ub\t%[tmp], %[u], %[l]\n" \ ++ : [tmp] "=r"(__tmp__) \ ++ : [u] "r"(upper), [l] "r"(lower) ); \ ++ __tmp__; }) ++ ++static void h264_v_loop_filter_luma_avr32(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) ++{ ++ int i; ++ ++ if ( alpha == 0 ) ++ return; ++ ++ alpha = PACKW_SH(alpha, alpha); ++ alpha = PACKSH_UB(alpha, alpha); ++ beta = PACKW_SH(beta, beta); ++ beta = PACKSH_UB(beta, beta); ++ ++ for( i = 0; i < 4; i++ ) { ++ uint32_t p0, p1, p2, q0, q1, q2; ++ uint32_t mask, mask2; ++ uint32_t tmp, tmp2, tmp3, tmp4; ++ ++ if( tc0[i] < 0 ) { ++ pix += 4; ++ continue; ++ } ++ ++/* for( d = 0; d < 4; d++ ) { ++ const int p0 = pix[-1*stride]; ++ const int p1 = pix[-2*stride]; ++ const int p2 = pix[-3*stride]; ++ const int q0 = pix[0]; ++ const int q1 = pix[1*stride]; ++ const int q2 = pix[2*stride]; ++ ++ if( ABS( p0 - q0 ) < alpha && ++ ABS( p1 - p0 ) < beta && ++ ABS( q1 - q0 ) < beta ) { */ ++ ++ p0 = LD32(pix - stride); ++ p1 = LD32(pix - 2*stride); ++ q0 = LD32(pix); ++ q1 = LD32(pix + stride); ++ ++ /* Check which of the columns should be filtered, if any. 
*/ ++ mask = PABS_DIFF_LESS_THAN(p0, q0, alpha); ++ mask |= PABS_DIFF_LESS_THAN(p1, p0, beta); ++ mask |= PABS_DIFF_LESS_THAN(q1, q0, beta); ++ ++ if ( !mask ) ++ continue; ++ ++ mask = SET_ALL_BITS_IN_ZERO_BYTES(mask); ++ ++ ++ int tc = PACKW_SH(tc0[i], tc0[i]); ++ int tc0_p = tc; ++ int tc0_m = PACKW_SH(-tc0[i], -tc0[i]); ++ ++ /* ++ int i_delta; ++ if( ABS( p2 - p0 ) < beta ) { ++ pix[-2*stride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] ); ++ tc++; ++ }*/ ++ ++ p2 = LD32(pix - 3*stride); ++ mask2 = PABS_DIFF_LESS_THAN(p2, p0, beta) & ~mask; ++ ++ if ( mask2 ){ ++ mask2 = SET_ALL_BITS_IN_ZERO_BYTES(mask2); ++ asm ("pavg.ub\t%[tmp], %[p0], %[q0]\n" ++ "paddh.ub\t%[tmp], %[tmp], %[p2]\n" ++ "punpckub.h\t%[tmp2], %[tmp]:t\n" ++ "punpckub.h\t%[tmp], %[tmp]:b\n" ++ "punpckub.h\t%[tmp3], %[p1]:t\n" ++ "punpckub.h\t%[tmp4], %[p1]:b\n" ++ "psub.h\t%[tmp2], %[tmp2], %[tmp3]\n" ++ "psub.h\t%[tmp], %[tmp], %[tmp4]\n" ++ "pmin.sh\t%[tmp2], %[tmp2], %[tc0_p]\n" ++ "pmin.sh\t%[tmp], %[tmp], %[tc0_p]\n" ++ "pmax.sh\t%[tmp2], %[tmp2], %[tc0_m]\n" ++ "pmax.sh\t%[tmp], %[tmp], %[tc0_m]\n" ++ "padd.h\t%[tmp2], %[tmp2], %[tmp3]\n" ++ "padd.h\t%[tmp], %[tmp], %[tmp4]\n" ++ "packsh.ub\t%[tmp], %[tmp2], %[tmp]\n" ++ "andn\t%[tmp], %[mask2]\n" ++ "and\t%[tmp2], %[q1], %[mask2]\n" ++ "or\t%[tmp], %[tmp2]\n" ++ : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3), ++ [tmp4]"=&r"(tmp4) ++ : [q0]"r"(q0), [p2]"r"(p2), [p1]"r"(p1), [p0]"r"(p0), [q1]"r"(q1), [tc0_p]"r"(tc0_p), ++ [tc0_m]"r"(tc0_m), [mask2]"r"(mask2)); ++ ST32(pix - 2*stride, tmp); ++ tc += 0x00010001; ++ } ++ ++ ++ q2 = LD32(pix + 2*stride); ++ ++ /* ++ if( ABS( q2 - q0 ) < beta ) { ++ pix[ stride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] ); ++ tc++; ++ } ++ */ ++ mask2 = PABS_DIFF_LESS_THAN(q2, q0, beta) & ~mask; ++ ++ if ( mask2 ){ ++ mask2 = SET_ALL_BITS_IN_ZERO_BYTES(mask2); ++ asm ("pavg.ub\t%[tmp], %[p0], %[q0]\n" ++ "paddh.ub\t%[tmp], %[tmp], 
%[q2]\n" ++ "punpckub.h\t%[tmp2], %[tmp]:t\n" ++ "punpckub.h\t%[tmp], %[tmp]:b\n" ++ "punpckub.h\t%[tmp3], %[q1]:t\n" ++ "punpckub.h\t%[tmp4], %[q1]:b\n" ++ "psub.h\t%[tmp2], %[tmp2], %[tmp3]\n" ++ "psub.h\t%[tmp], %[tmp], %[tmp4]\n" ++ "pmin.sh\t%[tmp2], %[tmp2], %[tc0_p]\n" ++ "pmin.sh\t%[tmp], %[tmp], %[tc0_p]\n" ++ "pmax.sh\t%[tmp2], %[tmp2], %[tc0_m]\n" ++ "pmax.sh\t%[tmp], %[tmp], %[tc0_m]\n" ++ "padd.h\t%[tmp2], %[tmp2], %[tmp3]\n" ++ "padd.h\t%[tmp], %[tmp], %[tmp4]\n" ++ "packsh.ub\t%[tmp], %[tmp2], %[tmp]\n" ++ "andn\t%[tmp], %[mask2]\n" ++ "and\t%[tmp2], %[q1], %[mask2]\n" ++ "or\t%[tmp], %[tmp2]\n" ++ : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3), ++ [tmp4]"=&r"(tmp4) ++ : [q0]"r"(q0), [q2]"r"(q2), [q1]"r"(q1), [p0]"r"(p0), [tc0_p]"r"(tc0_p), ++ [tc0_m]"r"(tc0_m), [mask2]"r"(mask2)); ++ ST32(pix + stride, tmp); ++ tc += 0x00010001; ++ } ++ ++ uint32_t old_p0 = p0; ++ uint32_t old_q0 = q0; ++ ++ /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); ++ pix[-stride] = clip_uint8( p0 + i_delta ); ++ pix[0] = clip_uint8( q0 - i_delta ); */ ++ ++ asm ( ++ /* Check if the two upper pixels should be filtered */ ++ "lsr\t%[tmp], %[inv_mask], 16\n" ++ "breq\t0f\n" ++ ++ "punpckub.h\t%[tmp], %[p1]:t\n" ++ "punpckub.h\t%[tmp2], %[q1]:t\n" ++ ++ /* p1 - q1 */ ++ "psub.h\t%[tmp], %[tmp], %[tmp2]\n" ++ ++ "punpckub.h\t%[tmp3], %[q0]:t\n" ++ "punpckub.h\t%[tmp4], %[p0]:t\n" ++ ++ /* q0 - p0 */ ++ "psub.h\t%[tmp2], %[tmp3], %[tmp4]\n" ++ ++ /* (q0 - p0) << 2 */ ++ "plsl.h\t%[tmp2], %[tmp2], 2\n" ++ ++ /* ((q0 - p0) << 2) + (p1 - q1) */ ++ "padd.h\t%[tmp2], %[tmp2], %[tmp]\n" ++ ++ "mov\t%[tmp], 0x00040004\n" ++ /* ((q0 - p0) << 2) + (p1 - q1) + 4*/ ++ "padd.h\t%[tmp2], %[tmp2], %[tmp]\n" ++ ++ /* (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3*/ ++ "pasr.h\t%[tmp2], %[tmp2], 3\n" ++ ++ "mov\t%[tmp], 0\n" ++ "psub.h\t%[tmp], %[tmp], %[tc]\n" ++ ++ /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); */ ++ "pmin.sh\t%[tmp2], %[tmp2], 
%[tc]\n" ++ "pmax.sh\t%[tmp2], %[tmp2], %[tmp]\n" ++ ++ ++ /* pix[-stride] = clip_uint8( p0 + i_delta ); */ ++ "padd.h\t%[tmp4], %[tmp4], %[tmp2]\n" ++ ++ ++ /* pix[0] = clip_uint8( q0 - i_delta ); */ ++ "psub.h\t%[tmp3], %[tmp3], %[tmp2]\n" ++ ++ /* Check if the two lower pixels should be filtered */ ++ "lsl\t%[tmp2], %[inv_mask], 16\n" ++ "breq\t1f\n" ++ ++ "0:\n" ++ "punpckub.h\t%[p1], %[p1]:b\n" ++ "punpckub.h\t%[q1], %[q1]:b\n" ++ ++ /* p1 - q1 */ ++ "psub.h\t%[p1], %[p1], %[q1]\n" ++ ++ "punpckub.h\t%[q0], %[q0]:b\n" ++ "punpckub.h\t%[p0], %[p0]:b\n" ++ ++ /* q0 - p0 */ ++ "psub.h\t%[tmp2], %[q0], %[p0]\n" ++ ++ /* (q0 - p0) << 2 */ ++ "plsl.h\t%[tmp2], %[tmp2], 2\n" ++ ++ /* ((q0 - p0) << 2) + (p1 - q1) */ ++ "padd.h\t%[tmp2], %[tmp2], %[p1]\n" ++ ++ "mov\t%[q1], 0x00040004\n" ++ /* ((q0 - p0) << 2) + (p1 - q1) + 4*/ ++ "padd.h\t%[tmp2], %[tmp2], %[q1]\n" ++ ++ /* (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3*/ ++ "pasr.h\t%[tmp2], %[tmp2], 3\n" ++ ++ /* i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc ); */ ++ "pmin.sh\t%[tmp2], %[tmp2], %[tc]\n" ++ "pmax.sh\t%[tmp2], %[tmp2], %[tmp]\n" ++ ++ /* pix[-stride] = clip_uint8( p0 + i_delta ); */ ++ "padd.h\t%[p0], %[p0], %[tmp2]\n" ++ ++ /* pix[0] = clip_uint8( q0 - i_delta ); */ ++ "psub.h\t%[q0], %[q0], %[tmp2]\n" ++ ++ "1:\n" ++ "packsh.ub\t%[p0], %[tmp4], %[p0]\n" ++ "packsh.ub\t%[q0], %[tmp3], %[tmp4]\n" ++ ++ : [tmp]"=&r"(tmp), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3), ++ [tmp4]"=&r"(tmp4), [q0]"=&r"(q0), [q1]"=&r"(q1), [p0]"=&r"(p0), [p1]"=&r"(p1) ++ : [tc]"r"(tc), [inv_mask]"r"(~mask)); ++ ++ ST32(pix - stride, (mask & old_p0) | (p0 & ~mask)); ++ ST32(pix, (mask & old_q0) | (q0 & ~mask)); ++ ++ } ++ pix += 1; ++} ++ ++ ++ ++ ++#ifdef CHECK_DSP_FUNCS_AGAINST_C ++ ++void dump_block8(uint8_t *block, int line_size, int h){ ++ int i, j; ++ ++ for ( i = 0; i < h ; i++ ){ ++ av_log(NULL, AV_LOG_ERROR, "\t"); ++ for ( j = 0; j < 8 ; j++ ){ ++ av_log(NULL, AV_LOG_ERROR, "%d ", block[j + i*line_size]); ++ } 
++ av_log(NULL, AV_LOG_ERROR, "\n"); ++ } ++} ++ ++void dump_block4(uint8_t *block, int line_size, int h){ ++ int i, j; ++ ++ for ( i = 0; i < h ; i++ ){ ++ av_log(NULL, AV_LOG_ERROR, "\t"); ++ for ( j = 0; j < 4 ; j++ ){ ++ av_log(NULL, AV_LOG_ERROR, "%d ", block[j + i*line_size]); ++ } ++ av_log(NULL, AV_LOG_ERROR, "\n"); ++ } ++} ++ ++void dump_block(uint8_t *block, int line_size, int h, int w){ ++ int i, j; ++ ++ for ( i = 0; i < h ; i++ ){ ++ av_log(NULL, AV_LOG_ERROR, "\t"); ++ for ( j = 0; j < w ; j++ ){ ++ av_log(NULL, AV_LOG_ERROR, "%d ", block[j + i*line_size]); ++ } ++ av_log(NULL, AV_LOG_ERROR, "\n"); ++ } ++} ++ ++void check_block8(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct, ++ int h, char *name, int max_dev){ ++ int i,j; ++ for ( i = 0; i < 8 ; i++ ){ ++ for ( j = 0; j < h ; j++ ){ ++ int diff = test[i + line_size_test*j] - correct[i + line_size_correct*j]; ++ diff = diff < 0 ? -diff : diff; ++ if ( diff > max_dev ){ ++ av_log(NULL, AV_LOG_ERROR, "Error pixel x=%i, y=%i differs. Is 0x%x should be 0x%x\n", ++ i, j, test[i + line_size_test*j], correct[i + j*line_size_correct]); ++ av_log(NULL, AV_LOG_ERROR, "Error resulting block from %s is:\n", name); ++ dump_block8(test, line_size_test, h); ++ av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n"); ++ dump_block8(correct, line_size_correct, h); ++ exit(1); ++ } ++ } ++ } ++} ++ ++void check_block4(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct, ++ int h, char *name, int max_dev){ ++ int i,j; ++ for ( i = 0; i < 4 ; i++ ){ ++ for ( j = 0; j < h ; j++ ){ ++ int diff = test[i + line_size_test*j] - correct[i + line_size_correct*j]; ++ diff = diff < 0 ? -diff : diff; ++ if ( diff > max_dev ){ ++ av_log(NULL, AV_LOG_ERROR, "Error pixel x=%i, y=%i differs. 
Is 0x%x should be 0x%x\n", ++ i, j, test[i + line_size_test*j], correct[i + j*line_size_correct]); ++ av_log(NULL, AV_LOG_ERROR, "Error resulting block from %s is:\n", name); ++ dump_block4(test, line_size_test, h); ++ av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n"); ++ dump_block4(correct, line_size_correct, h); ++ exit(1); ++ } ++ } ++ } ++} ++ ++void check_block(uint8_t *test, uint8_t *correct, int line_size_test, int line_size_correct, ++ int h, int width, char *name, int max_dev){ ++ int i,j; ++ for ( i = 0; i < width ; i++ ){ ++ for ( j = 0; j < h ; j++ ){ ++ int diff = test[i + line_size_test*j] - correct[i + line_size_correct*j]; ++ diff = diff < 0 ? -diff : diff; ++ if ( diff > max_dev ){ ++ av_log(NULL, AV_LOG_ERROR, "Error pixel x=%i, y=%i differs. Is 0x%x should be 0x%x\n", ++ i, j, test[i + line_size_test*j], correct[i + j*line_size_correct]); ++ av_log(NULL, AV_LOG_ERROR, "Error resulting block from %s is:\n", name); ++ dump_block(test, line_size_test, h, width); ++ av_log(NULL, AV_LOG_ERROR, "But should be equal to:\n"); ++ dump_block(correct, line_size_correct, h, width); ++ exit(1); ++ } ++ } ++ } ++} ++ ++void dump_dct_block(DCTELEM *block){ ++ int i, j; ++ ++ for ( i = 0; i < 8 ; i++ ){ ++ av_log(NULL, AV_LOG_ERROR, "\t"); ++ for ( j = 0; j < 8 ; j++ ){ ++ av_log(NULL, AV_LOG_ERROR, "0x%x ", block[j + i*8]); ++ } ++ av_log(NULL, AV_LOG_ERROR, "\n"); ++ } ++} ++ ++void test_idct_avr32(DCTELEM *block){ ++ DCTELEM testBlock[64]; ++ int i, j; ++ ++ /* Copy transposed block to testBlock */ ++ for ( i = 0; i < 8 ; i++ ){ ++ for ( j = 0; j < 8 ; j++ ){ ++ testBlock[i + 8*j] = block[j + i*8]; ++ } ++ } ++ ++ idct_avr32(block); ++ simple_idct(&testBlock); ++ ++ for ( i = 0; i < 64 ; i++ ){ ++ if ( block[i] != testBlock[i] ){ ++ av_log(NULL, AV_LOG_ERROR, "Error resulting block from idct is:\n"); ++ dump_dct_block(block); ++ av_log(NULL, AV_LOG_ERROR, "But should be equal to the transposed of:\n"); ++ dump_dct_block(testBlock); ++ exit(1); ++ } ++ 
} ++} ++ ++void test_idct_put_avr32(uint8_t *dest, int line_size, DCTELEM *block){ ++ uint8_t testBlock[64]; ++ DCTELEM blockCopy[64]; ++ int i, j; ++ ++ /* Copy transposed block to blockCopy */ ++ for ( i = 0; i < 8 ; i++ ){ ++ for ( j = 0; j < 8 ; j++ ){ ++ blockCopy[i + 8*j] = block[j + i*8]; ++ } ++ } ++ ++ idct_put_avr32(dest, line_size, block); ++ simple_idct_put(&testBlock, 8, blockCopy); ++ ++ check_block8(dest, testBlock, line_size, 8, 8, "idct_put", 1); ++} ++ ++ ++void test_idct_add_avr32(uint8_t *dest, int line_size, DCTELEM *block){ ++ uint8_t testBlock[64]; ++ DCTELEM blockCopy[64]; ++ int i, j; ++ ++ /* Copy dest to testBlock */ ++ for ( i = 0; i < 8 ; i++ ){ ++ for ( j = 0; j < 8 ; j++ ){ ++ testBlock[i + 8*j] = dest[i + j*line_size]; ++ } ++ } ++ ++ /* Copy transposed block to blockCopy */ ++ for ( i = 0; i < 8 ; i++ ){ ++ for ( j = 0; j < 8 ; j++ ){ ++ blockCopy[i + 8*j] = block[j + i*8]; ++ } ++ } ++ ++ idct_add_avr32(dest, line_size, block); ++ simple_idct_add(&testBlock, 8, blockCopy); ++ ++ check_block8(dest, testBlock, line_size, 8, 8, "idct_add", 1); ++} ++ ++void test_h264_idct_add_avr32(uint8_t *dest, DCTELEM *block, int stride){ ++ uint8_t testBlock[16]; ++ DCTELEM blockCopy[16]; ++ int i, j; ++ ++ /* Copy dest to testBlock */ ++ for ( i = 0; i < 4 ; i++ ){ ++ for ( j = 0; j < 4 ; j++ ){ ++ testBlock[i + 4*j] = dest[i + j*stride]; ++ } ++ } ++ ++ /* Copy transposed block to blockCopy */ ++ for ( i = 0; i < 16 ; i++ ){ ++ blockCopy[i] = block[i]; ++ } ++ ++ ff_h264_idct_add_c(dest, block, stride); ++ ++ h264_idct_add_avr32(testBlock, blockCopy, 4); ++ ++ check_block(dest, testBlock, stride, 4, 4, 4, "h264_idct_add", 0); ++} ++ ++void test_h264_idct8_add_avr32(uint8_t *dest, DCTELEM *block, int stride){ ++ uint8_t testBlock[8*8]; ++ DCTELEM blockCopy[8*8]; ++ int i, j; ++ ++ /* Copy dest to testBlock */ ++ for ( i = 0; i < 8 ; i++ ){ ++ for ( j = 0; j < 8 ; j++ ){ ++ testBlock[i + 8*j] = dest[i + j*stride]; ++ } ++ } ++ ++ /* Copy source 
block to blockCopy */ ++ for ( i = 0; i < 8*8 ; i++ ){ ++ blockCopy[i] = block[i]; ++ } ++ ++ ff_h264_idct8_add_c(dest, block, stride); ++ h264_idct8_add_avr32(testBlock, blockCopy, 8); ++ ++ check_block(dest, testBlock, stride, 8, 8, 8, "h264_idct8_add", 0); ++} ++ ++void test_put_pixels_funcs8(op_pixels_func test, op_pixels_func correct, uint8_t *block, ++ const uint8_t *pixels, int line_size, int h, char *name, int in_h_size, int in_v_size){ ++ uint8_t *testBlock, *testBlock2; ++ int i, j; ++ int input_v_size = h + in_v_size; ++ int input_h_size = 8 + in_h_size; ++ ++ testBlock = alloca(input_h_size*input_v_size); ++ testBlock2 = alloca(input_h_size*input_v_size); ++ ++ for ( i = 0; i < input_h_size ; i++ ){ ++ for ( j = 0; j < input_v_size ; j++ ){ ++ testBlock[i + input_h_size*j] = pixels[i + j*line_size]; ++ } ++ } ++ ++ test(block, pixels, line_size, h); ++ correct(testBlock2, testBlock, input_h_size, h); ++ ++ check_block8(block, testBlock2, line_size, input_h_size, h, name, 0); ++ ++} ++ ++void test_h264_chroma_mc_funcs(h264_chroma_mc_func test, h264_chroma_mc_func correct, uint8_t *dst, ++ uint8_t *src, int stride, int h, int w, int x, int y, char *name){ ++ uint8_t *testBlock, *testBlock2; ++ int i, j; ++ int input_v_size = h + 1; ++ int input_h_size = ((w + 1) + 3) & ~3; ++ ++ testBlock = alloca(input_h_size*input_v_size); ++ testBlock2 = alloca(input_h_size*input_v_size); ++ ++ for ( i = 0; i < w + 1 ; i++ ){ ++ for ( j = 0; j < h + 1 ; j++ ){ ++ testBlock[i + input_h_size*j] = src[i + j*stride]; ++ } ++ } ++ ++ for ( i = 0; i < w ; i++ ){ ++ for ( j = 0; j < h ; j++ ){ ++ testBlock2[i + input_h_size*j] = dst[i + j*stride]; ++ } ++ } ++ ++ test(dst, src, stride, h, x, y); ++ correct(testBlock2, testBlock, input_h_size, h, x, y); ++ ++ check_block(dst, testBlock2, stride, input_h_size, h, w, name, 0); ++ ++} ++ ++void test_qpel_mc_funcs(qpel_mc_func test, qpel_mc_func correct, uint8_t *dst, ++ uint8_t *src, int stride, int size, char *name){ ++ uint8_t 
*testBlock, *testBlock2; ++ int i, j; ++ int test_stride = size + 8; ++ ++ testBlock = alloca(test_stride*(size+8)) + 4 + test_stride*4; ++ testBlock2 = alloca(test_stride*size); ++ ++ for ( i = -4; i < size+4 ; i++ ){ ++ for ( j = -4; j < size+4 ; j++ ){ ++ testBlock[i + test_stride*j] = src[i + j*stride]; ++ } ++ } ++ ++ for ( i = 0; i < size ; i++ ){ ++ for ( j = 0; j < size ; j++ ){ ++ testBlock2[i + test_stride*j] = dst[i + j*stride]; ++ } ++ } ++ ++ correct(dst, src, stride); ++ test(testBlock2, testBlock, test_stride); ++ ++ check_block(testBlock2, dst, test_stride, stride, size, size, name, 0); ++ ++} ++ ++ ++#define test_pixels_funcs(PFX, NUM ) \ ++void test_ ## PFX ## _pixels ## NUM ## _avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \ ++ test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _avr32, PFX ## _pixels ## NUM ## _c, \ ++ block, pixels, line_size, h, "test_" #PFX "_pixels", 0, 0); } \ ++void test_ ## PFX ## _pixels ## NUM ## _h_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \ ++ test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _h_avr32, PFX ## _pixels ## NUM ## _x2_c, \ ++ block, pixels, line_size, h, "test_" #PFX "_pixels_h", 1, 0); } \ ++void test_ ## PFX ## _pixels ## NUM ## _v_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \ ++ test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _v_avr32, PFX ## _pixels ## NUM ## _y2_c, \ ++ block, pixels, line_size, h, "test_" #PFX "_pixels_v", 0, 1); } \ ++void test_ ## PFX ## _pixels ## NUM ## _hv_avr32( uint8_t *block, const uint8_t *pixels, int line_size, int h){ \ ++ test_put_pixels_funcs8(PFX ## _pixels ## NUM ## _hv_avr32, PFX ## _pixels ## NUM ## _xy2_c, \ ++ block, pixels, line_size, h, "test_" #PFX "_pixels_hv", 1, 1); } ++ ++test_pixels_funcs(put, 8); ++test_pixels_funcs(put_no_rnd, 8); ++test_pixels_funcs(put, 16); ++test_pixels_funcs(put_no_rnd, 16); ++ ++test_pixels_funcs(avg, 8); ++test_pixels_funcs(avg_no_rnd, 8); 
++test_pixels_funcs(avg, 16); ++test_pixels_funcs(avg_no_rnd, 16); ++ ++#define test_h264_chroma_mc_funcs(PFX, NUM ) \ ++void test_ ## PFX ## _h264_chroma_mc ## NUM ## _pico( uint8_t *dst, uint8_t *src, int stride, int h, int x, int y){ \ ++ test_h264_chroma_mc_funcs(PFX ## _h264_chroma_mc ## NUM ## _pico, PFX ## _h264_chroma_mc ## NUM ## _c, \ ++ dst, src, stride, h, NUM, x, y, "test_" #PFX "_h264_chroma_mc" #NUM "_pico"); } \ ++ ++test_h264_chroma_mc_funcs(put, 2); ++test_h264_chroma_mc_funcs(put, 4); ++test_h264_chroma_mc_funcs(put, 8); ++test_h264_chroma_mc_funcs(avg, 2); ++test_h264_chroma_mc_funcs(avg, 4); ++test_h264_chroma_mc_funcs(avg, 8); ++ ++#define test_qpel_mc_funcs_type(PFX, NUM, TYPE ) \ ++void test_ ## PFX ## NUM ## _ ## TYPE ## _pico( uint8_t *dst, uint8_t *src, int stride){ \ ++ test_qpel_mc_funcs(PFX ## NUM ## _ ## TYPE ## _pico, PFX ## NUM ## _ ## TYPE ## _c, \ ++ dst, src, stride, NUM, "test_" #PFX #NUM "_" #TYPE "_pico"); } ++ ++#define test_qpel_mc_funcs(PFX, NUM) \ ++ test_qpel_mc_funcs_type(PFX, NUM, mc00);\ ++ test_qpel_mc_funcs_type(PFX, NUM, mc10);\ ++ test_qpel_mc_funcs_type(PFX, NUM, mc20);\ ++ test_qpel_mc_funcs_type(PFX, NUM, mc30);\ ++ test_qpel_mc_funcs_type(PFX, NUM, mc01);\ ++ test_qpel_mc_funcs_type(PFX, NUM, mc11);\ ++ test_qpel_mc_funcs_type(PFX, NUM, mc21);\ ++ test_qpel_mc_funcs_type(PFX, NUM, mc31);\ ++ test_qpel_mc_funcs_type(PFX, NUM, mc02);\ ++ test_qpel_mc_funcs_type(PFX, NUM, mc12);\ ++ test_qpel_mc_funcs_type(PFX, NUM, mc22);\ ++ test_qpel_mc_funcs_type(PFX, NUM, mc32);\ ++ test_qpel_mc_funcs_type(PFX, NUM, mc03);\ ++ test_qpel_mc_funcs_type(PFX, NUM, mc13);\ ++ test_qpel_mc_funcs_type(PFX, NUM, mc23);\ ++ test_qpel_mc_funcs_type(PFX, NUM, mc33) ++ ++test_qpel_mc_funcs(put_h264_qpel, 4); ++test_qpel_mc_funcs(put_h264_qpel, 8); ++test_qpel_mc_funcs(put_h264_qpel, 16); ++test_qpel_mc_funcs(avg_h264_qpel, 4); ++test_qpel_mc_funcs(avg_h264_qpel, 8); ++test_qpel_mc_funcs(avg_h264_qpel, 16); ++ ++ ++#define dspfunc(PFX, 
IDX, NUM) \ ++ c->PFX ## _pixels_tab[IDX][ 0] = DSP_FUNC_NAME( PFX ## NUM ## _mc00_pico ); \ ++ c->PFX ## _pixels_tab[IDX][ 1] = DSP_FUNC_NAME( PFX ## NUM ## _mc10_pico ); \ ++ c->PFX ## _pixels_tab[IDX][ 2] = DSP_FUNC_NAME( PFX ## NUM ## _mc20_pico ); \ ++ c->PFX ## _pixels_tab[IDX][ 3] = DSP_FUNC_NAME( PFX ## NUM ## _mc30_pico ); \ ++ c->PFX ## _pixels_tab[IDX][ 4] = DSP_FUNC_NAME( PFX ## NUM ## _mc01_pico ); \ ++ c->PFX ## _pixels_tab[IDX][ 5] = DSP_FUNC_NAME( PFX ## NUM ## _mc11_pico ); \ ++ c->PFX ## _pixels_tab[IDX][ 6] = DSP_FUNC_NAME( PFX ## NUM ## _mc21_pico ); \ ++ c->PFX ## _pixels_tab[IDX][ 7] = DSP_FUNC_NAME( PFX ## NUM ## _mc31_pico ); \ ++ c->PFX ## _pixels_tab[IDX][ 8] = DSP_FUNC_NAME( PFX ## NUM ## _mc02_pico ); \ ++ c->PFX ## _pixels_tab[IDX][ 9] = DSP_FUNC_NAME( PFX ## NUM ## _mc12_pico ); \ ++ c->PFX ## _pixels_tab[IDX][10] = DSP_FUNC_NAME( PFX ## NUM ## _mc22_pico ); \ ++ c->PFX ## _pixels_tab[IDX][11] = DSP_FUNC_NAME( PFX ## NUM ## _mc32_pico ); \ ++ c->PFX ## _pixels_tab[IDX][12] = DSP_FUNC_NAME( PFX ## NUM ## _mc03_pico ); \ ++ c->PFX ## _pixels_tab[IDX][13] = DSP_FUNC_NAME( PFX ## NUM ## _mc13_pico ); \ ++ c->PFX ## _pixels_tab[IDX][14] = DSP_FUNC_NAME( PFX ## NUM ## _mc23_pico ); \ ++ c->PFX ## _pixels_tab[IDX][15] = DSP_FUNC_NAME( PFX ## NUM ## _mc33_pico ) ++ ++#endif ++ ++void dsputil_init_avr32(DSPContext* c, AVCodecContext *avctx) ++{ ++ ++ /* H264 */ ++ ++ if ( 0 /*avr32_use_pico*/ ){ ++ c->put_h264_chroma_pixels_tab[0]= DSP_FUNC_NAME(put_h264_chroma_mc8_pico); ++ c->put_h264_chroma_pixels_tab[1]= DSP_FUNC_NAME(put_h264_chroma_mc4_pico); ++ c->put_h264_chroma_pixels_tab[2]= DSP_FUNC_NAME(put_h264_chroma_mc2_pico); ++ ++ c->avg_h264_chroma_pixels_tab[0]= DSP_FUNC_NAME(avg_h264_chroma_mc8_pico); ++ c->avg_h264_chroma_pixels_tab[1]= DSP_FUNC_NAME(avg_h264_chroma_mc4_pico); ++ c->avg_h264_chroma_pixels_tab[2]= DSP_FUNC_NAME(avg_h264_chroma_mc2_pico); ++ } ++ ++#define dspfunc(PFX, IDX, NUM) \ ++ c->PFX ## _pixels_tab[IDX][ 0] = 
DSP_FUNC_NAME( PFX ## NUM ## _mc00_pico ); \ ++ c->PFX ## _pixels_tab[IDX][ 1] = DSP_FUNC_NAME( PFX ## NUM ## _mc10_pico ); \ ++ c->PFX ## _pixels_tab[IDX][ 2] = DSP_FUNC_NAME( PFX ## NUM ## _mc20_pico ); \ ++ c->PFX ## _pixels_tab[IDX][ 3] = DSP_FUNC_NAME( PFX ## NUM ## _mc30_pico ); \ ++ c->PFX ## _pixels_tab[IDX][ 4] = DSP_FUNC_NAME( PFX ## NUM ## _mc01_pico ); \ ++ c->PFX ## _pixels_tab[IDX][ 5] = DSP_FUNC_NAME( PFX ## NUM ## _mc11_pico ); \ ++ c->PFX ## _pixels_tab[IDX][ 6] = DSP_FUNC_NAME( PFX ## NUM ## _mc21_pico ); \ ++ c->PFX ## _pixels_tab[IDX][ 7] = DSP_FUNC_NAME( PFX ## NUM ## _mc31_pico ); \ ++ c->PFX ## _pixels_tab[IDX][ 8] = DSP_FUNC_NAME( PFX ## NUM ## _mc02_pico ); \ ++ c->PFX ## _pixels_tab[IDX][ 9] = DSP_FUNC_NAME( PFX ## NUM ## _mc12_pico ); \ ++ c->PFX ## _pixels_tab[IDX][10] = DSP_FUNC_NAME( PFX ## NUM ## _mc22_pico ); \ ++ c->PFX ## _pixels_tab[IDX][11] = DSP_FUNC_NAME( PFX ## NUM ## _mc32_pico ); \ ++ c->PFX ## _pixels_tab[IDX][12] = DSP_FUNC_NAME( PFX ## NUM ## _mc03_pico ); \ ++ c->PFX ## _pixels_tab[IDX][13] = DSP_FUNC_NAME( PFX ## NUM ## _mc13_pico ); \ ++ c->PFX ## _pixels_tab[IDX][14] = DSP_FUNC_NAME( PFX ## NUM ## _mc23_pico ); \ ++ c->PFX ## _pixels_tab[IDX][15] = DSP_FUNC_NAME( PFX ## NUM ## _mc33_pico ) ++ ++ if ( avr32_use_pico ){ ++ dspfunc(put_h264_qpel, 0, 16); ++ dspfunc(put_h264_qpel, 1, 8); ++ dspfunc(put_h264_qpel, 2, 4); ++ dspfunc(avg_h264_qpel, 0, 16); ++ dspfunc(avg_h264_qpel, 1, 8); ++ dspfunc(avg_h264_qpel, 2, 4); ++ } ++ ++ c->idct_put= DSP_FUNC_NAME(idct_put_avr32); ++ c->idct_add= DSP_FUNC_NAME(idct_add_avr32); ++ c->idct = DSP_FUNC_NAME(idct_avr32); ++ c->h264_idct_add = DSP_FUNC_NAME(h264_idct_add_avr32); ++ c->h264_idct8_add = DSP_FUNC_NAME(h264_idct8_add_avr32); ++ ++ /*c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_avr32;*/ ++ ++ c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM; ++ ++ c->fdct = fdct_avr32; ++ ++ c->clear_blocks = clear_blocks_avr32; ++ ++#undef dspfunc ++#define dspfunc(PFX, IDX, NUM) \ 
++ c->PFX ## _pixels_tab[IDX][0] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _avr32 ); \ ++ c->PFX ## _pixels_tab[IDX][1] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _h_avr32); \ ++ c->PFX ## _pixels_tab[IDX][2] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _v_avr32); \ ++ c->PFX ## _pixels_tab[IDX][3] = DSP_FUNC_NAME( PFX ## _pixels ## NUM ## _hv_avr32) ++ ++ dspfunc(put, 0, 16); ++ dspfunc(put_no_rnd, 0, 16); ++ dspfunc(put, 1, 8); ++ dspfunc(put_no_rnd, 1, 8); ++ ++ dspfunc(avg, 1, 8); ++ dspfunc(avg_no_rnd, 1, 8); ++ dspfunc(avg, 0, 16); ++ dspfunc(avg_no_rnd, 0, 16); ++#undef dspfunc ++ ++} ++ ++ ++ ++#if 0 ++int main(int argc, char *argv[]){ ++ ++ ++} ++#endif ++ +diff --git a/libavcodec/avr32/fdct.S b/libavcodec/avr32/fdct.S +new file mode 100644 +index 0000000..be45b86 +--- /dev/null ++++ b/libavcodec/avr32/fdct.S +@@ -0,0 +1,541 @@ ++/* ++ * Copyright (c) 2007 Atmel Corporation. All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * 2. Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials provided ++ * with the distribution. ++ * ++ * 3. The name of ATMEL may not be used to endorse or promote products ++ * derived from this software without specific prior written ++ * permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR ++ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. 
IN NO EVENT SHALL ATMEL ++ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, ++ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY ++ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH ++ * DAMAGE. ++ */ ++ ++//********************************************************** ++//* 2-D fDCT, Based on: * ++//* C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical * ++//* Fast 1-D DCT Algorithms with 11 Multiplications", * ++//* Proc. Int'l. Conf. on Acoustics, Speech, and Signal * ++//* Processing 1989 (ICASSP '89), pp. 988-991. * ++//* * ++//* Fixed point implementation optimized for the AVR-II * ++//* instruction set. If a table is used for the * ++//* coefficients we can load two and two of them from * ++//* This will give a reduction of ++//* * ++//* * ++//********************************************************** ++ ++ ++/* This routine is a slow-but-accurate integer implementation of the ++ * forward DCT (Discrete Cosine Transform). Taken from the IJG software ++ * ++ * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT ++ * on each column. Direct algorithms are also available, but they are ++ * much more complex and seem not to be any faster when reduced to code. ++ * ++ * This implementation is based on an algorithm described in ++ * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT ++ * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics, ++ * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991. ++ * The primary algorithm described there uses 11 multiplies and 29 adds. ++ * We use their alternate method with 12 multiplies and 32 adds.
++ * The advantage of this method is that no data path contains more than one ++ * multiplication; this allows a very simple and accurate implementation in ++ * scaled fixed-point arithmetic, with a minimal number of shifts. ++ * ++ * The poop on this scaling stuff is as follows: ++ * ++ * Each 1-D DCT step produces outputs which are a factor of sqrt(N) ++ * larger than the true DCT outputs. The final outputs are therefore ++ * a factor of N larger than desired; since N=8 this can be cured by ++ * a simple right shift at the end of the algorithm. The advantage of ++ * this arrangement is that we save two multiplications per 1-D DCT, ++ * because the y0 and y4 outputs need not be divided by sqrt(N). ++ * In the IJG code, this factor of 8 is removed by the quantization step ++ * (in jcdctmgr.c), here it is removed. ++ * ++ * We have to do addition and subtraction of the integer inputs, which ++ * is no problem, and multiplication by fractional constants, which is ++ * a problem to do in integer arithmetic. We multiply all the constants ++ * by CONST_SCALE and convert them to integer constants (thus retaining ++ * CONST_BITS bits of precision in the constants). After doing a ++ * multiplication we have to divide the product by CONST_SCALE, with proper ++ * rounding, to produce the correct output. This division can be done ++ * cheaply as a right shift of CONST_BITS bits. We postpone shifting ++ * as long as possible so that partial sums can be added together with ++ * full fractional precision. ++ * ++ * The outputs of the first pass are scaled up by PASS1_BITS bits so that ++ * they are represented to better-than-integral precision. These outputs ++ * require 8 + PASS1_BITS + 3 bits; this fits in a 16-bit word ++ * with the recommended scaling. (For 12-bit sample data, the intermediate ++ * array is INT32 anyway.) ++ * ++ * To avoid overflow of the 32-bit intermediate results in pass 2, we must ++ * have 8 + CONST_BITS + PASS1_BITS <= 26. 
Error analysis ++ * shows that the values given below are the most effective. ++ * ++ * We can gain a little more speed, with a further compromise in accuracy, ++ * by omitting the addition in a descaling shift. This yields an incorrectly ++ * rounded result half the time... ++ */ ++ ++ .global fdct_avr32 ++ ++ ++ ++#define CONST_BITS 13 ++#define PASS1_BITS 2 ++ ++#define FIX_0_298631336 2446 /* FIX(0.298631336) */ ++#define FIX_0_390180644 3196 /* FIX(0.390180644) */ ++#define FIX_0_541196100 4433 /* FIX(0.541196100) */ ++#define FIX_0_765366865 6270 /* FIX(0.765366865) */ ++#define FIX_0_899976223 7373 /* FIX(0.899976223) */ ++#define FIX_1_175875602 9633 /* FIX(1.175875602) */ ++#define FIX_1_501321110 12299 /* FIX(1.501321110) */ ++#define FIX_1_847759065 15137 /* FIX(1.847759065) */ ++#define FIX_1_961570560 16069 /* FIX(1.961570560) */ ++#define FIX_2_053119869 16819 /* FIX(2.053119869) */ ++#define FIX_2_562915447 20995 /* FIX(2.562915447) */ ++#define FIX_3_072711026 25172 /* FIX(3.072711026) */ ++ ++ ++/* ++ * Perform an integer forward DCT on one block of samples. ++ */ ++ ++//void ++//fdct_int32(short *const block) ++//{ ++// int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; ++// int tmp10, tmp11, tmp12, tmp13; ++// int z1, z2, z3, z4, z5; ++// short *blkptr; ++// int *dataptr; ++// int data[64]; ++// int i; ++// ++// /* Pass 1: process rows. */ ++// /* Note results are scaled up by sqrt(8) compared to a true DCT; */ ++// /* furthermore, we scale the results by 2**PASS1_BITS. 
*/ ++// ++// dataptr = data; ++// blkptr = block; ++ ++ .text ++fdct_avr32: ++ pushm r0-r3, r4-r7, lr ++#define loop_ctr r0 ++#define blkptr r12 ++#define x0 r1 ++#define x1 r2 ++#define x2 r3 ++#define x3 r4 ++#define x4 r5 ++#define x5 r6 ++#define x6 r7 ++#define x7 r8 ++#define tmp0 r5 ++#define tmp7 r2 ++#define tmp1 r3 ++#define tmp6 r4 ++#define tmp2 r9 ++#define tmp5 r8 ++#define tmp3 r7 ++#define tmp4 r6 ++ ++ ++ mov loop_ctr, 8 ++// for (i = 0; i < 8; i++) { ++ROW_LOOP: ++ ++ ldm blkptr, r1, r2, r3, r4 ++ ++// tmp2 = blkptr[2] + blkptr[5]; ++// tmp3 = blkptr[3] + blkptr[4]; ++ paddx.h r5, r3, r2 ++// tmp5 = blkptr[2] - blkptr[5]; ++// tmp4 = blkptr[3] - blkptr[4]; ++ psubx.h r6, r3, r2 ++// tmp0 = blkptr[0] + blkptr[7]; ++// tmp1 = blkptr[1] + blkptr[6]; ++ paddx.h r2, r4, r1 ++// tmp7 = blkptr[0] - blkptr[7]; ++// tmp6 = blkptr[1] - blkptr[6]; ++ psubx.h r3, r4, r1 ++ ++// /* Even part per LL&M figure 1 --- note that published figure is faulty; ++// * rotator "sqrt(2)*c1" should be "sqrt(2)*c6". ++// */ ++ ++#define tmp10 r1 ++#define tmp13 r5 ++#define tmp11 r7 ++#define tmp12 r3 ++#define z1 r9 ++ ++// tmp10 = tmp0 + tmp3; ++// tmp13 = tmp0 - tmp3; ++ paddsub.h r1, r2:t, r5:b ++// tmp11 = tmp1 + tmp2; ++// tmp12 = tmp1 - tmp2; ++ paddsub.h r4, r2:b, r5:t ++ ++ ++// dataptr[0] = (tmp10 + tmp11) << PASS1_BITS; ++// dataptr[4] = (tmp10 - tmp11) << PASS1_BITS; ++ paddsub.h r7, r1:t, r4:t ++ ld.w r10, pc[const_table - .] ++ plsl.h r7, r7, PASS1_BITS ++ ++// z1 = (tmp12 + tmp13) * FIX_0_541196100; ++ addhh.w r8, r4:b, r1:b ++ mulhh.w r8, r8:b, r10:t ++ ++// dataptr[2] = ++// DESCALE(z1 + tmp13 * FIX_0_765366865, CONST_BITS - PASS1_BITS); ++// dataptr[6] = ++// DESCALE(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS - PASS1_BITS); ++ mulhh.w r9, r1:b, r10:b ++ ld.w r10, pc[const_table - . 
+ 4] ++ add r1, r8, r9 ++ satrnds r1 >> (CONST_BITS - PASS1_BITS), 31 ++ ++ mulhh.w r9, r4:b, r10:t ++ add r4, r8, r9 ++ satrnds r4 >> (CONST_BITS - PASS1_BITS), 31 ++ ++ ++// /* Odd part per figure 8 --- note paper omits factor of sqrt(2). ++// * cK represents cos(K*pi/16). ++// * i0..i3 in the paper are tmp4..tmp7 here. ++// */ ++ ++#define z2 r5 ++#define z3 r6 ++#define z4 r7 ++#define z5 r8 ++ ++// z4 = tmp5 + tmp7; ++// z3 = tmp4 + tmp6; ++ padd.h r2, r6, r3 ++// z2 = tmp5 + tmp6; ++// z1 = tmp4 + tmp7; ++ paddx.h r5, r6, r3 ++ ++ lddpc r9, pc[const_table - . + 8] ++// z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */ ++ addhh.w r8, r2:t, r2:b ++ mulhh.w r8, r8:b, r10:b ++ lddpc r10, pc[const_table - . + 12] ++ ++ ++// tmp4 *= FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */ ++ mulhh.w r11, r6:b, r9:t ++ ++// tmp5 *= FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */ ++ mulhh.w r6, r6:t, r9:b ++ ++// tmp6 *= FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */ ++ lddpc r9, pc[const_table - . + 20] ++ mulhh.w lr, r3:b, r10:t ++ ++// tmp7 *= FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */ ++ mulhh.w r3, r3:t, r10:b ++ ++// z3 *= -FIX_1_961570560; /* sqrt(2) * (-c3-c5) */ ++ mulhh.w r10, r2:b, r9:t ++ ++// z4 *= -FIX_0_390180644; /* sqrt(2) * (c5-c3) */ ++ mulhh.w r2, r2:t, r9:b ++ lddpc r9, pc[const_table - . 
+ 16] ++// z3 += z5; ++// z4 += z5; ++ add r10, r8 ++ add r2, r8 ++ ++// z1 *= -FIX_0_899976223; /* sqrt(2) * (c7-c3) */ ++ mulhh.w r8, r5:b, r9:t ++ ++// z2 *= -FIX_2_562915447; /* sqrt(2) * (-c1-c3) */ ++ mulhh.w r5, r5:t, r9:b ++ ++// dataptr[7] = DESCALE(tmp4 + z1 + z3, CONST_BITS - PASS1_BITS); ++ add r11, r8 ++ add r11, r10 ++ satrnds r11 >> (CONST_BITS - PASS1_BITS), 31 ++ ++// dataptr[5] = DESCALE(tmp5 + z2 + z4, CONST_BITS - PASS1_BITS); ++ add r6, r5 ++ ++ sthh.w blkptr[6*2], r4:b, r11:b ++ add r6, r2 ++ satrnds r6 >> (CONST_BITS - PASS1_BITS), 31 ++ ++// dataptr[3] = DESCALE(tmp6 + z2 + z3, CONST_BITS - PASS1_BITS); ++ add lr, r5 ++ sthh.w blkptr[4*2], r7:b, r6:b ++ add lr, r10 ++ satrnds lr >> (CONST_BITS - PASS1_BITS), 31 ++ ++// dataptr[1] = DESCALE(tmp7 + z1 + z4, CONST_BITS - PASS1_BITS); ++ add r3, r8 ++ sthh.w blkptr[2*2], r1:b, lr:b ++ add r3, r2 ++ satrnds r3 >> (CONST_BITS - PASS1_BITS), 31 ++ ++ ++ ++// dataptr += 8; /* advance pointer to next row */ ++// blkptr += 8; ++ sthh.w blkptr[0], r7:t, r3:b ++ sub blkptr, -16 ++ sub loop_ctr, 1 ++ brne ROW_LOOP ++ ++// } ++ ++ /* Pass 2: process columns. ++ * We remove the PASS1_BITS scaling, but leave the results scaled up ++ * by an overall factor of 8. 
++ */ ++ ++// dataptr = data; ++ sub blkptr, 128 ++ ++ mov loop_ctr, 4 ++// for (i = 0; i < 8; i++) { ++COLOUMN_LOOP: ++ ld.w r1, blkptr[0] ++ ld.w r2, blkptr[1*8*2] ++ ld.w r3, blkptr[2*8*2] ++ ld.w r4, blkptr[3*8*2] ++ ld.w r5, blkptr[4*8*2] ++ ld.w r6, blkptr[5*8*2] ++ ld.w r7, blkptr[6*8*2] ++ ld.w r8, blkptr[7*8*2] ++ ++// tmp0 = blkptr[0] + blkptr[7*8]; ++ padds.sh r9, r1, r8 ++// tmp7 = blkptr[0] - blkptr[7*8]; ++ psubs.sh r1, r1, r8 ++// tmp1 = blkptr[1*8] + blkptr[6*8]; ++ padds.sh r8, r2, r7 ++// tmp6 = blkptr[1*8] - blkptr[6*8]; ++ psubs.sh r2, r2, r7 ++// tmp2 = blkptr[2*8] + blkptr[5*8]; ++ padds.sh r7, r3, r6 ++// tmp5 = blkptr[2*8] - blkptr[5*8]; ++ psubs.sh r3, r3, r6 ++// tmp3 = blkptr[3*8] + blkptr[4*8]; ++ padds.sh r6, r4, r5 ++// tmp4 = blkptr[3*8] - blkptr[4*8]; ++ psubs.sh r4, r4, r5 ++ ++// /* even part per ll&m figure 1 --- note that published figure is faulty; ++// * rotator "sqrt(2)*c1" should be "sqrt(2)*c6". ++// */ ++// ++// tmp10 = tmp0 + tmp3; ++ padds.sh r5, r9, r6 ++// tmp13 = tmp0 - tmp3; ++ psubs.sh r9, r9, r6 ++// tmp11 = tmp1 + tmp2; ++ padds.sh r6, r8, r7 ++// tmp12 = tmp1 - tmp2; ++ psubs.sh r8, r8, r7 ++ ++// dataptr[0] = DESCALE(tmp10 + tmp11, PASS1_BITS); ++// dataptr[32] = DESCALE(tmp10 - tmp11, PASS1_BITS); ++//Might get an overflow here ++ padds.sh r7, r5, r6 ++ psubs.sh r5, r5, r6 ++ ++ //Rounding ++ mov lr, (1 << (PASS1_BITS + 2)) ++ orh lr, hi(1 << (16 + PASS1_BITS + 2)) ++ padds.sh r7, r7, lr ++ padds.sh r5, r5, lr ++ ++ pasr.h r7, r7, PASS1_BITS + 3 ++ pasr.h r5, r5, PASS1_BITS + 3 ++ st.w r12[0], r7 ++ st.w r12[4*8*2], r5 ++ ++ lddpc r10, const_table2 ++ ++ ++// z1 = (tmp12 + tmp13) * FIX_0_541196100; ++ padds.sh r5, r8, r9 ++ mulhh.w r6, r5:t, r10:t ++ mulhh.w r7, r5:b, r10:t ++ ++// dataptr[16] = ++// DESCALE(z1 + tmp13 * FIX_0_765366865, CONST_BITS + PASS1_BITS); ++ lddpc r11, const_table2 + 4 ++ mulhh.w lr, r9:t, r10:b ++ mulhh.w r9, r9:b, r10:b ++ add lr, r6 ++ add r9, r7 ++ satrnds lr >> (CONST_BITS + 
PASS1_BITS + 3), 31 ++ satrnds r9 >> (CONST_BITS + PASS1_BITS + 3), 31 ++ sthh.w r12[2*8*2], lr:b, r9:b ++ ++// dataptr[48] = ++// DESCALE(z1 + tmp12 * (-FIX_1_847759065), CONST_BITS + PASS1_BITS); ++ mulhh.w lr, r8:t, r11:t ++ mulhh.w r8, r8:b, r11:t ++ add lr, r6 ++ add r8, r7 ++ satrnds lr >> (CONST_BITS + PASS1_BITS + 3), 31 ++ satrnds r8 >> (CONST_BITS + PASS1_BITS + 3), 31 ++ sthh.w r12[6*8*2], lr:b, r8:b ++ ++// /* Odd part per figure 8 --- note paper omits factor of sqrt(2). ++// * cK represents cos(K*pi/16). ++// * i0..i3 in the paper are tmp4..tmp7 here. ++// */ ++// ++// z2 = tmp5 + tmp6; ++// z3 = tmp4 + tmp6; ++// z4 = tmp5 + tmp7; ++ padds.sh r5, r3, r2 ++ padds.sh r6, r4, r2 ++ padds.sh r7, r3, r1 ++ ++// z5 = (z3 + z4) * FIX_1_175875602; /* sqrt(2) * c3 */ ++ padds.sh r8, r6, r7 ++ mulhh.w r9, r8:t, r11:b ++ mulhh.w r8, r8:b, r11:b ++ ++// z3 *= -FIX_1_961570560; /* sqrt(2) * (-c3-c5) */ ++// z3 += z5; ++ lddpc r11, const_table2 + 8 ++ mulhh.w r10, r6:t, r11:t ++ mulhh.w r6, r6:b, r11:t ++ add r10, r9 ++ add r6, r8 ++ ++// z4 *= -FIX_0_390180644; /* sqrt(2) * (c5-c3) */ ++// z4 += z5; ++ mulhh.w lr, r7:t, r11:b ++ mulhh.w r7, r7:b, r11:b ++ lddpc r11, const_table2 + 12 ++ st.w --sp,r0 ++ add lr, r9 ++ add r7, r8 ++ ++// tmp6 *= FIX_3_072711026; /* sqrt(2) * ( c1+c3+c5-c7) */ ++ mulhh.w r0, r2:t, r11:t ++ machh.w r0, r5:t, r11:b ++ mulhh.w r2, r2:b, r11:t ++ machh.w r2, r5:b, r11:b ++ ++// z2 *= -FIX_2_562915447; /* sqrt(2) * (-c1-c3) */ ++// dataptr[24] = DESCALE(tmp6 + z2 + z3, CONST_BITS + PASS1_BITS); ++ add r0, r10 ++ lddpc r11, const_table2 + 16 ++ add r2, r6 ++ satrnds r0 >> (CONST_BITS + PASS1_BITS + 3), 31 ++ satrnds r2 >> (CONST_BITS + PASS1_BITS + 3), 31 ++ sthh.w r12[3*8*2], r0:b, r2:b ++// tmp5 *= FIX_2_053119869; /* sqrt(2) * ( c1+c3-c5+c7) */ ++ mulhh.w r0, r3:t, r11:t ++ machh.w r0, r5:t, r11:b ++ mulhh.w r2, r3:b, r11:t ++ machh.w r2, r5:b, r11:b ++ add r0, lr ++ lddpc r11, const_table2 + 20 ++ add r2, r7 ++ ++// dataptr[40] = 
DESCALE(tmp5 + z2 + z4, CONST_BITS + PASS1_BITS); ++ satrnds r0 >> (CONST_BITS + PASS1_BITS + 3), 31 ++ satrnds r2 >> (CONST_BITS + PASS1_BITS + 3), 31 ++ sthh.w r12[5*8*2], r0:b, r2:b ++ ++ ++// z1 = tmp4 + tmp7; ++ padds.sh r2, r4, r1 ++ ++// tmp4 *= FIX_0_298631336; /* sqrt(2) * (-c1+c3+c5-c7) */ ++ mulhh.w r3, r4:t, r11:t ++ machh.w r3, r2:t, r11:b ++ mulhh.w r4, r4:b, r11:t ++ machh.w r4, r2:b, r11:b ++ add r3, r10 ++ lddpc r11, const_table2 + 24 ++ add r4, r6 ++ ++// z1 *= -FIX_0_899976223; /* sqrt(2) * (c7-c3) */ ++// dataptr[56] = DESCALE(tmp4 + z1 + z3, CONST_BITS + PASS1_BITS); ++ satrnds r3 >> (CONST_BITS + PASS1_BITS + 3), 31 ++ satrnds r4 >> (CONST_BITS + PASS1_BITS + 3), 31 ++ sthh.w r12[7*8*2], r3:b, r4:b ++ ++ ++// tmp7 *= FIX_1_501321110; /* sqrt(2) * ( c1+c3-c5-c7) */ ++ mulhh.w r3, r1:t, r11:t ++ machh.w r3, r2:t, r11:b ++ mulhh.w r4, r1:b, r11:t ++ machh.w r4, r2:b, r11:b ++ add r3, lr ++ add r4, r7 ++ ++// dataptr[8] = DESCALE(tmp7 + z1 + z4, CONST_BITS + PASS1_BITS); ++ satrnds r3 >> (CONST_BITS + PASS1_BITS + 3), 31 ++ satrnds r4 >> (CONST_BITS + PASS1_BITS + 3), 31 ++ sthh.w r12[1*8*2], r3:b, r4:b ++ ld.w r0, sp++ ++ ++// dataptr++; /* advance pointer to next column */ ++ sub blkptr, -4 ++ sub loop_ctr, 1 ++ brne COLOUMN_LOOP ++ ++// } ++ ++ popm r0-r3, r4-r7, pc ++ ++// /* descale */ ++// for (i = 0; i < 64; i++) ++// block[i] = (short int) DESCALE(data[i], 3); ++ ++ ++//} ++ ++ ++ .align 2 ++const_table: .short FIX_0_541196100, FIX_0_765366865, -FIX_1_847759065, FIX_1_175875602 ++ .short FIX_0_298631336, FIX_2_053119869, FIX_3_072711026, FIX_1_501321110 ++ .short -FIX_0_899976223,-FIX_2_562915447, -FIX_1_961570560, -FIX_0_390180644 ++ ++const_table2: .short FIX_0_541196100, FIX_0_765366865, -FIX_1_847759065, FIX_1_175875602 ++ .short -FIX_1_961570560, -FIX_0_390180644, FIX_3_072711026, -FIX_2_562915447 ++ .short FIX_2_053119869, -FIX_2_562915447, FIX_0_298631336, -FIX_0_899976223 ++ .short FIX_1_501321110, -FIX_0_899976223 ++ ++ ++ ++ 
+diff --git a/libavcodec/avr32/h264idct.S b/libavcodec/avr32/h264idct.S +new file mode 100644 +index 0000000..4b23e2d +--- /dev/null ++++ b/libavcodec/avr32/h264idct.S +@@ -0,0 +1,451 @@ ++/* ++ * Copyright (c) 2007 Atmel Corporation. All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * 2. Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials provided ++ * with the distribution. ++ * ++ * 3. The name of ATMEL may not be used to endorse or promote products ++ * derived from this software without specific prior written ++ * permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR ++ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL ++ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, ++ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY ++ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH ++ * DAMAGE. ++ */ ++ ++ .global h264_idct_add_avr32 ++ ++ /* Macro for performing the 1-D transform on one row line. ++ ++ The register 'w01' should contain the first two pixels, ++ and the register 'w23' should contain the last two pixels ++ in the line.
The resulting line is placed in w01 and w23 ++ so that { w01, w23 } = { x0, x1, x3, x2 }. ++ 'tmp' and 'tmp2' should be scratchpad registers. */ ++ .macro transform_row w01, w23, tmp, tmp2 ++ add \tmp, \w23, \w01 << 1 /* tmp = { xxxx, 2*w1 + w3 } */ ++ sub \tmp2, \w01, \w23 << 1 /* tmp2 = { xxxx, w1 - 2*w3 } */ ++ bfins \tmp2, \tmp, 16, 16 /* tmp2 = { 2*w1 + w3, w1 - 2*w3 } */ ++ pasr.h \tmp2, \tmp2, 1 /* tmp2 = { w1 + w3/2, w1/2 - w3 } */ ++ paddsub.h \tmp, \w01:t, \w23:t /* tmp = { w0 + w2, w0 - w2 } */ ++ padd.h \w01, \tmp, \tmp2 /* w01 = { w0 + w2 + w1 + w3/2, w0 - w2 + w1/2 - w3 } */ ++ psub.h \w23, \tmp, \tmp2 /* w23 = { w0 + w2 - w1 - w3/2, w0 - w2 - w1/2 + w3 } */ ++ .endm ++ ++ /* Macro for performing the 1-D transform on two columns. ++ ++ The registers w0, w1, w2, w3 should each contain two ++ packed samples from the two columns to transform. ++ tmp and tmp2 are scratchpad registers. ++ ++ The resulting transformed columns are placed in the ++ same positions as the input columns. ++ */ ++ .macro transform_2columns w0, w1, w2, w3, tmp, tmp2 ++ padd.h \tmp, \w0, \w2 /* tmp = z0 = w0 + w2 */ ++ psub.h \w0, \w0, \w2 /* w0 = z1 = w0 - w2 */ ++ pasr.h \w2, \w1, 1 /* w2 = w1/2 */ ++ pasr.h \tmp2, \w3, 1 /* tmp2 = w3/2 */ ++ psub.h \w3, \w2, \w3 /* w3 = z2 = w1/2 - w3 */ ++ padd.h \tmp2, \w1, \tmp2/* tmp2 = z3 = w1 + w3/2 */ ++ padd.h \w1, \w0, \w3 /* w1 = x1 = z1 + z2 */ ++ psub.h \w2, \w0, \w3 /* w2 = x2 = z1 - z2 */ ++ padd.h \w0, \tmp, \tmp2/* w0 = x0 = z0 + z3 */ ++ psub.h \w3, \tmp, \tmp2/* w3 = x3 = z0 - z3 */ ++ /* Scale down result. */ ++ pasr.h \w0, \w0, 6 ++ pasr.h \w1, \w1, 6 ++ pasr.h \w2, \w2, 6 ++ pasr.h \w3, \w3, 6 ++ .endm ++ ++/*void h264_idct_add_avr32(uint8_t *dst, DCTELEM *block, int stride)*/ ++ ++h264_idct_add_avr32: ++ ++ stm --sp,r0-r3,r4-r7, lr ++ ++ /* Setup rounding factor.
*/ ++ mov r0, (1 << 5) ++ lsl r0, 16 ++ ++ /* Load block */ ++ ldm r11,r2-r9 ++ /* r9 = { w00, w01 }, ++ r8 = { w02, w03 }, ++ r7 = { w10, w11 }, ++ r6 = { w12, w13 }, ++ r5 = { w20, w21 }, ++ r4 = { w22, w23 }, ++ r3 = { w30, w31 }, ++ r2 = { w32, w33 } */ ++ ++ ++ /* Add the rounding factor to w00. */ ++ add r9, r0 ++ ++ /* Transform rows */ ++ transform_row r9, r8, r0, r1 ++ transform_row r7, r6, r0, r1 ++ transform_row r5, r4, r0, r1 ++ transform_row r3, r2, r0, r1 ++ ++ /* Transform columns */ ++ transform_2columns r9, r7, r5, r3, r0, r1 ++ transform_2columns r8, r6, r4, r2, r0, r1 ++ ++ /* Load predicted pixels.*/ ++ ld.w lr, r12[0] ++ ld.w r11, r12[r10] ++ ++ /* Unpack to halfwords. */ ++ punpckub.h r0, lr:t ++ punpckub.h r1, lr:b ++ ++ /* Add with transformed row. */ ++ padd.h r0, r0, r9 ++ paddx.h r1, r1, r8 ++ /* Pack and saturate back to 8-bit pixels. */ ++ packsh.ub r0, r0, r1 ++ ++ /* Unpack to halfwords. */ ++ punpckub.h lr, r11:t ++ punpckub.h r11, r11:b ++ ++ /* Add with transformed row. */ ++ padd.h lr, lr, r7 ++ paddx.h r11, r11, r6 ++ /* Pack and saturate back to 8-bit pixels. */ ++ packsh.ub r1, lr, r11 ++ ++ /* Store back to frame. */ ++ st.w r12[0], r0 ++ st.w r12[r10], r1 ++ ++ add r12, r12, r10 << 1 ++ ++ /* Load predicted pixels.*/ ++ ld.w lr, r12[0] ++ ld.w r11, r12[r10] ++ ++ /* Unpack to halfwords. */ ++ punpckub.h r0, lr:t ++ punpckub.h r1, lr:b ++ ++ /* Add with transformed row. */ ++ padd.h r0, r0, r5 ++ paddx.h r1, r1, r4 ++ /* Pack and saturate back to 8-bit pixels. */ ++ packsh.ub r0, r0, r1 ++ ++ /* Unpack to halfwords. */ ++ punpckub.h lr, r11:t ++ punpckub.h r11, r11:b ++ ++ /* Add with transformed row. */ ++ padd.h lr, lr, r3 ++ paddx.h r11, r11, r2 ++ /* Pack and saturate back to 8-bit pixels. */ ++ packsh.ub r1, lr, r11 ++ ++ /* Store back to frame.
*/ ++ st.w r12[0], r0 ++ st.w r12[r10], r1 ++ ++ ldm sp++,r0-r3,r4-r7, pc ++ ++ ++ .global h264_idct8_add_avr32 ++//void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride){ ++ ++h264_idct8_add_avr32: ++ stm --sp,r0-r3,r4-r7, lr ++ ++ /* Push dst and stride on stack */ ++ stm --sp,r10,r12 ++ ++// int i; ++// DCTELEM (*src)[8] = (DCTELEM(*)[8])block; ++// uint8_t *cm = cropTbl + MAX_NEG_CROP; ++ ++// block[0] += 32; ++ ++ ++// for( i = 0; i < 8; i++ ) ++// { ++ mov lr, 4 ++0: ++ ld.w r7, r11[0*(8*2)] ++ ld.w r6, r11[1*(8*2)] ++ ld.w r5, r11[2*(8*2)] ++ ld.w r4, r11[3*(8*2)] ++ ld.w r3, r11[4*(8*2)] ++ ld.w r2, r11[5*(8*2)] ++ ld.w r1, r11[6*(8*2)] ++ ld.w r0, r11[7*(8*2)] ++ ++/* ++ ++ const int a0 = src[0][i] + src[4][i]; ++ const int a2 = src[0][i] - src[4][i]; ++ const int a4 = (src[2][i]>>1) - src[6][i]; ++ const int a6 = (src[6][i]>>1) + src[2][i]; ++*/ ++ padd.h r8, r7, r3 /* r8 = a0 */ ++ psub.h r7, r7, r3 /* r7 = a2 */ ++ pasr.h r3, r5, 1 /* r3 = src[2][i] >> 1 */ ++ pasr.h r9, r1, 1 /* r9 = src[6][i] >> 1 */ ++ psub.h r3, r3, r1 /* r3 = a4 */ ++ padd.h r9, r9, r5 /* r9 = a6 */ ++ ++/* ++ const int b0 = a0 + a6; ++ const int b2 = a2 + a4; ++ const int b4 = a2 - a4; ++ const int b6 = a0 - a6; ++*/ ++ padd.h r1, r8, r9 /* r1 = b0 */ ++ psub.h r8, r8, r9 /* r8 = b6 */ ++ padd.h r5, r7, r3 /* r5 = b2 */ ++ psub.h r7, r7, r3 /* r7 = b4 */ ++ ++/* ++ const int a1 = -src[3][i] + src[5][i] - src[7][i] - (src[7][i]>>1); ++ const int a3 = src[1][i] + src[7][i] - src[3][i] - (src[3][i]>>1); ++ const int a5 = -src[1][i] + src[7][i] + src[5][i] + (src[5][i]>>1); ++ const int a7 = src[3][i] + src[5][i] + src[1][i] + (src[1][i]>>1); ++*/ ++ pasr.h r3, r0, 1 ++ padd.h r3, r3, r0 ++ psub.h r3, r2, r3 ++ psub.h r3, r3, r4 /* r3 = a1 */ ++ ++ pasr.h r9, r4, 1 ++ padd.h r9, r9, r4 ++ psub.h r9, r0, r9 ++ padd.h r9, r6, r9 /* r9 = a3 */ ++ ++ pasr.h r10, r2, 1 ++ padd.h r10, r10, r2 ++ padd.h r10, r10, r0 ++ psub.h r10, r10, r6 /* r10 = a5 */ ++ ++ pasr.h r0, r6, 1 ++
padd.h r0, r0, r6 ++ padd.h r0, r0, r2 ++ padd.h r0, r0, r4 /* r0 = a7 */ ++/* ++ const int b1 = (a7>>2) + a1; ++ const int b3 = a3 + (a5>>2); ++ const int b5 = (a3>>2) - a5; ++ const int b7 = a7 - (a1>>2); ++*/ ++ pasr.h r2, r0, 2 ++ padd.h r2, r2, r3 /* r2 = b1 */ ++ pasr.h r3, r3, 2 ++ psub.h r3, r0, r3 /* r3 = b7 */ ++ ++ pasr.h r0, r10, 2 ++ padd.h r0, r0, r9 /* r0 = b3 */ ++ pasr.h r9, r9, 2 ++ psub.h r9, r9, r10 /* r9 = b5 */ ++ ++ ++/* ++ src[0][i] = b0 + b7; ++ src[7][i] = b0 - b7; ++ src[1][i] = b2 + b5; ++ src[6][i] = b2 - b5; ++ src[2][i] = b4 + b3; ++ src[5][i] = b4 - b3; ++ src[3][i] = b6 + b1; ++ src[4][i] = b6 - b1; */ ++ ++ padd.h r4, r1, r3 ++ psub.h r1, r1, r3 ++ st.w r11[0*(8*2)], r4 ++ st.w r11[7*(8*2)], r1 ++ ++ padd.h r3, r5, r9 ++ psub.h r5, r5, r9 ++ st.w r11[1*(8*2)], r3 ++ st.w r11[6*(8*2)], r5 ++ ++ padd.h r9, r7, r0 ++ psub.h r7, r7, r0 ++ st.w r11[2*(8*2)], r9 ++ st.w r11[5*(8*2)], r7 ++ ++ padd.h r0, r8, r2 ++ psub.h r8, r8, r2 ++ st.w r11[3*(8*2)], r0 ++ st.w r11[4*(8*2)], r8 ++ ++ sub r11, -4 ++ sub lr, 1 ++ brne 0b ++ ++// } ++ ++ lddsp r12, sp[0] /* r12 = dst */ ++ sub r11, 4*4 ++ ldm r11++, r4-r7 ++ mov lr, 8 ++ /* dst and stride are still on the stack (pushed at entry) */ ++ ++1: ++// for( i = 0; i < 8; i++ ) ++// { ++ ++ /* r7 = {src[i][0], src[i][1]} ++ r6 = {src[i][2], src[i][3]} ++ r5 = {src[i][4], src[i][5]} ++ r4 = {src[i][6], src[i][7]} */ ++ ++/* ++ const int a0 = src[i][0] + src[i][4]; ++ const int a2 = src[i][0] - src[i][4]; ++ const int a4 = (src[i][2]>>1) - src[i][6]; ++ const int a6 = (src[i][6]>>1) + src[i][2]; ++*/ ++ pasr.h r8, r6, 1 ++ pasr.h r9, r4, 1 ++ addhh.w r0, r7:t, r5:t /* r0 = a0 */ ++ subhh.w r1, r7:t, r5:t /* r1 = a2 */ ++ subhh.w r2, r8:t, r4:t /* r2 = a4 */ ++ addhh.w r3, r9:t, r6:t /* r3 = a6 */ ++ ++/* ++ const int b0 = a0 + a6; ++ const int b2 = a2 + a4; ++ const int b4 = a2 - a4; ++ const int b6 = a0 - a6; ++*/ ++ add r10, r0, r3 /* r10 = b0 */ ++ sub r0, r3 /* r0 = b6 */ ++ add r3, r1, r2 /* r3 = b2 */ ++ sub r1, r2 /*
r1 = b4 */ ++/* ++ ++ ++ const int a7 = src[i][5] + src[i][3] + src[i][1] + (src[i][1]>>1); ++ const int a1 = src[i][5] - src[i][3] - src[i][7] - (src[i][7]>>1); ++ const int a3 = src[i][7] + src[i][1] - src[i][3] - (src[i][3]>>1); ++ const int a5 = src[i][7] - src[i][1] + src[i][5] + (src[i][5]>>1); */ ++ addhh.w r8, r8:b, r6:b ++ addhh.w r2, r4:b, r7:b ++ sub r2, r8 /* r2 = a3 */ ++ ++ addhh.w r9, r9:b, r4:b ++ subhh.w r8, r5:b, r6:b ++ sub r8, r9 /* r8 = a1 */ ++ ++ pasr.h r9, r7, 1 ++ addhh.w r9, r9:b, r7:b ++ addhh.w r6, r5:b, r6:b ++ add r6, r9 /* r6 = a7 */ ++ ++ pasr.h r9, r5, 1 ++ addhh.w r9, r9:b, r5:b ++ subhh.w r5, r4:b, r7:b ++ add r5, r9 /* r5 = a5 */ ++ ++/* const int b1 = (a7>>2) + a1; ++ const int b3 = (a5>>2) + a3; ++ const int b5 = (a3>>2) - a5; ++ const int b7 = -(a1>>2) + a7 ; */ ++ asr r4, r6, 2 ++ add r4, r8 /* r4 = b1 */ ++ asr r8, 2 ++ rsub r8, r6 /* r8 = b7 */ ++ ++ asr r6, r5, 2 ++ add r6, r2 /* r6 = b3 */ ++ asr r2, 2 ++ sub r2, r5 /* r2 = b5 */ ++ ++/* ++ dst[i*stride + 0] = cm[ dst[i*stride + 0] + ((b0 + b7) >> 6) ]; ++ dst[i*stride + 1] = cm[ dst[i*stride + 1] + ((b2 + b5) >> 6) ]; ++ dst[i*stride + 2] = cm[ dst[i*stride + 2] + ((b4 + b3) >> 6) ]; ++ dst[i*stride + 3] = cm[ dst[i*stride + 3] + ((b6 + b1) >> 6) ]; ++ dst[i*stride + 4] = cm[ dst[i*stride + 4] + ((b6 - b1) >> 6) ]; ++ dst[i*stride + 5] = cm[ dst[i*stride + 5] + ((b4 - b3) >> 6) ]; ++ dst[i*stride + 6] = cm[ dst[i*stride + 6] + ((b2 - b5) >> 6) ]; ++ dst[i*stride + 7] = cm[ dst[i*stride + 7] + ((b0 - b7) >> 6) ]; ++*/ ++ add r5, r10, r8 ++ satrnds r5 >> 6, 0 /* r5 = (b0 + b7) >> 6 */ ++ sub r10, r8 ++ satrnds r10 >> 6, 0 /* r10 = (b0 - b7) >> 6 */ ++ add r8, r3, r2 ++ satrnds r8 >> 6, 0 /* r8 = (b2 + b5) >> 6 */ ++ sub r3, r2 ++ satrnds r3 >> 6, 0 /* r3 = (b2 - b5) >> 6 */ ++ ++ add r2, r1, r6 ++ satrnds r2 >> 6, 0 /* r2 = (b4 + b3) >> 6 */ ++ sub r1, r6 ++ satrnds r1 >> 6, 0 /* r1 = (b4 - b3) >> 6 */ ++ ++ add r6, r0, r4 ++ satrnds r6 >> 6, 0 /* r6 = (b6 + b1) >> 6 */ ++
sub r0, r4 ++ satrnds r0 >> 6, 0 /* r0 = (b6 - b1) >> 6 */ ++ ++ ld.w r4, r12[0] ++ ++ packw.sh r8, r5, r8 ++ packw.sh r7, r2, r6 ++ ld.w r9, r12[4] ++ packw.sh r6, r0, r1 ++ packw.sh r5, r3, r10 ++ ++ punpckub.h r10, r4:t ++ punpckub.h r4, r4:b ++ punpckub.h r3, r9:t ++ punpckub.h r9, r9:b ++ ++ padd.h r8, r8, r10 ++ padd.h r7, r7, r4 ++ padd.h r6, r6, r3 ++ padd.h r5, r5, r9 ++ ++ lddsp r10, sp[4] /* r10 = stride */ ++ packsh.ub r0, r8, r7 ++ packsh.ub r1, r6, r5 ++ ++ st.w r12[0], r0 ++ st.w r12[4], r1 ++ ++ ldm r11++, r4-r7 /* preload next row; NOTE(review): also runs after the last row, reading 16 bytes past the 8x8 block - confirm this over-read is harmless */ ++ add r12, r10 /* dst += stride */ ++ ++ sub lr, 1 ++ brne 1b ++ ++ sub sp, -8 ++ ldm sp++,r0-r3,r4-r7, pc ++ ++ ++ ++// } ++//} +diff --git a/libavcodec/avr32/idct.S b/libavcodec/avr32/idct.S +new file mode 100644 +index 0000000..e7551ec +--- /dev/null ++++ b/libavcodec/avr32/idct.S +@@ -0,0 +1,829 @@ ++/* ++ * Copyright (c) 2007 Atmel Corporation. All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * 2. Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials provided ++ * with the distribution. ++ * ++ * 3. The name of ATMEL may not be used to endorse or promote products ++ * derived from this software without specific prior written ++ * permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR ++ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED.
IN NO EVENT SHALL ATMEL ++ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, ++ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY ++ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH ++ * DAMAGE. ++ */ ++ ++ .global idct_add_avr32 ++ .global idct_put_avr32 ++ .global idct_avr32 ++ ++ ++#define CONST_BITS 13 ++#define PASS1_BITS 2 ++ ++#define ONE ((INT32) 1) ++ ++#define CONST_SCALE (ONE << CONST_BITS) ++ ++#define LINE_SIZE 32 ++ ++#define FIX_0_298631336 (2446) /* FIX(0.298631336) */ ++#define FIX_0_390180644 (3196) /* FIX(0.390180644) */ ++#define FIX_0_541196100 (4433) /* FIX(0.541196100) */ ++#define FIX_0_765366865 (6270) /* FIX(0.765366865) */ ++#define FIX_0_899976223 (7373) /* FIX(0.899976223) */ ++#define FIX_1_175875602 (9633) /* FIX(1.175875602) */ ++#define FIX_1_501321110 (12299)/* FIX(1.501321110) */ ++#define FIX_1_847759065 (15137)/* FIX(1.847759065) */ ++#define FIX_1_961570560 (16069)/* FIX(1.961570560) */ ++#define FIX_2_053119869 (16819)/* FIX(2.053119869) */ ++#define FIX_2_562915447 (20995)/* FIX(2.562915447) */ ++#define FIX_3_072711026 (25172)/* FIX(3.072711026) */ ++ ++ ++#define loop_cnt r11 ++ ++ .text ++ ++idct_add_avr32: ++ pushm r0-r3, r4-r7, lr //Free up registers to use for local variables ++ ++ // Give room for some variables on the stack ++ sub sp, 8 ++ stdsp SP[0], r12 // rfp ++ stdsp SP[4], r11 // iinc ++ ++ mov loop_cnt, 8 //Initialize loop counter ++ ++FOR_ROW: ++ ++ ldm r10, r0, r1, r2, r3 //Load 8 DCT-coeffisients from the current row in the DCT-block ++ mov r6, 0 ++#ifdef USE_PREFETCH ++ pref r10[LINE_SIZE] //Prefetch next line ++#endif ++ or r4, r2, r3 << 16 ++ or r4, r1 //Check if all 
DCT-coeffisients except the DC is zero ++ or r4, r0 ++ brne AC_ROW //If there are non-zero AC coeffisients perform row-transform ++ ++ paddsub.h r5, r3:t, r6:b //Extract the DC-coeff from r5 ++ plsl.h r5, r5, PASS1_BITS ++ mov r4, r5 ++ st.d r10++, r4 ++ st.d r10++, r4 ++ ++ sub loop_cnt, 1 //Decrement loop counter ++ brne FOR_ROW //Perform loop one more time if loop_cnt is not zero ++ ++ bral COLOUMN_TRANSFORM //Perform coloumn transform after row transform is computed ++ ++ ++AC_ROW: ++ ++ ++ ld.w r12, pc[coef_table - .] ++ ld.w r9, pc[coef_table - . + 4] ++ ++ padd.h r4, r2, r0 // r4:t = dataptr[2] + dataptr[6],r4:b = dataptr[3] + dataptr[7] ++ mulhh.w r5, r4:t, r12:t ++ mulhh.w r6, r0:t, r12:b ++ ld.w r12, pc[coef_table - . + 8] ++ mulhh.w r7, r2:t, r9:t ++ add r6, r5 // tmp2 ++ satrnds r6 >> (CONST_BITS - PASS1_BITS), 31 ++ add r7, r5 // tmp3 ++ satrnds r7 >> (CONST_BITS - PASS1_BITS), 31 ++ ++ paddsub.h r5, r3:t, r1:t ++ plsl.h r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1 ++ ++ paddsub.h r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13 ++ paddsub.h r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12 ++ ++ ++ addhh.w lr, r3:b, r1:b // lr = z4 ++ addhh.w r5, r4:b, lr:b ++ mulhh.w r5, r5:b, r9:b // r5 = z5 ++ ++ ld.w r9, pc[coef_table - . + 12] ++ mulhh.w r4, r4:b, r12:t // r4 = z3 ++ mulhh.w lr, lr:b, r12:b // lr = z4 ++ ++ add r4, r5 ++ add lr, r5 ++ ++ addhh.w r5, r2:b, r1:b // r5 = z2 ++ addhh.w r8, r3:b, r0:b // r8 = z1 ++ ++ ++ mulhh.w r0, r0:b, r9:t // r0 = tmp0 ++ ld.w r12, pc[coef_table - . + 16] ++ mulhh.w r1, r1:b, r9:b // r1 = tmp1 ++ ld.w r9, pc[coef_table - . 
+ 20] ++ mulhh.w r2, r2:b, r12:t // r2 = tmp2 ++ mulhh.w r3, r3:b, r12:b // r3 = tmp3 ++ mulhh.w r8, r8:b, r9:t // r8 = z1 ++ mulhh.w r5, r5:b, r9:b // r5 = z2 ++ ++ ++ add r0, r8 ++ add r0, r4 ++ add r1, r5 ++ add r1, lr ++ add r2, r5 ++ add r2, r4 ++ add r3, r8 ++ add r3, lr ++ ++ satrnds r0 >> (CONST_BITS - PASS1_BITS), 31 ++ satrnds r1 >> (CONST_BITS - PASS1_BITS), 31 ++ satrnds r2 >> (CONST_BITS - PASS1_BITS), 31 ++ satrnds r3 >> (CONST_BITS - PASS1_BITS), 31 ++ ++ paddsub.h r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6] ++ paddsub.h r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7] ++ paddsub.h r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5] ++ paddsub.h r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4] ++ ++ sthh.w r10[0], r4:t, r5:t ++ sthh.w r10[4], r3:t, r2:t ++ sthh.w r10[8], r2:b, r3:b ++ sthh.w r10[12], r5:b, r4:b ++ ++ ++ ++ sub r10, -16 ++ sub loop_cnt, 1 ++ brne FOR_ROW, e ++ ++COLOUMN_TRANSFORM: ++ ++ sub r10, 128 //Set pointer to start of DCT block ++ ++ ++ mov loop_cnt, 8 ++FOR_COLOUMN: ++ ldins.h r3:t,r10[0] // r3:t = dataptr[0] ++ ldins.h r1:t,r10[1*8*2]// r1:t = dataptr[1] ++ ldins.h r2:t,r10[2*8*2]// r2:t = dataptr[2] ++ ldins.h r0:t,r10[5*8*2]// r0:t = dataptr[5] ++ ldins.h r3:b,r10[4*8*2]// r3:b = dataptr[4] ++ ldins.h r1:b,r10[3*8*2]// r1:b = dataptr[3] ++ ldins.h r2:b,r10[6*8*2]// r2:b = dataptr[6] ++ ldins.h r0:b,r10[7*8*2]// r0:b = dataptr[7] ++ ++ or r4, r1, r3 << 16 ++ or r4, r2 ++ or r4, r0 ++ brne AC_COLOUMN //If there are non-zero AC coeffisients perform row-transform ++ ++ lddsp r12, SP[0] // rfp ++ lddsp r9, SP[4] // iinc ++ satrnds r3 >> ( PASS1_BITS + 3 + 16 ), 9 ++ ld.d r0, r12[0] ++ sub r10, -2 // Increment the dataptr ++ bfins r3, r3, 16, 16 ++ punpckub.h r2, r1:t ++ padd.h r2, r2, r3 ++ punpckub.h r1, r1:b ++ padd.h r1, r1, r3 ++ packsh.ub r1, r2, r1 ++ punpckub.h r2, r0:t ++ padd.h r2, r2, r3 ++ punpckub.h r0, r0:b ++ padd.h r0, r0, r3 ++ packsh.ub r0, r2, r0 ++ st.d r12[0], r0 ++ add r12, r9 // 
increment rfp ++ stdsp SP[0], r12 ++ ++ sub loop_cnt, 1//Decrement loop counter ++ brne FOR_COLOUMN//Perform loop one more time if loop_cnt is not zero ++ ++ sub sp, -8 ++ popm r0-r3, r4-r7, pc//Pop back registers and PC ++ ++AC_COLOUMN: ++ ++ ld.w r12, pc[coef_table - .] ++ ld.w r9, pc[coef_table - . + 4] ++ ++ addhh.w r4, r2:t, r2:b ++ mulhh.w r4, r4:b, r12:t // r4 = z1 ++ mulhh.w r5, r2:b, r12:b ++ ld.w r12, pc[coef_table - . + 8] ++ mulhh.w r6, r2:t, r9:t ++ add r5, r4 // r5 = tmp2 ++ add r6, r4 // r6 = tmp3 ++ ++ addhh.w r7, r3:t, r3:b ++ subhh.w r8, r3:t, r3:b ++ ++ lsl r7, CONST_BITS ++ lsl r8, CONST_BITS ++ ++ add r2, r7, r6 // r2 = tmp10 ++ sub r3, r7, r6 // r3 = tmp13 ++ add r4, r8, r5 // r4 = tmp11 ++ sub r5, r8, r5 // r5 = tmp12 ++ ++ padd.h r6, r0, r1 // r6:t = z4, r6:b = z3 ++ addhh.w r7, r6:t, r6:b ++ mulhh.w r7, r7:b, r9:b // r7 = z5 ++ ++ ld.w r9, pc[coef_table - . + 12] ++ mulhh.w r8, r6:b, r12:t // r8 = z3 ++ mulhh.w r6, r6:t, r12:b // r6 = z4 ++ ++ add r8, r7 ++ add r6, r7 ++ ++ paddx.h r7, r0, r1 // r7:t = z2, r7:b = z1 ++ ++ mulhh.w r12, r0:b, r9:t // r12 = tmp0 ++ mulhh.w r0, r0:t, r9:b // r0 = tmp1 ++ ld.w r9, pc[coef_table - . + 16] ++ add r12, r8 ++ add r0, r6 ++ ++ ld.w lr, pc[coef_table - . 
+ 20] ++ machh.w r8, r1:b, r9:t // r8 = tmp2 ++ machh.w r6, r1:t, r9:b // r6 = tmp3 ++ mulhh.w r9, r7:b, lr:t // r9 = z1 ++ mulhh.w r7, r7:t, lr:b // r7 = z2 ++ ++ ++ add r12, r9 ++ add r0, r7 ++ add r8, r7 ++ add r6, r9 ++ ++ add r1, r2, r6 // r1 = dataptr[DCTSIZE*0] ++ sub r2, r2, r6 // r2 = dataptr[DCTSIZE*7] ++ add r6, r4, r8 // r6 = dataptr[DCTSIZE*1] ++ sub r4, r4, r8 // r4 = dataptr[DCTSIZE*6] ++ add r8, r5, r0 // r8 = dataptr[DCTSIZE*2] ++ sub r5, r5, r0 // r5 = dataptr[DCTSIZE*5] ++ add r0, r3, r12 // r0 = dataptr[DCTSIZE*3] ++ sub r3, r3, r12 // r3 = dataptr[DCTSIZE*4] ++ ++ satrnds r1 >> (CONST_BITS+PASS1_BITS+3), 9 ++ satrnds r2 >> (CONST_BITS+PASS1_BITS+3), 9 ++ satrnds r6 >> (CONST_BITS+PASS1_BITS+3), 9 ++ satrnds r4 >> (CONST_BITS+PASS1_BITS+3), 9 ++ satrnds r8 >> (CONST_BITS+PASS1_BITS+3), 9 ++ satrnds r5 >> (CONST_BITS+PASS1_BITS+3), 9 ++ satrnds r0 >> (CONST_BITS+PASS1_BITS+3), 9 ++ satrnds r3 >> (CONST_BITS+PASS1_BITS+3), 9 ++ ++ packw.sh r1, r1, r6 ++ packw.sh r8, r8, r0 ++ packw.sh r3, r3, r5 ++ packw.sh r4, r4, r2 ++ ++ lddsp r12, SP[0] // rfp ++ lddsp r9, SP[4] // iinc ++ ld.d r6, r12[0] ++ sub r10, -2 // Increment the dataptr ++ punpckub.h r0, r7:t ++ padd.h r1, r1, r0 ++ punpckub.h r0, r7:b ++ padd.h r8, r8, r0 ++ packsh.ub r7, r1, r8 ++ punpckub.h r0, r6:t ++ padd.h r3, r3, r0 ++ punpckub.h r0, r6:b ++ padd.h r4, r4, r0 ++ packsh.ub r6, r3, r4 ++ st.d r12[0], r6 ++ add r12, r9 // increment rfp ++ stdsp SP[0], r12 ++ ++ sub loop_cnt, 1 //Decrement loop counter ++ brne FOR_COLOUMN //Perform loop one more time if loop_cnt is not zero ++ ++ sub sp, -8 ++ popm r0-r3, r4-r7, pc //Pop back registers and PC ++ ++ ++ ++//Coeffisient Table: ++ .align 2 ++coef_table: ++ .short FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602 ++ .short - FIX_1_961570560, - FIX_0_390180644, FIX_0_298631336, FIX_2_053119869 ++ .short FIX_3_072711026, FIX_1_501321110, - FIX_0_899976223, - FIX_2_562915447 ++ ++ ++idct_put_avr32: ++ pushm r0-r3, r4-r7, 
lr //Free up registers to use for local variables ++ ++ //; Give room for some variables on the stack ++ sub sp, 8 ++ stdsp SP[0], r12 // rfp ++ stdsp SP[4], r11 // iinc ++ ++ mov loop_cnt, 8 //Initialize loop counter ++ ++0: ++ ++ ldm r10, r0, r1, r2, r3 //Load 8 DCT-coeffisients from the current row in the DCT-block ++ mov r6, 0 ++#ifdef USE_PREFETCH ++ pref r10[LINE_SIZE] //Prefetch next line ++#endif ++ or r4, r2, r3 << 16 ++ or r4, r1 //Check if all DCT-coeffisients except the DC is zero ++ or r4, r0 ++ brne 1f //If there are non-zero AC coeffisients perform row-transform ++ ++ paddsub.h r5, r3:t, r6:b //Extract the DC-coeff from r5 ++ plsl.h r5, r5, PASS1_BITS ++ mov r4, r5 ++ st.d r10++, r4 ++ st.d r10++, r4 ++ ++ sub loop_cnt, 1 //Decrement loop counter ++ brne 0b //Perform loop one more time if loop_cnt is not zero ++ ++ bral 2f //Perform coloumn transform after row transform is computed ++ ++1: ++ ++ ld.w r12, pc[coef_table_copy - .] ++ ld.w r9, pc[coef_table_copy - . + 4] ++ ++ padd.h r4, r2, r0 // r4:t = dataptr[2] + dataptr[6],r4:b = dataptr[3] + dataptr[7] ++ mulhh.w r5, r4:t, r12:t ++ mulhh.w r6, r0:t, r12:b ++ ld.w r12, pc[coef_table_copy - . + 8] ++ mulhh.w r7, r2:t, r9:t ++ add r6, r5 // tmp2 ++ satrnds r6 >> (CONST_BITS - PASS1_BITS), 31 ++ add r7, r5 // tmp3 ++ satrnds r7 >> (CONST_BITS - PASS1_BITS), 31 ++ ++ paddsub.h r5, r3:t, r1:t ++ plsl.h r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1 ++ ++ paddsub.h r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13 ++ paddsub.h r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12 ++ ++ ++ ++ addhh.w lr, r3:b, r1:b // lr = z4 ++ addhh.w r5, r4:b, lr:b ++ mulhh.w r5, r5:b, r9:b // r5 = z5 ++ ++ ld.w r9, pc[coef_table_copy - . + 12] ++ mulhh.w r4, r4:b, r12:t // r4 = z3 ++ mulhh.w lr, lr:b, r12:b // lr = z4 ++ ++ add r4, r5 ++ add lr, r5 ++ ++ addhh.w r5, r2:b, r1:b // r5 = z2 ++ addhh.w r8, r3:b, r0:b // r8 = z1 ++ ++ ++ mulhh.w r0, r0:b, r9:t // r0 = tmp0 ++ ld.w r12, pc[coef_table_copy - . 
+ 16] ++ mulhh.w r1, r1:b, r9:b // r1 = tmp1 ++ ld.w r9, pc[coef_table_copy - . + 20] ++ mulhh.w r2, r2:b, r12:t // r2 = tmp2 ++ mulhh.w r3, r3:b, r12:b // r3 = tmp3 ++ mulhh.w r8, r8:b, r9:t // r8 = z1 ++ mulhh.w r5, r5:b, r9:b // r5 = z2 ++ ++ ++ add r0, r8 ++ add r0, r4 ++ add r1, r5 ++ add r1, lr ++ add r2, r5 ++ add r2, r4 ++ add r3, r8 ++ add r3, lr ++ ++ satrnds r0 >> (CONST_BITS - PASS1_BITS), 31 ++ satrnds r1 >> (CONST_BITS - PASS1_BITS), 31 ++ satrnds r2 >> (CONST_BITS - PASS1_BITS), 31 ++ satrnds r3 >> (CONST_BITS - PASS1_BITS), 31 ++ ++ paddsub.h r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6] ++ paddsub.h r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7] ++ paddsub.h r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5] ++ paddsub.h r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4] ++ ++ sthh.w r10[0], r4:t, r5:t ++ sthh.w r10[4], r3:t, r2:t ++ sthh.w r10[8], r2:b, r3:b ++ sthh.w r10[12], r5:b, r4:b ++ ++ ++ ++ sub r10, -16 ++ sub loop_cnt, 1 ++ brne 0b ++ ++2: ++ ++ sub r10, 128 //Set pointer to start of DCT block ++ ++ mov loop_cnt, 8 ++ ++0: ++ ldins.h r3:t,r10[0] // r3:t = dataptr[0] ++ ldins.h r1:t,r10[1*8*2]// r1:t = dataptr[1] ++ ldins.h r2:t,r10[2*8*2]// r2:t = dataptr[2] ++ ldins.h r0:t,r10[5*8*2]// r0:t = dataptr[5] ++ ldins.h r3:b,r10[4*8*2]// r3:b = dataptr[4] ++ ldins.h r1:b,r10[3*8*2]// r1:b = dataptr[3] ++ ldins.h r2:b,r10[6*8*2]// r2:b = dataptr[6] ++ ldins.h r0:b,r10[7*8*2]// r0:b = dataptr[7] ++ ++ or r4, r1, r3 << 16 ++ or r4, r2 ++ or r4, r0 ++ brne 1f //If there are non-zero AC coeffisients perform row-transform ++ ++ lddsp r12, SP[0] // rfp ++ lddsp r9, SP[4] // iinc ++ satrnds r3 >> ( PASS1_BITS + 3 + 16 ), 31 ++ packw.sh r3, r3, r3 ++ packsh.ub r3, r3, r3 ++ mov r2, r3 ++ st.d r12[0], r2 ++ add r12, r9 // increment rfp ++ sub r10, -2 // Increment the dataptr ++ stdsp SP[0], r12 ++ ++ sub loop_cnt, 1//Decrement loop counter ++ brne 0b //Perform loop one more time if loop_cnt is not zero ++ ++ sub sp, -8 ++ popm 
r0-r3, r4-r7, pc//Pop back registers and PC ++ ++1: ++ ++ ld.w r12, pc[coef_table_copy - .] ++ ld.w r9, pc[coef_table_copy - . + 4] ++ ++ addhh.w r4, r2:t, r2:b ++ mulhh.w r4, r4:b, r12:t // r4 = z1 ++ mulhh.w r5, r2:b, r12:b ++ ld.w r12, pc[coef_table_copy - . + 8] ++ mulhh.w r6, r2:t, r9:t ++ add r5, r4 // r5 = tmp2 ++ add r6, r4 // r6 = tmp3 ++ ++ addhh.w r7, r3:t, r3:b ++ subhh.w r8, r3:t, r3:b ++ ++ lsl r7, CONST_BITS ++ lsl r8, CONST_BITS ++ ++ add r2, r7, r6 // r2 = tmp10 ++ sub r3, r7, r6 // r3 = tmp13 ++ add r4, r8, r5 // r4 = tmp11 ++ sub r5, r8, r5 // r5 = tmp12 ++ ++ ++ padd.h r6, r0, r1 // r6:t = z4, r6:b = z3 ++ addhh.w r7, r6:t, r6:b ++ mulhh.w r7, r7:b, r9:b // r7 = z5 ++ ++ ld.w r9, pc[coef_table_copy - . + 12] ++ mulhh.w r8, r6:b, r12:t // r8 = z3 ++ mulhh.w r6, r6:t, r12:b // r6 = z4 ++ ++ add r8, r7 ++ add r6, r7 ++ ++ paddx.h r7, r0, r1 // r7:t = z2, r7:b = z1 ++ ++ mulhh.w r12, r0:b, r9:t // r12 = tmp0 ++ mulhh.w r0, r0:t, r9:b // r0 = tmp1 ++ ld.w r9, pc[coef_table_copy - . + 16] ++ add r12, r8 ++ add r0, r6 ++ ++ ld.w lr, pc[coef_table_copy - . 
+ 20] ++ machh.w r8, r1:b, r9:t // r8 = tmp2 ++ machh.w r6, r1:t, r9:b // r6 = tmp3 ++ mulhh.w r9, r7:b, lr:t // r9 = z1 ++ mulhh.w r7, r7:t, lr:b // r7 = z2 ++ ++ ++ add r12, r9 ++ add r0, r7 ++ add r8, r7 ++ add r6, r9 ++ ++ add r1, r2, r6 // r1 = dataptr[DCTSIZE*0] ++ sub r2, r2, r6 // r2 = dataptr[DCTSIZE*7] ++ add r6, r4, r8 // r6 = dataptr[DCTSIZE*1] ++ sub r4, r4, r8 // r4 = dataptr[DCTSIZE*6] ++ add r8, r5, r0 // r8 = dataptr[DCTSIZE*2] ++ sub r5, r5, r0 // r5 = dataptr[DCTSIZE*5] ++ add r0, r3, r12 // r0 = dataptr[DCTSIZE*3] ++ sub r3, r3, r12 // r3 = dataptr[DCTSIZE*4] ++ ++ satrnds r1 >> (CONST_BITS+PASS1_BITS+3), 9 ++ satrnds r2 >> (CONST_BITS+PASS1_BITS+3), 9 ++ satrnds r6 >> (CONST_BITS+PASS1_BITS+3), 9 ++ satrnds r4 >> (CONST_BITS+PASS1_BITS+3), 9 ++ satrnds r8 >> (CONST_BITS+PASS1_BITS+3), 9 ++ satrnds r5 >> (CONST_BITS+PASS1_BITS+3), 9 ++ satrnds r0 >> (CONST_BITS+PASS1_BITS+3), 9 ++ satrnds r3 >> (CONST_BITS+PASS1_BITS+3), 9 ++ ++ packw.sh r1, r1, r6 ++ packw.sh r8, r8, r0 ++ packw.sh r3, r3, r5 ++ packw.sh r4, r4, r2 ++ ++ packsh.ub r1, r1, r8 ++ packsh.ub r0, r3, r4 ++ lddsp r12, SP[0] // rfp ++ lddsp r9, SP[4] // iinc ++ st.d r12[0], r0 ++ sub r10, -2 // Increment the dataptr ++ add r12, r9 // increment rfp ++ stdsp SP[0], r12 ++ ++ sub loop_cnt, 1 //Decrement loop counter ++ brne 0b //Perform loop one more time if loop_cnt is not zero ++ ++ sub sp, -8 ++ popm r0-r3, r4-r7, pc //Pop back registers and PC ++ ++ ++ ++ .align 2 ++coef_table_copy: ++ .short FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602 ++ .short - FIX_1_961570560, - FIX_0_390180644, FIX_0_298631336, FIX_2_053119869 ++ .short FIX_3_072711026, FIX_1_501321110, - FIX_0_899976223, - FIX_2_562915447 ++ ++ ++idct_avr32: ++ pushm r0-r3, r4-r7, lr //Free up registers to use for local variables ++ ++ // In-place IDCT of the block at r12: the row pass writes a temporary 8*8*2-byte block on the stack, the column pass writes the result back through r12 ++ sub sp, 8*8*2 ++ ++ mov loop_cnt, 8 //Initialize loop counter ++ ++0: ++ ++ ldm r12++, r0, r1, r2, r3 //Load 8 
DCT-coefficients from the current row in the DCT-block ++ mov r6, 0 ++#ifdef USE_PREFETCH ++ pref r12[LINE_SIZE] //Prefetch next line ++#endif ++ or r4, r2, r3 << 16 ++ or r4, r1 //Check if all DCT-coefficients except the DC is zero ++ or r4, r0 ++ brne 1f //If there are non-zero AC coefficients perform row-transform ++ ++ paddsub.h r5, r3:t, r6:b //Extract the DC-coeff from r3 into both halves of r5 ++ plsl.h r5, r5, PASS1_BITS ++ mov r4, r5 ++ st.d sp++, r4 ++ st.d sp++, r4 ++ ++ sub loop_cnt, 1 //Decrement loop counter ++ brne 0b //Perform loop one more time if loop_cnt is not zero ++ ++ bral 2f //Perform column transform after row transform is computed ++ ++1: ++ ++ ld.w r10, pc[coef_table_idct - .] ++ ld.w r9, pc[coef_table_idct - . + 4] ++ ++ padd.h r4, r2, r0 // r4:t = dataptr[2] + dataptr[6],r4:b = dataptr[3] + dataptr[7] ++ mulhh.w r5, r4:t, r10:t ++ mulhh.w r6, r0:t, r10:b ++ ld.w r10, pc[coef_table_idct - . + 8] ++ mulhh.w r7, r2:t, r9:t ++ add r6, r5 // tmp2 ++ satrnds r6 >> (CONST_BITS - PASS1_BITS), 31 ++ add r7, r5 // tmp3 ++ satrnds r7 >> (CONST_BITS - PASS1_BITS), 31 ++ ++ paddsub.h r5, r3:t, r1:t ++ plsl.h r5, r5, PASS1_BITS // r5:t = tmp0, r5:b = tmp1 ++ ++ paddsub.h r7, r5:t, r7:b // r7:t = tmp10, r7:b = tmp13 ++ paddsub.h r6, r5:b, r6:b // r6:t = tmp11, r6:b = tmp12 ++ ++ ++ ++ addhh.w lr, r3:b, r1:b // lr = z4 ++ addhh.w r5, r4:b, lr:b ++ mulhh.w r5, r5:b, r9:b // r5 = z5 ++ ++ ld.w r9, pc[coef_table_idct - . + 12] ++ mulhh.w r4, r4:b, r10:t // r4 = z3 ++ mulhh.w lr, lr:b, r10:b // lr = z4 ++ ++ add r4, r5 ++ add lr, r5 ++ ++ addhh.w r5, r2:b, r1:b // r5 = z2 ++ addhh.w r8, r3:b, r0:b // r8 = z1 ++ ++ ++ mulhh.w r0, r0:b, r9:t // r0 = tmp0 ++ ld.w r10, pc[coef_table_idct - . + 16] ++ mulhh.w r1, r1:b, r9:b // r1 = tmp1 ++ ld.w r9, pc[coef_table_idct - . 
+ 20] ++ mulhh.w r2, r2:b, r10:t // r2 = tmp2 ++ mulhh.w r3, r3:b, r10:b // r3 = tmp3 ++ mulhh.w r8, r8:b, r9:t // r8 = z1 ++ mulhh.w r5, r5:b, r9:b // r5 = z2 ++ ++ ++ add r0, r8 ++ add r0, r4 ++ add r1, r5 ++ add r1, lr ++ add r2, r5 ++ add r2, r4 ++ add r3, r8 ++ add r3, lr ++ ++ satrnds r0 >> (CONST_BITS - PASS1_BITS), 31 ++ satrnds r1 >> (CONST_BITS - PASS1_BITS), 31 ++ satrnds r2 >> (CONST_BITS - PASS1_BITS), 31 ++ satrnds r3 >> (CONST_BITS - PASS1_BITS), 31 ++ ++ paddsub.h r5, r6:t, r2:b // r5:t = dataptr[1], r5:b = dataptr[6] ++ paddsub.h r4, r7:t, r3:b // r4:t = dataptr[0], r4:b = dataptr[7] ++ paddsub.h r3, r6:b, r1:b // r3:t = dataptr[2], r3:b = dataptr[5] ++ paddsub.h r2, r7:b, r0:b // r2:t = dataptr[3], r2:b = dataptr[4] ++ ++ sthh.w sp[0], r4:t, r5:t ++ sthh.w sp[4], r3:t, r2:t ++ sthh.w sp[8], r2:b, r3:b ++ sthh.w sp[12], r5:b, r4:b ++ ++ ++ ++ sub sp, -16 ++ sub loop_cnt, 1 ++ brne 0b ++ ++2: ++ ++ sub sp, 8*8*2 //Set pointer to start of DCT block ++ sub r12, 8*8*2 //Set pointer to start of DCT block ++ ++ mov loop_cnt, 8 ++ ++0: ++ ldins.h r3:t,sp[0] // r3:t = dataptr[0] ++ ldins.h r1:t,sp[1*8*2]// r1:t = dataptr[1] ++ ldins.h r2:t,sp[2*8*2]// r2:t = dataptr[2] ++ ldins.h r0:t,sp[5*8*2]// r0:t = dataptr[5] ++ ldins.h r3:b,sp[4*8*2]// r3:b = dataptr[4] ++ ldins.h r1:b,sp[3*8*2]// r1:b = dataptr[3] ++ ldins.h r2:b,sp[6*8*2]// r2:b = dataptr[6] ++ ldins.h r0:b,sp[7*8*2]// r0:b = dataptr[7] ++ ++ or r4, r1, r3 << 16 ++ or r4, r2 ++ or r4, r0 ++ brne 1f //If there are non-zero AC coefficients perform column-transform ++ ++ satrnds r3 >> ( PASS1_BITS + 3 + 16 ), 31 ++ packw.sh r3, r3, r3 ++ mov r2, r3 ++ st.d r12++, r2 ++ st.d r12++, r2 ++ sub sp, -2 // Increment the dataptr ++ ++ sub loop_cnt, 1//Decrement loop counter ++ brne 0b //Perform loop one more time if loop_cnt is not zero ++ ++ sub sp, -(8*8*2 - 8*2) // Release the temporary block; the loop already advanced sp by 8*2 bytes ++ popm r0-r3, r4-r7, pc//Pop back registers and PC ++ ++1: ++ ++ ld.w r10, pc[coef_table_idct - .] ++ ld.w r9, pc[coef_table_idct - . 
+ 4] ++ ++ addhh.w r4, r2:t, r2:b ++ mulhh.w r4, r4:b, r10:t // r4 = z1 ++ mulhh.w r5, r2:b, r10:b ++ ld.w r10, pc[coef_table_idct - . + 8] ++ mulhh.w r6, r2:t, r9:t ++ add r5, r4 // r5 = tmp2 ++ add r6, r4 // r6 = tmp3 ++ ++ addhh.w r7, r3:t, r3:b ++ subhh.w r8, r3:t, r3:b ++ ++ lsl r7, CONST_BITS ++ lsl r8, CONST_BITS ++ ++ add r2, r7, r6 // r2 = tmp10 ++ sub r3, r7, r6 // r3 = tmp13 ++ add r4, r8, r5 // r4 = tmp11 ++ sub r5, r8, r5 // r5 = tmp12 ++ ++ ++ padd.h r6, r0, r1 // r6:t = z4, r6:b = z3 ++ addhh.w r7, r6:t, r6:b ++ mulhh.w r7, r7:b, r9:b // r7 = z5 ++ ++ ld.w r9, pc[coef_table_idct - . + 12] ++ mulhh.w r8, r6:b, r10:t // r8 = z3 ++ mulhh.w r6, r6:t, r10:b // r6 = z4 ++ ++ add r8, r7 ++ add r6, r7 ++ ++ paddx.h r7, r0, r1 // r7:t = z2, r7:b = z1 ++ ++ mulhh.w r10, r0:b, r9:t // r10 = tmp0 ++ mulhh.w r0, r0:t, r9:b // r0 = tmp1 ++ ld.w r9, pc[coef_table_idct - . + 16] ++ add r10, r8 ++ add r0, r6 ++ ++ ld.w lr, pc[coef_table_idct - . + 20] ++ machh.w r8, r1:b, r9:t // r8 = tmp2 ++ machh.w r6, r1:t, r9:b // r6 = tmp3 ++ mulhh.w r9, r7:b, lr:t // r9 = z1 ++ mulhh.w r7, r7:t, lr:b // r7 = z2 ++ ++ ++ add r10, r9 ++ add r0, r7 ++ add r8, r7 ++ add r6, r9 ++ ++ add r1, r2, r6 // r1 = dataptr[DCTSIZE*0] ++ sub r2, r2, r6 // r2 = dataptr[DCTSIZE*7] ++ add r6, r4, r8 // r6 = dataptr[DCTSIZE*1] ++ sub r4, r4, r8 // r4 = dataptr[DCTSIZE*6] ++ add r8, r5, r0 // r8 = dataptr[DCTSIZE*2] ++ sub r5, r5, r0 // r5 = dataptr[DCTSIZE*5] ++ add r0, r3, r10 // r0 = dataptr[DCTSIZE*3] ++ sub r3, r3, r10 // r3 = dataptr[DCTSIZE*4] ++ ++ satrnds r1 >> (CONST_BITS+PASS1_BITS+3), 9 ++ satrnds r2 >> (CONST_BITS+PASS1_BITS+3), 9 ++ satrnds r6 >> (CONST_BITS+PASS1_BITS+3), 9 ++ satrnds r4 >> (CONST_BITS+PASS1_BITS+3), 9 ++ satrnds r8 >> (CONST_BITS+PASS1_BITS+3), 9 ++ satrnds r5 >> (CONST_BITS+PASS1_BITS+3), 9 ++ satrnds r0 >> (CONST_BITS+PASS1_BITS+3), 9 ++ satrnds r3 >> (CONST_BITS+PASS1_BITS+3), 9 ++ ++ packw.sh r7, r1, r6 ++ packw.sh r6, r8, r0 ++ packw.sh r5, r3, r5 ++ packw.sh 
r4, r4, r2 ++ ++ stm r12, r4-r7 ++ sub sp, -2 // Increment the dataptr ++ sub r12, -16 ++ ++ sub loop_cnt, 1 //Decrement loop counter ++ brne 0b //Perform loop one more time if loop_cnt is not zero ++ ++ sub sp, -(8*8*2 - 8*2) // Release the temporary block; the loop already advanced sp by 8*2 bytes ++ popm r0-r3, r4-r7, pc //Pop back registers and PC ++ ++ ++ ++ .align 2 ++coef_table_idct: ++ .short FIX_0_541196100, -FIX_1_847759065, FIX_0_765366865, FIX_1_175875602 ++ .short - FIX_1_961570560, - FIX_0_390180644, FIX_0_298631336, FIX_2_053119869 ++ .short FIX_3_072711026, FIX_1_501321110, - FIX_0_899976223, - FIX_2_562915447 ++ +diff --git a/libavcodec/avr32/mc.S b/libavcodec/avr32/mc.S +new file mode 100644 +index 0000000..07a002d +--- /dev/null ++++ b/libavcodec/avr32/mc.S +@@ -0,0 +1,434 @@ ++/* ++ * Copyright (c) 2007 Atmel Corporation. All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * 2. Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials provided ++ * with the distribution. ++ * ++ * 3. The name of ATMEL may not be used to endorse or promote products ++ * derived from this software without specific prior written ++ * permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR ++ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. 
IN NO EVENT SHALL ATMEL ++ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, ++ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY ++ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH ++ * DAMAGE. ++ */ ++ ++ ++ /* Macro for masking the lowest bit of each byte in a ++ packed word */ ++ .macro packedmask1 reg, round ++ .if \round ++ and \reg, \reg, r8 >> 1 ++ .else ++ and \reg, r8 ++ .endif ++ .endm ++ ++ /* Macro for 8 pixel wide horizontal and vertical interpolation functions */ ++ .macro pixels8_hv round, put ++ ++ ++ pushm r0-r7, lr ++ ++ /* R12 = uint8_t *block, R11 = uint8_t pixels, R10 = int line_size, R9 = int h */ ++ ++ /* Rounding immediate */ ++ .if \round ++ mov r8, lo(0x02020202) ++ orh r8, hi(0x02020202) ++ .else ++ mov r8, lo(0x01010101) ++ orh r8, hi(0x01010101) ++ .endif ++ mov r7, 2 ++ ++ /* Pixel naming convention : ++ ++ |-----------------------------------------------------| ++ | s00 | s01 | s02 | s03 | s04 | s05 | s06 | s07 | s08 | ++ |----d00---d01---d02---d03---d04---d05---d06---d07----| ++ | s10 | s11 | s12 | s13 | s14 | s15 | s16 | s17 | s18 | ++ |-----------------------------------------------------| ++ */ ++1: ++ ld.w r0, r11[0] // r0 = { s00, s01, s02, s03 } ++ ld.w r1, r11[1] // r1 = { s01, s02, s03, s04 } ++ mov lr, r9 ++ eor r2, r0, r1 ++ packedmask1 r2, \round ++ add r2, r8 ++ ++ paddh.ub r0, r0, r1 // r0 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2} ++ ++ add r11, r10 // pixels += line_size ++ ld.w r1, r11[0] // r1 = { s10, s11, s12, s13 } ++ ld.w r3, r11[1] // r3 = { s11, s12, s13, s14 } ++0: ++ eor r5, r1, r3 ++ packedmask1 r5, \round ++ add r2, r5 ++ ++ paddh.ub r1, r1, r3 // r1 = 
{(s10+s11)/2,(s11+s12)/2,(s12+s13)/2,(s13+s14)/2} ++ eor r6, r0, r1 ++ packedmask1 r6, \round ++ add r2, r2, r6 << 1 ++ ++ ld.w r3, r11[r10] // r3 = { s00, s01, s02, s03 } ++ add r11, r10 // pixels += line_size ++ ld.w r4, r11[1] // r4 = { s01, s02, s03, s04 } ++ ++ paddh.ub r0, r0, r1 ++ plsr.b r2, r2, 2 ++ padd.b r0, r0, r2 // r0 = { d00, d01, d02, d03 } ++ ++ /* Next row */ ++ .if \put ++ eor r2, r3, r4 ++ packedmask1 r2, \round ++ add r2, r8 ++ .else ++ ld.w r6, r12[0] ++ eor r2, r3, r4 ++ packedmask1 r2, \round ++ add r2, r8 ++ pavg.ub r0, r0, r6 ++ .endif ++ st.w r12[0], r0 // Put data into the block ++ ++ add r5, r2 ++ paddh.ub r0, r3, r4 // r0 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2} ++ ++ eor r6, r0, r1 ++ packedmask1 r6, \round ++ add r5, r5, r6 << 1 ++ ++ .if \put ++ paddh.ub r1, r0, r1 ++ plsr.b r5, r5, 2 ++ padd.b r1, r1, r5 // r1 = { d10, d11, d12, d13 } ++ .else ++ ld.w r3, r12[r10] ++ paddh.ub r1, r0, r1 ++ plsr.b r5, r5, 2 ++ padd.b r1, r1, r5 // r1 = { d10, d11, d12, d13 } ++ pavg.ub r1, r1, r3 ++ .endif ++ ++ st.w r12[r10], r1 // Put data into the block ++ ++ ++ ld.w r1, r11[r10] // r1 = { s10, s11, s12, s13 } ++ add r11, r10 // pixels += line_size ++ ld.w r3, r11[1] // r3 = { s11, s12, s13, s14 } ++ add r12, r12, r10 << 1 // block += 2*line_size ++ sub lr, 2 ++ brne 0b ++ ++ mul r0, r10, r9 // r0 = line_size * h ++ rsub r0, r0, 4 // r0 = 4 - (line_size * h) ++ add r11, r0 ++ sub r11, r10 // pixels += 4 - (line_size * (h+1)) ++ add r12, r0 // pixels += 4 - (line_size * (h)) ++ sub r7, 1 ++ brne 1b ++ ++ popm r0-r7, pc ++ .endm ++ ++ ++ /* Macro for 8 pixel wide vertical interpolation functions */ ++ ++ .macro pixels8_v round, put ++ pushm r4-r7,lr ++ /* R12 = uint8_t *block, R11 = uint8_t pixels, R10 = int line_size, R9 = int h */ ++ ++ /* ++ Pixel Naming Convention : ++ |-----------------------------------------------| ++ | s00 | s01 | s02 | s03 | s04 | s05 | s06 | s07 | ++ |-d00---d01---d02---d03---d04---d05---d06---d07-| ++ | s10 | 
s11 | s12 | s13 | s14 | s15 | s16 | s17 | ++ |-----------------------------------------------| ++ */ ++ ld.w r8, r11[r10] // r8 = { s10, s11, s12, s13 } ++ ld.w lr, r11++ // lr = { s00, s01, s02, s03 }, src += 4 ++ ld.w r7, r11[0] // r7 = { s04, s05, s06, s07 } ++ ld.w r6, r11[r10] // r6 = { s14, s15, s16, s17 } ++ sub r10, 4 // stride -= 4 ++ add r11, r11, r10 << 1 // src += 2*stride ++ sub r11, -4 // src += 4 ++ ++0: ++ .if \round ++ pavg.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2} ++ pavg.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2} ++ .else ++ paddh.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2} ++ paddh.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2} ++ .endif ++ ++ .if \put ++ st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 } ++ ld.w lr, r11++ // lr = { s10, s11, s12, s13 }, src += 4 ++ st.w r12[0], r4 // *dst = { d04, d05, d06, d07 } ++ ld.w r7, r11[0] // r7 = { s14, s15, s16, s17 } ++ .else ++ ld.w lr, r12[0] ++ ld.w r7, r12[4] ++ pavg.ub r5, r5, lr ++ pavg.ub r4, r4, r7 ++ st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 } ++ ld.w lr, r11++ // lr = { s10, s11, s12, s13 }, src += 4 ++ st.w r12[0], r4 // *dst = { d04, d05, d06, d07 } ++ ld.w r7, r11[0] // r7 = { s14, s15, s16, s17 } ++ .endif ++ add r11, r10 // src += stride ++#ifdef USE_PREFETCH ++ pref r11[0] ++#endif ++ add r12, r10 // dst += stride ++ ++ .if \round ++ pavg.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2} ++ pavg.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2} ++ .else ++ paddh.ub r5, r8, lr // r5 = {(s10+s00)/2,(s11+s01)/2,(s12+s02)/2,(s13+s03)/2} ++ paddh.ub r4, r6, r7 // r4 = {(s14+s04)/2,(s15+s05)/2,(s16+s06)/2,(s17+s07)/2} ++ .endif ++ .if \put ++ st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 } ++ ld.w r8, r11++ // r8 = { s10, s11, s12, s13 }, src += 4 ++ st.w r12[0], r4 // *dst = { d04, d05, d06, d07 } ++ ld.w r6, r11[0] // r6 = { 
s14, s15, s16, s17 } ++ .else ++ ld.w r8, r12[0] ++ ld.w r6, r12[4] ++ pavg.ub r5, r5, r8 ++ pavg.ub r4, r4, r6 ++ st.w r12++, r5 // *dst++ = { d00, d01, d02, d03 } ++ ld.w r8, r11++ // r8 = { s10, s11, s12, s13 }, src += 4 ++ st.w r12[0], r4 // *dst = { d04, d05, d06, d07 } ++ ld.w r6, r11[0] // r6 = { s14, s15, s16, s17 } ++ .endif ++ ++ add r11, r10 // src += stride ++#ifdef USE_PREFETCH ++ pref r11[0] ++#endif ++ add r12, r10 // dst += stride ++ sub r9, 2 ++ brne 0b ++ ++ popm r4-r7,pc ++ .endm ++ ++ /* Macro for 8 pixel wide horizontal interpolation functions */ ++ ++ .macro pixels8_h round, put ++ pushm r4-r7, lr ++ ++ /* R12 = uint8_t *block, R11 = uint8_t pixels, R10 = int line_size, R9 = int h */ ++ /* ++ Pixel Naming Convention: ++ |--------------------------------------------------------------------| ++ | s00 d00 s01 d01 s02 d02 s03 d03 s04 d04 s05 d05 s06 d06 s07 d07 s08| ++ |------|-------|-------|-------|-------|-------|-------|-------|-----| ++ | s10 d10 s11 d11 s12 d12 s13 d13 s14 d14 s15 d15 s16 d16 s17 d17 s18| ++ |--------------------------------------------------------------------| ++ */ ++ ++ ld.w lr, r11[0] // lr = { s00, s01, s02, s03 } ++ ld.w r8, r11[1] // r8 = { s01, s02, s03, s04 } ++ ld.w r7, r11[4] // r7 = { s04, s05, s06, s07 } ++ ld.w r6, r11[5] // r6 = { s05, s06, s07, s08 } ++ add r11, r10 // src += stride ++ ++0: ++ .if \round ++ pavg.ub lr, r8, lr // lr = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2} ++ pavg.ub r7, r6, r7 // r7 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2} ++ .else ++ paddh.ub lr, r8, lr // lr = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2} ++ paddh.ub r7, r6, r7 // r7 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2} ++ .endif ++ .if \put ++ ld.w r5, r11[0] // r5 = { s00, s01, s02, s03 } ++ ld.w r4, r11[1] // r4 = { s01, s02, s03, s04 } ++ .else ++ ld.w r8, r12[0] ++ ld.w r6, r12[4] ++ ld.w r5, r11[0] // r5 = { s00, s01, s02, s03 } ++ ld.w r4, r11[1] // r4 = { s01, s02, s03, s04 } ++ pavg.ub lr, 
lr, r8 ++ pavg.ub r7, r7, r6 ++ .endif ++ st.w r12[0], lr // dst = { d00, d01, d02, d03 } ++ st.w r12[4], r7 // dst = { d04, d05, d06, d07 } ++ ld.w r8, r11[4] // r8 = { s04, s05, s06, s07 } ++ ld.w r6, r11[5] // r6 = { s05, s06, s07, s08 } ++ add r11, r10 // src += stride ++#ifdef USE_PREFETCH ++ pref r11[0] ++#endif ++ add r12, r10 // dst += stride ++ ++ .if \round ++ pavg.ub r5, r4, r5 // r5 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2} ++ pavg.ub r4, r6, r8 // r4 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2} ++ .else ++ paddh.ub r5, r4, r5 // r5 = {(s00+s01)/2,(s01+s02)/2,(s02+s03)/2,(s03+s04)/2} ++ paddh.ub r4, r6, r8 // r4 = {(s04+s05)/2,(s05+s06)/2,(s06+s07)/2,(s07+s08)/2} ++ .endif ++ .if \put ++ ld.w lr, r11[0] // lr = { s00, s01, s02, s03 } ++ ld.w r8, r11[1] // r8 = { s01, s02, s03, s04 } ++ .else ++ ld.w r7, r12[0] ++ ld.w r6, r12[4] ++ ld.w lr, r11[0] // lr = { s00, s01, s02, s03 } ++ ld.w r8, r11[1] // r8 = { s01, s02, s03, s04 } ++ pavg.ub r5, r5, r7 ++ pavg.ub r4, r4, r6 ++ .endif ++ st.w r12[0], r5 // dst = { d00, d01, d02, d03 } ++ st.w r12[4], r4 // dst = { d04, d05, d06, d07 } ++ ld.w r7, r11[4] // r7 = { s04, s05, s06, s07 } ++ ld.w r6, r11[5] // r6 = { s05, s06, s07, s08 } ++ add r11, r10 // src += stride ++#ifdef USE_PREFETCH ++ pref r11[0] ++#endif ++ add r12, r10 // dst += stride ++ sub r9, 2 ++ brne 0b ++ ++ popm r4-r7, pc ++ .endm ++ ++ /* Macro for 8 pixel wide copy functions */ ++ .macro pixels8 put ++ stm --sp, r3-r7,lr ++ /* R12 = uint8_t *block, R11 = uint8_t pixels, R10 = int line_size, R9 = int h */ ++ mov lr, r9 ++ sub r3, r10, 2 // stride2 = stride - 2 ++0: ++ .if \put ++ ld.w r9, r11[r10] // r9 = { s10, s11, s12, s13 } ++ ld.w r7, r11++ // r7 = { s00, s01, s02, s03 }, src += 4 ++ ld.w r6, r11[0] // r6 = { s04, s05, s06, s07 } ++ ld.w r8, r11[r10] // r8 = { s14, s15, s16, s17 } ++ .else ++ ld.w r9, r11[r10] // r9 = { s10, s11, s12, s13 } ++ ld.d r4, r12[0] ++ ld.w r7, r11++ // r7 = { s00, s01, s02, s03 }, src += 4 ++ 
ld.w r6, r11[0] // r6 = { s04, s05, s06, s07 } ++ ld.w r8, r11[r10] // r8 = { s14, s15, s16, s17 } ++ pavg.ub r6, r6, r4 ++ pavg.ub r7, r7, r5 ++ ld.d r4, r12[r10] ++ .endif ++ st.d r12, r6 // *dst = { s00, s01, s02, s03, s04, s05, s06, s07 } ++ add r11, r11, r3 << 1 // src += stride2 * 2 ++ .ifeq \put ++ pavg.ub r8, r8, r4 ++ pavg.ub r9, r9, r5 ++ .endif ++ st.d r12[r10 << 0], r8 // *(dst + stride) = { s10, s11, s12, s13, s14, s15, s16, s17 } ++ add r12, r12, r10 << 1 // dst += 2*stride ++ sub lr, 2 ++ brne 0b ++ ldm sp++, r3-r7,pc ++ ++ .endm ++ ++ .global put_no_rnd_pixels8_hv_avr32 ++ .text ++put_no_rnd_pixels8_hv_avr32: ++ pixels8_hv 0, 1 ++ ++ .global put_pixels8_hv_avr32 ++ .text ++put_pixels8_hv_avr32: ++ pixels8_hv 1, 1 ++ ++ .global avg_no_rnd_pixels8_hv_avr32 ++ .text ++avg_no_rnd_pixels8_hv_avr32: ++ pixels8_hv 0, 0 ++ ++ .global avg_pixels8_hv_avr32 ++ .text ++avg_pixels8_hv_avr32: ++ pixels8_hv 1, 0 ++ ++ .global put_no_rnd_pixels8_v_avr32 ++ .text ++put_no_rnd_pixels8_v_avr32: ++ pixels8_v 0, 1 ++ ++ .global put_pixels8_v_avr32 ++ .text ++put_pixels8_v_avr32: ++ pixels8_v 1, 1 ++ ++ .global avg_no_rnd_pixels8_v_avr32 ++ .text ++avg_no_rnd_pixels8_v_avr32: ++ pixels8_v 0, 0 ++ ++ .global avg_pixels8_v_avr32 ++ .text ++avg_pixels8_v_avr32: ++ pixels8_v 1, 0 ++ ++ .global put_no_rnd_pixels8_h_avr32 ++ .text ++put_no_rnd_pixels8_h_avr32: ++ pixels8_h 0, 1 ++ ++ .global put_pixels8_h_avr32 ++ .text ++put_pixels8_h_avr32: ++ pixels8_h 1, 1 ++ ++ .global avg_no_rnd_pixels8_h_avr32 ++ .text ++avg_no_rnd_pixels8_h_avr32: ++ pixels8_h 0, 0 ++ ++ .global avg_pixels8_h_avr32 ++ .text ++avg_pixels8_h_avr32: ++ pixels8_h 1, 0 ++ ++ .global put_pixels8_avr32 ++ .global put_no_rnd_pixels8_avr32 ++ .text ++put_pixels8_avr32: ++put_no_rnd_pixels8_avr32: ++ pixels8 1 ++ ++ .global avg_no_rnd_pixels8_avr32 ++ .global avg_pixels8_avr32 ++ .text ++avg_pixels8_avr32: ++avg_no_rnd_pixels8_avr32: ++ pixels8 0 +diff --git a/libavcodec/avr32/pico.h b/libavcodec/avr32/pico.h 
+new file mode 100644 +index 0000000..32201ba +--- /dev/null ++++ b/libavcodec/avr32/pico.h +@@ -0,0 +1,260 @@ ++/* ++ * Copyright (c) 2007 Atmel Corporation. All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * 2. Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials provided ++ * with the distribution. ++ * ++ * 3. The name of ATMEL may not be used to endorse or promote products ++ * derived from this software without specific prior written ++ * permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR ++ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL ++ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, ++ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY ++ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH ++ * DAMAGE. 
++ */ ++#ifndef __PICO_H__ ++#define __PICO_H__ ++ ++ ++ ++/* Coprocessor Number */ ++#define PICO_CPNO 1 ++ ++/* Pixel Coprocessor Register file */ ++#define PICO_REGVECT_INPIX2 cr0 ++#define PICO_REGVECT_INPIX1 cr1 ++#define PICO_REGVECT_INPIX0 cr2 ++#define PICO_REGVECT_OUTPIX2 cr3 ++#define PICO_REGVECT_OUTPIX1 cr4 ++#define PICO_REGVECT_OUTPIX0 cr5 ++#define PICO_REGVECT_COEFF0_A cr6 ++#define PICO_REGVECT_COEFF0_B cr7 ++#define PICO_REGVECT_COEFF1_A cr8 ++#define PICO_REGVECT_COEFF1_B cr9 ++#define PICO_REGVECT_COEFF2_A cr10 ++#define PICO_REGVECT_COEFF2_B cr11 ++#define PICO_REGVECT_VMU0_OUT cr12 ++#define PICO_REGVECT_VMU1_OUT cr13 ++#define PICO_REGVECT_VMU2_OUT cr14 ++#define PICO_REGVECT_CONFIG cr15 ++ ++#define PICO_INPIX2 0 ++#define PICO_INPIX1 1 ++#define PICO_INPIX0 2 ++#define PICO_OUTPIX2 3 ++#define PICO_OUTPIX1 4 ++#define PICO_OUTPIX0 5 ++#define PICO_COEFF0_A 6 ++#define PICO_COEFF0_B 7 ++#define PICO_COEFF1_A 8 ++#define PICO_COEFF1_B 9 ++#define PICO_COEFF2_A 10 ++#define PICO_COEFF2_B 11 ++#define PICO_VMU0_OUT 12 ++#define PICO_VMU1_OUT 13 ++#define PICO_VMU2_OUT 14 ++#define PICO_CONFIG 15 ++ ++/* Config Register */ ++#define PICO_COEFF_FRAC_BITS_OFFSET 0 ++#define PICO_COEFF_FRAC_BITS_SIZE 4 ++#define PICO_OFFSET_FRAC_BITS_OFFSET 4 ++#define PICO_OFFSET_FRAC_BITS_SIZE 4 ++#define PICO_INPUT_MODE_OFFSET 8 ++#define PICO_INPUT_MODE_SIZE 2 ++#define PICO_OUTPUT_MODE_OFFSET 10 ++#define PICO_OUTPUT_MODE_SIZE 1 ++ ++struct pico_config_t { ++ unsigned int : 32 - PICO_OUTPUT_MODE_OFFSET - PICO_OUTPUT_MODE_SIZE; ++ unsigned int output_mode : PICO_OUTPUT_MODE_SIZE; ++ unsigned int input_mode : PICO_INPUT_MODE_SIZE; ++ unsigned int offset_frac_bits : PICO_OFFSET_FRAC_BITS_SIZE; ++ unsigned int coeff_frac_bits : PICO_COEFF_FRAC_BITS_SIZE; ++ int vmu2_out; ++ int vmu1_out; ++ int vmu0_out; ++ short coeff2_2; ++ short coeff2_3; ++ short coeff2_0; ++ short coeff2_1; ++ short coeff1_2; ++ short coeff1_3; ++ short coeff1_0; ++ short coeff1_1; ++ short 
coeff0_2; ++ short coeff0_3; ++ short coeff0_0; ++ short coeff0_1; ++}; ++ ++/* Encode a value into its CONFIG register field. The argument is parenthesised so that an expression such as (a | b) is shifted as a whole. */ ++#define PICO_COEFF_FRAC_BITS(x) ((x) << PICO_COEFF_FRAC_BITS_OFFSET) ++#define PICO_OFFSET_FRAC_BITS(x) ((x) << PICO_OFFSET_FRAC_BITS_OFFSET) ++#define PICO_INPUT_MODE(x) ((x) << PICO_INPUT_MODE_OFFSET) ++#define PICO_OUTPUT_MODE(x) ((x) << PICO_OUTPUT_MODE_OFFSET) ++/* Extract a field from a CONFIG register value: shift down by the field offset, then mask to the field size. */ ++#define GET_PICO_COEFF_FRAC_BITS(x) (((x) >> PICO_COEFF_FRAC_BITS_OFFSET)&((1 << PICO_COEFF_FRAC_BITS_SIZE)-1)) ++#define GET_PICO_OFFSET_FRAC_BITS(x) (((x) >> PICO_OFFSET_FRAC_BITS_OFFSET)&((1 << PICO_OFFSET_FRAC_BITS_SIZE)-1)) ++#define GET_PICO_INPUT_MODE(x) (((x) >> PICO_INPUT_MODE_OFFSET)&((1 << PICO_INPUT_MODE_SIZE)-1)) ++#define GET_PICO_OUTPUT_MODE(x) (((x) >> PICO_OUTPUT_MODE_OFFSET)&((1 << PICO_OUTPUT_MODE_SIZE)-1)) ++ ++enum pico_input_mode { PICO_TRANSFORMATION_MODE, ++ PICO_HOR_FILTER_MODE, ++ PICO_VERT_FILTER_MODE }; /* values of the input_mode config field */ ++ ++enum pico_output_mode { PICO_PACKED_MODE, ++ PICO_PLANAR_MODE }; /* values of the output_mode config field */ ++ ++/* Bits in coefficients */ ++#define PICO_COEFF_BITS 12 ++ ++/* Operation bits */ ++#define PICO_MATRIX (0) ++#define PICO_USE_ACC (1 << 2) ++#define PICO_SINGLE_VECTOR (1 << 3) ++ ++ ++#define __str(x...) #x ++#define __xstr(x...)
__str(x) ++ ++#define PICO_PUT_W(pico_reg, x) \ ++ __builtin_mvrc_w(PICO_CPNO, pico_reg, x); ++#define PICO_GET_W(pico_reg) \ ++ __builtin_mvcr_w(PICO_CPNO, pico_reg) ++ ++#define PICO_MVCR_W(x, pico_reg) \ ++ asm ("mvcr.w\tcp" __xstr(PICO_CPNO) ", %0, cr" __xstr(pico_reg) : "=r"(x)); ++ ++#define PICO_MVRC_W(pico_reg, x) \ ++ asm ("mvrc.w\tcp" __xstr(PICO_CPNO) ", cr" __xstr(pico_reg) ", %0" :: "r"(x)); ++ ++#define PICO_PUT_D(pico_reg, x) \ ++ __builtin_mvrc_d(PICO_CPNO, pico_reg, x); ++#define PICO_GET_D(pico_reg) \ ++ __builtin_mvcr_d(PICO_CPNO, pico_reg) ++ ++#define PICO_MVCR_D(x, pico_reg) \ ++ asm volatile ("mvcr.d\tcp" __xstr(PICO_CPNO) ", %0, cr" __xstr(pico_reg) : "=r"(x)); ++#define PICO_MVRC_D(pico_reg, x) \ ++ asm volatile ("mvrc.d\tcp" __xstr(PICO_CPNO) ", cr" __xstr(pico_reg) ", %0" :: "r"(x)); ++ ++#define PICO_STCM_W(ptr, pico_regs...) \ ++ asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr)); ++#define PICO_STCM_D(ptr, pico_regs...) \ ++ asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr)); ++ ++#define PICO_STCM_W_DEC(ptr, pico_regs...) \ ++ asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr)); ++#define PICO_STCM_D_DEC(ptr, pico_regs...) \ ++ asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr)); ++ ++#define PICO_LDCM_W(ptr, pico_regs...) \ ++ asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr)); ++#define PICO_LDCM_D(ptr, pico_regs...) \ ++ asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr)); ++ ++#define PICO_LDCM_W_INC(ptr, pico_regs...) \ ++ asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr)); ++#define PICO_LDCM_D_INC(ptr, pico_regs...) 
\ ++ asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr)); ++ ++#define PICO_OP(op, dst_addr, addr0, addr1, addr2) \ ++ __builtin_cop(PICO_CPNO, addr0, addr1, addr2, op | dst_addr); /* note: the expansion already ends in ';' */ ++/* Load coprocessor registers cr6..cr15 (coefficients, VMU outputs, config) from *config with a single ldcm.d. */ ++static inline void set_pico_config(struct pico_config_t *config){ ++ PICO_LDCM_D(config, ++ PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B, ++ PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B, ++ PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B, ++ PICO_REGVECT_VMU0_OUT, PICO_REGVECT_VMU1_OUT, ++ PICO_REGVECT_VMU2_OUT, PICO_REGVECT_CONFIG); ++} ++/* Store coprocessor registers cr6..cr15 into *config with a single stcm.d (the inverse of set_pico_config). */ ++static inline void get_pico_config(struct pico_config_t *config){ ++ PICO_STCM_D(config, ++ PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B, ++ PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B, ++ PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B, ++ PICO_REGVECT_VMU0_OUT, PICO_REGVECT_VMU1_OUT, ++ PICO_REGVECT_VMU2_OUT, PICO_REGVECT_CONFIG); ++} ++/* Debug helper: read the registers back with get_pico_config() and print every field through av_log() at AV_LOG_INFO. */ ++static inline void dump_pico_config(){ ++ struct pico_config_t pico_config; ++ char *input_mode, *output_mode; ++ get_pico_config(&pico_config); ++ ++ ++ av_log(NULL, AV_LOG_INFO, "Dumping pico configuration:\n\n"); ++ av_log(NULL, AV_LOG_INFO, "\tcoeff_frac_bits = %d\n", pico_config.coeff_frac_bits); ++ av_log(NULL, AV_LOG_INFO, "\toffset_frac_bits = %d\n", pico_config.offset_frac_bits); ++ ++ switch ( pico_config.input_mode ){ ++ case PICO_TRANSFORMATION_MODE: ++ input_mode = "Transformation Mode"; ++ break; ++ case PICO_HOR_FILTER_MODE: ++ input_mode = "Horisontal Filter Mode"; ++ break; ++ case PICO_VERT_FILTER_MODE: ++ input_mode = "Vertical Filter Mode"; ++ break; ++ default: ++ input_mode = "Unknown Mode!!"; ++ break; ++ } ++ av_log(NULL, AV_LOG_INFO, "\tinput_mode = %s\n", input_mode); ++ ++ switch ( pico_config.output_mode ){ ++ case PICO_PLANAR_MODE: ++ output_mode = "Planar Mode"; ++ break; ++ case PICO_PACKED_MODE: ++ output_mode = "Packed Mode"; ++ break; ++ default: ++ output_mode = "Unknown Mode!!"; ++ break; ++ } ++ ++ av_log(NULL, AV_LOG_INFO, "\toutput_mode = 
%s\n", output_mode); ++ ++ av_log(NULL, AV_LOG_INFO, "\tCoeff0_0 = %f\n", (float)pico_config.coeff0_0/(float)(1 << pico_config.coeff_frac_bits)); ++ av_log(NULL, AV_LOG_INFO, "\tCoeff0_1 = %f\n", (float)pico_config.coeff0_1/(float)(1 << pico_config.coeff_frac_bits)); ++ av_log(NULL, AV_LOG_INFO, "\tCoeff0_2 = %f\n", (float)pico_config.coeff0_2/(float)(1 << pico_config.coeff_frac_bits)); ++ av_log(NULL, AV_LOG_INFO, "\tCoeff0_3 = %f\n", (float)pico_config.coeff0_3/(float)(1 << pico_config.offset_frac_bits)); ++ ++ av_log(NULL, AV_LOG_INFO, "\tCoeff1_0 = %f\n", (float)pico_config.coeff1_0/(float)(1 << pico_config.coeff_frac_bits)); ++ av_log(NULL, AV_LOG_INFO, "\tCoeff1_1 = %f\n", (float)pico_config.coeff1_1/(float)(1 << pico_config.coeff_frac_bits)); ++ av_log(NULL, AV_LOG_INFO, "\tCoeff1_2 = %f\n", (float)pico_config.coeff1_2/(float)(1 << pico_config.coeff_frac_bits)); ++ av_log(NULL, AV_LOG_INFO, "\tCoeff1_3 = %f\n", (float)pico_config.coeff1_3/(float)(1 << pico_config.offset_frac_bits)); ++ ++ av_log(NULL, AV_LOG_INFO, "\tCoeff2_0 = %f\n", (float)pico_config.coeff2_0/(float)(1 << pico_config.coeff_frac_bits)); ++ av_log(NULL, AV_LOG_INFO, "\tCoeff2_1 = %f\n", (float)pico_config.coeff2_1/(float)(1 << pico_config.coeff_frac_bits)); ++ av_log(NULL, AV_LOG_INFO, "\tCoeff2_2 = %f\n", (float)pico_config.coeff2_2/(float)(1 << pico_config.coeff_frac_bits)); ++ av_log(NULL, AV_LOG_INFO, "\tCoeff2_3 = %f\n", (float)pico_config.coeff2_3/(float)(1 << pico_config.offset_frac_bits)); ++} ++ ++ ++ ++#endif ++ +diff --git a/libavcodec/bitstream.h b/libavcodec/bitstream.h +index 26b4f8d..1f8fabf 100644 +--- a/libavcodec/bitstream.h ++++ b/libavcodec/bitstream.h +@@ -171,7 +171,7 @@ typedef struct RL_VLC_ELEM { + #endif + + /* used to avoid missaligned exceptions on some archs (alpha, ...) 
*/ +-#if defined(ARCH_X86) || defined(ARCH_X86_64) ++#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_AVR32) + # define unaligned16(a) (*(const uint16_t*)(a)) + # define unaligned32(a) (*(const uint32_t*)(a)) + # define unaligned64(a) (*(const uint64_t*)(a)) +@@ -813,6 +813,44 @@ void free_vlc(VLC *vlc); + * if the vlc code is invalid and max_depth>1 than the number of bits removed + * is undefined + */ ++ ++#if defined(ARCH_AVR32) ++#define GET_VLC(code, name, gb, table, bits, max_depth)\ ++{\ ++ int n, index, nb_bits;\ ++ union { VLC_TYPE vlc[2];\ ++ uint32_t u32; } table_elem;\ ++\ ++ index= SHOW_UBITS(name, gb, bits);\ ++ table_elem.u32 = unaligned32(&table[index]); \ ++ code = table_elem.vlc[0];\ ++ n = table_elem.vlc[1];\ ++\ ++ if(max_depth > 1 && n < 0 ){\ ++ LAST_SKIP_BITS(name, gb, bits)\ ++ UPDATE_CACHE(name, gb)\ ++\ ++ nb_bits = -n;\ ++\ ++ index= SHOW_UBITS(name, gb, nb_bits) + code;\ ++ table_elem.u32 = unaligned32(&table[index]); \ ++ code = table_elem.vlc[0];\ ++ n = table_elem.vlc[1];\ ++ if(max_depth > 2 && n < 0){\ ++ LAST_SKIP_BITS(name, gb, nb_bits)\ ++ UPDATE_CACHE(name, gb)\ ++\ ++ nb_bits = -n;\ ++\ ++ index= SHOW_UBITS(name, gb, nb_bits) + code;\ ++ code = table[index][0];\ ++ n = table[index][1];\ ++ }\ ++ }\ ++ SKIP_BITS(name, gb, n)\ ++} ++ ++#else + #define GET_VLC(code, name, gb, table, bits, max_depth)\ + {\ + int n, index, nb_bits;\ +@@ -821,7 +859,7 @@ void free_vlc(VLC *vlc); + code = table[index][0];\ + n = table[index][1];\ + \ +- if(max_depth > 1 && n < 0){\ ++ if(max_depth > 1 && n < 0 ){\ + LAST_SKIP_BITS(name, gb, bits)\ + UPDATE_CACHE(name, gb)\ + \ +@@ -843,7 +881,38 @@ void free_vlc(VLC *vlc); + }\ + SKIP_BITS(name, gb, n)\ + } ++#endif + ++#if defined(ARCH_AVR32) ++#define GET_RL_VLC(level, run, name, gb, table, bits, max_depth, need_update)\ ++{\ ++ int n, index, nb_bits;\ ++ union { RL_VLC_ELEM vlc;\ ++ uint32_t u32; } table_elem;\ ++\ ++ index= SHOW_UBITS(name, gb, bits);\ ++ table_elem.u32 = 
unaligned32(&table[index]); \ ++ level = table_elem.vlc.level;\ ++ n = table_elem.vlc.len;\ ++\ ++ if(max_depth > 1 && n < 0 ){\ ++ SKIP_BITS(name, gb, bits)\ ++ if(need_update){\ ++ UPDATE_CACHE(name, gb)\ ++ }\ ++\ ++ nb_bits = -n;\ ++\ ++ index= SHOW_UBITS(name, gb, nb_bits) + level;\ ++ table_elem.u32 = unaligned32(&table[index]); \ ++ level = table_elem.vlc.level;\ ++ n = table_elem.vlc.len;\ ++ }\ ++ run= table_elem.vlc.run;\ ++ SKIP_BITS(name, gb, n)\ ++} ++ ++#else + #define GET_RL_VLC(level, run, name, gb, table, bits, max_depth, need_update)\ + {\ + int n, index, nb_bits;\ +@@ -852,7 +921,7 @@ void free_vlc(VLC *vlc); + level = table[index].level;\ + n = table[index].len;\ + \ +- if(max_depth > 1 && n < 0){\ ++ if(max_depth > 1 && n < 0 ){\ + SKIP_BITS(name, gb, bits)\ + if(need_update){\ + UPDATE_CACHE(name, gb)\ +@@ -867,7 +936,7 @@ void free_vlc(VLC *vlc); + run= table[index].run;\ + SKIP_BITS(name, gb, n)\ + } +- ++#endif + + /** + * parses a vlc code, faster then get_vlc() +diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c +index 56c42b9..8fc10c6 100644 +--- a/libavcodec/dsputil.c ++++ b/libavcodec/dsputil.c +@@ -4197,6 +4197,9 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx) + #ifdef ARCH_BFIN + dsputil_init_bfin(c,avctx); + #endif ++#ifdef ARCH_AVR32 ++ dsputil_init_avr32(c,avctx); ++#endif + + for(i=0; i<64; i++){ + if(!c->put_2tap_qpel_pixels_tab[0][i]) +diff --git a/libavcodec/h264.c b/libavcodec/h264.c +index 865e80a..8f7c3f1 100644 +--- a/libavcodec/h264.c ++++ b/libavcodec/h264.c +@@ -3258,7 +3258,12 @@ static void free_tables(H264Context *h){ + + static void init_dequant8_coeff_table(H264Context *h){ + int i,q,x; ++#ifdef ARCH_AVR32 ++ const int transpose = 0; ++#else + const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly ++#endif ++ + h->dequant8_coeff[0] = h->dequant8_buffer[0]; + h->dequant8_coeff[1] = h->dequant8_buffer[1]; + +@@ -3281,7 +3286,13 @@ static void 
init_dequant8_coeff_table(H264Context *h){ + + static void init_dequant4_coeff_table(H264Context *h){ + int i,j,q,x; ++ // Yes this is ugly as hell.... ++#ifdef ARCH_AVR32 ++ const int transpose = 0; ++#else + const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly ++#endif ++ + for(i=0; i<6; i++ ){ + h->dequant4_coeff[i] = h->dequant4_buffer[i]; + for(j=0; jdsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly ++#endif + memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t)); + memcpy(h-> field_scan, field_scan, 16*sizeof(uint8_t)); + }else{ +diff --git a/libavutil/common.h b/libavutil/common.h +index 3ae5971..7e52b90 100644 +--- a/libavutil/common.h ++++ b/libavutil/common.h +@@ -283,23 +283,39 @@ static inline int mid_pred(int a, int b, int c) + * @param amax maximum value of the clip range + * @return cliped value + */ ++#if defined(ARCH_AVR32) ++#define clip(a, amin, amax) \ ++ ({ int __tmp__; \ ++ asm ("min\t%0, %1, %2\n" \ ++ "max\t%0, %0, %3\n" \ ++ : "=&r"(__tmp__) : "r"(a), "r"(amax), "r"(amin)); \ ++ __tmp__; }) ++#else + static inline int clip(int a, int amin, int amax) + { + if (a < amin) return amin; + else if (a > amax) return amax; + else return a; + } ++#endif + + /** + * clip a signed integer value into the 0-255 range + * @param a value to clip + * @return cliped value + */ ++#if defined(ARCH_AVR32) ++#define clip_uint8(a) \ ++ ({ int __tmp__ = a; \ ++ asm ("satu\t%0 >> 0, 8" : "+r"(__tmp__)); \ ++ __tmp__; }) ++#else + static inline uint8_t clip_uint8(int a) + { + if (a&(~255)) return (-a)>>31; + else return a; + } ++#endif + + /* math */ + int64_t ff_gcd(int64_t a, int64_t b); +diff --git a/libavutil/internal.h b/libavutil/internal.h +index 285d304..a8b0718 100644 +--- a/libavutil/internal.h ++++ b/libavutil/internal.h +@@ -210,6 +210,15 @@ if((y)<(x)){\ + }\ + } + ++/* XXX: Hack for uclibc which declares lrintf but does not implement it... 
*/ ++#ifdef ARCH_AVR32 ++#undef HAVE_LRINTF ++#define HAVE_LRINTF 1 ++#define lrintf(x) ((long)rint(x)) /* rint() returns a double; cast so the result has lrintf's long type */ ++#define llrint(x) ((long long)rint(x)) /* fully parenthesised so the macro expands as a single expression */ ++#endif ++ ++ + #ifndef HAVE_LRINTF + /* XXX: add ISOC specific test to avoid specific BSD testing. */ + /* better than nothing implementation. */ +diff --git a/libfaad2/common.h b/libfaad2/common.h +index f809042..6c5fb21 100644 +--- a/libfaad2/common.h ++++ b/libfaad2/common.h +@@ -67,7 +67,7 @@ extern "C" { + /* Use if target platform has address generators with autoincrement */ + //#define PREFER_POINTERS + +-#if defined(_WIN32_WCE) || defined(__arm__) ++#if defined(_WIN32_WCE) || defined(__arm__) || defined(__avr32__) + #define FIXED_POINT + #endif + +diff --git a/libmpcodecs/ad_libmad.c b/libmpcodecs/ad_libmad.c +index 076359a..51b77fe 100644 +--- a/libmpcodecs/ad_libmad.c ++++ b/libmpcodecs/ad_libmad.c +@@ -86,6 +86,11 @@ static int init(sh_audio_t *sh){ + sh->channels=(this->frame.header.mode == MAD_MODE_SINGLE_CHANNEL) ? 1 : 2; + sh->samplerate=this->frame.header.samplerate; + sh->i_bps=this->frame.header.bitrate/8; ++#ifdef WORDS_BIGENDIAN ++ sh->sample_format = AF_FORMAT_S16_BE; ++#else ++ sh->sample_format = AF_FORMAT_S16_LE; ++#endif + sh->samplesize=2; + + return 1; +diff --git a/libswscale/pico-avr32.h b/libswscale/pico-avr32.h +new file mode 100644 +index 0000000..7ac6200 +--- /dev/null ++++ b/libswscale/pico-avr32.h +@@ -0,0 +1,137 @@ ++/* ++ * Copyright (c) 2007 Atmel Corporation. All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * 2.
Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials provided ++ * with the distribution. ++ * ++ * 3. The name of ATMEL may not be used to endorse or promote products ++ * derived from this software without specific prior written ++ * permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR ++ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL ++ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, ++ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY ++ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH ++ * DAMAGE. 
++ */ ++#ifndef __PICO_H__ ++#define __PICO_H__ ++ ++/* Coprocessor Number */ ++#define PICO_CPNO 1 ++ ++/* Pixel Coprocessor Register file */ ++#define PICO_REGVECT_INPIX2 cr0 ++#define PICO_REGVECT_INPIX1 cr1 ++#define PICO_REGVECT_INPIX0 cr2 ++#define PICO_REGVECT_OUTPIX2 cr3 ++#define PICO_REGVECT_OUTPIX1 cr4 ++#define PICO_REGVECT_OUTPIX0 cr5 ++#define PICO_REGVECT_COEFF0_A cr6 ++#define PICO_REGVECT_COEFF0_B cr7 ++#define PICO_REGVECT_COEFF1_A cr8 ++#define PICO_REGVECT_COEFF1_B cr9 ++#define PICO_REGVECT_COEFF2_A cr10 ++#define PICO_REGVECT_COEFF2_B cr11 ++#define PICO_REGVECT_VMU0_OUT cr12 ++#define PICO_REGVECT_VMU1_OUT cr13 ++#define PICO_REGVECT_VMU2_OUT cr14 ++#define PICO_REGVECT_CONFIG cr15 ++ ++#define PICO_INPIX2 0 ++#define PICO_INPIX1 1 ++#define PICO_INPIX0 2 ++#define PICO_OUTPIX2 3 ++#define PICO_OUTPIX1 4 ++#define PICO_OUTPIX0 5 ++#define PICO_COEFF0_A 6 ++#define PICO_COEFF0_B 7 ++#define PICO_COEFF1_A 8 ++#define PICO_COEFF1_B 9 ++#define PICO_COEFF2_A 10 ++#define PICO_COEFF2_B 11 ++#define PICO_VMU0_OUT 12 ++#define PICO_VMU1_OUT 13 ++#define PICO_VMU2_OUT 14 ++#define PICO_CONFIG 15 ++ ++/* Config Register */ ++#define PICO_COEFF_FRAC_BITS 0 ++#define PICO_COEFF_FRAC_BITS_WIDTH 4 ++#define PICO_OFFSET_FRAC_BITS 4 ++#define PICO_OFFSET_FRAC_BITS_WIDTH 4 ++#define PICO_INPUT_MODE 8 ++#define PICO_INPUT_MODE_WIDTH 2 ++#define PICO_OUTPUT_MODE 10 ++ ++#define PICO_TRANSFORMATION_MODE 0 ++#define PICO_HOR_FILTER_MODE 1 ++#define PICO_VERT_FILTER_MODE 2 ++ ++#define PICO_PLANAR_MODE 1 ++#define PICO_PACKED_MODE 0 ++ ++/* Bits in coefficients */ ++#define PICO_COEFF_BITS 12 ++ ++/* Operation bits */ ++#define PICO_USE_ACC (1 << 2) ++#define PICO_SINGLE_VECTOR (1 << 3) ++ ++ ++#define __str(x...) #x ++#define __xstr(x...) 
__str(x) ++ ++#define PICO_PUT_W(pico_reg, x) \ ++ __builtin_mvrc_w(PICO_CPNO, pico_reg, x); ++#define PICO_GET_W(pico_reg) \ ++ __builtin_mvcr_w(PICO_CPNO, pico_reg) ++ ++#define PICO_PUT_D(pico_reg, x) \ ++ __builtin_mvrc_d(PICO_CPNO, pico_reg, x); ++#define PICO_GET_D(pico_reg) \ ++ __builtin_mvcr_d(PICO_CPNO, pico_reg) ++ ++ ++#define PICO_STCM_W(ptr, pico_regs...) \ ++ asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr)); ++#define PICO_STCM_D(ptr, pico_regs...) \ ++ asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr)); ++ ++#define PICO_STCM_W_DEC(ptr, pico_regs...) \ ++ asm volatile ("stcm.w\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr)); ++#define PICO_STCM_D_DEC(ptr, pico_regs...) \ ++ asm volatile ("stcm.d\tcp" __xstr(PICO_CPNO) ", --%0," __xstr(pico_regs) : "+r"(ptr)); ++ ++#define PICO_LDCM_W(ptr, pico_regs...) \ ++ asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr)); ++#define PICO_LDCM_D(ptr, pico_regs...) \ ++ asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0," __xstr(pico_regs) :: "r"(ptr)); ++ ++#define PICO_LDCM_W_INC(ptr, pico_regs...) \ ++ asm volatile ("ldcm.w\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr)); ++#define PICO_LDCM_D_INC(ptr, pico_regs...) 
\ ++ asm volatile ("ldcm.d\tcp" __xstr(PICO_CPNO) ", %0++," __xstr(pico_regs) : "+r"(ptr)); ++ ++#define PICO_OP(op, dst_addr, addr0, addr1, addr2) \ ++ __builtin_cop(PICO_CPNO, addr0, addr1, addr2, op | dst_addr); ++ ++ ++#endif ++ +diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h +index ecd28f5..3221d0c 100644 +--- a/libswscale/swscale_internal.h ++++ b/libswscale/swscale_internal.h +@@ -173,7 +173,7 @@ typedef struct SwsContext{ + SwsFunc yuv2rgb_get_func_ptr (SwsContext *c); + int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation); + +-char *sws_format_name(int format); ++char *sws_format_name(enum PixelFormat format); + + //FIXME replace this with something faster + #define isPlanarYUV(x) ((x)==PIX_FMT_YUV410P || (x)==PIX_FMT_YUV420P \ +diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c +index 71759bc..fa83985 100644 +--- a/libswscale/yuv2rgb.c ++++ b/libswscale/yuv2rgb.c +@@ -44,6 +44,10 @@ + #include "yuv2rgb_mlib.c" + #endif + ++#ifdef ARCH_AVR32 ++#include "yuv2rgb_avr32.c" ++#endif ++ + #define DITHER1XBPP // only for mmx + + const uint8_t __attribute__((aligned(8))) dither_2x2_4[2][8]={ +@@ -601,6 +605,12 @@ SwsFunc yuv2rgb_get_func_ptr (SwsContext *c) + if(t) return t; + } + #endif ++#ifdef ARCH_AVR32 ++ { ++ SwsFunc t= yuv2rgb_init_avr32(c); ++ if(t) return t; ++ } ++#endif + #ifdef HAVE_ALTIVEC + if (c->flags & SWS_CPU_CAPS_ALTIVEC) + { +@@ -678,6 +688,10 @@ int yuv2rgb_c_init_tables (SwsContext *c, const int inv_table[4], int fullRange, + //printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv); + oy -= 256*brightness; + ++#ifdef ARCH_AVR32 ++ yuv2rgb_c_init_tables_avr32 (c, inv_table, fullRange, brightness, contrast, saturation); ++#endif ++ + for (i = 0; i < 1024; i++) { + int j; + +diff --git a/libswscale/yuv2rgb_avr32.c b/libswscale/yuv2rgb_avr32.c +new file mode 100644 +index 0000000..4a8341e +--- /dev/null ++++ 
b/libswscale/yuv2rgb_avr32.c +@@ -0,0 +1,416 @@ ++/* ++ * Copyright (c) 2007 Atmel Corporation. All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * 2. Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials provided ++ * with the distribution. ++ * ++ * 3. The name of ATMEL may not be used to endorse or promote products ++ * derived from this software without specific prior written ++ * permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY ATMEL ``AS IS'' AND ANY EXPRESS OR ++ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ * ARE EXPRESSLY AND SPECIFICALLY DISCLAIMED. IN NO EVENT SHALL ATMEL ++ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, ++ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY ++ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH ++ * DAMAGE. 
++ */ ++#include "pico-avr32.h" ++ ++/* RGB(uv_part): set r, g and b from the lookup tables in c for one part of the U and V words; uv_part is pasted into the register part selector (callers pass "t", "u", "l" or "b"). */ ++#define RGB(uv_part) \ ++ __asm__ volatile ( \ ++ "ld.w\t%0, %3[%7:" uv_part " << 2]\n\t" /* tmp = c->table_gV[V] */ \ ++ "ld.w\t%1, %4[%8:" uv_part " << 2]\n\t" /* g = c->table_gU[U] */ \ ++ "ld.w\t%2, %5[%8:" uv_part " << 2]\n\t" /* b = c->table_bU[U] */ \ ++ "add\t%1, %0\n\t" /* g += tmp */\ ++ "ld.w\t%0, %6[%7:" uv_part " << 2]" /* r = c->table_rV[V] */ \ ++ : "=&r" (r), "=&r" (g), "=&r" (b) \ ++ : "r" (&c->table_gV[0]), "r" (&c->table_gU[0]),"r" (&c->table_bU[0]), \ ++ "r" (&c->table_rV[0]), "r" (V), "r" (U)); ++ ++/* YUV2RGB1/2 and YUV2BGR1/2: look up the two luma bytes src[2*idx] and src[2*idx + 1] in the r/g/b tables and store two 3-byte pixels at dst[6*idx .. 6*idx + 5]; the RGB variants store r,g,b and the BGR variants b,g,r. The 1 and 2 variants are textually identical. */ ++#undef YUV2RGB1 ++#define YUV2RGB1(dst, src, y, idx) \ ++ { int tmp2; __asm__ volatile ( \ ++ "ld.ub\t%0, %3[2*%8]\n\t" /* y = src[2*idx] */ \ ++ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \ ++ "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \ ++ "st.b\t%7[6*%8 + 0], %1\n\t" /* dst[6*idx + 0] = tmp (r) */ \ ++ "st.b\t%7[6*%8 + 1], %2\n\t" /* dst[6*idx + 1] = tmp2 (g) */ \ ++ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \ ++ "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* y = src[2*idx + 1] */ \ ++ "st.b\t%7[6*%8 + 2], %1\n\t" /* dst[6*idx + 2] = tmp (b) */ \ ++ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \ ++ "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \ ++ "st.b\t%7[6*%8 + 3], %1\n\t" /* dst[6*idx + 3] = tmp (r) */ \ ++ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \ ++ "st.b\t%7[6*%8 + 4], %2\n\t" /* dst[6*idx + 4] = tmp2 (g) */ \ ++ "st.b\t%7[6*%8 + 5], %1" /* dst[6*idx + 5] = tmp (b) */ \ ++ : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \ ++ : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); } ++ ++#undef YUV2RGB2 ++#define YUV2RGB2(dst, src, y, idx) \ ++ { int tmp2; __asm__ volatile ( \ ++ "ld.ub\t%0, %3[2*%8]\n\t" /* y = src[2*idx] */ \ ++ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \ ++ "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \ ++ "st.b\t%7[6*%8 + 0], %1\n\t" /* dst[6*idx + 0] = tmp (r) */ \ ++ "st.b\t%7[6*%8 + 1], %2\n\t" /* dst[6*idx + 1] = tmp2 (g) */ \ ++
"ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \ ++ "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* y = src[2*idx + 1] */ \ ++ "st.b\t%7[6*%8 + 2], %1\n\t" /* dst[6*idx + 2] = tmp (b) */ \ ++ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \ ++ "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \ ++ "st.b\t%7[6*%8 + 3], %1\n\t" /* dst[6*idx + 3] = tmp (r) */ \ ++ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \ ++ "st.b\t%7[6*%8 + 4], %2\n\t" /* dst[6*idx + 4] = tmp2 (g) */ \ ++ "st.b\t%7[6*%8 + 5], %1" /* dst[6*idx + 5] = tmp (b) */ \ ++ : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \ ++ : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); } ++ ++ ++#undef YUV2BGR1 ++#define YUV2BGR1(dst, src, y, idx) \ ++ { int tmp2; __asm__ volatile ( \ ++ "ld.ub\t%0, %3[2*%8]\n\t" /* y = src[2*idx] */ \ ++ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \ ++ "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \ ++ "st.b\t%7[6*%8 + 2], %1\n\t" /* dst[6*idx + 2] = tmp (r) */ \ ++ "st.b\t%7[6*%8 + 1], %2\n\t" /* dst[6*idx + 1] = tmp2 (g) */ \ ++ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \ ++ "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* y = src[2*idx + 1] */ \ ++ "st.b\t%7[6*%8 + 0], %1\n\t" /* dst[6*idx + 0] = tmp (b) */ \ ++ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \ ++ "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \ ++ "st.b\t%7[6*%8 + 5], %1\n\t" /* dst[6*idx + 5] = tmp (r) */ \ ++ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \ ++ "st.b\t%7[6*%8 + 4], %2\n\t" /* dst[6*idx + 4] = tmp2 (g) */ \ ++ "st.b\t%7[6*%8 + 3], %1" /* dst[6*idx + 3] = tmp (b) */ \ ++ : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \ ++ : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); } ++ ++#undef YUV2BGR2 ++#define YUV2BGR2(dst, src, y, idx) \ ++ { int tmp2; __asm__ volatile ( \ ++ "ld.ub\t%0, %3[2*%8]\n\t" /* y = src[2*idx] */ \ ++ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \ ++ "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \ ++ "st.b\t%7[6*%8 + 2], %1\n\t" /* dst[6*idx + 2]
= tmp; */ \ ++ "st.b\t%7[6*%8 + 1], %2\n\t" /* dst[6*idx + 1] = tmp2 (g) */ \ ++ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \ ++ "ld.ub\t%0, %3[2*%8 + 1]\n\t" /* y = src[2*idx + 1] */ \ ++ "st.b\t%7[6*%8 + 0], %1\n\t" /* dst[6*idx + 0] = tmp (b) */ \ ++ "ld.ub\t%1, %4[%0]\n\t" /* tmp = r[y] */ \ ++ "ld.ub\t%2, %5[%0]\n\t" /* tmp2 = g[y] */ \ ++ "st.b\t%7[6*%8 + 5], %1\n\t" /* dst[6*idx + 5] = tmp (r) */ \ ++ "ld.ub\t%1, %6[%0]\n\t" /* tmp = b[y] */ \ ++ "st.b\t%7[6*%8 + 4], %2\n\t" /* dst[6*idx + 4] = tmp2 (g) */ \ ++ "st.b\t%7[6*%8 + 3], %1" /* dst[6*idx + 3] = tmp (b) */ \ ++ : "=&r" (y), "=&r" (tmp), "=&r" (tmp2) \ ++ : "r" (src), "r" (r), "r" (g), "r" (b), "r" (dst), "i" (idx)); } ++ ++ ++/* Planar YUV to packed 24-bit BGR using the lookup tables in c. Each inner iteration consumes 4 U and 4 V bytes and 8 luma bytes per row and writes 24 bytes to each of two output rows; returns srcSliceH. Doubles srcStride[1] and srcStride[2] in the caller's array when srcFormat is PIX_FMT_YUV422P. NOTE(review): the for-loop header below looks truncated in this copy (text between '<' and '>' is missing) - restore it from the original patch. */ ++int yuv2bgr24_avr32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, ++ int srcSliceH, uint8_t* dst[], int dstStride[]){ ++ int y; ++ ++ if(c->srcFormat == PIX_FMT_YUV422P){ ++ srcStride[1] *= 2; ++ srcStride[2] *= 2; ++ } ++ ++ ++ for(y=0; y>1)*srcStride[1]; ++ uint8_t *pv= src[2] + (y>>1)*srcStride[2]; ++ unsigned int h_size= c->dstW>>3; ++ while (h_size--) { ++ uint32_t U, V, Y1, Y2, tmp; ++ U = ((uint32_t*)pu)[0]; ++ V = ((uint32_t*)pv)[0]; ++ ++ RGB("t") ++ YUV2BGR1(dst_1, py_1, Y1, 0) ++ YUV2BGR1(dst_2, py_2, Y2, 0) ++ ++ RGB("u") ++ YUV2BGR2(dst_1, py_1, Y1, 1) ++ YUV2BGR2(dst_2, py_2, Y2, 1) ++ ++ RGB("l") ++ YUV2BGR1(dst_1, py_1, Y1, 2) ++ YUV2BGR1(dst_2, py_2, Y2, 2) ++ ++ RGB("b") ++ YUV2BGR2(dst_1, py_1, Y1, 3) ++ YUV2BGR2(dst_2, py_2, Y2, 3) ++ ++ ++ ++ pu += 4; ++ pv += 4; ++ py_1 += 8; ++ py_2 += 8; ++ dst_1 += 24; ++ dst_2 += 24; ++ } ++ } ++ return srcSliceH; ++} ++ ++ ++/* Counterpart of yuv2bgr24_avr32 that uses the YUV2RGB1/2 macros, i.e. stores bytes in r,g,b order. NOTE(review): its for-loop header is truncated in this copy as well. */ ++static int yuv2rgb24_avr32(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, ++ int srcSliceH, uint8_t* dst[], int dstStride[]){ ++ int y; ++ ++ if(c->srcFormat == PIX_FMT_YUV422P){ ++ srcStride[1] *= 2; ++ srcStride[2] *= 2; ++ } ++ for(y=0; y>1)*srcStride[1]; ++ uint8_t *pv= src[2] + (y>>1)*srcStride[2]; ++ unsigned int h_size= c->dstW>>3; ++
while (h_size--) { ++ uint32_t U, V, Y1, Y2, tmp; ++ U = ((uint32_t*)pu)[0]; ++ V = ((uint32_t*)pv)[0]; ++ ++ RGB("t") ++ YUV2RGB1(dst_1, py_1, Y1, 0) ++ YUV2RGB1(dst_2, py_2, Y2, 0) ++ ++ RGB("u") ++ YUV2RGB2(dst_1, py_1, Y1, 1) ++ YUV2RGB2(dst_2, py_2, Y2, 1) ++ ++ RGB("l") ++ YUV2RGB1(dst_1, py_1, Y1, 2) ++ YUV2RGB1(dst_2, py_2, Y2, 2) ++ ++ RGB("b") ++ YUV2RGB2(dst_1, py_1, Y1, 3) ++ YUV2RGB2(dst_2, py_2, Y2, 3) ++ ++ pu += 4; ++ pv += 4; ++ py_1 += 8; ++ py_2 += 8; ++ dst_1 += 24; ++ dst_2 += 24; ++ } ++ } ++ return srcSliceH; ++} ++ ++#define SCALE(x, bits) (((x) + ( 1 << (bits - 1))) >> bits) ++#define COEFF_FRAC_BITS 9 ++#define OFFSET_FRAC_BITS 2 ++ ++/* Coefficients used in the pico */ ++static struct { ++ short coeff2_2; ++ short coeff2_3; ++ short coeff2_0; ++ short coeff2_1; ++ short coeff1_2; ++ short coeff1_3; ++ short coeff1_0; ++ short coeff1_1; ++ short coeff0_2; ++ short coeff0_3; ++ short coeff0_0; ++ short coeff0_1; ++} pico_coeff; ++ ++ ++static int yuv2bgr24_avr32_pico(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, ++ int srcSliceH, uint8_t* dst[], int dstStride[]){ ++ int y; ++ static int first_time = 1; ++ ++ /* Initialize pico */ ++ PICO_LDCM_D(&pico_coeff, ++ PICO_REGVECT_COEFF0_A, PICO_REGVECT_COEFF0_B, ++ PICO_REGVECT_COEFF1_A, PICO_REGVECT_COEFF1_B, ++ PICO_REGVECT_COEFF2_A, PICO_REGVECT_COEFF2_B); ++ ++ PICO_PUT_W(PICO_CONFIG, ++ (PICO_PACKED_MODE << PICO_OUTPUT_MODE ++ | PICO_TRANSFORMATION_MODE << PICO_INPUT_MODE ++ | OFFSET_FRAC_BITS << PICO_OFFSET_FRAC_BITS ++ | COEFF_FRAC_BITS << PICO_COEFF_FRAC_BITS)); ++ ++ ++ if(c->srcFormat == PIX_FMT_YUV422P){ ++ srcStride[1] *= 2; ++ srcStride[2] *= 2; ++ } ++ ++ for(y=0; y>1)*srcStride[1]; ++ uint8_t *pv= src[2] + (y>>1)*srcStride[2]; ++ unsigned int h_size= c->dstW>>3; ++ int *py_1_int = (int *)py_1; ++ int *py_2_int = (int *)py_2; ++ int *pu_int = (int *)pu; ++ int *pv_int = (int *)pv; ++ while (h_size--) { ++ PICO_PUT_W(PICO_INPIX0, *py_1_int++); ++ 
PICO_PUT_W(PICO_INPIX1, *pu_int++); ++ PICO_PUT_W(PICO_INPIX2, *pv_int++); ++ PICO_OP(0, 0, 0, 4, 8); ++ PICO_OP(0, 1, 1, 4, 8); ++ PICO_OP(0, 2, 2, 5, 9); ++ PICO_OP(0, 3, 3, 5, 9); ++ PICO_PUT_W(PICO_INPIX0, *py_1_int++); ++ PICO_STCM_W(dst_1, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0); ++ PICO_OP(0, 0, 0, 6, 10); ++ PICO_OP(0, 1, 1, 6, 10); ++ PICO_OP(0, 2, 2, 7, 11); ++ PICO_OP(0, 3, 3, 7, 11); ++ PICO_PUT_W(PICO_INPIX0, *py_2_int++); ++ PICO_STCM_W(dst_1 + 12, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0); ++ ++ PICO_OP(0, 0, 0, 4, 8); ++ PICO_OP(0, 1, 1, 4, 8); ++ PICO_OP(0, 2, 2, 5, 9); ++ PICO_OP(0, 3, 3, 5, 9); ++ PICO_PUT_W(PICO_INPIX0, *py_2_int++); ++ PICO_STCM_W(dst_2, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0); ++ PICO_OP(0, 0, 0, 6, 10); ++ PICO_OP(0, 1, 1, 6, 10); ++ PICO_OP(0, 2, 2, 7, 11); ++ PICO_OP(0, 3, 3, 7, 11); ++ PICO_STCM_W(dst_2 + 12, PICO_REGVECT_OUTPIX2, PICO_REGVECT_OUTPIX1, PICO_REGVECT_OUTPIX0); ++ ++ dst_1 += 24; ++ dst_2 += 24; ++ } ++ } ++ return srcSliceH; ++} ++ ++extern int avr32_use_pico; ++ ++SwsFunc yuv2rgb_init_avr32 (SwsContext *c){ ++ switch(c->dstFormat){ ++ case PIX_FMT_BGR24: ++ { ++ if ( avr32_use_pico ){ ++ MSG_ERR("AVR32 BGR24: Using PICO for color space conversion\n"); ++ return yuv2bgr24_avr32_pico; ++ } else { ++ MSG_ERR("AVR32 BGR24: Using optimized color space conversion\n"); ++ return yuv2bgr24_avr32; ++ } ++ } ++ break; ++ case PIX_FMT_RGB24: ++ { ++ if ( avr32_use_pico ){ ++ MSG_ERR("AVR32 RGB24: Using PICO for color space conversion\n"); ++ return yuv2bgr24_avr32_pico; ++ } else { ++ MSG_ERR("AVR32 RGB24: Using optimized color space conversion\n"); ++ return yuv2rgb24_avr32; ++ } ++ } ++ } ++ return NULL; ++} ++ ++ ++int yuv2rgb_c_init_tables_avr32 (SwsContext *c, const int inv_table[4], int fullRange, int brightness, int contrast, int saturation){ ++ const int isRgb = (c->dstFormat == PIX_FMT_RGB24); ++ ++ int64_t crv = inv_table[0]; ++ 
int64_t cbu = inv_table[1]; ++ int64_t cgu = -inv_table[2]; ++ int64_t cgv = -inv_table[3]; ++ int64_t cy = 1<<16; ++ int64_t oy = 0; ++ ++ if(!fullRange){ ++ cy= (cy*255) / 219; ++ oy= 16<<16; ++ } ++ ++ cy = (cy *contrast )>>16; ++ crv= (crv*contrast * saturation)>>32; ++ cbu= (cbu*contrast * saturation)>>32; ++ cgu= (cgu*contrast * saturation)>>32; ++ cgv= (cgv*contrast * saturation)>>32; ++ ++ oy -= 256*brightness; ++ ++ pico_coeff.coeff1_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* G <- Y */ ++ pico_coeff.coeff1_1 = SCALE(cgu, 16 - COEFF_FRAC_BITS); /* G <- U */ ++ pico_coeff.coeff1_2 = SCALE(cgv, 16 - COEFF_FRAC_BITS); /* G <- V */ ++ pico_coeff.coeff1_3 = (SCALE(-128*cgu - 128*cgv - 16*cy, 16 - OFFSET_FRAC_BITS) ++ + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* G offset */ ++ ++ if ( isRgb ){ ++ pico_coeff.coeff0_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* R <- Y */ ++ pico_coeff.coeff0_1 = 0; /* R <- U */ ++ pico_coeff.coeff0_2 = SCALE(crv, 16 - COEFF_FRAC_BITS); /* R <- V */ ++ pico_coeff.coeff0_3 = (SCALE(-128*crv - 16*cy, 16 - OFFSET_FRAC_BITS) ++ + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* R offset */ ++ ++ pico_coeff.coeff2_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* B <- Y */ ++ pico_coeff.coeff2_1 = SCALE(cbu, 16 - COEFF_FRAC_BITS); /* B <- U */ ++ pico_coeff.coeff2_2 = 0; /* B <- V */ ++ pico_coeff.coeff2_3 = (SCALE(-128*cbu - 16*cy, 16 - OFFSET_FRAC_BITS) ++ + /*0.5*/(1 << (OFFSET_FRAC_BITS-1)));/* B offset */ ++ } else { ++ pico_coeff.coeff2_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* R <- Y */ ++ pico_coeff.coeff2_1 = 0; /* R <- U */ ++ pico_coeff.coeff2_2 = SCALE(crv, 16 - COEFF_FRAC_BITS); /* R <- V */ ++ pico_coeff.coeff2_3 = (SCALE(-128*crv - 16*cy, 16 - OFFSET_FRAC_BITS) ++ + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* R offset */ ++ ++ pico_coeff.coeff0_0 = SCALE(cy, 16 - COEFF_FRAC_BITS); /* B <- Y */ ++ pico_coeff.coeff0_1 = SCALE(cbu, 16 - COEFF_FRAC_BITS); /* B <- U */ ++ pico_coeff.coeff0_2 = 0; /* B <- V */ ++ pico_coeff.coeff0_3 = (SCALE(-128*cbu - 16*cy, 16 - 
OFFSET_FRAC_BITS) ++ + /*0.5*/(1 << (OFFSET_FRAC_BITS-1))); /* B offset */ ++ } ++ ++} ++ ++ ++#undef RGB +diff --git a/libvo/vo_fbdev2.c b/libvo/vo_fbdev2.c +index 053c193..7017770 100644 +--- a/libvo/vo_fbdev2.c ++++ b/libvo/vo_fbdev2.c +@@ -22,6 +22,9 @@ + #include "sub.h" + #include "mp_msg.h" + ++/* Draw directly to framebuffer */ ++#define USE_CONVERT2FB ++ + static vo_info_t info = { + "Framebuffer Device", + "fbdev2", +@@ -178,6 +181,15 @@ static int fb_preinit(int reset) + } + fb_orig_vinfo = fb_vinfo; + ++ /* Reset panning offset */ ++ fb_vinfo.yoffset = 0; ++ if (ioctl(fb_dev_fd, FBIOPAN_DISPLAY, &fb_vinfo)) { ++ mp_msg(MSGT_VO, MSGL_ERR, ++ "[fbdev2] FBIOPAN_DISPLAY failed: %s\n", ++ strerror(errno)); ++ return 0; ++ } ++ + fb_bpp = fb_vinfo.bits_per_pixel; + + /* 16 and 15 bpp is reported as 16 bpp */ +@@ -289,6 +301,10 @@ static int config(uint32_t width, uint32_t height, uint32_t d_width, + mp_msg(MSGT_VO, MSGL_ERR, "[fbdev2] Can't malloc next_frame: %s\n", strerror(errno)); + return 1; + } ++#else ++ if ((fb_line_len * fb_vinfo.yres) <= (fb_finfo.smem_len / 2) ++ && fb_vinfo.yoffset == 0) ++ center += fb_line_len * fb_vinfo.yres; + #endif + if (fs) memset(frame_buffer, '\0', fb_line_len * fb_vinfo.yres); + +@@ -299,14 +315,22 @@ static int query_format(uint32_t format) + { + // open the device, etc. 
+ if (fb_preinit(0)) return 0; +- if ((format & IMGFMT_BGR_MASK) == IMGFMT_BGR) { ++ if ((format & IMGFMT_RGB_MASK) == IMGFMT_RGB) { + int fb_target_bpp = format & 0xff; + set_bpp(&fb_vinfo, fb_target_bpp); + fb_vinfo.xres_virtual = fb_vinfo.xres; +- fb_vinfo.yres_virtual = fb_vinfo.yres; ++ fb_vinfo.yres_virtual = fb_vinfo.yres * 2; + if (ioctl(fb_dev_fd, FBIOPUT_VSCREENINFO, &fb_vinfo)) { +- mp_msg(MSGT_VO, MSGL_ERR, "[fbdev2] Can't put VSCREENINFO: %s\n", strerror(errno)); +- return 0; ++ mp_msg(MSGT_VO, MSGL_WARN, ++ "[fbdev2] Can't double virtual y resolution: %s\n", ++ strerror(errno)); ++ fb_vinfo.yres_virtual = fb_vinfo.yres; ++ if (ioctl(fb_dev_fd, FBIOPUT_VSCREENINFO, &fb_vinfo)) { ++ mp_msg(MSGT_VO, MSGL_ERR, ++ "[fbdev2] Can't put VSCREENINFO: %s\n", ++ strerror(errno)); ++ return -1; ++ } + } + fb_pixel_size = fb_vinfo.bits_per_pixel / 8; + fb_bpp = fb_vinfo.red.length + fb_vinfo.green.length + +@@ -367,16 +391,67 @@ static void check_events(void) + + static void flip_page(void) + { +-#ifndef USE_CONVERT2FB + int i, out_offset = 0, in_offset = 0; + +- for (i = 0; i < in_height; i++) { +- memcpy(center + out_offset, next_frame + in_offset, +- in_width * fb_pixel_size); +- out_offset += fb_line_len; +- in_offset += in_width * fb_pixel_size; +- } ++#ifndef USE_CONVERT2FB ++ if (1) { ++#else ++ if (fb_vinfo.yres_virtual == fb_vinfo.yres) { + #endif ++ for (i = 0; i < in_height; i++) { ++ memcpy(center + out_offset, next_frame + in_offset, ++ in_width * fb_pixel_size); ++ out_offset += fb_line_len; ++ in_offset += in_width * fb_pixel_size; ++ } ++ } else { ++ if (fb_vinfo.yoffset == 0) { ++ fb_vinfo.yoffset += fb_vinfo.yres; ++ center -= fb_line_len * fb_vinfo.yres; ++ } else { ++ fb_vinfo.yoffset = 0; ++ center += fb_line_len * fb_vinfo.yres; ++ } ++ ++ if (ioctl(fb_dev_fd, FBIOPAN_DISPLAY, &fb_vinfo)) { ++ mp_msg(MSGT_VO, MSGL_ERR, ++ "[fbdev2] Can't FBIOPAN_DISPLAY: %s\n", ++ strerror(errno)); ++ } ++ } ++} ++ ++static uint32_t get_image(mp_image_t *mpi) 
++{ ++ if(mpi->flags&MP_IMGFLAG_READABLE) ++ return VO_FALSE; // slow video ram ++ if(mpi->type==MP_IMGTYPE_STATIC) ++ return VO_FALSE; // it is not static ++ ++ if (mpi->flags & (MP_IMGFLAG_ACCEPT_STRIDE | MP_IMGFLAG_ACCEPT_WIDTH)) { ++ // we're lucky or codec accepts stride => ok, let's go! ++ ++ //YUY2 and RGB formats ++ mpi->planes[0] = center; ++ mpi->width = in_width; ++ mpi->stride[0] = fb_line_len; ++ ++ // center image ++ ++ mpi->flags |= MP_IMGFLAG_DIRECT; ++ ++ return VO_TRUE; ++ } ++ ++ return VO_FALSE; ++} ++ ++static uint32_t put_image(mp_image_t *mpi) ++{ ++ // already out? ++ if ((mpi->flags & (MP_IMGFLAG_DIRECT | MP_IMGFLAG_DRAW_CALLBACK))) ++ return VO_TRUE; ++ return VO_FALSE; + } + + static void uninit(void) +@@ -403,6 +478,10 @@ static int control(uint32_t request, void *data, ...) + switch (request) { + case VOCTRL_QUERY_FORMAT: + return query_format(*((uint32_t*)data)); ++ case VOCTRL_GET_IMAGE: ++ return get_image(data); ++ case VOCTRL_DRAW_IMAGE: ++ return put_image(data); + } + return VO_NOTIMPL; + } +diff --git a/version.sh b/version.sh +index 44b5c5d..cf22a68 100755 +--- a/version.sh ++++ b/version.sh +@@ -1,2 +1,2 @@ + #!/bin/sh +-echo "#define VERSION \"1.0rc1-$1\"" > version.h ++echo "#define VERSION \"1.0rc1.atmel.2-$1\"" > version.h diff --git a/package/mplayer/mplayer-1.0rc1-remove-configuration-x11-header-search-path.patch b/package/mplayer/mplayer-1.0rc1-remove-configuration-x11-header-search-path.patch deleted file mode 100644 index d2e00f260..000000000 --- a/package/mplayer/mplayer-1.0rc1-remove-configuration-x11-header-search-path.patch +++ /dev/null @@ -1,11 +0,0 @@ ---- MPlayer-1.0rc1-orig/configure 2007-06-01 18:15:59.000000000 +0200 -+++ MPlayer-1.0rc1/configure 2007-06-01 18:17:38.000000000 +0200 -@@ -3803,7 +3803,7 @@ fi - - - echocheck "X11 headers presence" -- for I in `echo $_inc_extra | sed s/-I//g` /usr/X11/include /usr/X11R6/include /usr/include/X11R6 /usr/include /usr/openwin/include ; do -+ for I in `echo 
$_inc_extra | sed s/-I//g` ; do - if test -f "$I/X11/Xlib.h" ; then - _inc_x11="-I$I" - _x11_headers="yes" -- cgit v1.2.3