author     Kostya Shishkov <kostya.shishkov@gmail.com>  2008-07-11 04:48:38 +0000
committer  Kostya Shishkov <kostya.shishkov@gmail.com>  2008-07-11 04:48:38 +0000
commit     d7e1fc425434371344d5850ac596037a69df53d0 (patch)
tree       aa1f8f295fd49ef94d54b8a937a161d3fd23b291 /libavcodec/i386
parent     3a8322b133115276e247951302f30349a0df2ced (diff)
SSE2 optimizations for Monkey's Audio decoder vector functions
Originally committed as revision 14161 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/i386')
-rw-r--r--  libavcodec/i386/dsputil_mmx.c | 78
1 file changed, 78 insertions, 0 deletions
diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c
index dd6061cb09..9cf2866ef3 100644
--- a/libavcodec/i386/dsputil_mmx.c
+++ b/libavcodec/i386/dsputil_mmx.c
@@ -2092,6 +2092,79 @@ extern void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_st
extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
+
+static void add_int16_sse2(int16_t * v1, int16_t * v2, int order)
+{
+ x86_reg o = -(order << 1);
+ v1 += order;
+ v2 += order;
+ asm volatile(
+ "1: \n\t"
+ "movdqu (%1,%2), %%xmm0 \n\t"
+ "movdqu 16(%1,%2), %%xmm1 \n\t"
+ "paddw (%0,%2), %%xmm0 \n\t"
+ "paddw 16(%0,%2), %%xmm1 \n\t"
+ "movdqa %%xmm0, (%0,%2) \n\t"
+ "movdqa %%xmm1, 16(%0,%2) \n\t"
+ "add $32, %2 \n\t"
+ "js 1b \n\t"
+ : "+r"(v1), "+r"(v2), "+r"(o)
+ );
+}
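
[Note, not part of the patch: the routine above is the SSE2 replacement for the plain C fallback sketched below. Each iteration handles 32 bytes (16 samples) via two xmm registers, and the byte offset o counts up from -2*order toward zero, so the loop needs no separate compare. Because it stores with movdqa and uses an aligned paddw memory operand, it implicitly requires order to be a multiple of 16 and v1 to be 16-byte aligned; v2 is loaded with movdqu and may be unaligned.]

/* Scalar equivalent (sketch of the C fallback): v1[i] += v2[i]. */
static void add_int16_c(int16_t *v1, const int16_t *v2, int order)
{
    while (order--)
        *v1++ += *v2++;
}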
+
+static void sub_int16_sse2(int16_t * v1, int16_t * v2, int order)
+{
+ x86_reg o = -(order << 1);
+ v1 += order;
+ v2 += order;
+ asm volatile(
+ "1: \n\t"
+ "movdqa (%0,%2), %%xmm0 \n\t"
+ "movdqa 16(%0,%2), %%xmm2 \n\t"
+ "movdqu (%1,%2), %%xmm1 \n\t"
+ "movdqu 16(%1,%2), %%xmm3 \n\t"
+ "psubw %%xmm1, %%xmm0 \n\t"
+ "psubw %%xmm3, %%xmm2 \n\t"
+ "movdqa %%xmm0, (%0,%2) \n\t"
+ "movdqa %%xmm2, 16(%0,%2) \n\t"
+ "add $32, %2 \n\t"
+ "js 1b \n\t"
+ : "+r"(v1), "+r"(v2), "+r"(o)
+ );
+}
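
[Note, not part of the patch: same structure as add_int16_sse2, with the same alignment and order constraints. Since psubw subtracts the source operand from the destination register, v1 is kept in the register operand so the result is v1[i] - v2[i]. A scalar sketch:]

/* Scalar equivalent (sketch of the C fallback): v1[i] -= v2[i]. */
static void sub_int16_c(int16_t *v1, const int16_t *v2, int order)
{
    while (order--)
        *v1++ -= *v2++;
}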
+
+static int32_t scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
+{
+ int res = 0;
+ DECLARE_ALIGNED_16(int64_t, sh);
+ x86_reg o = -(order << 1);
+
+ v1 += order;
+ v2 += order;
+ sh = shift;
+ asm volatile(
+ "pxor %%xmm7, %%xmm7 \n\t"
+ "1: \n\t"
+ "movdqu (%0,%3), %%xmm0 \n\t"
+ "movdqu 16(%0,%3), %%xmm1 \n\t"
+ "pmaddwd (%1,%3), %%xmm0 \n\t"
+ "pmaddwd 16(%1,%3), %%xmm1 \n\t"
+ "paddd %%xmm0, %%xmm7 \n\t"
+ "paddd %%xmm1, %%xmm7 \n\t"
+ "add $32, %3 \n\t"
+ "js 1b \n\t"
+ "movhlps %%xmm7, %%xmm2 \n\t"
+ "paddd %%xmm2, %%xmm7 \n\t"
+ "psrad %4, %%xmm7 \n\t"
+ "pshuflw $0x4E, %%xmm7,%%xmm2 \n\t"
+ "paddd %%xmm2, %%xmm7 \n\t"
+ "movd %%xmm7, %2 \n\t"
+ : "+r"(v1), "+r"(v2), "=r"(res), "+r"(o)
+ : "m"(sh)
+ );
+ return res;
+}
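
[Note, not part of the patch: pmaddwd multiplies pairs of signed 16-bit elements and adds adjacent products into 32-bit lanes, which xmm7 accumulates across the loop. The tail is a horizontal reduction: movhlps/paddd folds four partial sums into two, psrad applies the shift, and pshuflw $0x4E swaps the two remaining dwords so the final paddd/movd produces the scalar result. Because the shift is applied to the two folded partial sums rather than to each product, rounding can differ slightly from the C fallback, which looks roughly like:]

/* Scalar equivalent (sketch): dot product with a per-product shift. */
static int32_t scalarproduct_int16_c(int16_t *v1, int16_t *v2, int order, int shift)
{
    int res = 0;
    while (order--)
        res += (*v1++ * *v2++) >> shift;
    return res;
}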
+
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
mm_flags = mm_support();
@@ -2463,6 +2536,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
}
if(mm_flags & MM_3DNOW)
c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
+ if(mm_flags & MM_SSE2){
+ c->add_int16 = add_int16_sse2;
+ c->sub_int16 = sub_int16_sse2;
+ c->scalarproduct_int16 = scalarproduct_int16_sse2;
+ }
}
if (ENABLE_ENCODERS)
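
[Note, not part of the patch: callers reach these routines only through the DSPContext function pointers assigned above, so the SSE2 paths are selected at run time wherever mm_support() reports MM_SSE2. A hypothetical call site; everything except the DSPContext fields and dsputil_init() is an illustrative name:]

DSPContext dsp;
dsputil_init(&dsp, avctx);                    /* fills in SSE2 pointers if available */
dsp.add_int16(predictor, delta, order);       /* predictor/delta are example buffers */
int32_t dot = dsp.scalarproduct_int16(coeffs, samples, order, shift);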