summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Michael Niedermayer <michaelni@gmx.at>, 2014-01-20 03:51:21 +0100
committer: Michael Niedermayer <michaelni@gmx.at>, 2014-01-20 04:06:46 +0100
commit: a493f8541de20e76073433f39f66da31f3834bc4 (patch)
tree: 0df302fd7ae6f808d9dd36240ee665f675509ea7
parent: da0684820a58ce42a5a2953cbce417e06a54be8f (diff)
avcodec/x86/dsp: add_int16_mmx / add_int16_sse2
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
-rw-r--r-- libavcodec/x86/dsputil.asm 65
-rw-r--r-- libavcodec/x86/dsputil_init.c 3
-rw-r--r-- libavcodec/x86/dsputil_x86.h 2
3 files changed, 70 insertions, 0 deletions
diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index 77069e20f8..9450cd8fd6 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -465,6 +465,71 @@ cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
.src_unaligned:
ADD_HFYU_LEFT_LOOP 0, 0
+
+%macro ADD_INT16_LOOP 1 ; %1 = is_aligned (1: mova, 0: movu); dst[i] = (dst[i] + src[i]) & mask; needs dst, src, mask, w, tmp from cglobal
+ movd m4, maskd ; mask is a 32-bit argument; only its low word is consumed below
+ punpcklwd m4, m4 ; three self-interleaves replicate word 0 of m4 ...
+ punpcklwd m4, m4 ; ... into every 16-bit lane, whether m4 is ...
+ punpcklwd m4, m4 ; ... an MMX (4 lanes) or XMM (8 lanes) register
+ add wd, wd ; elements -> bytes; 32-bit op because w is an int and the upper half of wq is not guaranteed clean on x86-64
+ test wq, 2*mmsize - 1 ; is the byte count a whole number of main-loop iterations (2 vectors each)?
+ jz %%.tomainloop
+%%.wordloop: ; scalar tail: peel one 16-bit element off the end per iteration
+ sub wq, 2
+ mov tmpw, [srcq+wq] ; dedicated scratch GPR: ax must not be used, r0 (= dst) is eax on x86-32
+ add tmpw, [dstq+wq]
+ and tmpw, maskw
+ mov [dstq+wq], tmpw
+ test wq, 2*mmsize - 1 ; stop once the remaining byte count is a multiple of 2*mmsize
+ jnz %%.wordloop
+%%.tomainloop:
+ add srcq, wq ; point both buffers at the end of the vector region ...
+ add dstq, wq
+ neg wq ; ... and index with a negative offset that counts up to zero
+ jz %%.end ; nothing left for the vector loop
+%%.loop:
+%if %1
+ mova m0, [srcq+wq]
+ mova m1, [dstq+wq]
+ mova m2, [srcq+wq+mmsize]
+ mova m3, [dstq+wq+mmsize]
+%else
+ movu m0, [srcq+wq]
+ movu m1, [dstq+wq]
+ movu m2, [srcq+wq+mmsize]
+ movu m3, [dstq+wq+mmsize]
+%endif
+ paddw m0, m1 ; wrapping 16-bit add, two vectors per iteration
+ paddw m2, m3
+ pand m0, m4 ; apply the broadcast mask to every lane
+ pand m2, m4
+%if %1
+ mova [dstq+wq] , m0
+ mova [dstq+wq+mmsize], m2
+%else
+ movu [dstq+wq] , m0
+ movu [dstq+wq+mmsize], m2
+%endif
+ add wq, 2*mmsize
+ jl %%.loop ; loop while the offset is still negative
+%%.end:
+ RET
+%endmacro
+
+INIT_MMX mmx
+cglobal add_int16, 4,5,5, dst, src, mask, w, tmp ; 4 args + 1 scratch GPR (tmp) for the scalar tail
+ ADD_INT16_LOOP 1
+
+INIT_XMM sse2
+cglobal add_int16, 4,5,5, dst, src, mask, w, tmp ; 4 args + 1 scratch GPR (tmp) for the scalar tail
+ test srcq, mmsize-1 ; take the mova path only when both pointers are mmsize-aligned
+ jnz .unaligned
+ test dstq, mmsize-1
+ jnz .unaligned
+ ADD_INT16_LOOP 1 ; the macro ends in RET, so there is no fall-through into .unaligned
+.unaligned:
+ ADD_INT16_LOOP 0
+
;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
; int32_t max, unsigned int len)
diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c
index e0b40410a7..08bd29720a 100644
--- a/libavcodec/x86/dsputil_init.c
+++ b/libavcodec/x86/dsputil_init.c
@@ -542,6 +542,7 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
#endif /* HAVE_MMX_INLINE */
#if HAVE_MMX_EXTERNAL
+ c->add_int16 = ff_add_int16_mmx;
c->vector_clip_int32 = ff_vector_clip_int32_mmx;
#endif /* HAVE_MMX_EXTERNAL */
}
@@ -625,6 +626,8 @@ static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
c->vector_clip_int32 = ff_vector_clip_int32_sse2;
}
c->bswap_buf = ff_bswap32_buf_sse2;
+
+ c->add_int16 = ff_add_int16_sse2;
#endif /* HAVE_SSE2_EXTERNAL */
}
diff --git a/libavcodec/x86/dsputil_x86.h b/libavcodec/x86/dsputil_x86.h
index 356b2c142f..e707e55a59 100644
--- a/libavcodec/x86/dsputil_x86.h
+++ b/libavcodec/x86/dsputil_x86.h
@@ -116,6 +116,8 @@ void ff_clear_blocks_mmx(int16_t *blocks);
void ff_clear_blocks_sse(int16_t *blocks);
void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w);
+void ff_add_int16_mmx(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
+void ff_add_int16_sse2(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
void ff_add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
const uint8_t *diff, int w,