From b10fa1bb8b9d4074b1d4dd7364ca236a574c9951 Mon Sep 17 00:00:00 2001 From: Loren Merritt Date: Thu, 3 Dec 2009 18:53:12 +0000 Subject: port ape dsp functions from sse2 to mmx now requires yasm Originally committed as revision 20722 to svn://svn.ffmpeg.org/ffmpeg/trunk --- libavcodec/x86/dsputil_mmx.c | 93 ++++++++--------------------------------- libavcodec/x86/dsputil_yasm.asm | 75 +++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+), 75 deletions(-) diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index af33707df0..93d4af5565 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -2384,6 +2384,12 @@ static void float_to_int16_sse2(int16_t *dst, const float *src, long len){ void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len); void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len); void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len); +void ff_add_int16_mmx2(int16_t * v1, int16_t * v2, int order); +void ff_add_int16_sse2(int16_t * v1, int16_t * v2, int order); +void ff_sub_int16_mmx2(int16_t * v1, int16_t * v2, int order); +void ff_sub_int16_sse2(int16_t * v1, int16_t * v2, int order); +int32_t ff_scalarproduct_int16_mmx2(int16_t * v1, int16_t * v2, int order, int shift); +int32_t ff_scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift); void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top); int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left); int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left); @@ -2507,78 +2513,6 @@ void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, ui int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8); -static void add_int16_sse2(int16_t * v1, int16_t * v2, int order) -{ - x86_reg o = -(order << 1); - v1 += order; - v2 += order; - __asm__ volatile( - "1: \n\t" - "movdqu (%1,%2), %%xmm0 \n\t" - "movdqu 16(%1,%2), %%xmm1 \n\t" - "paddw (%0,%2), %%xmm0 \n\t" - "paddw 16(%0,%2), %%xmm1 \n\t" - "movdqa %%xmm0, (%0,%2) \n\t" - "movdqa %%xmm1, 16(%0,%2) \n\t" - "add $32, %2 \n\t" - "js 1b \n\t" - : "+r"(v1), "+r"(v2), "+r"(o) - ); -} - -static void sub_int16_sse2(int16_t * v1, int16_t * v2, int order) -{ - x86_reg o = -(order << 1); - v1 += order; - v2 += order; - __asm__ volatile( - "1: \n\t" - "movdqa (%0,%2), %%xmm0 \n\t" - "movdqa 16(%0,%2), %%xmm2 \n\t" - "movdqu (%1,%2), %%xmm1 \n\t" - "movdqu 16(%1,%2), %%xmm3 \n\t" - "psubw %%xmm1, %%xmm0 \n\t" - "psubw %%xmm3, %%xmm2 \n\t" - "movdqa %%xmm0, (%0,%2) \n\t" - "movdqa %%xmm2, 16(%0,%2) \n\t" - "add $32, %2 \n\t" - "js 1b \n\t" - : "+r"(v1), "+r"(v2), "+r"(o) - ); -} - -static int32_t scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift) -{ - int res = 0; - DECLARE_ALIGNED_16(xmm_reg, sh); - x86_reg o = -(order << 1); - - v1 += order; - v2 += order; - sh.a = shift; - __asm__ volatile( - "pxor %%xmm7, %%xmm7 \n\t" - "1: \n\t" - "movdqu (%0,%3), %%xmm0 \n\t" - "movdqu 16(%0,%3), %%xmm1 \n\t" - "pmaddwd (%1,%3), %%xmm0 \n\t" - "pmaddwd 16(%1,%3), %%xmm1 \n\t" - "paddd %%xmm0, %%xmm7 \n\t" - "paddd %%xmm1, %%xmm7 \n\t" - "add $32, %3 \n\t" - "js 1b \n\t" - "movhlps %%xmm7, %%xmm2 \n\t" - "paddd %%xmm2, %%xmm7 \n\t" - "psrad %4, %%xmm7 \n\t" - "pshuflw $0x4E, %%xmm7,%%xmm2 \n\t" - "paddd %%xmm2, %%xmm7 \n\t" - "movd %%xmm7, %2 \n\t" - : "+r"(v1), "+r"(v2), "=r"(res), "+r"(o) - : "m"(sh) - ); - return res; -} - void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) { mm_flags = mm_support(); @@ -3015,6 +2949,13 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->float_to_int16_interleave = float_to_int16_interleave_3dn2; } } + if(mm_flags & FF_MM_MMX2){ +#if HAVE_YASM + c->add_int16 = ff_add_int16_mmx2; + c->sub_int16 = ff_sub_int16_mmx2; + c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2; +#endif + } if(mm_flags & FF_MM_SSE){ c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse; c->ac3_downmix = ac3_downmix_sse; @@ -3033,9 +2974,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; c->float_to_int16 = float_to_int16_sse2; c->float_to_int16_interleave = float_to_int16_interleave_sse2; - c->add_int16 = add_int16_sse2; - c->sub_int16 = sub_int16_sse2; - c->scalarproduct_int16 = scalarproduct_int16_sse2; +#if HAVE_YASM + c->add_int16 = ff_add_int16_sse2; + c->sub_int16 = ff_sub_int16_sse2; + c->scalarproduct_int16 = ff_scalarproduct_int16_sse2; +#endif } } diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm index bb6376c64f..c8a4230374 100644 --- a/libavcodec/x86/dsputil_yasm.asm +++ b/libavcodec/x86/dsputil_yasm.asm @@ -99,6 +99,81 @@ FLOAT_TO_INT16_INTERLEAVE6 3dn2 +%macro SCALARPRODUCT 1 +; void add_int16(int16_t * v1, int16_t * v2, int order) +cglobal add_int16_%1, 3,3,2, v1, v2, order + shl orderq, 1 + add v1q, orderq + add v2q, orderq + neg orderq +.loop: + movu m0, [v2q + orderq] + movu m1, [v2q + orderq + mmsize] + paddw m0, [v1q + orderq] + paddw m1, [v1q + orderq + mmsize] + mova [v1q + orderq], m0 + mova [v1q + orderq + mmsize], m1 + add orderq, mmsize*2 + jl .loop + REP_RET + +; void sub_int16(int16_t * v1, int16_t * v2, int order) +cglobal sub_int16_%1, 3,3,4, v1, v2, order + shl orderq, 1 + add v1q, orderq + add v2q, orderq + neg orderq +.loop: + movu m2, [v2q + orderq] + movu m3, [v2q + orderq + mmsize] + mova m0, [v1q + orderq] + mova m1, [v1q + orderq + mmsize] + psubw m0, m2 + psubw m1, m3 + mova [v1q + orderq], m0 + mova [v1q + orderq + mmsize], m1 + add orderq, mmsize*2 + jl .loop + REP_RET + +; int scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift) +cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift + shl orderq, 1 + add v1q, orderq + add v2q, orderq + neg orderq + movd m3, shiftm + pxor m2, m2 +.loop: + movu m0, [v1q + orderq] + movu m1, [v1q + orderq + mmsize] + pmaddwd m0, [v2q + orderq] + pmaddwd m1, [v2q + orderq + mmsize] + paddd m2, m0 + paddd m2, m1 + add orderq, mmsize*2 + jl .loop +%if mmsize == 16 + movhlps m0, m2 + paddd m2, m0 + psrad m2, m3 + pshuflw m0, m2, 0x4e +%else + psrad m2, m3 + pshufw m0, m2, 0x4e +%endif + paddd m2, m0 + movd eax, m2 + RET +%endmacro + +INIT_MMX +SCALARPRODUCT mmx2 +INIT_XMM +SCALARPRODUCT sse2 + + + ; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top movq mm0, [topq] -- cgit v1.2.3