author    Loren Merritt <lorenm@u.washington.edu>  2009-12-03 18:53:12 +0000
committer Loren Merritt <lorenm@u.washington.edu>  2009-12-03 18:53:12 +0000
commit    b10fa1bb8b9d4074b1d4dd7364ca236a574c9951
tree      b7f75f84fb13d8a68ee8079da1f03089cc33b9c5
parent    f415be684dda3958332d19e0854c080378eef4c2
port ape dsp functions from sse2 to mmx2
now requires yasm

Originally committed as revision 20722 to svn://svn.ffmpeg.org/ffmpeg/trunk
-rw-r--r--  libavcodec/x86/dsputil_mmx.c     |  93
-rw-r--r--  libavcodec/x86/dsputil_yasm.asm  |  75
2 files changed, 93 insertions(+), 75 deletions(-)
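
The three functions being moved to yasm have simple scalar semantics. For
reference, a minimal C sketch, assuming the same behavior as the plain-C
versions in dsputil.c of this era (note the scalar reference applies the
shift per product, which the SIMD code approximates by shifting partial
sums instead):

    #include <stdint.h>

    /* v1[i] += v2[i] */
    static void add_int16_c(int16_t *v1, int16_t *v2, int order)
    {
        while (order--)
            *v1++ += *v2++;
    }

    /* v1[i] -= v2[i] */
    static void sub_int16_c(int16_t *v1, int16_t *v2, int order)
    {
        while (order--)
            *v1++ -= *v2++;
    }

    /* sum of (v1[i] * v2[i]) >> shift */
    static int32_t scalarproduct_int16_c(int16_t *v1, int16_t *v2,
                                         int order, int shift)
    {
        int32_t res = 0;
        while (order--)
            res += (*v1++ * *v2++) >> shift;
        return res;
    }
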
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index af33707df0..93d4af5565 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2384,6 +2384,12 @@ static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
+void ff_add_int16_mmx2(int16_t * v1, int16_t * v2, int order);
+void ff_add_int16_sse2(int16_t * v1, int16_t * v2, int order);
+void ff_sub_int16_mmx2(int16_t * v1, int16_t * v2, int order);
+void ff_sub_int16_sse2(int16_t * v1, int16_t * v2, int order);
+int32_t ff_scalarproduct_int16_mmx2(int16_t * v1, int16_t * v2, int order, int shift);
+int32_t ff_scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift);
void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
@@ -2507,78 +2513,6 @@ void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, ui
int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
-static void add_int16_sse2(int16_t * v1, int16_t * v2, int order)
-{
- x86_reg o = -(order << 1);
- v1 += order;
- v2 += order;
- __asm__ volatile(
- "1: \n\t"
- "movdqu (%1,%2), %%xmm0 \n\t"
- "movdqu 16(%1,%2), %%xmm1 \n\t"
- "paddw (%0,%2), %%xmm0 \n\t"
- "paddw 16(%0,%2), %%xmm1 \n\t"
- "movdqa %%xmm0, (%0,%2) \n\t"
- "movdqa %%xmm1, 16(%0,%2) \n\t"
- "add $32, %2 \n\t"
- "js 1b \n\t"
- : "+r"(v1), "+r"(v2), "+r"(o)
- );
-}
-
-static void sub_int16_sse2(int16_t * v1, int16_t * v2, int order)
-{
- x86_reg o = -(order << 1);
- v1 += order;
- v2 += order;
- __asm__ volatile(
- "1: \n\t"
- "movdqa (%0,%2), %%xmm0 \n\t"
- "movdqa 16(%0,%2), %%xmm2 \n\t"
- "movdqu (%1,%2), %%xmm1 \n\t"
- "movdqu 16(%1,%2), %%xmm3 \n\t"
- "psubw %%xmm1, %%xmm0 \n\t"
- "psubw %%xmm3, %%xmm2 \n\t"
- "movdqa %%xmm0, (%0,%2) \n\t"
- "movdqa %%xmm2, 16(%0,%2) \n\t"
- "add $32, %2 \n\t"
- "js 1b \n\t"
- : "+r"(v1), "+r"(v2), "+r"(o)
- );
-}
-
-static int32_t scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
-{
- int res = 0;
- DECLARE_ALIGNED_16(xmm_reg, sh);
- x86_reg o = -(order << 1);
-
- v1 += order;
- v2 += order;
- sh.a = shift;
- __asm__ volatile(
- "pxor %%xmm7, %%xmm7 \n\t"
- "1: \n\t"
- "movdqu (%0,%3), %%xmm0 \n\t"
- "movdqu 16(%0,%3), %%xmm1 \n\t"
- "pmaddwd (%1,%3), %%xmm0 \n\t"
- "pmaddwd 16(%1,%3), %%xmm1 \n\t"
- "paddd %%xmm0, %%xmm7 \n\t"
- "paddd %%xmm1, %%xmm7 \n\t"
- "add $32, %3 \n\t"
- "js 1b \n\t"
- "movhlps %%xmm7, %%xmm2 \n\t"
- "paddd %%xmm2, %%xmm7 \n\t"
- "psrad %4, %%xmm7 \n\t"
- "pshuflw $0x4E, %%xmm7,%%xmm2 \n\t"
- "paddd %%xmm2, %%xmm7 \n\t"
- "movd %%xmm7, %2 \n\t"
- : "+r"(v1), "+r"(v2), "=r"(res), "+r"(o)
- : "m"(sh)
- );
- return res;
-}
-
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
mm_flags = mm_support();
@@ -3015,6 +2949,13 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
}
}
+ if(mm_flags & FF_MM_MMX2){
+#if HAVE_YASM
+ c->add_int16 = ff_add_int16_mmx2;
+ c->sub_int16 = ff_sub_int16_mmx2;
+ c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
+#endif
+ }
if(mm_flags & FF_MM_SSE){
c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
c->ac3_downmix = ac3_downmix_sse;
@@ -3033,9 +2974,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
c->float_to_int16 = float_to_int16_sse2;
c->float_to_int16_interleave = float_to_int16_interleave_sse2;
- c->add_int16 = add_int16_sse2;
- c->sub_int16 = sub_int16_sse2;
- c->scalarproduct_int16 = scalarproduct_int16_sse2;
+#if HAVE_YASM
+ c->add_int16 = ff_add_int16_sse2;
+ c->sub_int16 = ff_sub_int16_sse2;
+ c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
+#endif
}
}
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index bb6376c64f..c8a4230374 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -99,6 +99,81 @@ FLOAT_TO_INT16_INTERLEAVE6 3dn2
+%macro SCALARPRODUCT 1
+; void add_int16(int16_t * v1, int16_t * v2, int order)
+cglobal add_int16_%1, 3,3,2, v1, v2, order
+ shl orderq, 1
+ add v1q, orderq
+ add v2q, orderq
+ neg orderq
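+; v1/v2 now point at the ends of the vectors and orderq counts up from
+; -2*order to 0, serving as both byte offset and loop counter
+; (the same idiom is used in all three functions below)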
+.loop:
+ movu m0, [v2q + orderq]
+ movu m1, [v2q + orderq + mmsize]
+ paddw m0, [v1q + orderq]
+ paddw m1, [v1q + orderq + mmsize]
+ mova [v1q + orderq], m0
+ mova [v1q + orderq + mmsize], m1
+ add orderq, mmsize*2
+ jl .loop
+ REP_RET
+
+; void sub_int16(int16_t * v1, int16_t * v2, int order)
+cglobal sub_int16_%1, 3,3,4, v1, v2, order
+ shl orderq, 1
+ add v1q, orderq
+ add v2q, orderq
+ neg orderq
+.loop:
+ movu m2, [v2q + orderq]
+ movu m3, [v2q + orderq + mmsize]
+ mova m0, [v1q + orderq]
+ mova m1, [v1q + orderq + mmsize]
+ psubw m0, m2
+ psubw m1, m3
+ mova [v1q + orderq], m0
+ mova [v1q + orderq + mmsize], m1
+ add orderq, mmsize*2
+ jl .loop
+ REP_RET
+
+; int scalarproduct_int16(int16_t * v1, int16_t * v2, int order, int shift)
+cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
+ shl orderq, 1
+ add v1q, orderq
+ add v2q, orderq
+ neg orderq
+ movd m3, shiftm
+ pxor m2, m2
+.loop:
+ movu m0, [v1q + orderq]
+ movu m1, [v1q + orderq + mmsize]
+ pmaddwd m0, [v2q + orderq]
+ pmaddwd m1, [v2q + orderq + mmsize]
+ paddd m2, m0
+ paddd m2, m1
+ add orderq, mmsize*2
+ jl .loop
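+; fold the per-lane dword sums in m2 down to a single dword; the shift is
+; applied to the folded partial sums, not per product as in the C reference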
+%if mmsize == 16
+ movhlps m0, m2
+ paddd m2, m0
+ psrad m2, m3
+ pshuflw m0, m2, 0x4e
+%else
+ psrad m2, m3
+ pshufw m0, m2, 0x4e
+%endif
+ paddd m2, m0
+ movd eax, m2
+ RET
+%endmacro
+
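+; INIT_MMX/INIT_XMM (from x86inc) set mmsize to 8 or 16 and remap the m#
+; register names, so each SCALARPRODUCT invocation below emits a complete
+; mmx2 or sse2 instance of the three functions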
+INIT_MMX
+SCALARPRODUCT mmx2
+INIT_XMM
+SCALARPRODUCT sse2
+
+
+
; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
movq mm0, [topq]
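
For context, a hedged sketch of how a caller such as the APE decoder reaches
these functions after this change: dsputil_init() fills the DSPContext
function pointers according to the detected CPU flags, so call sites stay the
same whether the C, mmx2 or sse2 version runs. The update step below is a
hypothetical sign-sign adaptation loosely modeled on APE's adaptive filter,
not the decoder's exact code:

    #include "dsputil.h"

    /* hypothetical helper: compute the prediction, then nudge the
     * coefficients in the direction that reduces it */
    static void filter_step(DSPContext *dsp, int16_t *delay,
                            int16_t *coeffs, int order, int shift)
    {
        int32_t pred = dsp->scalarproduct_int16(delay, coeffs, order, shift);
        if (pred > 0)
            dsp->sub_int16(coeffs, delay, order);  /* coeffs[i] -= delay[i] */
        else if (pred < 0)
            dsp->add_int16(coeffs, delay, order);  /* coeffs[i] += delay[i] */
    }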