summaryrefslogtreecommitdiff
path: root/libavcodec/i386
diff options
context:
space:
mode:
author Loren Merritt <lorenm@u.washington.edu> 2007-05-12 02:41:25 +0000
committer Loren Merritt <lorenm@u.washington.edu> 2007-05-12 02:41:25 +0000
commit 1edbfe19948e3852922660fe01252ff7d37ead72 (patch)
tree bf4723612da5d004fb35f7ad18dd9e024d6002dc /libavcodec/i386
parent 561f940c03de8904433efca63b084ca2d93c3126 (diff)
Factor sum_abs_dctelem out of dct_sad, and add SIMD implementations of it.
sum_abs_dctelem_* alone (cycles):
  core2: c=186 mmx2=39 sse2=21 ssse3=13
  k8:    c=163 mmx2=33 sse2=31
  p4:    c=370 mmx2=60 sse2=60
dct_sad including sum_abs_dctelem_* (cycles):
  core2: c=405 mmx2=258 sse2=240 ssse3=232
  k8:    c=624 mmx2=394 sse2=392
  p4:    c=849 mmx2=556 sse2=556
Originally committed as revision 9001 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/i386')
-rw-r--r-- libavcodec/i386/dsputil_mmx.c | 72
1 files changed, 72 insertions, 0 deletions
diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c
index 8a7d85855b..39f85d365d 100644
--- a/libavcodec/i386/dsputil_mmx.c
+++ b/libavcodec/i386/dsputil_mmx.c
@@ -1649,6 +1649,9 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t
"movq "#c", "#o"+16(%1) \n\t"\
"movq "#d", "#o"+24(%1) \n\t"\
+/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
+ * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
+ * and it's even less likely that no alternative mv/mode with a lower cost exists. */
#define HSUM_MMX(a, t, dst)\
"movq "#a", "#t" \n\t"\
"psrlq $32, "#a" \n\t"\
@@ -1802,6 +1805,71 @@ HADAMARD8_DIFF_SSE2(ssse3)
#undef MMABS_SUM_8x8
#endif
+#define DCT_SAD4(m,mm,o)\
+ "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
+ "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
+ "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
+ "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\
+ MMABS_SUM(mm##2, mm##6, mm##0)\
+ MMABS_SUM(mm##3, mm##7, mm##1)\
+ MMABS_SUM(mm##4, mm##6, mm##0)\
+ MMABS_SUM(mm##5, mm##7, mm##1)\
+/* DCT_SAD4(m,mm,o), defined directly above: emits four mov<m> loads from byte
+ * offsets o+0, o+16, o+32 and o+48 of asm operand 1 into registers <mm>2..<mm>5,
+ * then passes each loaded register to MMABS_SUM with <mm>6 or <mm>7 as second
+ * argument and <mm>0 or <mm>1 as third argument (alternating). MMABS_SUM is
+ * defined outside this excerpt; presumably it adds the absolute values of its
+ * first argument into its third, using the second as scratch -- TODO confirm.
+ *
+ * DCT_SAD_MMX, defined below: asm body for the 64-bit MMX register file.
+ * It clears mm0 and mm1, runs DCT_SAD4 with movq at offsets 0, 8, 64 and 72
+ * (8-byte loads that together touch bytes 0..127 behind operand 1), folds mm1
+ * into mm0 with the saturating unsigned word add paddusw, and finally lets
+ * HSUM reduce mm0 (mm1 as temporary) into asm operand 0.
+ */
+#define DCT_SAD_MMX\
+ "pxor %%mm0, %%mm0 \n\t"\
+ "pxor %%mm1, %%mm1 \n\t"\
+ DCT_SAD4(q, %%mm, 0)\
+ DCT_SAD4(q, %%mm, 8)\
+ DCT_SAD4(q, %%mm, 64)\
+ DCT_SAD4(q, %%mm, 72)\
+ "paddusw %%mm1, %%mm0 \n\t"\
+ HSUM(%%mm0, %%mm1, %0)
+/* DCT_SAD_SSE2: asm body for the 128-bit XMM register file.
+ * It clears xmm0 and xmm1, runs DCT_SAD4 with movdqa at offsets 0 and 64
+ * (16-byte loads that together touch bytes 0..127 behind operand 1), folds
+ * xmm1 into xmm0 with the saturating unsigned word add paddusw, and lets HSUM
+ * reduce xmm0 (xmm1 as temporary) into asm operand 0.
+ * movdqa is the aligned 128-bit move, so the pointer in operand 1 presumably
+ * has to be 16-byte aligned -- verify against the callers.
+ */
+#define DCT_SAD_SSE2\
+ "pxor %%xmm0, %%xmm0 \n\t"\
+ "pxor %%xmm1, %%xmm1 \n\t"\
+ DCT_SAD4(dqa, %%xmm, 0)\
+ DCT_SAD4(dqa, %%xmm, 64)\
+ "paddusw %%xmm1, %%xmm0 \n\t"\
+ HSUM(%%xmm0, %%xmm1, %0)
+/* DCT_SAD_FUNC(cpu): defines static int sum_abs_dctelem_<cpu>(DCTELEM *block).
+ * The function body is a single asm statement whose text is whatever DCT_SAD
+ * expands to at the point of instantiation. block is passed in a register as
+ * asm operand 1 and the result is read back from register operand 0.
+ * Only the low 16 bits of that result are returned (sum&0xFFFF).
+ * NOTE(review): the asm statement declares no clobber list and no memory
+ * operand, although the DCT_SAD bodies write mm/xmm registers and read memory
+ * through operand 1 -- confirm this is acceptable for the targeted compilers.
+ */
+#define DCT_SAD_FUNC(cpu) \
+static int sum_abs_dctelem_##cpu(DCTELEM *block){\
+ int sum;\
+ asm volatile(\
+ DCT_SAD\
+ :"=r"(sum)\
+ :"r"(block)\
+ );\
+ return sum&0xFFFF;\
+}
+/* Instantiate sum_abs_dctelem_<cpu> once per instruction set by rebinding
+ * DCT_SAD, HSUM and MMABS before each DCT_SAD_FUNC() use. HSUM is used directly
+ * by the DCT_SAD bodies; MMABS is presumably consumed by MMABS_SUM, which is
+ * defined outside this excerpt. The order of the #define/#undef lines matters.
+ * mmx variant: DCT_SAD_MMX body with HSUM_MMX and MMABS_MMX. */
+#define DCT_SAD DCT_SAD_MMX
+#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
+#define MMABS(a,z) MMABS_MMX(a,z)
+DCT_SAD_FUNC(mmx)
+#undef MMABS
+#undef HSUM
+/* mmx2 variant: DCT_SAD is still DCT_SAD_MMX here; only HSUM and MMABS are
+ * rebound, to HSUM_MMX2 and MMABS_MMX2. */
+#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
+#define MMABS(a,z) MMABS_MMX2(a,z)
+DCT_SAD_FUNC(mmx2)
+#undef HSUM
+#undef DCT_SAD
+/* sse2 variant: DCT_SAD_SSE2 body with HSUM_SSE2. MMABS is not redefined, so
+ * it is still bound to MMABS_MMX2 from the mmx2 section and is only undefined
+ * after this instantiation. */
+#define DCT_SAD DCT_SAD_SSE2
+#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
+DCT_SAD_FUNC(sse2)
+#undef MMABS
+/* ssse3 variant, built only when HAVE_SSSE3 is defined: same DCT_SAD and HSUM
+ * bindings as sse2, with MMABS rebound to MMABS_SSSE3. */
+#ifdef HAVE_SSSE3
+#define MMABS(a,z) MMABS_SSSE3(a,z)
+DCT_SAD_FUNC(ssse3)
+#undef MMABS
+#endif
+#undef HSUM
+#undef DCT_SAD
+
static int ssd_int8_vs_int16_mmx(int8_t *pix1, int16_t *pix2, int size){
int sum;
long i=size;
@@ -3298,6 +3366,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->add_bytes= add_bytes_mmx;
#ifdef CONFIG_ENCODERS
c->diff_bytes= diff_bytes_mmx;
+ c->sum_abs_dctelem= sum_abs_dctelem_mmx;
c->hadamard8_diff[0]= hadamard8_diff16_mmx;
c->hadamard8_diff[1]= hadamard8_diff_mmx;
@@ -3350,6 +3419,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
#ifdef CONFIG_ENCODERS
+ c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
c->hadamard8_diff[1]= hadamard8_diff_mmx2;
c->vsad[4]= vsad_intra16_mmx2;
@@ -3569,12 +3639,14 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
#ifdef CONFIG_ENCODERS
if(mm_flags & MM_SSE2){
+ c->sum_abs_dctelem= sum_abs_dctelem_sse2;
c->hadamard8_diff[0]= hadamard8_diff16_sse2;
c->hadamard8_diff[1]= hadamard8_diff_sse2;
}
#ifdef HAVE_SSSE3
if(mm_flags & MM_SSSE3){
+ c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
c->hadamard8_diff[1]= hadamard8_diff_ssse3;
}