From 5900637219ccccdd39ddafa4e7181da20b8e1f1b Mon Sep 17 00:00:00 2001
From: Loren Merritt <lorenm@u.washington.edu>
Date: Fri, 30 Mar 2007 19:15:31 +0000
Subject: mmx 16-bit ssd. 2.3x faster svq1 encoding.

Originally committed as revision 8559 to svn://svn.ffmpeg.org/ffmpeg/trunk
---
 libavcodec/dsputil.c          | 10 ++++++++++
 libavcodec/dsputil.h          |  2 ++
 libavcodec/i386/dsputil_mmx.c | 34 ++++++++++++++++++++++++++++++++++
 libavcodec/svq1.c             |  9 ++-------
 4 files changed, 48 insertions(+), 7 deletions(-)

diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index 6aa54538ec..3f5e845e7e 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -3694,6 +3694,14 @@ static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int st
     return score;
 }
 
+static int ssd_int8_vs_int16_c(int8_t *pix1, int16_t *pix2, int size){
+    /* Plain C version: returns the sum over k in [0,size) of (pix1[k]-pix2[k])^2. */
+    int k, delta, total = 0;
+    for(k = 0; k < size; k++)
+        delta = pix1[k] - pix2[k], total += delta * delta; /* square each difference once */
+    return total;
+}
+
 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
@@ -4076,6 +4084,8 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
     c->w97[1]= w97_8_c;
 #endif
 
+    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
+
     c->add_bytes= add_bytes_c;
     c->diff_bytes= diff_bytes_c;
     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index 157426748c..19849dd246 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -200,6 +200,8 @@ typedef struct DSPContext {
     me_cmp_func ildct_cmp[5]; //only width 16 used
     me_cmp_func frame_skip_cmp[5]; //only width 8 used
 
+    int (*ssd_int8_vs_int16)(int8_t *pix1, int16_t *pix2, int size);
+
     /**
      * Halfpel motion compensation with rounding (a+b+1)>>1.
      * this is an array[4][4] of motion compensation funcions for 4
diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c
index 19f6ce8a45..23a717acdd 100644
--- a/libavcodec/i386/dsputil_mmx.c
+++ b/libavcodec/i386/dsputil_mmx.c
@@ -1730,6 +1730,38 @@ static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride
 
 WARPER8_16_SQ(hadamard8_diff_mmx, hadamard8_diff16_mmx)
 WARPER8_16_SQ(hadamard8_diff_mmx2, hadamard8_diff16_mmx2)
+
+static int ssd_int8_vs_int16_mmx(int8_t *pix1, int16_t *pix2, int size){ /* MMX version: sum of (pix2[k]-pix1[k])^2 over the size elements */
+    int sum; /* receives the low 32 bits of the MMX accumulator */
+    long i=size; /* element index, lowered by 8 per pass; the loop runs at least once, so size is expected to be a positive multiple of 8 */
+    asm volatile(
+        "pxor %%mm4, %%mm4 \n" /* mm4 = 0: accumulator holding two 32-bit partial sums */
+        "1: \n" /* loop head: each pass handles 8 elements */
+        "sub $8, %0 \n" /* i -= 8; the jg at the bottom branches on the flags set here */
+        "movq (%2,%0), %%mm2 \n" /* mm2 = pix1[i..i+7] (8 bytes) */
+        "movq (%3,%0,2), %%mm0 \n" /* mm0 = pix2[i..i+3] (4 words, byte offset 2*i) */
+        "movq 8(%3,%0,2), %%mm1 \n" /* mm1 = pix2[i+4..i+7] */
+        "punpckhbw %%mm2, %%mm3 \n" /* high byte of each mm3 word = pix1[i+4..i+7]; low bytes are leftover mm3 contents */
+        "punpcklbw %%mm2, %%mm2 \n" /* each mm2 word = pix1[i..i+3] byte, duplicated in both halves */
+        "psraw $8, %%mm3 \n" /* arithmetic shift discards the low byte: mm3 = sign-extended pix1[i+4..i+7] */
+        "psraw $8, %%mm2 \n" /* mm2 = sign-extended pix1[i..i+3] */
+        "psubw %%mm3, %%mm1 \n" /* mm1 = pix2 - pix1 for elements i+4..i+7 (16-bit subtraction, wraps on overflow) */
+        "psubw %%mm2, %%mm0 \n" /* mm0 = pix2 - pix1 for elements i..i+3 */
+        "pmaddwd %%mm1, %%mm1 \n" /* square every word and add adjacent pairs -> two dwords */
+        "pmaddwd %%mm0, %%mm0 \n" /* same for the lower four differences */
+        "paddd %%mm1, %%mm4 \n" /* accumulate the upper-half squares */
+        "paddd %%mm0, %%mm4 \n" /* accumulate the lower-half squares */
+        "jg 1b \n" /* repeat while i > 0, using the flags from the sub above */
+        "movq %%mm4, %%mm3 \n" /* horizontal add: copy the accumulator ... */
+        "psrlq $32, %%mm3 \n" /* ... bring its high dword down ... */
+        "paddd %%mm3, %%mm4 \n" /* ... so the low dword of mm4 holds the total */
+        "movd %%mm4, %1 \n" /* sum = low 32 bits of mm4 */
+        :"+r"(i), "=r"(sum) /* %0 = i (read/write), %1 = sum (write-only) */
+        :"r"(pix1), "r"(pix2) /* %2 = pix1, %3 = pix2 */
+    ); /* NOTE(review): no clobber list (mm0-mm4 / "memory") and no emms here; long is presumably pointer-sized on the intended targets - confirm */
+    return sum;
+}
+
 #endif //CONFIG_ENCODERS
 
 #define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
@@ -3215,6 +3247,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
         }
         c->add_8x8basis= add_8x8basis_mmx;
 
+        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
+
 #endif //CONFIG_ENCODERS
 
         c->h263_v_loop_filter= h263_v_loop_filter_mmx;
diff --git a/libavcodec/svq1.c b/libavcodec/svq1.c
index 55595b7baa..9337dc1897 100644
--- a/libavcodec/svq1.c
+++ b/libavcodec/svq1.c
@@ -992,15 +992,10 @@ static int encode_block(SVQ1Context *s, uint8_t *src, uint8_t *ref, uint8_t *dec
 
             for(i=0; i<16; i++){
                 int sum= codebook_sum[stage*16 + i];
-                int sqr=0;
-                int diff, mean, score;
+                int sqr, diff, mean, score;
 
                 vector = codebook + stage*size*16 + i*size;
-
-                for(j=0; j<size; j++){
-                    int v= vector[j];
-                    sqr += (v - block[stage][j])*(v - block[stage][j]);
-                }
+                sqr = s->dsp.ssd_int8_vs_int16(vector, block[stage], size);
                 diff= block_sum[stage] - sum;
                 mean= (diff + (size>>1)) >> (level+3);
                 assert(mean >-300 && mean<300);
-- 
cgit v1.2.3