Change rounding of the horizontal DWT to match the vertical one.

Matching the rounding allows some simplifications and optimizations and should not have any effect on quality. Originally committed as revision 10172 to svn://svn.ffmpeg.org/ffmpeg/trunk.
author: Michael Niedermayer <michaelni@gmx.at> 2007-08-21 16:29:40 +0000
committer: Michael Niedermayer <michaelni@gmx.at> 2007-08-21 16:29:40 +0000
commit: ce611a27be7c9201b5920d8232e68209529065c4 (patch)
tree: 7d02fb42432300f1ca74277ab0aa77abab7b9fd8 /libavcodec
parent: 7506d47aa320beb00369325b2e868f4a84c36af0 (diff)
3 files changed, 25 insertions, 35 deletions
diff --git a/libavcodec/i386/snowdsp_mmx.c b/libavcodec/i386/snowdsp_mmx.c
index f2eb14b2b3..03f622b756 100644
--- a/libavcodec/i386/snowdsp_mmx.c
+++ b/libavcodec/i386/snowdsp_mmx.c
@@ -111,8 +111,7 @@ void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width){
 
         i = 0;
         asm volatile(
-            "pcmpeqd    %%xmm7, %%xmm7        \n\t"
-            "psrad         $29, %%xmm7        \n\t"
+            "pslld          $1, %%xmm7        \n\t"
         ::);
         for(; i<w_l-7; i+=8){
             asm volatile(
@@ -157,25 +156,21 @@ void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width){
                 "movdqu 20(%1), %%xmm6        \n\t"
                 "paddd    (%1), %%xmm2        \n\t"
                 "paddd  16(%1), %%xmm6        \n\t"
-                "movdqa %%xmm2, %%xmm0        \n\t"
-                "movdqa %%xmm6, %%xmm4        \n\t"
-                "pslld      $2, %%xmm2        \n\t"
-                "pslld      $2, %%xmm6        \n\t"
-                "psubd  %%xmm2, %%xmm0        \n\t"
-                "psubd  %%xmm6, %%xmm4        \n\t"
-                "psrad      $1, %%xmm0        \n\t"
-                "psrad      $1, %%xmm4        \n\t"
-                "movdqu   (%0), %%xmm2        \n\t"
-                "movdqu 16(%0), %%xmm6        \n\t"
-                "psubd  %%xmm0, %%xmm2        \n\t"
-                "psubd  %%xmm4, %%xmm6        \n\t"
+                "movdqu   (%0), %%xmm0        \n\t"
+                "movdqu 16(%0), %%xmm4        \n\t"
+                "paddd  %%xmm2, %%xmm0        \n\t"
+                "paddd  %%xmm6, %%xmm4        \n\t"
+                "psrad      $1, %%xmm2        \n\t"
+                "psrad      $1, %%xmm6        \n\t"
+                "paddd  %%xmm0, %%xmm2        \n\t"
+                "paddd  %%xmm4, %%xmm6        \n\t"
                 "movdqa %%xmm2, (%2)          \n\t"
                 "movdqa %%xmm6, 16(%2)        \n\t"
                 :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
                  : "memory"
                );
         }
-        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO, W_AS);
+        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
     }
 
     {
@@ -291,10 +286,9 @@ void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width){
         DWTELEM * const ref = b+w2 - 1;
 
         i = 1;
-        b[0] = b[0] + (((2 * ref[1] + W_BO-1) + 4 * b[0]) >> W_BS);
+        b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
         asm volatile(
-            "pcmpeqd     %%mm7, %%mm7        \n\t"
-            "psrld         $29, %%mm7        \n\t"
+            "pslld          $1, %%mm7        \n\t"
            ::);
         for(; i<w_l-3; i+=4){
             asm volatile(
@@ -333,16 +327,12 @@ void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width){
                 "movq   12(%1), %%mm6        \n\t"
                 "paddd    (%1), %%mm2        \n\t"
                 "paddd   8(%1), %%mm6        \n\t"
-                "pxor    %%mm0, %%mm0        \n\t" //note: the 2 xor could be avoided if we would flip the rounding direction
-                "pxor    %%mm4, %%mm4        \n\t"
-                "psubd   %%mm2, %%mm0        \n\t"
-                "psubd   %%mm6, %%mm4        \n\t"
-                "psrad      $1, %%mm0        \n\t"
-                "psrad      $1, %%mm4        \n\t"
-                "psubd   %%mm0, %%mm2        \n\t"
-                "psubd   %%mm4, %%mm6        \n\t"
                 "movq     (%0), %%mm0        \n\t"
                 "movq    8(%0), %%mm4        \n\t"
+                "paddd   %%mm2, %%mm0        \n\t"
+                "paddd   %%mm6, %%mm4        \n\t"
+                "psrad      $1, %%mm2        \n\t"
+                "psrad      $1, %%mm6        \n\t"
                 "paddd   %%mm0, %%mm2        \n\t"
                 "paddd   %%mm4, %%mm6        \n\t"
                 "movq    %%mm2, (%2)         \n\t"
@@ -351,7 +341,7 @@ void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width){
                  : "memory"
                );
         }
-        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO, W_AS);
+        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
     }
 
     {
diff --git a/libavcodec/snow.c b/libavcodec/snow.c
index 8f2f4c0944..71049dfae9 100644
--- a/libavcodec/snow.c
+++ b/libavcodec/snow.c
@@ -775,7 +775,7 @@ static av_always_inline void liftS(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int
     int i;
 
     assert(shift == 4);
-#define LIFTS(src, ref, inv) ((inv) ? (src) + (((ref) + 4*(src))>>shift): (16*4*(src) + 4*(ref) + 8 + (5<<27))/(5*16) - (1<<23))
+#define LIFTS(src, ref, inv) ((inv) ? (src) + (((ref) + 4*(src))>>shift): -((-16*4*(src) + 4*(ref) + add + 5 + (5<<27))/(5*16) - (1<<23)))
     if(mirror_left){
         dst[0] = LIFTS(src[0], mul*2*ref[0]+add, inverse);
         dst += dst_step;
@@ -1113,8 +1113,8 @@ static void horizontal_decompose97i(DWTELEM *b, int width){
     DWTELEM temp[width];
     const int w2= (width+1)>>1;
 
-    lift (temp+w2, b    +1, b      , 1, 2, 2, width, -W_AM, W_AO, W_AS, 1, 0);
-    liftS(temp   , b      , temp+w2, 1, 2, 1, width, -W_BM, W_BO, W_BS, 0, 0);
+    lift (temp+w2, b    +1, b      , 1, 2, 2, width,  W_AM, W_AO, W_AS, 1, 1);
+    liftS(temp   , b      , temp+w2, 1, 2, 1, width,  W_BM, W_BO, W_BS, 0, 0);
     lift5(b   +w2, temp+w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 0);
     lift (b      , temp   , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 0);
 }
@@ -1150,7 +1150,7 @@ static void vertical_decompose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int w
 #ifdef liftS
         b1[i] -= (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
 #else
-        b1[i] = (16*4*b1[i] - 4*(b0[i] + b2[i]) + 8*5 + (5<<27)) / (5*16) - (1<<23);
+        b1[i] = (16*4*b1[i] - 4*(b0[i] + b2[i]) + W_BO*5 + (5<<27)) / (5*16) - (1<<23);
 #endif
     }
 }
@@ -1344,8 +1344,8 @@ void ff_snow_horizontal_compose97i(DWTELEM *b, int width){
 
     lift (temp   , b      , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 1);
     lift5(temp+w2, b   +w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 1);
-    liftS(b      , temp   , temp+w2, 2, 1, 1, width,  W_BM, W_BO-1, W_BS, 0, 1);
-    lift (b+1    , temp+w2, b      , 2, 1, 2, width, -W_AM, W_AO, W_AS, 1, 1);
+    liftS(b      , temp   , temp+w2, 2, 1, 1, width,  W_BM, W_BO, W_BS, 0, 1);
+    lift (b+1    , temp+w2, b      , 2, 1, 2, width,  W_AM, W_AO, W_AS, 1, 0);
 }
 
 static void vertical_compose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
diff --git a/libavcodec/snow.h b/libavcodec/snow.h
index 19df4ad791..9dd66031cb 100644
--- a/libavcodec/snow.h
+++ b/libavcodec/snow.h
@@ -165,11 +165,11 @@ static av_always_inline void snow_horizontal_compose_lift_lead_out(int i, DWTELE
 
 static av_always_inline void snow_horizontal_compose_liftS_lead_out(int i, DWTELEM * dst, DWTELEM * src, DWTELEM * ref, int width, int w){
         for(; i<w; i++){
-            dst[i] = src[i] + ((ref[i] + ref[(i+1)]+W_BO-1 + 4 * src[i]) >> W_BS);
+            dst[i] = src[i] + ((ref[i] + ref[(i+1)]+W_BO + 4 * src[i]) >> W_BS);
         }
 
         if(width&1){
-            dst[w] = src[w] + ((2 * ref[w] + W_BO-1 + 4 * src[w]) >> W_BS);
+            dst[w] = src[w] + ((2 * ref[w] + W_BO + 4 * src[w]) >> W_BS);
         }
 }
author	Michael Niedermayer <michaelni@gmx.at>	2007-08-21 16:29:40 +0000
committer	Michael Niedermayer <michaelni@gmx.at>	2007-08-21 16:29:40 +0000
commit	ce611a27be7c9201b5920d8232e68209529065c4 (patch)
tree	7d02fb42432300f1ca74277ab0aa77abab7b9fd8 /libavcodec
parent	7506d47aa320beb00369325b2e868f4a84c36af0 (diff)