From 358b5b1a5958fa12b841aefe09e5e81c7d8bd346 Mon Sep 17 00:00:00 2001
From: Michael Niedermayer <michaelni@gmx.at>
Date: Thu, 25 Feb 2010 23:44:42 +0000
Subject: Get rid of mb2b8_xy and b8_stride; change arrays organized based on
 b8_stride to ones based on mb_stride in h264. About 20 CPU cycles faster
 overall per MB.

Originally committed as revision 22065 to svn://svn.ffmpeg.org/ffmpeg/trunk
---
 libavcodec/h264_direct.c | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

(limited to 'libavcodec/h264_direct.c')

diff --git a/libavcodec/h264_direct.c b/libavcodec/h264_direct.c
index 8977be114f..4d716796cd 100644
--- a/libavcodec/h264_direct.c
+++ b/libavcodec/h264_direct.c
@@ -142,7 +142,7 @@ void ff_h264_direct_ref_list_init(H264Context * const h){
 
 static void pred_spatial_direct_motion(H264Context * const h, int *mb_type){
     MpegEncContext * const s = &h->s;
-    int b8_stride = h->b8_stride;
+    int b8_stride = 2;
     int b4_stride = h->b_stride;
     int mb_xy = h->mb_xy;
     int mb_type_col[2];
@@ -228,7 +228,7 @@ static void pred_spatial_direct_motion(H264Context * const h, int *mb_type){
             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
-            b8_stride *= 3;
+            b8_stride = 2+4*s->mb_stride;
             b4_stride *= 6;
 
             sub_mb_type |= MB_TYPE_16x16|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
@@ -262,12 +262,12 @@ single_col:
 
     l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
     l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
-    l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
-    l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
+    l1ref0 = &h->ref_list[1][0].ref_index [0][4*mb_xy];
+    l1ref1 = &h->ref_list[1][0].ref_index [1][4*mb_xy];
     if(!b8_stride){
         if(s->mb_y&1){
-            l1ref0 += h->b8_stride;
-            l1ref1 += h->b8_stride;
+            l1ref0 += 2;
+            l1ref1 += 2;
             l1mv0  +=  2*b4_stride;
             l1mv1  +=  2*b4_stride;
         }
@@ -342,11 +342,12 @@ single_col:
                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
 
+                assert(b8_stride==2);
                 /* col_zero_flag */
-                if(!IS_INTRA(mb_type_col[0]) && !h->ref_list[1][0].long_ref && (   l1ref0[x8 + y8*b8_stride] == 0
-                                              || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
+                if(!IS_INTRA(mb_type_col[0]) && !h->ref_list[1][0].long_ref && (   l1ref0[i8] == 0
+                                              || (l1ref0[i8] < 0 && l1ref1[i8] == 0
                                                   && h->x264_build>33U))){
-                    const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
+                    const int16_t (*l1mv)[2]= l1ref0[i8] == 0 ? l1mv0 : l1mv1;
                     if(IS_SUB_8X8(sub_mb_type)){
                         const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
@@ -381,7 +382,7 @@ single_col:
 
 static void pred_temp_direct_motion(H264Context * const h, int *mb_type){
     MpegEncContext * const s = &h->s;
-    int b8_stride = h->b8_stride;
+    int b8_stride = 2;
     int b4_stride = h->b_stride;
     int mb_xy = h->mb_xy;
     int mb_type_col[2];
@@ -406,7 +407,7 @@ static void pred_temp_direct_motion(H264Context * const h, int *mb_type){
             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
-            b8_stride *= 3;
+            b8_stride = 2+4*s->mb_stride;
             b4_stride *= 6;
 
             sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
@@ -441,12 +442,12 @@ single_col:
 
     l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
     l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
-    l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
-    l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
+    l1ref0 = &h->ref_list[1][0].ref_index [0][4*mb_xy];
+    l1ref1 = &h->ref_list[1][0].ref_index [1][4*mb_xy];
     if(!b8_stride){
         if(s->mb_y&1){
-            l1ref0 += h->b8_stride;
-            l1ref1 += h->b8_stride;
+            l1ref0 += 2;
+            l1ref1 += 2;
             l1mv0  +=  2*b4_stride;
             l1mv1  +=  2*b4_stride;
         }
@@ -549,11 +550,12 @@ single_col:
                     continue;
                 }
 
-                ref0 = l1ref0[x8 + y8*b8_stride];
+                assert(b8_stride == 2);
+                ref0 = l1ref0[i8];
                 if(ref0 >= 0)
                     ref0 = map_col_to_list0[0][ref0 + ref_offset];
                 else{
-                    ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
+                    ref0 = map_col_to_list0[1][l1ref1[i8] + ref_offset];
                     l1mv= l1mv1;
                 }
                 scale = dist_scale_factor[ref0];
-- 
cgit v1.2.3