From 5a29589b8101a90feabbfd5ad6ffc9c88ab1157f Mon Sep 17 00:00:00 2001
From: Måns Rullgård <mans@mansr.com>
Date: Sun, 25 Jan 2009 13:04:41 +0000
Subject: ARM: NEON optimised H.264 biweighted prediction

Originally committed as revision 16770 to svn://svn.ffmpeg.org/ffmpeg/trunk
---
 libavcodec/arm/dsputil_neon.c |  34 +++++++++
 libavcodec/arm/h264dsp_neon.S | 168 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 202 insertions(+)

(limited to 'libavcodec/arm')

diff --git a/libavcodec/arm/dsputil_neon.c b/libavcodec/arm/dsputil_neon.c
index e50e16007d..188ae1fb57 100644
--- a/libavcodec/arm/dsputil_neon.c
+++ b/libavcodec/arm/dsputil_neon.c
@@ -92,6 +92,31 @@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
 void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
                                        int beta, int8_t *tc0);
 
+void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride, /* biweight entry points exported from h264dsp_neon.S */
+                                        int log2_den, int weightd, int weights,
+                                        int offset);    /* 16-wide asm body, 16 rows */
+void ff_biweight_h264_pixels_16x8_neon(uint8_t *dst, uint8_t *src, int stride,
+                                       int log2_den, int weightd, int weights,
+                                       int offset);     /* 16-wide asm body, 8 rows */
+void ff_biweight_h264_pixels_8x16_neon(uint8_t *dst, uint8_t *src, int stride,
+                                       int log2_den, int weightd, int weights,
+                                       int offset);     /* 8-wide asm body, 16 rows */
+void ff_biweight_h264_pixels_8x8_neon(uint8_t *dst, uint8_t *src, int stride,
+                                      int log2_den, int weightd, int weights,
+                                      int offset);      /* 8-wide asm body, 8 rows */
+void ff_biweight_h264_pixels_8x4_neon(uint8_t *dst, uint8_t *src, int stride,
+                                      int log2_den, int weightd, int weights,
+                                      int offset);      /* 8-wide asm body, 4 rows */
+void ff_biweight_h264_pixels_4x8_neon(uint8_t *dst, uint8_t *src, int stride,
+                                      int log2_den, int weightd, int weights,
+                                      int offset);      /* 4-wide asm body, 8 rows */
+void ff_biweight_h264_pixels_4x4_neon(uint8_t *dst, uint8_t *src, int stride,
+                                      int log2_den, int weightd, int weights,
+                                      int offset);      /* 4-wide asm body, 4 rows */
+void ff_biweight_h264_pixels_4x2_neon(uint8_t *dst, uint8_t *src, int stride,
+                                      int log2_den, int weightd, int weights,
+                                      int offset);      /* 4-wide asm body, 2 rows (handled by the short tail path) */
+
 void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
 void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
 void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
@@ -176,6 +201,15 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
     c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
     c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
 
+    c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon;
+    c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon;
+    c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon;
+    c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels_8x8_neon;
+    c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels_8x4_neon;
+    c->biweight_h264_pixels_tab[5] = ff_biweight_h264_pixels_4x8_neon;
+    c->biweight_h264_pixels_tab[6] = ff_biweight_h264_pixels_4x4_neon;
+    c->biweight_h264_pixels_tab[7] = ff_biweight_h264_pixels_4x2_neon;
+
     c->h264_idct_add = ff_h264_idct_add_neon;
     c->h264_idct_dc_add = ff_h264_idct_dc_add_neon;
     c->h264_idct_add16      = ff_h264_idct_add16_neon;
diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/h264dsp_neon.S
index e57cb6f1e6..616a8132e5 100644
--- a/libavcodec/arm/h264dsp_neon.S
+++ b/libavcodec/arm/h264dsp_neon.S
@@ -1368,3 +1368,171 @@ function ff_put_h264_qpel16_mc33_neon, export=1
         sub             r1,  r1,  #1
         b               put_h264_qpel16_mc11
         .endfunc
+
+@ Biweighted prediction
+
+        .macro  biweight_16 macs, macd          @ 16-byte-wide loop, two rows per pass; macs/macd = multiply-accumulate ops for the r1 rows / r0 rows
+        vdup.8          d0,  r4                 @ d0 = low byte of r4 in every lane (multiplier for rows read via r0)
+        vdup.8          d1,  r5                 @ d1 = low byte of r5 in every lane (multiplier for rows read via r1)
+        vmov            q2,  q8                 @ first-row accumulators start from the constant held in q8
+        vmov            q3,  q8
+1:      subs            ip,  ip,  #2            @ two rows consumed per pass; these flags feed the bne at the bottom (nothing in between writes flags)
+        vld1.8          {d20-d21},[r0,:128], r2 @ first row via r0, 128-bit aligned access, then r0 += r2
+        \macd           q2,  d0,  d20           @ low 8 pixels
+        pld             [r0]                    @ prefetch the next r0 row
+        \macd           q3,  d0,  d21           @ high 8 pixels
+        vld1.8          {d22-d23},[r1,:128], r2 @ first row via r1, then r1 += r2
+        \macs           q2,  d1,  d22
+        pld             [r1]
+        \macs           q3,  d1,  d23
+        vmov            q12, q8                 @ second-row accumulators, same starting constant
+        vld1.8          {d28-d29},[r0,:128], r2 @ second row via r0
+        vmov            q13, q8
+        \macd           q12, d0,  d28
+        pld             [r0]
+        \macd           q13, d0,  d29
+        vld1.8          {d30-d31},[r1,:128], r2 @ second row via r1
+        \macs           q12, d1,  d30
+        pld             [r1]
+        \macs           q13, d1,  d31
+        vshl.s16        q2,  q2,  q9            @ signed shift by the per-lane count in q9 (a negative count shifts right)
+        vshl.s16        q3,  q3,  q9
+        vqmovun.s16     d4,  q2                 @ saturate signed 16-bit lanes to unsigned 8-bit
+        vqmovun.s16     d5,  q3
+        vshl.s16        q12, q12, q9
+        vshl.s16        q13, q13, q9
+        vqmovun.s16     d24, q12
+        vqmovun.s16     d25, q13
+        vmov            q3,  q8                 @ re-arm the first-row accumulators for the next pass
+        vst1.8          {d4- d5}, [r6,:128], r2 @ store first result row through r6, then r6 += r2
+        vmov            q2,  q8                 @ d4-d5 (q2) held the packed result, so q2 is reset only after the store above is issued
+        vst1.8          {d24-d25},[r6,:128], r2 @ store second result row
+        bne             1b                      @ loop until the subs at the top reached zero
+        pop             {r4-r6, pc}             @ restore r4-r6 and return by loading the saved lr into pc
+        .endm
+
+        .macro  biweight_8 macs, macd           @ 8-byte-wide loop, two rows per pass; macs/macd = multiply-accumulate ops for the r1 rows / r0 rows
+        vdup.8          d0,  r4                 @ d0 = low byte of r4 in every lane (multiplier for rows read via r0)
+        vdup.8          d1,  r5                 @ d1 = low byte of r5 in every lane (multiplier for rows read via r1)
+        vmov            q1,  q8                 @ first-row accumulator starts from the constant held in q8
+        vmov            q10, q8                 @ second-row accumulator, same constant
+1:      subs            ip,  ip,  #2            @ two rows consumed per pass; flags feed the bne at the bottom
+        vld1.8          {d4},[r0,:64], r2       @ first row via r0, 64-bit aligned access, then r0 += r2
+        \macd           q1,  d0,  d4
+        pld             [r0]                    @ prefetch the next r0 row
+        vld1.8          {d5},[r1,:64], r2       @ first row via r1, then r1 += r2
+        \macs           q1,  d1,  d5
+        pld             [r1]
+        vld1.8          {d6},[r0,:64], r2       @ second row via r0
+        \macd           q10, d0,  d6
+        pld             [r0]
+        vld1.8          {d7},[r1,:64], r2       @ second row via r1
+        \macs           q10, d1,  d7
+        pld             [r1]
+        vshl.s16        q1,  q1,  q9            @ signed shift by the per-lane count in q9 (a negative count shifts right)
+        vqmovun.s16     d2,  q1                 @ saturate signed 16-bit lanes to unsigned 8-bit
+        vshl.s16        q10, q10, q9
+        vqmovun.s16     d4,  q10
+        vmov            q10, q8                 @ re-arm the second-row accumulator for the next pass
+        vst1.8          {d2},[r6,:64], r2       @ store first result row through r6, then r6 += r2
+        vmov            q1,  q8                 @ d2 is the low half of q1, so q1 is reset only after the store above is issued
+        vst1.8          {d4},[r6,:64], r2       @ store second result row
+        bne             1b                      @ loop until the subs at the top reached zero
+        pop             {r4-r6, pc}             @ restore r4-r6 and return by loading the saved lr into pc
+        .endm
+
+        .macro  biweight_4 macs, macd           @ 4-byte-wide loop, four rows per pass (two rows packed per d register), with a two-row tail at label 2
+        vdup.8          d0,  r4                 @ d0 = low byte of r4 in every lane (multiplier for rows read via r0)
+        vdup.8          d1,  r5                 @ d1 = low byte of r5 in every lane (multiplier for rows read via r1)
+        vmov            q1,  q8                 @ accumulator for rows 0-1 starts from the constant held in q8
+        vmov            q10, q8                 @ accumulator for rows 2-3, same constant
+1:      subs            ip,  ip,  #4            @ four rows per pass; flags feed both the blt below and the bne at the bottom
+        vld1.32         {d4[0]},[r0,:32], r2    @ row 0 via r0 into the low 32-bit lane, then r0 += r2
+        vld1.32         {d4[1]},[r0,:32], r2    @ row 1 via r0 into the high lane
+        \macd           q1,  d0,  d4
+        pld             [r0]
+        vld1.32         {d5[0]},[r1,:32], r2    @ rows 0 and 1 via r1
+        vld1.32         {d5[1]},[r1,:32], r2
+        \macs           q1,  d1,  d5
+        pld             [r1]
+        blt             2f                      @ ip went negative: fewer than four rows were left, finish the two rows already loaded
+        vld1.32         {d6[0]},[r0,:32], r2    @ rows 2 and 3 via r0
+        vld1.32         {d6[1]},[r0,:32], r2
+        \macd           q10, d0,  d6
+        pld             [r0]
+        vld1.32         {d7[0]},[r1,:32], r2    @ rows 2 and 3 via r1
+        vld1.32         {d7[1]},[r1,:32], r2
+        \macs           q10, d1,  d7
+        pld             [r1]
+        vshl.s16        q1,  q1,  q9            @ signed shift by the per-lane count in q9 (a negative count shifts right)
+        vqmovun.s16     d2,  q1                 @ saturate signed 16-bit lanes to unsigned 8-bit
+        vshl.s16        q10, q10, q9
+        vqmovun.s16     d4,  q10
+        vmov            q10, q8                 @ re-arm the rows 2-3 accumulator for the next pass
+        vst1.32         {d2[0]},[r6,:32], r2    @ store result rows 0 and 1 through r6
+        vst1.32         {d2[1]},[r6,:32], r2
+        vmov            q1,  q8                 @ d2 is the low half of q1, so q1 is reset only after the two stores above are issued
+        vst1.32         {d4[0]},[r6,:32], r2    @ store result rows 2 and 3
+        vst1.32         {d4[1]},[r6,:32], r2
+        bne             1b                      @ loop until the subs at the top reached zero
+        pop             {r4-r6, pc}             @ restore r4-r6 and return by loading the saved lr into pc
+2:      vshl.s16        q1,  q1,  q9            @ tail: only rows 0-1 of this pass exist
+        vqmovun.s16     d2,  q1
+        vst1.32         {d2[0]},[r6,:32], r2
+        vst1.32         {d2[1]},[r6,:32], r2
+        pop             {r4-r6, pc}             @ restore r4-r6 and return
+        .endm
+
+        .macro  biweight_func w                 @ emits the shared body for width w; ip must already hold the row count (see biweight_entry)
+function biweight_h264_pixels_\w\()_neon
+        push            {r4-r6, lr}             @ 16 bytes pushed, so the stack arguments now start at sp+16
+        add             r4,  sp,  #16
+        ldm             r4,  {r4-r6}            @ r4, r5, r6 = three words at sp+16 (weightd, weights, offset in C prototype order)
+        lsr             lr,  r4,  #31           @ lr = sign bit of r4
+        add             r6,  r6,  #1
+        eors            lr,  lr,  r5,  lsr #30  @ lr ^= top two bits of r5; Z is consumed by the beq 10f below (nothing in between writes flags)
+        orr             r6,  r6,  #1            @ r6 = (r6 + 1) | 1
+        vdup.16         q9,  r3                 @ r3 = fourth register argument (log2_den in the C prototype)
+        lsl             r6,  r6,  r3            @ r6 = ((offset + 1) | 1) << r3
+        vmvn            q9,  q9                 @ q9 lanes = ~r3 = -(r3 + 1): used as a vshl count this is a right shift by r3 + 1
+        vdup.16         q8,  r6                 @ q8 lanes = low 16 bits of r6, the starting value of every accumulator
+        mov             r6,  r0                 @ r6 = store pointer; r0 keeps advancing as a read pointer over the same rows
+        beq             10f                     @ lr == 0: r4 sign clear, r5 top bits clear
+        subs            lr,  lr,  #1
+        beq             20f                     @ lr == 1: r4 sign set, r5 top bits clear
+        subs            lr,  lr,  #1
+        beq             30f                     @ lr == 2: r4 sign set, r5 top two bits set
+        b               40f                     @ remaining case, lr == 3: r4 sign clear, r5 top two bits set (assumes bits 31 and 30 of r5 agree - TODO confirm range)
+10:     biweight_\w     vmlal.u8, vmlal.u8      @ both products added; each expansion ends in pop {.., pc}, so the cases never fall through
+20:     rsb             r4,  r4,  #0            @ r4 = -r4, so the replicated byte is the magnitude
+        biweight_\w     vmlal.u8, vmlsl.u8      @ r1-row product added, r0-row product subtracted
+30:     rsb             r4,  r4,  #0            @ negate both multipliers
+        rsb             r5,  r5,  #0
+        biweight_\w     vmlsl.u8, vmlsl.u8      @ both products subtracted
+40:     rsb             r5,  r5,  #0            @ r5 = -r5
+        biweight_\w     vmlsl.u8, vmlal.u8      @ r1-row product subtracted, r0-row product added
+        .endfunc
+        .endm
+
+        .macro  biweight_entry w, h, b=1        @ exported entry for a w x h block; with b=0 no branch is emitted and execution runs on into whatever is assembled next
+function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
+        mov             ip,  #\h                @ ip = row count, counted down by the shared body
+.if \b
+        b               biweight_h264_pixels_\w\()_neon     @ plain branch, lr untouched: the shared body returns directly to our caller
+.endif
+        .endfunc
+        .endm
+
+        biweight_entry  16, 8                   @ 16x8: ip = 8, then branch to the 16-wide body
+        biweight_entry  16, 16, b=0             @ 16x16: no branch, must stay immediately before its body so it can fall through
+        biweight_func   16                      @ NOTE(review): fall-through relies on the function macro (defined outside this patch) emitting nothing before the label - confirm
+
+        biweight_entry  8,  16                  @ 8x16: ip = 16, then branch to the 8-wide body
+        biweight_entry  8,  4                   @ 8x4: ip = 4, then branch to the 8-wide body
+        biweight_entry  8,  8,  b=0             @ 8x8: falls through into the body emitted next
+        biweight_func   8
+
+        biweight_entry  4,  8                   @ 4x8: ip = 8, then branch to the 4-wide body
+        biweight_entry  4,  2                   @ 4x2: ip = 2, finishes via the two-row tail in biweight_4
+        biweight_entry  4,  4,  b=0             @ 4x4: falls through into the body emitted next
+        biweight_func   4
-- 
cgit v1.2.3