From 5a29589b8101a90feabbfd5ad6ffc9c88ab1157f Mon Sep 17 00:00:00 2001
From: Måns Rullgård
Date: Sun, 25 Jan 2009 13:04:41 +0000
Subject: ARM: NEON optimised H.264 biweighted prediction

Originally committed as revision 16770 to svn://svn.ffmpeg.org/ffmpeg/trunk
---
Reviewer notes (not part of the commit; everything between the "---" line
and the first "diff --git" line is commentary for the reader):

  * Line structure was restored from a whitespace-collapsed copy.  The
    number of added lines matches the hunk headers (25 + 9 + 168 = 202),
    but column alignment inside each line is conventional, not recovered.
  * Each ff_biweight_h264_pixels_WxH_neon entry point only loads the row
    count H into ip.  Entries declared with the default b=1 then branch to
    biweight_h264_pixels_W_neon; the b=0 entry for each width emits no
    branch and is placed directly before the biweight_func expansion, so
    it falls through into the shared body.
  * The shared body pushes r4-r6 and lr (16 bytes) and then loads three
    words from sp+16 into r4, r5, r6.  Presumably r0-r3 hold dst, src,
    stride, log2_den and the three stack words are weightd, weights,
    offset, per the C prototypes and the ARM procedure call standard --
    confirm against the target ABI.
  * q8 holds ((r6 + 1) | 1) << r3 in every 16-bit lane and is the initial
    value of every accumulator.  q9 holds ~r3 in every lane and is the
    vshl.s16 shift count; a negative count shifts right, so sums are
    shifted right by r3 + 1 before vqmovun.s16 saturates them to unsigned
    bytes.
  * lr = (r4 >> 31) ^ (r5 >> 30) selects one of four expansions of
    biweight_W (labels 10/20/30/40).  A negative weight is negated with
    rsb and applied with vmlsl.u8 instead of vmlal.u8, so the bytes
    broadcast by vdup.8 are always magnitudes.
  * Rows are read from r0 (accumulated with \macd, weight r4) and r1
    (accumulated with \macs, weight r5) and written through r6, a copy of
    r0 taken before the loop.  Widths 16 and 8 process two rows per
    iteration.  Width 4 processes four rows per iteration and exits via
    label 2 after two rows when "subs ip, ip, #4" goes negative; no
    instruction between that subs and the later "bne 1b" alters the flags.

 libavcodec/arm/dsputil_neon.c |  34 +++++++++
 libavcodec/arm/h264dsp_neon.S | 168 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 202 insertions(+)
 (limited to 'libavcodec/arm')

diff --git a/libavcodec/arm/dsputil_neon.c b/libavcodec/arm/dsputil_neon.c
index e50e16007d..188ae1fb57 100644
--- a/libavcodec/arm/dsputil_neon.c
+++ b/libavcodec/arm/dsputil_neon.c
@@ -92,6 +92,31 @@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
 void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
                                        int beta, int8_t *tc0);
 
+void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride,
+                                        int log2_den, int weightd, int weights,
+                                        int offset);
+void ff_biweight_h264_pixels_16x8_neon(uint8_t *dst, uint8_t *src, int stride,
+                                       int log2_den, int weightd, int weights,
+                                       int offset);
+void ff_biweight_h264_pixels_8x16_neon(uint8_t *dst, uint8_t *src, int stride,
+                                       int log2_den, int weightd, int weights,
+                                       int offset);
+void ff_biweight_h264_pixels_8x8_neon(uint8_t *dst, uint8_t *src, int stride,
+                                      int log2_den, int weightd, int weights,
+                                      int offset);
+void ff_biweight_h264_pixels_8x4_neon(uint8_t *dst, uint8_t *src, int stride,
+                                      int log2_den, int weightd, int weights,
+                                      int offset);
+void ff_biweight_h264_pixels_4x8_neon(uint8_t *dst, uint8_t *src, int stride,
+                                      int log2_den, int weightd, int weights,
+                                      int offset);
+void ff_biweight_h264_pixels_4x4_neon(uint8_t *dst, uint8_t *src, int stride,
+                                      int log2_den, int weightd, int weights,
+                                      int offset);
+void ff_biweight_h264_pixels_4x2_neon(uint8_t *dst, uint8_t *src, int stride,
+                                      int log2_den, int weightd, int weights,
+                                      int offset);
+
 void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
 void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
 void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
@@ -176,6 +201,15 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
     c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
     c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
 
+    c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon;
+    c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon;
+    c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon;
+    c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels_8x8_neon;
+    c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels_8x4_neon;
+    c->biweight_h264_pixels_tab[5] = ff_biweight_h264_pixels_4x8_neon;
+    c->biweight_h264_pixels_tab[6] = ff_biweight_h264_pixels_4x4_neon;
+    c->biweight_h264_pixels_tab[7] = ff_biweight_h264_pixels_4x2_neon;
+
     c->h264_idct_add = ff_h264_idct_add_neon;
     c->h264_idct_dc_add = ff_h264_idct_dc_add_neon;
     c->h264_idct_add16 = ff_h264_idct_add16_neon;
diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/h264dsp_neon.S
index e57cb6f1e6..616a8132e5 100644
--- a/libavcodec/arm/h264dsp_neon.S
+++ b/libavcodec/arm/h264dsp_neon.S
@@ -1368,3 +1368,171 @@ function ff_put_h264_qpel16_mc33_neon, export=1
         sub             r1, r1, #1
         b               put_h264_qpel16_mc11
         .endfunc
+
+@ Biweighted prediction
+
+        .macro  biweight_16 macs, macd
+        vdup.8          d0, r4
+        vdup.8          d1, r5
+        vmov            q2, q8
+        vmov            q3, q8
+1:      subs            ip, ip, #2
+        vld1.8          {d20-d21},[r0,:128], r2
+        \macd           q2, d0, d20
+        pld             [r0]
+        \macd           q3, d0, d21
+        vld1.8          {d22-d23},[r1,:128], r2
+        \macs           q2, d1, d22
+        pld             [r1]
+        \macs           q3, d1, d23
+        vmov            q12, q8
+        vld1.8          {d28-d29},[r0,:128], r2
+        vmov            q13, q8
+        \macd           q12, d0, d28
+        pld             [r0]
+        \macd           q13, d0, d29
+        vld1.8          {d30-d31},[r1,:128], r2
+        \macs           q12, d1, d30
+        pld             [r1]
+        \macs           q13, d1, d31
+        vshl.s16        q2, q2, q9
+        vshl.s16        q3, q3, q9
+        vqmovun.s16     d4, q2
+        vqmovun.s16     d5, q3
+        vshl.s16        q12, q12, q9
+        vshl.s16        q13, q13, q9
+        vqmovun.s16     d24, q12
+        vqmovun.s16     d25, q13
+        vmov            q3, q8
+        vst1.8          {d4- d5}, [r6,:128], r2
+        vmov            q2, q8
+        vst1.8          {d24-d25},[r6,:128], r2
+        bne             1b
+        pop             {r4-r6, pc}
+        .endm
+
+        .macro  biweight_8 macs, macd
+        vdup.8          d0, r4
+        vdup.8          d1, r5
+        vmov            q1, q8
+        vmov            q10, q8
+1:      subs            ip, ip, #2
+        vld1.8          {d4},[r0,:64], r2
+        \macd           q1, d0, d4
+        pld             [r0]
+        vld1.8          {d5},[r1,:64], r2
+        \macs           q1, d1, d5
+        pld             [r1]
+        vld1.8          {d6},[r0,:64], r2
+        \macd           q10, d0, d6
+        pld             [r0]
+        vld1.8          {d7},[r1,:64], r2
+        \macs           q10, d1, d7
+        pld             [r1]
+        vshl.s16        q1, q1, q9
+        vqmovun.s16     d2, q1
+        vshl.s16        q10, q10, q9
+        vqmovun.s16     d4, q10
+        vmov            q10, q8
+        vst1.8          {d2},[r6,:64], r2
+        vmov            q1, q8
+        vst1.8          {d4},[r6,:64], r2
+        bne             1b
+        pop             {r4-r6, pc}
+        .endm
+
+        .macro  biweight_4 macs, macd
+        vdup.8          d0, r4
+        vdup.8          d1, r5
+        vmov            q1, q8
+        vmov            q10, q8
+1:      subs            ip, ip, #4
+        vld1.32         {d4[0]},[r0,:32], r2
+        vld1.32         {d4[1]},[r0,:32], r2
+        \macd           q1, d0, d4
+        pld             [r0]
+        vld1.32         {d5[0]},[r1,:32], r2
+        vld1.32         {d5[1]},[r1,:32], r2
+        \macs           q1, d1, d5
+        pld             [r1]
+        blt             2f
+        vld1.32         {d6[0]},[r0,:32], r2
+        vld1.32         {d6[1]},[r0,:32], r2
+        \macd           q10, d0, d6
+        pld             [r0]
+        vld1.32         {d7[0]},[r1,:32], r2
+        vld1.32         {d7[1]},[r1,:32], r2
+        \macs           q10, d1, d7
+        pld             [r1]
+        vshl.s16        q1, q1, q9
+        vqmovun.s16     d2, q1
+        vshl.s16        q10, q10, q9
+        vqmovun.s16     d4, q10
+        vmov            q10, q8
+        vst1.32         {d2[0]},[r6,:32], r2
+        vst1.32         {d2[1]},[r6,:32], r2
+        vmov            q1, q8
+        vst1.32         {d4[0]},[r6,:32], r2
+        vst1.32         {d4[1]},[r6,:32], r2
+        bne             1b
+        pop             {r4-r6, pc}
+2:      vshl.s16        q1, q1, q9
+        vqmovun.s16     d2, q1
+        vst1.32         {d2[0]},[r6,:32], r2
+        vst1.32         {d2[1]},[r6,:32], r2
+        pop             {r4-r6, pc}
+        .endm
+
+        .macro  biweight_func w
+function biweight_h264_pixels_\w\()_neon
+        push            {r4-r6, lr}
+        add             r4, sp, #16
+        ldm             r4, {r4-r6}
+        lsr             lr, r4, #31
+        add             r6, r6, #1
+        eors            lr, lr, r5, lsr #30
+        orr             r6, r6, #1
+        vdup.16         q9, r3
+        lsl             r6, r6, r3
+        vmvn            q9, q9
+        vdup.16         q8, r6
+        mov             r6, r0
+        beq             10f
+        subs            lr, lr, #1
+        beq             20f
+        subs            lr, lr, #1
+        beq             30f
+        b               40f
+10:     biweight_\w     vmlal.u8, vmlal.u8
+20:     rsb             r4, r4, #0
+        biweight_\w     vmlal.u8, vmlsl.u8
+30:     rsb             r4, r4, #0
+        rsb             r5, r5, #0
+        biweight_\w     vmlsl.u8, vmlsl.u8
+40:     rsb             r5, r5, #0
+        biweight_\w     vmlsl.u8, vmlal.u8
+        .endfunc
+        .endm
+
+        .macro  biweight_entry w, h, b=1
+function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
+        mov             ip, #\h
+.if \b
+        b               biweight_h264_pixels_\w\()_neon
+.endif
+        .endfunc
+        .endm
+
+        biweight_entry  16, 8
+        biweight_entry  16, 16, b=0
+        biweight_func   16
+
+        biweight_entry  8, 16
+        biweight_entry  8, 4
+        biweight_entry  8, 8, b=0
+        biweight_func   8
+
+        biweight_entry  4, 8
+        biweight_entry  4, 2
+        biweight_entry  4, 4, b=0
+        biweight_func   4
-- 
cgit v1.2.3