From 1c67ad9d93031839e6e5032190cd95f6dfda8f59 Mon Sep 17 00:00:00 2001
From: Yu Xiaolei
Date: Sat, 14 Dec 2013 15:31:42 +0800
Subject: swscale: NEON optimized unscaled rgba to nv12 conversion

Signed-off-by: Yu Xiaolei
Signed-off-by: Michael Niedermayer
---
 libswscale/arm/Makefile              |   4 +
 libswscale/arm/rgb2yuv_neon_16.S     |  80 ++++++++++
 libswscale/arm/rgb2yuv_neon_32.S     | 119 ++++++++++++++
 libswscale/arm/rgb2yuv_neon_common.S | 291 +++++++++++++++++++++++++++++++++++
 libswscale/arm/swscale_unscaled.c    |  79 ++++++++++
 5 files changed, 573 insertions(+)
 create mode 100644 libswscale/arm/Makefile
 create mode 100644 libswscale/arm/rgb2yuv_neon_16.S
 create mode 100644 libswscale/arm/rgb2yuv_neon_32.S
 create mode 100644 libswscale/arm/rgb2yuv_neon_common.S
 create mode 100644 libswscale/arm/swscale_unscaled.c

diff --git a/libswscale/arm/Makefile b/libswscale/arm/Makefile
new file mode 100644
index 0000000000..41ff6ff714
--- /dev/null
+++ b/libswscale/arm/Makefile
@@ -0,0 +1,4 @@
+OBJS += arm/swscale_unscaled.o
+
+NEON-OBJS += arm/rgb2yuv_neon_32.o
+NEON-OBJS += arm/rgb2yuv_neon_16.o

diff --git a/libswscale/arm/rgb2yuv_neon_16.S b/libswscale/arm/rgb2yuv_neon_16.S
new file mode 100644
index 0000000000..601bc9a9b7
--- /dev/null
+++ b/libswscale/arm/rgb2yuv_neon_16.S
@@ -0,0 +1,80 @@
+/*
+ * Copyright (C) 2013 Xiaolei Yu
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "rgb2yuv_neon_common.S"
+
+/* downsampled R16G16B16 x8 */
+alias_qw r16x8, q7
+alias_qw g16x8, q8
+alias_qw b16x8, q9
+
+alias n16x16_l, q11
+alias n16x16_h, q12
+
+alias y16x16_l, q13
+alias y16x16_h, q14
+
+alias_qw y8x16, q15
+
+.macro init src
+    vld3.i32 {q13_l, q14_l, q15_l}, [\src]!
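+    /* Together with the lane load below, this transposes the row-major
+     * 3x3 coefficient matrix so that q13/q14/q15 end up holding its
+     * R, G and B columns; vrshrn.i32 #7 then trims the coefficients to
+     * 8 fractional bits, which is what makes this the fast variant. */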
+    vld3.i32 {q13_h[0], q14_h[0], q15_h[0]}, [\src]
+    vrshrn.i32 CO_R, q13, #7
+    vrshrn.i32 CO_G, q14, #7
+    vrshrn.i32 CO_B, q15, #7
+
+    vmov.u8 BIAS_Y, #16
+    vmov.u8 BIAS_U, #128
+.endm
+
+
+.macro compute_y_16x1_step action, s8x16, coeff
+    vmovl.u8 n16x16_l, \s8x16\()_l
+    vmovl.u8 n16x16_h, \s8x16\()_h
+
+    \action y16x16_l, n16x16_l, \coeff
+    \action y16x16_h, n16x16_h, \coeff
+.endm
+
+.macro compute_y_16x1
+    compute_y_16x1_step vmul, r8x16, CO_RY
+    compute_y_16x1_step vmla, g8x16, CO_GY
+    compute_y_16x1_step vmla, b8x16, CO_BY
+
+    vrshrn.i16 y8x16_l, y16x16_l, #8
+    vrshrn.i16 y8x16_h, y16x16_h, #8
+
+    vadd.u8 y8x16, y8x16, BIAS_Y
+.endm
+
+alias c16x8, q15
+alias_qw c8x8x2, q10
+
+
+.macro compute_chroma_8x1 c, C
+    vmul c16x8, r16x8, CO_R\C
+    vmla c16x8, g16x8, CO_G\C
+    vmla c16x8, b16x8, CO_B\C
+
+    vrshrn.i16 \c\()8x8, c16x8, #8
+    vadd.u8 \c\()8x8, \c\()8x8, BIAS_\C
+.endm
+
+
+    loop_420sp rgbx, nv12, init, kernel_420_16x2, 16

diff --git a/libswscale/arm/rgb2yuv_neon_32.S b/libswscale/arm/rgb2yuv_neon_32.S
new file mode 100644
index 0000000000..f51a5f149f
--- /dev/null
+++ b/libswscale/arm/rgb2yuv_neon_32.S
@@ -0,0 +1,119 @@
+/*
+ * Copyright (C) 2013 Xiaolei Yu
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "rgb2yuv_neon_common.S"
+
+/* downsampled R16G16B16 x8 */
+alias_qw r16x8, q7
+alias_qw g16x8, q8
+alias_qw b16x8, q9
+
+alias n16x16_o, q11
+alias n16x16_ol, q11_l
+alias n16x16_oh, q11_h
+
+alias y32x16_el, q12
+alias y32x16_eh, q13
+alias y32x16_ol, q14
+alias y32x16_oh, q15
+
+alias y16x16_e, q12
+alias y16x16_el, q12_l
+alias y16x16_eh, q12_h
+alias y16x16_o, q13
+alias y16x16_ol, q13_l
+alias y16x16_oh, q13_h
+
+
+alias y8x16, y16x16_e
+
+
+.macro init src
+    // load s32x3x3, narrow to s16x3x3
+    vld3.i32 {q13_l, q14_l, q15_l}, [\src]!
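+    /* Same transposing load as in the 16-bit kernel, but vmovn below
+     * keeps all 16 significant bits of each coefficient instead of
+     * pre-shifting them by 7; products are rounded only once, at #15,
+     * which is why this variant backs SWS_ACCURATE_RND. */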
+    vld3.i32 {q13_h[0], q14_h[0], q15_h[0]}, [\src]
+
+    vmovn.i32 CO_R, q13
+    vmovn.i32 CO_G, q14
+    vmovn.i32 CO_B, q15
+
+    vmov.u8 BIAS_Y, #16
+    vmov.u8 BIAS_U, #128
+.endm
+
+
+.macro compute_y_16x1_step action, s8x16, coeff
+    vmov.u8 n16x16_o, #0
+    vtrn.u8 \s8x16, n16x16_o
+
+    \action y32x16_el, \s8x16\()_l, \coeff
+    \action y32x16_eh, \s8x16\()_h, \coeff
+    \action y32x16_ol, n16x16_ol, \coeff
+    \action y32x16_oh, n16x16_oh, \coeff
+.endm
+
+/*
+ * in:      r8x16, g8x16, b8x16
+ * out:     y8x16
+ * clobber: q11-q15, r8x16, g8x16, b8x16
+ */
+.macro compute_y_16x1
+    compute_y_16x1_step vmull, r8x16, CO_RY
+    compute_y_16x1_step vmlal, g8x16, CO_GY
+    compute_y_16x1_step vmlal, b8x16, CO_BY
+
+    vrshrn.i32 y16x16_el, y32x16_el, #15
+    vrshrn.i32 y16x16_eh, y32x16_eh, #15
+    vrshrn.i32 y16x16_ol, y32x16_ol, #15
+    vrshrn.i32 y16x16_oh, y32x16_oh, #15
+
+    vtrn.8 y16x16_e, y16x16_o
+    vadd.u8 y8x16, y8x16, BIAS_Y
+.endm
+
+alias c32x8_l, q14
+alias c32x8_h, q15
+
+alias_qw c16x8, q13
+alias_qw c8x8x2, q10
+
+.macro compute_chroma_8x1_step action, s16x8, coeff
+    \action c32x8_l, \s16x8\()_l, \coeff
+    \action c32x8_h, \s16x8\()_h, \coeff
+.endm
+
+/*
+ * in:      r16x8, g16x8, b16x8
+ * out:     c8x8
+ * clobber: q14-q15
+ */
+.macro compute_chroma_8x1 c, C
+    compute_chroma_8x1_step vmull, r16x8, CO_R\C
+    compute_chroma_8x1_step vmlal, g16x8, CO_G\C
+    compute_chroma_8x1_step vmlal, b16x8, CO_B\C
+
+    vrshrn.i32 c16x8_l, c32x8_l, #15
+    vrshrn.i32 c16x8_h, c32x8_h, #15
+    vmovn.i16 \c\()8x8, c16x8
+    vadd.u8 \c\()8x8, \c\()8x8, BIAS_\C
+.endm
+
+
+    loop_420sp rgbx, nv12, init, kernel_420_16x2, 32

diff --git a/libswscale/arm/rgb2yuv_neon_common.S b/libswscale/arm/rgb2yuv_neon_common.S
new file mode 100644
index 0000000000..32662292bc
--- /dev/null
+++ b/libswscale/arm/rgb2yuv_neon_common.S
@@ -0,0 +1,291 @@
+/*
+ * Copyright (C) 2013 Xiaolei Yu
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+.macro alias name, tgt, set=1
+.if \set != 0
+    \name .req \tgt
+.else
+    .unreq \name
+.endif
+.endm
+
+.altmacro
+
+.macro alias_dw_all qw, dw_l, dw_h
+    alias q\qw\()_l, d\dw_l
+    alias q\qw\()_h, d\dw_h
+    .if \qw < 15
+        alias_dw_all %(\qw + 1), %(\dw_l + 2), %(\dw_h + 2)
+    .endif
+.endm
+
+alias_dw_all 0, 0, 1
+
+.noaltmacro
+
+.macro alias_qw name, qw, set=1
+    alias \name\(), \qw, \set
+    alias \name\()_l, \qw\()_l, \set
+    alias \name\()_h, \qw\()_h, \set
+.endm
+
+.macro prologue
+    push {r4-r12, lr}
+    vpush {q4-q7}
+.endm
+
+.macro epilogue
+    vpop {q4-q7}
+    pop {r4-r12, pc}
+.endm
+
+/* arguments 4+ sit above the 10 pushed GPRs and 4 saved q registers */
+.macro load_arg reg, ix
+    ldr \reg, [sp, #((10 * 4 + 4 * 16) + (\ix - 4) * 4)]
+.endm
+
+
+/* ()_to_()_neon(const uint8_t *src, uint8_t *y, uint8_t *chroma,
+ *               int width, int height,
+ *               int y_stride, int c_stride, int src_stride,
+ *               int32_t coeff_table[9]);
+ */
+.macro alias_loop_420sp set=1
+    alias src, r0, \set
+    alias src0, src, \set
+    alias y, r1, \set
+    alias y0, y, \set
+    alias chroma, r2, \set
+    alias width, r3, \set
+    alias header, width, \set
+
+    alias height, r4, \set
+    alias y_stride, r5, \set
+    alias c_stride, r6, \set
+    alias c_padding, c_stride, \set
+    alias src_stride, r7, \set
+
+    alias y0_end, r8, \set
+
+    alias src_padding, r9, \set
+    alias y_padding, r10, \set
+
+    alias src1, r11, \set
+    alias y1, r12, \set
+
+    alias coeff_table, r12, \set
+.endm
+
+
+.macro loop_420sp s_fmt, d_fmt, init, kernel, precision
+
+function \s_fmt\()_to_\d_fmt\()_neon_\precision, export=1
+    prologue
+
+    alias_loop_420sp
+
+    load_arg height, 4
+    load_arg y_stride, 5
+    load_arg c_stride, 6
+    load_arg src_stride, 7
+    load_arg coeff_table, 8
+
+    \init coeff_table
+
+    sub y_padding, y_stride, width
+    sub c_padding, c_stride, width
+    sub src_padding, src_stride, width, LSL #2
+
+    add y0_end, y0, width
+    and header, width, #15
+
+    add y1, y0, y_stride
+    add src1, src0, src_stride
+
+0:
+    cmp header, #0
+    beq 1f
+
+    \kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma, header
+
+1:
+    \kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma
+
+    cmp y0, y0_end
+    blt 1b
+2:
+    add y0, y1, y_padding
+    add y0_end, y1, y_stride
+    add chroma, chroma, c_padding
+    add src0, src1, src_padding
+
+    add y1, y0, y_stride
+    add src1, src0, src_stride
+
+    subs height, height, #2
+
+    bgt 0b
+
+    epilogue
+
+    alias_loop_420sp 0
+
+endfunc
+.endm
+
+.macro downsample
+    vpaddl.u8 r16x8, r8x16
+    vpaddl.u8 g16x8, g8x16
+    vpaddl.u8 b16x8, b8x16
+.endm
+
+
+/* accumulate and right shift by 2 */
+.macro downsample_ars2
+    vpadal.u8 r16x8, r8x16
+    vpadal.u8 g16x8, g8x16
+    vpadal.u8 b16x8, b8x16
+
+    vrshr.u16 r16x8, r16x8, #2
+    vrshr.u16 g16x8, g16x8, #2
+    vrshr.u16 b16x8, b16x8, #2
+.endm
+
+.macro store_y8_16x1 dst, count
+.if \count == 0
+    vstmia \dst!, {y8x16}
+.else
+    vstmia \dst, {y8x16}
+    add \dst, \dst, \count
+.endif
+.endm
+
+.macro store_chroma_nv12_8x1 dst, count
+.if \count == 0
+    vst2.i8 {u8x8, v8x8}, [\dst]!
+.else
+    vst2.i8 {u8x8, v8x8}, [\dst], \count
+.endif
+.endm
+
+.macro store_chroma_nv21_8x1 dst, count
+.if \count == 0
+    vst2.i8 {v8x8, u8x8}, [\dst]!
+.else
+    vst2.i8 {v8x8, u8x8}, [\dst], \count
+.endif
+.endm
+
+.macro load_8888_16x1 a, b, c, d, src, count
+.if \count == 0
+    vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]!
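+    /* vld4.8 deinterleaves eight 4-byte pixels per load, splitting the
+     * channels directly into the \a/\b/\c/\d d registers; the second
+     * load below completes a 16-pixel tile. */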
+    vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]!
+.else
+    vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]!
+    vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]
+    sub \src, \src, #32
+    add \src, \src, \count, LSL #2
+.endif
+.endm
+
+.macro load_rgbx_16x1 src, count
+    load_8888_16x1 r, g, b, x, \src, \count
+.endm
+
+.macro load_bgrx_16x1 src, count
+    load_8888_16x1 b, g, r, x, \src, \count
+.endm
+
+.macro alias_src_rgbx set
+    alias_src_8888 r, g, b, x, \set
+.endm
+
+.macro alias_src_bgrx set
+    alias_src_8888 b, g, r, x, \set
+.endm
+
+.macro alias_dst_nv12 set
+    alias u8x8, c8x8x2_l, \set
+    alias v8x8, c8x8x2_h, \set
+.endm
+
+.macro alias_dst_nv21 set
+    alias v8x8, c8x8x2_l, \set
+    alias u8x8, c8x8x2_h, \set
+.endm
+
+
+// common aliases
+
+alias CO_R, d0
+CO_RY .dn d0.s16[0]
+CO_RU .dn d0.s16[1]
+CO_RV .dn d0.s16[2]
+
+alias CO_G, d1
+CO_GY .dn d1.s16[0]
+CO_GU .dn d1.s16[1]
+CO_GV .dn d1.s16[2]
+
+alias CO_B, d2
+CO_BY .dn d2.s16[0]
+CO_BU .dn d2.s16[1]
+CO_BV .dn d2.s16[2]
+
+alias BIAS_U, d3
+alias BIAS_V, BIAS_U
+
+alias BIAS_Y, q2
+
+
+/* q3-q6 R8G8B8X8 x16 */
+
+.macro alias_src_8888 a, b, c, d, set
+    alias_qw \a\()8x16, q3, \set
+    alias_qw \b\()8x16, q4, \set
+    alias_qw \c\()8x16, q5, \set
+    alias_qw \d\()8x16, q6, \set
+.endm
+
+.macro kernel_420_16x2 rgb_fmt, yuv_fmt, rgb0, rgb1, y0, y1, chroma, count=0
+    alias_src_\rgb_fmt
+    alias_dst_\yuv_fmt
+
+    load_\rgb_fmt\()_16x1 \rgb0, \count
+
+    downsample
+    compute_y_16x1
+    store_y8_16x1 \y0, \count
+
+
+    load_\rgb_fmt\()_16x1 \rgb1, \count
+    downsample_ars2
+    compute_y_16x1
+    store_y8_16x1 \y1, \count
+
+    compute_chroma_8x1 u, U
+    compute_chroma_8x1 v, V
+
+    store_chroma_\yuv_fmt\()_8x1 \chroma, \count
+
+    alias_dst_\yuv_fmt 0
+    alias_src_\rgb_fmt 0
+.endm

diff --git a/libswscale/arm/swscale_unscaled.c b/libswscale/arm/swscale_unscaled.c
new file mode 100644
index 0000000000..04be7622bc
--- /dev/null
+++ b/libswscale/arm/swscale_unscaled.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) 2013 Xiaolei Yu
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+#include "libavutil/arm/cpu.h"
+
+extern void rgbx_to_nv12_neon_32(const uint8_t *src, uint8_t *y, uint8_t *chroma,
+                                 int width, int height,
+                                 int y_stride, int c_stride, int src_stride,
+                                 int32_t coeff_tbl[9]);
+
+extern void rgbx_to_nv12_neon_16(const uint8_t *src, uint8_t *y, uint8_t *chroma,
+                                 int width, int height,
+                                 int y_stride, int c_stride, int src_stride,
+                                 int32_t coeff_tbl[9]);
+
+static int rgbx_to_nv12_neon_32_wrapper(SwsContext *context, const uint8_t *src[],
+                                        int srcStride[], int srcSliceY, int srcSliceH,
+                                        uint8_t *dst[], int dstStride[]) {
+
+    rgbx_to_nv12_neon_32(src[0] + srcSliceY * srcStride[0],
+                         dst[0] + srcSliceY * dstStride[0],
+                         dst[1] + (srcSliceY / 2) * dstStride[1],
+                         context->srcW, srcSliceH,
+                         dstStride[0], dstStride[1], srcStride[0],
+                         context->input_rgb2yuv_table);
+
+    return 0;
+}
+
+static int rgbx_to_nv12_neon_16_wrapper(SwsContext *context, const uint8_t *src[],
+                                        int srcStride[], int srcSliceY, int srcSliceH,
+                                        uint8_t *dst[], int dstStride[]) {
+
+    rgbx_to_nv12_neon_16(src[0] + srcSliceY * srcStride[0],
+                         dst[0] + srcSliceY * dstStride[0],
+                         dst[1] + (srcSliceY / 2) * dstStride[1],
+                         context->srcW, srcSliceH,
+                         dstStride[0], dstStride[1], srcStride[0],
+                         context->input_rgb2yuv_table);
+
+    return 0;
+}
+
+static void get_unscaled_swscale_neon(SwsContext *c) {
+    int accurate_rnd = c->flags & SWS_ACCURATE_RND;
+    if (c->srcFormat == AV_PIX_FMT_RGBA
+            && c->dstFormat == AV_PIX_FMT_NV12
+            && (c->srcW >= 16)) {
+        c->swscale = accurate_rnd ? rgbx_to_nv12_neon_32_wrapper
+                                  : rgbx_to_nv12_neon_16_wrapper;
+    }
+}
+
+void ff_get_unscaled_swscale_arm(SwsContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+    if (have_neon(cpu_flags))
+        get_unscaled_swscale_neon(c);
+}
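
For reference, a caller exercising the new path might look like the following
minimal sketch (illustrative only: rgba, y_plane, uv_plane, width and height
are placeholder names, the buffers must already be allocated, and error
handling is omitted). Note that srcW must be at least 16 for the NEON path to
be taken, and that SWS_ACCURATE_RND selects the 32-bit kernel, per
get_unscaled_swscale_neon() above:

    #include <libswscale/swscale.h>

    /* Convert one width x height RGBA frame to NV12 without scaling. */
    struct SwsContext *sws = sws_getContext(width, height, AV_PIX_FMT_RGBA,
                                            width, height, AV_PIX_FMT_NV12,
                                            SWS_BILINEAR | SWS_ACCURATE_RND,
                                            NULL, NULL, NULL);

    const uint8_t *src[4] = { rgba, NULL, NULL, NULL };        /* packed RGBA */
    int src_stride[4]     = { width * 4, 0, 0, 0 };
    uint8_t *dst[4]       = { y_plane, uv_plane, NULL, NULL }; /* NV12 planes */
    int dst_stride[4]     = { width, width, 0, 0 };

    sws_scale(sws, src, src_stride, 0, height, dst, dst_stride);
    sws_freeContext(sws);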