diff options
Diffstat (limited to 'libswscale/arm')
-rw-r--r-- | libswscale/arm/Makefile | 8 | ||||
-rw-r--r-- | libswscale/arm/hscale.S | 70 | ||||
-rw-r--r-- | libswscale/arm/output.S | 78 | ||||
-rw-r--r-- | libswscale/arm/rgb2yuv_neon_16.S | 80 | ||||
-rw-r--r-- | libswscale/arm/rgb2yuv_neon_32.S | 119 | ||||
-rw-r--r-- | libswscale/arm/rgb2yuv_neon_common.S | 291 | ||||
-rw-r--r-- | libswscale/arm/swscale.c | 44 | ||||
-rw-r--r-- | libswscale/arm/swscale_unscaled.c | 180 | ||||
-rw-r--r-- | libswscale/arm/yuv2rgb_neon.S | 280 |
9 files changed, 1150 insertions, 0 deletions
diff --git a/libswscale/arm/Makefile b/libswscale/arm/Makefile new file mode 100644 index 0000000000..792da6b715 --- /dev/null +++ b/libswscale/arm/Makefile @@ -0,0 +1,8 @@ +OBJS += arm/swscale.o \ + arm/swscale_unscaled.o \ + +NEON-OBJS += arm/rgb2yuv_neon_32.o +NEON-OBJS += arm/rgb2yuv_neon_16.o +NEON-OBJS += arm/hscale.o \ + arm/output.o \ + arm/yuv2rgb_neon.o \ diff --git a/libswscale/arm/hscale.S b/libswscale/arm/hscale.S new file mode 100644 index 0000000000..dd4d453957 --- /dev/null +++ b/libswscale/arm/hscale.S @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2016 Clément Bœsch <clement stupeflix.com> + * Copyright (c) 2016 Matthieu Bouron <matthieu.bouron stupeflix.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +function ff_hscale_8_to_15_neon, export=1 + push {r4-r12, lr} + vpush {q4-q7} + ldr r4, [sp, #104] @ filter + ldr r5, [sp, #108] @ filterPos + ldr r6, [sp, #112] @ filterSize + add r10, r4, r6, lsl #1 @ filter2 = filter + filterSize * 2 +1: ldr r8, [r5], #4 @ filterPos[0] + ldr r9, [r5], #4 @ filterPos[1] + vmov.s32 q4, #0 @ val accumulator + vmov.s32 q5, #0 @ val accumulator + mov r7, r6 @ tmpfilterSize = filterSize + mov r0, r3 @ srcp +2: add r11, r0, r8 @ srcp + filterPos[0] + add r12, r0, r9 @ srcp + filterPos[1] + vld1.8 d0, [r11] @ srcp[filterPos[0] + {0..7}] + vld1.8 d2, [r12] @ srcp[filterPos[1] + {0..7}] + vld1.16 {q2}, [r4]! @ load 8x16-bit filter values + vld1.16 {q3}, [r10]! @ load 8x16-bit filter values + vmovl.u8 q0, d0 @ unpack src values to 16-bit + vmovl.u8 q1, d2 @ unpack src values to 16-bit + vmull.s16 q8, d0, d4 @ srcp[filterPos[0] + {0..7}] * filter[{0..7}] (part 1) + vmull.s16 q9, d1, d5 @ srcp[filterPos[0] + {0..7}] * filter[{0..7}] (part 2) + vmull.s16 q10, d2, d6 @ srcp[filterPos[1] + {0..7}] * filter[{0..7}] (part 1) + vmull.s16 q11, d3, d7 @ srcp[filterPos[1] + {0..7}] * filter[{0..7}] (part 2) + vpadd.s32 d16, d16, d17 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 1) + vpadd.s32 d17, d18, d19 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 2) + vpadd.s32 d20, d20, d21 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 1) + vpadd.s32 d21, d22, d23 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 2) + vadd.s32 q4, q8 @ update val accumulator + vadd.s32 q5, q10 @ update val accumulator + add r0, #8 @ srcp += 8 + subs r7, #8 @ tmpfilterSize -= 8 + bgt 2b @ loop 
until tmpfilterSize is consumed + mov r4, r10 @ filter = filter2 + add r10, r10, r6, lsl #1 @ filter2 += filterSize * 2 + vpadd.s32 d8, d8, d9 @ horizontal pair adding of the 8x32-bit sums into 4x32-bit (part 1) + vpadd.s32 d9, d10, d11 @ horizontal pair adding of the 8x32-bit sums into 4x32-bit (part 2) + vpadd.s32 d8, d8, d9 @ horizontal pair adding of the 4x32-bit sums into 2x32-bit + vqshrn.s32 d8, q4, #7 @ shift and clip the 2x16-bit final values + vst1.32 {d8[0]},[r1]! @ write destination + subs r2, #2 @ dstW -= 2 + bgt 1b @ loop until end of line + vpop {q4-q7} + pop {r4-r12, lr} + mov pc, lr +endfunc diff --git a/libswscale/arm/output.S b/libswscale/arm/output.S new file mode 100644 index 0000000000..70846dee1f --- /dev/null +++ b/libswscale/arm/output.S @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2016 Clément Bœsch <clement stupeflix.com> + * Copyright (c) 2016 Matthieu Bouron <matthieu.bouron stupeflix.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +function ff_yuv2planeX_8_neon, export=1 + push {r4-r12, lr} + vpush {q4-q7} + ldr r4, [sp, #104] @ dstW + ldr r5, [sp, #108] @ dither + ldr r6, [sp, #112] @ offset + vld1.8 {d0}, [r5] @ load 8x8-bit dither values + cmp r6, #0 @ check offsetting which can be 0 or 3 only + beq 1f + vext.u8 d0, d0, d0, #3 @ honor offsetting which can be 3 only +1: vmovl.u8 q0, d0 @ extend dither to 16-bit + vshll.u16 q1, d0, #12 @ extend dither to 32-bit with left shift by 12 (part 1) + vshll.u16 q2, d1, #12 @ extend dither to 32-bit with left shift by 12 (part 2) + mov r7, #0 @ i = 0 +2: vmov.u8 q3, q1 @ initialize accumulator with dithering values (part 1) + vmov.u8 q4, q2 @ initialize accumulator with dithering values (part 2) + mov r8, r1 @ tmpFilterSize = filterSize + mov r9, r2 @ srcp + mov r10, r0 @ filterp +3: ldr r11, [r9], #4 @ get pointer @ src[j] + ldr r12, [r9], #4 @ get pointer @ src[j+1] + add r11, r11, r7, lsl #1 @ &src[j][i] + add r12, r12, r7, lsl #1 @ &src[j+1][i] + vld1.16 {q5}, [r11] @ read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H + vld1.16 {q6}, [r12] @ read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P + ldr r11, [r10], #4 @ read 2x16-bit coeffs (X, Y) at (filter[j], filter[j+1]) + vmov.16 q7, q5 @ copy 8x16-bit @ src[j ][i + {0..7}] for following inplace zip instruction + vmov.16 q8, q6 @ copy 8x16-bit @ src[j+1][i + {0..7}] for following inplace zip instruction + vzip.16 q7, q8 @ A,I,B,J,C,K,D,L,E,M,F,N,G,O,H,P + vdup.32 q15, r11 @ X,Y,X,Y,X,Y,X,Y + vmull.s16 q9, d14, d30 @ A*X,I*Y,B*X,J*Y + vmull.s16 q10, d15, d31 @ C*X,K*Y,D*X,L*Y + vmull.s16 q11, d16, d30 @ E*X,M*Y,F*X,N*Y + vmull.s16 q12, d17, d31 @ G*X,O*Y,H*X,P*Y + vpadd.s32 d10, d18, d19 @ A*X+I*Y,B*X+J*Y + vpadd.s32 d11, d20, d21 @ 
C*X+K*Y,D*X+L*Y + vpadd.s32 d12, d22, d23 @ E*X+M*Y,F*X+N*Y + vpadd.s32 d13, d24, d25 @ G*X+O*Y,H*X+P*Y + vadd.s32 q3, q5 @ update val accumulator (part 1) + vadd.s32 q4, q6 @ update val accumulator (part 2) + subs r8, #2 @ tmpFilterSize -= 2 + bgt 3b @ loop until filterSize is consumed + vshr.s32 q3, q3, #19 @ val>>19 (part 1) + vshr.s32 q4, q4, #19 @ val>>19 (part 2) + vqmovun.s32 d6, q3 @ clip16(val>>19) (part 1) + vqmovun.s32 d7, q4 @ clip16(val>>19) (part 2) + vqmovn.u16 d6, q3 @ merge part 1 and part 2 + vst1.8 {d6}, [r3]! @ write destination + add r7, #8 @ i += 8 + subs r4, r4, #8 @ dstW -= 8 + bgt 2b @ loop until width is consumed + vpop {q4-q7} + pop {r4-r12, lr} + mov pc, lr +endfunc diff --git a/libswscale/arm/rgb2yuv_neon_16.S b/libswscale/arm/rgb2yuv_neon_16.S new file mode 100644 index 0000000000..601bc9a9b7 --- /dev/null +++ b/libswscale/arm/rgb2yuv_neon_16.S @@ -0,0 +1,80 @@ +/* + * Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "rgb2yuv_neon_common.S" + +/* downsampled R16G16B16 x8 */ +alias_qw r16x8, q7 +alias_qw g16x8, q8 +alias_qw b16x8, q9 + +alias n16x16_l, q11 +alias n16x16_h, q12 + +alias y16x16_l, q13 +alias y16x16_h, q14 + +alias_qw y8x16, q15 + +.macro init src + vld3.i32 {q13_l, q14_l, q15_l}, [\src]! + vld3.i32 {q13_h[0], q14_h[0], q15_h[0]}, [\src] + vrshrn.i32 CO_R, q13, #7 + vrshrn.i32 CO_G, q14, #7 + vrshrn.i32 CO_B, q15, #7 + + vmov.u8 BIAS_Y, #16 + vmov.u8 BIAS_U, #128 +.endm + + +.macro compute_y_16x1_step action, s8x16, coeff + vmovl.u8 n16x16_l, \s8x16\()_l + vmovl.u8 n16x16_h, \s8x16\()_h + + \action y16x16_l, n16x16_l, \coeff + \action y16x16_h, n16x16_h, \coeff +.endm + +.macro compute_y_16x1 + compute_y_16x1_step vmul, r8x16, CO_RY + compute_y_16x1_step vmla, g8x16, CO_GY + compute_y_16x1_step vmla, b8x16, CO_BY + + vrshrn.i16 y8x16_l, y16x16_l, #8 + vrshrn.i16 y8x16_h, y16x16_h, #8 + + vadd.u8 y8x16, y8x16, BIAS_Y +.endm + +alias c16x8, q15 +alias_qw c8x8x2, q10 + + +.macro compute_chroma_8x1 c, C + vmul c16x8, r16x8, CO_R\C + vmla c16x8, g16x8, CO_G\C + vmla c16x8, b16x8, CO_B\C + + vrshrn.i16 \c\()8x8, c16x8, #8 + vadd.u8 \c\()8x8, \c\()8x8, BIAS_\C +.endm + + loop_420sp rgbx, nv12, init, kernel_420_16x2, 16 diff --git a/libswscale/arm/rgb2yuv_neon_32.S b/libswscale/arm/rgb2yuv_neon_32.S new file mode 100644 index 0000000000..f51a5f149f --- /dev/null +++ b/libswscale/arm/rgb2yuv_neon_32.S @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com> + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "rgb2yuv_neon_common.S" + +/* downsampled R16G16B16 x8 */ +alias_qw r16x8, q7 +alias_qw g16x8, q8 +alias_qw b16x8, q9 + +alias n16x16_o, q11 +alias n16x16_ol, q11_l +alias n16x16_oh, q11_h + +alias y32x16_el, q12 +alias y32x16_eh, q13 +alias y32x16_ol, q14 +alias y32x16_oh, q15 + +alias y16x16_e, q12 +alias y16x16_el, q12_l +alias y16x16_eh, q12_h +alias y16x16_o, q13 +alias y16x16_ol, q13_l +alias y16x16_oh, q13_h + + +alias y8x16, y16x16_e + + +.macro init src + // load s32x3x3, narrow to s16x3x3 + vld3.i32 {q13_l, q14_l, q15_l}, [\src]! 
+ vld3.i32 {q13_h[0], q14_h[0], q15_h[0]}, [\src] + + vmovn.i32 CO_R, q13 + vmovn.i32 CO_G, q14 + vmovn.i32 CO_B, q15 + + vmov.u8 BIAS_Y, #16 + vmov.u8 BIAS_U, #128 +.endm + + +.macro compute_y_16x1_step action, s8x16, coeff + vmov.u8 n16x16_o, #0 + vtrn.u8 \s8x16, n16x16_o + + \action y32x16_el, \s8x16\()_l, \coeff + \action y32x16_eh, \s8x16\()_h, \coeff + \action y32x16_ol, n16x16_ol, \coeff + \action y32x16_oh, n16x16_oh, \coeff +.endm + +/* + * in: r8x16, g8x16, b8x16 + * out: y8x16 + * clobber: q11-q15, r8x16, g8x16, b8x16 + */ +.macro compute_y_16x1 + compute_y_16x1_step vmull, r8x16, CO_RY + compute_y_16x1_step vmlal, g8x16, CO_GY + compute_y_16x1_step vmlal, b8x16, CO_BY + + vrshrn.i32 y16x16_el, y32x16_el, #15 + vrshrn.i32 y16x16_eh, y32x16_eh, #15 + vrshrn.i32 y16x16_ol, y32x16_ol, #15 + vrshrn.i32 y16x16_oh, y32x16_oh, #15 + + vtrn.8 y16x16_e, y16x16_o + vadd.u8 y8x16, y8x16, BIAS_Y +.endm + +alias c32x8_l, q14 +alias c32x8_h, q15 + +alias_qw c16x8, q13 +alias_qw c8x8x2, q10 + +.macro compute_chroma_8x1_step action, s16x8, coeff + \action c32x8_l, \s16x8\()_l, \coeff + \action c32x8_h, \s16x8\()_h, \coeff +.endm + +/* + * in: r16x8, g16x8, b16x8 + * out: c8x8 + * clobber: q14-q15 + */ +.macro compute_chroma_8x1 c, C + compute_chroma_8x1_step vmull, r16x8, CO_R\C + compute_chroma_8x1_step vmlal, g16x8, CO_G\C + compute_chroma_8x1_step vmlal, b16x8, CO_B\C + + vrshrn.i32 c16x8_l, c32x8_l, #15 + vrshrn.i32 c16x8_h, c32x8_h, #15 + vmovn.i16 \c\()8x8, c16x8 + vadd.u8 \c\()8x8, \c\()8x8, BIAS_\C +.endm + + + loop_420sp rgbx, nv12, init, kernel_420_16x2, 32 diff --git a/libswscale/arm/rgb2yuv_neon_common.S b/libswscale/arm/rgb2yuv_neon_common.S new file mode 100644 index 0000000000..30bcecd5bb --- /dev/null +++ b/libswscale/arm/rgb2yuv_neon_common.S @@ -0,0 +1,291 @@ +/* + * Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com> + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +.macro alias name, tgt, set=1 +.if \set != 0 + \name .req \tgt +.else + .unreq \name +.endif +.endm + +.altmacro + +.macro alias_dw_all qw, dw_l, dw_h + alias q\qw\()_l, d\dw_l + alias q\qw\()_h, d\dw_h + .if \qw < 15 + alias_dw_all %(\qw + 1), %(\dw_l + 2), %(\dw_h + 2) + .endif +.endm + +alias_dw_all 0, 0, 1 + +.noaltmacro + +.macro alias_qw name, qw, set=1 + alias \name\(), \qw, \set + alias \name\()_l, \qw\()_l, \set + alias \name\()_h, \qw\()_h, \set +.endm + +.macro prologue + push {r4-r12, lr} + vpush {q4-q7} +.endm + +.macro epilogue + vpop {q4-q7} + pop {r4-r12, pc} +.endm + +.macro load_arg reg, ix + ldr \reg, [sp, #((10 * 4 + 4 * 16) + (\ix - 4) * 4)] +.endm + + +/* ()_to_()_neon(const uint8_t *src, uint8_t *y, uint8_t *chroma + * int width, int height, + * int y_stride, int c_stride, int src_stride, + * int32_t coeff_table[9]); + */ +.macro alias_loop_420sp set=1 + alias src, r0, \set + alias src0, src, \set + alias y, r1, \set + alias y0, y, \set + alias chroma, r2, \set + alias width, r3, \set + alias header, width, \set + + alias height, r4, \set + alias y_stride, r5, \set + alias c_stride, r6, \set + alias c_padding, c_stride, \set + alias src_stride, r7, \set + + alias y0_end, r8, 
\set + + alias src_padding,r9, \set + alias y_padding, r10, \set + + alias src1, r11, \set + alias y1, r12, \set + + alias coeff_table,r12, \set +.endm + + +.macro loop_420sp s_fmt, d_fmt, init, kernel, precision + +function \s_fmt\()_to_\d_fmt\()_neon_\precision, export=1 + prologue + + alias_loop_420sp + + load_arg height, 4 + load_arg y_stride, 5 + load_arg c_stride, 6 + load_arg src_stride, 7 + load_arg coeff_table, 8 + + \init coeff_table + + sub y_padding, y_stride, width + sub c_padding, c_stride, width + sub src_padding, src_stride, width, LSL #2 + + add y0_end, y0, width + and header, width, #15 + + add y1, y0, y_stride + add src1, src0, src_stride + +0: + cmp header, #0 + beq 1f + + \kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma, header + +1: + \kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma + + cmp y0, y0_end + blo 1b /* unsigned compare: y0 and y0_end are addresses */ +2: + add y0, y1, y_padding + add y0_end, y1, y_stride + add chroma, chroma, c_padding + add src0, src1, src_padding + + add y1, y0, y_stride + add src1, src0, src_stride + + subs height, height, #2 + + bgt 0b + + epilogue + + alias_loop_420sp 0 + +endfunc +.endm + +.macro downsample + vpaddl.u8 r16x8, r8x16 + vpaddl.u8 g16x8, g8x16 + vpaddl.u8 b16x8, b8x16 +.endm + + +/* accumulate and right shift by 2 */ +.macro downsample_ars2 + vpadal.u8 r16x8, r8x16 + vpadal.u8 g16x8, g8x16 + vpadal.u8 b16x8, b8x16 + + vrshr.u16 r16x8, r16x8, #2 + vrshr.u16 g16x8, g16x8, #2 + vrshr.u16 b16x8, b16x8, #2 +.endm + +.macro store_y8_16x1 dst, count +.ifc "\count","" + vstmia \dst!, {y8x16} +.else + vstmia \dst, {y8x16} + add \dst, \dst, \count +.endif +.endm + +.macro store_chroma_nv12_8x1 dst, count +.ifc "\count","" + vst2.i8 {u8x8, v8x8}, [\dst]! +.else + vst2.i8 {u8x8, v8x8}, [\dst], \count +.endif +.endm + +.macro store_chroma_nv21_8x1 dst, count +.ifc "\count","" + vst2.i8 {v8x8, u8x8}, [\dst]! 
+.else + vst2.i8 {v8x8, u8x8}, [\dst], \count +.endif +.endm + +.macro load_8888_16x1 a, b, c, d, src, count +.ifc "\count","" + vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]! + vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]! +.else + vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]! + vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src] + sub \src, \src, #32 + add \src, \src, \count, LSL #2 +.endif +.endm + +.macro load_rgbx_16x1 src, count + load_8888_16x1 r, g, b, x, \src, \count +.endm + +.macro load_bgrx_16x1 src, count + load_8888_16x1 b, g, r, x, \src, \count +.endm + +.macro alias_src_rgbx set=1 + alias_src_8888 r, g, b, x, \set +.endm + +.macro alias_src_bgrx set=1 + alias_src_8888 b, g, r, x, \set +.endm + +.macro alias_dst_nv12 set=1 + alias u8x8, c8x8x2_l, \set + alias v8x8, c8x8x2_h, \set +.endm + +.macro alias_dst_nv21 set=1 + alias v8x8, c8x8x2_l, \set + alias u8x8, c8x8x2_h, \set +.endm + + +// common aliases + +alias CO_R d0 +CO_RY .dn d0.s16[0] +CO_RU .dn d0.s16[1] +CO_RV .dn d0.s16[2] + +alias CO_G d1 +CO_GY .dn d1.s16[0] +CO_GU .dn d1.s16[1] +CO_GV .dn d1.s16[2] + +alias CO_B d2 +CO_BY .dn d2.s16[0] +CO_BU .dn d2.s16[1] +CO_BV .dn d2.s16[2] + +alias BIAS_U, d3 +alias BIAS_V, BIAS_U + +alias BIAS_Y, q2 + + +/* q3-q6 R8G8B8X8 x16 */ + +.macro alias_src_8888 a, b, c, d, set + alias_qw \a\()8x16, q3, \set + alias_qw \b\()8x16, q4, \set + alias_qw \c\()8x16, q5, \set + alias_qw \d\()8x16, q6, \set +.endm + +.macro kernel_420_16x2 rgb_fmt, yuv_fmt, rgb0, rgb1, y0, y1, chroma, count + alias_src_\rgb_fmt + alias_dst_\yuv_fmt + + load_\rgb_fmt\()_16x1 \rgb0, \count + + downsample + compute_y_16x1 + store_y8_16x1 \y0, \count + + + load_\rgb_fmt\()_16x1 \rgb1, \count + downsample_ars2 + compute_y_16x1 + store_y8_16x1 \y1, \count + + compute_chroma_8x1 u, U + compute_chroma_8x1 v, V + + store_chroma_\yuv_fmt\()_8x1 \chroma, \count + + alias_dst_\yuv_fmt 0 + alias_src_\rgb_fmt 0 
+.endm diff --git a/libswscale/arm/swscale.c b/libswscale/arm/swscale.c new file mode 100644 index 0000000000..1ec360fe24 --- /dev/null +++ b/libswscale/arm/swscale.c @@ -0,0 +1,50 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libswscale/swscale.h" +#include "libswscale/swscale_internal.h" +#include "libavutil/arm/cpu.h" + +void ff_hscale_8_to_15_neon(SwsContext *c, int16_t *dst, int dstW, + const uint8_t *src, const int16_t *filter, + const int32_t *filterPos, int filterSize); + +void ff_yuv2planeX_8_neon(const int16_t *filter, int filterSize, + const int16_t **src, uint8_t *dest, int dstW, + const uint8_t *dither, int offset); + +/* Install the NEON scaler routines when the CPU reports NEON support. */ +av_cold void ff_sws_init_swscale_arm(SwsContext *c) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { + /* ff_hscale_8_to_15_neon reads the filter in chunks of 8 taps and + * steps its filter pointers accordingly, so only install it when + * both horizontal filter sizes are multiples of 8. */ + if (c->srcBpc == 8 && c->dstBpc <= 14 && + (c->hLumFilterSize % 8) == 0 && + (c->hChrFilterSize % 8) == 0) { + c->hyScale = c->hcScale = ff_hscale_8_to_15_neon; + } + if (c->dstBpc == 8) { + c->yuv2planeX = ff_yuv2planeX_8_neon; + } + } +} diff --git a/libswscale/arm/swscale_unscaled.c b/libswscale/arm/swscale_unscaled.c new file mode 100644 index 0000000000..e1597ab42d --- /dev/null +++ b/libswscale/arm/swscale_unscaled.c @@ -0,0 +1,180 @@ +/* + * Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com> + * + * This file is 
part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libswscale/swscale.h" +#include "libswscale/swscale_internal.h" +#include "libavutil/arm/cpu.h" + +extern void rgbx_to_nv12_neon_32(const uint8_t *src, uint8_t *y, uint8_t *chroma, + int width, int height, + int y_stride, int c_stride, int src_stride, + int32_t coeff_tbl[9]); + +extern void rgbx_to_nv12_neon_16(const uint8_t *src, uint8_t *y, uint8_t *chroma, + int width, int height, + int y_stride, int c_stride, int src_stride, + int32_t coeff_tbl[9]); + +static int rgbx_to_nv12_neon_32_wrapper(SwsContext *context, const uint8_t *src[], + int srcStride[], int srcSliceY, int srcSliceH, + uint8_t *dst[], int dstStride[]) { + + rgbx_to_nv12_neon_32(src[0] + srcSliceY * srcStride[0], + dst[0] + srcSliceY * dstStride[0], + dst[1] + (srcSliceY / 2) * dstStride[1], + context->srcW, srcSliceH, + dstStride[0], dstStride[1], srcStride[0], + context->input_rgb2yuv_table); + + return 0; +} + +static int rgbx_to_nv12_neon_16_wrapper(SwsContext *context, const uint8_t *src[], + int srcStride[], int srcSliceY, int srcSliceH, + uint8_t *dst[], int dstStride[]) { + + rgbx_to_nv12_neon_16(src[0] + srcSliceY * srcStride[0], + dst[0] + srcSliceY * dstStride[0], + dst[1] + (srcSliceY / 2) * dstStride[1], + 
context->srcW, srcSliceH, + dstStride[0], dstStride[1], srcStride[0], + context->input_rgb2yuv_table); + + return 0; +} + +#define YUV_TO_RGB_TABLE \ + c->yuv2rgb_v2r_coeff, \ + c->yuv2rgb_u2g_coeff, \ + c->yuv2rgb_v2g_coeff, \ + c->yuv2rgb_u2b_coeff, \ + +#define DECLARE_FF_YUVX_TO_RGBX_FUNCS(ifmt, ofmt) \ +int ff_##ifmt##_to_##ofmt##_neon(int w, int h, \ + uint8_t *dst, int linesize, \ + const uint8_t *srcY, int linesizeY, \ + const uint8_t *srcU, int linesizeU, \ + const uint8_t *srcV, int linesizeV, \ + const int16_t *table, \ + int y_offset, \ + int y_coeff); \ + \ +static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[], \ + int srcStride[], int srcSliceY, int srcSliceH, \ + uint8_t *dst[], int dstStride[]) { \ + const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE }; \ + \ + ff_##ifmt##_to_##ofmt##_neon(c->srcW, srcSliceH, \ + dst[0] + srcSliceY * dstStride[0], dstStride[0], \ + src[0], srcStride[0], \ + src[1], srcStride[1], \ + src[2], srcStride[2], \ + yuv2rgb_table, \ + c->yuv2rgb_y_offset >> 6, \ + c->yuv2rgb_y_coeff); \ + \ + return 0; \ +} \ + +#define DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuvx) \ +DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, argb) \ +DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, rgba) \ +DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, abgr) \ +DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgra) \ + +DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv420p) +DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv422p) + +#define DECLARE_FF_NVX_TO_RGBX_FUNCS(ifmt, ofmt) \ +int ff_##ifmt##_to_##ofmt##_neon(int w, int h, \ + uint8_t *dst, int linesize, \ + const uint8_t *srcY, int linesizeY, \ + const uint8_t *srcC, int linesizeC, \ + const int16_t *table, \ + int y_offset, \ + int y_coeff); \ + \ +static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[], \ + int srcStride[], int srcSliceY, int srcSliceH, \ + uint8_t *dst[], int dstStride[]) { \ + const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE }; \ + \ + ff_##ifmt##_to_##ofmt##_neon(c->srcW, srcSliceH, \ + dst[0] + 
srcSliceY * dstStride[0], dstStride[0], \ + src[0], srcStride[0], src[1], srcStride[1], \ + yuv2rgb_table, \ + c->yuv2rgb_y_offset >> 6, \ + c->yuv2rgb_y_coeff); \ + \ + return 0; \ +} \ + +#define DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nvx) \ +DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, argb) \ +DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgba) \ +DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, abgr) \ +DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgra) \ + +DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv12) +DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv21) + +/* We need a 16 pixel width alignment. This constraint can easily be removed + * for input reading but for the output which is 4-bytes per pixel (RGBA) the + * assembly might be writing as much as 4*15=60 extra bytes at the end of the + * line, which won't fit the 32-bytes buffer alignment. */ +#define SET_FF_NVX_TO_RGBX_FUNC(ifmt, IFMT, ofmt, OFMT, accurate_rnd) do { \ + if (c->srcFormat == AV_PIX_FMT_##IFMT \ + && c->dstFormat == AV_PIX_FMT_##OFMT \ + && !(c->srcH & 1) \ + && !(c->srcW & 15) \ + && !accurate_rnd) { \ + c->swscale = ifmt##_to_##ofmt##_neon_wrapper; \ + } \ +} while (0) + +#define SET_FF_NVX_TO_ALL_RGBX_FUNC(nvx, NVX, accurate_rnd) do { \ + SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, argb, ARGB, accurate_rnd); \ + SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, rgba, RGBA, accurate_rnd); \ + SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, abgr, ABGR, accurate_rnd); \ + SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, bgra, BGRA, accurate_rnd); \ +} while (0) + +static void get_unscaled_swscale_neon(SwsContext *c) { + int accurate_rnd = c->flags & SWS_ACCURATE_RND; + if (c->srcFormat == AV_PIX_FMT_RGBA + && c->dstFormat == AV_PIX_FMT_NV12 + && (c->srcW >= 16)) { + c->swscale = accurate_rnd ? 
rgbx_to_nv12_neon_32_wrapper + : rgbx_to_nv12_neon_16_wrapper; + } + + SET_FF_NVX_TO_ALL_RGBX_FUNC(nv12, NV12, accurate_rnd); + SET_FF_NVX_TO_ALL_RGBX_FUNC(nv21, NV21, accurate_rnd); + SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv420p, YUV420P, accurate_rnd); + SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv422p, YUV422P, accurate_rnd); +} + +void ff_get_unscaled_swscale_arm(SwsContext *c) +{ + int cpu_flags = av_get_cpu_flags(); + if (have_neon(cpu_flags)) + get_unscaled_swscale_neon(c); +} diff --git a/libswscale/arm/yuv2rgb_neon.S b/libswscale/arm/yuv2rgb_neon.S new file mode 100644 index 0000000000..10950e70b4 --- /dev/null +++ b/libswscale/arm/yuv2rgb_neon.S @@ -0,0 +1,280 @@ +/* + * Copyright (c) 2015 Matthieu Bouron <matthieu.bouron stupeflix.com> + * Copyright (c) 2015 Clément Bœsch <clement stupeflix.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + + +.macro compute_premult + vsub.u16 q14,q11 @ q14 = U * (1 << 3) - 128 * (1 << 3) + vsub.u16 q15,q11 @ q15 = V * (1 << 3) - 128 * (1 << 3) + vqdmulh.s16 q8, q15, d1[0] @ q8 = V * v2r + vqdmulh.s16 q9, q14, d1[1] @ q9 = U * u2g + vqdmulh.s16 q5, q15, d1[2] @ q5 = V * v2g + vadd.s16 q9, q5 @ q9 = U * u2g + V * v2g + vqdmulh.s16 q10,q14, d1[3] @ q10 = U * u2b +.endm + +.macro compute_color dst_comp1 dst_comp2 pre + vadd.s16 q1, q14, \pre + vadd.s16 q2, q15, \pre + vqrshrun.s16 \dst_comp1, q1, #1 + vqrshrun.s16 \dst_comp2, q2, #1 +.endm + +.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2 + compute_color \r1, \r2, q8 + compute_color \g1, \g2, q9 + compute_color \b1, \b2, q10 + vmov.u8 \a1, #255 + vmov.u8 \a2, #255 +.endm + +.macro compute dst ofmt + vshll.u8 q14, d14, #3 @ q14 = Y * (1 << 3) + vshll.u8 q15, d15, #3 @ q15 = Y * (1 << 3) + vsub.s16 q14, q12 @ q14 = (Y - y_offset) + vsub.s16 q15, q12 @ q15 = (Y - y_offset) + vqdmulh.s16 q14, q13 @ q14 = (Y - y_offset) * y_coeff + vqdmulh.s16 q15, q13 @ q15 = (Y - y_offset) * y_coeff + +.ifc \ofmt,argb + compute_rgba d7, d8, d9, d6, d11, d12, d13, d10 +.endif + +.ifc \ofmt,rgba + compute_rgba d6, d7, d8, d9, d10, d11, d12, d13 +.endif + +.ifc \ofmt,abgr + compute_rgba d9, d8, d7, d6, d13, d12, d11, d10 +.endif + +.ifc \ofmt,bgra + compute_rgba d8, d7, d6, d9, d12, d11, d10, d13 +.endif + + vzip.8 d6, d10 @ d6 = R1R2R3R4R5R6R7R8 d10 = R9R10R11R12R13R14R15R16 + vzip.8 d7, d11 @ d7 = G1G2G3G4G5G6G7G8 d11 = G9G10G11G12G13G14G15G16 + vzip.8 d8, d12 @ d8 = B1B2B3B4B5B6B7B8 d12 = B9B10B11B12B13B14B15B16 + vzip.8 d9, d13 @ d9 = A1A2A3A4A5A6A7A8 d13 = A9A10A11A12A13A14A15A16 + vst4.8 {q3, q4}, [\dst,:128]! + vst4.8 {q5, q6}, [\dst,:128]! 
+.endm + +.macro process_1l_internal dst src ofmt + vld2.8 {d14, d15}, [\src]! @ q7 = Y (interleaved) + compute \dst, \ofmt +.endm + +.macro process_1l ofmt + compute_premult + process_1l_internal r2, r4, \ofmt +.endm + +.macro process_2l ofmt + compute_premult + process_1l_internal r2, r4, \ofmt + process_1l_internal r11,r12,\ofmt +.endm + +.macro load_args_nv12 + push {r4-r12, lr} + vpush {q4-q7} + ldr r4, [sp, #104] @ r4 = srcY + ldr r5, [sp, #108] @ r5 = linesizeY + ldr r6, [sp, #112] @ r6 = srcC + ldr r7, [sp, #116] @ r7 = linesizeC + ldr r8, [sp, #120] @ r8 = table + ldr r9, [sp, #124] @ r9 = y_offset + ldr r10,[sp, #128] @ r10 = y_coeff + vdup.16 d0, r10 @ d0 = y_coeff + vld1.16 {d1}, [r8] @ d1 = *table + add r11, r2, r3 @ r11 = dst + linesize (dst2) + add r12, r4, r5 @ r12 = srcY + linesizeY (srcY2) + lsl r3, r3, #1 + lsl r5, r5, #1 + sub r3, r3, r0, lsl #2 @ r3 = linesize * 2 - width * 4 (padding) + sub r5, r5, r0 @ r5 = linesizeY * 2 - width (paddingY) + sub r7, r7, r0 @ r7 = linesizeC - width (paddingC) +.endm + +.macro load_args_nv21 + load_args_nv12 +.endm + +.macro load_args_yuv420p + push {r4-r12, lr} + vpush {q4-q7} + ldr r4, [sp, #104] @ r4 = srcY + ldr r5, [sp, #108] @ r5 = linesizeY + ldr r6, [sp, #112] @ r6 = srcU + ldr r8, [sp, #128] @ r8 = table + ldr r9, [sp, #132] @ r9 = y_offset + ldr r10,[sp, #136] @ r10 = y_coeff + vdup.16 d0, r10 @ d0 = y_coeff + vld1.16 {d1}, [r8] @ d1 = *table + add r11, r2, r3 @ r11 = dst + linesize (dst2) + add r12, r4, r5 @ r12 = srcY + linesizeY (srcY2) + lsl r3, r3, #1 + lsl r5, r5, #1 + sub r3, r3, r0, lsl #2 @ r3 = linesize * 2 - width * 4 (padding) + sub r5, r5, r0 @ r5 = linesizeY * 2 - width (paddingY) + ldr r10,[sp, #120] @ r10 = srcV +.endm + +.macro load_args_yuv422p + push {r4-r12, lr} + vpush {q4-q7} + ldr r4, [sp, #104] @ r4 = srcY + ldr r5, [sp, #108] @ r5 = linesizeY + ldr r6, [sp, #112] @ r6 = srcU + ldr r7, [sp, #116] @ r7 = linesizeU + ldr r12,[sp, #124] @ r12 = linesizeV + ldr r8, [sp, #128] @ r8 
= table + ldr r9, [sp, #132] @ r9 = y_offset + ldr r10,[sp, #136] @ r10 = y_coeff + vdup.16 d0, r10 @ d0 = y_coeff + vld1.16 {d1}, [r8] @ d1 = *table + sub r3, r3, r0, lsl #2 @ r3 = linesize - width * 4 (padding) + sub r5, r5, r0 @ r5 = linesizeY - width (paddingY) + sub r7, r7, r0, lsr #1 @ r7 = linesizeU - width / 2 (paddingU) + sub r12,r12,r0, lsr #1 @ r12 = linesizeV - width / 2 (paddingV) + ldr r10,[sp, #120] @ r10 = srcV +.endm + +.macro load_chroma_nv12 + pld [r12, #64*3] + + vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line + vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3) + vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3) +.endm + +.macro load_chroma_nv21 + pld [r12, #64*3] + + vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line + vshll.u8 q14, d3, #3 @ q14 = U * (1 << 3) + vshll.u8 q15, d2, #3 @ q15 = V * (1 << 3) +.endm + +.macro load_chroma_yuv420p + pld [r10, #64*3] + pld [r12, #64*3] + + vld1.8 d2, [r6]! @ d2: U chroma line (from srcU) + vld1.8 d3, [r10]! @ d3: V chroma line (from srcV) + vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3) + vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3) +.endm + +.macro load_chroma_yuv422p + pld [r10, #64*3] + + vld1.8 d2, [r6]! @ d2: U chroma line (from srcU) + vld1.8 d3, [r10]! 
@ d3: V chroma line (from srcV) + vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3) + vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3) +.endm + +.macro increment_and_test_nv12 + add r11, r11, r3 @ dst2 += padding + add r12, r12, r5 @ srcY2 += paddingY + add r6, r6, r7 @ srcC += paddingC + subs r1, r1, #2 @ height -= 2 +.endm + +.macro increment_and_test_nv21 + increment_and_test_nv12 +.endm + +.macro increment_and_test_yuv420p + add r11, r11, r3 @ dst2 += padding + add r12, r12, r5 @ srcY2 += paddingY + ldr r7, [sp, #116] @ r7 = linesizeU + sub r7, r7, r0, lsr #1 @ r7 = linesizeU - width / 2 (paddingU) + add r6, r6, r7 @ srcU += paddingU + ldr r7, [sp, #124] @ r7 = linesizeV + sub r7, r7, r0, lsr #1 @ r7 = linesizeV - width / 2 (paddingV) + add r10, r10, r7 @ srcV += paddingV + subs r1, r1, #2 @ height -= 2 +.endm + +.macro increment_and_test_yuv422p + add r6, r6, r7 @ srcU += paddingU + add r10,r10,r12 @ srcV += paddingV + subs r1, r1, #1 @ height -= 1 +.endm + +.macro process_nv12 ofmt + process_2l \ofmt +.endm + +.macro process_nv21 ofmt + process_2l \ofmt +.endm + +.macro process_yuv420p ofmt + process_2l \ofmt +.endm + +.macro process_yuv422p ofmt + process_1l \ofmt +.endm + +.macro declare_func ifmt ofmt +function ff_\ifmt\()_to_\ofmt\()_neon, export=1 + load_args_\ifmt + vmov.u16 q11, #1024 @ q11 = 128 * (1 << 3) + vdup.16 q12, r9 @ q12 = y_offset + vmov d26, d0 @ q13 = y_coeff + vmov d27, d0 @ q13 = y_coeff +1: + mov r8, r0 @ r8 = width +2: + pld [r6, #64*3] + pld [r4, #64*3] + vmov.i8 d10, #128 + load_chroma_\ifmt + process_\ifmt \ofmt + subs r8, r8, #16 @ width -= 16 + bgt 2b + add r2, r2, r3 @ dst += padding + add r4, r4, r5 @ srcY += paddingY + increment_and_test_\ifmt + bgt 1b + vpop {q4-q7} + pop {r4-r12, lr} + mov pc, lr +endfunc +.endm + +.macro declare_rgb_funcs ifmt + declare_func \ifmt, argb + declare_func \ifmt, rgba + declare_func \ifmt, abgr + declare_func \ifmt, bgra +.endm + +declare_rgb_funcs nv12 +declare_rgb_funcs nv21 +declare_rgb_funcs yuv420p 
+declare_rgb_funcs yuv422p |