Diffstat (limited to 'libswscale/arm')
-rw-r--r--  libswscale/arm/Makefile                 8
-rw-r--r--  libswscale/arm/hscale.S                70
-rw-r--r--  libswscale/arm/output.S                78
-rw-r--r--  libswscale/arm/rgb2yuv_neon_16.S       80
-rw-r--r--  libswscale/arm/rgb2yuv_neon_32.S      119
-rw-r--r--  libswscale/arm/rgb2yuv_neon_common.S  291
-rw-r--r--  libswscale/arm/swscale.c               44
-rw-r--r--  libswscale/arm/swscale_unscaled.c     180
-rw-r--r--  libswscale/arm/yuv2rgb_neon.S         280
9 files changed, 1150 insertions, 0 deletions
diff --git a/libswscale/arm/Makefile b/libswscale/arm/Makefile
new file mode 100644
index 0000000000..792da6b715
--- /dev/null
+++ b/libswscale/arm/Makefile
@@ -0,0 +1,8 @@
+OBJS += arm/swscale.o \
+ arm/swscale_unscaled.o \
+
+NEON-OBJS += arm/rgb2yuv_neon_32.o
+NEON-OBJS += arm/rgb2yuv_neon_16.o
+NEON-OBJS += arm/hscale.o \
+ arm/output.o \
+ arm/yuv2rgb_neon.o \
diff --git a/libswscale/arm/hscale.S b/libswscale/arm/hscale.S
new file mode 100644
index 0000000000..dd4d453957
--- /dev/null
+++ b/libswscale/arm/hscale.S
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2016 Clément Bœsch <clement stupeflix.com>
+ * Copyright (c) 2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+function ff_hscale_8_to_15_neon, export=1
+ push {r4-r12, lr}
+ vpush {q4-q7}
+ ldr r4, [sp, #104] @ filter
+ ldr r5, [sp, #108] @ filterPos
+ ldr r6, [sp, #112] @ filterSize
+ add r10, r4, r6, lsl #1 @ filter2 = filter + filterSize * 2
+1: ldr r8, [r5], #4 @ filterPos[0]
+ ldr r9, [r5], #4 @ filterPos[1]
+ vmov.s32 q4, #0 @ val accumulator
+ vmov.s32 q5, #0 @ val accumulator
+ mov r7, r6 @ tmpfilterSize = filterSize
+ mov r0, r3 @ srcp
+2: add r11, r0, r8 @ srcp + filterPos[0]
+ add r12, r0, r9 @ srcp + filterPos[1]
+ vld1.8 d0, [r11] @ srcp[filterPos[0] + {0..7}]
+ vld1.8 d2, [r12] @ srcp[filterPos[1] + {0..7}]
+ vld1.16 {q2}, [r4]! @ load 8x16-bit filter values
+ vld1.16 {q3}, [r10]! @ load 8x16-bit filter values
+ vmovl.u8 q0, d0 @ unpack src values to 16-bit
+ vmovl.u8 q1, d2 @ unpack src values to 16-bit
+ vmull.s16 q8, d0, d4 @ srcp[filterPos[0] + {0..7}] * filter[{0..7}] (part 1)
+ vmull.s16 q9, d1, d5 @ srcp[filterPos[0] + {0..7}] * filter[{0..7}] (part 2)
+ vmull.s16 q10, d2, d6 @ srcp[filterPos[1] + {0..7}] * filter[{0..7}] (part 1)
+ vmull.s16 q11, d3, d7 @ srcp[filterPos[1] + {0..7}] * filter[{0..7}] (part 2)
+ vpadd.s32 d16, d16, d17 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 1)
+ vpadd.s32 d17, d18, d19 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 2)
+ vpadd.s32 d20, d20, d21 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 1)
+ vpadd.s32 d21, d22, d23 @ horizontal pair adding of the 8x32-bit multiplied values into 4x32-bit (part 2)
+ vadd.s32 q4, q8 @ update val accumulator
+ vadd.s32 q5, q10 @ update val accumulator
+ add r0, #8 @ srcp += 8
+ subs r7, #8 @ tmpfilterSize -= 8
+ bgt 2b @ loop until tmpfilterSize is consumed
+ mov r4, r10 @ filter = filter2
+ add r10, r10, r6, lsl #1 @ filter2 += filterSize * 2
+ vpadd.s32 d8, d8, d9 @ horizontal pair adding of the 8x32-bit sums into 4x32-bit (part 1)
+ vpadd.s32 d9, d10, d11 @ horizontal pair adding of the 8x32-bit sums into 4x32-bit (part 2)
+ vpadd.s32 d8, d8, d9 @ horizontal pair adding of the 4x32-bit sums into 2x32-bit
+ vqshrn.s32 d8, q4, #7 @ shift and clip the 2x16-bit final values
+ vst1.32 {d8[0]},[r1]! @ write destination
+ subs r2, #2 @ dstW -= 2
+ bgt 1b @ loop until end of line
+ vpop {q4-q7}
+ pop {r4-r12, lr}
+ mov pc, lr
+endfunc
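
For reference, the scalar operation this routine vectorizes (it matches the C
reference hScale8To15_c in libswscale/swscale.c; the NEON version writes two
output pixels per iteration and consumes the filter in blocks of eight
coefficients):

#include <stdint.h>

static void hscale_8_to_15_ref(int16_t *dst, int dstW, const uint8_t *src,
                               const int16_t *filter,
                               const int32_t *filterPos, int filterSize)
{
    for (int i = 0; i < dstW; i++) {
        int val = 0;
        for (int j = 0; j < filterSize; j++)
            val += src[filterPos[i] + j] * filter[filterSize * i + j];
        int v = val >> 7;
        dst[i] = v > 0x7FFF ? 0x7FFF : v; /* clip to 15 bits; vqshrn.s32
                                             also saturates negatives */
    }
}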
diff --git a/libswscale/arm/output.S b/libswscale/arm/output.S
new file mode 100644
index 0000000000..70846dee1f
--- /dev/null
+++ b/libswscale/arm/output.S
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2016 Clément Bœsch <clement stupeflix.com>
+ * Copyright (c) 2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+function ff_yuv2planeX_8_neon, export=1
+ push {r4-r12, lr}
+ vpush {q4-q7}
+ ldr r4, [sp, #104] @ dstW
+ ldr r5, [sp, #108] @ dither
+ ldr r6, [sp, #112] @ offset
+ vld1.8 {d0}, [r5] @ load 8x8-bit dither values
+ cmp r6, #0 @ check offsetting which can be 0 or 3 only
+ beq 1f
+ vext.u8 d0, d0, d0, #3 @ honor the offset, which can only be 3 here
+1: vmovl.u8 q0, d0 @ extend dither to 16-bit
+ vshll.u16 q1, d0, #12 @ extend dither to 32-bit with left shift by 12 (part 1)
+ vshll.u16 q2, d1, #12 @ extend dither to 32-bit with left shift by 12 (part 2)
+ mov r7, #0 @ i = 0
+2: vmov.u8 q3, q1 @ initialize accumulator with dithering values (part 1)
+ vmov.u8 q4, q2 @ initialize accumulator with dithering values (part 2)
+ mov r8, r1 @ tmpFilterSize = filterSize
+ mov r9, r2 @ srcp
+ mov r10, r0 @ filterp
+3: ldr r11, [r9], #4 @ get pointer @ src[j]
+ ldr r12, [r9], #4 @ get pointer @ src[j+1]
+ add r11, r11, r7, lsl #1 @ &src[j][i]
+ add r12, r12, r7, lsl #1 @ &src[j+1][i]
+ vld1.16 {q5}, [r11] @ read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
+ vld1.16 {q6}, [r12] @ read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P
+ ldr r11, [r10], #4 @ read 2x16-bit coeffs (X, Y) at (filter[j], filter[j+1])
+ vmov.16 q7, q5 @ copy 8x16-bit @ src[j ][i + {0..7}] for following inplace zip instruction
+ vmov.16 q8, q6 @ copy 8x16-bit @ src[j+1][i + {0..7}] for following inplace zip instruction
+ vzip.16 q7, q8 @ A,I,B,J,C,K,D,L,E,M,F,N,G,O,H,P
+ vdup.32 q15, r11 @ X,Y,X,Y,X,Y,X,Y
+ vmull.s16 q9, d14, d30 @ A*X,I*Y,B*X,J*Y
+ vmull.s16 q10, d15, d31 @ C*X,K*Y,D*X,L*Y
+ vmull.s16 q11, d16, d30 @ E*X,M*Y,F*X,N*Y
+ vmull.s16 q12, d17, d31 @ G*X,O*Y,H*X,P*Y
+ vpadd.s32 d10, d18, d19 @ A*X+I*Y,B*X+J*Y
+ vpadd.s32 d11, d20, d21 @ C*X+K*Y,D*X+L*Y
+ vpadd.s32 d12, d22, d23 @ E*X+M*Y,F*X+N*Y
+ vpadd.s32 d13, d24, d25 @ G*X+O*Y,H*X+P*Y
+ vadd.s32 q3, q5 @ update val accumulator (part 1)
+ vadd.s32 q4, q6 @ update val accumulator (part 2)
+ subs r8, #2 @ tmpFilterSize -= 2
+ bgt 3b @ loop until filterSize is consumed
+ vshr.s32 q3, q3, #19 @ val>>19 (part 1)
+ vshr.s32 q4, q4, #19 @ val>>19 (part 2)
+ vqmovun.s32 d6, q3 @ clip16(val>>19) (part 1)
+ vqmovun.s32 d7, q4 @ clip16(val>>19) (part 2)
+ vqmovn.u16 d6, q3 @ merge part 1 and part 2
+ vst1.8 {d6}, [r3]! @ write destination
+ add r7, #8 @ i += 8
+ subs r4, r4, #8 @ dstW -= 8
+ bgt 2b @ loop until width is consumed
+ vpop {q4-q7}
+ pop {r4-r12, lr}
+ mov pc, lr
+endfunc
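
The scalar model of this vertical pass (mirroring yuv2planeX_8_c in
libswscale/output.c): the accumulator starts from the dither value shifted
left by 12, then sums filterSize source rows, with one 19-bit shift and an
8-bit clip at the end. The NEON version handles only offset values of 0 and
3 (see the vext above) and consumes the filter two taps at a time:

#include <stdint.h>

static uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

static void yuv2planeX_8_ref(const int16_t *filter, int filterSize,
                             const int16_t **src, uint8_t *dest, int dstW,
                             const uint8_t *dither, int offset)
{
    for (int i = 0; i < dstW; i++) {
        int val = dither[(i + offset) & 7] << 12;
        for (int j = 0; j < filterSize; j++)
            val += src[j][i] * filter[j];
        dest[i] = clip_u8(val >> 19);
    }
}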
diff --git a/libswscale/arm/rgb2yuv_neon_16.S b/libswscale/arm/rgb2yuv_neon_16.S
new file mode 100644
index 0000000000..601bc9a9b7
--- /dev/null
+++ b/libswscale/arm/rgb2yuv_neon_16.S
@@ -0,0 +1,80 @@
+/*
+ * Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "rgb2yuv_neon_common.S"
+
+/* downsampled R16G16B16 x8 */
+alias_qw r16x8, q7
+alias_qw g16x8, q8
+alias_qw b16x8, q9
+
+alias n16x16_l, q11
+alias n16x16_h, q12
+
+alias y16x16_l, q13
+alias y16x16_h, q14
+
+alias_qw y8x16, q15
+
+.macro init src
+ vld3.i32 {q13_l, q14_l, q15_l}, [\src]!
+ vld3.i32 {q13_h[0], q14_h[0], q15_h[0]}, [\src]
+ vrshrn.i32 CO_R, q13, #7
+ vrshrn.i32 CO_G, q14, #7
+ vrshrn.i32 CO_B, q15, #7
+
+ vmov.u8 BIAS_Y, #16
+ vmov.u8 BIAS_U, #128
+.endm
+
+
+.macro compute_y_16x1_step action, s8x16, coeff
+ vmovl.u8 n16x16_l, \s8x16\()_l
+ vmovl.u8 n16x16_h, \s8x16\()_h
+
+ \action y16x16_l, n16x16_l, \coeff
+ \action y16x16_h, n16x16_h, \coeff
+.endm
+
+.macro compute_y_16x1
+ compute_y_16x1_step vmul, r8x16, CO_RY
+ compute_y_16x1_step vmla, g8x16, CO_GY
+ compute_y_16x1_step vmla, b8x16, CO_BY
+
+ vrshrn.i16 y8x16_l, y16x16_l, #8
+ vrshrn.i16 y8x16_h, y16x16_h, #8
+
+ vadd.u8 y8x16, y8x16, BIAS_Y
+.endm
+
+alias c16x8, q15
+alias_qw c8x8x2, q10
+
+
+.macro compute_chroma_8x1 c, C
+ vmul c16x8, r16x8, CO_R\C
+ vmla c16x8, g16x8, CO_G\C
+ vmla c16x8, b16x8, CO_B\C
+
+ vrshrn.i16 \c\()8x8, c16x8, #8
+ vadd.u8 \c\()8x8, \c\()8x8, BIAS_\C
+.endm
+
+ loop_420sp rgbx, nv12, init, kernel_420_16x2, 16
diff --git a/libswscale/arm/rgb2yuv_neon_32.S b/libswscale/arm/rgb2yuv_neon_32.S
new file mode 100644
index 0000000000..f51a5f149f
--- /dev/null
+++ b/libswscale/arm/rgb2yuv_neon_32.S
@@ -0,0 +1,119 @@
+/*
+ * Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "rgb2yuv_neon_common.S"
+
+/* downsampled R16G16B16 x8 */
+alias_qw r16x8, q7
+alias_qw g16x8, q8
+alias_qw b16x8, q9
+
+alias n16x16_o, q11
+alias n16x16_ol, q11_l
+alias n16x16_oh, q11_h
+
+alias y32x16_el, q12
+alias y32x16_eh, q13
+alias y32x16_ol, q14
+alias y32x16_oh, q15
+
+alias y16x16_e, q12
+alias y16x16_el, q12_l
+alias y16x16_eh, q12_h
+alias y16x16_o, q13
+alias y16x16_ol, q13_l
+alias y16x16_oh, q13_h
+
+
+alias y8x16, y16x16_e
+
+
+.macro init src
+ // load s32x3x3, narrow to s16x3x3
+ vld3.i32 {q13_l, q14_l, q15_l}, [\src]!
+ vld3.i32 {q13_h[0], q14_h[0], q15_h[0]}, [\src]
+
+ vmovn.i32 CO_R, q13
+ vmovn.i32 CO_G, q14
+ vmovn.i32 CO_B, q15
+
+ vmov.u8 BIAS_Y, #16
+ vmov.u8 BIAS_U, #128
+.endm
+
+
+.macro compute_y_16x1_step action, s8x16, coeff
+ vmov.u8 n16x16_o, #0
+ vtrn.u8 \s8x16, n16x16_o
+
+ \action y32x16_el, \s8x16\()_l, \coeff
+ \action y32x16_eh, \s8x16\()_h, \coeff
+ \action y32x16_ol, n16x16_ol, \coeff
+ \action y32x16_oh, n16x16_oh, \coeff
+.endm
+
+/*
+ * in: r8x16, g8x16, b8x16
+ * out: y8x16
+ * clobber: q11-q15, r8x16, g8x16, b8x16
+ */
+.macro compute_y_16x1
+ compute_y_16x1_step vmull, r8x16, CO_RY
+ compute_y_16x1_step vmlal, g8x16, CO_GY
+ compute_y_16x1_step vmlal, b8x16, CO_BY
+
+ vrshrn.i32 y16x16_el, y32x16_el, #15
+ vrshrn.i32 y16x16_eh, y32x16_eh, #15
+ vrshrn.i32 y16x16_ol, y32x16_ol, #15
+ vrshrn.i32 y16x16_oh, y32x16_oh, #15
+
+ vtrn.8 y16x16_e, y16x16_o
+ vadd.u8 y8x16, y8x16, BIAS_Y
+.endm
+
+alias c32x8_l, q14
+alias c32x8_h, q15
+
+alias_qw c16x8, q13
+alias_qw c8x8x2, q10
+
+.macro compute_chroma_8x1_step action, s16x8, coeff
+ \action c32x8_l, \s16x8\()_l, \coeff
+ \action c32x8_h, \s16x8\()_h, \coeff
+.endm
+
+/*
+ * in: r16x8, g16x8, b16x8
+ * out: c8x8
+ * clobber: q14-q15
+ */
+.macro compute_chroma_8x1 c, C
+ compute_chroma_8x1_step vmull, r16x8, CO_R\C
+ compute_chroma_8x1_step vmlal, g16x8, CO_G\C
+ compute_chroma_8x1_step vmlal, b16x8, CO_B\C
+
+ vrshrn.i32 c16x8_l, c32x8_l, #15
+ vrshrn.i32 c16x8_h, c32x8_h, #15
+ vmovn.i16 \c\()8x8, c16x8
+ vadd.u8 \c\()8x8, \c\()8x8, BIAS_\C
+.endm
+
+
+ loop_420sp rgbx, nv12, init, kernel_420_16x2, 32
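
Both rgb2yuv variants implement the same arithmetic at different intermediate
precision: the _32 kernel keeps 32-bit products and does one rounding shift by
15, while the _16 kernel pre-rounds the coefficients down by 7 bits and
accumulates in 16 bits (vrshrn.i32 #7, then vrshrn.i16 #8), trading accuracy
for speed. A scalar sketch, assuming Q15 coefficients in the
{RY,GY,BY, RU,GU,BU, RV,GV,BV} layout implied by the vld3.i32 deinterleave in
the init macros; chroma is sampled at the top-left of each 2x2 block here,
whereas the NEON code averages the block (downsample/downsample_ars2):

#include <stdint.h>

static void rgbx_to_nv12_ref(const uint8_t *src, uint8_t *y, uint8_t *chroma,
                             int width, int height,
                             int y_stride, int c_stride, int src_stride,
                             const int32_t coeff[9])
{
    for (int j = 0; j < height; j++) {
        const uint8_t *p = src + j * src_stride;
        for (int i = 0; i < width; i++) {
            int r = p[4 * i], g = p[4 * i + 1], b = p[4 * i + 2];
            y[j * y_stride + i] =
                ((coeff[0] * r + coeff[1] * g + coeff[2] * b + (1 << 14)) >> 15) + 16;
            if (!(j & 1) && !(i & 1)) {
                uint8_t *c = chroma + (j / 2) * c_stride + i;
                c[0] = ((coeff[3] * r + coeff[4] * g + coeff[5] * b + (1 << 14)) >> 15) + 128; /* U */
                c[1] = ((coeff[6] * r + coeff[7] * g + coeff[8] * b + (1 << 14)) >> 15) + 128; /* V */
            }
        }
    }
}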
diff --git a/libswscale/arm/rgb2yuv_neon_common.S b/libswscale/arm/rgb2yuv_neon_common.S
new file mode 100644
index 0000000000..30bcecd5bb
--- /dev/null
+++ b/libswscale/arm/rgb2yuv_neon_common.S
@@ -0,0 +1,291 @@
+/*
+ * Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+.macro alias name, tgt, set=1
+.if \set != 0
+ \name .req \tgt
+.else
+ .unreq \name
+.endif
+.endm
+
+.altmacro
+
+.macro alias_dw_all qw, dw_l, dw_h
+ alias q\qw\()_l, d\dw_l
+ alias q\qw\()_h, d\dw_h
+ .if \qw < 15
+ alias_dw_all %(\qw + 1), %(\dw_l + 2), %(\dw_h + 2)
+ .endif
+.endm
+
+alias_dw_all 0, 0, 1
+
+.noaltmacro
+
+.macro alias_qw name, qw, set=1
+ alias \name\(), \qw, \set
+ alias \name\()_l, \qw\()_l, \set
+ alias \name\()_h, \qw\()_h, \set
+.endm
+
+.macro prologue
+ push {r4-r12, lr}
+ vpush {q4-q7}
+.endm
+
+.macro epilogue
+ vpop {q4-q7}
+ pop {r4-r12, pc}
+.endm
+
+.macro load_arg reg, ix
+ ldr \reg, [sp, #((10 * 4 + 4 * 16) + (\ix - 4) * 4)]
+.endm
+
+
+/* ()_to_()_neon(const uint8_t *src, uint8_t *y, uint8_t *chroma,
+ * int width, int height,
+ * int y_stride, int c_stride, int src_stride,
+ * int32_t coeff_table[9]);
+ */
+.macro alias_loop_420sp set=1
+ alias src, r0, \set
+ alias src0, src, \set
+ alias y, r1, \set
+ alias y0, y, \set
+ alias chroma, r2, \set
+ alias width, r3, \set
+ alias header, width, \set
+
+ alias height, r4, \set
+ alias y_stride, r5, \set
+ alias c_stride, r6, \set
+ alias c_padding, c_stride, \set
+ alias src_stride, r7, \set
+
+ alias y0_end, r8, \set
+
+ alias src_padding,r9, \set
+ alias y_padding, r10, \set
+
+ alias src1, r11, \set
+ alias y1, r12, \set
+
+ alias coeff_table,r12, \set
+.endm
+
+
+.macro loop_420sp s_fmt, d_fmt, init, kernel, precision
+
+function \s_fmt\()_to_\d_fmt\()_neon_\precision, export=1
+ prologue
+
+ alias_loop_420sp
+
+ load_arg height, 4
+ load_arg y_stride, 5
+ load_arg c_stride, 6
+ load_arg src_stride, 7
+ load_arg coeff_table, 8
+
+ \init coeff_table
+
+ sub y_padding, y_stride, width
+ sub c_padding, c_stride, width
+ sub src_padding, src_stride, width, LSL #2
+
+ add y0_end, y0, width
+ and header, width, #15
+
+ add y1, y0, y_stride
+ add src1, src0, src_stride
+
+0:
+ cmp header, #0
+ beq 1f
+
+ \kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma, header
+
+1:
+ \kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma
+
+ cmp y0, y0_end
+ blt 1b
+2:
+ add y0, y1, y_padding
+ add y0_end, y1, y_stride
+ add chroma, chroma, c_padding
+ add src0, src1, src_padding
+
+ add y1, y0, y_stride
+ add src1, src0, src_stride
+
+ subs height, height, #2
+
+ bgt 0b
+
+ epilogue
+
+ alias_loop_420sp 0
+
+endfunc
+.endm
+
+.macro downsample
+ vpaddl.u8 r16x8, r8x16
+ vpaddl.u8 g16x8, g8x16
+ vpaddl.u8 b16x8, b8x16
+.endm
+
+
+/* accumulate and right shift by 2 */
+.macro downsample_ars2
+ vpadal.u8 r16x8, r8x16
+ vpadal.u8 g16x8, g8x16
+ vpadal.u8 b16x8, b8x16
+
+ vrshr.u16 r16x8, r16x8, #2
+ vrshr.u16 g16x8, g16x8, #2
+ vrshr.u16 b16x8, b16x8, #2
+.endm
+
+.macro store_y8_16x1 dst, count
+.ifc "\count",""
+ vstmia \dst!, {y8x16}
+.else
+ vstmia \dst, {y8x16}
+ add \dst, \dst, \count
+.endif
+.endm
+
+.macro store_chroma_nv12_8x1 dst, count
+.ifc "\count",""
+ vst2.i8 {u8x8, v8x8}, [\dst]!
+.else
+ vst2.i8 {u8x8, v8x8}, [\dst], \count
+.endif
+.endm
+
+.macro store_chroma_nv21_8x1 dst, count
+.ifc "\count",""
+ vst2.i8 {v8x8, u8x8}, [\dst]!
+.else
+ vst2.i8 {v8x8, u8x8}, [\dst], \count
+.endif
+.endm
+
+.macro load_8888_16x1 a, b, c, d, src, count
+.ifc "\count",""
+ vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]!
+ vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]!
+.else
+ vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]!
+ vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]
+ sub \src, \src, #32
+ add \src, \src, \count, LSL #2
+.endif
+.endm
+
+.macro load_rgbx_16x1 src, count
+ load_8888_16x1 r, g, b, x, \src, \count
+.endm
+
+.macro load_bgrx_16x1 src, count
+ load_8888_16x1 b, g, r, x, \src, \count
+.endm
+
+.macro alias_src_rgbx set=1
+ alias_src_8888 r, g, b, x, \set
+.endm
+
+.macro alias_src_bgrx set=1
+ alias_src_8888 b, g, r, x, \set
+.endm
+
+.macro alias_dst_nv12 set=1
+ alias u8x8, c8x8x2_l, \set
+ alias v8x8, c8x8x2_h, \set
+.endm
+
+.macro alias_dst_nv21 set=1
+ alias v8x8, c8x8x2_l, \set
+ alias u8x8, c8x8x2_h, \set
+.endm
+
+
+// common aliases
+
+alias CO_R d0
+CO_RY .dn d0.s16[0]
+CO_RU .dn d0.s16[1]
+CO_RV .dn d0.s16[2]
+
+alias CO_G d1
+CO_GY .dn d1.s16[0]
+CO_GU .dn d1.s16[1]
+CO_GV .dn d1.s16[2]
+
+alias CO_B d2
+CO_BY .dn d2.s16[0]
+CO_BU .dn d2.s16[1]
+CO_BV .dn d2.s16[2]
+
+alias BIAS_U, d3
+alias BIAS_V, BIAS_U
+
+alias BIAS_Y, q2
+
+
+/* q3-q6 R8G8B8X8 x16 */
+
+.macro alias_src_8888 a, b, c, d, set
+ alias_qw \a\()8x16, q3, \set
+ alias_qw \b\()8x16, q4, \set
+ alias_qw \c\()8x16, q5, \set
+ alias_qw \d\()8x16, q6, \set
+.endm
+
+.macro kernel_420_16x2 rgb_fmt, yuv_fmt, rgb0, rgb1, y0, y1, chroma, count
+ alias_src_\rgb_fmt
+ alias_dst_\yuv_fmt
+
+ load_\rgb_fmt\()_16x1 \rgb0, \count
+
+ downsample
+ compute_y_16x1
+ store_y8_16x1 \y0, \count
+
+
+ load_\rgb_fmt\()_16x1 \rgb1, \count
+ downsample_ars2
+ compute_y_16x1
+ store_y8_16x1 \y1, \count
+
+ compute_chroma_8x1 u, U
+ compute_chroma_8x1 v, V
+
+ store_chroma_\yuv_fmt\()_8x1 \chroma, \count
+
+ alias_dst_\yuv_fmt 0
+ alias_src_\rgb_fmt 0
+.endm
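
The loop_420sp macro above generates the row loop shared by both rgb2yuv
kernels: two source rows per pass (one full chroma row), with a partial
"header" block of width % 16 pixels handled once before the 16-pixel-wide
kernel loop. A control-flow sketch in C, kernel bodies elided:

#include <stdint.h>

static void loop_420sp_ref(const uint8_t *src, uint8_t *y, uint8_t *chroma,
                           int width, int height,
                           int y_stride, int c_stride, int src_stride)
{
    for (int row = 0; row < height; row += 2) {
        int i = width & 15;
        if (i) {
            /* kernel_420_16x2 on the first (width % 16)-pixel block */
        }
        for (; i < width; i += 16) {
            /* kernel_420_16x2 on a full 16-pixel block of both rows */
        }
        src    += 2 * src_stride; /* 4 bytes/pixel advance happens in kernel */
        y      += 2 * y_stride;
        chroma += c_stride;
    }
}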
diff --git a/libswscale/arm/swscale.c b/libswscale/arm/swscale.c
new file mode 100644
index 0000000000..1ec360fe24
--- /dev/null
+++ b/libswscale/arm/swscale.c
@@ -0,0 +1,44 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+#include "libavutil/arm/cpu.h"
+
+void ff_hscale_8_to_15_neon(SwsContext *c, int16_t *dst, int dstW,
+ const uint8_t *src, const int16_t *filter,
+ const int32_t *filterPos, int filterSize);
+
+void ff_yuv2planeX_8_neon(const int16_t *filter, int filterSize,
+ const int16_t **src, uint8_t *dest, int dstW,
+ const uint8_t *dither, int offset);
+
+av_cold void ff_sws_init_swscale_arm(SwsContext *c)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+ if (c->srcBpc == 8 && c->dstBpc <= 14) {
+ c->hyScale = c->hcScale = ff_hscale_8_to_15_neon;
+ }
+ if (c->dstBpc == 8) {
+ c->yuv2planeX = ff_yuv2planeX_8_neon;
+ }
+ }
+}
diff --git a/libswscale/arm/swscale_unscaled.c b/libswscale/arm/swscale_unscaled.c
new file mode 100644
index 0000000000..e1597ab42d
--- /dev/null
+++ b/libswscale/arm/swscale_unscaled.c
@@ -0,0 +1,180 @@
+/*
+ * Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libswscale/swscale.h"
+#include "libswscale/swscale_internal.h"
+#include "libavutil/arm/cpu.h"
+
+extern void rgbx_to_nv12_neon_32(const uint8_t *src, uint8_t *y, uint8_t *chroma,
+ int width, int height,
+ int y_stride, int c_stride, int src_stride,
+ int32_t coeff_tbl[9]);
+
+extern void rgbx_to_nv12_neon_16(const uint8_t *src, uint8_t *y, uint8_t *chroma,
+ int width, int height,
+ int y_stride, int c_stride, int src_stride,
+ int32_t coeff_tbl[9]);
+
+static int rgbx_to_nv12_neon_32_wrapper(SwsContext *context, const uint8_t *src[],
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[]) {
+
+ rgbx_to_nv12_neon_32(src[0] + srcSliceY * srcStride[0],
+ dst[0] + srcSliceY * dstStride[0],
+ dst[1] + (srcSliceY / 2) * dstStride[1],
+ context->srcW, srcSliceH,
+ dstStride[0], dstStride[1], srcStride[0],
+ context->input_rgb2yuv_table);
+
+ return 0;
+}
+
+static int rgbx_to_nv12_neon_16_wrapper(SwsContext *context, const uint8_t *src[],
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[]) {
+
+ rgbx_to_nv12_neon_16(src[0] + srcSliceY * srcStride[0],
+ dst[0] + srcSliceY * dstStride[0],
+ dst[1] + (srcSliceY / 2) * dstStride[1],
+ context->srcW, srcSliceH,
+ dstStride[0], dstStride[1], srcStride[0],
+ context->input_rgb2yuv_table);
+
+ return 0;
+}
+
+#define YUV_TO_RGB_TABLE \
+ c->yuv2rgb_v2r_coeff, \
+ c->yuv2rgb_u2g_coeff, \
+ c->yuv2rgb_v2g_coeff, \
+ c->yuv2rgb_u2b_coeff, \
+
+#define DECLARE_FF_YUVX_TO_RGBX_FUNCS(ifmt, ofmt) \
+int ff_##ifmt##_to_##ofmt##_neon(int w, int h, \
+ uint8_t *dst, int linesize, \
+ const uint8_t *srcY, int linesizeY, \
+ const uint8_t *srcU, int linesizeU, \
+ const uint8_t *srcV, int linesizeV, \
+ const int16_t *table, \
+ int y_offset, \
+ int y_coeff); \
+ \
+static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[], \
+ int srcStride[], int srcSliceY, int srcSliceH, \
+ uint8_t *dst[], int dstStride[]) { \
+ const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE }; \
+ \
+ ff_##ifmt##_to_##ofmt##_neon(c->srcW, srcSliceH, \
+ dst[0] + srcSliceY * dstStride[0], dstStride[0], \
+ src[0], srcStride[0], \
+ src[1], srcStride[1], \
+ src[2], srcStride[2], \
+ yuv2rgb_table, \
+ c->yuv2rgb_y_offset >> 6, \
+ c->yuv2rgb_y_coeff); \
+ \
+ return 0; \
+} \
+
+#define DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuvx) \
+DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, argb) \
+DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, rgba) \
+DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, abgr) \
+DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgra) \
+
+DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv420p)
+DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv422p)
+
+#define DECLARE_FF_NVX_TO_RGBX_FUNCS(ifmt, ofmt) \
+int ff_##ifmt##_to_##ofmt##_neon(int w, int h, \
+ uint8_t *dst, int linesize, \
+ const uint8_t *srcY, int linesizeY, \
+ const uint8_t *srcC, int linesizeC, \
+ const int16_t *table, \
+ int y_offset, \
+ int y_coeff); \
+ \
+static int ifmt##_to_##ofmt##_neon_wrapper(SwsContext *c, const uint8_t *src[], \
+ int srcStride[], int srcSliceY, int srcSliceH, \
+ uint8_t *dst[], int dstStride[]) { \
+ const int16_t yuv2rgb_table[] = { YUV_TO_RGB_TABLE }; \
+ \
+ ff_##ifmt##_to_##ofmt##_neon(c->srcW, srcSliceH, \
+ dst[0] + srcSliceY * dstStride[0], dstStride[0], \
+ src[0], srcStride[0], src[1], srcStride[1], \
+ yuv2rgb_table, \
+ c->yuv2rgb_y_offset >> 6, \
+ c->yuv2rgb_y_coeff); \
+ \
+ return 0; \
+} \
+
+#define DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nvx) \
+DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, argb) \
+DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgba) \
+DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, abgr) \
+DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgra) \
+
+DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv12)
+DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv21)
+
+/* We need 16-pixel width alignment. This constraint could easily be removed
+ * for input reading, but the output is 4 bytes per pixel (RGBA), so the
+ * assembly might write as much as 4*15=60 extra bytes at the end of the
+ * line, which won't fit the 32-byte buffer alignment. */
+#define SET_FF_NVX_TO_RGBX_FUNC(ifmt, IFMT, ofmt, OFMT, accurate_rnd) do { \
+ if (c->srcFormat == AV_PIX_FMT_##IFMT \
+ && c->dstFormat == AV_PIX_FMT_##OFMT \
+ && !(c->srcH & 1) \
+ && !(c->srcW & 15) \
+ && !accurate_rnd) { \
+ c->swscale = ifmt##_to_##ofmt##_neon_wrapper; \
+ } \
+} while (0)
+
+#define SET_FF_NVX_TO_ALL_RGBX_FUNC(nvx, NVX, accurate_rnd) do { \
+ SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, argb, ARGB, accurate_rnd); \
+ SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, rgba, RGBA, accurate_rnd); \
+ SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, abgr, ABGR, accurate_rnd); \
+ SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, bgra, BGRA, accurate_rnd); \
+} while (0)
+
+static void get_unscaled_swscale_neon(SwsContext *c) {
+ int accurate_rnd = c->flags & SWS_ACCURATE_RND;
+ if (c->srcFormat == AV_PIX_FMT_RGBA
+ && c->dstFormat == AV_PIX_FMT_NV12
+ && (c->srcW >= 16)) {
+ c->swscale = accurate_rnd ? rgbx_to_nv12_neon_32_wrapper
+ : rgbx_to_nv12_neon_16_wrapper;
+ }
+
+ SET_FF_NVX_TO_ALL_RGBX_FUNC(nv12, NV12, accurate_rnd);
+ SET_FF_NVX_TO_ALL_RGBX_FUNC(nv21, NV21, accurate_rnd);
+ SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv420p, YUV420P, accurate_rnd);
+ SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv422p, YUV422P, accurate_rnd);
+}
+
+void ff_get_unscaled_swscale_arm(SwsContext *c)
+{
+ int cpu_flags = av_get_cpu_flags();
+ if (have_neon(cpu_flags))
+ get_unscaled_swscale_neon(c);
+}
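
For context, a minimal usage sketch: a swscale context that should select the
NEON RGBA->NV12 path above (unscaled conversion, width >= 16; the 1280x720
geometry is only an example). Adding SWS_ACCURATE_RND to the flags selects
the more precise 32-bit kernel instead of the 16-bit one:

#include <stddef.h>
#include <libswscale/swscale.h>

static struct SwsContext *get_rgba_to_nv12_ctx(void)
{
    return sws_getContext(1280, 720, AV_PIX_FMT_RGBA,
                          1280, 720, AV_PIX_FMT_NV12,
                          SWS_BILINEAR, NULL, NULL, NULL);
}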
diff --git a/libswscale/arm/yuv2rgb_neon.S b/libswscale/arm/yuv2rgb_neon.S
new file mode 100644
index 0000000000..10950e70b4
--- /dev/null
+++ b/libswscale/arm/yuv2rgb_neon.S
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2015 Matthieu Bouron <matthieu.bouron stupeflix.com>
+ * Copyright (c) 2015 Clément Bœsch <clement stupeflix.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+
+.macro compute_premult
+ vsub.u16 q14,q11 @ q14 = U * (1 << 3) - 128 * (1 << 3)
+ vsub.u16 q15,q11 @ q15 = V * (1 << 3) - 128 * (1 << 3)
+ vqdmulh.s16 q8, q15, d1[0] @ q8 = V * v2r
+ vqdmulh.s16 q9, q14, d1[1] @ q9 = U * u2g
+ vqdmulh.s16 q5, q15, d1[2] @ q5 = V * v2g
+ vadd.s16 q9, q5 @ q9 = U * u2g + V * v2g
+ vqdmulh.s16 q10,q14, d1[3] @ q10 = U * u2b
+.endm
+
+.macro compute_color dst_comp1 dst_comp2 pre
+ vadd.s16 q1, q14, \pre
+ vadd.s16 q2, q15, \pre
+ vqrshrun.s16 \dst_comp1, q1, #1
+ vqrshrun.s16 \dst_comp2, q2, #1
+.endm
+
+.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
+ compute_color \r1, \r2, q8
+ compute_color \g1, \g2, q9
+ compute_color \b1, \b2, q10
+ vmov.u8 \a1, #255
+ vmov.u8 \a2, #255
+.endm
+
+.macro compute dst ofmt
+ vshll.u8 q14, d14, #3 @ q14 = Y * (1 << 3)
+ vshll.u8 q15, d15, #3 @ q15 = Y * (1 << 3)
+ vsub.s16 q14, q12 @ q14 = (Y - y_offset)
+ vsub.s16 q15, q12 @ q15 = (Y - y_offset)
+ vqdmulh.s16 q14, q13 @ q14 = (Y - y_offset) * y_coeff
+ vqdmulh.s16 q15, q13 @ q15 = (Y - y_offset) * y_coeff
+
+.ifc \ofmt,argb
+ compute_rgba d7, d8, d9, d6, d11, d12, d13, d10
+.endif
+
+.ifc \ofmt,rgba
+ compute_rgba d6, d7, d8, d9, d10, d11, d12, d13
+.endif
+
+.ifc \ofmt,abgr
+ compute_rgba d9, d8, d7, d6, d13, d12, d11, d10
+.endif
+
+.ifc \ofmt,bgra
+ compute_rgba d8, d7, d6, d9, d12, d11, d10, d13
+.endif
+
+ vzip.8 d6, d10 @ d6 = R1R2R3R4R5R6R7R8 d10 = R9R10R11R12R13R14R15R16
+ vzip.8 d7, d11 @ d7 = G1G2G3G4G5G6G7G8 d11 = G9G10G11G12G13G14G15G16
+ vzip.8 d8, d12 @ d8 = B1B2B3B4B5B6B7B8 d12 = B9B10B11B12B13B14B15B16
+ vzip.8 d9, d13 @ d9 = A1A2A3A4A5A6A7A8 d13 = A9A10A11A12A13A14A15A16
+ vst4.8 {q3, q4}, [\dst,:128]!
+ vst4.8 {q5, q6}, [\dst,:128]!
+.endm
+
+.macro process_1l_internal dst src ofmt
+ vld2.8 {d14, d15}, [\src]! @ q7 = Y (interleaved)
+ compute \dst, \ofmt
+.endm
+
+.macro process_1l ofmt
+ compute_premult
+ process_1l_internal r2, r4, \ofmt
+.endm
+
+.macro process_2l ofmt
+ compute_premult
+ process_1l_internal r2, r4, \ofmt
+ process_1l_internal r11,r12,\ofmt
+.endm
+
+.macro load_args_nv12
+ push {r4-r12, lr}
+ vpush {q4-q7}
+ ldr r4, [sp, #104] @ r4 = srcY
+ ldr r5, [sp, #108] @ r5 = linesizeY
+ ldr r6, [sp, #112] @ r6 = srcC
+ ldr r7, [sp, #116] @ r7 = linesizeC
+ ldr r8, [sp, #120] @ r8 = table
+ ldr r9, [sp, #124] @ r9 = y_offset
+ ldr r10,[sp, #128] @ r10 = y_coeff
+ vdup.16 d0, r10 @ d0 = y_coeff
+ vld1.16 {d1}, [r8] @ d1 = *table
+ add r11, r2, r3 @ r11 = dst + linesize (dst2)
+ add r12, r4, r5 @ r12 = srcY + linesizeY (srcY2)
+ lsl r3, r3, #1
+ lsl r5, r5, #1
+ sub r3, r3, r0, lsl #2 @ r3 = linesize * 2 - width * 4 (padding)
+ sub r5, r5, r0 @ r5 = linesizeY * 2 - width (paddingY)
+ sub r7, r7, r0 @ r7 = linesizeC - width (paddingC)
+.endm
+
+.macro load_args_nv21
+ load_args_nv12
+.endm
+
+.macro load_args_yuv420p
+ push {r4-r12, lr}
+ vpush {q4-q7}
+ ldr r4, [sp, #104] @ r4 = srcY
+ ldr r5, [sp, #108] @ r5 = linesizeY
+ ldr r6, [sp, #112] @ r6 = srcU
+ ldr r8, [sp, #128] @ r8 = table
+ ldr r9, [sp, #132] @ r9 = y_offset
+ ldr r10,[sp, #136] @ r10 = y_coeff
+ vdup.16 d0, r10 @ d0 = y_coeff
+ vld1.16 {d1}, [r8] @ d1 = *table
+ add r11, r2, r3 @ r11 = dst + linesize (dst2)
+ add r12, r4, r5 @ r12 = srcY + linesizeY (srcY2)
+ lsl r3, r3, #1
+ lsl r5, r5, #1
+ sub r3, r3, r0, lsl #2 @ r3 = linesize * 2 - width * 4 (padding)
+ sub r5, r5, r0 @ r5 = linesizeY * 2 - width (paddingY)
+ ldr r10,[sp, #120] @ r10 = srcV
+.endm
+
+.macro load_args_yuv422p
+ push {r4-r12, lr}
+ vpush {q4-q7}
+ ldr r4, [sp, #104] @ r4 = srcY
+ ldr r5, [sp, #108] @ r5 = linesizeY
+ ldr r6, [sp, #112] @ r6 = srcU
+ ldr r7, [sp, #116] @ r7 = linesizeU
+ ldr r12,[sp, #124] @ r12 = linesizeV
+ ldr r8, [sp, #128] @ r8 = table
+ ldr r9, [sp, #132] @ r9 = y_offset
+ ldr r10,[sp, #136] @ r10 = y_coeff
+ vdup.16 d0, r10 @ d0 = y_coeff
+ vld1.16 {d1}, [r8] @ d1 = *table
+ sub r3, r3, r0, lsl #2 @ r3 = linesize - width * 4 (padding)
+ sub r5, r5, r0 @ r5 = linesizeY - width (paddingY)
+ sub r7, r7, r0, lsr #1 @ r7 = linesizeU - width / 2 (paddingU)
+ sub r12,r12,r0, lsr #1 @ r12 = linesizeV - width / 2 (paddingV)
+ ldr r10,[sp, #120] @ r10 = srcV
+.endm
+
+.macro load_chroma_nv12
+ pld [r12, #64*3]
+
+ vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line
+ vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3)
+ vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3)
+.endm
+
+.macro load_chroma_nv21
+ pld [r12, #64*3]
+
+ vld2.8 {d2, d3}, [r6]! @ q1: interleaved chroma line
+ vshll.u8 q14, d3, #3 @ q14 = U * (1 << 3)
+ vshll.u8 q15, d2, #3 @ q15 = V * (1 << 3)
+.endm
+
+.macro load_chroma_yuv420p
+ pld [r10, #64*3]
+ pld [r12, #64*3]
+
+ vld1.8 d2, [r6]! @ d2: U (Cb) chroma line
+ vld1.8 d3, [r10]! @ d3: V (Cr) chroma line
+ vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3)
+ vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3)
+.endm
+
+.macro load_chroma_yuv422p
+ pld [r10, #64*3]
+
+ vld1.8 d2, [r6]! @ d2: U (Cb) chroma line
+ vld1.8 d3, [r10]! @ d3: V (Cr) chroma line
+ vshll.u8 q14, d2, #3 @ q14 = U * (1 << 3)
+ vshll.u8 q15, d3, #3 @ q15 = V * (1 << 3)
+.endm
+
+.macro increment_and_test_nv12
+ add r11, r11, r3 @ dst2 += padding
+ add r12, r12, r5 @ srcY2 += paddingY
+ add r6, r6, r7 @ srcC += paddingC
+ subs r1, r1, #2 @ height -= 2
+.endm
+
+.macro increment_and_test_nv21
+ increment_and_test_nv12
+.endm
+
+.macro increment_and_test_yuv420p
+ add r11, r11, r3 @ dst2 += padding
+ add r12, r12, r5 @ srcY2 += paddingY
+ ldr r7, [sp, #116] @ r7 = linesizeU
+ sub r7, r7, r0, lsr #1 @ r7 = linesizeU - width / 2 (paddingU)
+ add r6, r6, r7 @ srcU += paddingU
+ ldr r7, [sp, #124] @ r7 = linesizeV
+ sub r7, r7, r0, lsr #1 @ r7 = linesizeV - width / 2 (paddingV)
+ add r10, r10, r7 @ srcV += paddingV
+ subs r1, r1, #2 @ height -= 2
+.endm
+
+.macro increment_and_test_yuv422p
+ add r6, r6, r7 @ srcU += paddingU
+ add r10,r10,r12 @ srcV += paddingV
+ subs r1, r1, #1 @ height -= 1
+.endm
+
+.macro process_nv12 ofmt
+ process_2l \ofmt
+.endm
+
+.macro process_nv21 ofmt
+ process_2l \ofmt
+.endm
+
+.macro process_yuv420p ofmt
+ process_2l \ofmt
+.endm
+
+.macro process_yuv422p ofmt
+ process_1l \ofmt
+.endm
+
+.macro declare_func ifmt ofmt
+function ff_\ifmt\()_to_\ofmt\()_neon, export=1
+ load_args_\ifmt
+ vmov.u16 q11, #1024 @ q11 = 128 * (1 << 3)
+ vdup.16 q12, r9 @ q12 = y_offset
+ vmov d26, d0 @ q13 = y_coeff
+ vmov d27, d0 @ q13 = y_coeff
+1:
+ mov r8, r0 @ r8 = width
+2:
+ pld [r6, #64*3]
+ pld [r4, #64*3]
+ vmov.i8 d10, #128
+ load_chroma_\ifmt
+ process_\ifmt \ofmt
+ subs r8, r8, #16 @ width -= 16
+ bgt 2b
+ add r2, r2, r3 @ dst += padding
+ add r4, r4, r5 @ srcY += paddingY
+ increment_and_test_\ifmt
+ bgt 1b
+ vpop {q4-q7}
+ pop {r4-r12, lr}
+ mov pc, lr
+endfunc
+.endm
+
+.macro declare_rgb_funcs ifmt
+ declare_func \ifmt, argb
+ declare_func \ifmt, rgba
+ declare_func \ifmt, abgr
+ declare_func \ifmt, bgra
+.endm
+
+declare_rgb_funcs nv12
+declare_rgb_funcs nv21
+declare_rgb_funcs yuv420p
+declare_rgb_funcs yuv422p
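
A scalar sketch of one pixel of the kernels above, for the rgba output
ordering (the other variants only permute channels). Here table holds
{v2r, u2g, v2g, u2b} and y_offset is pre-shifted right by 6, matching
YUV_TO_RGB_TABLE and the wrappers in swscale_unscaled.c; the NEON code does
this on 16 pixels at a time, sharing each chroma pair between two
horizontally adjacent pixels (and, for 420 input, between two rows).
Saturation details of the q-instructions are simplified:

#include <stdint.h>

static uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

/* Doubling high-half multiply, as NEON vqdmulh.s16 computes per lane
 * (lane saturation omitted in this sketch). */
static int qdmulh(int a, int b) { return (a * b * 2) >> 16; }

static void yuv_to_rgba_pixel(uint8_t y, uint8_t u, uint8_t v,
                              const int16_t table[4],
                              int y_offset, int y_coeff, uint8_t rgba[4])
{
    int luma = qdmulh(y * 8 - y_offset, y_coeff);
    int cu   = u * 8 - 1024; /* 128 * (1 << 3) bias */
    int cv   = v * 8 - 1024;
    rgba[0] = clip_u8((luma + qdmulh(cv, table[0]) + 1) >> 1);                        /* R */
    rgba[1] = clip_u8((luma + qdmulh(cu, table[1]) + qdmulh(cv, table[2]) + 1) >> 1); /* G */
    rgba[2] = clip_u8((luma + qdmulh(cu, table[3]) + 1) >> 1);                        /* B */
    rgba[3] = 255;
}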