/*
 * Copyright (c) 2008 Mans Rullgard
 * Copyright (c) 2013 Janne Grunau
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

.macro  h264_loop_filter_start
        cmp             w2,  #0
        ldr             w6,  [x4]
        ccmp            w3,  #0, #0, ne
        mov             v24.S[0], w6
        and             w6,  w6,  w6,  lsl #16
        b.eq            1f
        ands            w6,  w6,  w6,  lsl #8
        b.ge            2f
1:
        ret
2:
.endm

.macro  h264_loop_filter_luma
        dup             v22.16B, w2                     // alpha
        uxtl            v24.8H,  v24.8B
        uabd            v21.16B, v16.16B, v0.16B        // abs(p0 - q0)
        uxtl            v24.4S,  v24.4H
        uabd            v28.16B, v18.16B, v16.16B       // abs(p1 - p0)
        sli             v24.8H,  v24.8H,  #8
        uabd            v30.16B, v2.16B,  v0.16B        // abs(q1 - q0)
        sli             v24.4S,  v24.4S,  #16
        cmhi            v21.16B, v22.16B, v21.16B       // < alpha
        dup             v22.16B, w3                     // beta
        cmlt            v23.16B, v24.16B, #0
        cmhi            v28.16B, v22.16B, v28.16B       // < beta
        cmhi            v30.16B, v22.16B, v30.16B       // < beta
        bic             v21.16B, v21.16B, v23.16B
        uabd            v17.16B, v20.16B, v16.16B       // abs(p2 - p0)
        and             v21.16B, v21.16B, v28.16B
        uabd            v19.16B, v4.16B,  v0.16B        // abs(q2 - q0)
        cmhi            v17.16B, v22.16B, v17.16B       // < beta
        and             v21.16B, v21.16B, v30.16B
        cmhi            v19.16B, v22.16B, v19.16B       // < beta
        and             v17.16B, v17.16B, v21.16B
        and             v19.16B, v19.16B, v21.16B
        and             v24.16B, v24.16B, v21.16B
        urhadd          v28.16B, v16.16B, v0.16B
        sub             v21.16B, v24.16B, v17.16B
        uqadd           v23.16B, v18.16B, v24.16B
        uhadd           v20.16B, v20.16B, v28.16B
        sub             v21.16B, v21.16B, v19.16B
        uhadd           v28.16B, v4.16B,  v28.16B
        umin            v23.16B, v23.16B, v20.16B
        uqsub           v22.16B, v18.16B, v24.16B
        uqadd           v4.16B,  v2.16B,  v24.16B
        umax            v23.16B, v23.16B, v22.16B
        uqsub           v22.16B, v2.16B,  v24.16B
        umin            v28.16B, v4.16B,  v28.16B
        uxtl            v4.8H,   v0.8B
        umax            v28.16B, v28.16B, v22.16B
        uxtl2           v20.8H,  v0.16B
        usubw           v4.8H,   v4.8H,   v16.8B
        usubw2          v20.8H,  v20.8H,  v16.16B
        shl             v4.8H,   v4.8H,   #2
        shl             v20.8H,  v20.8H,  #2
        uaddw           v4.8H,   v4.8H,   v18.8B
        uaddw2          v20.8H,  v20.8H,  v18.16B
        usubw           v4.8H,   v4.8H,   v2.8B
        usubw2          v20.8H,  v20.8H,  v2.16B
        rshrn           v4.8B,   v4.8H,   #3
        rshrn2          v4.16B,  v20.8H,  #3
        bsl             v17.16B, v23.16B, v18.16B
        bsl             v19.16B, v28.16B, v2.16B
        neg             v23.16B, v21.16B
        uxtl            v28.8H,  v16.8B
        smin            v4.16B,  v4.16B,  v21.16B
        uxtl2           v21.8H,  v16.16B
        smax            v4.16B,  v4.16B,  v23.16B
        uxtl            v22.8H,  v0.8B
        uxtl2           v24.8H,  v0.16B
        saddw           v28.8H,  v28.8H,  v4.8B
        saddw2          v21.8H,  v21.8H,  v4.16B
        ssubw           v22.8H,  v22.8H,  v4.8B
        ssubw2          v24.8H,  v24.8H,  v4.16B
        sqxtun          v16.8B,  v28.8H
        sqxtun2         v16.16B, v21.8H
        sqxtun          v0.8B,   v22.8H
        sqxtun2         v0.16B,  v24.8H
.endm

function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        sxtw            x1,  w1

        ld1             {v0.16B},  [x0], x1
        ld1             {v2.16B},  [x0], x1
        ld1             {v4.16B},  [x0], x1
        sub             x0,  x0,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #1
        ld1             {v20.16B}, [x0], x1
        ld1             {v18.16B}, [x0], x1
        ld1             {v16.16B}, [x0], x1

        h264_loop_filter_luma

        sub             x0,  x0,  x1, lsl #1
        st1             {v17.16B}, [x0], x1
        st1             {v16.16B}, [x0], x1
        st1             {v0.16B},  [x0], x1
        st1             {v19.16B}, [x0]

        ret
endfunc

function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        sub             x0,  x0,  #4
        ld1             {v6.8B},   [x0], x1
        ld1             {v20.8B},  [x0], x1
        ld1             {v18.8B},  [x0], x1
        ld1             {v16.8B},  [x0], x1
        ld1             {v0.8B},   [x0], x1
        ld1             {v2.8B},   [x0], x1
        ld1             {v4.8B},   [x0], x1
        ld1             {v26.8B},  [x0], x1
        ld1             {v6.D}[1],  [x0], x1
        ld1             {v20.D}[1], [x0], x1
        ld1             {v18.D}[1], [x0], x1
        ld1             {v16.D}[1], [x0], x1
        ld1             {v0.D}[1],  [x0], x1
        ld1             {v2.D}[1],  [x0], x1
        ld1             {v4.D}[1],  [x0], x1
        ld1             {v26.D}[1], [x0], x1

        transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23

        h264_loop_filter_luma

        transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27

        sub             x0,  x0,  x1, lsl #4
        add             x0,  x0,  #2
        st1             {v17.S}[0], [x0], x1
        st1             {v16.S}[0], [x0], x1
        st1             {v0.S}[0],  [x0], x1
        st1             {v19.S}[0], [x0], x1
        st1             {v17.S}[1], [x0], x1
        st1             {v16.S}[1], [x0], x1
        st1             {v0.S}[1],  [x0], x1
        st1             {v19.S}[1], [x0], x1
        st1             {v17.S}[2], [x0], x1
        st1             {v16.S}[2], [x0], x1
        st1             {v0.S}[2],  [x0], x1
        st1             {v19.S}[2], [x0], x1
        st1             {v17.S}[3], [x0], x1
        st1             {v16.S}[3], [x0], x1
        st1             {v0.S}[3],  [x0], x1
        st1             {v19.S}[3], [x0], x1

        ret
endfunc

.macro  h264_loop_filter_chroma
        dup             v22.8B, w2              // alpha
        uxtl            v24.8H, v24.8B
        uabd            v26.8B, v16.8B, v0.8B   // abs(p0 - q0)
        uxtl            v4.8H,  v0.8B
        uabd            v28.8B, v18.8B, v16.8B  // abs(p1 - p0)
        usubw           v4.8H,  v4.8H,  v16.8B
        sli             v24.8H, v24.8H, #8
        shl             v4.8H,  v4.8H,  #2
        uabd            v30.8B, v2.8B,  v0.8B   // abs(q1 - q0)
        uaddw           v4.8H,  v4.8H,  v18.8B
        cmhi            v26.8B, v22.8B, v26.8B  // < alpha
        usubw           v4.8H,  v4.8H,  v2.8B
        dup             v22.8B, w3              // beta
        rshrn           v4.8B,  v4.8H,  #3
        cmhi            v28.8B, v22.8B, v28.8B  // < beta
        cmhi            v30.8B, v22.8B, v30.8B  // < beta
        smin            v4.8B,  v4.8B,  v24.8B
        neg             v25.8B, v24.8B
        and             v26.8B, v26.8B, v28.8B
        smax            v4.8B,  v4.8B,  v25.8B
        and             v26.8B, v26.8B, v30.8B
        uxtl            v22.8H, v0.8B
        and             v4.8B,  v4.8B,  v26.8B
        uxtl            v28.8H, v16.8B
        saddw           v28.8H, v28.8H, v4.8B
        ssubw           v22.8H, v22.8H, v4.8B
        sqxtun          v16.8B, v28.8H
        sqxtun          v0.8B,  v22.8H
.endm

function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             x0,  x0,  x1, lsl #1
        ld1             {v18.8B}, [x0], x1
        ld1             {v16.8B}, [x0], x1
        ld1             {v0.8B},  [x0], x1
        ld1             {v2.8B},  [x0]

        h264_loop_filter_chroma

        sub             x0,  x0,  x1, lsl #1
        st1             {v16.8B}, [x0], x1
        st1             {v0.8B},  [x0], x1

        ret
endfunc

function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             x0,  x0,  #2
        ld1             {v18.S}[0], [x0], x1
        ld1             {v16.S}[0], [x0], x1
        ld1             {v0.S}[0],  [x0], x1
        ld1             {v2.S}[0],  [x0], x1
        ld1             {v18.S}[1], [x0], x1
        ld1             {v16.S}[1], [x0], x1
        ld1             {v0.S}[1],  [x0], x1
        ld1             {v2.S}[1],  [x0], x1

        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        h264_loop_filter_chroma

        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        sub             x0,  x0,  x1, lsl #3
        st1             {v18.S}[0], [x0], x1
        st1             {v16.S}[0], [x0], x1
        st1             {v0.S}[0],  [x0], x1
        st1             {v2.S}[0],  [x0], x1
        st1             {v18.S}[1], [x0], x1
        st1             {v16.S}[1], [x0], x1
        st1             {v0.S}[1],  [x0], x1
        st1             {v2.S}[1],  [x0], x1

        ret
endfunc