From 8438b3f09f6b225d0886cc385117c38eb44ca0c1 Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Fri, 13 Dec 2013 00:33:48 +0100 Subject: aarch64: h264 idct NEON assembler optimizations Ported from ARMv7 NEON. --- libavcodec/aarch64/Makefile | 2 + libavcodec/aarch64/h264dsp_init_aarch64.c | 62 +++++ libavcodec/aarch64/h264idct_neon.S | 408 ++++++++++++++++++++++++++++++ libavcodec/aarch64/neon.S | 61 +++++ 4 files changed, 533 insertions(+) create mode 100644 libavcodec/aarch64/h264dsp_init_aarch64.c create mode 100644 libavcodec/aarch64/h264idct_neon.S create mode 100644 libavcodec/aarch64/neon.S (limited to 'libavcodec/aarch64') diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile index 879e6befee..06e3c778ad 100644 --- a/libavcodec/aarch64/Makefile +++ b/libavcodec/aarch64/Makefile @@ -1,5 +1,7 @@ OBJS-$(CONFIG_H264CHROMA) += aarch64/h264chroma_init_aarch64.o +OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_init_aarch64.o OBJS-$(CONFIG_RV40_DECODER) += aarch64/rv40dsp_init_aarch64.o OBJS-$(CONFIG_VC1_DECODER) += aarch64/vc1dsp_init_aarch64.o NEON-OBJS-$(CONFIG_H264CHROMA) += aarch64/h264cmc_neon.o +NEON-OBJS-$(CONFIG_H264DSP) += aarch64/h264idct_neon.o diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c new file mode 100644 index 0000000000..8148336e66 --- /dev/null +++ b/libavcodec/aarch64/h264dsp_init_aarch64.c @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2010 Mans Rullgard + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/aarch64/cpu.h" +#include "libavcodec/h264dsp.h" + +void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, int stride); +void ff_h264_idct_dc_add_neon(uint8_t *dst, int16_t *block, int stride); +void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset, + int16_t *block, int stride, + const uint8_t nnzc[6*8]); +void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset, + int16_t *block, int stride, + const uint8_t nnzc[6*8]); +void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset, + int16_t *block, int stride, + const uint8_t nnzc[6*8]); + +void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, int stride); +void ff_h264_idct8_dc_add_neon(uint8_t *dst, int16_t *block, int stride); +void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset, + int16_t *block, int stride, + const uint8_t nnzc[6*8]); + +av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth, + const int chroma_format_idc) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags) && bit_depth == 8) { + c->h264_idct_add = ff_h264_idct_add_neon; + c->h264_idct_dc_add = ff_h264_idct_dc_add_neon; + c->h264_idct_add16 = ff_h264_idct_add16_neon; + c->h264_idct_add16intra = ff_h264_idct_add16intra_neon; + if (chroma_format_idc <= 1) + c->h264_idct_add8 = ff_h264_idct_add8_neon; + c->h264_idct8_add = ff_h264_idct8_add_neon; + c->h264_idct8_dc_add = ff_h264_idct8_dc_add_neon; + c->h264_idct8_add4 = ff_h264_idct8_add4_neon; + } +} diff --git a/libavcodec/aarch64/h264idct_neon.S b/libavcodec/aarch64/h264idct_neon.S new file mode 100644 index 0000000000..1c90c4c7ec --- /dev/null +++ b/libavcodec/aarch64/h264idct_neon.S @@ -0,0 +1,408 @@ +/* + * Copyright (c) 2008 Mans Rullgard + * Copyright (c) 2013 Janne Grunau + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" +#include "neon.S" + +function ff_h264_idct_add_neon, export=1 + ld1 {v0.4H, v1.4H, v2.4H, v3.4H}, [x1] + sxtw x2, w2 + movi v30.8H, #0 + + add v4.4H, v0.4H, v2.4H + sshr v16.4H, v1.4H, #1 + st1 {v30.8H}, [x1], #16 + sshr v17.4H, v3.4H, #1 + st1 {v30.8H}, [x1], #16 + sub v5.4H, v0.4H, v2.4H + add v6.4H, v1.4H, v17.4H + sub v7.4H, v16.4H, v3.4H + add v0.4H, v4.4H, v6.4H + add v1.4H, v5.4H, v7.4H + sub v2.4H, v4.4H, v6.4H + sub v3.4H, v5.4H, v7.4H + + transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7 + + add v4.4H, v0.4H, v3.4H + ld1 {v18.S}[0], [x0], x2 + sshr v16.4H, v2.4H, #1 + sshr v17.4H, v1.4H, #1 + ld1 {v19.S}[1], [x0], x2 + sub v5.4H, v0.4H, v3.4H + ld1 {v18.S}[1], [x0], x2 + add v6.4H, v16.4H, v1.4H + ins v4.D[1], v5.D[0] + sub v7.4H, v2.4H, v17.4H + ld1 {v19.S}[0], [x0], x2 + ins v6.D[1], v7.D[0] + sub x0, x0, x2, lsl #2 + add v0.8H, v4.8H, v6.8H + sub v1.8H, v4.8H, v6.8H + + srshr v0.8H, v0.8H, #6 + srshr v1.8H, v1.8H, #6 + + uaddw v0.8H, v0.8H, v18.8B + uaddw v1.8H, v1.8H, v19.8B + + sqxtun v0.8B, v0.8H + sqxtun v1.8B, v1.8H + + st1 {v0.S}[0], [x0], x2 + st1 {v1.S}[1], [x0], x2 + st1 {v0.S}[1], [x0], x2 + st1 {v1.S}[0], [x0], x2 + + sub x1, x1, #32 + ret +endfunc + +function ff_h264_idct_dc_add_neon, export=1 + sxtw x2, w2 + mov w3, #0 + ld1r {v2.8H}, [x1] + strh w3, [x1] + srshr v2.8H, v2.8H, #6 + ld1 {v0.S}[0], [x0], x2 + ld1 {v0.S}[1], [x0], x2 + uaddw v3.8H, v2.8H, v0.8B + ld1 {v1.S}[0], [x0], x2 + ld1 {v1.S}[1], [x0], x2 + uaddw v4.8H, v2.8H, v1.8B + sqxtun v0.8B, v3.8H + sqxtun v1.8B, v4.8H + sub x0, x0, x2, lsl #2 + st1 {v0.S}[0], [x0], x2 + st1 {v0.S}[1], [x0], x2 + st1 {v1.S}[0], [x0], x2 + st1 {v1.S}[1], [x0], x2 + ret +endfunc + +function ff_h264_idct_add16_neon, export=1 + mov x12, x30 + mov x6, x0 // dest + mov x5, x1 // block_offset + mov x1, x2 // block + mov w9, w3 // stride + movrel x7, scan8 + mov x10, #16 + movrel x13, ff_h264_idct_dc_add_neon + movrel x14, ff_h264_idct_add_neon +1: mov w2, w9 + ldrb w3, [x7], #1 + ldrsw x0, [x5], #4 + ldrb w3, [x4, w3, uxtw] + subs w3, w3, #1 + b.lt 2f + ldrsh w3, [x1] + add x0, x0, x6 + ccmp w3, #0, #4, eq + csel x15, x13, x14, ne + blr x15 +2: subs x10, x10, #1 + add x1, x1, #32 + b.ne 1b + ret x12 +endfunc + +function ff_h264_idct_add16intra_neon, export=1 + mov x12, x30 + mov x6, x0 // dest + mov x5, x1 // block_offset + mov x1, x2 // block + mov w9, w3 // stride + movrel x7, scan8 + mov x10, #16 + movrel x13, ff_h264_idct_dc_add_neon + movrel x14, ff_h264_idct_add_neon +1: mov w2, w9 + ldrb w3, [x7], #1 + ldrsw x0, [x5], #4 + ldrb w3, [x4, w3, uxtw] + add x0, x0, x6 + cmp w3, #0 + ldrsh w3, [x1] + csel x15, x13, x14, eq + ccmp w3, #0, #0, eq + b.eq 2f + blr x15 +2: subs x10, x10, #1 + add x1, x1, #32 + b.ne 1b + ret x12 +endfunc + +function ff_h264_idct_add8_neon, export=1 + sub sp, sp, #0x40 + stp x19, x20, [sp] + mov x12, x30 + ldp x6, x15, [x0] // dest[0], dest[1] + add x5, x1, #16*4 // block_offset + add x9, x2, #16*32 // block + mov w19, w3 // stride + movrel x13, ff_h264_idct_dc_add_neon + movrel x14, ff_h264_idct_add_neon + movrel x7, scan8+16 + mov x10, #0 + mov x11, #16 +1: mov w2, w19 + ldrb w3, [x7, x10] // scan8[i] + ldrsw x0, [x5, x10, lsl #2] // block_offset[i] + ldrb w3, [x4, w3, uxtw] // nnzc[ scan8[i] ] + add x0, x0, x6 // block_offset[i] + dst[j-1] + add x1, x9, x10, lsl #5 // block + i * 16 + cmp w3, #0 + ldrsh w3, [x1] // block[i*16] + csel x20, x13, x14, eq + ccmp w3, #0, #0, eq + b.eq 2f + blr x20 +2: add x10, x10, #1 + cmp x10, #4 + csel x10, x11, x10, eq // mov x10, #16 + csel x6, x15, x6, eq + cmp x10, #20 + b.lt 1b + ldp x19, x20, [sp] + add sp, sp, #0x40 + ret x12 +endfunc + +.macro idct8x8_cols pass + .if \pass == 0 + va .req v18 + vb .req v30 + sshr v18.8H, v26.8H, #1 + add v16.8H, v24.8H, v28.8H + ld1 {v30.8H, v31.8H}, [x1] + st1 {v19.8H}, [x1], #16 + st1 {v19.8H}, [x1], #16 + sub v17.8H, v24.8H, v28.8H + sshr v19.8H, v30.8H, #1 + sub v18.8H, v18.8H, v30.8H + add v19.8H, v19.8H, v26.8H + .else + va .req v30 + vb .req v18 + sshr v30.8H, v26.8H, #1 + sshr v19.8H, v18.8H, #1 + add v16.8H, v24.8H, v28.8H + sub v17.8H, v24.8H, v28.8H + sub v30.8H, v30.8H, v18.8H + add v19.8H, v19.8H, v26.8H + .endif + add v26.8H, v17.8H, va.8H + sub v28.8H, v17.8H, va.8H + add v24.8H, v16.8H, v19.8H + sub vb.8H, v16.8H, v19.8H + sub v16.8H, v29.8H, v27.8H + add v17.8H, v31.8H, v25.8H + sub va.8H, v31.8H, v25.8H + add v19.8H, v29.8H, v27.8H + sub v16.8H, v16.8H, v31.8H + sub v17.8H, v17.8H, v27.8H + add va.8H, va.8H, v29.8H + add v19.8H, v19.8H, v25.8H + sshr v25.8H, v25.8H, #1 + sshr v27.8H, v27.8H, #1 + sshr v29.8H, v29.8H, #1 + sshr v31.8H, v31.8H, #1 + sub v16.8H, v16.8H, v31.8H + sub v17.8H, v17.8H, v27.8H + add va.8H, va.8H, v29.8H + add v19.8H, v19.8H, v25.8H + sshr v25.8H, v16.8H, #2 + sshr v27.8H, v17.8H, #2 + sshr v29.8H, va.8H, #2 + sshr v31.8H, v19.8H, #2 + sub v19.8H, v19.8H, v25.8H + sub va.8H, v27.8H, va.8H + add v17.8H, v17.8H, v29.8H + add v16.8H, v16.8H, v31.8H + .if \pass == 0 + sub v31.8H, v24.8H, v19.8H + add v24.8H, v24.8H, v19.8H + add v25.8H, v26.8H, v18.8H + sub v18.8H, v26.8H, v18.8H + add v26.8H, v28.8H, v17.8H + add v27.8H, v30.8H, v16.8H + sub v29.8H, v28.8H, v17.8H + sub v28.8H, v30.8H, v16.8H + .else + sub v31.8H, v24.8H, v19.8H + add v24.8H, v24.8H, v19.8H + add v25.8H, v26.8H, v30.8H + sub v30.8H, v26.8H, v30.8H + add v26.8H, v28.8H, v17.8H + sub v29.8H, v28.8H, v17.8H + add v27.8H, v18.8H, v16.8H + sub v28.8H, v18.8H, v16.8H + .endif + .unreq va + .unreq vb +.endm + +function ff_h264_idct8_add_neon, export=1 + movi v19.8H, #0 + ld1 {v24.8H, v25.8H}, [x1] + st1 {v19.8H}, [x1], #16 + st1 {v19.8H}, [x1], #16 + ld1 {v26.8H, v27.8H}, [x1] + st1 {v19.8H}, [x1], #16 + st1 {v19.8H}, [x1], #16 + ld1 {v28.8H, v29.8H}, [x1] + st1 {v19.8H}, [x1], #16 + st1 {v19.8H}, [x1], #16 + + idct8x8_cols 0 + transpose_8x8H v24, v25, v26, v27, v28, v29, v18, v31, v6, v7 + idct8x8_cols 1 + + mov x3, x0 + srshr v24.8H, v24.8H, #6 + ld1 {v0.8B}, [x0], x2 + srshr v25.8H, v25.8H, #6 + ld1 {v1.8B}, [x0], x2 + srshr v26.8H, v26.8H, #6 + ld1 {v2.8B}, [x0], x2 + srshr v27.8H, v27.8H, #6 + ld1 {v3.8B}, [x0], x2 + srshr v28.8H, v28.8H, #6 + ld1 {v4.8B}, [x0], x2 + srshr v29.8H, v29.8H, #6 + ld1 {v5.8B}, [x0], x2 + srshr v30.8H, v30.8H, #6 + ld1 {v6.8B}, [x0], x2 + srshr v31.8H, v31.8H, #6 + ld1 {v7.8B}, [x0], x2 + uaddw v24.8H, v24.8H, v0.8B + uaddw v25.8H, v25.8H, v1.8B + uaddw v26.8H, v26.8H, v2.8B + sqxtun v0.8B, v24.8H + uaddw v27.8H, v27.8H, v3.8B + sqxtun v1.8B, v25.8H + uaddw v28.8H, v28.8H, v4.8B + sqxtun v2.8B, v26.8H + st1 {v0.8B}, [x3], x2 + uaddw v29.8H, v29.8H, v5.8B + sqxtun v3.8B, v27.8H + st1 {v1.8B}, [x3], x2 + uaddw v30.8H, v30.8H, v6.8B + sqxtun v4.8B, v28.8H + st1 {v2.8B}, [x3], x2 + uaddw v31.8H, v31.8H, v7.8B + sqxtun v5.8B, v29.8H + st1 {v3.8B}, [x3], x2 + sqxtun v6.8B, v30.8H + sqxtun v7.8B, v31.8H + st1 {v4.8B}, [x3], x2 + st1 {v5.8B}, [x3], x2 + st1 {v6.8B}, [x3], x2 + st1 {v7.8B}, [x3], x2 + + sub x1, x1, #128 + ret +endfunc + +function ff_h264_idct8_dc_add_neon, export=1 + mov w3, #0 + sxtw x2, w2 + ld1r {v31.8H}, [x1] + strh w3, [x1] + ld1 {v0.8B}, [x0], x2 + srshr v31.8H, v31.8H, #6 + ld1 {v1.8B}, [x0], x2 + ld1 {v2.8B}, [x0], x2 + uaddw v24.8H, v31.8H, v0.8B + ld1 {v3.8B}, [x0], x2 + uaddw v25.8H, v31.8H, v1.8B + ld1 {v4.8B}, [x0], x2 + uaddw v26.8H, v31.8H, v2.8B + ld1 {v5.8B}, [x0], x2 + uaddw v27.8H, v31.8H, v3.8B + ld1 {v6.8B}, [x0], x2 + uaddw v28.8H, v31.8H, v4.8B + ld1 {v7.8B}, [x0], x2 + uaddw v29.8H, v31.8H, v5.8B + uaddw v30.8H, v31.8H, v6.8B + uaddw v31.8H, v31.8H, v7.8B + sqxtun v0.8B, v24.8H + sqxtun v1.8B, v25.8H + sqxtun v2.8B, v26.8H + sqxtun v3.8B, v27.8H + sub x0, x0, x2, lsl #3 + st1 {v0.8B}, [x0], x2 + sqxtun v4.8B, v28.8H + st1 {v1.8B}, [x0], x2 + sqxtun v5.8B, v29.8H + st1 {v2.8B}, [x0], x2 + sqxtun v6.8B, v30.8H + st1 {v3.8B}, [x0], x2 + sqxtun v7.8B, v31.8H + st1 {v4.8B}, [x0], x2 + st1 {v5.8B}, [x0], x2 + st1 {v6.8B}, [x0], x2 + st1 {v7.8B}, [x0], x2 + ret +endfunc + +function ff_h264_idct8_add4_neon, export=1 + mov x12, x30 + mov x6, x0 + mov x5, x1 + mov x1, x2 + mov w2, w3 + movrel x7, scan8 + mov w10, #16 + movrel x13, ff_h264_idct8_dc_add_neon + movrel x14, ff_h264_idct8_add_neon +1: ldrb w9, [x7], #4 + ldrsw x0, [x5], #16 + ldrb w9, [x4, w9, UXTW] + subs w9, w9, #1 + b.lt 2f + ldrsh w11, [x1] + add x0, x6, x0 + ccmp w11, #0, #4, eq + csel x15, x13, x14, ne + blr x15 +2: subs w10, w10, #4 + add x1, x1, #128 + b.ne 1b + ret x12 +endfunc + +const scan8 + .byte 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8 + .byte 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8 + .byte 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8 + .byte 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8 + .byte 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8 + .byte 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8 + .byte 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8 + .byte 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8 + .byte 4+11*8, 5+11*8, 4+12*8, 5+12*8 + .byte 6+11*8, 7+11*8, 6+12*8, 7+12*8 + .byte 4+13*8, 5+13*8, 4+14*8, 5+14*8 + .byte 6+13*8, 7+13*8, 6+14*8, 7+14*8 +endconst diff --git a/libavcodec/aarch64/neon.S b/libavcodec/aarch64/neon.S new file mode 100644 index 0000000000..3af9bcdabd --- /dev/null +++ b/libavcodec/aarch64/neon.S @@ -0,0 +1,61 @@ +/* + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +.macro transpose_4x4H r0, r1, r2, r3, r4, r5, r6, r7 + trn1 \r4\().4H, \r0\().4H, \r1\().4H + trn2 \r5\().4H, \r0\().4H, \r1\().4H + trn1 \r7\().4H, \r3\().4H, \r2\().4H + trn2 \r6\().4H, \r3\().4H, \r2\().4H + trn1 \r0\().2S, \r4\().2S, \r7\().2S + trn2 \r3\().2S, \r4\().2S, \r7\().2S + trn1 \r1\().2S, \r5\().2S, \r6\().2S + trn2 \r2\().2S, \r5\().2S, \r6\().2S +.endm + +.macro transpose_8x8H r0, r1, r2, r3, r4, r5, r6, r7, r8, r9 + trn1 \r8\().8H, \r0\().8H, \r1\().8H + trn2 \r9\().8H, \r0\().8H, \r1\().8H + trn1 \r1\().8H, \r2\().8H, \r3\().8H + trn2 \r3\().8H, \r2\().8H, \r3\().8H + trn1 \r0\().8H, \r4\().8H, \r5\().8H + trn2 \r5\().8H, \r4\().8H, \r5\().8H + trn1 \r2\().8H, \r6\().8H, \r7\().8H + trn2 \r7\().8H, \r6\().8H, \r7\().8H + + trn1 \r4\().4S, \r0\().4S, \r2\().4S + trn2 \r2\().4S, \r0\().4S, \r2\().4S + trn1 \r6\().4S, \r5\().4S, \r7\().4S + trn2 \r7\().4S, \r5\().4S, \r7\().4S + trn1 \r5\().4S, \r9\().4S, \r3\().4S + trn2 \r9\().4S, \r9\().4S, \r3\().4S + trn1 \r3\().4S, \r8\().4S, \r1\().4S + trn2 \r8\().4S, \r8\().4S, \r1\().4S + + trn1 \r0\().2D, \r3\().2D, \r4\().2D + trn2 \r4\().2D, \r3\().2D, \r4\().2D + + trn1 \r1\().2D, \r5\().2D, \r6\().2D + trn2 \r5\().2D, \r5\().2D, \r6\().2D + + trn2 \r6\().2D, \r8\().2D, \r2\().2D + trn1 \r2\().2D, \r8\().2D, \r2\().2D + + trn1 \r3\().2D, \r9\().2D, \r7\().2D + trn2 \r7\().2D, \r9\().2D, \r7\().2D + +.endm -- cgit v1.2.3