Diffstat (limited to 'libavcodec/aarch64/h264idct_neon.S')
-rw-r--r--  libavcodec/aarch64/h264idct_neon.S  408
1 file changed, 408 insertions, 0 deletions
diff --git a/libavcodec/aarch64/h264idct_neon.S b/libavcodec/aarch64/h264idct_neon.S
new file mode 100644
index 0000000000..1c90c4c7ec
--- /dev/null
+++ b/libavcodec/aarch64/h264idct_neon.S
@@ -0,0 +1,408 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
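+// IDCT of a single 4x4 block added to dst: x0 = dst, x1 = block of 16
+// int16 coefficients, w2 = stride. The coefficient block is zeroed as
+// it is read.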
+function ff_h264_idct_add_neon, export=1
+ ld1 {v0.4H, v1.4H, v2.4H, v3.4H}, [x1]
+ sxtw x2, w2
+ movi v30.8H, #0
+
+ add v4.4H, v0.4H, v2.4H
+ sshr v16.4H, v1.4H, #1
+ st1 {v30.8H}, [x1], #16
+ sshr v17.4H, v3.4H, #1
+ st1 {v30.8H}, [x1], #16
+ sub v5.4H, v0.4H, v2.4H
+ add v6.4H, v1.4H, v17.4H
+ sub v7.4H, v16.4H, v3.4H
+ add v0.4H, v4.4H, v6.4H
+ add v1.4H, v5.4H, v7.4H
+ sub v2.4H, v4.4H, v6.4H
+ sub v3.4H, v5.4H, v7.4H
+
+ transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7
+
+ add v4.4H, v0.4H, v3.4H
+ ld1 {v18.S}[0], [x0], x2
+ sshr v16.4H, v2.4H, #1
+ sshr v17.4H, v1.4H, #1
+ ld1 {v19.S}[1], [x0], x2
+ sub v5.4H, v0.4H, v3.4H
+ ld1 {v18.S}[1], [x0], x2
+ add v6.4H, v16.4H, v1.4H
+ ins v4.D[1], v5.D[0]
+ sub v7.4H, v2.4H, v17.4H
+ ld1 {v19.S}[0], [x0], x2
+ ins v6.D[1], v7.D[0]
+ sub x0, x0, x2, lsl #2
+ add v0.8H, v4.8H, v6.8H
+ sub v1.8H, v4.8H, v6.8H
+
+ srshr v0.8H, v0.8H, #6
+ srshr v1.8H, v1.8H, #6
+
+ uaddw v0.8H, v0.8H, v18.8B
+ uaddw v1.8H, v1.8H, v19.8B
+
+ sqxtun v0.8B, v0.8H
+ sqxtun v1.8B, v1.8H
+
+ st1 {v0.S}[0], [x0], x2
+ st1 {v1.S}[1], [x0], x2
+ st1 {v0.S}[1], [x0], x2
+ st1 {v1.S}[0], [x0], x2
+
+ sub x1, x1, #32
+ ret
+endfunc
+
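+// DC-only variant: block[0] holds the single coefficient, which is
+// rounded (srshr #6), added to all 16 pixels and cleared in memory.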
+function ff_h264_idct_dc_add_neon, export=1
+ sxtw x2, w2
+ mov w3, #0
+ ld1r {v2.8H}, [x1]
+ strh w3, [x1]
+ srshr v2.8H, v2.8H, #6
+ ld1 {v0.S}[0], [x0], x2
+ ld1 {v0.S}[1], [x0], x2
+ uaddw v3.8H, v2.8H, v0.8B
+ ld1 {v1.S}[0], [x0], x2
+ ld1 {v1.S}[1], [x0], x2
+ uaddw v4.8H, v2.8H, v1.8B
+ sqxtun v0.8B, v3.8H
+ sqxtun v1.8B, v4.8H
+ sub x0, x0, x2, lsl #2
+ st1 {v0.S}[0], [x0], x2
+ st1 {v0.S}[1], [x0], x2
+ st1 {v1.S}[0], [x0], x2
+ st1 {v1.S}[1], [x0], x2
+ ret
+endfunc
+
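+// Adds the 16 luma 4x4 blocks of a macroblock: for each block with
+// nnzc[scan8[i]] != 0, calls (blr) either the full IDCT above or, when
+// only the DC coefficient is set, the DC-only variant.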
+function ff_h264_idct_add16_neon, export=1
+ mov x12, x30
+ mov x6, x0 // dest
+ mov x5, x1 // block_offset
+ mov x1, x2 // block
+ mov w9, w3 // stride
+ movrel x7, scan8
+ mov x10, #16
+ movrel x13, ff_h264_idct_dc_add_neon
+ movrel x14, ff_h264_idct_add_neon
+1: mov w2, w9
+ ldrb w3, [x7], #1
+ ldrsw x0, [x5], #4
+ ldrb w3, [x4, w3, uxtw]
+ subs w3, w3, #1
+ b.lt 2f
+ ldrsh w3, [x1]
+ add x0, x0, x6
+ ccmp w3, #0, #4, eq
+ csel x15, x13, x14, ne
+ blr x15
+2: subs x10, x10, #1
+ add x1, x1, #32
+ b.ne 1b
+ ret x12
+endfunc
+
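+// Intra variant: when nnzc[scan8[i]] is zero the DC coefficient may
+// still be non-zero, so that case takes the DC-only path; non-zero
+// nnzc takes the full IDCT, and empty blocks are skipped.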
+function ff_h264_idct_add16intra_neon, export=1
+ mov x12, x30
+ mov x6, x0 // dest
+ mov x5, x1 // block_offset
+ mov x1, x2 // block
+ mov w9, w3 // stride
+ movrel x7, scan8
+ mov x10, #16
+ movrel x13, ff_h264_idct_dc_add_neon
+ movrel x14, ff_h264_idct_add_neon
+1: mov w2, w9
+ ldrb w3, [x7], #1
+ ldrsw x0, [x5], #4
+ ldrb w3, [x4, w3, uxtw]
+ add x0, x0, x6
+ cmp w3, #0
+ ldrsh w3, [x1]
+ csel x15, x13, x14, eq
+ ccmp w3, #0, #0, eq
+ b.eq 2f
+ blr x15
+2: subs x10, x10, #1
+ add x1, x1, #32
+ b.ne 1b
+ ret x12
+endfunc
+
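+// Chroma variant: x0 points to { dest[0], dest[1] }; processes the
+// four 4x4 blocks of each chroma plane (block indices 16-19 and
+// 32-35) with the same nnzc-based dispatch as the luma functions.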
+function ff_h264_idct_add8_neon, export=1
+ sub sp, sp, #0x40
+ stp x19, x20, [sp]
+ mov x12, x30
+ ldp x6, x15, [x0] // dest[0], dest[1]
+ add x5, x1, #16*4 // block_offset
+ add x9, x2, #16*32 // block
+ mov w19, w3 // stride
+ movrel x13, ff_h264_idct_dc_add_neon
+ movrel x14, ff_h264_idct_add_neon
+ movrel x7, scan8+16
+ mov x10, #0
+ mov x11, #16
+1: mov w2, w19
+ ldrb w3, [x7, x10] // scan8[i]
+ ldrsw x0, [x5, x10, lsl #2] // block_offset[i]
+ ldrb w3, [x4, w3, uxtw] // nnzc[ scan8[i] ]
+ add x0, x0, x6 // block_offset[i] + dst[j-1]
+ add x1, x9, x10, lsl #5 // block + i * 16
+ cmp w3, #0
+ ldrsh w3, [x1] // block[i*16]
+ csel x20, x13, x14, eq
+ ccmp w3, #0, #0, eq
+ b.eq 2f
+ blr x20
+2: add x10, x10, #1
+ cmp x10, #4
+ csel x10, x11, x10, eq // mov x10, #16
+ csel x6, x15, x6, eq
+ cmp x10, #20
+ b.lt 1b
+ ldp x19, x20, [sp]
+ add sp, sp, #0x40
+ ret x12
+endfunc
+
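+// One pass (columns or rows) of the 8x8 transform on v24-v31; pass 0
+// also loads the last two coefficient rows (v30/v31) and stores zeros
+// over them, completing the clearing started by the caller.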
+.macro idct8x8_cols pass
+ .if \pass == 0
+ va .req v18
+ vb .req v30
+ sshr v18.8H, v26.8H, #1
+ add v16.8H, v24.8H, v28.8H
+ ld1 {v30.8H, v31.8H}, [x1]
+ st1 {v19.8H}, [x1], #16
+ st1 {v19.8H}, [x1], #16
+ sub v17.8H, v24.8H, v28.8H
+ sshr v19.8H, v30.8H, #1
+ sub v18.8H, v18.8H, v30.8H
+ add v19.8H, v19.8H, v26.8H
+ .else
+ va .req v30
+ vb .req v18
+ sshr v30.8H, v26.8H, #1
+ sshr v19.8H, v18.8H, #1
+ add v16.8H, v24.8H, v28.8H
+ sub v17.8H, v24.8H, v28.8H
+ sub v30.8H, v30.8H, v18.8H
+ add v19.8H, v19.8H, v26.8H
+ .endif
+ add v26.8H, v17.8H, va.8H
+ sub v28.8H, v17.8H, va.8H
+ add v24.8H, v16.8H, v19.8H
+ sub vb.8H, v16.8H, v19.8H
+ sub v16.8H, v29.8H, v27.8H
+ add v17.8H, v31.8H, v25.8H
+ sub va.8H, v31.8H, v25.8H
+ add v19.8H, v29.8H, v27.8H
+ sub v16.8H, v16.8H, v31.8H
+ sub v17.8H, v17.8H, v27.8H
+ add va.8H, va.8H, v29.8H
+ add v19.8H, v19.8H, v25.8H
+ sshr v25.8H, v25.8H, #1
+ sshr v27.8H, v27.8H, #1
+ sshr v29.8H, v29.8H, #1
+ sshr v31.8H, v31.8H, #1
+ sub v16.8H, v16.8H, v31.8H
+ sub v17.8H, v17.8H, v27.8H
+ add va.8H, va.8H, v29.8H
+ add v19.8H, v19.8H, v25.8H
+ sshr v25.8H, v16.8H, #2
+ sshr v27.8H, v17.8H, #2
+ sshr v29.8H, va.8H, #2
+ sshr v31.8H, v19.8H, #2
+ sub v19.8H, v19.8H, v25.8H
+ sub va.8H, v27.8H, va.8H
+ add v17.8H, v17.8H, v29.8H
+ add v16.8H, v16.8H, v31.8H
+ .if \pass == 0
+ sub v31.8H, v24.8H, v19.8H
+ add v24.8H, v24.8H, v19.8H
+ add v25.8H, v26.8H, v18.8H
+ sub v18.8H, v26.8H, v18.8H
+ add v26.8H, v28.8H, v17.8H
+ add v27.8H, v30.8H, v16.8H
+ sub v29.8H, v28.8H, v17.8H
+ sub v28.8H, v30.8H, v16.8H
+ .else
+ sub v31.8H, v24.8H, v19.8H
+ add v24.8H, v24.8H, v19.8H
+ add v25.8H, v26.8H, v30.8H
+ sub v30.8H, v26.8H, v30.8H
+ add v26.8H, v28.8H, v17.8H
+ sub v29.8H, v28.8H, v17.8H
+ add v27.8H, v18.8H, v16.8H
+ sub v28.8H, v18.8H, v16.8H
+ .endif
+ .unreq va
+ .unreq vb
+.endm
+
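+// IDCT of a single 8x8 block added to dst: x0 = dst, x1 = block of 64
+// int16 coefficients, x2 = stride. Two transform passes with a
+// transpose in between, then round, add to dst and clamp.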
+function ff_h264_idct8_add_neon, export=1
+ movi v19.8H, #0
+ ld1 {v24.8H, v25.8H}, [x1]
+ st1 {v19.8H}, [x1], #16
+ st1 {v19.8H}, [x1], #16
+ ld1 {v26.8H, v27.8H}, [x1]
+ st1 {v19.8H}, [x1], #16
+ st1 {v19.8H}, [x1], #16
+ ld1 {v28.8H, v29.8H}, [x1]
+ st1 {v19.8H}, [x1], #16
+ st1 {v19.8H}, [x1], #16
+
+ idct8x8_cols 0
+ transpose_8x8H v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
+ idct8x8_cols 1
+
+ mov x3, x0
+ srshr v24.8H, v24.8H, #6
+ ld1 {v0.8B}, [x0], x2
+ srshr v25.8H, v25.8H, #6
+ ld1 {v1.8B}, [x0], x2
+ srshr v26.8H, v26.8H, #6
+ ld1 {v2.8B}, [x0], x2
+ srshr v27.8H, v27.8H, #6
+ ld1 {v3.8B}, [x0], x2
+ srshr v28.8H, v28.8H, #6
+ ld1 {v4.8B}, [x0], x2
+ srshr v29.8H, v29.8H, #6
+ ld1 {v5.8B}, [x0], x2
+ srshr v30.8H, v30.8H, #6
+ ld1 {v6.8B}, [x0], x2
+ srshr v31.8H, v31.8H, #6
+ ld1 {v7.8B}, [x0], x2
+ uaddw v24.8H, v24.8H, v0.8B
+ uaddw v25.8H, v25.8H, v1.8B
+ uaddw v26.8H, v26.8H, v2.8B
+ sqxtun v0.8B, v24.8H
+ uaddw v27.8H, v27.8H, v3.8B
+ sqxtun v1.8B, v25.8H
+ uaddw v28.8H, v28.8H, v4.8B
+ sqxtun v2.8B, v26.8H
+ st1 {v0.8B}, [x3], x2
+ uaddw v29.8H, v29.8H, v5.8B
+ sqxtun v3.8B, v27.8H
+ st1 {v1.8B}, [x3], x2
+ uaddw v30.8H, v30.8H, v6.8B
+ sqxtun v4.8B, v28.8H
+ st1 {v2.8B}, [x3], x2
+ uaddw v31.8H, v31.8H, v7.8B
+ sqxtun v5.8B, v29.8H
+ st1 {v3.8B}, [x3], x2
+ sqxtun v6.8B, v30.8H
+ sqxtun v7.8B, v31.8H
+ st1 {v4.8B}, [x3], x2
+ st1 {v5.8B}, [x3], x2
+ st1 {v6.8B}, [x3], x2
+ st1 {v7.8B}, [x3], x2
+
+ sub x1, x1, #128
+ ret
+endfunc
+
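+// DC-only 8x8 variant: the single coefficient is replicated, rounded
+// and added to all 64 pixels of the block.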
+function ff_h264_idct8_dc_add_neon, export=1
+ mov w3, #0
+ sxtw x2, w2
+ ld1r {v31.8H}, [x1]
+ strh w3, [x1]
+ ld1 {v0.8B}, [x0], x2
+ srshr v31.8H, v31.8H, #6
+ ld1 {v1.8B}, [x0], x2
+ ld1 {v2.8B}, [x0], x2
+ uaddw v24.8H, v31.8H, v0.8B
+ ld1 {v3.8B}, [x0], x2
+ uaddw v25.8H, v31.8H, v1.8B
+ ld1 {v4.8B}, [x0], x2
+ uaddw v26.8H, v31.8H, v2.8B
+ ld1 {v5.8B}, [x0], x2
+ uaddw v27.8H, v31.8H, v3.8B
+ ld1 {v6.8B}, [x0], x2
+ uaddw v28.8H, v31.8H, v4.8B
+ ld1 {v7.8B}, [x0], x2
+ uaddw v29.8H, v31.8H, v5.8B
+ uaddw v30.8H, v31.8H, v6.8B
+ uaddw v31.8H, v31.8H, v7.8B
+ sqxtun v0.8B, v24.8H
+ sqxtun v1.8B, v25.8H
+ sqxtun v2.8B, v26.8H
+ sqxtun v3.8B, v27.8H
+ sub x0, x0, x2, lsl #3
+ st1 {v0.8B}, [x0], x2
+ sqxtun v4.8B, v28.8H
+ st1 {v1.8B}, [x0], x2
+ sqxtun v5.8B, v29.8H
+ st1 {v2.8B}, [x0], x2
+ sqxtun v6.8B, v30.8H
+ st1 {v3.8B}, [x0], x2
+ sqxtun v7.8B, v31.8H
+ st1 {v4.8B}, [x0], x2
+ st1 {v5.8B}, [x0], x2
+ st1 {v6.8B}, [x0], x2
+ st1 {v7.8B}, [x0], x2
+ ret
+endfunc
+
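+// Adds the four 8x8 luma blocks of a macroblock, stepping through
+// scan8 in strides of 4 and dispatching per block to the full 8x8
+// IDCT or its DC-only variant.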
+function ff_h264_idct8_add4_neon, export=1
+ mov x12, x30
+ mov x6, x0
+ mov x5, x1
+ mov x1, x2
+ mov w2, w3
+ movrel x7, scan8
+ mov w10, #16
+ movrel x13, ff_h264_idct8_dc_add_neon
+ movrel x14, ff_h264_idct8_add_neon
+1: ldrb w9, [x7], #4
+ ldrsw x0, [x5], #16
+ ldrb w9, [x4, w9, uxtw]
+ subs w9, w9, #1
+ b.lt 2f
+ ldrsh w11, [x1]
+ add x0, x6, x0
+ ccmp w11, #0, #4, eq
+ csel x15, x13, x14, ne
+ blr x15
+2: subs w10, w10, #4
+ add x1, x1, #128
+ b.ne 1b
+ ret x12
+endfunc
+
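+// scan8 table as in the C decoder: maps a block index to its position
+// in the non_zero_count cache (8 entries per row).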
+const scan8
+ .byte 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
+ .byte 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
+ .byte 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
+ .byte 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
+ .byte 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
+ .byte 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
+ .byte 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
+ .byte 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
+ .byte 4+11*8, 5+11*8, 4+12*8, 5+12*8
+ .byte 6+11*8, 7+11*8, 6+12*8, 7+12*8
+ .byte 4+13*8, 5+13*8, 4+14*8, 5+14*8
+ .byte 6+13*8, 7+13*8, 6+14*8, 7+14*8
+endconst