From dfe224f377be3e45758c69d881ca7874b82d647a Mon Sep 17 00:00:00 2001
From: Janne Grunau <janne-libav@jannau.net>
Date: Mon, 13 Jan 2014 01:06:20 +0100
Subject: aarch64: get_cabac inline asm

Based on the x86 branchless get_cabac asm. get_cabac_noinline() gets
approximately 20% faster (no cycle counts available) compared to the code
generated by clang from Xcode 5.1 beta5. The overall speedup is more than 6%.
A part of the overall speedup might be explained by additional inlining of
get_cabac().
---
 libavcodec/aarch64/cabac.h   | 104 +++++++++++++++++++++++++++++++++++++++++++
 libavcodec/cabac.h           |   1 +
 libavcodec/cabac_functions.h |   4 +-
 3 files changed, 108 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/aarch64/cabac.h

diff --git a/libavcodec/aarch64/cabac.h b/libavcodec/aarch64/cabac.h
new file mode 100644
index 0000000000..e12953e86c
--- /dev/null
+++ b/libavcodec/aarch64/cabac.h
@@ -0,0 +1,104 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AARCH64_CABAC_H
+#define AVCODEC_AARCH64_CABAC_H
+
+#include "config.h"
+#if HAVE_INLINE_ASM
+
+#include "libavutil/attributes.h"
+#include "libavutil/internal.h"
+#include "libavcodec/cabac.h"
+
+#define get_cabac_inline get_cabac_inline_aarch64 /* get_cabac_inline now names the AArch64 asm version below */
+static av_always_inline int get_cabac_inline_aarch64(CABACContext *c,
+                                                     uint8_t *const state)
+{ /* AArch64 asm get_cabac: updates c->low, c->range and *state, may advance c->bytestream; returns the low bit of the (conditionally inverted) state byte. */
+    int bit;
+    void *reg_a, *reg_b, *reg_c, *tmp; /* asm scratch; accessed both as 64-bit pointers and as 32-bit (%w) integers */
+
+    __asm__ volatile(
+        "ldrb       %w[bit]       , [%[state]]                  \n\t" /* bit = *state */
+        "add        %[r_b]        , %[tables]   , %[lps_off]    \n\t" /* r_b = tables + lps_off */
+        "mov        %w[tmp]       , %w[range]                   \n\t" /* tmp = incoming range */
+        "and        %w[range]     , %w[range]   , #0xC0         \n\t" /* range &= 0xC0 */
+        "lsl        %w[r_c]       , %w[range]   , #1            \n\t" /* r_c = 2 * (range & 0xC0) */
+        "add        %[r_b]        , %[r_b]      , %w[bit], UXTW \n\t" /* r_b += bit (zero-extended) */
+        "ldrb       %w[range]     , [%[r_b], %w[r_c], SXTW]     \n\t" /* range = tables[lps_off + bit + r_c] */
+        "sub        %w[r_c]       , %w[tmp]     , %w[range]     \n\t" /* r_c = incoming range - looked-up byte */
+        "lsl        %w[tmp]       , %w[r_c]     , #17           \n\t" /* tmp = r_c << 17 */
+        "cmp        %w[tmp]       , %w[low]                     \n\t" /* flags from tmp - low; used by the next three instructions */
+        "csel       %w[tmp]       , %w[tmp]     , wzr      , cc \n\t" /* tmp = (tmp < low, unsigned) ? tmp : 0 */
+        "csel       %w[range]     , %w[r_c]     , %w[range], gt \n\t" /* range = (tmp > low, signed) ? r_c : looked-up byte */
+        "cinv       %w[bit]       , %w[bit]     , cc            \n\t" /* bit = ~bit if the cmp found tmp < low (unsigned) */
+        "sub        %w[low]       , %w[low]     , %w[tmp]       \n\t" /* low -= tmp (tmp is 0 when not selected above) */
+        "add        %[r_b]        , %[tables]   , %[norm_off]   \n\t" /* r_b = tables + norm_off */
+        "add        %[r_a]        , %[tables]   , %[mlps_off]   \n\t" /* r_a = tables + mlps_off (offset already includes +128) */
+        "ldrb       %w[tmp]       , [%[r_b], %w[range], SXTW]   \n\t" /* tmp = shift count tables[norm_off + range] */
+        "ldrb       %w[r_a]       , [%[r_a], %w[bit], SXTW]     \n\t" /* r_a = new state byte; index is sign-extended, so an inverted (negative) bit indexes below tables + mlps_off */
+        "lsl        %w[low]       , %w[low]     , %w[tmp]       \n\t" /* low <<= tmp */
+        "lsl        %w[range]     , %w[range]   , %w[tmp]       \n\t" /* range <<= tmp */
+        "uxth       %w[r_c]       , %w[low]                     \n\t" /* r_c = low & 0xFFFF */
+        "strb       %w[r_a]       , [%[state]]                  \n\t" /* *state = new state byte */
+        "cbnz       %w[r_c]       , 2f                          \n\t" /* low 16 bits of low nonzero: skip the bytestream read below */
+        "ldr        %[r_c]        , [%[c], %[byte]]             \n\t" /* r_c = c->bytestream */
+        "ldr        %[r_a]        , [%[c], %[end]]              \n\t" /* r_a = c->bytestream_end */
+        "ldrh       %w[tmp]       , [%[r_c]]                    \n\t" /* tmp = 16 bits at c->bytestream (loaded before the end check) */
+        "cmp        %[r_c]        , %[r_a]                      \n\t" /* compare bytestream with bytestream_end */
+        "b.ge       1f                                          \n\t" /* bytestream >= bytestream_end (signed compare): leave the pointer unchanged */
+        "add        %[r_a]        , %[r_c]      , #2            \n\t" /* r_a = bytestream + 2 */
+        "str        %[r_a]        , [%[c], %[byte]]             \n\t" /* c->bytestream = bytestream + 2 */
+        "1:                                                     \n\t"
+        "sub        %w[r_c]       , %w[low]     , #1            \n\t" /* r_c = low - 1 */
+        "eor        %w[r_c]       , %w[r_c]     , %w[low]       \n\t" /* r_c = (low - 1) ^ low */
+        "rev        %w[tmp]       , %w[tmp]                     \n\t" /* byte-reverse the 32-bit word holding the loaded halfword */
+        "lsr        %w[r_c]       , %w[r_c]     , #15           \n\t" /* r_c >>= 15 */
+        "lsr        %w[tmp]       , %w[tmp]     , #15           \n\t" /* tmp >>= 15: the two loaded bytes in swapped order, times 2 (presumably assumes a little-endian load; TODO confirm) */
+        "ldrb       %w[r_c]       , [%[r_b], %w[r_c], SXTW]     \n\t" /* r_c = tables[norm_off + r_c]; r_b still holds tables + norm_off */
+        "mov        %w[r_b]       , #0xFFFF                     \n\t" /* r_b = 0xFFFF */
+        "mov        %w[r_a]       , #7                          \n\t" /* r_a = 7 */
+        "sub        %w[tmp]       , %w[tmp]     , %w[r_b]       \n\t" /* tmp -= 0xFFFF */
+        "sub        %w[r_c]       , %w[r_a]     , %w[r_c]       \n\t" /* r_c = 7 - r_c */
+        "lsl        %w[tmp]       , %w[tmp]     , %w[r_c]       \n\t" /* tmp <<= r_c */
+        "add        %w[low]       , %w[low]     , %w[tmp]       \n\t" /* low += tmp */
+        "2:                                                     \n\t"
+        :    [bit]"=&r"(bit), /* all outputs are early-clobber (&): they are written before the inputs are fully consumed */
+             [low]"+&r"(c->low), /* read and written by the asm */
+           [range]"+&r"(c->range), /* read and written by the asm */
+             [r_a]"=&r"(reg_a),
+             [r_b]"=&r"(reg_b),
+             [r_c]"=&r"(reg_c),
+             [tmp]"=&r"(tmp)
+        :        [c]"r"(c),
+             [state]"r"(state),
+            [tables]"r"(ff_h264_cabac_tables),
+              [byte]"i"(offsetof(CABACContext, bytestream)), /* used as a load/store offset from c */
+               [end]"i"(offsetof(CABACContext, bytestream_end)), /* used as a load offset from c */
+          [norm_off]"I"(H264_NORM_SHIFT_OFFSET), /* "I": constant used as an add immediate */
+           [lps_off]"I"(H264_LPS_RANGE_OFFSET),
+          [mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128) /* +128 biases the base so negative indices can be used */
+        : "memory", "cc" /* the asm stores to *state and c->bytestream and changes the condition flags */
+        );
+
+    return bit & 1; /* only bit 0 of the (possibly inverted) state value is returned */
+}
+
+#endif /* HAVE_INLINE_ASM */
+
+#endif /* AVCODEC_AARCH64_CABAC_H */
diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h
index 7401bc96a9..426f338e34 100644
--- a/libavcodec/cabac.h
+++ b/libavcodec/cabac.h
@@ -31,6 +31,7 @@
 
 #include "put_bits.h"
 
+extern uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63]; /* one shared table; sub-tables start at the H264_*_OFFSET values below */
 #define H264_NORM_SHIFT_OFFSET 0
 #define H264_LPS_RANGE_OFFSET 512
 #define H264_MLPS_STATE_OFFSET 1024
diff --git a/libavcodec/cabac_functions.h b/libavcodec/cabac_functions.h
index 11c9646742..39f0afbb36 100644
--- a/libavcodec/cabac_functions.h
+++ b/libavcodec/cabac_functions.h
@@ -32,11 +32,13 @@
 #include "cabac.h"
 #include "config.h"
 
+#if ARCH_AARCH64
+#   include "aarch64/cabac.h"
+#endif
 #if ARCH_X86
 #   include "x86/cabac.h"
 #endif
 
-extern uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63];
 static uint8_t * const ff_h264_norm_shift = ff_h264_cabac_tables + H264_NORM_SHIFT_OFFSET;
 static uint8_t * const ff_h264_lps_range = ff_h264_cabac_tables + H264_LPS_RANGE_OFFSET;
 static uint8_t * const ff_h264_mlps_state = ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET;
-- 
cgit v1.2.3