dsputil: Move APE-specific bits into apedsp

author: Diego Biurrun <diego@biurrun.de> 2013-12-29 02:32:16 +0100
committer: Diego Biurrun <diego@biurrun.de> 2014-05-29 06:41:15 -0700
commit: 054013a0fc6f2b52c60cee3e051be8cc7f82cef3 (patch)
tree: 87098f4b0443359b7109066486c15fdaad09dddb /libavcodec/x86
parent: 256da0770e495176d1b2699ec6e9c7993c2a6d7b (diff)
5 files changed, 216 insertions, 150 deletions
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 8830a22a8f..10242269c2 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -25,6 +25,7 @@ OBJS-$(CONFIG_VP3DSP)                  += x86/vp3dsp_init.o
 OBJS-$(CONFIG_XMM_CLOBBER_TEST)        += x86/w64xmmtest.o
 
 OBJS-$(CONFIG_AAC_DECODER)             += x86/sbrdsp_init.o
+OBJS-$(CONFIG_APE_DECODER)             += x86/apedsp_init.o
 OBJS-$(CONFIG_CAVS_DECODER)            += x86/cavsdsp.o
 OBJS-$(CONFIG_DCA_DECODER)             += x86/dcadsp_init.o
 OBJS-$(CONFIG_DNXHD_ENCODER)           += x86/dnxhdenc_init.o
@@ -89,6 +90,7 @@ YASM-OBJS-$(CONFIG_VIDEODSP)           += x86/videodsp.o
 YASM-OBJS-$(CONFIG_VP3DSP)             += x86/vp3dsp.o
 
 YASM-OBJS-$(CONFIG_AAC_DECODER)        += x86/sbrdsp.o
+YASM-OBJS-$(CONFIG_APE_DECODER)        += x86/apedsp.o
 YASM-OBJS-$(CONFIG_DCA_DECODER)        += x86/dcadsp.o
 YASM-OBJS-$(CONFIG_PNG_DECODER)        += x86/pngdsp.o
 YASM-OBJS-$(CONFIG_PRORES_DECODER)     += x86/proresdsp.o
diff --git a/libavcodec/x86/apedsp.asm b/libavcodec/x86/apedsp.asm
new file mode 100644
index 0000000000..d721ebda6b
--- /dev/null
+++ b/libavcodec/x86/apedsp.asm
@@ -0,0 +1,167 @@
+;******************************************************************************
+;* Copyright (c) 2008 Loren Merritt
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_TEXT
+
+%macro SCALARPRODUCT 0
+; int32_t ff_scalarproduct_and_madd_int16(int16_t *v1, const int16_t *v2,
+;                                         const int16_t *v3, int order, int mul)
+cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
+    shl orderq, 1                       ; orderq *= 2 (sizeof(int16_t)); offsets below are in bytes
+    movd    m7, mulm                    ; m7 = mul in the low dword
+%if mmsize == 16
+    pshuflw m7, m7, 0                   ; copy the low word of mul into the low four words
+    punpcklqdq m7, m7                   ; ... and duplicate those into the high four words
+%else
+    pshufw  m7, m7, 0                   ; copy the low word of mul into all four words
+%endif
+    pxor    m6, m6                      ; m6 = 0: dword accumulators for the scalar product
+    add v1q, orderq                     ; move v1/v2/v3 forward by orderq bytes ...
+    add v2q, orderq
+    add v3q, orderq
+    neg orderq                          ; ... and index them with a negative offset rising to 0
+.loop:                                  ; per element: sum += v1[i] * v2[i]; v1[i] += mul * v3[i]
+    movu    m0, [v2q + orderq]          ; v2 and v3 are read with unaligned loads
+    movu    m1, [v2q + orderq + mmsize]
+    mova    m4, [v1q + orderq]          ; v1 is read and written with aligned moves
+    mova    m5, [v1q + orderq + mmsize]
+    movu    m2, [v3q + orderq]
+    movu    m3, [v3q + orderq + mmsize]
+    pmaddwd m0, m4                      ; old v1 * v2, adjacent products summed into dwords
+    pmaddwd m1, m5
+    pmullw  m2, m7                      ; v3 * mul, keeping the low 16 bits of each product
+    pmullw  m3, m7
+    paddd   m6, m0                      ; accumulate the scalar product
+    paddd   m6, m1
+    paddw   m2, m4                      ; old v1 + v3 * mul (wrapping 16-bit add)
+    paddw   m3, m5
+    mova    [v1q + orderq], m2          ; store the updated v1 in place
+    mova    [v1q + orderq + mmsize], m3
+    add     orderq, mmsize*2            ; two registers' worth of data per iteration
+    jl .loop                            ; continue while the offset is still negative
+%if mmsize == 16
+    movhlps m0, m6                      ; add the upper two dword sums onto the lower two
+    paddd   m6, m0
+    pshuflw m0, m6, 0x4e                ; swap the two low dwords ...
+%else
+    pshufw  m0, m6, 0x4e                ; swap the two dwords ...
+%endif
+    paddd   m6, m0                      ; ... so the low dword ends up holding the total
+    movd   eax, m6                      ; return the total in eax
+    RET
+%endmacro
+
+INIT_MMX mmxext
+SCALARPRODUCT
+INIT_XMM sse2
+SCALARPRODUCT
+
+%macro SCALARPRODUCT_LOOP 1
+align 16
+.loop%1:                                ; %1 = byte offset of v2 within its 16-byte block (0 = aligned)
+    sub     orderq, mmsize*2            ; step down by two registers; sets the flags tested by jg below
+%if %1
+    mova    m1, m4                      ; m4 = aligned v2 block carried over from the previous step
+    mova    m4, [v2q + orderq]
+    mova    m0, [v2q + orderq + mmsize]
+    palignr m1, m0, %1                  ; m1 = v2 bytes starting at orderq + mmsize + %1
+    palignr m0, m4, %1                  ; m0 = v2 bytes starting at orderq + %1
+    mova    m3, m5                      ; same realignment for v3, with m5 as the carried block
+    mova    m5, [v3q + orderq]
+    mova    m2, [v3q + orderq + mmsize]
+    palignr m3, m2, %1
+    palignr m2, m5, %1
+%else
+    mova    m0, [v2q + orderq]          ; offset 0: plain aligned loads, no realignment needed
+    mova    m1, [v2q + orderq + mmsize]
+    mova    m2, [v3q + orderq]
+    mova    m3, [v3q + orderq + mmsize]
+%endif
+    %define t0  [v1q + orderq]
+    %define t1  [v1q + orderq + mmsize]
+%if ARCH_X86_64
+    mova    m8, t0                      ; x86-64 only: load v1 once into m8/m9, it is used twice below
+    mova    m9, t1
+    %define t0  m8
+    %define t1  m9
+%endif
+    pmaddwd m0, t0                      ; old v1 * v2, adjacent products summed into dwords
+    pmaddwd m1, t1
+    pmullw  m2, m7                      ; v3 * mul (m7 holds mul in every word)
+    pmullw  m3, m7
+    paddw   m2, t0                      ; old v1 + v3 * mul
+    paddw   m3, t1
+    paddd   m6, m0                      ; accumulate the scalar product in m6
+    paddd   m6, m1
+    mova    [v1q + orderq], m2          ; store the updated v1 in place
+    mova    [v1q + orderq + mmsize], m3
+    jg .loop%1                          ; flags still come from the sub above; nothing in between writes them
+%if %1
+    jmp .end                            ; only the %1 == 0 copy, expanded last, falls through into .end
+%endif
+%endmacro
+
+; int32_t ff_scalarproduct_and_madd_int16(int16_t *v1, const int16_t *v2,
+;                                         const int16_t *v3, int order, int mul)
+INIT_XMM ssse3
+cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
+    shl orderq, 1                       ; orderq *= 2 (sizeof(int16_t)); offsets below are in bytes
+    movd    m7, mulm                    ; m7 = mul in the low dword
+    pshuflw m7, m7, 0                   ; copy the low word of mul into the low four words
+    punpcklqdq m7, m7                   ; ... and duplicate those into the high four words
+    pxor    m6, m6                      ; m6 = 0: dword accumulators for the scalar product
+    mov    r4d, v2d
+    and    r4d, 15                      ; r4d = offset of v2 within its 16-byte block
+    and    v2q, ~15                     ; round v2 down to a 16-byte boundary
+    and    v3q, ~15                     ; NOTE(review): v3 reuses v2's offset below, so both presumably share it - confirm
+    mova    m4, [v2q + orderq]          ; preload the topmost aligned blocks; the loops walk downwards from here
+    mova    m5, [v3q + orderq]
+    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
+    cmp    r4d, 0
+    je .loop0
+    cmp    r4d, 2
+    je .loop2
+    cmp    r4d, 4
+    je .loop4
+    cmp    r4d, 6
+    je .loop6
+    cmp    r4d, 8
+    je .loop8
+    cmp    r4d, 10
+    je .loop10
+    cmp    r4d, 12
+    je .loop12                          ; any other offset falls through into .loop14
+SCALARPRODUCT_LOOP 14
+SCALARPRODUCT_LOOP 12
+SCALARPRODUCT_LOOP 10
+SCALARPRODUCT_LOOP 8
+SCALARPRODUCT_LOOP 6
+SCALARPRODUCT_LOOP 4
+SCALARPRODUCT_LOOP 2
+SCALARPRODUCT_LOOP 0
+.end:
+    movhlps m0, m6                      ; add the upper two dword sums onto the lower two
+    paddd   m6, m0
+    pshuflw m0, m6, 0x4e                ; swap the two low dwords ...
+    paddd   m6, m0                      ; ... so the low dword ends up holding the total
+    movd   eax, m6                      ; return the total in eax
+    RET
diff --git a/libavcodec/x86/apedsp_init.c b/libavcodec/x86/apedsp_init.c
new file mode 100644
index 0000000000..f692c2b9b6
--- /dev/null
+++ b/libavcodec/x86/apedsp_init.c
@@ -0,0 +1,47 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/apedsp.h"
+
+int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
+                                               const int16_t *v3,
+                                               int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
+                                             const int16_t *v3,
+                                             int order, int mul);
+int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
+                                              const int16_t *v3,
+                                              int order, int mul);
+
+av_cold void ff_apedsp_init_x86(APEDSPContext *c)
+{
+    int cpu_flags = av_get_cpu_flags();
+    /* Select c->scalarproduct_and_madd_int16 from the CPU flags; later matches override earlier ones. */
+    if (EXTERNAL_MMXEXT(cpu_flags))
+        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
+
+    if (EXTERNAL_SSE2(cpu_flags))
+        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
+    /* The SSSE3 variant is not installed when SSE4.2 or 3DNow! is also reported. */
+    if (EXTERNAL_SSSE3(cpu_flags) &&
+        !(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit (presumably: split loads are cheap there - confirm)
+        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
+}
diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index 684f09b7fc..b5d6d3cc65 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -53,52 +53,6 @@ cglobal scalarproduct_int16, 3,3,3, v1, v2, order
     paddd   m2, m0
     movd   eax, m2
     RET
-
-; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
-;                                     int order, int mul)
-cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
-    shl orderq, 1
-    movd    m7, mulm
-%if mmsize == 16
-    pshuflw m7, m7, 0
-    punpcklqdq m7, m7
-%else
-    pshufw  m7, m7, 0
-%endif
-    pxor    m6, m6
-    add v1q, orderq
-    add v2q, orderq
-    add v3q, orderq
-    neg orderq
-.loop:
-    movu    m0, [v2q + orderq]
-    movu    m1, [v2q + orderq + mmsize]
-    mova    m4, [v1q + orderq]
-    mova    m5, [v1q + orderq + mmsize]
-    movu    m2, [v3q + orderq]
-    movu    m3, [v3q + orderq + mmsize]
-    pmaddwd m0, m4
-    pmaddwd m1, m5
-    pmullw  m2, m7
-    pmullw  m3, m7
-    paddd   m6, m0
-    paddd   m6, m1
-    paddw   m2, m4
-    paddw   m3, m5
-    mova    [v1q + orderq], m2
-    mova    [v1q + orderq + mmsize], m3
-    add     orderq, mmsize*2
-    jl .loop
-%if mmsize == 16
-    movhlps m0, m6
-    paddd   m6, m0
-    pshuflw m0, m6, 0x4e
-%else
-    pshufw  m0, m6, 0x4e
-%endif
-    paddd   m6, m0
-    movd   eax, m6
-    RET
 %endmacro
 
 INIT_MMX mmxext
@@ -106,97 +60,6 @@ SCALARPRODUCT
 INIT_XMM sse2
 SCALARPRODUCT
 
-%macro SCALARPRODUCT_LOOP 1
-align 16
-.loop%1:
-    sub     orderq, mmsize*2
-%if %1
-    mova    m1, m4
-    mova    m4, [v2q + orderq]
-    mova    m0, [v2q + orderq + mmsize]
-    palignr m1, m0, %1
-    palignr m0, m4, %1
-    mova    m3, m5
-    mova    m5, [v3q + orderq]
-    mova    m2, [v3q + orderq + mmsize]
-    palignr m3, m2, %1
-    palignr m2, m5, %1
-%else
-    mova    m0, [v2q + orderq]
-    mova    m1, [v2q + orderq + mmsize]
-    mova    m2, [v3q + orderq]
-    mova    m3, [v3q + orderq + mmsize]
-%endif
-    %define t0  [v1q + orderq]
-    %define t1  [v1q + orderq + mmsize]
-%if ARCH_X86_64
-    mova    m8, t0
-    mova    m9, t1
-    %define t0  m8
-    %define t1  m9
-%endif
-    pmaddwd m0, t0
-    pmaddwd m1, t1
-    pmullw  m2, m7
-    pmullw  m3, m7
-    paddw   m2, t0
-    paddw   m3, t1
-    paddd   m6, m0
-    paddd   m6, m1
-    mova    [v1q + orderq], m2
-    mova    [v1q + orderq + mmsize], m3
-    jg .loop%1
-%if %1
-    jmp .end
-%endif
-%endmacro
-
-; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
-;                                     int order, int mul)
-INIT_XMM ssse3
-cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
-    shl orderq, 1
-    movd    m7, mulm
-    pshuflw m7, m7, 0
-    punpcklqdq m7, m7
-    pxor    m6, m6
-    mov    r4d, v2d
-    and    r4d, 15
-    and    v2q, ~15
-    and    v3q, ~15
-    mova    m4, [v2q + orderq]
-    mova    m5, [v3q + orderq]
-    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
-    cmp    r4d, 0
-    je .loop0
-    cmp    r4d, 2
-    je .loop2
-    cmp    r4d, 4
-    je .loop4
-    cmp    r4d, 6
-    je .loop6
-    cmp    r4d, 8
-    je .loop8
-    cmp    r4d, 10
-    je .loop10
-    cmp    r4d, 12
-    je .loop12
-SCALARPRODUCT_LOOP 14
-SCALARPRODUCT_LOOP 12
-SCALARPRODUCT_LOOP 10
-SCALARPRODUCT_LOOP 8
-SCALARPRODUCT_LOOP 6
-SCALARPRODUCT_LOOP 4
-SCALARPRODUCT_LOOP 2
-SCALARPRODUCT_LOOP 0
-.end:
-    movhlps m0, m6
-    paddd   m6, m0
-    pshuflw m0, m6, 0x4e
-    paddd   m6, m0
-    movd   eax, m6
-    RET
-
 
 ;-----------------------------------------------------------------------------
 ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c
index 10fa166db4..9b0788ff73 100644
--- a/libavcodec/x86/dsputil_init.c
+++ b/libavcodec/x86/dsputil_init.c
@@ -76,15 +76,6 @@ int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
                                       int order);
 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
                                     int order);
-int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
-                                               const int16_t *v3,
-                                               int order, int mul);
-int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
-                                             const int16_t *v3,
-                                             int order, int mul);
-int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
-                                              const int16_t *v3,
-                                              int order, int mul);
 
 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
@@ -568,7 +559,6 @@ static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
     SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmxext, );
 
     c->scalarproduct_int16          = ff_scalarproduct_int16_mmxext;
-    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
 #endif /* HAVE_MMXEXT_EXTERNAL */
 }
 
@@ -607,7 +597,6 @@ static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
 
 #if HAVE_SSE2_EXTERNAL
     c->scalarproduct_int16          = ff_scalarproduct_int16_sse2;
-    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
     if (cpu_flags & AV_CPU_FLAG_ATOM) {
         c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
     } else {
@@ -621,8 +610,6 @@ static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
                                        int cpu_flags, unsigned high_bit_depth)
 {
 #if HAVE_SSSE3_EXTERNAL
-    if (!(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
-        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
     c->bswap_buf = ff_bswap32_buf_ssse3;
 #endif /* HAVE_SSSE3_EXTERNAL */
 }
author	Diego Biurrun <diego@biurrun.de>	2013-12-29 02:32:16 +0100
committer	Diego Biurrun <diego@biurrun.de>	2014-05-29 06:41:15 -0700
commit	054013a0fc6f2b52c60cee3e051be8cc7f82cef3 (patch)
tree	87098f4b0443359b7109066486c15fdaad09dddb /libavcodec/x86
parent	256da0770e495176d1b2699ec6e9c7993c2a6d7b (diff)