From dd2c9034b174a2b17f8e3ed972c49720bab1d4c1 Mon Sep 17 00:00:00 2001
From: James Almer <jamrial@gmail.com>
Date: Mon, 30 Jun 2014 13:06:00 -0300
Subject: x86/swr: convert resample_{common, linear}_double_sse2 to yasm

Signed-off-by: James Almer <jamrial@gmail.com>

312531 -> 311528 decicycles

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
---
 libswresample/resample_template.c    |  22 +------
 libswresample/x86/resample.asm       | 122 ++++++++++++++++++-----------------
 libswresample/x86/resample_mmx.h     |  72 ---------------------
 libswresample/x86/resample_x86_dsp.c |  27 +++-----
 4 files changed, 74 insertions(+), 169 deletions(-)
 delete mode 100644 libswresample/x86/resample_mmx.h

(limited to 'libswresample')

diff --git a/libswresample/resample_template.c b/libswresample/resample_template.c
index 2a64f50038..4f1638edb1 100644
--- a/libswresample/resample_template.c
+++ b/libswresample/resample_template.c
@@ -25,23 +25,15 @@
  * @author Michael Niedermayer <michaelni@gmx.at>
  */
 
-#if    defined(TEMPLATE_RESAMPLE_DBL)     \
-    || defined(TEMPLATE_RESAMPLE_DBL_SSE2)
+#if defined(TEMPLATE_RESAMPLE_DBL)
 
+#    define RENAME(N) N ## _double
 #    define FILTER_SHIFT 0
 #    define DELEM  double
 #    define FELEM  double
 #    define FELEM2 double
 #    define OUT(d, v) d = v
 
-#    if defined(TEMPLATE_RESAMPLE_DBL)
-#        define RENAME(N) N ## _double
-#    elif defined(TEMPLATE_RESAMPLE_DBL_SSE2)
-#        define COMMON_CORE COMMON_CORE_DBL_SSE2
-#        define LINEAR_CORE LINEAR_CORE_DBL_SSE2
-#        define RENAME(N) N ## _double_sse2
-#    endif
-
 #elif    defined(TEMPLATE_RESAMPLE_FLT)
 
 #    define RENAME(N) N ## _float
@@ -104,16 +96,12 @@ int RENAME(swri_resample_common)(ResampleContext *c,
     for (dst_index = 0; dst_index < n; dst_index++) {
         FELEM *filter = ((FELEM *) c->filter_bank) + c->filter_alloc * index;
 
-#ifdef COMMON_CORE
-        COMMON_CORE
-#else
         FELEM2 val=0;
         int i;
         for (i = 0; i < c->filter_length; i++) {
             val += src[sample_index + i] * (FELEM2)filter[i];
         }
         OUT(dst[dst_index], val);
-#endif
 
         frac  += c->dst_incr_mod;
         index += c->dst_incr_div;
@@ -150,15 +138,11 @@ int RENAME(swri_resample_linear)(ResampleContext *c,
         FELEM *filter = ((FELEM *) c->filter_bank) + c->filter_alloc * index;
         FELEM2 val=0, v2 = 0;
 
-#ifdef LINEAR_CORE
-        LINEAR_CORE
-#else
         int i;
         for (i = 0; i < c->filter_length; i++) {
             val += src[sample_index + i] * (FELEM2)filter[i];
             v2  += src[sample_index + i] * (FELEM2)filter[i + c->filter_alloc];
         }
-#endif
 #ifdef FELEML
         val += (v2 - val) * (FELEML) frac / c->src_incr;
 #else
@@ -188,8 +172,6 @@ int RENAME(swri_resample_linear)(ResampleContext *c,
     return sample_index;
 }
 
-#undef COMMON_CORE
-#undef LINEAR_CORE
 #undef RENAME
 #undef FILTER_SHIFT
 #undef DELEM
diff --git a/libswresample/x86/resample.asm b/libswresample/x86/resample.asm
index 2fe03c846b..bce1389bec 100644
--- a/libswresample/x86/resample.asm
+++ b/libswresample/x86/resample.asm
@@ -50,11 +50,12 @@ endstruc
 SECTION_RODATA
 
 pf_1:      dd 1.0
+pdbl_1:    dq 1.0
 pd_0x4000: dd 0x4000
 
 SECTION .text
 
-%macro RESAMPLE_FNS 3 ; format [float or int16], bps, log2_bps
+%macro RESAMPLE_FNS 3-5 ; format [float, double or int16], bps, log2_bps, float op suffix [s or d] and 1.0 constant label (float/double only)
 ; int resample_common_$format(ResampleContext *ctx, $format *dst,
 ;                             const $format *src, int size, int update_ctx)
 %if ARCH_X86_64 ; unix64 and win64
@@ -165,21 +166,21 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
     lea                      filterq, [min_filter_count_x4q+filterq*%2]
     mov         min_filter_count_x4q, min_filter_length_x4q
 %endif
-%ifidn %1, float
-    xorps                         m0, m0, m0
-%else ; int16
+%ifidn %1, int16
     movd                          m0, [pd_0x4000]
+%else ; float/double
+    xorps                         m0, m0, m0
 %endif
 
     align 16
 .inner_loop:
     movu                          m1, [srcq+min_filter_count_x4q*1]
-%ifidn %1, float
-    mulps                         m1, m1, [filterq+min_filter_count_x4q*1]
-    addps                         m0, m0, m1
-%else ; int16
+%ifidn %1, int16
     pmaddwd                       m1, [filterq+min_filter_count_x4q*1]
     paddd                         m0, m1
+%else ; float/double
+    mulp%4                        m1, m1, [filterq+min_filter_count_x4q*1]
+    addp%4                        m0, m0, m1
 %endif
     add         min_filter_count_x4q, mmsize
     js .inner_loop
@@ -189,16 +190,7 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
     addps                        xm0, xm1
 %endif
 
-    ; horizontal sum & store
-%ifidn %1, float
-    movhlps                      xm1, xm0
-    addps                        xm0, xm1
-    shufps                       xm1, xm0, xm0, q0001
-    add                        fracd, dst_incr_modd
-    addps                        xm0, xm1
-    add                       indexd, dst_incr_divd
-    movss                     [dstq], xm0
-%else ; int16
+%ifidn %1, int16
 %if mmsize == 16
     pshufd                        m1, m0, q0032
     paddd                         m0, m1
@@ -212,6 +204,17 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
     packssdw                      m0, m0
     add                       indexd, dst_incr_divd
     movd                      [dstq], m0
+%else ; float/double
+    ; horizontal sum & store
+    movhlps                      xm1, xm0
+%ifidn %1, float
+    addps                        xm0, xm1
+    shufps                       xm1, xm0, xm0, q0001
+%endif
+    add                        fracd, dst_incr_modd
+    addp%4                       xm0, xm1
+    add                       indexd, dst_incr_divd
+    movs%4                    [dstq], xm0
 %endif
     cmp                        fracd, src_incrd
     jl .skip
@@ -307,12 +310,12 @@ cglobal resample_linear_%1, 0, 15, 5, ctx, phase_mask, src, phase_shift, index,
     mov                   ctx_stackq, ctxq
     mov            phase_mask_stackd, phase_maskd
     mov           min_filter_len_x4d, [ctxq+ResampleContext.filter_length]
-%ifidn %1, float
-    cvtsi2ss                     xm0, src_incrd
-    movss                        xm4, [pf_1]
-    divss                        xm4, xm0
-%else ; int16
+%ifidn %1, int16
     movd                          m4, [pd_0x4000]
+%else ; float/double
+    cvtsi2s%4                    xm0, src_incrd
+    movs%4                       xm4, [%5]
+    divs%4                       xm4, xm0
 %endif
     mov                dst_incr_divd, [ctxq+ResampleContext.dst_incr_div]
     shl           min_filter_len_x4d, %3
@@ -360,12 +363,12 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
     mov                           r3, dword [ctxq+ResampleContext.src_incr]
     PUSH                              dword [ctxq+ResampleContext.phase_mask]
     PUSH                              r3d
-%ifidn %1, float
-    cvtsi2ss                     xm0, r3d
-    movss                        xm4, [pf_1]
-    divss                        xm4, xm0
-%else ; int16
+%ifidn %1, int16
     movd                          m4, [pd_0x4000]
+%else ; float/double
+    cvtsi2s%4                    xm0, r3d
+    movs%4                       xm4, [%5]
+    divs%4                       xm4, xm0
 %endif
     mov        min_filter_length_x4d, [ctxq+ResampleContext.filter_length]
     mov                       indexd, [ctxq+ResampleContext.index]
@@ -409,27 +412,27 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
     mov                     filter2q, filter1q
     add                     filter2q, filter_alloc_x4q
 %endif
-%ifidn %1, float
-    xorps                         m0, m0, m0
-    xorps                         m2, m2, m2
-%else ; int16
+%ifidn %1, int16
     mova                          m0, m4
     mova                          m2, m4
+%else ; float/double
+    xorps                         m0, m0, m0
+    xorps                         m2, m2, m2
 %endif
 
     align 16
 .inner_loop:
     movu                          m1, [srcq+min_filter_count_x4q*1]
-%ifidn %1, float
-    mulps                         m3, m1, [filter2q+min_filter_count_x4q*1]
-    mulps                         m1, m1, [filter1q+min_filter_count_x4q*1]
-    addps                         m2, m2, m3
-    addps                         m0, m0, m1
-%else ; int16
+%ifidn %1, int16
     pmaddwd                       m3, m1, [filter2q+min_filter_count_x4q*1]
     pmaddwd                       m1, [filter1q+min_filter_count_x4q*1]
     paddd                         m2, m3
     paddd                         m0, m1
+%else ; float/double
+    mulp%4                        m3, m1, [filter2q+min_filter_count_x4q*1]
+    mulp%4                        m1, m1, [filter1q+min_filter_count_x4q*1]
+    addp%4                        m2, m2, m3
+    addp%4                        m0, m0, m1
 %endif
     add         min_filter_count_x4q, mmsize
     js .inner_loop
@@ -441,24 +444,7 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
     addps                        xm2, xm3
 %endif
 
-%ifidn %1, float
-    ; val += (v2 - val) * (FELEML) frac / c->src_incr;
-    cvtsi2ss                     xm1, fracd
-    subps                        xm2, xm0
-    mulps                        xm1, xm4
-    shufps                       xm1, xm1, q0000
-    mulps                        xm2, xm1
-    addps                        xm0, xm2
-
-    ; horizontal sum & store
-    movhlps                      xm1, xm0
-    addps                        xm0, xm1
-    shufps                       xm1, xm0, xm0, q0001
-    add                        fracd, dst_incr_modd
-    addps                        xm0, xm1
-    add                       indexd, dst_incr_divd
-    movss                     [dstq], xm0
-%else ; int16
+%ifidn %1, int16
 %if mmsize == 16
     pshufd                        m3, m2, q0032
     pshufd                        m1, m0, q0032
@@ -491,6 +477,25 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
     ; - 32bit: eax=r0[filter1], edx=r2[filter2]
     ; - win64: eax=r6[filter1], edx=r1[todo]
     ; - unix64: eax=r6[filter1], edx=r2[todo]
+%else ; float/double
+    ; val += (v2 - val) * (FELEML) frac / c->src_incr;
+    cvtsi2s%4                    xm1, fracd
+    subp%4                       xm2, xm0
+    mulp%4                       xm1, xm4
+    shufp%4                      xm1, xm1, q0000
+    mulp%4                       xm2, xm1
+    addp%4                       xm0, xm2
+
+    ; horizontal sum & store
+    movhlps                      xm1, xm0
+%ifidn %1, float
+    addps                        xm0, xm1
+    shufps                       xm1, xm0, xm0, q0001
+%endif
+    add                        fracd, dst_incr_modd
+    addp%4                       xm0, xm1
+    add                       indexd, dst_incr_divd
+    movs%4                    [dstq], xm0
 %endif
     cmp                        fracd, src_incrd
     jl .skip
@@ -553,11 +558,11 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
 %endmacro
 
 INIT_XMM sse
-RESAMPLE_FNS float, 4, 2
+RESAMPLE_FNS float, 4, 2, s, pf_1
 
 %if HAVE_AVX_EXTERNAL
 INIT_YMM avx
-RESAMPLE_FNS float, 4, 2
+RESAMPLE_FNS float, 4, 2, s, pf_1
 %endif
 
 %if ARCH_X86_32
@@ -567,3 +572,4 @@ RESAMPLE_FNS int16, 2, 1
 
 INIT_XMM sse2
 RESAMPLE_FNS int16, 2, 1
+RESAMPLE_FNS double, 8, 3, d, pdbl_1
diff --git a/libswresample/x86/resample_mmx.h b/libswresample/x86/resample_mmx.h
deleted file mode 100644
index b0ea496361..0000000000
--- a/libswresample/x86/resample_mmx.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2012 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/x86/asm.h"
-#include "libavutil/cpu.h"
-#include "libswresample/swresample_internal.h"
-
-#define COMMON_CORE_DBL_SSE2 \
-    x86_reg len= -8*c->filter_length;\
-__asm__ volatile(\
-    "xorpd     %%xmm0, %%xmm0     \n\t"\
-    "1:                           \n\t"\
-    "movupd  (%1, %0), %%xmm1     \n\t"\
-    "mulpd   (%2, %0), %%xmm1     \n\t"\
-    "addpd     %%xmm1, %%xmm0     \n\t"\
-    "add       $16, %0            \n\t"\
-    " js 1b                       \n\t"\
-    "movhlps   %%xmm0, %%xmm1     \n\t"\
-    "addpd     %%xmm1, %%xmm0     \n\t"\
-    "movsd     %%xmm0, (%3)       \n\t"\
-    : "+r" (len)\
-    : "r" (((uint8_t*)(src+sample_index))-len),\
-      "r" (((uint8_t*)filter)-len),\
-      "r" (dst+dst_index)\
-    XMM_CLOBBERS_ONLY("%xmm0", "%xmm1")\
-);
-
-#define LINEAR_CORE_DBL_SSE2 \
-    x86_reg len= -8*c->filter_length;\
-__asm__ volatile(\
-    "xorpd      %%xmm0, %%xmm0    \n\t"\
-    "xorpd      %%xmm2, %%xmm2    \n\t"\
-    "1:                           \n\t"\
-    "movupd   (%3, %0), %%xmm1    \n\t"\
-    "movapd     %%xmm1, %%xmm3    \n\t"\
-    "mulpd    (%4, %0), %%xmm1    \n\t"\
-    "mulpd    (%5, %0), %%xmm3    \n\t"\
-    "addpd      %%xmm1, %%xmm0    \n\t"\
-    "addpd      %%xmm3, %%xmm2    \n\t"\
-    "add           $16, %0        \n\t"\
-    " js 1b                       \n\t"\
-    "movhlps    %%xmm0, %%xmm1    \n\t"\
-    "movhlps    %%xmm2, %%xmm3    \n\t"\
-    "addpd      %%xmm1, %%xmm0    \n\t"\
-    "addpd      %%xmm3, %%xmm2    \n\t"\
-    "movsd      %%xmm0, %1        \n\t"\
-    "movsd      %%xmm2, %2        \n\t"\
-    : "+r" (len),\
-      "=m" (val),\
-      "=m" (v2)\
-    : "r" (((uint8_t*)(src+sample_index))-len),\
-      "r" (((uint8_t*)filter)-len),\
-      "r" (((uint8_t*)(filter+c->filter_alloc))-len)\
-    XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3")\
-);
diff --git a/libswresample/x86/resample_x86_dsp.c b/libswresample/x86/resample_x86_dsp.c
index 5130ecdd1b..9049da6951 100644
--- a/libswresample/x86/resample_x86_dsp.c
+++ b/libswresample/x86/resample_x86_dsp.c
@@ -27,21 +27,6 @@
 
 #include "libswresample/resample.h"
 
-int swri_resample_common_double_sse2(ResampleContext *c,  double *dst, const  double *src, int n, int update_ctx);
-int swri_resample_linear_double_sse2(ResampleContext *c,  double *dst, const  double *src, int n, int update_ctx);
-
-#if HAVE_SSE2_INLINE
-#define DO_RESAMPLE_ONE 0
-
-#include "resample_mmx.h"
-
-#define TEMPLATE_RESAMPLE_DBL_SSE2
-#include "libswresample/resample_template.c"
-#undef TEMPLATE_RESAMPLE_DBL_SSE2
-#endif
-
-#undef DO_RESAMPLE_ONE
-
 int ff_resample_common_int16_mmxext(ResampleContext *c, uint8_t *dst,
                                     const uint8_t *src, int sz, int upd);
 int ff_resample_linear_int16_mmxext(ResampleContext *c, uint8_t *dst,
@@ -62,6 +47,11 @@ int ff_resample_common_float_avx(ResampleContext *c, uint8_t *dst,
 int ff_resample_linear_float_avx(ResampleContext *c, uint8_t *dst,
                                  const uint8_t *src, int sz, int upd);
 
+int ff_resample_common_double_sse2(ResampleContext *c, uint8_t *dst,
+                                   const uint8_t *src, int sz, int upd);
+int ff_resample_linear_double_sse2(ResampleContext *c, uint8_t *dst,
+                                   const uint8_t *src, int sz, int upd);
+
 void swresample_dsp_x86_init(ResampleContext *c)
 {
     int av_unused mm_flags = av_get_cpu_flags();
@@ -78,10 +68,9 @@ void swresample_dsp_x86_init(ResampleContext *c)
     if (HAVE_SSE2_EXTERNAL && mm_flags & AV_CPU_FLAG_SSE2) {
         c->dsp.resample_common[FNIDX(S16P)] = ff_resample_common_int16_sse2;
         c->dsp.resample_linear[FNIDX(S16P)] = ff_resample_linear_int16_sse2;
-    }
-    if (HAVE_SSE2_INLINE && mm_flags & AV_CPU_FLAG_SSE2) {
-        c->dsp.resample_common[FNIDX(DBLP)] = (resample_fn) swri_resample_common_double_sse2;
-        c->dsp.resample_linear[FNIDX(DBLP)] = (resample_fn) swri_resample_linear_double_sse2;
+
+        c->dsp.resample_common[FNIDX(DBLP)] = ff_resample_common_double_sse2;
+        c->dsp.resample_linear[FNIDX(DBLP)] = ff_resample_linear_double_sse2;
     }
     if (HAVE_AVX_EXTERNAL && mm_flags & AV_CPU_FLAG_AVX) {
         c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_avx;
-- 
cgit v1.2.3