summaryrefslogtreecommitdiff
path: root/libavfilter/x86/af_afir.asm
diff options
context:
space:
mode:
Diffstat (limited to 'libavfilter/x86/af_afir.asm')
-rw-r--r--libavfilter/x86/af_afir.asm60
1 files changed, 60 insertions, 0 deletions
diff --git a/libavfilter/x86/af_afir.asm b/libavfilter/x86/af_afir.asm
new file mode 100644
index 0000000000..849d85e70f
--- /dev/null
+++ b/libavfilter/x86/af_afir.asm
@@ -0,0 +1,60 @@
+;*****************************************************************************
+;* x86-optimized functions for afir filter
+;* Copyright (c) 2017 Paul B Mahol
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+;------------------------------------------------------------------------------
+; void ff_fcmul_add(float *sum, const float *t, const float *c, int len) -- sum += t * c, treating each buffer as interleaved (re, im) float pairs
+;------------------------------------------------------------------------------
+
+INIT_XMM sse3 ; 128-bit XMM registers (mmsize = 16); movsldup/movshdup/addsubps are SSE3 instructions
+cglobal fcmul_add, 4,4,6, sum, t, c, len ; x86inc: 4 arguments, 4 GPRs, 6 XMM registers (m0-m5)
+ shl lend, 3 ; len * 8 = byte size of len (re, im) float pairs
+ add lend, mmsize*2 ; plus 32 bytes: the loop also covers 4 pairs past len. NOTE(review): presumably relies on padded buffers (tail element?) -- confirm against callers
+ add tq, lenq ; move all three pointers to the end of the processed range...
+ add cq, lenq
+ add sumq, lenq
+ neg lenq ; ...and index them with a negative byte offset that counts up towards zero
+ALIGN 16 ; align the loop entry to 16 bytes
+.loop: ; one iteration = 2*mmsize bytes = 4 complex values, as two interleaved streams (m0-m2 and m3-m5)
+ movsldup m0, [tq + lenq] ; m0 = even floats of t duplicated: (t0.re, t0.re, t1.re, t1.re)
+ movsldup m3, [tq + lenq+mmsize] ; same for the next 16 bytes
+ movaps m1, [cq + lenq] ; m1 = (c0.re, c0.im, c1.re, c1.im); movaps needs a 16-byte aligned address
+ movaps m4, [cq + lenq+mmsize]
+ mulps m0, m1 ; m0 = (t.re*c.re, t.re*c.im, ...)
+ mulps m3, m4
+ shufps m1, m1, 0xb1 ; swap the two floats of each pair: m1 = (c0.im, c0.re, c1.im, c1.re)
+ shufps m4, m4, 0xb1
+ movshdup m2, [tq + lenq] ; m2 = odd floats of t duplicated: (t0.im, t0.im, t1.im, t1.im)
+ movshdup m5, [tq + lenq+mmsize]
+ mulps m2, m1 ; m2 = (t.im*c.im, t.im*c.re, ...)
+ mulps m5, m4
+ addsubps m0, m2 ; subtract in even lanes, add in odd lanes: (t.re*c.re - t.im*c.im, t.re*c.im + t.im*c.re)
+ addsubps m3, m5
+ addps m0, [sumq + lenq] ; accumulate the product onto the existing sum values
+ addps m3, [sumq + lenq+mmsize]
+ movaps [sumq + lenq], m0 ; store back to sum (aligned store)
+ movaps [sumq + lenq+mmsize], m3
+ add lenq, mmsize*2 ; advance the offset by 32 bytes; also sets the flags tested by jl
+ jl .loop ; keep going while the offset is still negative
+ REP_RET ; x86inc return macro