summaryrefslogtreecommitdiff
path: root/libavutil/x86/x86util.asm
diff options
context:
space:
mode:
Diffstat (limited to 'libavutil/x86/x86util.asm')
-rw-r--r--libavutil/x86/x86util.asm242
1 files changed, 231 insertions, 11 deletions
diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
index bba958ebfc..fe9a727e22 100644
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -6,20 +6,20 @@
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Holger Lubitz <holger@lubitz.org>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -29,6 +29,11 @@
%include "libavutil/x86/x86inc.asm"
+; expands to [base],...,[base+7*stride]
+%define PASS8ROWS(base, base3, stride, stride3) \
+ [base], [base + stride], [base + 2*stride], [base3], \
+ [base3 + stride], [base3 + 2*stride], [base3 + stride3], [base3 + stride*4]
+
; Interleave low src0 with low src1 and store in src0,
; interleave high src0 with high src1 and store in src1.
; %1 - types
@@ -40,7 +45,10 @@
; output: src0: x0 y0 x1 y1 x2 y2 x3 y3
; src1: z0 q0 z1 q1 z2 q2 z3 q3
%macro SBUTTERFLY 4
-%if avx_enabled == 0
+%ifidn %1, dqqq
+ vperm2i128 m%4, m%2, m%3, q0301
+ vinserti128 m%2, m%2, xm%3, 1
+%elif avx_enabled == 0
mova m%4, m%2
punpckl%1 m%2, m%3
punpckh%1 m%4, m%3
@@ -79,6 +87,15 @@
SWAP %2, %3
%endmacro
+%macro TRANSPOSE2x4x4B 5
+ SBUTTERFLY bw, %1, %2, %5
+ SBUTTERFLY bw, %3, %4, %5
+ SBUTTERFLY wd, %1, %3, %5
+ SBUTTERFLY wd, %2, %4, %5
+ SBUTTERFLY dq, %1, %2, %5
+ SBUTTERFLY dq, %3, %4, %5
+%endmacro
+
%macro TRANSPOSE2x4x4W 5
SBUTTERFLY wd, %1, %2, %5
SBUTTERFLY wd, %3, %4, %5
@@ -108,6 +125,43 @@
SWAP %5, %2, %3
%endmacro
+%macro TRANSPOSE8x4D 9-11
+%if ARCH_X86_64
+ SBUTTERFLY dq, %1, %2, %9
+ SBUTTERFLY dq, %3, %4, %9
+ SBUTTERFLY dq, %5, %6, %9
+ SBUTTERFLY dq, %7, %8, %9
+ SBUTTERFLY qdq, %1, %3, %9
+ SBUTTERFLY qdq, %2, %4, %9
+ SBUTTERFLY qdq, %5, %7, %9
+ SBUTTERFLY qdq, %6, %8, %9
+ SWAP %2, %5
+ SWAP %4, %7
+%else
+; in: m0..m7
+; out: m0..m7, unless %11 in which case m2 is in %9
+; spills into %9 and %10
+ movdqa %9, m%7
+ SBUTTERFLY dq, %1, %2, %7
+ movdqa %10, m%2
+ movdqa m%7, %9
+ SBUTTERFLY dq, %3, %4, %2
+ SBUTTERFLY dq, %5, %6, %2
+ SBUTTERFLY dq, %7, %8, %2
+ SBUTTERFLY qdq, %1, %3, %2
+ movdqa %9, m%3
+ movdqa m%2, %10
+ SBUTTERFLY qdq, %2, %4, %3
+ SBUTTERFLY qdq, %5, %7, %3
+ SBUTTERFLY qdq, %6, %8, %3
+ SWAP %2, %5
+ SWAP %4, %7
+%if %0<11
+ movdqa m%3, %9
+%endif
+%endif
+%endmacro
+
%macro TRANSPOSE8x8W 9-11
%if ARCH_X86_64
SBUTTERFLY wd, %1, %2, %9
@@ -157,6 +211,85 @@
%endif
%endmacro
+%macro TRANSPOSE16x16W 18-19
+; in: m0..m15, unless %19 in which case m6 is in %17
+; out: m0..m15, unless %19 in which case m4 is in %18
+; spills into %17 and %18
+%if %0 < 19
+ mova %17, m%7
+%endif
+
+ SBUTTERFLY dqqq, %1, %9, %7
+ SBUTTERFLY dqqq, %2, %10, %7
+ SBUTTERFLY dqqq, %3, %11, %7
+ SBUTTERFLY dqqq, %4, %12, %7
+ SBUTTERFLY dqqq, %5, %13, %7
+ SBUTTERFLY dqqq, %6, %14, %7
+ mova %18, m%14
+ mova m%7, %17
+ SBUTTERFLY dqqq, %7, %15, %14
+ SBUTTERFLY dqqq, %8, %16, %14
+
+ SBUTTERFLY wd, %1, %2, %14
+ SBUTTERFLY wd, %3, %4, %14
+ SBUTTERFLY wd, %5, %6, %14
+ SBUTTERFLY wd, %7, %8, %14
+ SBUTTERFLY wd, %9, %10, %14
+ SBUTTERFLY wd, %11, %12, %14
+ mova %17, m%12
+ mova m%14, %18
+ SBUTTERFLY wd, %13, %14, %12
+ SBUTTERFLY wd, %15, %16, %12
+
+ SBUTTERFLY dq, %1, %3, %12
+ SBUTTERFLY dq, %2, %4, %12
+ SBUTTERFLY dq, %5, %7, %12
+ SBUTTERFLY dq, %6, %8, %12
+ SBUTTERFLY dq, %9, %11, %12
+ mova %18, m%11
+ mova m%12, %17
+ SBUTTERFLY dq, %10, %12, %11
+ SBUTTERFLY dq, %13, %15, %11
+ SBUTTERFLY dq, %14, %16, %11
+
+ SBUTTERFLY qdq, %1, %5, %11
+ SBUTTERFLY qdq, %2, %6, %11
+ SBUTTERFLY qdq, %3, %7, %11
+ SBUTTERFLY qdq, %4, %8, %11
+
+ SWAP %2, %5
+ SWAP %4, %7
+
+ SBUTTERFLY qdq, %9, %13, %11
+ SBUTTERFLY qdq, %10, %14, %11
+ mova m%11, %18
+ mova %18, m%5
+ SBUTTERFLY qdq, %11, %15, %5
+ SBUTTERFLY qdq, %12, %16, %5
+
+%if %0 < 19
+ mova m%5, %18
+%endif
+
+ SWAP %10, %13
+ SWAP %12, %15
+%endmacro
+
+%macro TRANSPOSE_8X8B 8
+ %if mmsize == 8
+ %error "This macro does not support mmsize == 8"
+ %endif
+ punpcklbw m%1, m%2
+ punpcklbw m%3, m%4
+ punpcklbw m%5, m%6
+ punpcklbw m%7, m%8
+ TRANSPOSE4x4W %1, %3, %5, %7, %2
+ MOVHL m%2, m%1
+ MOVHL m%4, m%3
+ MOVHL m%6, m%5
+ MOVHL m%8, m%7
+%endmacro
+
; PABSW macro assumes %1 != %2, while ABS1/2 macros work in-place
%macro PABSW 2
%if cpuflag(ssse3)
@@ -174,13 +307,13 @@
%endif
%endmacro
-%macro PSIGNW_MMX 2
+%macro PSIGNW 2
+%if cpuflag(ssse3)
+ psignw %1, %2
+%else
pxor %1, %2
psubw %1, %2
-%endmacro
-
-%macro PSIGNW_SSSE3 2
- psignw %1, %2
+%endif
%endmacro
%macro ABS1 2
@@ -283,6 +416,55 @@
%endif
%endmacro
+%macro HADDD 2 ; sum junk
+%if sizeof%1 == 32
+%define %2 xmm%2
+ vextracti128 %2, %1, 1
+%define %1 xmm%1
+ paddd %1, %2
+%endif
+%if mmsize >= 16
+%if cpuflag(xop) && sizeof%1 == 16
+ vphadddq %1, %1
+%endif
+ movhlps %2, %1
+ paddd %1, %2
+%endif
+%if notcpuflag(xop) || sizeof%1 != 16
+%if cpuflag(mmxext)
+ PSHUFLW %2, %1, q0032
+%else ; mmx
+ mova %2, %1
+ psrlq %2, 32
+%endif
+ paddd %1, %2
+%endif
+%undef %1
+%undef %2
+%endmacro
+
+%macro HADDW 2 ; reg, tmp
+%if cpuflag(xop) && sizeof%1 == 16
+ vphaddwq %1, %1
+ movhlps %2, %1
+ paddd %1, %2
+%else
+ pmaddwd %1, [pw_1]
+ HADDD %1, %2
+%endif
+%endmacro
+
+%macro HADDPS 3 ; dst, src, tmp
+%if cpuflag(sse3)
+ haddps %1, %1, %2
+%else
+ movaps %3, %1
+ shufps %1, %2, q2020
+ shufps %3, %2, q3131
+ addps %1, %3
+%endif
+%endmacro
+
%macro PALIGNR 4-5
%if cpuflag(ssse3)
%if %0==5
@@ -312,11 +494,19 @@
%endif
%endmacro
-%macro PAVGB 2
+%macro PAVGB 2-4
%if cpuflag(mmxext)
pavgb %1, %2
%elif cpuflag(3dnow)
pavgusb %1, %2
+%elif cpuflag(mmx)
+ movu %3, %2
+ por %3, %1
+ pxor %1, %2
+ pand %1, %4
+ psrlq %1, 1
+ psubb %3, %1
+ SWAP %1, %3
%endif
%endmacro
@@ -643,7 +833,9 @@
%if cpuflag(avx)
vbroadcastss %1, %2
%else ; sse
+%ifnidn %1, %2
movss %1, %2
+%endif
shufps %1, %1, 0
%endif
%endmacro
@@ -695,3 +887,31 @@
addps %1, %4
%endif
%endmacro
+
+%macro LSHIFT 2
+%if mmsize > 8
+ pslldq %1, %2
+%else
+ psllq %1, 8*(%2)
+%endif
+%endmacro
+
+%macro RSHIFT 2
+%if mmsize > 8
+ psrldq %1, %2
+%else
+ psrlq %1, 8*(%2)
+%endif
+%endmacro
+
+%macro MOVHL 2 ; dst, src
+%ifidn %1, %2
+ punpckhqdq %1, %2
+%elif cpuflag(avx)
+ punpckhqdq %1, %2, %2
+%elif cpuflag(sse4)
+ pshufd %1, %2, q3232 ; pshufd is slow on some older CPUs, so only use it on more modern ones
+%else
+ movhlps %1, %2 ; may cause an int/float domain transition and has a dependency on dst
+%endif
+%endmacro