Diffstat (limited to 'libavcodec/x86/mpegvideoencdsp.asm')
 libavcodec/x86/mpegvideoencdsp.asm | 151 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 105 insertions(+), 46 deletions(-)
diff --git a/libavcodec/x86/mpegvideoencdsp.asm b/libavcodec/x86/mpegvideoencdsp.asm
index 9326ee776d..aec73f82dc 100644
--- a/libavcodec/x86/mpegvideoencdsp.asm
+++ b/libavcodec/x86/mpegvideoencdsp.asm
@@ -4,92 +4,151 @@
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************
%include "libavutil/x86/x86util.asm"
-SECTION .text
+SECTION_RODATA
-INIT_MMX mmx
+cextern pw_1
+
+SECTION .text
; int ff_pix_sum16_mmx(uint8_t *pix, int line_size)
-cglobal pix_sum16, 2, 3
+; %1 = number of loops
+; %2 = number of GPRs used
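+; %3 = number of lines advanced per loop iteration (unused for mmx)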
+%macro PIX_SUM16 3
+cglobal pix_sum16, 2, %2, 6
movsxdifnidn r1, r1d
- mov r2, r1
- neg r2
- shl r2, 4
- sub r0, r2
- pxor m7, m7
- pxor m6, m6
+ mov r2, %1
+%if mmsize == 16
+ lea r3, [r1*3]
+%endif
+%if notcpuflag(xop)
+ pxor m5, m5
+%endif
+ pxor m4, m4
.loop:
- mova m0, [r0+r2+0]
- mova m1, [r0+r2+0]
- mova m2, [r0+r2+8]
- mova m3, [r0+r2+8]
- punpcklbw m0, m7
- punpckhbw m1, m7
- punpcklbw m2, m7
- punpckhbw m3, m7
+%if cpuflag(xop)
+ vphaddubq m0, [r0]
+ vphaddubq m1, [r0+r1]
+ vphaddubq m2, [r0+r1*2]
+ vphaddubq m3, [r0+r3]
+%else
+ mova m0, [r0]
+%if mmsize == 8
+ mova m1, [r0+8]
+%if cpuflag(mmxext)
+ mova m2, [r0+r1]
+ mova m3, [r0+r1+8]
+%endif
+%else ; sse2
+ mova m1, [r0+r1]
+ mova m2, [r0+r1*2]
+ mova m3, [r0+r3]
+%endif
+%if cpuflag(mmxext)
+ psadbw m0, m5
+ psadbw m1, m5
+ psadbw m2, m5
+ psadbw m3, m5
+%else ; mmx
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%endif ; cpuflag(mmxext)
+%endif ; cpuflag(xop)
paddw m1, m0
paddw m3, m2
paddw m3, m1
- paddw m6, m3
- add r2, r1
- js .loop
- mova m5, m6
- psrlq m6, 32
- paddw m6, m5
- mova m5, m6
- psrlq m6, 16
- paddw m6, m5
- movd eax, m6
- and eax, 0xffff
+ paddw m4, m3
+%if cpuflag(mmxext)
+ lea r0, [r0+r1*%3]
+%else
+ add r0, r1
+%endif
+ dec r2
+ jne .loop
+%if mmsize == 16
+ pshufd m0, m4, q0032
+ paddd m4, m0
+%elif notcpuflag(mmxext)
+ HADDW m4, m5
+%endif
+ movd eax, m4
RET
+%endmacro
+%if ARCH_X86_32
INIT_MMX mmx
+PIX_SUM16 16, 3, 0
+INIT_MMX mmxext
+PIX_SUM16 8, 4, 2
+%endif
+INIT_XMM sse2
+PIX_SUM16 4, 4, 4
+%if HAVE_XOP_EXTERNAL
+INIT_XMM xop
+PIX_SUM16 4, 4, 4
+%endif
+
; int ff_pix_norm1_mmx(uint8_t *pix, int line_size)
-cglobal pix_norm1, 2, 4
+; %1 = number of xmm registers used
+; %2 = number of loops
+%macro PIX_NORM1 2
+cglobal pix_norm1, 2, 3, %1
movsxdifnidn r1, r1d
- mov r2, 16
+ mov r2, %2
pxor m0, m0
- pxor m7, m7
+ pxor m5, m5
.loop:
mova m2, [r0+0]
+%if mmsize == 8
mova m3, [r0+8]
- mova m1, m2
- punpckhbw m1, m0
+%else
+ mova m3, [r0+r1]
+%endif
+ punpckhbw m1, m2, m0
punpcklbw m2, m0
- mova m4, m3
- punpckhbw m3, m0
- punpcklbw m4, m0
+ punpckhbw m4, m3, m0
+ punpcklbw m3, m0
pmaddwd m1, m1
pmaddwd m2, m2
pmaddwd m3, m3
pmaddwd m4, m4
paddd m2, m1
paddd m4, m3
- paddd m7, m2
+ paddd m5, m2
+ paddd m5, m4
+%if mmsize == 8
add r0, r1
- paddd m7, m4
+%else
+ lea r0, [r0+r1*2]
+%endif
dec r2
jne .loop
- mova m1, m7
- psrlq m7, 32
- paddd m1, m7
- movd eax, m1
+ HADDD m5, m1
+ movd eax, m5
RET
+%endmacro
+
+INIT_MMX mmx
+PIX_NORM1 0, 16
+INIT_XMM sse2
+PIX_NORM1 6, 8
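
For reference, pix_sum16() returns the sum of all 256 pixels of a 16x16 block. The SIMD versions above compute it with psadbw against a zeroed register (mmxext/sse2) or vphaddubq (xop) instead of the old unpack-and-paddw sequence. A minimal scalar C sketch of the same computation; the pix_sum16_c name is illustrative and not part of this patch:

    #include <stdint.h>

    /* Scalar model of ff_pix_sum16: sum all pixels of a 16x16 block.
     * Name and signature are illustrative, mirroring the asm prototype. */
    static int pix_sum16_c(const uint8_t *pix, int line_size)
    {
        int sum = 0;
        for (int i = 0; i < 16; i++) {
            for (int j = 0; j < 16; j++)
                sum += pix[j];   /* bytes widen implicitly to int */
            pix += line_size;    /* advance one row */
        }
        return sum;
    }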
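pix_norm1() accumulates the sum of squared pixel values of the block, which the asm computes by unpacking bytes to words, squaring and pair-summing with pmaddwd, then reducing with HADDD. A scalar sketch under the same assumptions:

    /* Scalar model of ff_pix_norm1: sum of squared pixels of a 16x16
     * block. 256 * 255^2 = 16646400, so a 32-bit accumulator cannot
     * overflow. Name is illustrative, not part of this patch. */
    static int pix_norm1_c(const uint8_t *pix, int line_size)
    {
        int sum = 0;
        for (int i = 0; i < 16; i++) {
            for (int j = 0; j < 16; j++)
                sum += pix[j] * pix[j];
            pix += line_size;
        }
        return sum;
    }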