summaryrefslogtreecommitdiff
path: root/libswscale/x86/output.asm
diff options
context:
space:
mode:
Diffstat (limited to 'libswscale/x86/output.asm')
-rw-r--r--libswscale/x86/output.asm172
1 files changed, 94 insertions, 78 deletions
diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm
index 71cd72edf4..db3e9934f8 100644
--- a/libswscale/x86/output.asm
+++ b/libswscale/x86/output.asm
@@ -3,20 +3,20 @@
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;* Kieran Kunhya <kieran@kunhya.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -54,76 +54,8 @@ SECTION .text
; int32_t if $output_size is 16. $filter is 12 bits. $filterSize is a multiple
; of 2. $offset is either 0 or 3. $dither holds 8 values.
;-----------------------------------------------------------------------------
-
-%macro yuv2planeX_fn 3
-
-%if ARCH_X86_32
-%define cntr_reg fltsizeq
-%define movsx mov
-%else
-%define cntr_reg r7
-%define movsx movsxd
-%endif
-
-cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
-%if %1 == 8 || %1 == 9 || %1 == 10
- pxor m6, m6
-%endif ; %1 == 8/9/10
-
-%if %1 == 8
-%if ARCH_X86_32
-%assign pad 0x2c - (stack_offset & 15)
- SUB rsp, pad
-%define m_dith m7
-%else ; x86-64
-%define m_dith m9
-%endif ; x86-32
-
- ; create registers holding dither
- movq m_dith, [ditherq] ; dither
- test offsetd, offsetd
- jz .no_rot
-%if mmsize == 16
- punpcklqdq m_dith, m_dith
-%endif ; mmsize == 16
- PALIGNR m_dith, m_dith, 3, m0
-.no_rot:
-%if mmsize == 16
- punpcklbw m_dith, m6
-%if ARCH_X86_64
- punpcklwd m8, m_dith, m6
- pslld m8, 12
-%else ; x86-32
- punpcklwd m5, m_dith, m6
- pslld m5, 12
-%endif ; x86-32/64
- punpckhwd m_dith, m6
- pslld m_dith, 12
-%if ARCH_X86_32
- mova [rsp+ 0], m5
- mova [rsp+16], m_dith
-%endif
-%else ; mmsize == 8
- punpcklbw m5, m_dith, m6
- punpckhbw m_dith, m6
- punpcklwd m4, m5, m6
- punpckhwd m5, m6
- punpcklwd m3, m_dith, m6
- punpckhwd m_dith, m6
- pslld m4, 12
- pslld m5, 12
- pslld m3, 12
- pslld m_dith, 12
- mova [rsp+ 0], m4
- mova [rsp+ 8], m5
- mova [rsp+16], m3
- mova [rsp+24], m_dith
-%endif ; mmsize == 8/16
-%endif ; %1 == 8
-
- xor r5, r5
-
-.pixelloop:
+%macro yuv2planeX_mainloop 2
+.pixelloop_%2:
%assign %%i 0
; the rep here is for the 8-bit output MMX case, where dither covers
; 8 pixels but we can only handle 2 pixels per register, and thus 4
@@ -150,7 +82,7 @@ cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
mova m2, m1
%endif ; %1 == 8/9/10/16
movsx cntr_reg, fltsizem
-.filterloop_ %+ %%i:
+.filterloop_%2_ %+ %%i:
; input pixels
mov r6, [srcq+gprsize*cntr_reg-2*gprsize]
%if %1 == 16
@@ -197,7 +129,7 @@ cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
%endif ; %1 == 8/9/10/16
sub cntr_reg, 2
- jg .filterloop_ %+ %%i
+ jg .filterloop_%2_ %+ %%i
%if %1 == 16
psrad m2, 31 - %1
@@ -224,7 +156,7 @@ cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
%endif ; mmxext/sse2/sse4/avx
pminsw m2, [yuv2yuvX_%1_upper]
%endif ; %1 == 9/10/16
- mova [dstq+r5*2], m2
+ mov%2 [dstq+r5*2], m2
%endif ; %1 == 8/9/10/16
add r5, mmsize/2
@@ -232,7 +164,87 @@ cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
%assign %%i %%i+2
%endrep
- jg .pixelloop
+ jg .pixelloop_%2
+%endmacro
+
+%macro yuv2planeX_fn 3
+
+%if ARCH_X86_32
+%define cntr_reg fltsizeq
+%define movsx mov
+%else
+%define cntr_reg r7
+%define movsx movsxd
+%endif
+
+cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
+%if %1 == 8 || %1 == 9 || %1 == 10
+ pxor m6, m6
+%endif ; %1 == 8/9/10
+
+%if %1 == 8
+%if ARCH_X86_32
+%assign pad 0x2c - (stack_offset & 15)
+ SUB rsp, pad
+%define m_dith m7
+%else ; x86-64
+%define m_dith m9
+%endif ; x86-32
+
+ ; create registers holding dither
+ movq m_dith, [ditherq] ; dither
+ test offsetd, offsetd
+ jz .no_rot
+%if mmsize == 16
+ punpcklqdq m_dith, m_dith
+%endif ; mmsize == 16
+ PALIGNR m_dith, m_dith, 3, m0
+.no_rot:
+%if mmsize == 16
+ punpcklbw m_dith, m6
+%if ARCH_X86_64
+ punpcklwd m8, m_dith, m6
+ pslld m8, 12
+%else ; x86-32
+ punpcklwd m5, m_dith, m6
+ pslld m5, 12
+%endif ; x86-32/64
+ punpckhwd m_dith, m6
+ pslld m_dith, 12
+%if ARCH_X86_32
+ mova [rsp+ 0], m5
+ mova [rsp+16], m_dith
+%endif
+%else ; mmsize == 8
+ punpcklbw m5, m_dith, m6
+ punpckhbw m_dith, m6
+ punpcklwd m4, m5, m6
+ punpckhwd m5, m6
+ punpcklwd m3, m_dith, m6
+ punpckhwd m_dith, m6
+ pslld m4, 12
+ pslld m5, 12
+ pslld m3, 12
+ pslld m_dith, 12
+ mova [rsp+ 0], m4
+ mova [rsp+ 8], m5
+ mova [rsp+16], m3
+ mova [rsp+24], m_dith
+%endif ; mmsize == 8/16
+%endif ; %1 == 8
+
+ xor r5, r5
+
+%if mmsize == 8 || %1 == 8
+ yuv2planeX_mainloop %1, a
+%else ; mmsize == 16
+ test dstq, 15
+ jnz .unaligned
+ yuv2planeX_mainloop %1, a
+ REP_RET
+.unaligned:
+ yuv2planeX_mainloop %1, u
+%endif ; mmsize == 8/16
%if %1 == 8
%if ARCH_X86_32
@@ -264,10 +276,12 @@ yuv2planeX_fn 9, 7, 5
yuv2planeX_fn 10, 7, 5
yuv2planeX_fn 16, 8, 5
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2planeX_fn 8, 10, 7
yuv2planeX_fn 9, 7, 5
yuv2planeX_fn 10, 7, 5
+%endif
; %1=outout-bpc, %2=alignment (u/a)
%macro yuv2plane1_mainloop 2
@@ -402,8 +416,10 @@ yuv2plane1_fn 16, 6, 3
INIT_XMM sse4
yuv2plane1_fn 16, 5, 3
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2plane1_fn 8, 5, 5
yuv2plane1_fn 9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 5, 3
+%endif