summary | refs | log | tree | commit | diff
path: root/libavcodec/x86/vp9mc.asm
diff options
context:
space:
mode:
Diffstat (limited to 'libavcodec/x86/vp9mc.asm')
-rw-r--r-- libavcodec/x86/vp9mc.asm 105
1 files changed, 79 insertions, 26 deletions
diff --git a/libavcodec/x86/vp9mc.asm b/libavcodec/x86/vp9mc.asm
index 15e93ea6cb..f64161b2c2 100644
--- a/libavcodec/x86/vp9mc.asm
+++ b/libavcodec/x86/vp9mc.asm
@@ -3,20 +3,20 @@
;*
;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -45,6 +45,13 @@ times 8 dw %7
times 8 dw %8
%endmacro
+%macro F8_16BPP_TAPS 8 ; args: the eight taps of one filter; emits 4 rows of 16 words
+times 8 dw %1, %2 ; row 0: taps 1 and 2 as a word pair, repeated 8 times
+times 8 dw %3, %4 ; row 1: taps 3 and 4, same interleaved layout
+times 8 dw %5, %6 ; row 2: taps 5 and 6
+times 8 dw %7, %8 ; row 3: taps 7 and 8
+%endmacro
+
%macro FILTER 1
const filters_%1 ; smooth
F8_TAPS -3, -1, 32, 64, 38, 1, -3, 0
@@ -102,12 +109,15 @@ FILTER ssse3
%define F8_TAPS F8_SSE2_TAPS
; int16_t ff_filters_sse2[3][15][8][8]
FILTER sse2
+%define F8_TAPS F8_16BPP_TAPS
+; int16_t ff_filters_16bpp[3][15][4][16]
+FILTER 16bpp
SECTION .text
%macro filter_sse2_h_fn 1
%assign %%px mmsize/2
-cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 15, dst, src, dstride, sstride, h, filtery
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 15, dst, dstride, src, sstride, h, filtery
pxor m5, m5
mova m6, [pw_64]
mova m7, [filteryq+ 0]
@@ -192,7 +202,7 @@ filter_sse2_h_fn avg
%macro filter_h_fn 1
%assign %%px mmsize/2
-cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 11, dst, src, dstride, sstride, h, filtery
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 11, dst, dstride, src, sstride, h, filtery
mova m6, [pw_256]
mova m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
@@ -253,7 +263,7 @@ filter_h_fn avg
%if ARCH_X86_64
%macro filter_hx2_fn 1
%assign %%px mmsize
-cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 14, dst, src, dstride, sstride, h, filtery
+cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 14, dst, dstride, src, sstride, h, filtery
mova m13, [pw_256]
mova m8, [filteryq+ 0]
mova m9, [filteryq+32]
@@ -315,9 +325,9 @@ filter_hx2_fn avg
%macro filter_sse2_v_fn 1
%assign %%px mmsize/2
%if ARCH_X86_64
-cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 15, dst, src, dstride, sstride, h, filtery, src4, sstride3
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 15, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
-cglobal vp9_%1_8tap_1d_v_ %+ %%px, 4, 7, 15, dst, src, dstride, sstride, filtery, src4, sstride3
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 15, dst, dstride, src, sstride, filtery, src4, sstride3
mov filteryq, r5mp
%define hd r4mp
%endif
@@ -413,9 +423,9 @@ filter_sse2_v_fn avg
%macro filter_v_fn 1
%assign %%px mmsize/2
%if ARCH_X86_64
-cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 11, dst, src, dstride, sstride, h, filtery, src4, sstride3
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
-cglobal vp9_%1_8tap_1d_v_ %+ %%px, 4, 7, 11, dst, src, dstride, sstride, filtery, src4, sstride3
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3
mov filteryq, r5mp
%define hd r4mp
%endif
@@ -486,7 +496,7 @@ filter_v_fn avg
%macro filter_vx2_fn 1
%assign %%px mmsize
-cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 14, dst, src, dstride, sstride, h, filtery, src4, sstride3
+cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3
mova m13, [pw_256]
lea sstride3q, [sstrideq*3]
lea src4q, [srcq+sstrideq]
@@ -552,7 +562,7 @@ filter_vx2_fn avg
%endif ; ARCH_X86_64
-%macro fpel_fn 6
+%macro fpel_fn 6-8 0, 4
%if %2 == 4
%define %%srcfn movh
%define %%dstfn movh
@@ -561,29 +571,57 @@ filter_vx2_fn avg
%define %%dstfn mova
%endif
+%if %7 == 8
+%define %%pavg pavgb
+%define %%szsuf _8
+%elif %7 == 16
+%define %%pavg pavgw
+%define %%szsuf _16
+%else
+%define %%szsuf
+%endif
+
%if %2 <= mmsize
-cglobal vp9_%1%2, 5, 7, 4, dst, src, dstride, sstride, h, dstride3, sstride3
+cglobal vp9_%1%2 %+ %%szsuf, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
lea sstride3q, [sstrideq*3]
lea dstride3q, [dstrideq*3]
%else
-cglobal vp9_%1%2, 5, 5, 4, dst, src, dstride, sstride, h
+cglobal vp9_%1%2 %+ %%szsuf, 5, 5, %8, dst, dstride, src, sstride, h
%endif
.loop:
%%srcfn m0, [srcq]
%%srcfn m1, [srcq+s%3]
%%srcfn m2, [srcq+s%4]
%%srcfn m3, [srcq+s%5]
+%if %2/mmsize == 8
+ %%srcfn m4, [srcq+mmsize*4]
+ %%srcfn m5, [srcq+mmsize*5]
+ %%srcfn m6, [srcq+mmsize*6]
+ %%srcfn m7, [srcq+mmsize*7]
+%endif
lea srcq, [srcq+sstrideq*%6]
%ifidn %1, avg
- pavgb m0, [dstq]
- pavgb m1, [dstq+d%3]
- pavgb m2, [dstq+d%4]
- pavgb m3, [dstq+d%5]
+ %%pavg m0, [dstq]
+ %%pavg m1, [dstq+d%3]
+ %%pavg m2, [dstq+d%4]
+ %%pavg m3, [dstq+d%5]
+%if %2/mmsize == 8
+ %%pavg m4, [dstq+mmsize*4]
+ %%pavg m5, [dstq+mmsize*5]
+ %%pavg m6, [dstq+mmsize*6]
+ %%pavg m7, [dstq+mmsize*7]
+%endif
%endif
%%dstfn [dstq], m0
%%dstfn [dstq+d%3], m1
%%dstfn [dstq+d%4], m2
%%dstfn [dstq+d%5], m3
+%if %2/mmsize == 8
+ %%dstfn [dstq+mmsize*4], m4
+ %%dstfn [dstq+mmsize*5], m5
+ %%dstfn [dstq+mmsize*6], m6
+ %%dstfn [dstq+mmsize*7], m7
+%endif
lea dstq, [dstq+dstrideq*%6]
sub hd, %6
jnz .loop
@@ -598,23 +636,38 @@ INIT_MMX mmx
fpel_fn put, 4, strideq, strideq*2, stride3q, 4
fpel_fn put, 8, strideq, strideq*2, stride3q, 4
INIT_MMX mmxext
-fpel_fn avg, 4, strideq, strideq*2, stride3q, 4
-fpel_fn avg, 8, strideq, strideq*2, stride3q, 4
+fpel_fn avg, 4, strideq, strideq*2, stride3q, 4, 8
+fpel_fn avg, 8, strideq, strideq*2, stride3q, 4, 8
INIT_XMM sse
fpel_fn put, 16, strideq, strideq*2, stride3q, 4
fpel_fn put, 32, mmsize, strideq, strideq+mmsize, 2
fpel_fn put, 64, mmsize, mmsize*2, mmsize*3, 1
+fpel_fn put, 128, mmsize, mmsize*2, mmsize*3, 1, 0, 8
INIT_XMM sse2
-fpel_fn avg, 16, strideq, strideq*2, stride3q, 4
-fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2
-fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1
+fpel_fn avg, 16, strideq, strideq*2, stride3q, 4, 8
+fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2, 8
+fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1, 8
INIT_YMM avx
fpel_fn put, 32, strideq, strideq*2, stride3q, 4
fpel_fn put, 64, mmsize, strideq, strideq+mmsize, 2
+fpel_fn put, 128, mmsize, mmsize*2, mmsize*3, 1
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+fpel_fn avg, 32, strideq, strideq*2, stride3q, 4, 8
+fpel_fn avg, 64, mmsize, strideq, strideq+mmsize, 2, 8
+%endif
+INIT_MMX mmxext
+fpel_fn avg, 8, strideq, strideq*2, stride3q, 4, 16
+INIT_XMM sse2
+fpel_fn avg, 16, strideq, strideq*2, stride3q, 4, 16
+fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2, 16
+fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1, 16
+fpel_fn avg, 128, mmsize, mmsize*2, mmsize*3, 1, 16, 8
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
-fpel_fn avg, 32, strideq, strideq*2, stride3q, 4
-fpel_fn avg, 64, mmsize, strideq, strideq+mmsize, 2
+fpel_fn avg, 32, strideq, strideq*2, stride3q, 4, 16
+fpel_fn avg, 64, mmsize, strideq, strideq+mmsize, 2, 16
+fpel_fn avg, 128, mmsize, mmsize*2, mmsize*3, 1, 16
%endif
%undef s16
%undef d16