summary | refs | log | tree | commit | diff
path: root/libavfilter/x86
diff options
context:
space:
mode:
author: James Almer <jamrial@gmail.com> 2015-07-20 00:50:52 -0300
committer: James Almer <jamrial@gmail.com> 2015-07-20 13:18:05 -0300
commit: e3851169eedce6c90534ca41edf3e05e2576453e (patch)
tree: 4b73a180aa1abadcdfe49d631ca2f9802bea80d9 /libavfilter/x86
parent: e1778fb657ca56da517cbe2296317178484b05f9 (diff)
x86/vf_ssim: add ff_ssim_4x4_line_xop
~20% faster than ssse3. Also enabled for x86_32.

Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com>
Signed-off-by: James Almer <jamrial@gmail.com>
Diffstat (limited to 'libavfilter/x86')
-rw-r--r-- libavfilter/x86/vf_ssim.asm | 62
-rw-r--r-- libavfilter/x86/vf_ssim_init.c | 5
2 files changed, 64 insertions, 3 deletions
diff --git a/libavfilter/x86/vf_ssim.asm b/libavfilter/x86/vf_ssim.asm
index 66619877e2..3293e66701 100644
--- a/libavfilter/x86/vf_ssim.asm
+++ b/libavfilter/x86/vf_ssim.asm
@@ -30,16 +30,50 @@ ssim_c2: times 4 dd 235963 ;(.03*.03*255*255*64*63 + .5)
SECTION .text
+%macro SSIM_4X4_LINE 1
%if ARCH_X86_64
-
-INIT_XMM ssse3
-cglobal ssim_4x4_line, 6, 8, 16, buf, buf_stride, ref, ref_stride, sums, w, buf_stride3, ref_stride3
+cglobal ssim_4x4_line, 6, 8, %1, buf, buf_stride, ref, ref_stride, sums, w, buf_stride3, ref_stride3
+%else
+cglobal ssim_4x4_line, 5, 7, %1, buf, buf_stride, ref, ref_stride, sums, buf_stride3, ref_stride3
+%define wd r5mp
+%endif
lea ref_stride3q, [ref_strideq*3]
lea buf_stride3q, [buf_strideq*3]
+%if notcpuflag(xop)
pxor m7, m7
mova m15, [pw_1]
+%endif
.loop:
+%if cpuflag(xop)
+ pmovzxbw m0, [bufq+buf_strideq*0]
+ pmovzxbw m1, [refq+ref_strideq*0]
+ pmaddwd m4, m0, m0
+ pmaddwd m6, m0, m1
+ pmovzxbw m2, [bufq+buf_strideq*1]
+ vpmadcswd m4, m1, m1, m4
+ pmovzxbw m3, [refq+ref_strideq*1]
+ paddw m0, m2
+ vpmadcswd m4, m2, m2, m4
+ vpmadcswd m6, m2, m3, m6
+ paddw m1, m3
+ vpmadcswd m4, m3, m3, m4
+
+ pmovzxbw m2, [bufq+buf_strideq*2]
+ pmovzxbw m3, [refq+ref_strideq*2]
+ vpmadcswd m4, m2, m2, m4
+ vpmadcswd m6, m2, m3, m6
+ pmovzxbw m5, [bufq+buf_stride3q]
+ pmovzxbw m7, [refq+ref_stride3q]
+ vpmadcswd m4, m3, m3, m4
+ vpmadcswd m6, m5, m7, m6
+ paddw m0, m2
+ paddw m1, m3
+ vpmadcswd m4, m5, m5, m4
+ paddw m0, m5
+ paddw m1, m7
+ vpmadcswd m4, m7, m7, m4
+%else
movh m0, [bufq+buf_strideq*0] ; a1
movh m1, [refq+ref_strideq*0] ; b1
movh m2, [bufq+buf_strideq*1] ; a2
@@ -85,12 +119,25 @@ cglobal ssim_4x4_line, 6, 8, 16, buf, buf_stride, ref, ref_stride, sums, w, buf_
paddd m4, m9
paddd m6, m14
paddd m4, m12
+%endif
; m0 = [word] s1 a,a,a,a,b,b,b,b
; m1 = [word] s2 a,a,a,a,b,b,b,b
; m4 = [dword] ss a,a,b,b
; m6 = [dword] s12 a,a,b,b
+%if cpuflag(xop)
+ vphaddwq m0, m0 ; [dword] s1 a, 0, b, 0
+ vphaddwq m1, m1 ; [dword] s2 a, 0, b, 0
+ vphadddq m4, m4 ; [dword] ss a, 0, b, 0
+ vphadddq m6, m6 ; [dword] s12 a, 0, b, 0
+ punpckhdq m2, m0, m1 ; [dword] s1 b, s2 b, 0, 0
+ punpckldq m0, m1 ; [dword] s1 a, s2 a, 0, 0
+ punpckhdq m3, m4, m6 ; [dword] ss b, s12 b, 0, 0
+ punpckldq m4, m6 ; [dword] ss a, s12 a, 0, 0
+ punpcklqdq m1, m2, m3 ; [dword] b s1, s2, ss, s12
+ punpcklqdq m0, m4 ; [dword] a s1, s2, ss, s12
+%else
pmaddwd m0, m15 ; [dword] s1 a,a,b,b
pmaddwd m1, m15 ; [dword] s2 a,a,b,b
phaddd m0, m4 ; [dword] s1 a, b, ss a, b
@@ -99,6 +146,7 @@ cglobal ssim_4x4_line, 6, 8, 16, buf, buf_stride, ref, ref_stride, sums, w, buf_
punpckldq m0, m1 ; [dword] s1 a, s2 a, s1 b, s2 b
punpckhqdq m1, m0, m2 ; [dword] b s1, s2, ss, s12
punpcklqdq m0, m2 ; [dword] a s1, s2, ss, s12
+%endif
mova [sumsq+ 0], m0
mova [sumsq+mmsize], m1
@@ -109,7 +157,15 @@ cglobal ssim_4x4_line, 6, 8, 16, buf, buf_stride, ref, ref_stride, sums, w, buf_
sub wd, mmsize/8
jg .loop
RET
+%endmacro
+%if ARCH_X86_64
+INIT_XMM ssse3
+SSIM_4X4_LINE 16
+%endif
+%if HAVE_XOP_EXTERNAL
+INIT_XMM xop
+SSIM_4X4_LINE 8
%endif
INIT_XMM sse4
diff --git a/libavfilter/x86/vf_ssim_init.c b/libavfilter/x86/vf_ssim_init.c
index 9514b25ee3..599c928403 100644
--- a/libavfilter/x86/vf_ssim_init.c
+++ b/libavfilter/x86/vf_ssim_init.c
@@ -25,6 +25,9 @@
void ff_ssim_4x4_line_ssse3(const uint8_t *buf, ptrdiff_t buf_stride,
const uint8_t *ref, ptrdiff_t ref_stride,
int (*sums)[4], int w);
+void ff_ssim_4x4_line_xop (const uint8_t *buf, ptrdiff_t buf_stride,
+ const uint8_t *ref, ptrdiff_t ref_stride,
+ int (*sums)[4], int w);
float ff_ssim_end_line_sse4(const int (*sum0)[4], const int (*sum1)[4], int w);
void ff_ssim_init_x86(SSIMDSPContext *dsp)
@@ -35,4 +38,6 @@ void ff_ssim_init_x86(SSIMDSPContext *dsp)
dsp->ssim_4x4_line = ff_ssim_4x4_line_ssse3;
if (EXTERNAL_SSE4(cpu_flags))
dsp->ssim_end_line = ff_ssim_end_line_sse4;
+ if (EXTERNAL_XOP(cpu_flags))
+ dsp->ssim_4x4_line = ff_ssim_4x4_line_xop;
}