From 923a324174c2d943b8d21d0b77fc0d0c847abca0 Mon Sep 17 00:00:00 2001 From: Martin Vignali Date: Sat, 24 Mar 2018 20:16:11 +0100 Subject: swscale/rgb : add X86 SIMD (SSSE3) for shuffle_bytes_2103 and shuffle_bytes_0321 --- libswscale/x86/Makefile | 1 + libswscale/x86/rgb2rgb.c | 10 +++++- libswscale/x86/rgb_2_rgb.asm | 80 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 90 insertions(+), 1 deletion(-) create mode 100644 libswscale/x86/rgb_2_rgb.asm diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile index b50c7f265a..f317d5dd9b 100644 --- a/libswscale/x86/Makefile +++ b/libswscale/x86/Makefile @@ -11,3 +11,4 @@ OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o X86ASM-OBJS += x86/input.o \ x86/output.o \ x86/scale.o \ + x86/rgb_2_rgb.o \ diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c index ffd12e1609..5caabf03ed 100644 --- a/libswscale/x86/rgb2rgb.c +++ b/libswscale/x86/rgb2rgb.c @@ -144,11 +144,14 @@ DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset); #endif /* HAVE_INLINE_ASM */ +void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size); +void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size); + av_cold void rgb2rgb_init_x86(void) { -#if HAVE_INLINE_ASM int cpu_flags = av_get_cpu_flags(); +#if HAVE_INLINE_ASM if (INLINE_MMX(cpu_flags)) rgb2rgb_init_mmx(); if (INLINE_AMD3DNOW(cpu_flags)) @@ -160,4 +163,9 @@ av_cold void rgb2rgb_init_x86(void) if (INLINE_AVX(cpu_flags)) rgb2rgb_init_avx(); #endif /* HAVE_INLINE_ASM */ + + if (EXTERNAL_SSSE3(cpu_flags)) { + shuffle_bytes_0321 = ff_shuffle_bytes_0321_ssse3; + shuffle_bytes_2103 = ff_shuffle_bytes_2103_ssse3; + } } diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm new file mode 100644 index 0000000000..e4104d9be2 --- /dev/null +++ b/libswscale/x86/rgb_2_rgb.asm @@ -0,0 +1,80 @@ +;****************************************************************************** +;* Copyright Nick Kurshev +;* 
Copyright Michael (michaelni@gmx.at)
+;* Copyright 2018 Jokyo Images
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pb_shuffle2103: db 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15
+pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
+
+SECTION .text
+
+;------------------------------------------------------------------------------
+; shuffle_bytes_## (const uint8_t *src, uint8_t *dst, int src_size)
+;------------------------------------------------------------------------------
+; %1-4 index shuffle: output byte k of each 4-byte group = input byte %(k+1)
+%macro SHUFFLE_BYTES 4
+cglobal shuffle_bytes_%1%2%3%4, 3, 5, 2, src, dst, w, tmp, x
+    VBROADCASTI128    m0, [pb_shuffle%1%2%3%4] ; byte-index mask used by pshufb
+    movsxdifnidn      wq, wd
+    mov               xq, wq                   ; xq = src_size
+
+    add             srcq, wq                   ; point src and dst at their ends
+    add             dstq, wq
+    neg               wq                       ; wq = -src_size, counts up to 0
+
+;calc scalar loop: src_size & (mmsize-4) bytes, 4 per iteration
+    and               xq, mmsize-4 ; NOTE(review): low 2 bits dropped, presumably src_size % 4 == 0
+    je .check_simd                 ; no scalar part; src_size may still be 0
+
+.loop_scalar:
+    mov             tmpb, [srcq + wq + %1]
+    mov  [dstq + wq + 0], tmpb
+    mov             tmpb, [srcq + wq + %2]
+    mov  [dstq + wq + 1], tmpb
+    mov             tmpb, [srcq + wq + %3]
+    mov  [dstq + wq + 2], tmpb
+    mov             tmpb, [srcq + wq + %4]
+    mov  [dstq + wq + 3], tmpb
+    add               wq, 4
+    sub               xq, 4
+    jg .loop_scalar
+
+.check_simd: ;both paths land here: skip SIMD if nothing is left (src_size < mmsize, or 0)
+    cmp               wq, 0
+    jge .end
+
+.loop_simd:
+    movu              m1, [srcq + wq]
+    pshufb            m1, m0                   ; reorder the bytes of mmsize/4 groups at once
+    movu     [dstq + wq], m1
+    add               wq, mmsize
+    jl .loop_simd                              ; until wq reaches 0 (end of buffers)
+
+.end:
+    RET
+%endmacro
+
+INIT_XMM ssse3
+SHUFFLE_BYTES 2, 1, 0, 3
+SHUFFLE_BYTES 0, 3, 2, 1
--
cgit v1.2.3