summaryrefslogtreecommitdiff
path: root/libavfilter/x86
diff options
context:
space:
mode:
author Paul B Mahol <onemda@gmail.com> 2018-04-30 12:01:07 +0200
committer Paul B Mahol <onemda@gmail.com> 2018-05-02 23:58:21 +0200
commit 6d7c63588c81ba61b75701702b8680bd0063f36c (patch)
tree 8afa4754f11330ea27ee3fa31071bae5176baeb6 /libavfilter/x86
parent a150b2e3a099fd539ecc6664050fd20617ce223c (diff)
avfilter/vf_overlay: add x86 SIMD
Specifically for the yuv444, yuv422 and yuv420 formats, when the main stream has no alpha and the alpha is straight.

Signed-off-by: Paul B Mahol <onemda@gmail.com>
Diffstat (limited to 'libavfilter/x86')
-rw-r--r-- libavfilter/x86/Makefile 2
-rw-r--r-- libavfilter/x86/vf_overlay.asm 144
-rw-r--r-- libavfilter/x86/vf_overlay_init.c 63
3 files changed, 209 insertions, 0 deletions
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index f60de3b73b..b484c8bd1c 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -13,6 +13,7 @@ OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_tinterlace_init.o
OBJS-$(CONFIG_LIMITER_FILTER) += x86/vf_limiter_init.o
OBJS-$(CONFIG_MASKEDMERGE_FILTER) += x86/vf_maskedmerge_init.o
OBJS-$(CONFIG_NOISE_FILTER) += x86/vf_noise.o
+OBJS-$(CONFIG_OVERLAY_FILTER) += x86/vf_overlay_init.o
OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7_init.o
OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr_init.o
OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup_init.o
@@ -41,6 +42,7 @@ X86ASM-OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet.o
X86ASM-OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace.o
X86ASM-OBJS-$(CONFIG_LIMITER_FILTER) += x86/vf_limiter.o
X86ASM-OBJS-$(CONFIG_MASKEDMERGE_FILTER) += x86/vf_maskedmerge.o
+X86ASM-OBJS-$(CONFIG_OVERLAY_FILTER) += x86/vf_overlay.o
X86ASM-OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7.o
X86ASM-OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr.o
X86ASM-OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup.o
diff --git a/libavfilter/x86/vf_overlay.asm b/libavfilter/x86/vf_overlay.asm
new file mode 100644
index 0000000000..14ec60ca34
--- /dev/null
+++ b/libavfilter/x86/vf_overlay.asm
@@ -0,0 +1,144 @@
+;*****************************************************************************
+;* x86-optimized functions for overlay filter
+;*
+;* Copyright (C) 2018 Paul B Mahol
+;* Copyright (C) 2018 Henrik Gramner
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;*****************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pb_1: times 16 db 1 ; 16 bytes of 1: pmaddubsw multiplier in overlay_row_20, so each word becomes the sum of two adjacent alpha bytes
+pw_128: times 8 dw 128 ; 8 words of 128: bias added to the blend sum before the pw_257 multiply
+pw_255: times 8 dw 255 ; 8 words of 255: XORed with an 8-bit alpha to form 255 - alpha; also the low-byte mask in overlay_row_22
+pw_257: times 8 dw 257 ; 8 words of 257: pmulhuw multiplier, giving (v * 257) >> 16
+
+SECTION .text
+
+INIT_XMM sse4 ; overlay_row_44: blend row s over row d using one alpha byte per pixel, 8 pixels per iteration (XMM registers, SSE4)
+cglobal overlay_row_44, 5, 7, 6, 0, d, da, s, a, w, r, x ; x86inc cglobal: 5 args loaded, 7 GPRs, 6 XMM regs, no stack; da and the 6th C argument are never read here
+ xor xq, xq ; x = 0: pixel index, also the return value
+ movsxdifnidn wq, wd ; x86inc helper: sign-extend the 32-bit w argument where it is not already full width
+ mov rq, wq ; r = w
+ and rq, mmsize/2 - 1 ; r = w mod (mmsize/2): pixels left over after whole vector iterations
+ cmp wq, mmsize/2 ; fewer pixels than one iteration handles?
+ jl .end ; then touch nothing and return 0
+ sub wq, rq ; w rounded down to a multiple of mmsize/2
+ mova m3, [pw_255] ; m3 = 255 in every word
+ mova m4, [pw_128] ; m4 = 128 in every word
+ mova m5, [pw_257] ; m5 = 257 in every word
+ .loop:
+ pmovzxbw m0, [sq+xq] ; m0 = 8 bytes from s, zero-extended to words
+ pmovzxbw m2, [aq+xq] ; m2 = 8 alpha bytes, zero-extended to words
+ pmovzxbw m1, [dq+xq] ; m1 = 8 bytes from d, zero-extended to words
+ pmullw m0, m2 ; m0 = s * alpha
+ pxor m2, m3 ; m2 = alpha ^ 255 = 255 - alpha (alpha fits in 8 bits)
+ pmullw m1, m2 ; m1 = d * (255 - alpha)
+ paddw m0, m4 ; m0 += 128
+ paddw m0, m1 ; m0 = s*alpha + d*(255 - alpha) + 128, at most 255*255 + 128 so it fits in 16 bits
+ pmulhuw m0, m5 ; m0 = (m0 * 257) >> 16: scales the sum back to 8 bits (fixed-point divide by 255)
+ packuswb m0, m0 ; pack words to bytes with unsigned saturation
+ movq [dq+xq], m0 ; store the low 8 bytes back to d
+ add xq, mmsize/2 ; advance by the pixels just written
+ cmp xq, wq
+ jl .loop ; repeat while x < rounded-down w
+
+ .end:
+ mov eax, xd ; return x = number of pixels processed; NOTE(review): the caller presumably blends the remaining pixels itself - confirm in vf_overlay.c
+ RET
+
+INIT_XMM sse4 ; overlay_row_22: blend row s over row d using two horizontally adjacent alpha bytes per pixel, 8 pixels per iteration (XMM registers, SSE4)
+cglobal overlay_row_22, 5, 7, 6, 0, d, da, s, a, w, r, x ; x86inc cglobal: 5 args loaded, 7 GPRs, 6 XMM regs, no stack; da and the 6th C argument are never read here
+ xor xq, xq ; x = 0: pixel index, also the return value
+ movsxdifnidn wq, wd ; x86inc helper: sign-extend the 32-bit w argument where it is not already full width
+ sub wq, 1 ; w -= 1 before rounding; NOTE(review): presumably leaves the last pixel to the caller so the paired alpha reads stay in bounds - confirm
+ mov rq, wq ; r = w - 1
+ and rq, mmsize/2 - 1 ; r = (w - 1) mod (mmsize/2)
+ cmp wq, mmsize/2 ; not enough pixels for one iteration?
+ jl .end ; then touch nothing and return 0
+ sub wq, rq ; loop bound: (w - 1) rounded down to a multiple of mmsize/2
+ mova m3, [pw_255] ; m3 = 255 in every word
+ mova m4, [pw_128] ; m4 = 128 in every word
+ mova m5, [pw_257] ; m5 = 257 in every word
+ .loop:
+ pmovzxbw m0, [sq+xq] ; m0 = 8 bytes from s, zero-extended to words
+ movu m1, [aq+2*xq] ; m1 = 16 alpha bytes (unaligned load), two per output pixel
+ pandn m2, m3, m1 ; m2 = ~m3 & m1: high byte of each word kept in place = second alpha of the pair << 8
+ psllw m1, 8 ; m1 = first alpha of the pair << 8
+ pavgw m2, m1 ; m2 = rounded average of the two alphas (still << 8)
+ pavgw m2, m1 ; averaged once more with the first alpha
+ psrlw m2, 8 ; back to an 8-bit alpha in each word
+ pmovzxbw m1, [dq+xq] ; m1 = 8 bytes from d, zero-extended to words
+ pmullw m0, m2 ; m0 = s * alpha
+ pxor m2, m3 ; m2 = alpha ^ 255 = 255 - alpha
+ pmullw m1, m2 ; m1 = d * (255 - alpha)
+ paddw m0, m4 ; m0 += 128
+ paddw m0, m1 ; m0 = s*alpha + d*(255 - alpha) + 128
+ pmulhuw m0, m5 ; m0 = (m0 * 257) >> 16: scales the sum back to 8 bits (fixed-point divide by 255)
+ packuswb m0, m0 ; pack words to bytes with unsigned saturation
+ movq [dq+xq], m0 ; store the low 8 bytes back to d
+ add xq, mmsize/2 ; advance by the pixels just written
+ cmp xq, wq
+ jl .loop ; repeat while x < loop bound
+
+ .end:
+ mov eax, xd ; return x = number of pixels processed; NOTE(review): the caller presumably blends the remaining pixels itself - confirm in vf_overlay.c
+ RET
+
+INIT_XMM sse4 ; overlay_row_20: blend row s over row d using a 2x2 block of alpha bytes (two rows, two columns) per pixel, 8 pixels per iteration (XMM registers, SSE4)
+cglobal overlay_row_20, 6, 7, 7, 0, d, da, s, a, w, r, x ; x86inc cglobal: 6 args loaded, 7 GPRs, 7 XMM regs, no stack; the 6th argument is named r (alinesize in the C prototype)
+ mov daq, aq ; the incoming da argument is discarded; da is reused as a pointer
+ add daq, rmp ; da = a + r (6th argument); rmp is x86inc's name for that argument's original location, read here before r becomes scratch
+ xor xq, xq ; x = 0: pixel index, also the return value
+ movsxdifnidn wq, wd ; x86inc helper: sign-extend the 32-bit w argument where it is not already full width
+ sub wq, 1 ; w -= 1 before rounding; NOTE(review): presumably leaves the last pixel to the caller so the paired alpha reads stay in bounds - confirm
+ mov rq, wq ; r = w - 1 (the 6th argument value is no longer needed)
+ and rq, mmsize/2 - 1 ; r = (w - 1) mod (mmsize/2)
+ cmp wq, mmsize/2 ; not enough pixels for one iteration?
+ jl .end ; then touch nothing and return 0
+ sub wq, rq ; loop bound: (w - 1) rounded down to a multiple of mmsize/2
+ mova m3, [pw_255] ; m3 = 255 in every word
+ mova m4, [pw_128] ; m4 = 128 in every word
+ mova m5, [pw_257] ; m5 = 257 in every word
+ mova m6, [pb_1] ; m6 = 1 in every byte
+ .loop:
+ pmovzxbw m0, [sq+xq] ; m0 = 8 bytes from s, zero-extended to words
+ movu m2, [aq+2*xq] ; m2 = 16 alpha bytes from the row at a (unaligned load)
+ movu m1, [daq+2*xq] ; m1 = 16 alpha bytes from the row at a + r
+ pmaddubsw m2, m6 ; each word = sum of two adjacent alpha bytes (multiply by 1, add pairs)
+ pmaddubsw m1, m6 ; same for the second row
+ paddw m2, m1 ; m2 = sum of the four alphas in each 2x2 block
+ psrlw m2, 2 ; m2 = sum / 4 (truncating): 8-bit alpha in each word
+ pmovzxbw m1, [dq+xq] ; m1 = 8 bytes from d, zero-extended to words
+ pmullw m0, m2 ; m0 = s * alpha
+ pxor m2, m3 ; m2 = alpha ^ 255 = 255 - alpha
+ pmullw m1, m2 ; m1 = d * (255 - alpha)
+ paddw m0, m4 ; m0 += 128
+ paddw m0, m1 ; m0 = s*alpha + d*(255 - alpha) + 128
+ pmulhuw m0, m5 ; m0 = (m0 * 257) >> 16: scales the sum back to 8 bits (fixed-point divide by 255)
+ packuswb m0, m0 ; pack words to bytes with unsigned saturation
+ movq [dq+xq], m0 ; store the low 8 bytes back to d
+ add xq, mmsize/2 ; advance by the pixels just written
+ cmp xq, wq
+ jl .loop ; repeat while x < loop bound
+
+ .end:
+ mov eax, xd ; return x = number of pixels processed; NOTE(review): the caller presumably blends the remaining pixels itself - confirm in vf_overlay.c
+ RET
diff --git a/libavfilter/x86/vf_overlay_init.c b/libavfilter/x86/vf_overlay_init.c
new file mode 100644
index 0000000000..fec1629829
--- /dev/null
+++ b/libavfilter/x86/vf_overlay_init.c
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2018 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/vf_overlay.h"
+
+int ff_overlay_row_44_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a, /* SSE4 row blender, one alpha byte per pixel; presumably the symbol x86inc emits for cglobal overlay_row_44 in vf_overlay.asm. da and alinesize are not read by the asm. */
+ int w, ptrdiff_t alinesize); /* returns the number of pixels it blended: a multiple of 8, possibly 0 */
+
+int ff_overlay_row_20_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a, /* SSE4 row blender, alpha averaged over a 2x2 block (rows a and a + alinesize); the da value passed in is ignored by the asm */
+ int w, ptrdiff_t alinesize); /* returns the number of pixels it blended: a multiple of 8, possibly 0 */
+
+int ff_overlay_row_22_sse4(uint8_t *d, uint8_t *da, uint8_t *s, uint8_t *a, /* SSE4 row blender, alpha taken from two horizontally adjacent bytes per pixel; da and alinesize are not read by the asm */
+ int w, ptrdiff_t alinesize); /* returns the number of pixels it blended: a multiple of 8, possibly 0 */
+
+av_cold void ff_overlay_init_x86(OverlayContext *s, int format, int alpha_format, int main_has_alpha) /* Install the SSE4 row blenders into s->blend_row[0..2] when the CPU and the format/alpha combination qualify; otherwise s is left untouched. */
+{
+ int cpu_flags = av_get_cpu_flags(); /* CPU feature flags queried at init time */
+
+ if (EXTERNAL_SSE4(cpu_flags) && /* case 1: YUV444 or GBRP, all three planes use the one-alpha-byte-per-pixel kernel */
+ (format == OVERLAY_FORMAT_YUV444 ||
+ format == OVERLAY_FORMAT_GBRP) &&
+ alpha_format == 0 && main_has_alpha == 0) { /* only alpha_format 0 and a main input without alpha (commit text: straight alpha, main stream has no alpha) */
+ s->blend_row[0] = ff_overlay_row_44_sse4;
+ s->blend_row[1] = ff_overlay_row_44_sse4;
+ s->blend_row[2] = ff_overlay_row_44_sse4;
+ }
+
+ if (EXTERNAL_SSE4(cpu_flags) && /* case 2: YUV420, plane 0 per-pixel alpha, planes 1 and 2 use the 2x2-averaged-alpha kernel */
+ (format == OVERLAY_FORMAT_YUV420) &&
+ alpha_format == 0 && main_has_alpha == 0) { /* same alpha restrictions as case 1 */
+ s->blend_row[0] = ff_overlay_row_44_sse4;
+ s->blend_row[1] = ff_overlay_row_20_sse4;
+ s->blend_row[2] = ff_overlay_row_20_sse4;
+ }
+
+ if (EXTERNAL_SSE4(cpu_flags) && /* case 3: YUV422, plane 0 per-pixel alpha, planes 1 and 2 use the horizontal-pair-alpha kernel */
+ (format == OVERLAY_FORMAT_YUV422) &&
+ alpha_format == 0 && main_has_alpha == 0) { /* same alpha restrictions as case 1 */
+ s->blend_row[0] = ff_overlay_row_44_sse4;
+ s->blend_row[1] = ff_overlay_row_22_sse4;
+ s->blend_row[2] = ff_overlay_row_22_sse4;
+ }
+}