diff options
Diffstat (limited to 'libavfilter/x86')
31 files changed, 4379 insertions, 104 deletions
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile index 13b5d318ec..5382027f70 100644 --- a/libavfilter/x86/Makefile +++ b/libavfilter/x86/Makefile @@ -1,11 +1,32 @@ +OBJS-$(CONFIG_EQ_FILTER) += x86/vf_eq.o +OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp_init.o OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun_init.o OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d_init.o +OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet_init.o OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace_init.o +OBJS-$(CONFIG_NOISE_FILTER) += x86/vf_noise.o +OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7_init.o +OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr_init.o +OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup_init.o +OBJS-$(CONFIG_REMOVEGRAIN_FILTER) += x86/vf_removegrain_init.o +OBJS-$(CONFIG_SPP_FILTER) += x86/vf_spp.o +OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim_init.o +OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_tinterlace_init.o OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o +YASM-OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp.o YASM-OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun.o YASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o +YASM-OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet.o YASM-OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace.o +YASM-OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7.o +YASM-OBJS-$(CONFIG_PSNR_FILTER) += x86/vf_psnr.o +YASM-OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup.o +ifdef CONFIG_GPL +YASM-OBJS-$(CONFIG_REMOVEGRAIN_FILTER) += x86/vf_removegrain.o +endif +YASM-OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim.o +YASM-OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_interlace.o YASM-OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume.o -YASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o +YASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o x86/yadif-16.o x86/yadif-10.o diff --git a/libavfilter/x86/af_volume.asm b/libavfilter/x86/af_volume.asm index 4e5ad2258c..f4cbcbc5de 100644 --- a/libavfilter/x86/af_volume.asm +++ b/libavfilter/x86/af_volume.asm 
@@ -2,20 +2,20 @@ ;* x86-optimized functions for volume filter ;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -99,9 +99,11 @@ cglobal scale_samples_s32, 4,4,4, dst, src, len, volume INIT_XMM sse2 %define CVTDQ2PD cvtdq2pd SCALE_SAMPLES_S32 +%if HAVE_AVX_EXTERNAL %define CVTDQ2PD vcvtdq2pd INIT_YMM avx SCALE_SAMPLES_S32 +%endif %undef CVTDQ2PD ; NOTE: This is not bit-identical with the C version because it clips to diff --git a/libavfilter/x86/af_volume_init.c b/libavfilter/x86/af_volume_init.c index 26605fb2ce..88f5a9679a 100644 --- a/libavfilter/x86/af_volume_init.c +++ b/libavfilter/x86/af_volume_init.c @@ -1,18 +1,18 @@ /* - * This file is part of Libav. + * This file is part of FFmpeg. 
* - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/libavfilter/x86/vf_eq.c b/libavfilter/x86/vf_eq.c new file mode 100644 index 0000000000..16f399505f --- /dev/null +++ b/libavfilter/x86/vf_eq.c @@ -0,0 +1,96 @@ +/* + * + * Original MPlayer filters by Richard Felker. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/mem.h" +#include "libavutil/x86/asm.h" +#include "libavfilter/vf_eq.h" + +#if HAVE_MMX_INLINE && HAVE_6REGS +static void process_MMX(EQParameters *param, uint8_t *dst, int dst_stride, + const uint8_t *src, int src_stride, int w, int h) +{ + int i; + int pel; + int dstep = dst_stride - w; + int sstep = src_stride - w; + short brvec[4]; + short contvec[4]; + int contrast = (int) (param->contrast * 256 * 16); + int brightness = ((int) (100.0 * param->brightness + 100.0) * 511) / 200 - 128 - contrast / 32; + + brvec[0] = brvec[1] = brvec[2] = brvec[3] = brightness; + contvec[0] = contvec[1] = contvec[2] = contvec[3] = contrast; + + while (h--) { + __asm__ volatile ( + "movq (%5), %%mm3 \n\t" + "movq (%6), %%mm4 \n\t" + "pxor %%mm0, %%mm0 \n\t" + "movl %4, %%eax \n\t" + ".p2align 4 \n\t" + "1: \n\t" + "movq (%0), %%mm1 \n\t" + "movq (%0), %%mm2 \n\t" + "punpcklbw %%mm0, %%mm1\n\t" + "punpckhbw %%mm0, %%mm2\n\t" + "psllw $4, %%mm1 \n\t" + "psllw $4, %%mm2 \n\t" + "pmulhw %%mm4, %%mm1 \n\t" + "pmulhw %%mm4, %%mm2 \n\t" + "paddw %%mm3, %%mm1 \n\t" + "paddw %%mm3, %%mm2 \n\t" + "packuswb %%mm2, %%mm1 \n\t" + "add $8, %0 \n\t" + "movq %%mm1, (%1) \n\t" + "add $8, %1 \n\t" + "decl %%eax \n\t" + "jnz 1b \n\t" + : "=r" (src), "=r" (dst) + : "0" (src), "1" (dst), "r" (w>>3), "r" (brvec), "r" (contvec) + : "%eax" + ); + + for (i = w&7; i; i--) { + pel = ((*src++ * contrast) >> 12) + brightness; + if (pel & ~255) + pel = (-pel) >> 31; + *dst++ = pel; + } + + src += sstep; + dst += dstep; + } + __asm__ volatile ( "emms \n\t" ::: "memory" ); +} +#endif + +av_cold void ff_eq_init_x86(EQContext *eq) +{ +#if HAVE_MMX_INLINE && HAVE_6REGS + int cpu_flags = av_get_cpu_flags(); + + if (cpu_flags & AV_CPU_FLAG_MMX) { + eq->process = process_MMX; + } +#endif +} diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm new file mode 100644 index 0000000000..c7f8f64f1b --- /dev/null +++ 
b/libavfilter/x86/vf_fspp.asm @@ -0,0 +1,727 @@ +;***************************************************************************** +;* x86-optimized functions for fspp filter +;* +;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> +;* Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License along +;* with FFmpeg; if not, write to the Free Software Foundation, Inc., +;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +pb_dither: db 0, 48, 12, 60, 3, 51, 15, 63, 32, 16, 44, 28, 35, 19, 47, 31, \ + 8, 56, 4, 52, 11, 59, 7, 55, 40, 24, 36, 20, 43, 27, 39, 23, \ + 2, 50, 14, 62, 1, 49, 13, 61, 34, 18, 46, 30, 33, 17, 45, 29, \ + 10, 58, 6, 54, 9, 57, 5, 53, 42, 26, 38, 22, 41, 25, 37, 21 +pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14) +pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13) +pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13) +pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14) +pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14) +pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13) +pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13) +pw_3642: times 4 dw 0x3642 ; FIX64(0.847759065, 14) +pw_2441: times 4 dw 0x2441 ; FIX64(0.566454497, 14) +pw_0CBB: times 4 dw 0x0CBB ; FIX64(0.198912367, 14) +pw_4: times 4 dw 4 +pw_2: times 4 dw 2 + +SECTION .text + +%define DCTSIZE 8 + +INIT_MMX mmx + +;void ff_store_slice_mmx(uint8_t *dst, int16_t *src, +; ptrdiff_t dst_stride, ptrdiff_t src_stride, +; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale) +%if ARCH_X86_64 +cglobal store_slice, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2 +%else +cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2 +%define dst_strideq r2m +%define src_strideq r3m + mov widthq, r4m + mov dither_heightq, r5m + mov ditherq, r6m ; log2_scale +%endif + add widthq, 7 + mov tmpq, src_strideq + and widthq, ~7 + sub dst_strideq, widthq + movd m5, ditherd ; log2_scale + xor ditherq, -1 ; log2_scale + mov tmp2q, tmpq + add ditherq, 7 ; log2_scale + neg tmpq + sub tmp2q, widthq + movd m2, ditherd ; log2_scale + add tmp2q, tmp2q + lea ditherq, [pb_dither] + mov src_strideq, tmp2q + shl tmpq, 4 + lea dither_heightq, [ditherq+dither_heightq*8] + pxor m7, m7 + +.loop_height: + movq m3, [ditherq] + movq m4, 
m3 + punpcklbw m3, m7 + punpckhbw m4, m7 + mov tmp2q, widthq + psraw m3, m5 + psraw m4, m5 + +.loop_width: + movq [srcq+tmpq], m7 + movq m0, [srcq] + movq m1, [srcq+8] + movq [srcq+tmpq+8], m7 + paddw m0, m3 + paddw m1, m4 + movq [srcq], m7 + psraw m0, m2 + psraw m1, m2 + movq [srcq+8], m7 + packuswb m0, m1 + add srcq, 16 + movq [dstq], m0 + add dstq, 8 + sub tmp2q, 8 + jg .loop_width + + add srcq, src_strideq + add ditherq, 8 + add dstq, dst_strideq + cmp ditherq, dither_heightq + jl .loop_height + RET + +;void ff_store_slice2_mmx(uint8_t *dst, int16_t *src, +; ptrdiff_t dst_stride, ptrdiff_t src_stride, +; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale) +%if ARCH_X86_64 +cglobal store_slice2, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2 +%else +cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2 +%define dst_strideq r2m +%define src_strideq r3m + mov dstq, dstm + mov srcq, srcm + mov widthq, r4m + mov dither_heightq, r5m + mov ditherq, r6m ; log2_scale +%endif + add widthq, 7 + mov tmpq, src_strideq + and widthq, ~7 + sub dst_strideq, widthq + movd m5, ditherd ; log2_scale + xor ditherq, -1 ; log2_scale + mov tmp2q, tmpq + add ditherq, 7 ; log2_scale + sub tmp2q, widthq + movd m2, ditherd ; log2_scale + add tmp2q, tmp2q + lea ditherq, [pb_dither] + mov src_strideq, tmp2q + shl tmpq, 5 + lea dither_heightq, [ditherq+dither_heightq*8] + pxor m7, m7 + +.loop_height: + movq m3, [ditherq] + movq m4, m3 + punpcklbw m3, m7 + punpckhbw m4, m7 + mov tmp2q,widthq + psraw m3, m5 + psraw m4, m5 + +.loop_width: + movq m0, [srcq] + movq m1, [srcq+8] + paddw m0, m3 + paddw m0, [srcq+tmpq] + paddw m1, m4 + movq m6, [srcq+tmpq+8] + movq [srcq+tmpq], m7 + psraw m0, m2 + paddw m1, m6 + movq [srcq+tmpq+8], m7 + psraw m1, m2 + packuswb m0, m1 + movq [dstq], m0 + add srcq, 16 + add dstq, 8 + sub tmp2q, 8 + jg .loop_width + + add srcq, src_strideq + add ditherq, 8 + add dstq, dst_strideq + cmp ditherq, 
dither_heightq + jl .loop_height + RET + +;void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q); +cglobal mul_thrmat, 3, 3, 0, thrn, thr, q + movd m7, qd + movq m0, [thrnq] + punpcklwd m7, m7 + movq m1, [thrnq+8] + punpckldq m7, m7 + pmullw m0, m7 + movq m2, [thrnq+8*2] + pmullw m1, m7 + movq m3, [thrnq+8*3] + pmullw m2, m7 + movq [thrq], m0 + movq m4, [thrnq+8*4] + pmullw m3, m7 + movq [thrq+8], m1 + movq m5, [thrnq+8*5] + pmullw m4, m7 + movq [thrq+8*2], m2 + movq m6, [thrnq+8*6] + pmullw m5, m7 + movq [thrq+8*3], m3 + movq m0, [thrnq+8*7] + pmullw m6, m7 + movq [thrq+8*4], m4 + movq m1, [thrnq+8*7+8] + pmullw m0, m7 + movq [thrq+8*5], m5 + movq m2, [thrnq+8*7+8*2] + pmullw m1, m7 + movq [thrq+8*6], m6 + movq m3, [thrnq+8*7+8*3] + pmullw m2, m7 + movq [thrq+8*7], m0 + movq m4, [thrnq+8*7+8*4] + pmullw m3, m7 + movq [thrq+8*7+8], m1 + movq m5, [thrnq+8*7+8*5] + pmullw m4, m7 + movq [thrq+8*7+8*2], m2 + movq m6, [thrnq+8*7+8*6] + pmullw m5, m7 + movq [thrq+8*7+8*3], m3 + movq m0, [thrnq+14*8] + pmullw m6, m7 + movq [thrq+8*7+8*4], m4 + movq m1, [thrnq+14*8+8] + pmullw m0, m7 + movq [thrq+8*7+8*5], m5 + pmullw m1, m7 + movq [thrq+8*7+8*6], m6 + movq [thrq+14*8], m0 + movq [thrq+14*8+8], m1 + RET + +%macro COLUMN_FDCT 1-3 0, 0 + movq m1, [srcq+DCTSIZE*0*2] + movq m7, [srcq+DCTSIZE*3*2] + movq m0, m1 + paddw m1, [srcq+DCTSIZE*7*2] + movq m3, m7 + paddw m7, [srcq+DCTSIZE*4*2] + movq m5, m1 + movq m6, [srcq+DCTSIZE*1*2] + psubw m1, m7 + movq m2, [srcq+DCTSIZE*2*2] + movq m4, m6 + paddw m6, [srcq+DCTSIZE*6*2] + paddw m5, m7 + paddw m2, [srcq+DCTSIZE*5*2] + movq m7, m6 + paddw m6, m2 + psubw m7, m2 + movq m2, m5 + paddw m5, m6 + psubw m2, m6 + paddw m7, m1 + movq m6, [thrq+4*16+%2] + psllw m7, 2 + psubw m5, [thrq+%2] + psubw m2, m6 + paddusw m5, [thrq+%2] + paddusw m2, m6 + pmulhw m7, [pw_2D41] + paddw m5, [thrq+%2] + paddw m2, m6 + psubusw m5, [thrq+%2] + psubusw m2, m6 + paddw m5, [pw_2] + movq m6, m2 + paddw m2, m5 + psubw m5, m6 + movq m6, m1 + paddw 
m1, m7 + psubw m1, [thrq+2*16+%2] + psubw m6, m7 + movq m7, [thrq+6*16+%2] + psraw m5, 2 + paddusw m1, [thrq+2*16+%2] + psubw m6, m7 + paddw m1, [thrq+2*16+%2] + paddusw m6, m7 + psubusw m1, [thrq+2*16+%2] + paddw m6, m7 + psubw m3, [srcq+DCTSIZE*4*2] + psubusw m6, m7 + movq m7, m1 + psraw m2, 2 + psubw m4, [srcq+DCTSIZE*6*2] + psubw m1, m6 + psubw m0, [srcq+DCTSIZE*7*2] + paddw m6, m7 + psraw m6, 2 + movq m7, m2 + pmulhw m1, [pw_5A82] + paddw m2, m6 + movq [rsp], m2 + psubw m7, m6 + movq m2, [srcq+DCTSIZE*2*2] + psubw m1, m6 + psubw m2, [srcq+DCTSIZE*5*2] + movq m6, m5 + movq [rsp+8*3], m7 + paddw m3, m2 + paddw m2, m4 + paddw m4, m0 + movq m7, m3 + psubw m3, m4 + psllw m3, 2 + psllw m7, 2 + pmulhw m3, [pw_187E] + psllw m4, 2 + pmulhw m7, [pw_22A3] + psllw m2, 2 + pmulhw m4, [pw_539F] + paddw m5, m1 + pmulhw m2, [pw_2D41] + psubw m6, m1 + paddw m7, m3 + movq [rsp+8], m5 + paddw m4, m3 + movq m3, [thrq+3*16+%2] + movq m1, m0 + movq [rsp+8*2], m6 + psubw m1, m2 + paddw m0, m2 + movq m5, m1 + movq m2, [thrq+5*16+%2] + psubw m1, m7 + paddw m5, m7 + psubw m1, m3 + movq m7, [thrq+16+%2] + psubw m5, m2 + movq m6, m0 + paddw m0, m4 + paddusw m1, m3 + psubw m6, m4 + movq m4, [thrq+7*16+%2] + psubw m0, m7 + psubw m6, m4 + paddusw m5, m2 + paddusw m6, m4 + paddw m1, m3 + paddw m5, m2 + paddw m6, m4 + psubusw m1, m3 + psubusw m5, m2 + psubusw m6, m4 + movq m4, m1 + por m4, m5 + paddusw m0, m7 + por m4, m6 + paddw m0, m7 + packssdw m4, m4 + psubusw m0, m7 + movd tmpd, m4 + or tmpd, tmpd + jnz %1 + movq m4, [rsp] + movq m1, m0 + pmulhw m0, [pw_3642] + movq m2, m1 + movq m5, [outq+DCTSIZE*0*2] + movq m3, m2 + pmulhw m1, [pw_2441] + paddw m5, m4 + movq m6, [rsp+8] + psraw m3, 2 + pmulhw m2, [pw_0CBB] + psubw m4, m3 + movq m7, [outq+DCTSIZE*1*2] + paddw m5, m3 + movq [outq+DCTSIZE*7*2], m4 + paddw m7, m6 + movq m3, [rsp+8*2] + psubw m6, m0 + movq m4, [outq+DCTSIZE*2*2] + paddw m7, m0 + movq [outq], m5 + paddw m4, m3 + movq [outq+DCTSIZE*6*2], m6 + psubw m3, m1 + movq m5, 
[outq+DCTSIZE*5*2] + paddw m4, m1 + movq m6, [outq+DCTSIZE*3*2] + paddw m5, m3 + movq m0, [rsp+8*3] + add srcq, 8+%3 + movq [outq+DCTSIZE*1*2], m7 + paddw m6, m0 + movq [outq+DCTSIZE*2*2], m4 + psubw m0, m2 + movq m7, [outq+DCTSIZE*4*2] + paddw m6, m2 + movq [outq+DCTSIZE*5*2], m5 + paddw m7, m0 + movq [outq+DCTSIZE*3*2], m6 + movq [outq+DCTSIZE*4*2], m7 + add outq, 8+%3 +%endmacro + +%macro COLUMN_IDCT 0-1 0 + movq m3, m5 + psubw m5, m1 + psllw m5, 1 + paddw m3, m1 + movq m2, m0 + psubw m0, m6 + movq m1, m5 + psllw m0, 1 + pmulhw m1, [pw_AC62] + paddw m5, m0 + pmulhw m5, [pw_3B21] + paddw m2, m6 + pmulhw m0, [pw_22A3] + movq m7, m2 + movq m4, [rsp] + psubw m2, m3 + psllw m2, 1 + paddw m7, m3 + pmulhw m2, [pw_2D41] + movq m6, m4 + psraw m7, 2 + paddw m4, [outq] + psubw m6, m7 + movq m3, [rsp+8] + paddw m4, m7 + movq [outq+DCTSIZE*7*2], m6 + paddw m1, m5 + movq [outq], m4 + psubw m1, m7 + movq m7, [rsp+8*2] + psubw m0, m5 + movq m6, [rsp+8*3] + movq m5, m3 + paddw m3, [outq+DCTSIZE*1*2] + psubw m5, m1 + psubw m2, m1 + paddw m3, m1 + movq [outq+DCTSIZE*6*2], m5 + movq m4, m7 + paddw m7, [outq+DCTSIZE*2*2] + psubw m4, m2 + paddw m4, [outq+DCTSIZE*5*2] + paddw m7, m2 + movq [outq+DCTSIZE*1*2], m3 + paddw m0, m2 + movq [outq+DCTSIZE*2*2], m7 + movq m1, m6 + paddw m6, [outq+DCTSIZE*4*2] + psubw m1, m0 + paddw m1, [outq+DCTSIZE*3*2] + paddw m6, m0 + movq [outq+DCTSIZE*5*2], m4 + add srcq, 8+%1 + movq [outq+DCTSIZE*4*2], m6 + movq [outq+DCTSIZE*3*2], m1 + add outq, 8+%1 +%endmacro + +;void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt); +cglobal column_fidct, 4, 5, 0, 32, thr, src, out, cnt, tmp +.fdct1: + COLUMN_FDCT .idct1 + jmp .fdct2 + +.idct1: + COLUMN_IDCT + +.fdct2: + COLUMN_FDCT .idct2, 8, 16 + sub cntd, 2 + jg .fdct1 + RET + +.idct2: + COLUMN_IDCT 16 + sub cntd, 2 + jg .fdct1 + RET + +;void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt); +cglobal row_idct, 4, 5, 0, 16, src, dst, stride, 
cnt, stride3 + add strideq, strideq + lea stride3q, [strideq+strideq*2] +.loop: + movq m0, [srcq+DCTSIZE*0*2] + movq m1, [srcq+DCTSIZE*1*2] + movq m4, m0 + movq m2, [srcq+DCTSIZE*2*2] + punpcklwd m0, m1 + movq m3, [srcq+DCTSIZE*3*2] + punpckhwd m4, m1 + movq m7, m2 + punpcklwd m2, m3 + movq m6, m0 + punpckldq m0, m2 + punpckhdq m6, m2 + movq m5, m0 + punpckhwd m7, m3 + psubw m0, m6 + pmulhw m0, [pw_5A82] + movq m2, m4 + punpckldq m4, m7 + paddw m5, m6 + punpckhdq m2, m7 + movq m1, m4 + psllw m0, 2 + paddw m4, m2 + movq m3, [srcq+DCTSIZE*0*2+8] + psubw m1, m2 + movq m2, [srcq+DCTSIZE*1*2+8] + psubw m0, m5 + movq m6, m4 + paddw m4, m5 + psubw m6, m5 + movq m7, m1 + movq m5, [srcq+DCTSIZE*2*2+8] + paddw m1, m0 + movq [rsp], m4 + movq m4, m3 + movq [rsp+8], m6 + punpcklwd m3, m2 + movq m6, [srcq+DCTSIZE*3*2+8] + punpckhwd m4, m2 + movq m2, m5 + punpcklwd m5, m6 + psubw m7, m0 + punpckhwd m2, m6 + movq m0, m3 + punpckldq m3, m5 + punpckhdq m0, m5 + movq m5, m4 + movq m6, m3 + punpckldq m4, m2 + psubw m3, m0 + punpckhdq m5, m2 + paddw m6, m0 + movq m2, m4 + movq m0, m3 + psubw m4, m5 + pmulhw m0, [pw_AC62] + paddw m3, m4 + pmulhw m3, [pw_3B21] + paddw m2, m5 + pmulhw m4, [pw_22A3] + movq m5, m2 + psubw m2, m6 + paddw m5, m6 + pmulhw m2, [pw_2D41] + paddw m0, m3 + psllw m0, 3 + psubw m4, m3 + movq m6, [rsp] + movq m3, m1 + psllw m4, 3 + psubw m0, m5 + psllw m2, 3 + paddw m1, m0 + psubw m2, m0 + psubw m3, m0 + paddw m4, m2 + movq m0, m7 + paddw m7, m2 + psubw m0, m2 + movq m2, [pw_4] + psubw m6, m5 + paddw m5, [rsp] + paddw m1, m2 + paddw m5, m2 + psraw m1, 3 + paddw m7, m2 + psraw m5, 3 + paddw m5, [dstq] + psraw m7, 3 + paddw m1, [dstq+strideq*1] + paddw m0, m2 + paddw m7, [dstq+strideq*2] + paddw m3, m2 + movq [dstq], m5 + paddw m6, m2 + movq [dstq+strideq*1], m1 + psraw m0, 3 + movq [dstq+strideq*2], m7 + add dstq, stride3q + movq m5, [rsp+8] + psraw m3, 3 + paddw m0, [dstq+strideq*2] + psubw m5, m4 + paddw m3, [dstq+stride3q*1] + psraw m6, 3 + paddw m4, [rsp+8] + 
paddw m5, m2 + paddw m6, [dstq+strideq*4] + paddw m4, m2 + movq [dstq+strideq*2], m0 + psraw m5, 3 + paddw m5, [dstq] + psraw m4, 3 + paddw m4, [dstq+strideq*1] + add srcq, DCTSIZE*2*4 + movq [dstq+stride3q*1], m3 + movq [dstq+strideq*4], m6 + movq [dstq], m5 + movq [dstq+strideq*1], m4 + sub dstq, stride3q + add dstq, 8 + dec r3d + jnz .loop + RET + +;void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt); +cglobal row_fdct, 4, 5, 0, 16, src, pix, stride, cnt, stride3 + lea stride3q, [strideq+strideq*2] +.loop: + movd m0, [pixq] + pxor m7, m7 + movd m1, [pixq+strideq*1] + punpcklbw m0, m7 + movd m2, [pixq+strideq*2] + punpcklbw m1, m7 + punpcklbw m2, m7 + add pixq,stride3q + movq m5, m0 + movd m3, [pixq+strideq*4] + movq m6, m1 + movd m4, [pixq+stride3q*1] + punpcklbw m3, m7 + psubw m5, m3 + punpcklbw m4, m7 + paddw m0, m3 + psubw m6, m4 + movd m3, [pixq+strideq*2] + paddw m1, m4 + movq [rsp], m5 + punpcklbw m3, m7 + movq [rsp+8], m6 + movq m4, m2 + movd m5, [pixq] + paddw m2, m3 + movd m6, [pixq+strideq*1] + punpcklbw m5, m7 + psubw m4, m3 + punpcklbw m6, m7 + movq m3, m5 + paddw m5, m6 + psubw m3, m6 + movq m6, m0 + movq m7, m1 + psubw m0, m5 + psubw m1, m2 + paddw m7, m2 + paddw m1, m0 + movq m2, m7 + psllw m1, 2 + paddw m6, m5 + pmulhw m1, [pw_2D41] + paddw m7, m6 + psubw m6, m2 + movq m5, m0 + movq m2, m7 + punpcklwd m7, m6 + paddw m0, m1 + punpckhwd m2, m6 + psubw m5, m1 + movq m6, m0 + movq m1, [rsp+8] + punpcklwd m0, m5 + punpckhwd m6, m5 + movq m5, m0 + punpckldq m0, m7 + paddw m3, m4 + punpckhdq m5, m7 + movq m7, m6 + movq [srcq+DCTSIZE*0*2], m0 + punpckldq m6, m2 + movq [srcq+DCTSIZE*1*2], m5 + punpckhdq m7, m2 + movq [srcq+DCTSIZE*2*2], m6 + paddw m4, m1 + movq [srcq+DCTSIZE*3*2], m7 + psllw m3, 2 + movq m2, [rsp] + psllw m4, 2 + pmulhw m4, [pw_2D41] + paddw m1, m2 + psllw m1, 2 + movq m0, m3 + pmulhw m0, [pw_22A3] + psubw m3, m1 + pmulhw m3, [pw_187E] + movq m5, m2 + pmulhw m1, [pw_539F] + psubw m2, m4 + paddw m5, m4 
+ movq m6, m2 + paddw m0, m3 + movq m7, m5 + paddw m2, m0 + psubw m6, m0 + movq m4, m2 + paddw m1, m3 + punpcklwd m2, m6 + paddw m5, m1 + punpckhwd m4, m6 + psubw m7, m1 + movq m6, m5 + punpcklwd m5, m7 + punpckhwd m6, m7 + movq m7, m2 + punpckldq m2, m5 + sub pixq, stride3q + punpckhdq m7, m5 + movq m5, m4 + movq [srcq+DCTSIZE*0*2+8], m2 + punpckldq m4, m6 + movq [srcq+DCTSIZE*1*2+8], m7 + punpckhdq m5, m6 + movq [srcq+DCTSIZE*2*2+8], m4 + add pixq, 4 + movq [srcq+DCTSIZE*3*2+8], m5 + add srcq, DCTSIZE*4*2 + dec cntd + jnz .loop + RET diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c new file mode 100644 index 0000000000..8e00317cb7 --- /dev/null +++ b/libavfilter/x86/vf_fspp_init.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> + * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#include "libavutil/attributes.h" +#include "libavutil/x86/cpu.h" +#include "libavfilter/vf_fspp.h" + +void ff_store_slice_mmx(uint8_t *dst, int16_t *src, + ptrdiff_t dst_stride, ptrdiff_t src_stride, + ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); +void ff_store_slice2_mmx(uint8_t *dst, int16_t *src, + ptrdiff_t dst_stride, ptrdiff_t src_stride, + ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale); +void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q); +void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt); +void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt); +void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt); + +av_cold void ff_fspp_init_x86(FSPPContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_MMX(cpu_flags)) { + s->store_slice = ff_store_slice_mmx; + s->store_slice2 = ff_store_slice2_mmx; + s->mul_thrmat = ff_mul_thrmat_mmx; + s->column_fidct = ff_column_fidct_mmx; + s->row_idct = ff_row_idct_mmx; + s->row_fdct = ff_row_fdct_mmx; + } +} diff --git a/libavfilter/x86/vf_gradfun.asm b/libavfilter/x86/vf_gradfun.asm index 00fcb166fb..3581f89fe8 100644 --- a/libavfilter/x86/vf_gradfun.asm +++ b/libavfilter/x86/vf_gradfun.asm @@ -1,20 +1,20 @@ ;****************************************************************************** ;* x86-optimized functions for gradfun filter ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. 
;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** diff --git a/libavfilter/x86/vf_gradfun_init.c b/libavfilter/x86/vf_gradfun_init.c index 3f23bf6799..c638a05e87 100644 --- a/libavfilter/x86/vf_gradfun_init.c +++ b/libavfilter/x86/vf_gradfun_init.c @@ -1,20 +1,20 @@ /* * Copyright (C) 2009 Loren Merritt <lorenm@u.washington.edu> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -26,29 +26,29 @@ #include "libavutil/x86/cpu.h" #include "libavfilter/gradfun.h" -void ff_gradfun_filter_line_mmxext(intptr_t x, uint8_t *dst, uint8_t *src, - uint16_t *dc, int thresh, +void ff_gradfun_filter_line_mmxext(intptr_t x, uint8_t *dst, const uint8_t *src, + const uint16_t *dc, int thresh, const uint16_t *dithers); - -void ff_gradfun_filter_line_ssse3(intptr_t x, uint8_t *dst, uint8_t *src, - uint16_t *dc, int thresh, +void ff_gradfun_filter_line_ssse3(intptr_t x, uint8_t *dst, const uint8_t *src, + const uint16_t *dc, int thresh, const uint16_t *dithers); void ff_gradfun_blur_line_movdqa_sse2(intptr_t x, uint16_t *buf, - uint16_t *buf1, uint16_t *dc, - uint8_t *src1, uint8_t *src2); + const uint16_t *buf1, uint16_t *dc, + const uint8_t *src1, const uint8_t *src2); void ff_gradfun_blur_line_movdqu_sse2(intptr_t x, uint16_t *buf, - uint16_t *buf1, uint16_t *dc, - uint8_t *src1, uint8_t *src2); + const uint16_t *buf1, uint16_t *dc, + const uint8_t *src1, const uint8_t *src2); #if HAVE_YASM -static void gradfun_filter_line(uint8_t *dst, uint8_t *src, uint16_t *dc, - int width, int thresh, const uint16_t *dithers, - int alignment) +static void gradfun_filter_line_mmxext(uint8_t *dst, const uint8_t *src, + const uint16_t *dc, + int width, int thresh, + const uint16_t *dithers) { intptr_t x; - if (width & alignment) { - x = width & ~alignment; + if (width & 3) { + x = width & ~3; ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers); width = x; @@ -58,22 +58,25 @@ static void gradfun_filter_line(uint8_t *dst, uint8_t *src, uint16_t *dc, thresh, dithers); } -static void gradfun_filter_line_mmxext(uint8_t *dst, uint8_t *src, uint16_t *dc, - int width, int 
thresh, - const uint16_t *dithers) -{ - gradfun_filter_line(dst, src, dc, width, thresh, dithers, 3); -} - -static void gradfun_filter_line_ssse3(uint8_t *dst, uint8_t *src, uint16_t *dc, +static void gradfun_filter_line_ssse3(uint8_t *dst, const uint8_t *src, const uint16_t *dc, int width, int thresh, const uint16_t *dithers) { - gradfun_filter_line(dst, src, dc, width, thresh, dithers, 7); + intptr_t x; + if (width & 7) { + // could be 10% faster if I somehow eliminated this + x = width & ~7; + ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, + width - x, thresh, dithers); + width = x; + } + x = -width; + ff_gradfun_filter_line_ssse3(x, dst + width, src + width, dc + width / 2, + thresh, dithers); } -static void gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, uint16_t *buf1, - uint8_t *src, int src_linesize, int width) +static void gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, const uint16_t *buf1, + const uint8_t *src, int src_linesize, int width) { intptr_t x = -2 * width; if (((intptr_t) src | src_linesize) & 15) diff --git a/libavfilter/x86/vf_hqdn3d.asm b/libavfilter/x86/vf_hqdn3d.asm index 02632a1f09..e3b1bdca53 100644 --- a/libavfilter/x86/vf_hqdn3d.asm +++ b/libavfilter/x86/vf_hqdn3d.asm @@ -1,20 +1,20 @@ ;****************************************************************************** ;* Copyright (c) 2012 Loren Merritt ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. 
;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -27,8 +27,8 @@ SECTION .text %if lut_bits != 8 sar %1q, 8-lut_bits %endif - movsx %1d, word [%3q+%1q*2] - add %1d, %2d + movsx %1q, word [%3q+%1q*2] + add %1q, %2q %endmacro %macro LOAD 3 ; dstreg, x, bitdepth diff --git a/libavfilter/x86/vf_hqdn3d_init.c b/libavfilter/x86/vf_hqdn3d_init.c index 06f9e00ec9..b63916b674 100644 --- a/libavfilter/x86/vf_hqdn3d_init.c +++ b/libavfilter/x86/vf_hqdn3d_init.c @@ -1,18 +1,20 @@ /* - * This file is part of Libav. + * Copyright (c) 2012 Loren Merritt * - * Libav is free software; you can redistribute it and/or modify + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License along - * with Libav; if not, write to the Free Software Foundation, Inc., + * with FFmpeg; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ diff --git a/libavfilter/x86/vf_idet.asm b/libavfilter/x86/vf_idet.asm new file mode 100644 index 0000000000..007e63deb9 --- /dev/null +++ b/libavfilter/x86/vf_idet.asm @@ -0,0 +1,170 @@ +;***************************************************************************** +;* x86-optimized functions for idet filter +;* +;* Copyright (C) 2014 Pascal Massimino (pascal.massimino@gmail.com) +;* Copyright (c) 2014 Neil Birkbeck (birkbeck@google.com) +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_TEXT + +; Implementation that does 8-bytes at a time using single-word operations. 
+%macro IDET_FILTER_LINE 1 +INIT_MMX %1 +cglobal idet_filter_line, 4, 5, 0, a, b, c, width, index + xor indexq, indexq +%define m_zero m2 +%define m_sum m5 + pxor m_sum, m_sum + pxor m_zero, m_zero + +.loop: + movu m0, [aq + indexq*1] + punpckhbw m1, m0, m_zero + punpcklbw m0, m_zero + + movu m3, [cq + indexq*1] + punpckhbw m4, m3, m_zero + punpcklbw m3, m_zero + + paddsw m1, m4 + paddsw m0, m3 + + movu m3, [bq + indexq*1] + punpckhbw m4, m3, m_zero + punpcklbw m3, m_zero + + paddw m4, m4 + paddw m3, m3 + psubsw m1, m4 + psubsw m0, m3 + + ABS2 m1, m0, m4, m3 + + paddw m0, m1 + punpckhwd m1, m0, m_zero + punpcklwd m0, m_zero + + paddd m0, m1 + paddd m_sum, m0 + + add indexq, 0x8 + CMP widthd, indexd + jg .loop + + HADDD m_sum, m0 + movd eax, m_sum + RET +%endmacro + +%if ARCH_X86_32 +IDET_FILTER_LINE mmxext +IDET_FILTER_LINE mmx +%endif + +;****************************************************************************** +; 16bit implementation that does 4/8-pixels at a time + +%macro PABS_DIFF_WD 3 ; a, b, junk , output=a + psubusw %3, %2, %1 + psubusw %1, %2 + por %1, %3 + + mova %2, %1 + punpcklwd %1, m_zero + punpckhwd %2, m_zero + paddd %1, %2 +%endmacro + +%macro IDET_FILTER_LINE_16BIT 1 ; %1=increment (4 or 8 words) +cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index + xor indexq, indexq +%define m_zero m1 +%define m_sum m0 + pxor m_sum, m_sum + pxor m_zero, m_zero + +.loop_16bit: + movu m2, [bq + indexq * 2] ; B + movu m3, [aq + indexq * 2] ; A + mova m6, m2 + psubusw m5, m2, m3 ; ba + + movu m4, [cq + indexq * 2] ; C + add indexq, %1 + psubusw m3, m2 ; ab + CMP indexd, widthd + + psubusw m6, m4 ; bc + psubusw m4, m2 ; cb + + PABS_DIFF_WD m3, m6, m7 ; |ab - bc| + PABS_DIFF_WD m5, m4, m7 ; |ba - cb| + paddd m_sum, m3 + paddd m_sum, m5 + jl .loop_16bit + + HADDD m_sum, m2 + movd eax, m_sum + RET +%endmacro + +INIT_XMM sse2 +IDET_FILTER_LINE_16BIT 8 +%if ARCH_X86_32 +INIT_MMX mmx +IDET_FILTER_LINE_16BIT 4 +%endif + 
+;****************************************************************************** +; SSE2 8-bit implementation that does 16-bytes at a time: + +INIT_XMM sse2 +cglobal idet_filter_line, 4, 6, 7, a, b, c, width, index, total + xor indexq, indexq + pxor m0, m0 + pxor m1, m1 + +.sse2_loop: + movu m2, [bq + indexq*1] ; B + movu m3, [aq + indexq*1] ; A + mova m6, m2 + mova m4, m3 + psubusb m5, m2, m3 ; ba + + movu m3, [cq + indexq*1] ; C + add indexq, 0x10 + psubusb m4, m2 ; ab + CMP indexd, widthd + + psubusb m6, m3 ; bc + psubusb m3, m2 ; cb + + psadbw m4, m6 ; |ab - bc| + paddq m0, m4 + psadbw m5, m3 ; |ba - cb| + paddq m1, m5 + jl .sse2_loop + + paddq m0, m1 + movhlps m1, m0 + paddq m0, m1 + movd eax, m0 + RET diff --git a/libavfilter/x86/vf_idet_init.c b/libavfilter/x86/vf_idet_init.c new file mode 100644 index 0000000000..1147ca8ba8 --- /dev/null +++ b/libavfilter/x86/vf_idet_init.c @@ -0,0 +1,87 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/mem.h" +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" +#include "libavfilter/vf_idet.h" + +#if HAVE_YASM + +/* declares main callable idet_filter_line_{mmx,mmxext,sse2}() */ +#define FUNC_MAIN_DECL(KIND, SPAN) \ +int ff_idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b, \ + const uint8_t *c, int w); \ +static int idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b, \ + const uint8_t *c, int w) { \ + int sum = 0; \ + const int left_over = w & (SPAN - 1); \ + w -= left_over; \ + if (w > 0) \ + sum += ff_idet_filter_line_##KIND(a, b, c, w); \ + if (left_over > 0) \ + sum += ff_idet_filter_line_c(a + w, b + w, c + w, left_over); \ + return sum; \ +} + + +#define FUNC_MAIN_DECL_16bit(KIND, SPAN) \ +int ff_idet_filter_line_16bit_##KIND(const uint16_t *a, const uint16_t *b, \ + const uint16_t *c, int w); \ +static int idet_filter_line_16bit_##KIND(const uint16_t *a, const uint16_t *b, \ + const uint16_t *c, int w) { \ + int sum = 0; \ + const int left_over = w & (SPAN - 1); \ + w -= left_over; \ + if (w > 0) \ + sum += ff_idet_filter_line_16bit_##KIND(a, b, c, w); \ + if (left_over > 0) \ + sum += ff_idet_filter_line_c_16bit(a + w, b + w, c + w, left_over); \ + return sum; \ +} + +FUNC_MAIN_DECL(sse2, 16) +FUNC_MAIN_DECL_16bit(sse2, 8) +#if ARCH_X86_32 +FUNC_MAIN_DECL(mmx, 8) +FUNC_MAIN_DECL(mmxext, 8) +FUNC_MAIN_DECL_16bit(mmx, 4) +#endif + +#endif +av_cold void ff_idet_init_x86(IDETContext *idet, int for_16b) +{ +#if HAVE_YASM + const int cpu_flags = av_get_cpu_flags(); + +#if ARCH_X86_32 + if (EXTERNAL_MMX(cpu_flags)) { + idet->filter_line = for_16b ? 
(ff_idet_filter_func)idet_filter_line_16bit_mmx : idet_filter_line_mmx; + } + if (EXTERNAL_MMXEXT(cpu_flags)) { + idet->filter_line = for_16b ? (ff_idet_filter_func)idet_filter_line_16bit_mmx : idet_filter_line_mmxext; + } +#endif // ARCH_x86_32 + + if (EXTERNAL_SSE2(cpu_flags)) { + idet->filter_line = for_16b ? (ff_idet_filter_func)idet_filter_line_16bit_sse2 : idet_filter_line_sse2; + } +#endif // HAVE_YASM +} diff --git a/libavfilter/x86/vf_interlace.asm b/libavfilter/x86/vf_interlace.asm index 85811da8d1..f70c700965 100644 --- a/libavfilter/x86/vf_interlace.asm +++ b/libavfilter/x86/vf_interlace.asm @@ -4,20 +4,20 @@ ;* Copyright (C) 2014 Kieran Kunhya <kierank@obe.tv> ;* Copyright (c) 2014 Michael Niedermayer <michaelni@gmx.at> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. ;* -;* Libav is free software; you can redistribute it and/or modify +;* FFmpeg is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by ;* the Free Software Foundation; either version 2 of the License, or ;* (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;* GNU General Public License for more details. ;* ;* You should have received a copy of the GNU General Public License along -;* with Libav; if not, write to the Free Software Foundation, Inc., +;* with FFmpeg; if not, write to the Free Software Foundation, Inc., ;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
;****************************************************************************** @@ -37,7 +37,7 @@ cglobal lowpass_line, 5, 5, 7 pcmpeqb m6, m6 -.loop +.loop: mova m0, [r3+r1] mova m1, [r3+r1+mmsize] pavgb m0, [r4+r1] diff --git a/libavfilter/x86/vf_interlace_init.c b/libavfilter/x86/vf_interlace_init.c index 231ab85a1c..68ee47d9bc 100644 --- a/libavfilter/x86/vf_interlace_init.c +++ b/libavfilter/x86/vf_interlace_init.c @@ -1,20 +1,20 @@ /* * Copyright (C) 2014 Kieran Kunhya <kierank@obe.tv> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or modify + * FFmpeg is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along - * with Libav; if not, write to the Free Software Foundation, Inc., + * with FFmpeg; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ diff --git a/libavfilter/x86/vf_noise.c b/libavfilter/x86/vf_noise.c new file mode 100644 index 0000000000..0a86cb084b --- /dev/null +++ b/libavfilter/x86/vf_noise.c @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at> + * Copyright (c) 2013 Paul B Mahol + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/x86/cpu.h" +#include "libavutil/x86/asm.h" +#include "libavfilter/vf_noise.h" + +#if HAVE_INLINE_ASM +static void line_noise_mmx(uint8_t *dst, const uint8_t *src, + const int8_t *noise, int len, int shift) +{ + x86_reg mmx_len= len & (~7); + noise += shift; + + __asm__ volatile( + "mov %3, %%"REG_a" \n\t" + "pcmpeqb %%mm7, %%mm7 \n\t" + "psllw $15, %%mm7 \n\t" + "packsswb %%mm7, %%mm7 \n\t" + ".p2align 4 \n\t" + "1: \n\t" + "movq (%0, %%"REG_a"), %%mm0 \n\t" + "movq (%1, %%"REG_a"), %%mm1 \n\t" + "pxor %%mm7, %%mm0 \n\t" + "paddsb %%mm1, %%mm0 \n\t" + "pxor %%mm7, %%mm0 \n\t" + "movq %%mm0, (%2, %%"REG_a") \n\t" + "add $8, %%"REG_a" \n\t" + " js 1b \n\t" + :: "r" (src+mmx_len), "r" (noise+mmx_len), "r" (dst+mmx_len), "g" (-mmx_len) + : "%"REG_a + ); + if (mmx_len != len) + ff_line_noise_c(dst+mmx_len, src+mmx_len, noise+mmx_len, len-mmx_len, 0); +} + +#if HAVE_6REGS +static void line_noise_avg_mmx(uint8_t *dst, const uint8_t *src, + int len, const int8_t * const *shift) +{ + x86_reg mmx_len = len & (~7); + + __asm__ volatile( + "mov %5, %%"REG_a" \n\t" + ".p2align 4 \n\t" + "1: \n\t" + "movq (%1, %%"REG_a"), %%mm1 \n\t" + "movq (%0, %%"REG_a"), %%mm0 \n\t" + "paddb (%2, 
%%"REG_a"), %%mm1 \n\t" + "paddb (%3, %%"REG_a"), %%mm1 \n\t" + "movq %%mm0, %%mm2 \n\t" + "movq %%mm1, %%mm3 \n\t" + "punpcklbw %%mm0, %%mm0 \n\t" + "punpckhbw %%mm2, %%mm2 \n\t" + "punpcklbw %%mm1, %%mm1 \n\t" + "punpckhbw %%mm3, %%mm3 \n\t" + "pmulhw %%mm0, %%mm1 \n\t" + "pmulhw %%mm2, %%mm3 \n\t" + "paddw %%mm1, %%mm1 \n\t" + "paddw %%mm3, %%mm3 \n\t" + "paddw %%mm0, %%mm1 \n\t" + "paddw %%mm2, %%mm3 \n\t" + "psrlw $8, %%mm1 \n\t" + "psrlw $8, %%mm3 \n\t" + "packuswb %%mm3, %%mm1 \n\t" + "movq %%mm1, (%4, %%"REG_a") \n\t" + "add $8, %%"REG_a" \n\t" + " js 1b \n\t" + :: "r" (src+mmx_len), "r" (shift[0]+mmx_len), "r" (shift[1]+mmx_len), "r" (shift[2]+mmx_len), + "r" (dst+mmx_len), "g" (-mmx_len) + : "%"REG_a + ); + + if (mmx_len != len){ + const int8_t *shift2[3] = { shift[0]+mmx_len, shift[1]+mmx_len, shift[2]+mmx_len }; + ff_line_noise_avg_c(dst+mmx_len, src+mmx_len, len-mmx_len, shift2); + } +} +#endif /* HAVE_6REGS */ + +static void line_noise_mmxext(uint8_t *dst, const uint8_t *src, + const int8_t *noise, int len, int shift) +{ + x86_reg mmx_len = len & (~7); + noise += shift; + + __asm__ volatile( + "mov %3, %%"REG_a" \n\t" + "pcmpeqb %%mm7, %%mm7 \n\t" + "psllw $15, %%mm7 \n\t" + "packsswb %%mm7, %%mm7 \n\t" + ".p2align 4 \n\t" + "1: \n\t" + "movq (%0, %%"REG_a"), %%mm0 \n\t" + "movq (%1, %%"REG_a"), %%mm1 \n\t" + "pxor %%mm7, %%mm0 \n\t" + "paddsb %%mm1, %%mm0 \n\t" + "pxor %%mm7, %%mm0 \n\t" + "movntq %%mm0, (%2, %%"REG_a") \n\t" + "add $8, %%"REG_a" \n\t" + " js 1b \n\t" + :: "r" (src+mmx_len), "r" (noise+mmx_len), "r" (dst+mmx_len), "g" (-mmx_len) + : "%"REG_a + ); + if (mmx_len != len) + ff_line_noise_c(dst+mmx_len, src+mmx_len, noise+mmx_len, len-mmx_len, 0); +} +#endif /* HAVE_INLINE_ASM */ + +av_cold void ff_noise_init_x86(NoiseContext *n) +{ +#if HAVE_INLINE_ASM + int cpu_flags = av_get_cpu_flags(); + + if (INLINE_MMX(cpu_flags)) { + n->line_noise = line_noise_mmx; +#if HAVE_6REGS + n->line_noise_avg = line_noise_avg_mmx; +#endif + } + if 
(INLINE_MMXEXT(cpu_flags)) { + n->line_noise = line_noise_mmxext; + } +#endif +} diff --git a/libavfilter/x86/vf_pp7.asm b/libavfilter/x86/vf_pp7.asm new file mode 100644 index 0000000000..7b3e5cf5e3 --- /dev/null +++ b/libavfilter/x86/vf_pp7.asm @@ -0,0 +1,57 @@ +;***************************************************************************** +;* x86-optimized functions for pp7 filter +;* +;* Copyright (c) 2005 Michael Niedermayer <michaelni@gmx.at> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License along +;* with FFmpeg; if not, write to the Free Software Foundation, Inc., +;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION .text + +INIT_MMX mmx + +;void ff_pp7_dctB_mmx(int16_t *dst, int16_t *src) +cglobal pp7_dctB, 2, 2, 0, dst, src + movq m0, [srcq] + movq m1, [srcq+mmsize*1] + paddw m0, [srcq+mmsize*6] + paddw m1, [srcq+mmsize*5] + movq m2, [srcq+mmsize*2] + movq m3, [srcq+mmsize*3] + paddw m2, [srcq+mmsize*4] + paddw m3, m3 + movq m4, m3 + psubw m3, m0 + paddw m4, m0 + movq m0, m2 + psubw m2, m1 + paddw m0, m1 + movq m1, m4 + psubw m4, m0 + paddw m1, m0 + movq m0, m3 + psubw m3, m2 + psubw m3, m2 + paddw m2, m0 + paddw m2, m0 + movq [dstq], m1 + movq [dstq+mmsize*2], m4 + movq [dstq+mmsize*1], m2 + movq [dstq+mmsize*3], m3 + RET diff --git a/libavfilter/x86/vf_pp7_init.c b/libavfilter/x86/vf_pp7_init.c new file mode 100644 index 0000000000..165b0dd5d0 --- /dev/null +++ b/libavfilter/x86/vf_pp7_init.c @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2005 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" +#include "libavfilter/vf_pp7.h" + +void ff_pp7_dctB_mmx(int16_t *dst, int16_t *src); + +av_cold void ff_pp7_init_x86(PP7Context *p) +{ + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_MMX(cpu_flags)) + p->dctB = ff_pp7_dctB_mmx; +} diff --git a/libavfilter/x86/vf_psnr.asm b/libavfilter/x86/vf_psnr.asm new file mode 100644 index 0000000000..ef88d6f694 --- /dev/null +++ b/libavfilter/x86/vf_psnr.asm @@ -0,0 +1,140 @@ +;***************************************************************************** +;* x86-optimized functions for psnr filter +;* +;* Copyright (C) 2015 Ronald S. Bultje <rsbultje@gmail.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details.
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION .text + +%macro SSE_LINE_FN 2 ; 8 or 16, byte or word +INIT_XMM sse2 +%if ARCH_X86_32 +%if %1 == 8 +cglobal sse_line_%1 %+ bit, 0, 6, 8, res, buf, w, px1, px2, ref +%else +cglobal sse_line_%1 %+ bit, 0, 7, 8, res, buf, reshigh, w, px1, px2, ref +%endif + mov bufq, r0mp + mov refq, r1mp + mov wd, r2m +%else +cglobal sse_line_%1 %+ bit, 3, 5, 8, buf, ref, w, px1, px2 +%endif + pxor m6, m6 + pxor m7, m7 + sub wd, mmsize*2 + jl .end + +.loop: + movu m0, [bufq+mmsize*0] + movu m1, [bufq+mmsize*1] + movu m2, [refq+mmsize*0] + movu m3, [refq+mmsize*1] +%if %1 == 8 + add bufq, mmsize*2 + add refq, mmsize*2 + psubusb m4, m0, m2 + psubusb m5, m1, m3 + psubusb m2, m0 + psubusb m3, m1 + por m2, m4 + por m3, m5 + punpcklbw m0, m2, m6 + punpcklbw m1, m3, m6 + punpckhbw m2, m6 + punpckhbw m3, m6 +%else + psubw m0, m2 + psubw m1, m3 + movu m2, [bufq+mmsize*2] + movu m3, [bufq+mmsize*3] + movu m4, [refq+mmsize*2] + movu m5, [refq+mmsize*3] + psubw m2, m4 + psubw m3, m5 + add bufq, mmsize*4 + add refq, mmsize*4 +%endif + pmaddwd m0, m0 + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m3, m3 + paddd m0, m1 + paddd m2, m3 +%if %1 == 8 + paddd m7, m0 + paddd m7, m2 +%else + paddd m0, m2 + punpckldq m2, m0, m6 + punpckhdq m0, m6 + paddq m7, m0 + paddq m7, m2 +%endif + sub wd, mmsize*2 + jge .loop + +.end: + add wd, mmsize*2 + movhlps m0, m7 +%if %1 == 8 + paddd m7, m0 + pshufd m0, m7, 1 + paddd m7, m0 + movd eax, m7 +%else + paddq m7, m0 +%if ARCH_X86_32 + movd eax, m7 + psrldq m7, 4 + movd edx, m7 +%else + movq rax, m7 +%endif +%endif + + ; deal with cases where w % 32 != 0 + test wd, wd + jz .end_scalar +.loop_scalar: + movzx px1d, %2 
[bufq+wq*(%1/8)-(%1/8)] + movzx px2d, %2 [refq+wq*(%1/8)-(%1/8)] + sub px1d, px2d + imul px1d, px1d +%if %1 == 8 + add eax, px1d +%elif ARCH_X86_64 + add rax, px1q +%else + add eax, px1d + adc edx, 0 +%endif + dec wd + jg .loop_scalar + +.end_scalar: + ; for %1=8, no need to zero edx on x86-32, since edx=wd, which is zero + RET +%endmacro + +INIT_XMM sse2 +SSE_LINE_FN 8, byte +SSE_LINE_FN 16, word diff --git a/libavfilter/x86/vf_psnr_init.c b/libavfilter/x86/vf_psnr_init.c new file mode 100644 index 0000000000..c387812204 --- /dev/null +++ b/libavfilter/x86/vf_psnr_init.c @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2015 Ronald S. Bultje <rsbultje@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/x86/cpu.h" + +#include "libavfilter/psnr.h" + +uint64_t ff_sse_line_8bit_sse2(const uint8_t *buf, const uint8_t *ref, int w); +uint64_t ff_sse_line_16bit_sse2(const uint8_t *buf, const uint8_t *ref, int w); + +void ff_psnr_init_x86(PSNRDSPContext *dsp, int bpp) +{ + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_SSE2(cpu_flags)) { + if (bpp <= 8) { + dsp->sse_line = ff_sse_line_8bit_sse2; + } else if (bpp <= 15) { + dsp->sse_line = ff_sse_line_16bit_sse2; + } + } +} diff --git a/libavfilter/x86/vf_pullup.asm b/libavfilter/x86/vf_pullup.asm new file mode 100644 index 0000000000..d3a195511e --- /dev/null +++ b/libavfilter/x86/vf_pullup.asm @@ -0,0 +1,178 @@ +;***************************************************************************** +;* x86-optimized functions for pullup filter +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. +;* +;* You should have received a copy of the GNU General Public License along +;* with FFmpeg; if not, write to the Free Software Foundation, Inc., +;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_TEXT + +INIT_MMX mmx +cglobal pullup_filter_diff, 3, 5, 8, first, second, size + mov r3, 4 + pxor m4, m4 + pxor m7, m7 + +.loop: + movq m0, [firstq] + movq m2, [firstq] + add firstq, sizeq + movq m1, [secondq] + add secondq, sizeq + psubusb m2, m1 + psubusb m1, m0 + movq m0, m2 + movq m3, m1 + punpcklbw m0, m7 + punpcklbw m1, m7 + punpckhbw m2, m7 + punpckhbw m3, m7 + paddw m4, m0 + paddw m4, m1 + paddw m4, m2 + paddw m4, m3 + + dec r3 + jnz .loop + + movq m3, m4 + punpcklwd m4, m7 + punpckhwd m3, m7 + paddd m3, m4 + movd eax, m3 + psrlq m3, 32 + movd r4d, m3 + add eax, r4d + RET + +INIT_MMX mmx +cglobal pullup_filter_comb, 3, 5, 8, first, second, size + mov r3, 4 + pxor m6, m6 + pxor m7, m7 + sub secondq, sizeq + +.loop: + movq m0, [firstq] + movq m1, [secondq] + punpcklbw m0, m7 + movq m2, [secondq+sizeq] + punpcklbw m1, m7 + punpcklbw m2, m7 + paddw m0, m0 + paddw m1, m2 + movq m2, m0 + psubusw m0, m1 + psubusw m1, m2 + paddw m6, m0 + paddw m6, m1 + + movq m0, [firstq] + movq m1, [secondq] + punpckhbw m0, m7 + movq m2, [secondq+sizeq] + punpckhbw m1, m7 + punpckhbw m2, m7 + paddw m0, m0 + paddw m1, m2 + movq m2, m0 + psubusw m0, m1 + psubusw m1, m2 + paddw m6, m0 + paddw m6, m1 + + movq m0, [secondq+sizeq] + movq m1, [firstq] + punpcklbw m0, m7 + movq m2, [firstq+sizeq] + punpcklbw m1, m7 + punpcklbw m2, m7 + paddw m0, m0 + paddw m1, m2 + movq m2, m0 + psubusw m0, m1 + psubusw m1, m2 + paddw m6, m0 + paddw m6, m1 + + movq m0, [secondq+sizeq] + movq m1, [firstq] + punpckhbw m0, m7 + movq m2, [firstq+sizeq] + punpckhbw m1, m7 + punpckhbw m2, m7 + paddw m0, m0 + paddw m1, m2 + movq m2, m0 + psubusw m0, m1 + psubusw m1, m2 + paddw m6, m0 + paddw m6, m1 + + add firstq, sizeq + add secondq, sizeq + dec r3 + jnz .loop + + movq m5, m6 + punpcklwd m6, m7 + punpckhwd m5, m7 + paddd m5, m6 + movd eax, m5 + psrlq m5, 32 + movd r4d, m5 + add 
eax, r4d + RET + +INIT_MMX mmx +cglobal pullup_filter_var, 3, 5, 8, first, second, size + mov r3, 3 + pxor m4, m4 + pxor m7, m7 + +.loop: + movq m0, [firstq] + movq m2, [firstq] + movq m1, [firstq+sizeq] + add firstq, sizeq + psubusb m2, m1 + psubusb m1, m0 + movq m0, m2 + movq m3, m1 + punpcklbw m0, m7 + punpcklbw m1, m7 + punpckhbw m2, m7 + punpckhbw m3, m7 + paddw m4, m0 + paddw m4, m1 + paddw m4, m2 + paddw m4, m3 + + dec r3 + jnz .loop + + movq m3, m4 + punpcklwd m4, m7 + punpckhwd m3, m7 + paddd m3, m4 + movd eax, m3 + psrlq m3, 32 + movd r4d, m3 + add eax, r4d + shl eax, 2 + RET diff --git a/libavfilter/x86/vf_pullup_init.c b/libavfilter/x86/vf_pullup_init.c new file mode 100644 index 0000000000..5b36b68e51 --- /dev/null +++ b/libavfilter/x86/vf_pullup_init.c @@ -0,0 +1,41 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/mem.h" +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" +#include "libavfilter/vf_pullup.h" + +int ff_pullup_filter_diff_mmx(const uint8_t *a, const uint8_t *b, ptrdiff_t s); +int ff_pullup_filter_comb_mmx(const uint8_t *a, const uint8_t *b, ptrdiff_t s); +int ff_pullup_filter_var_mmx (const uint8_t *a, const uint8_t *b, ptrdiff_t s); + +av_cold void ff_pullup_init_x86(PullupContext *s) +{ +#if HAVE_YASM + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_MMX(cpu_flags)) { + s->diff = ff_pullup_filter_diff_mmx; + s->comb = ff_pullup_filter_comb_mmx; + s->var = ff_pullup_filter_var_mmx; + } +#endif +} diff --git a/libavfilter/x86/vf_removegrain.asm b/libavfilter/x86/vf_removegrain.asm new file mode 100644 index 0000000000..c09f89ea30 --- /dev/null +++ b/libavfilter/x86/vf_removegrain.asm @@ -0,0 +1,1218 @@ +;***************************************************************************** +;* x86-optimized functions for removegrain filter +;* +;* Copyright (C) 2015 James Darnley +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. 
+;* +;* You should have received a copy of the GNU General Public License along +;* with FFmpeg; if not, write to the Free Software Foundation, Inc., +;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +;***************************************************************************** + +; column: -1 0 +1 +; row -1: a1 a2 a3 +; row 0: a4 c a5 +; row +1: a6 a7 a8 + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA 32 + +pw_4: times 16 dw 4 +pw_8: times 16 dw 8 +pw_div9: times 16 dw ((1<<16)+4)/9 + +SECTION_TEXT + +;*** Preprocessor helpers + +%define a1 srcq+stride_n-1 +%define a2 srcq+stride_n +%define a3 srcq+stride_n+1 +%define a4 srcq-1 +%define c srcq +%define a5 srcq+1 +%define a6 srcq+stride_p-1 +%define a7 srcq+stride_p +%define a8 srcq+stride_p+1 + +; %1 dest simd register +; %2 source memory location +; %3 zero location (simd register/memory) +%macro LOAD 3 + movh %1, %2 + punpcklbw %1, %3 +%endmacro + +%macro LOAD_SQUARE 0 + movu m1, [a1] + movu m2, [a2] + movu m3, [a3] + movu m4, [a4] + movu m0, [c] + movu m5, [a5] + movu m6, [a6] + movu m7, [a7] + movu m8, [a8] +%endmacro + +; %1 zero location (simd register/memory) +%macro LOAD_SQUARE_16 1 + LOAD m1, [a1], %1 + LOAD m2, [a2], %1 + LOAD m3, [a3], %1 + LOAD m4, [a4], %1 + LOAD m0, [c], %1 + LOAD m5, [a5], %1 + LOAD m6, [a6], %1 + LOAD m7, [a7], %1 + LOAD m8, [a8], %1 +%endmacro + +; %1 data type +; %2 simd register to hold maximums +; %3 simd register to hold minimums +; %4 temp location (simd register/memory) +%macro SORT_PAIR 4 + mova %4, %2 + pmin%1 %2, %3 + pmax%1 %3, %4 +%endmacro + +%macro SORT_AXIS 0 + SORT_PAIR ub, m1, m8, m9 + SORT_PAIR ub, m2, m7, m10 + SORT_PAIR ub, m3, m6, m11 + SORT_PAIR ub, m4, m5, m12 +%endmacro + + +%macro SORT_AXIS_16 0 + SORT_PAIR sw, m1, m8, m9 + SORT_PAIR sw, m2, m7, m10 + SORT_PAIR sw, m3, m6, m11 + SORT_PAIR sw, m4, m5, m12 +%endmacro + +; The loop doesn't need to do all the iterations. It could stop when the right +; pixels are in the right registers. 
+%macro SORT_SQUARE 0 + %assign k 7 + %rep 7 + %assign i 1 + %assign j 2 + %rep k + SORT_PAIR ub, m %+ i , m %+ j , m9 + %assign i i+1 + %assign j j+1 + %endrep + %assign k k-1 + %endrep +%endmacro + +; %1 dest simd register +; %2 source (simd register/memory) +; %3 temp simd register +%macro ABS_DIFF 3 + mova %3, %2 + psubusb %3, %1 + psubusb %1, %2 + por %1, %3 +%endmacro + +; %1 dest simd register +; %2 source (simd register/memory) +; %3 temp simd register +%macro ABS_DIFF_W 3 + mova %3, %2 + psubusw %3, %1 + psubusw %1, %2 + por %1, %3 +%endmacro + +; %1 simd register that holds the "false" values and will hold the result +; %2 simd register that holds the "true" values +; %3 location (simd register/memory) that hold the mask +%macro BLEND 3 +%if cpuflag(avx2) + vpblendvb %1, %1, %2, %3 +%else + pand %2, %3 + pandn %3, %1 + por %3, %2 + SWAP %1, %3 +%endif +%endmacro + +; Functions + +INIT_XMM sse2 +cglobal rg_fl_mode_1, 4, 5, 3, 0, dst, src, stride, pixels + mov r4q, strideq + neg r4q + %define stride_p strideq + %define stride_n r4q + + .loop: + movu m0, [a1] + mova m1, m0 + + movu m2, [a2] + pmaxub m0, m2 + pminub m1, m2 + + movu m2, [a3] + pmaxub m0, m2 + pminub m1, m2 + + movu m2, [a4] + pmaxub m0, m2 + pminub m1, m2 + + movu m2, [a5] + pmaxub m0, m2 + pminub m1, m2 + + movu m2, [a6] + pmaxub m0, m2 + pminub m1, m2 + + movu m2, [a7] + pmaxub m0, m2 + pminub m1, m2 + + movu m2, [a8] + pmaxub m0, m2 + pminub m1, m2 + + movu m2, [c] + pminub m2, m0 + pmaxub m2, m1 + + movu [dstq], m2 + add srcq, mmsize + add dstq, mmsize + sub pixelsd, mmsize + jg .loop +RET + +%if ARCH_X86_64 +cglobal rg_fl_mode_2, 4, 5, 10, 0, dst, src, stride, pixels + mov r4q, strideq + neg r4q + %define stride_p strideq + %define stride_n r4q + + .loop: + LOAD_SQUARE + SORT_SQUARE + + CLIPUB m0, m2, m7 + + movu [dstq], m0 + add srcq, mmsize + add dstq, mmsize + sub pixelsd, mmsize + jg .loop +RET + +cglobal rg_fl_mode_3, 4, 5, 10, 0, dst, src, stride, pixels + mov r4q, strideq + neg r4q 
+ %define stride_p strideq + %define stride_n r4q + + .loop: + LOAD_SQUARE + SORT_SQUARE + + CLIPUB m0, m3, m6 + + movu [dstq], m0 + add srcq, mmsize + add dstq, mmsize + sub pixelsd, mmsize + jg .loop +RET + +cglobal rg_fl_mode_4, 4, 5, 10, 0, dst, src, stride, pixels + mov r4q, strideq + neg r4q + %define stride_p strideq + %define stride_n r4q + + .loop: + LOAD_SQUARE + SORT_SQUARE + + CLIPUB m0, m4, m5 + + movu [dstq], m0 + add srcq, mmsize + add dstq, mmsize + sub pixelsd, mmsize + jg .loop +RET + +cglobal rg_fl_mode_5, 4, 5, 13, 0, dst, src, stride, pixels + mov r4q, strideq + neg r4q + %define stride_p strideq + %define stride_n r4q + + .loop: + LOAD_SQUARE + SORT_AXIS + + mova m9, m0 + mova m10, m0 + mova m11, m0 + mova m12, m0 + + CLIPUB m9, m1, m8 + CLIPUB m10, m2, m7 + CLIPUB m11, m3, m6 + CLIPUB m12, m4, m5 + + mova m8, m9 ; clip1 + mova m7, m10 ; clip2 + mova m6, m11 ; clip3 + mova m5, m12 ; clip4 + + ABS_DIFF m9, m0, m1 ; c1 + ABS_DIFF m10, m0, m2 ; c2 + ABS_DIFF m11, m0, m3 ; c3 + ABS_DIFF m12, m0, m4 ; c4 + + pminub m9, m10 + pminub m9, m11 + pminub m9, m12 ; mindiff + + pcmpeqb m10, m9 + pcmpeqb m11, m9 + pcmpeqb m12, m9 + + ; Notice the order here: c1, c3, c2, c4 + BLEND m8, m6, m11 + BLEND m8, m7, m10 + BLEND m8, m5, m12 + + movu [dstq], m8 + add srcq, mmsize + add dstq, mmsize + sub pixelsd, mmsize + jg .loop +RET + +cglobal rg_fl_mode_6, 4, 5, 16, 0, dst, src, stride, pixels + mov r4q, strideq + neg r4q + %define stride_p strideq + %define stride_n r4q + + ; Some register saving suggestions: the zero can be somewhere other than a + ; register, the center pixels could be on the stack. 
+ + pxor m15, m15 + .loop: + LOAD_SQUARE_16 m15 + SORT_AXIS_16 + + mova m9, m0 + mova m10, m0 + mova m11, m0 + mova m12, m0 + CLIPW m9, m1, m8 ; clip1 + CLIPW m10, m2, m7 ; clip2 + CLIPW m11, m3, m6 ; clip3 + CLIPW m12, m4, m5 ; clip4 + + psubw m8, m1 ; d1 + psubw m7, m2 ; d2 + psubw m6, m3 ; d3 + psubw m5, m4 ; d4 + + mova m1, m9 + mova m2, m10 + mova m3, m11 + mova m4, m12 + ABS_DIFF_W m1, m0, m13 + ABS_DIFF_W m2, m0, m14 + ABS_DIFF_W m3, m0, m13 + ABS_DIFF_W m4, m0, m14 + psllw m1, 1 + psllw m2, 1 + psllw m3, 1 + psllw m4, 1 + paddw m1, m8 ; c1 + paddw m2, m7 ; c2 + paddw m3, m6 ; c3 + paddw m4, m5 ; c4 + ; As the differences (d1..d4) can only be postive, there is no need to + ; clip to zero. Also, the maximum positive value is less than 768. + + pminsw m1, m2 + pminsw m1, m3 + pminsw m1, m4 + + pcmpeqw m2, m1 + pcmpeqw m3, m1 + pcmpeqw m4, m1 + + BLEND m9, m11, m3 + BLEND m9, m10, m2 + BLEND m9, m12, m4 + packuswb m9, m9 + + movh [dstq], m9 + add srcq, mmsize/2 + add dstq, mmsize/2 + sub pixelsd, mmsize/2 + jg .loop +RET + +; This is just copy-pasted straight from mode 6 with the left shifts removed. +cglobal rg_fl_mode_7, 4, 5, 16, 0, dst, src, stride, pixels + mov r4q, strideq + neg r4q + %define stride_p strideq + %define stride_n r4q + + ; Can this be done without unpacking? 
+ + pxor m15, m15 + .loop: + LOAD_SQUARE_16 m15 + SORT_AXIS_16 + + mova m9, m0 + mova m10, m0 + mova m11, m0 + mova m12, m0 + CLIPW m9, m1, m8 ; clip1 + CLIPW m10, m2, m7 ; clip2 + CLIPW m11, m3, m6 ; clip3 + CLIPW m12, m4, m5 ; clip4 + + psubw m8, m1 ; d1 + psubw m7, m2 ; d2 + psubw m6, m3 ; d3 + psubw m5, m4 ; d4 + + mova m1, m9 + mova m2, m10 + mova m3, m11 + mova m4, m12 + ABS_DIFF_W m1, m0, m13 + ABS_DIFF_W m2, m0, m14 + ABS_DIFF_W m3, m0, m13 + ABS_DIFF_W m4, m0, m14 + paddw m1, m8 ; c1 + paddw m2, m7 ; c2 + paddw m3, m6 ; c3 + paddw m4, m5 ; c4 + + pminsw m1, m2 + pminsw m1, m3 + pminsw m1, m4 + + pcmpeqw m2, m1 + pcmpeqw m3, m1 + pcmpeqw m4, m1 + + BLEND m9, m11, m3 + BLEND m9, m10, m2 + BLEND m9, m12, m4 + packuswb m9, m9 + + movh [dstq], m9 + add srcq, mmsize/2 + add dstq, mmsize/2 + sub pixelsd, mmsize/2 + jg .loop +RET + +; This is just copy-pasted straight from mode 6 with a few changes. +cglobal rg_fl_mode_8, 4, 5, 16, 0, dst, src, stride, pixels + mov r4q, strideq + neg r4q + %define stride_p strideq + %define stride_n r4q + + pxor m15, m15 + .loop: + LOAD_SQUARE_16 m15 + SORT_AXIS_16 + + mova m9, m0 + mova m10, m0 + mova m11, m0 + mova m12, m0 + CLIPW m9, m1, m8 ; clip1 + CLIPW m10, m2, m7 ; clip2 + CLIPW m11, m3, m6 ; clip3 + CLIPW m12, m4, m5 ; clip4 + + psubw m8, m1 ; d1 + psubw m7, m2 ; d2 + psubw m6, m3 ; d3 + psubw m5, m4 ; d4 + psllw m8, 1 + psllw m7, 1 + psllw m6, 1 + psllw m5, 1 + + mova m1, m9 + mova m2, m10 + mova m3, m11 + mova m4, m12 + ABS_DIFF_W m1, m0, m13 + ABS_DIFF_W m2, m0, m14 + ABS_DIFF_W m3, m0, m13 + ABS_DIFF_W m4, m0, m14 + paddw m1, m8 ; c1 + paddw m2, m7 ; c1 + paddw m3, m6 ; c1 + paddw m4, m5 ; c1 + ; As the differences (d1..d4) can only be postive, there is no need to + ; clip to zero. Also, the maximum positive value is less than 768. 
+ + pminsw m1, m2 + pminsw m1, m3 + pminsw m1, m4 + + pcmpeqw m2, m1 + pcmpeqw m3, m1 + pcmpeqw m4, m1 + + BLEND m9, m11, m3 + BLEND m9, m10, m2 + BLEND m9, m12, m4 + packuswb m9, m9 + + movh [dstq], m9 + add srcq, mmsize/2 + add dstq, mmsize/2 + sub pixelsd, mmsize/2 + jg .loop +RET + +cglobal rg_fl_mode_9, 4, 5, 13, 0, dst, src, stride, pixels + mov r4q, strideq + neg r4q + %define stride_p strideq + %define stride_n r4q + + .loop: + LOAD_SQUARE + SORT_AXIS + + mova m9, m0 + mova m10, m0 + mova m11, m0 + mova m12, m0 + CLIPUB m9, m1, m8 ; clip1 + CLIPUB m10, m2, m7 ; clip2 + CLIPUB m11, m3, m6 ; clip3 + CLIPUB m12, m4, m5 ; clip4 + + psubb m8, m1 ; d1 + psubb m7, m2 ; d2 + psubb m6, m3 ; d3 + psubb m5, m4 ; d4 + + pminub m8, m7 + pminub m8, m6 + pminub m8, m5 + + pcmpeqb m7, m8 + pcmpeqb m6, m8 + pcmpeqb m5, m8 + + BLEND m9, m11, m6 + BLEND m9, m10, m7 + BLEND m9, m12, m5 + + movu [dstq], m9 + add srcq, mmsize + add dstq, mmsize + sub pixelsd, mmsize + jg .loop +RET +%endif + +cglobal rg_fl_mode_10, 4, 5, 8, 0, dst, src, stride, pixels + mov r4q, strideq + neg r4q + %define stride_p strideq + %define stride_n r4q + + .loop: + movu m0, [c] + + movu m1, [a4] + mova m2, m1 + ABS_DIFF m1, m0, m7 + + movu m3, [a5] ; load pixel + mova m4, m3 + ABS_DIFF m4, m0, m7 ; absolute difference from center + pminub m1, m4 ; mindiff + pcmpeqb m4, m1 ; if (difference == mindiff) + BLEND m2, m3, m4 ; return pixel + + movu m5, [a1] + mova m6, m5 + ABS_DIFF m6, m0, m7 + pminub m1, m6 + pcmpeqb m6, m1 + BLEND m2, m5, m6 + + movu m3, [a3] + mova m4, m3 + ABS_DIFF m4, m0, m7 + pminub m1, m4 + pcmpeqb m4, m1 + BLEND m2, m3, m4 + + movu m5, [a2] + mova m6, m5 + ABS_DIFF m6, m0, m7 + pminub m1, m6 + pcmpeqb m6, m1 + BLEND m2, m5, m6 + + movu m3, [a6] + mova m4, m3 + ABS_DIFF m4, m0, m7 + pminub m1, m4 + pcmpeqb m4, m1 + BLEND m2, m3, m4 + + movu m5, [a8] + mova m6, m5 + ABS_DIFF m6, m0, m7 + pminub m1, m6 + pcmpeqb m6, m1 + BLEND m2, m5, m6 + + movu m3, [a7] + mova m4, m3 + ABS_DIFF m4, 
m0, m7 + pminub m1, m4 + pcmpeqb m4, m1 + BLEND m2, m3, m4 + + movu [dstq], m2 + add srcq, mmsize + add dstq, mmsize + sub pixelsd, mmsize + jg .loop +RET + +cglobal rg_fl_mode_11_12, 4, 5, 7, 0, dst, src, stride, pixels + mov r4q, strideq + neg r4q + %define stride_p strideq + %define stride_n r4q + + pxor m0, m0 + .loop: + LOAD m1, [c], m0 + LOAD m2, [a2], m0 + LOAD m3, [a4], m0 + LOAD m4, [a5], m0 + LOAD m5, [a7], m0 + + psllw m1, 2 + paddw m2, m3 + paddw m4, m5 + paddw m2, m4 + psllw m2, 1 + + LOAD m3, [a1], m0 + LOAD m4, [a3], m0 + LOAD m5, [a6], m0 + LOAD m6, [a8], m0 + paddw m1, m2 + paddw m3, m4 + paddw m5, m6 + paddw m1, m3 + paddw m1, m5 + + paddw m1, [pw_8] + psraw m1, 4 + + packuswb m1, m1 + + movh [dstq], m1 + add srcq, mmsize/2 + add dstq, mmsize/2 + sub pixelsd, mmsize/2 + jg .loop +RET + +cglobal rg_fl_mode_13_14, 4, 5, 8, 0, dst, src, stride, pixels + mov r4q, strideq + neg r4q + %define stride_p strideq + %define stride_n r4q + + .loop: + movu m1, [a1] + movu m2, [a8] + mova m0, m1 + pavgb m1, m2 + ABS_DIFF m0, m2, m6 + + movu m3, [a3] + movu m4, [a6] + mova m5, m3 + pavgb m3, m4 + ABS_DIFF m5, m4, m7 + pminub m0, m5 + pcmpeqb m5, m0 + BLEND m1, m3, m5 + + movu m2, [a2] + movu m3, [a7] + mova m4, m2 + pavgb m2, m3 + ABS_DIFF m4, m3, m6 + pminub m0, m4 + pcmpeqb m4, m0 + BLEND m1, m2, m4 + + movu [dstq], m1 + add srcq, mmsize + add dstq, mmsize + sub pixelsd, mmsize + jg .loop +RET + +%if ARCH_X86_64 +cglobal rg_fl_mode_15_16, 4, 5, 16, 0, dst, src, stride, pixels + mov r4q, strideq + neg r4q + %define stride_p strideq + %define stride_n r4q + + pxor m15, m15 + .loop: + LOAD_SQUARE_16 m15 + + mova m9, m1 + mova m10, m2 + mova m11, m3 + ABS_DIFF_W m9, m8, m12 + ABS_DIFF_W m10, m7, m13 + ABS_DIFF_W m11, m6, m14 + pminsw m9, m10 + pminsw m9, m11 + pcmpeqw m10, m9 + pcmpeqw m11, m9 + + mova m12, m2 + mova m13, m1 + mova m14, m6 + paddw m12, m7 + psllw m12, 1 + paddw m13, m3 + paddw m14, m8 + paddw m12, [pw_4] + paddw m13, m14 + paddw m12, m13 + psrlw 
m12, 3 + + SORT_PAIR ub, m1, m8, m0 + SORT_PAIR ub, m2, m7, m9 + SORT_PAIR ub, m3, m6, m14 + mova m4, m12 + mova m5, m12 + CLIPW m4, m1, m8 + CLIPW m5, m2, m7 + CLIPW m12, m3, m6 + + BLEND m4, m12, m11 + BLEND m4, m5, m10 + packuswb m4, m4 + + movh [dstq], m4 + add srcq, mmsize/2 + add dstq, mmsize/2 + sub pixelsd, mmsize/2 + jg .loop +RET + +cglobal rg_fl_mode_17, 4, 5, 9, 0, dst, src, stride, pixels + mov r4q, strideq + neg r4q + %define stride_p strideq + %define stride_n r4q + + .loop: + LOAD_SQUARE + SORT_AXIS + + pmaxub m1, m2 + pmaxub m3, m4 + + pminub m8, m7 + pminub m5, m6 + + pmaxub m1, m3 + pminub m8, m5 + + mova m2, m1 + pminub m1, m8 + pmaxub m8, m2 + + CLIPUB m0, m1, m8 + + movu [dstq], m0 + add srcq, mmsize + add dstq, mmsize + sub pixelsd, mmsize + jg .loop +RET + +cglobal rg_fl_mode_18, 4, 5, 16, 0, dst, src, stride, pixels + mov r4q, strideq + neg r4q + %define stride_p strideq + %define stride_n r4q + + .loop: + LOAD_SQUARE + + mova m9, m1 + mova m10, m8 + ABS_DIFF m9, m0, m11 + ABS_DIFF m10, m0, m12 + pmaxub m9, m10 ; m9 = d1 + + mova m10, m2 + mova m11, m7 + ABS_DIFF m10, m0, m12 + ABS_DIFF m11, m0, m13 + pmaxub m10, m11 ; m10 = d2 + + mova m11, m3 + mova m12, m6 + ABS_DIFF m11, m0, m13 + ABS_DIFF m12, m0, m14 + pmaxub m11, m12 ; m11 = d3 + + mova m12, m4 + mova m13, m5 + ABS_DIFF m12, m0, m14 + ABS_DIFF m13, m0, m15 + pmaxub m12, m13 ; m12 = d4 + + mova m13, m9 + pminub m13, m10 + pminub m13, m11 + pminub m13, m12 ; m13 = mindiff + + pcmpeqb m10, m13 + pcmpeqb m11, m13 + pcmpeqb m12, m13 + + mova m14, m1 + pminub m1, m8 + pmaxub m8, m14 + + mova m13, m0 + mova m14, m1 + pminub m1, m8 + pmaxub m8, m14 + CLIPUB m13, m1, m8 ; m13 = ret...d1 + + mova m14, m0 + mova m15, m3 + pminub m3, m6 + pmaxub m6, m15 + CLIPUB m14, m3, m6 + pand m14, m11 + pandn m11, m13 + por m14, m11 ; m14 = ret...d3 + + mova m15, m0 + mova m1, m2 + pminub m2, m7 + pmaxub m7, m1 + CLIPUB m15, m2, m7 + pand m15, m10 + pandn m10, m14 + por m15, m10 ; m15 = ret...d2 + + mova 
m1, m0 + mova m2, m4 + pminub m4, m5 + pmaxub m5, m2 + CLIPUB m1, m4, m5 + pand m1, m12 + pandn m12, m15 + por m1, m12 ; m15 = ret...d4 + + movu [dstq], m1 + add srcq, mmsize + add dstq, mmsize + sub pixelsd, mmsize + jg .loop +RET +%endif + +cglobal rg_fl_mode_19, 4, 5, 7, 0, dst, src, stride, pixels + mov r4q, strideq + neg r4q + %define stride_p strideq + %define stride_n r4q + + pxor m0, m0 + .loop: + LOAD m1, [a1], m0 + LOAD m2, [a2], m0 + paddw m1, m2 + + LOAD m3, [a3], m0 + LOAD m4, [a4], m0 + paddw m3, m4 + + LOAD m5, [a5], m0 + LOAD m6, [a6], m0 + paddw m5, m6 + + LOAD m2, [a7], m0 + LOAD m4, [a8], m0 + paddw m2, m4 + + paddw m1, m3 + paddw m2, m5 + paddw m1, m2 + + paddw m1, [pw_4] + psraw m1, 3 + + packuswb m1, m1 + + movh [dstq], m1 + add srcq, mmsize/2 + add dstq, mmsize/2 + sub pixelsd, mmsize/2 + jg .loop +RET + +cglobal rg_fl_mode_20, 4, 5, 7, 0, dst, src, stride, pixels + mov r4q, strideq + neg r4q + %define stride_p strideq + %define stride_n r4q + + pxor m0, m0 + .loop: + LOAD m1, [a1], m0 + LOAD m2, [a2], m0 + paddw m1, m2 + + LOAD m3, [a3], m0 + LOAD m4, [a4], m0 + paddw m3, m4 + + LOAD m5, [a5], m0 + LOAD m6, [a6], m0 + paddw m5, m6 + + LOAD m2, [a7], m0 + LOAD m4, [a8], m0 + paddw m2, m4 + + LOAD m6, [c], m0 + paddw m1, m3 + paddw m2, m5 + paddw m6, [pw_4] + + paddw m1, m2 + paddw m1, m6 + + pmulhuw m1, [pw_div9] + + packuswb m1, m1 + + movh [dstq], m1 + add srcq, mmsize/2 + add dstq, mmsize/2 + sub pixelsd, mmsize/2 + jg .loop +RET + +cglobal rg_fl_mode_21, 4, 5, 8, 0, dst, src, stride, pixels + mov r4q, strideq + neg r4q + %define stride_p strideq + %define stride_n r4q + + pxor m0, m0 + .loop: + movu m1, [a1] + movu m2, [a8] + pavgb m7, m1, m2 + punpckhbw m3, m1, m0 + punpcklbw m1, m0 + punpckhbw m4, m2, m0 + punpcklbw m2, m0 + paddw m3, m4 + paddw m1, m2 + psrlw m3, 1 + psrlw m1, 1 + packuswb m1, m3 + + movu m2, [a2] + movu m3, [a7] + pavgb m6, m2, m3 + punpckhbw m4, m2, m0 + punpcklbw m2, m0 + punpckhbw m5, m3, m0 + punpcklbw m3, m0 + 
paddw m4, m5 + paddw m2, m3 + psrlw m4, 1 + psrlw m2, 1 + packuswb m2, m4 + + pminub m1, m2 + pmaxub m7, m6 + + movu m2, [a3] + movu m3, [a6] + pavgb m6, m2, m3 + punpckhbw m4, m2, m0 + punpcklbw m2, m0 + punpckhbw m5, m3, m0 + punpcklbw m3, m0 + paddw m4, m5 + paddw m2, m3 + psrlw m4, 1 + psrlw m2, 1 + packuswb m2, m4 + + pminub m1, m2 + pmaxub m7, m6 + + movu m2, [a4] + movu m3, [a5] + pavgb m6, m2, m3 + punpckhbw m4, m2, m0 + punpcklbw m2, m0 + punpckhbw m5, m3, m0 + punpcklbw m3, m0 + paddw m4, m5 + paddw m2, m3 + psrlw m4, 1 + psrlw m2, 1 + packuswb m2, m4 + + pminub m1, m2 + pmaxub m7, m6 + + movu m3, [c] + CLIPUB m3, m1, m7 + + movu [dstq], m3 + add srcq, mmsize + add dstq, mmsize + sub pixelsd, mmsize + jg .loop +RET + +cglobal rg_fl_mode_22, 4, 5, 8, 0, dst, src, stride, pixels + mov r4q, strideq + neg r4q + %define stride_p strideq + %define stride_n r4q + + .loop: + movu m0, [a1] + movu m1, [a8] + pavgb m0, m1 + movu m2, [a2] + movu m3, [a7] + pavgb m2, m3 + movu m4, [a3] + movu m5, [a6] + pavgb m4, m5 + movu m6, [a4] + movu m7, [a5] + pavgb m6, m7 + + mova m1, m0 + mova m3, m2 + mova m5, m4 + mova m7, m6 + pminub m0, m2 + pminub m4, m6 + pmaxub m1, m3 + pmaxub m5, m7 + pminub m0, m4 + pmaxub m1, m5 + + movu m2, [c] + CLIPUB m2, m0, m1 + + movu [dstq], m2 + add srcq, mmsize + add dstq, mmsize + sub pixelsd, mmsize + jg .loop +RET + +%if ARCH_X86_64 +cglobal rg_fl_mode_23, 4, 5, 16, 0, dst, src, stride, pixels + mov r4q, strideq + neg r4q + %define stride_p strideq + %define stride_n r4q + + pxor m15, m15 + .loop: + LOAD_SQUARE_16 m15 + SORT_AXIS_16 + + mova m9, m8 + mova m10, m7 + mova m11, m6 + mova m12, m5 + psubw m9, m1 ; linediff1 + psubw m10, m2 ; linediff2 + psubw m11, m3 ; linediff3 + psubw m12, m4 ; linediff4 + + psubw m1, m0 + psubw m2, m0 + psubw m3, m0 + psubw m4, m0 + pminsw m1, m9 ; d1 + pminsw m2, m10 ; d2 + pminsw m3, m11 ; d3 + pminsw m4, m12 ; d4 + pmaxsw m1, m2 + pmaxsw m3, m4 + pmaxsw m1, m3 + pmaxsw m1, m15 ; d + + mova m13, m0 + mova 
m14, m0 + mova m2, m0 + mova m4, m0 + psubw m13, m8 + psubw m14, m7 + psubw m2, m6 + psubw m4, m5 + pminsw m9, m13 ; u1 + pminsw m10, m14 ; u2 + pminsw m11, m2 ; u3 + pminsw m12, m4 ; u4 + pmaxsw m9, m10 + pmaxsw m11, m12 + pmaxsw m9, m11 + pmaxsw m9, m15 ; u + + paddw m0, m1 + psubw m0, m9 + packuswb m0, m0 + + movh [dstq], m0 + add srcq, mmsize/2 + add dstq, mmsize/2 + sub pixelsd, mmsize/2 + jg .loop +RET + +cglobal rg_fl_mode_24, 4, 5, 16, mmsize, dst, src, stride, pixels + mov r4q, strideq + neg r4q + %define stride_p strideq + %define stride_n r4q + + pxor m15, m15 + .loop: + LOAD_SQUARE_16 m15 + mova [rsp], m0 + SORT_AXIS_16 + + mova m9, m8 + mova m10, m7 + mova m11, m6 + mova m12, m5 + psubw m9, m1 ; linediff1 + psubw m10, m2 ; linediff2 + psubw m11, m3 ; linediff3 + psubw m12, m4 ; linediff4 + + psubw m1, [rsp] ; td1 + psubw m2, [rsp] ; td2 + psubw m3, [rsp] ; td3 + psubw m4, [rsp] ; td4 + mova m0, m9 + mova m13, m10 + mova m14, m11 + mova m15, m12 + psubw m0, m1 + psubw m13, m2 + psubw m14, m3 + psubw m15, m4 + pminsw m1, m0 ; d1 + pminsw m2, m13 ; d2 + pminsw m3, m14 ; d3 + pminsw m4, m15 ; d4 + pmaxsw m1, m2 + pmaxsw m3, m4 + + mova m0, [rsp] + mova m13, [rsp] + mova m14, [rsp] + mova m15, [rsp] + psubw m0, m8 ; tu1 + psubw m13, m7 ; tu2 + psubw m14, m6 ; tu3 + psubw m15, m5 ; tu4 + psubw m9, m0 + psubw m10, m13 + psubw m11, m14 + psubw m12, m15 + pminsw m9, m0 ; u1 + pminsw m10, m13 ; u2 + pminsw m11, m14 ; u3 + pminsw m12, m15 ; u4 + pmaxsw m9, m10 + pmaxsw m11, m12 + + pmaxsw m1, m3 ; d without max(d,0) + pmaxsw m9, m11 ; u without max(u,0) + pxor m15, m15 + pmaxsw m1, m15 + pmaxsw m9, m15 + + mova m0, [rsp] + paddw m0, m1 + psubw m0, m9 + packuswb m0, m0 + + movh [dstq], m0 + add srcq, mmsize/2 + add dstq, mmsize/2 + sub pixelsd, mmsize/2 + jg .loop +RET +%endif diff --git a/libavfilter/x86/vf_removegrain_init.c b/libavfilter/x86/vf_removegrain_init.c new file mode 100644 index 0000000000..07314b3244 --- /dev/null +++ 
b/libavfilter/x86/vf_removegrain_init.c @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2015 James Darnley + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" +#include "libavfilter/removegrain.h" + +void ff_rg_fl_mode_1_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels); +void ff_rg_fl_mode_10_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels); +void ff_rg_fl_mode_11_12_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels); +void ff_rg_fl_mode_13_14_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels); +void ff_rg_fl_mode_19_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels); +void ff_rg_fl_mode_20_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels); +void ff_rg_fl_mode_21_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels); +void ff_rg_fl_mode_22_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels); +#if ARCH_X86_64 +void ff_rg_fl_mode_2_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels); +void ff_rg_fl_mode_3_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels); +void ff_rg_fl_mode_4_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels); +void 
ff_rg_fl_mode_5_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels); +void ff_rg_fl_mode_6_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels); +void ff_rg_fl_mode_7_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels); +void ff_rg_fl_mode_8_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels); +void ff_rg_fl_mode_9_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels); +void ff_rg_fl_mode_15_16_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels); +void ff_rg_fl_mode_17_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels); +void ff_rg_fl_mode_18_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels); +void ff_rg_fl_mode_23_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels); +void ff_rg_fl_mode_24_sse2(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int pixels); +#endif + +av_cold void ff_removegrain_init_x86(RemoveGrainContext *rg) +{ +#if CONFIG_GPL + int cpu_flags = av_get_cpu_flags(); + int i; + + for (i = 0; i < rg->nb_planes; i++) { + if (EXTERNAL_SSE2(cpu_flags)) + switch (rg->mode[i]) { + case 1: rg->fl[i] = ff_rg_fl_mode_1_sse2; break; + case 10: rg->fl[i] = ff_rg_fl_mode_10_sse2; break; + case 11: /* fall through */ + case 12: rg->fl[i] = ff_rg_fl_mode_11_12_sse2; break; + case 13: /* fall through */ + case 14: rg->fl[i] = ff_rg_fl_mode_13_14_sse2; break; + case 19: rg->fl[i] = ff_rg_fl_mode_19_sse2; break; + case 20: rg->fl[i] = ff_rg_fl_mode_20_sse2; break; + case 21: rg->fl[i] = ff_rg_fl_mode_21_sse2; break; + case 22: rg->fl[i] = ff_rg_fl_mode_22_sse2; break; +#if ARCH_X86_64 + case 2: rg->fl[i] = ff_rg_fl_mode_2_sse2; break; + case 3: rg->fl[i] = ff_rg_fl_mode_3_sse2; break; + case 4: rg->fl[i] = ff_rg_fl_mode_4_sse2; break; + case 5: rg->fl[i] = ff_rg_fl_mode_5_sse2; break; + case 6: rg->fl[i] = ff_rg_fl_mode_6_sse2; break; + case 7: rg->fl[i] = ff_rg_fl_mode_7_sse2; break; + case 8: rg->fl[i] = ff_rg_fl_mode_8_sse2; break; + case 9: rg->fl[i] = 
ff_rg_fl_mode_9_sse2; break; + case 15: /* fall through */ + case 16: rg->fl[i] = ff_rg_fl_mode_15_16_sse2; break; + case 17: rg->fl[i] = ff_rg_fl_mode_17_sse2; break; + case 18: rg->fl[i] = ff_rg_fl_mode_18_sse2; break; + case 23: rg->fl[i] = ff_rg_fl_mode_23_sse2; break; + case 24: rg->fl[i] = ff_rg_fl_mode_24_sse2; break; +#endif /* ARCH_x86_64 */ + } + } +#endif /* CONFIG_GPL */ +} diff --git a/libavfilter/x86/vf_spp.c b/libavfilter/x86/vf_spp.c new file mode 100644 index 0000000000..1cfb9e81f7 --- /dev/null +++ b/libavfilter/x86/vf_spp.c @@ -0,0 +1,235 @@ +/* + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/mem.h" +#include "libavutil/x86/asm.h" +#include "libavfilter/vf_spp.h" + +#if HAVE_MMX_INLINE +static void hardthresh_mmx(int16_t dst[64], const int16_t src[64], + int qp, const uint8_t *permutation) +{ + int bias = 0; //FIXME + unsigned int threshold1; + + threshold1 = qp * ((1<<4) - bias) - 1; + +#define REQUANT_CORE(dst0, dst1, dst2, dst3, src0, src1, src2, src3) \ + "movq " #src0 ", %%mm0 \n" \ + "movq " #src1 ", %%mm1 \n" \ + "movq " #src2 ", %%mm2 \n" \ + "movq " #src3 ", %%mm3 \n" \ + "psubw %%mm4, %%mm0 \n" \ + "psubw %%mm4, %%mm1 \n" \ + "psubw %%mm4, %%mm2 \n" \ + "psubw %%mm4, %%mm3 \n" \ + "paddusw %%mm5, %%mm0 \n" \ + "paddusw %%mm5, %%mm1 \n" \ + "paddusw %%mm5, %%mm2 \n" \ + "paddusw %%mm5, %%mm3 \n" \ + "paddw %%mm6, %%mm0 \n" \ + "paddw %%mm6, %%mm1 \n" \ + "paddw %%mm6, %%mm2 \n" \ + "paddw %%mm6, %%mm3 \n" \ + "psubusw %%mm6, %%mm0 \n" \ + "psubusw %%mm6, %%mm1 \n" \ + "psubusw %%mm6, %%mm2 \n" \ + "psubusw %%mm6, %%mm3 \n" \ + "psraw $3, %%mm0 \n" \ + "psraw $3, %%mm1 \n" \ + "psraw $3, %%mm2 \n" \ + "psraw $3, %%mm3 \n" \ + \ + "movq %%mm0, %%mm7 \n" \ + "punpcklwd %%mm2, %%mm0 \n" /*A*/ \ + "punpckhwd %%mm2, %%mm7 \n" /*C*/ \ + "movq %%mm1, %%mm2 \n" \ + "punpcklwd %%mm3, %%mm1 \n" /*B*/ \ + "punpckhwd %%mm3, %%mm2 \n" /*D*/ \ + "movq %%mm0, %%mm3 \n" \ + "punpcklwd %%mm1, %%mm0 \n" /*A*/ \ + "punpckhwd %%mm7, %%mm3 \n" /*C*/ \ + "punpcklwd %%mm2, %%mm7 \n" /*B*/ \ + "punpckhwd %%mm2, %%mm1 \n" /*D*/ \ + \ + "movq %%mm0, " #dst0 " \n" \ + "movq %%mm7, " #dst1 " \n" \ + "movq %%mm3, " #dst2 " \n" \ + "movq %%mm1, " #dst3 " \n" + + __asm__ volatile( + "movd %2, %%mm4 \n" + "movd %3, %%mm5 \n" + "movd %4, %%mm6 \n" + "packssdw %%mm4, %%mm4 \n" + "packssdw %%mm5, %%mm5 \n" + "packssdw %%mm6, %%mm6 \n" + "packssdw %%mm4, %%mm4 \n" + "packssdw %%mm5, %%mm5 \n" + "packssdw %%mm6, %%mm6 \n" + REQUANT_CORE( (%1), 8(%1), 16(%1), 24(%1), (%0), 8(%0), 64(%0), 
72(%0)) + REQUANT_CORE(32(%1), 40(%1), 48(%1), 56(%1),16(%0),24(%0), 48(%0), 56(%0)) + REQUANT_CORE(64(%1), 72(%1), 80(%1), 88(%1),32(%0),40(%0), 96(%0),104(%0)) + REQUANT_CORE(96(%1),104(%1),112(%1),120(%1),80(%0),88(%0),112(%0),120(%0)) + : : "r" (src), "r" (dst), "g" (threshold1+1), "g" (threshold1+5), "g" (threshold1-4) //FIXME maybe more accurate then needed? + ); + dst[0] = (src[0] + 4) >> 3; +} + +static void softthresh_mmx(int16_t dst[64], const int16_t src[64], + int qp, const uint8_t *permutation) +{ + int bias = 0; //FIXME + unsigned int threshold1; + + threshold1 = qp*((1<<4) - bias) - 1; + +#undef REQUANT_CORE +#define REQUANT_CORE(dst0, dst1, dst2, dst3, src0, src1, src2, src3) \ + "movq " #src0 ", %%mm0 \n" \ + "movq " #src1 ", %%mm1 \n" \ + "pxor %%mm6, %%mm6 \n" \ + "pxor %%mm7, %%mm7 \n" \ + "pcmpgtw %%mm0, %%mm6 \n" \ + "pcmpgtw %%mm1, %%mm7 \n" \ + "pxor %%mm6, %%mm0 \n" \ + "pxor %%mm7, %%mm1 \n" \ + "psubusw %%mm4, %%mm0 \n" \ + "psubusw %%mm4, %%mm1 \n" \ + "pxor %%mm6, %%mm0 \n" \ + "pxor %%mm7, %%mm1 \n" \ + "movq " #src2 ", %%mm2 \n" \ + "movq " #src3 ", %%mm3 \n" \ + "pxor %%mm6, %%mm6 \n" \ + "pxor %%mm7, %%mm7 \n" \ + "pcmpgtw %%mm2, %%mm6 \n" \ + "pcmpgtw %%mm3, %%mm7 \n" \ + "pxor %%mm6, %%mm2 \n" \ + "pxor %%mm7, %%mm3 \n" \ + "psubusw %%mm4, %%mm2 \n" \ + "psubusw %%mm4, %%mm3 \n" \ + "pxor %%mm6, %%mm2 \n" \ + "pxor %%mm7, %%mm3 \n" \ + \ + "paddsw %%mm5, %%mm0 \n" \ + "paddsw %%mm5, %%mm1 \n" \ + "paddsw %%mm5, %%mm2 \n" \ + "paddsw %%mm5, %%mm3 \n" \ + "psraw $3, %%mm0 \n" \ + "psraw $3, %%mm1 \n" \ + "psraw $3, %%mm2 \n" \ + "psraw $3, %%mm3 \n" \ + \ + "movq %%mm0, %%mm7 \n" \ + "punpcklwd %%mm2, %%mm0 \n" /*A*/ \ + "punpckhwd %%mm2, %%mm7 \n" /*C*/ \ + "movq %%mm1, %%mm2 \n" \ + "punpcklwd %%mm3, %%mm1 \n" /*B*/ \ + "punpckhwd %%mm3, %%mm2 \n" /*D*/ \ + "movq %%mm0, %%mm3 \n" \ + "punpcklwd %%mm1, %%mm0 \n" /*A*/ \ + "punpckhwd %%mm7, %%mm3 \n" /*C*/ \ + "punpcklwd %%mm2, %%mm7 \n" /*B*/ \ + "punpckhwd %%mm2, %%mm1 \n" /*D*/ \ 
+ \ + "movq %%mm0, " #dst0 " \n" \ + "movq %%mm7, " #dst1 " \n" \ + "movq %%mm3, " #dst2 " \n" \ + "movq %%mm1, " #dst3 " \n" + + __asm__ volatile( + "movd %2, %%mm4 \n" + "movd %3, %%mm5 \n" + "packssdw %%mm4, %%mm4 \n" + "packssdw %%mm5, %%mm5 \n" + "packssdw %%mm4, %%mm4 \n" + "packssdw %%mm5, %%mm5 \n" + REQUANT_CORE( (%1), 8(%1), 16(%1), 24(%1), (%0), 8(%0), 64(%0), 72(%0)) + REQUANT_CORE(32(%1), 40(%1), 48(%1), 56(%1),16(%0),24(%0), 48(%0), 56(%0)) + REQUANT_CORE(64(%1), 72(%1), 80(%1), 88(%1),32(%0),40(%0), 96(%0),104(%0)) + REQUANT_CORE(96(%1),104(%1),112(%1),120(%1),80(%0),88(%0),112(%0),120(%0)) + : : "r" (src), "r" (dst), "g" (threshold1), "rm" (4) //FIXME maybe more accurate then needed? + ); + + dst[0] = (src[0] + 4) >> 3; +} + +static void store_slice_mmx(uint8_t *dst, const int16_t *src, + int dst_stride, int src_stride, + int width, int height, int log2_scale, + const uint8_t dither[8][8]) +{ + int y; + + for (y = 0; y < height; y++) { + uint8_t *dst1 = dst; + const int16_t *src1 = src; + __asm__ volatile( + "movq (%3), %%mm3 \n" + "movq (%3), %%mm4 \n" + "movd %4, %%mm2 \n" + "pxor %%mm0, %%mm0 \n" + "punpcklbw %%mm0, %%mm3 \n" + "punpckhbw %%mm0, %%mm4 \n" + "psraw %%mm2, %%mm3 \n" + "psraw %%mm2, %%mm4 \n" + "movd %5, %%mm2 \n" + "1: \n" + "movq (%0), %%mm0 \n" + "movq 8(%0), %%mm1 \n" + "paddw %%mm3, %%mm0 \n" + "paddw %%mm4, %%mm1 \n" + "psraw %%mm2, %%mm0 \n" + "psraw %%mm2, %%mm1 \n" + "packuswb %%mm1, %%mm0 \n" + "movq %%mm0, (%1) \n" + "add $16, %0 \n" + "add $8, %1 \n" + "cmp %2, %1 \n" + " jb 1b \n" + : "+r" (src1), "+r"(dst1) + : "r"(dst + width), "r"(dither[y]), "g"(log2_scale), "g"(MAX_LEVEL - log2_scale) + ); + src += src_stride; + dst += dst_stride; + } +} + +#endif /* HAVE_MMX_INLINE */ + +av_cold void ff_spp_init_x86(SPPContext *s) +{ +#if HAVE_MMX_INLINE + int cpu_flags = av_get_cpu_flags(); + + if (cpu_flags & AV_CPU_FLAG_MMX) { + s->store_slice = store_slice_mmx; + if (av_get_int(s->dct, "bits_per_sample", NULL) <= 8) { + switch 
(s->mode) { + case 0: s->requantize = hardthresh_mmx; break; + case 1: s->requantize = softthresh_mmx; break; + } + } + } +#endif +} diff --git a/libavfilter/x86/vf_ssim.asm b/libavfilter/x86/vf_ssim.asm new file mode 100644 index 0000000000..3293e66701 --- /dev/null +++ b/libavfilter/x86/vf_ssim.asm @@ -0,0 +1,247 @@ +;***************************************************************************** +;* x86-optimized functions for ssim filter +;* +;* Copyright (C) 2015 Ronald S. Bultje <rsbultje@gmail.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +pw_1: times 8 dw 1 +ssim_c1: times 4 dd 416 ;(.01*.01*255*255*64 + .5) +ssim_c2: times 4 dd 235963 ;(.03*.03*255*255*64*63 + .5) + +SECTION .text + +%macro SSIM_4X4_LINE 1 +%if ARCH_X86_64 +cglobal ssim_4x4_line, 6, 8, %1, buf, buf_stride, ref, ref_stride, sums, w, buf_stride3, ref_stride3 +%else +cglobal ssim_4x4_line, 5, 7, %1, buf, buf_stride, ref, ref_stride, sums, buf_stride3, ref_stride3 +%define wd r5mp +%endif + lea ref_stride3q, [ref_strideq*3] + lea buf_stride3q, [buf_strideq*3] +%if notcpuflag(xop) + pxor m7, m7 + mova m15, [pw_1] +%endif + +.loop: +%if cpuflag(xop) + pmovzxbw m0, [bufq+buf_strideq*0] + pmovzxbw m1, [refq+ref_strideq*0] + pmaddwd m4, m0, m0 + pmaddwd m6, m0, m1 + pmovzxbw m2, [bufq+buf_strideq*1] + vpmadcswd m4, m1, m1, m4 + pmovzxbw m3, [refq+ref_strideq*1] + paddw m0, m2 + vpmadcswd m4, m2, m2, m4 + vpmadcswd m6, m2, m3, m6 + paddw m1, m3 + vpmadcswd m4, m3, m3, m4 + + pmovzxbw m2, [bufq+buf_strideq*2] + pmovzxbw m3, [refq+ref_strideq*2] + vpmadcswd m4, m2, m2, m4 + vpmadcswd m6, m2, m3, m6 + pmovzxbw m5, [bufq+buf_stride3q] + pmovzxbw m7, [refq+ref_stride3q] + vpmadcswd m4, m3, m3, m4 + vpmadcswd m6, m5, m7, m6 + paddw m0, m2 + paddw m1, m3 + vpmadcswd m4, m5, m5, m4 + paddw m0, m5 + paddw m1, m7 + vpmadcswd m4, m7, m7, m4 +%else + movh m0, [bufq+buf_strideq*0] ; a1 + movh m1, [refq+ref_strideq*0] ; b1 + movh m2, [bufq+buf_strideq*1] ; a2 + movh m3, [refq+ref_strideq*1] ; b2 + punpcklbw m0, m7 ; s1 [word] + punpcklbw m1, m7 ; s2 [word] + punpcklbw m2, m7 ; s1 [word] + punpcklbw m3, m7 ; s2 [word] + pmaddwd m4, m0, m0 ; a1 * a1 + pmaddwd m5, m1, m1 ; b1 * b1 + pmaddwd m8, m2, m2 ; 
a2 * a2 + pmaddwd m9, m3, m3 ; b2 * b2 + paddd m4, m5 ; ss + paddd m8, m9 ; ss + pmaddwd m6, m0, m1 ; a1 * b1 = ss12 + pmaddwd m5, m2, m3 ; a2 * b2 = ss12 + paddw m0, m2 + paddw m1, m3 + paddd m6, m5 ; s12 + paddd m4, m8 ; ss + + movh m2, [bufq+buf_strideq*2] ; a3 + movh m3, [refq+ref_strideq*2] ; b3 + movh m5, [bufq+buf_stride3q] ; a4 + movh m8, [refq+ref_stride3q] ; b4 + punpcklbw m2, m7 ; s1 [word] + punpcklbw m3, m7 ; s2 [word] + punpcklbw m5, m7 ; s1 [word] + punpcklbw m8, m7 ; s2 [word] + pmaddwd m9, m2, m2 ; a3 * a3 + pmaddwd m10, m3, m3 ; b3 * b3 + pmaddwd m12, m5, m5 ; a4 * a4 + pmaddwd m13, m8, m8 ; b4 * b4 + pmaddwd m11, m2, m3 ; a3 * b3 = ss12 + pmaddwd m14, m5, m8 ; a4 * b4 = ss12 + paddd m9, m10 + paddd m12, m13 + paddw m0, m2 + paddw m1, m3 + paddw m0, m5 + paddw m1, m8 + paddd m6, m11 + paddd m4, m9 + paddd m6, m14 + paddd m4, m12 +%endif + + ; m0 = [word] s1 a,a,a,a,b,b,b,b + ; m1 = [word] s2 a,a,a,a,b,b,b,b + ; m4 = [dword] ss a,a,b,b + ; m6 = [dword] s12 a,a,b,b + +%if cpuflag(xop) + vphaddwq m0, m0 ; [dword] s1 a, 0, b, 0 + vphaddwq m1, m1 ; [dword] s2 a, 0, b, 0 + vphadddq m4, m4 ; [dword] ss a, 0, b, 0 + vphadddq m6, m6 ; [dword] s12 a, 0, b, 0 + punpckhdq m2, m0, m1 ; [dword] s1 b, s2 b, 0, 0 + punpckldq m0, m1 ; [dword] s1 a, s2 a, 0, 0 + punpckhdq m3, m4, m6 ; [dword] ss b, s12 b, 0, 0 + punpckldq m4, m6 ; [dword] ss a, s12 a, 0, 0 + punpcklqdq m1, m2, m3 ; [dword] b s1, s2, ss, s12 + punpcklqdq m0, m4 ; [dword] a s1, s2, ss, s12 +%else + pmaddwd m0, m15 ; [dword] s1 a,a,b,b + pmaddwd m1, m15 ; [dword] s2 a,a,b,b + phaddd m0, m4 ; [dword] s1 a, b, ss a, b + phaddd m1, m6 ; [dword] s2 a, b, s12 a, b + punpckhdq m2, m0, m1 ; [dword] ss a, s12 a, ss b, s12 b + punpckldq m0, m1 ; [dword] s1 a, s2 a, s1 b, s2 b + punpckhqdq m1, m0, m2 ; [dword] b s1, s2, ss, s12 + punpcklqdq m0, m2 ; [dword] a s1, s2, ss, s12 +%endif + + mova [sumsq+ 0], m0 + mova [sumsq+mmsize], m1 + + add bufq, mmsize/2 + add refq, mmsize/2 + add sumsq, mmsize*2 + sub wd, 
mmsize/8 + jg .loop + RET +%endmacro + +%if ARCH_X86_64 +INIT_XMM ssse3 +SSIM_4X4_LINE 16 +%endif +%if HAVE_XOP_EXTERNAL +INIT_XMM xop +SSIM_4X4_LINE 8 +%endif + +INIT_XMM sse4 +cglobal ssim_end_line, 3, 3, 6, sum0, sum1, w + pxor m0, m0 +.loop: + mova m1, [sum0q+mmsize*0] + mova m2, [sum0q+mmsize*1] + mova m3, [sum0q+mmsize*2] + mova m4, [sum0q+mmsize*3] + paddd m1, [sum1q+mmsize*0] + paddd m2, [sum1q+mmsize*1] + paddd m3, [sum1q+mmsize*2] + paddd m4, [sum1q+mmsize*3] + paddd m1, m2 + paddd m2, m3 + paddd m3, m4 + paddd m4, [sum0q+mmsize*4] + paddd m4, [sum1q+mmsize*4] + TRANSPOSE4x4D 1, 2, 3, 4, 5 + + ; m1 = fs1, m2 = fs2, m3 = fss, m4 = fs12 + pslld m3, 6 + pslld m4, 6 + pmulld m5, m1, m2 ; fs1 * fs2 + pmulld m1, m1 ; fs1 * fs1 + pmulld m2, m2 ; fs2 * fs2 + psubd m3, m1 + psubd m4, m5 ; covariance + psubd m3, m2 ; variance + + ; m1 = fs1 * fs1, m2 = fs2 * fs2, m3 = variance, m4 = covariance, m5 = fs1 * fs2 + paddd m4, m4 ; 2 * covariance + paddd m5, m5 ; 2 * fs1 * fs2 + paddd m1, m2 ; fs1 * fs1 + fs2 * fs2 + paddd m3, [ssim_c2] ; variance + ssim_c2 + paddd m4, [ssim_c2] ; 2 * covariance + ssim_c2 + paddd m5, [ssim_c1] ; 2 * fs1 * fs2 + ssim_c1 + paddd m1, [ssim_c1] ; fs1 * fs1 + fs2 * fs2 + ssim_c1 + + ; convert to float + cvtdq2ps m3, m3 + cvtdq2ps m4, m4 + cvtdq2ps m5, m5 + cvtdq2ps m1, m1 + mulps m4, m5 + mulps m3, m1 + divps m4, m3 ; ssim_endl + addps m0, m4 ; ssim + add sum0q, mmsize*4 + add sum1q, mmsize*4 + sub wd, 4 + jg .loop + + ; subps the ones we added too much + test wd, wd + jz .end + add wd, 4 + test wd, 2 + jz .skip2 + psrldq m4, 8 +.skip2: + test wd, 1 + jz .skip1 + psrldq m4, 4 +.skip1: + subps m0, m4 + +.end: + movhlps m4, m0 + addps m0, m4 + movss m4, m0 + shufps m0, m0, 1 + addss m0, m4 +%if ARCH_X86_32 + movss r0m, m0 + fld r0mp +%endif + RET diff --git a/libavfilter/x86/vf_ssim_init.c b/libavfilter/x86/vf_ssim_init.c new file mode 100644 index 0000000000..599c928403 --- /dev/null +++ b/libavfilter/x86/vf_ssim_init.c @@ -0,0 +1,43 @@ +/* + 
* Copyright (c) 2015 Ronald S. Bultje <rsbultje@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/x86/cpu.h" + +#include "libavfilter/ssim.h" + +void ff_ssim_4x4_line_ssse3(const uint8_t *buf, ptrdiff_t buf_stride, + const uint8_t *ref, ptrdiff_t ref_stride, + int (*sums)[4], int w); +void ff_ssim_4x4_line_xop (const uint8_t *buf, ptrdiff_t buf_stride, + const uint8_t *ref, ptrdiff_t ref_stride, + int (*sums)[4], int w); +float ff_ssim_end_line_sse4(const int (*sum0)[4], const int (*sum1)[4], int w); + +void ff_ssim_init_x86(SSIMDSPContext *dsp) +{ + int cpu_flags = av_get_cpu_flags(); + + if (ARCH_X86_64 && EXTERNAL_SSSE3(cpu_flags)) + dsp->ssim_4x4_line = ff_ssim_4x4_line_ssse3; + if (EXTERNAL_SSE4(cpu_flags)) + dsp->ssim_end_line = ff_ssim_end_line_sse4; + if (EXTERNAL_XOP(cpu_flags)) + dsp->ssim_4x4_line = ff_ssim_4x4_line_xop; +} diff --git a/libavfilter/x86/vf_tinterlace_init.c b/libavfilter/x86/vf_tinterlace_init.c new file mode 100644 index 0000000000..ddb0cced36 --- /dev/null +++ b/libavfilter/x86/vf_tinterlace_init.c @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2014 Kieran Kunhya <kierank@obe.tv> + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/internal.h" +#include "libavutil/mem.h" +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" + +#include "libavfilter/tinterlace.h" + +void ff_lowpass_line_sse2(uint8_t *dstp, ptrdiff_t linesize, + const uint8_t *srcp, + const uint8_t *srcp_above, + const uint8_t *srcp_below); +void ff_lowpass_line_avx (uint8_t *dstp, ptrdiff_t linesize, + const uint8_t *srcp, + const uint8_t *srcp_above, + const uint8_t *srcp_below); + +av_cold void ff_tinterlace_init_x86(TInterlaceContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_SSE2(cpu_flags)) + s->lowpass_line = ff_lowpass_line_sse2; + if (EXTERNAL_AVX(cpu_flags)) + s->lowpass_line = ff_lowpass_line_avx; +} diff --git a/libavfilter/x86/vf_yadif.asm b/libavfilter/x86/vf_yadif.asm index 3d8b2bc180..a29620ce55 100644 --- a/libavfilter/x86/vf_yadif.asm +++ b/libavfilter/x86/vf_yadif.asm @@ -4,20 +4,20 @@ ;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at> ;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com> ;* -;* This file is part of Libav. +;* This file is part of FFmpeg. 
;* -;* Libav is free software; you can redistribute it and/or +;* FFmpeg is free software; you can redistribute it and/or ;* modify it under the terms of the GNU Lesser General Public ;* License as published by the Free Software Foundation; either ;* version 2.1 of the License, or (at your option) any later version. ;* -;* Libav is distributed in the hope that it will be useful, +;* FFmpeg is distributed in the hope that it will be useful, ;* but WITHOUT ANY WARRANTY; without even the implied warranty of ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;* Lesser General Public License for more details. ;* ;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software +;* License along with FFmpeg; if not, write to the Free Software ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ;****************************************************************************** @@ -39,11 +39,7 @@ SECTION .text pavgb m5, m3 pand m4, [pb_1] psubusb m5, m4 -%if mmsize == 16 - psrldq m5, 1 -%else - psrlq m5, 8 -%endif + RSHIFT m5, 1 punpcklbw m5, m7 mova m4, m2 psubusb m2, m3 @@ -51,13 +47,8 @@ SECTION .text pmaxub m2, m3 mova m3, m2 mova m4, m2 -%if mmsize == 16 - psrldq m3, 1 - psrldq m4, 2 -%else - psrlq m3, 8 - psrlq m4, 16 -%endif + RSHIFT m3, 1 + RSHIFT m4, 2 punpcklbw m2, m7 punpcklbw m3, m7 punpcklbw m4, m7 @@ -90,17 +81,17 @@ SECTION .text %endmacro %macro LOAD 2 - movh m%1, %2 - punpcklbw m%1, m7 + movh %1, %2 + punpcklbw %1, m7 %endmacro %macro FILTER 3 .loop%1: pxor m7, m7 - LOAD 0, [curq+t1] - LOAD 1, [curq+t0] - LOAD 2, [%2] - LOAD 3, [%3] + LOAD m0, [curq+t1] + LOAD m1, [curq+t0] + LOAD m2, [%2] + LOAD m3, [%3] mova m4, m3 paddw m3, m2 psraw m3, 1 @@ -109,8 +100,8 @@ SECTION .text mova [rsp+32], m1 psubw m2, m4 ABS1 m2, m4 - LOAD 3, [prevq+t1] - LOAD 4, [prevq+t0] + LOAD m3, [prevq+t1] + LOAD m4, [prevq+t0] psubw m3, m0 psubw m4, m1 ABS1 m3, m5 @@ -119,8 
+110,8 @@ SECTION .text psrlw m2, 1 psrlw m3, 1 pmaxsw m2, m3 - LOAD 3, [nextq+t1] - LOAD 4, [nextq+t0] + LOAD m3, [nextq+t1] + LOAD m4, [nextq+t0] psubw m3, m0 psubw m4, m1 ABS1 m3, m5 @@ -166,10 +157,10 @@ SECTION .text mova m6, [rsp+48] cmp DWORD r8m, 2 jge .end%1 - LOAD 2, [%2+t1*2] - LOAD 4, [%3+t1*2] - LOAD 3, [%2+t0*2] - LOAD 5, [%3+t0*2] + LOAD m2, [%2+t1*2] + LOAD m4, [%3+t1*2] + LOAD m3, [%2+t0*2] + LOAD m5, [%3+t0*2] paddw m2, m4 paddw m3, m5 psrlw m2, 1 @@ -220,8 +211,6 @@ cglobal yadif_filter_line, 4, 6, 8, 80, dst, prev, cur, next, w, prefs, \ cglobal yadif_filter_line, 4, 7, 8, 80, dst, prev, cur, next, w, prefs, \ mrefs, parity, mode %endif - cmp DWORD wm, 0 - jle .ret %if ARCH_X86_32 mov r4, r5mp mov r5, r6mp diff --git a/libavfilter/x86/vf_yadif_init.c b/libavfilter/x86/vf_yadif_init.c index 510a02394c..1460a642c3 100644 --- a/libavfilter/x86/vf_yadif_init.c +++ b/libavfilter/x86/vf_yadif_init.c @@ -1,26 +1,25 @@ /* * Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at> * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "libavutil/attributes.h" #include "libavutil/cpu.h" -#include "libavutil/internal.h" #include "libavutil/mem.h" #include "libavutil/x86/asm.h" #include "libavutil/x86/cpu.h" @@ -36,16 +35,63 @@ void ff_yadif_filter_line_ssse3(void *dst, void *prev, void *cur, void *next, int w, int prefs, int mrefs, int parity, int mode); +void ff_yadif_filter_line_16bit_mmxext(void *dst, void *prev, void *cur, + void *next, int w, int prefs, + int mrefs, int parity, int mode); +void ff_yadif_filter_line_16bit_sse2(void *dst, void *prev, void *cur, + void *next, int w, int prefs, + int mrefs, int parity, int mode); +void ff_yadif_filter_line_16bit_ssse3(void *dst, void *prev, void *cur, + void *next, int w, int prefs, + int mrefs, int parity, int mode); +void ff_yadif_filter_line_16bit_sse4(void *dst, void *prev, void *cur, + void *next, int w, int prefs, + int mrefs, int parity, int mode); + +void ff_yadif_filter_line_10bit_mmxext(void *dst, void *prev, void *cur, + void *next, int w, int prefs, + int mrefs, int parity, int mode); +void ff_yadif_filter_line_10bit_sse2(void *dst, void *prev, void *cur, + void *next, int w, int prefs, + int mrefs, int parity, int mode); +void ff_yadif_filter_line_10bit_ssse3(void *dst, void *prev, void *cur, + void *next, int w, int prefs, + int mrefs, int parity, int mode); + av_cold void ff_yadif_init_x86(YADIFContext *yadif) { int cpu_flags = av_get_cpu_flags(); + int bit_depth = (!yadif->csp) ? 
8 + : yadif->csp->comp[0].depth_minus1 + 1; + if (bit_depth >= 15) { +#if ARCH_X86_32 + if (EXTERNAL_MMXEXT(cpu_flags)) + yadif->filter_line = ff_yadif_filter_line_16bit_mmxext; +#endif /* ARCH_X86_32 */ + if (EXTERNAL_SSE2(cpu_flags)) + yadif->filter_line = ff_yadif_filter_line_16bit_sse2; + if (EXTERNAL_SSSE3(cpu_flags)) + yadif->filter_line = ff_yadif_filter_line_16bit_ssse3; + if (EXTERNAL_SSE4(cpu_flags)) + yadif->filter_line = ff_yadif_filter_line_16bit_sse4; + } else if ( bit_depth >= 9 && bit_depth <= 14) { +#if ARCH_X86_32 + if (EXTERNAL_MMXEXT(cpu_flags)) + yadif->filter_line = ff_yadif_filter_line_10bit_mmxext; +#endif /* ARCH_X86_32 */ + if (EXTERNAL_SSE2(cpu_flags)) + yadif->filter_line = ff_yadif_filter_line_10bit_sse2; + if (EXTERNAL_SSSE3(cpu_flags)) + yadif->filter_line = ff_yadif_filter_line_10bit_ssse3; + } else { #if ARCH_X86_32 - if (EXTERNAL_MMXEXT(cpu_flags)) - yadif->filter_line = ff_yadif_filter_line_mmxext; + if (EXTERNAL_MMXEXT(cpu_flags)) + yadif->filter_line = ff_yadif_filter_line_mmxext; #endif /* ARCH_X86_32 */ - if (EXTERNAL_SSE2(cpu_flags)) - yadif->filter_line = ff_yadif_filter_line_sse2; - if (EXTERNAL_SSSE3(cpu_flags)) - yadif->filter_line = ff_yadif_filter_line_ssse3; + if (EXTERNAL_SSE2(cpu_flags)) + yadif->filter_line = ff_yadif_filter_line_sse2; + if (EXTERNAL_SSSE3(cpu_flags)) + yadif->filter_line = ff_yadif_filter_line_ssse3; + } } diff --git a/libavfilter/x86/yadif-10.asm b/libavfilter/x86/yadif-10.asm new file mode 100644 index 0000000000..8853e0d2c7 --- /dev/null +++ b/libavfilter/x86/yadif-10.asm @@ -0,0 +1,255 @@ +;***************************************************************************** +;* x86-optimized functions for yadif filter +;* +;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at> +;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com> +;* Copyright (c) 2011-2013 James Darnley <james.darnley@gmail.com> +;* +;* This file is part of FFmpeg. 
+;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +pw_1: times 8 dw 1 + +SECTION .text + +%macro PMAXUW 2 +%if cpuflag(sse4) + pmaxuw %1, %2 +%else + psubusw %1, %2 + paddusw %1, %2 +%endif +%endmacro + +%macro CHECK 2 + movu m2, [curq+t1+%1*2] + movu m3, [curq+t0+%2*2] + mova m4, m2 + mova m5, m2 + pxor m4, m3 + pavgw m5, m3 + pand m4, [pw_1] + psubusw m5, m4 + RSHIFT m5, 2 + mova m4, m2 + psubusw m2, m3 + psubusw m3, m4 + PMAXUW m2, m3 + mova m3, m2 + mova m4, m2 + RSHIFT m3, 2 + RSHIFT m4, 4 + paddw m2, m3 + paddw m2, m4 +%endmacro + +%macro CHECK1 0 + mova m3, m0 + pcmpgtw m3, m2 + pminsw m0, m2 + mova m6, m3 + pand m5, m3 + pandn m3, m1 + por m3, m5 + mova m1, m3 +%endmacro + +; %macro CHECK2 0 +; paddw m6, [pw_1] +; psllw m6, 14 +; paddsw m2, m6 +; mova m3, m0 +; pcmpgtw m3, m2 +; pminsw m0, m2 +; pand m5, m3 +; pandn m3, m1 +; por m3, m5 +; mova m1, m3 +; %endmacro + +; This version of CHECK2 is required for 14-bit samples. The left-shift trick +; in the old code is not large enough to correctly select pixels or scores. 
+ +%macro CHECK2 0 + mova m3, m0 + pcmpgtw m0, m2 + pand m0, m6 + mova m6, m0 + pand m5, m6 + pand m2, m0 + pandn m6, m1 + pandn m0, m3 + por m6, m5 + por m0, m2 + mova m1, m6 +%endmacro + +%macro LOAD 2 + movu %1, %2 +%endmacro + +%macro FILTER 3 +.loop%1: + pxor m7, m7 + LOAD m0, [curq+t1] + LOAD m1, [curq+t0] + LOAD m2, [%2] + LOAD m3, [%3] + mova m4, m3 + paddw m3, m2 + psraw m3, 1 + mova [rsp+ 0], m0 + mova [rsp+16], m3 + mova [rsp+32], m1 + psubw m2, m4 + ABS1 m2, m4 + LOAD m3, [prevq+t1] + LOAD m4, [prevq+t0] + psubw m3, m0 + psubw m4, m1 + ABS2 m3, m4, m5, m6 + paddw m3, m4 + psrlw m2, 1 + psrlw m3, 1 + pmaxsw m2, m3 + LOAD m3, [nextq+t1] + LOAD m4, [nextq+t0] + psubw m3, m0 + psubw m4, m1 + ABS2 m3, m4, m5, m6 + paddw m3, m4 + psrlw m3, 1 + pmaxsw m2, m3 + mova [rsp+48], m2 + + paddw m1, m0 + paddw m0, m0 + psubw m0, m1 + psrlw m1, 1 + ABS1 m0, m2 + + movu m2, [curq+t1-1*2] + movu m3, [curq+t0-1*2] + mova m4, m2 + psubusw m2, m3 + psubusw m3, m4 + PMAXUW m2, m3 + mova m3, m2 + RSHIFT m3, 4 + paddw m0, m2 + paddw m0, m3 + psubw m0, [pw_1] + + CHECK -2, 0 + CHECK1 + CHECK -3, 1 + CHECK2 + CHECK 0, -2 + CHECK1 + CHECK 1, -3 + CHECK2 + + mova m6, [rsp+48] + cmp DWORD r8m, 2 + jge .end%1 + LOAD m2, [%2+t1*2] + LOAD m4, [%3+t1*2] + LOAD m3, [%2+t0*2] + LOAD m5, [%3+t0*2] + paddw m2, m4 + paddw m3, m5 + psrlw m2, 1 + psrlw m3, 1 + mova m4, [rsp+ 0] + mova m5, [rsp+16] + mova m7, [rsp+32] + psubw m2, m4 + psubw m3, m7 + mova m0, m5 + psubw m5, m4 + psubw m0, m7 + mova m4, m2 + pminsw m2, m3 + pmaxsw m3, m4 + pmaxsw m2, m5 + pminsw m3, m5 + pmaxsw m2, m0 + pminsw m3, m0 + pxor m4, m4 + pmaxsw m6, m3 + psubw m4, m2 + pmaxsw m6, m4 + +.end%1: + mova m2, [rsp+16] + mova m3, m2 + psubw m2, m6 + paddw m3, m6 + pmaxsw m1, m2 + pminsw m1, m3 + + movu [dstq], m1 + add dstq, mmsize-4 + add prevq, mmsize-4 + add curq, mmsize-4 + add nextq, mmsize-4 + sub DWORD r4m, mmsize/2-2 + jg .loop%1 +%endmacro + +%macro YADIF 0 +%if ARCH_X86_32 +cglobal yadif_filter_line_10bit, 4, 6, 
8, 80, dst, prev, cur, next, w, \ + prefs, mrefs, parity, mode +%else +cglobal yadif_filter_line_10bit, 4, 7, 8, 80, dst, prev, cur, next, w, \ + prefs, mrefs, parity, mode +%endif +%if ARCH_X86_32 + mov r4, r5mp + mov r5, r6mp + DECLARE_REG_TMP 4,5 +%else + movsxd r5, DWORD r5m + movsxd r6, DWORD r6m + DECLARE_REG_TMP 5,6 +%endif + + cmp DWORD paritym, 0 + je .parity0 + FILTER 1, prevq, curq + jmp .ret + +.parity0: + FILTER 0, curq, nextq + +.ret: + RET +%endmacro + +INIT_XMM ssse3 +YADIF +INIT_XMM sse2 +YADIF +%if ARCH_X86_32 +INIT_MMX mmxext +YADIF +%endif diff --git a/libavfilter/x86/yadif-16.asm b/libavfilter/x86/yadif-16.asm new file mode 100644 index 0000000000..79d127dfaa --- /dev/null +++ b/libavfilter/x86/yadif-16.asm @@ -0,0 +1,317 @@ +;***************************************************************************** +;* x86-optimized functions for yadif filter +;* +;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at> +;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com> +;* Copyright (c) 2011-2013 James Darnley <james.darnley@gmail.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +pw_1: times 8 dw 1 +pw_8000: times 8 dw 0x8000 +pd_1: times 4 dd 1 +pd_8000: times 4 dd 0x8000 + +SECTION .text + +%macro PABS 2 +%if cpuflag(ssse3) + pabsd %1, %1 +%else + pxor %2, %2 + pcmpgtd %2, %1 + pxor %1, %2 + psubd %1, %2 +%endif +%endmacro + +%macro PACK 1 +%if cpuflag(sse4) + packusdw %1, %1 +%else + psubd %1, [pd_8000] + packssdw %1, %1 + paddw %1, [pw_8000] +%endif +%endmacro + +%macro PMINSD 3 +%if cpuflag(sse4) + pminsd %1, %2 +%else + mova %3, %2 + pcmpgtd %3, %1 + pand %1, %3 + pandn %3, %2 + por %1, %3 +%endif +%endmacro + +%macro PMAXSD 3 +%if cpuflag(sse4) + pmaxsd %1, %2 +%else + mova %3, %1 + pcmpgtd %3, %2 + pand %1, %3 + pandn %3, %2 + por %1, %3 +%endif +%endmacro + +%macro PMAXUW 2 +%if cpuflag(sse4) + pmaxuw %1, %2 +%else + psubusw %1, %2 + paddusw %1, %2 +%endif +%endmacro + +%macro CHECK 2 + movu m2, [curq+t1+%1*2] + movu m3, [curq+t0+%2*2] + mova m4, m2 + mova m5, m2 + pxor m4, m3 + pavgw m5, m3 + pand m4, [pw_1] + psubusw m5, m4 + RSHIFT m5, 2 + punpcklwd m5, m7 + mova m4, m2 + psubusw m2, m3 + psubusw m3, m4 + PMAXUW m2, m3 + mova m3, m2 + mova m4, m2 + RSHIFT m3, 2 + RSHIFT m4, 4 + punpcklwd m2, m7 + punpcklwd m3, m7 + punpcklwd m4, m7 + paddd m2, m3 + paddd m2, m4 +%endmacro + +%macro CHECK1 0 + mova m3, m0 + pcmpgtd m3, m2 + PMINSD m0, m2, m6 + mova m6, m3 + pand m5, m3 + pandn m3, m1 + por m3, m5 + mova m1, m3 +%endmacro + +%macro CHECK2 0 + paddd m6, [pd_1] + pslld m6, 30 + paddd m2, m6 + mova m3, m0 + pcmpgtd m3, m2 + PMINSD m0, m2, m4 + pand m5, m3 + pandn m3, m1 + por m3, m5 + mova m1, m3 +%endmacro + +; This version of CHECK2 has 3 fewer instructions on sets older than 
SSE4 but I +; am not sure whether it is any faster. A rewrite or refactor of the filter +; code should make it possible to eliminate the move instruction at the end. It +; exists to satisfy the expectation that the "score" values are in m1. + +; %macro CHECK2 0 +; mova m3, m0 +; pcmpgtd m0, m2 +; pand m0, m6 +; mova m6, m0 +; pand m5, m6 +; pand m2, m0 +; pandn m6, m1 +; pandn m0, m3 +; por m6, m5 +; por m0, m2 +; mova m1, m6 +; %endmacro + +%macro LOAD 2 + movh %1, %2 + punpcklwd %1, m7 +%endmacro + +%macro FILTER 3 +.loop%1: + pxor m7, m7 + LOAD m0, [curq+t1] + LOAD m1, [curq+t0] + LOAD m2, [%2] + LOAD m3, [%3] + mova m4, m3 + paddd m3, m2 + psrad m3, 1 + mova [rsp+ 0], m0 + mova [rsp+16], m3 + mova [rsp+32], m1 + psubd m2, m4 + PABS m2, m4 + LOAD m3, [prevq+t1] + LOAD m4, [prevq+t0] + psubd m3, m0 + psubd m4, m1 + PABS m3, m5 + PABS m4, m5 + paddd m3, m4 + psrld m2, 1 + psrld m3, 1 + PMAXSD m2, m3, m6 + LOAD m3, [nextq+t1] + LOAD m4, [nextq+t0] + psubd m3, m0 + psubd m4, m1 + PABS m3, m5 + PABS m4, m5 + paddd m3, m4 + psrld m3, 1 + PMAXSD m2, m3, m6 + mova [rsp+48], m2 + + paddd m1, m0 + paddd m0, m0 + psubd m0, m1 + psrld m1, 1 + PABS m0, m2 + + movu m2, [curq+t1-1*2] + movu m3, [curq+t0-1*2] + mova m4, m2 + psubusw m2, m3 + psubusw m3, m4 + PMAXUW m2, m3 + mova m3, m2 + RSHIFT m3, 4 + punpcklwd m2, m7 + punpcklwd m3, m7 + paddd m0, m2 + paddd m0, m3 + psubd m0, [pd_1] + + CHECK -2, 0 + CHECK1 + CHECK -3, 1 + CHECK2 + CHECK 0, -2 + CHECK1 + CHECK 1, -3 + CHECK2 + + mova m6, [rsp+48] + cmp DWORD r8m, 2 + jge .end%1 + LOAD m2, [%2+t1*2] + LOAD m4, [%3+t1*2] + LOAD m3, [%2+t0*2] + LOAD m5, [%3+t0*2] + paddd m2, m4 + paddd m3, m5 + psrld m2, 1 + psrld m3, 1 + mova m4, [rsp+ 0] + mova m5, [rsp+16] + mova m7, [rsp+32] + psubd m2, m4 + psubd m3, m7 + mova m0, m5 + psubd m5, m4 + psubd m0, m7 + mova m4, m2 + PMINSD m2, m3, m7 + PMAXSD m3, m4, m7 + PMAXSD m2, m5, m7 + PMINSD m3, m5, m7 + PMAXSD m2, m0, m7 + PMINSD m3, m0, m7 + pxor m4, m4 + PMAXSD m6, m3, m7 + psubd m4, 
m2 + PMAXSD m6, m4, m7 + +.end%1: + mova m2, [rsp+16] + mova m3, m2 + psubd m2, m6 + paddd m3, m6 + PMAXSD m1, m2, m7 + PMINSD m1, m3, m7 + PACK m1 + + movh [dstq], m1 + add dstq, mmsize/2 + add prevq, mmsize/2 + add curq, mmsize/2 + add nextq, mmsize/2 + sub DWORD r4m, mmsize/4 + jg .loop%1 +%endmacro + +%macro YADIF 0 +%if ARCH_X86_32 +cglobal yadif_filter_line_16bit, 4, 6, 8, 80, dst, prev, cur, next, w, \ + prefs, mrefs, parity, mode +%else +cglobal yadif_filter_line_16bit, 4, 7, 8, 80, dst, prev, cur, next, w, \ + prefs, mrefs, parity, mode +%endif +%if ARCH_X86_32 + mov r4, r5mp + mov r5, r6mp + DECLARE_REG_TMP 4,5 +%else + movsxd r5, DWORD r5m + movsxd r6, DWORD r6m + DECLARE_REG_TMP 5,6 +%endif + + cmp DWORD paritym, 0 + je .parity0 + FILTER 1, prevq, curq + jmp .ret + +.parity0: + FILTER 0, curq, nextq + +.ret: + RET +%endmacro + +INIT_XMM sse4 +YADIF +INIT_XMM ssse3 +YADIF +INIT_XMM sse2 +YADIF +%if ARCH_X86_32 +INIT_MMX mmxext +YADIF +%endif |