summaryrefslogtreecommitdiff
path: root/libavfilter/x86
diff options
context:
space:
mode:
Diffstat (limited to 'libavfilter/x86')
-rw-r--r--libavfilter/x86/Makefile15
-rw-r--r--libavfilter/x86/af_volume.asm10
-rw-r--r--libavfilter/x86/af_volume_init.c8
-rw-r--r--libavfilter/x86/vf_eq.c96
-rw-r--r--libavfilter/x86/vf_fspp.asm727
-rw-r--r--libavfilter/x86/vf_fspp_init.c49
-rw-r--r--libavfilter/x86/vf_gradfun.asm8
-rw-r--r--libavfilter/x86/vf_gradfun_init.c61
-rw-r--r--libavfilter/x86/vf_hqdn3d.asm8
-rw-r--r--libavfilter/x86/vf_hqdn3d_init.c10
-rw-r--r--libavfilter/x86/vf_idet.asm170
-rw-r--r--libavfilter/x86/vf_idet_init.c87
-rw-r--r--libavfilter/x86/vf_interlace.asm8
-rw-r--r--libavfilter/x86/vf_interlace_init.c8
-rw-r--r--libavfilter/x86/vf_noise.c144
-rw-r--r--libavfilter/x86/vf_pp7.asm57
-rw-r--r--libavfilter/x86/vf_pp7_init.c34
-rw-r--r--libavfilter/x86/vf_pullup.asm178
-rw-r--r--libavfilter/x86/vf_pullup_init.c41
-rw-r--r--libavfilter/x86/vf_spp.c235
-rw-r--r--libavfilter/x86/vf_tinterlace_init.c47
-rw-r--r--libavfilter/x86/vf_yadif.asm53
-rw-r--r--libavfilter/x86/vf_yadif_init.c68
-rw-r--r--libavfilter/x86/yadif-10.asm255
-rw-r--r--libavfilter/x86/yadif-16.asm317
25 files changed, 2593 insertions, 101 deletions
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 13b5d318ec..49f45b630e 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -1,11 +1,24 @@
+OBJS-$(CONFIG_EQ_FILTER) += x86/vf_eq.o
+OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp_init.o
OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun_init.o
OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d_init.o
+OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet_init.o
OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace_init.o
+OBJS-$(CONFIG_NOISE_FILTER) += x86/vf_noise.o
+OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7_init.o
+OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup_init.o
+OBJS-$(CONFIG_SPP_FILTER) += x86/vf_spp.o
+OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_tinterlace_init.o
OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o
OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o
+YASM-OBJS-$(CONFIG_FSPP_FILTER) += x86/vf_fspp.o
YASM-OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun.o
YASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o
+YASM-OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet.o
YASM-OBJS-$(CONFIG_INTERLACE_FILTER) += x86/vf_interlace.o
+YASM-OBJS-$(CONFIG_PP7_FILTER) += x86/vf_pp7.o
+YASM-OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup.o
+YASM-OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_interlace.o
YASM-OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume.o
-YASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o
+YASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o x86/yadif-16.o x86/yadif-10.o
diff --git a/libavfilter/x86/af_volume.asm b/libavfilter/x86/af_volume.asm
index 4e5ad2258c..f4cbcbc5de 100644
--- a/libavfilter/x86/af_volume.asm
+++ b/libavfilter/x86/af_volume.asm
@@ -2,20 +2,20 @@
;* x86-optimized functions for volume filter
;* Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -99,9 +99,11 @@ cglobal scale_samples_s32, 4,4,4, dst, src, len, volume
INIT_XMM sse2
%define CVTDQ2PD cvtdq2pd
SCALE_SAMPLES_S32
+%if HAVE_AVX_EXTERNAL
%define CVTDQ2PD vcvtdq2pd
INIT_YMM avx
SCALE_SAMPLES_S32
+%endif
%undef CVTDQ2PD
; NOTE: This is not bit-identical with the C version because it clips to
diff --git a/libavfilter/x86/af_volume_init.c b/libavfilter/x86/af_volume_init.c
index c59e0eda8e..57c7eab65f 100644
--- a/libavfilter/x86/af_volume_init.c
+++ b/libavfilter/x86/af_volume_init.c
@@ -1,18 +1,18 @@
/*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
diff --git a/libavfilter/x86/vf_eq.c b/libavfilter/x86/vf_eq.c
new file mode 100644
index 0000000000..16f399505f
--- /dev/null
+++ b/libavfilter/x86/vf_eq.c
@@ -0,0 +1,96 @@
+/*
+ *
+ * Original MPlayer filters by Richard Felker.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavfilter/vf_eq.h"
+
+#if HAVE_MMX_INLINE && HAVE_6REGS
+/**
+ * Apply contrast and brightness to one 8-bit plane using inline MMX.
+ *
+ * Each row is processed 8 pixels at a time by the asm loop; the remaining
+ * (w & 7) pixels are handled by the scalar loop that follows it.
+ *
+ * @param param      supplies the contrast and brightness factors
+ * @param dst        destination plane
+ * @param dst_stride distance in bytes between destination rows
+ * @param src        source plane
+ * @param src_stride distance in bytes between source rows
+ * @param w          row width in pixels
+ * @param h          number of rows
+ */
+static void process_MMX(EQParameters *param, uint8_t *dst, int dst_stride,
+                        const uint8_t *src, int src_stride, int w, int h)
+{
+    int i;
+    int pel;
+    int dstep = dst_stride - w;
+    int sstep = src_stride - w;
+    short brvec[4];
+    short contvec[4];
+    int contrast = (int) (param->contrast * 256 * 16);
+    int brightness = ((int) (100.0 * param->brightness + 100.0) * 511) / 200 - 128 - contrast / 32;
+
+    brvec[0] = brvec[1] = brvec[2] = brvec[3] = brightness;
+    contvec[0] = contvec[1] = contvec[2] = contvec[3] = contrast;
+
+    while (h--) {
+        /* The asm loop is a do-while: entering it with a zero block count
+         * would wrap the counter and run ~2^32 iterations, so skip it
+         * entirely for rows narrower than 8 pixels. */
+        if (w >> 3) {
+            __asm__ volatile (
+                "movq (%5), %%mm3 \n\t"
+                "movq (%6), %%mm4 \n\t"
+                "pxor %%mm0, %%mm0 \n\t"
+                "movl %4, %%eax \n\t"
+                ".p2align 4 \n\t"
+                "1: \n\t"
+                "movq (%0), %%mm1 \n\t"
+                "movq (%0), %%mm2 \n\t"
+                "punpcklbw %%mm0, %%mm1\n\t"
+                "punpckhbw %%mm0, %%mm2\n\t"
+                "psllw $4, %%mm1 \n\t"
+                "psllw $4, %%mm2 \n\t"
+                "pmulhw %%mm4, %%mm1 \n\t"
+                "pmulhw %%mm4, %%mm2 \n\t"
+                "paddw %%mm3, %%mm1 \n\t"
+                "paddw %%mm3, %%mm2 \n\t"
+                "packuswb %%mm2, %%mm1 \n\t"
+                "add $8, %0 \n\t"
+                "movq %%mm1, (%1) \n\t"
+                "add $8, %1 \n\t"
+                "decl %%eax \n\t"
+                "jnz 1b \n\t"
+                : "=r" (src), "=r" (dst)
+                : "0" (src), "1" (dst), "r" (w>>3), "r" (brvec), "r" (contvec)
+                /* "memory": the asm reads brvec/contvec and writes dst through
+                 * pointers the compiler cannot see as memory operands. */
+                : "%eax", "memory"
+            );
+        }
+
+        /* Scalar tail: same (pixel * contrast) >> 12 + brightness formula. */
+        for (i = w&7; i; i--) {
+            pel = ((*src++ * contrast) >> 12) + brightness;
+            /* Out of 0..255: negative values become 0, large ones 255. */
+            if (pel & ~255)
+                pel = (-pel) >> 31;
+            *dst++ = pel;
+        }
+
+        src += sstep;
+        dst += dstep;
+    }
+    __asm__ volatile ( "emms \n\t" ::: "memory" );
+}
+#endif
+
+/**
+ * Install the inline-MMX process() implementation when it was compiled in
+ * and the running CPU reports MMX support; otherwise leave eq untouched.
+ */
+av_cold void ff_eq_init_x86(EQContext *eq)
+{
+#if HAVE_MMX_INLINE && HAVE_6REGS
+    if (av_get_cpu_flags() & AV_CPU_FLAG_MMX)
+        eq->process = process_MMX;
+#endif
+}
diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
new file mode 100644
index 0000000000..e88feb981a
--- /dev/null
+++ b/libavfilter/x86/vf_fspp.asm
@@ -0,0 +1,727 @@
+;*****************************************************************************
+;* x86-optimized functions for fspp filter
+;*
+;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+; 64-byte dither table laid out as 8 rows of 8 bytes (values 0..63).
+; store_slice and store_slice2 load one 8-byte row per output line and step
+; through the table 8 bytes at a time.
+pb_dither: db 0, 48, 12, 60, 3, 51, 15, 63, 32, 16, 44, 28, 35, 19, 47, 31, \
+ 8, 56, 4, 52, 11, 59, 7, 55, 40, 24, 36, 20, 43, 27, 39, 23, \
+ 2, 50, 14, 62, 1, 49, 13, 61, 34, 18, 46, 30, 33, 17, 45, 29, \
+ 10, 58, 6, 54, 9, 57, 5, 53, 42, 26, 38, 22, 41, 25, 37, 21
+; 16-bit fixed-point multipliers used as pmulhw operands, each replicated
+; four times to fill one MMX register. The trailing comment gives the real
+; value and the number of fractional bits it was scaled by.
+pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14)
+pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13)
+pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13)
+pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14)
+pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14)
+pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13)
+pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13)
+pw_3642: times 4 dw 0x3642 ; FIX64(0.847759065, 14)
+pw_2441: times 4 dw 0x2441 ; FIX64(0.566454497, 14)
+pw_0CBB: times 4 dw 0x0CBB ; FIX64(0.198912367, 14)
+; Word constants added before a right shift by 3 (pw_4) or by 2 (pw_2);
+; presumably rounding biases.
+pw_4: times 4 dw 4
+pw_2: times 4 dw 2
+
+SECTION .text
+
+%define DCTSIZE 8
+
+INIT_MMX mmx
+
+;void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
+; ptrdiff_t dst_stride, ptrdiff_t src_stride,
+; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
+;
+; Converts rows of 16-bit values at src to 8-bit pixels at dst:
+; dst[x] = clip_uint8((src[x] + (dither[y][x & 7] >> log2_scale)) >> (6 - log2_scale))
+; Every source word that is read is then cleared, and so is the word at the
+; same column 8 source rows earlier (src_stride is counted in int16 elements).
+; width is rounded up to a multiple of 8, so up to 7 extra pixels per row may
+; be read and written.
+; NOTE(review): pb_dither only has 8 rows, so height is presumably <= 8 --
+; confirm against the caller.
+%if ARCH_X86_64
+cglobal store_slice, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
+%else
+cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
+; x86-32: the strides stay in their stack slots and are updated in place
+%define dst_strideq r2m
+%define src_strideq r3m
+ mov widthq, r4m
+ mov dither_heightq, r5m
+ mov ditherq, r6m ; log2_scale
+%endif
+ add widthq, 7
+ mov tmpq, src_strideq
+ and widthq, ~7 ; width rounded up to a multiple of 8
+ sub dst_strideq, widthq ; dst bytes to skip after each row
+ movd m5, ditherq ; m5 = log2_scale (shift applied to the dither row)
+ xor ditherq, -1 ; ~log2_scale
+ mov tmp2q, tmpq
+ add ditherq, 7 ; ~log2_scale + 7 = 6 - log2_scale
+ neg tmpq
+ sub tmp2q, widthq
+ movd m2, ditherq ; m2 = 6 - log2_scale (final right shift)
+ add tmp2q, tmp2q ; (src_stride - width) * 2 bytes
+ lea ditherq, [pb_dither]
+ mov src_strideq, tmp2q ; src bytes to skip after each row
+ shl tmpq, 4 ; tmp = -16 * src_stride bytes = 8 rows back
+ lea dither_heightq, [ditherq+dither_heightq*8] ; end pointer: pb_dither + height*8
+ pxor m7, m7 ; m7 = 0, used for unpacking and clearing
+
+.loop_height:
+ movq m3, [ditherq] ; one 8-byte dither row
+ movq m4, m3
+ punpcklbw m3, m7 ; m3 = dither[0..3] as words
+ punpckhbw m4, m7 ; m4 = dither[4..7] as words
+ mov tmp2q, widthq ; pixels left in this row
+ psraw m3, m5
+ psraw m4, m5
+
+.loop_width:
+ movq [srcq+tmpq], m7 ; clear the words 8 rows back
+ movq m0, [srcq]
+ movq m1, [srcq+8]
+ movq [srcq+tmpq+8], m7
+ paddw m0, m3
+ paddw m1, m4
+ movq [srcq], m7 ; clear the words just read
+ psraw m0, m2
+ psraw m1, m2
+ movq [srcq+8], m7
+ packuswb m0, m1 ; saturate the 8 words to unsigned bytes
+ add srcq, 16
+ movq [dstq], m0
+ add dstq, 8
+ sub tmp2q, 8
+ jg .loop_width
+
+ add srcq, src_strideq
+ add ditherq, 8 ; next dither row
+ add dstq, dst_strideq
+ cmp ditherq, dither_heightq
+ ; NOTE(review): signed branch on an address comparison; jb would be the
+ ; conventional choice -- confirm before changing.
+ jl .loop_height
+ RET
+
+;void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
+; ptrdiff_t dst_stride, ptrdiff_t src_stride,
+; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
+;
+; Like store_slice, but each output pixel sums the source word with the word
+; at the same column 16 source rows further on (src_stride is counted in
+; int16 elements):
+; dst[x] = clip_uint8((src[x] + src[x + 16*src_stride]
+; + (dither[y][x & 7] >> log2_scale)) >> (6 - log2_scale))
+; Only the words 16 rows further on are cleared; src[x] itself is left intact.
+; width is rounded up to a multiple of 8.
+; NOTE(review): pb_dither only has 8 rows, so height is presumably <= 8 --
+; confirm against the caller.
+%if ARCH_X86_64
+cglobal store_slice2, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
+%else
+cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
+; x86-32: no arguments are auto-loaded; the strides stay in their stack slots
+%define dst_strideq r2m
+%define src_strideq r3m
+ mov dstq, dstm
+ mov srcq, srcm
+ mov widthq, r4m
+ mov dither_heightq, r5m
+ mov ditherq, r6m ; log2_scale
+%endif
+ add widthq, 7
+ mov tmpq, src_strideq
+ and widthq, ~7 ; width rounded up to a multiple of 8
+ sub dst_strideq, widthq ; dst bytes to skip after each row
+ movd m5, ditherq ; m5 = log2_scale (shift applied to the dither row)
+ xor ditherq, -1 ; ~log2_scale
+ mov tmp2q, tmpq
+ add ditherq, 7 ; ~log2_scale + 7 = 6 - log2_scale
+ sub tmp2q, widthq
+ movd m2, ditherq ; m2 = 6 - log2_scale (final right shift)
+ add tmp2q, tmp2q ; (src_stride - width) * 2 bytes
+ lea ditherq, [pb_dither]
+ mov src_strideq, tmp2q ; src bytes to skip after each row
+ shl tmpq, 5 ; tmp = 32 * src_stride bytes = 16 rows ahead
+ lea dither_heightq, [ditherq+dither_heightq*8] ; end pointer: pb_dither + height*8
+ pxor m7, m7 ; m7 = 0, used for unpacking and clearing
+
+.loop_height:
+ movq m3, [ditherq] ; one 8-byte dither row
+ movq m4, m3
+ punpcklbw m3, m7 ; m3 = dither[0..3] as words
+ punpckhbw m4, m7 ; m4 = dither[4..7] as words
+ mov tmp2q,widthq ; pixels left in this row
+ psraw m3, m5
+ psraw m4, m5
+
+.loop_width:
+ movq m0, [srcq]
+ movq m1, [srcq+8]
+ paddw m0, m3
+ paddw m0, [srcq+tmpq] ; add the words 16 rows ahead
+ paddw m1, m4
+ movq m6, [srcq+tmpq+8]
+ movq [srcq+tmpq], m7 ; ...and clear them
+ psraw m0, m2
+ paddw m1, m6
+ movq [srcq+tmpq+8], m7
+ psraw m1, m2
+ packuswb m0, m1 ; saturate the 8 words to unsigned bytes
+ movq [dstq], m0
+ add srcq, 16
+ add dstq, 8
+ sub tmp2q, 8
+ jg .loop_width
+
+ add srcq, src_strideq
+ add ditherq, 8 ; next dither row
+ add dstq, dst_strideq
+ cmp ditherq, dither_heightq
+ ; NOTE(review): signed branch on an address comparison; jb would be the
+ ; conventional choice -- confirm before changing.
+ jl .loop_height
+ RET
+
+;void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+;
+; Multiplies 64 int16 values (16 qwords, byte offsets 0..15*8) read from
+; thr_adr_noq by the low 16 bits of q and stores the low 16 bits of each
+; product (pmullw) at the same offset in thr_adr. Loads, multiplies and
+; stores are interleaved, so registers m0-m6 are reused in rotation.
+cglobal mul_thrmat, 3, 3, 0, thrn, thr, q
+ movd m7, qd
+ movq m0, [thrnq]
+ punpcklwd m7, m7 ; broadcast the low word of q ...
+ movq m1, [thrnq+8]
+ punpckldq m7, m7 ; ... to all four words of m7
+ pmullw m0, m7
+ movq m2, [thrnq+8*2]
+ pmullw m1, m7
+ movq m3, [thrnq+8*3]
+ pmullw m2, m7
+ movq [thrq], m0
+ movq m4, [thrnq+8*4]
+ pmullw m3, m7
+ movq [thrq+8], m1
+ movq m5, [thrnq+8*5]
+ pmullw m4, m7
+ movq [thrq+8*2], m2
+ movq m6, [thrnq+8*6]
+ pmullw m5, m7
+ movq [thrq+8*3], m3
+ movq m0, [thrnq+8*7]
+ pmullw m6, m7
+ movq [thrq+8*4], m4
+ movq m1, [thrnq+8*7+8]
+ pmullw m0, m7
+ movq [thrq+8*5], m5
+ movq m2, [thrnq+8*7+8*2]
+ pmullw m1, m7
+ movq [thrq+8*6], m6
+ movq m3, [thrnq+8*7+8*3]
+ pmullw m2, m7
+ movq [thrq+8*7], m0
+ movq m4, [thrnq+8*7+8*4]
+ pmullw m3, m7
+ movq [thrq+8*7+8], m1
+ movq m5, [thrnq+8*7+8*5]
+ pmullw m4, m7
+ movq [thrq+8*7+8*2], m2
+ movq m6, [thrnq+8*7+8*6]
+ pmullw m5, m7
+ movq [thrq+8*7+8*3], m3
+ movq m0, [thrnq+14*8]
+ pmullw m6, m7
+ movq [thrq+8*7+8*4], m4
+ movq m1, [thrnq+14*8+8]
+ pmullw m0, m7
+ movq [thrq+8*7+8*5], m5
+ pmullw m1, m7
+ movq [thrq+8*7+8*6], m6
+ movq [thrq+14*8], m0
+ movq [thrq+14*8+8], m1
+ RET
+
+; COLUMN_FDCT label [, thr_offset [, extra_advance]]
+;
+; %1 label to jump to when any of the thresholded values held in m1, m5
+; and m6 at the test point is non-zero
+; %2 byte offset added to every [thrq+...] access (default 0)
+; %3 extra bytes added to the 8-byte advance of srcq and outq on the
+; fall-through path (default 0)
+;
+; Reads eight rows of four words from [srcq+DCTSIZE*k*2], k = 0..7, and the
+; per-row thresholds from [thrq+k*16+%2]. Four qwords of intermediate results
+; are kept at [rsp] .. [rsp+8*3]; tmpd is clobbered by the zero test.
+; On the fall-through path the results are accumulated into the eight rows at
+; [outq+DCTSIZE*k*2] and srcq/outq are advanced. On the jump path nothing has
+; been written to outq and srcq/outq are not advanced; m0, m1, m5, m6 and the
+; rsp scratch stay live for the code at %1 (COLUMN_IDCT in column_fidct).
+; The instruction order is hand-interleaved; do not reorder.
+%macro COLUMN_FDCT 1-3 0, 0
+ movq m1, [srcq+DCTSIZE*0*2]
+ movq m7, [srcq+DCTSIZE*3*2]
+ movq m0, m1
+ paddw m1, [srcq+DCTSIZE*7*2]
+ movq m3, m7
+ paddw m7, [srcq+DCTSIZE*4*2]
+ movq m5, m1
+ movq m6, [srcq+DCTSIZE*1*2]
+ psubw m1, m7
+ movq m2, [srcq+DCTSIZE*2*2]
+ movq m4, m6
+ paddw m6, [srcq+DCTSIZE*6*2]
+ paddw m5, m7
+ paddw m2, [srcq+DCTSIZE*5*2]
+ movq m7, m6
+ paddw m6, m2
+ psubw m7, m2
+ movq m2, m5
+ paddw m5, m6
+ psubw m2, m6
+ paddw m7, m1
+ movq m6, [thrq+4*16+%2]
+ psllw m7, 2
+ ; psubw / paddusw / paddw / psubusw against the same threshold: the sequence
+ ; used throughout this macro to threshold a value (presumably zeroing
+ ; magnitudes not above the threshold -- TODO confirm)
+ psubw m5, [thrq+%2]
+ psubw m2, m6
+ paddusw m5, [thrq+%2]
+ paddusw m2, m6
+ pmulhw m7, [pw_2D41]
+ paddw m5, [thrq+%2]
+ paddw m2, m6
+ psubusw m5, [thrq+%2]
+ psubusw m2, m6
+ paddw m5, [pw_2]
+ movq m6, m2
+ paddw m2, m5
+ psubw m5, m6
+ movq m6, m1
+ paddw m1, m7
+ psubw m1, [thrq+2*16+%2]
+ psubw m6, m7
+ movq m7, [thrq+6*16+%2]
+ psraw m5, 2
+ paddusw m1, [thrq+2*16+%2]
+ psubw m6, m7
+ paddw m1, [thrq+2*16+%2]
+ paddusw m6, m7
+ psubusw m1, [thrq+2*16+%2]
+ paddw m6, m7
+ psubw m3, [srcq+DCTSIZE*4*2]
+ psubusw m6, m7
+ movq m7, m1
+ psraw m2, 2
+ psubw m4, [srcq+DCTSIZE*6*2]
+ psubw m1, m6
+ psubw m0, [srcq+DCTSIZE*7*2]
+ paddw m6, m7
+ psraw m6, 2
+ movq m7, m2
+ pmulhw m1, [pw_5A82]
+ paddw m2, m6
+ movq [rsp], m2 ; scratch qword 0
+ psubw m7, m6
+ movq m2, [srcq+DCTSIZE*2*2]
+ psubw m1, m6
+ psubw m2, [srcq+DCTSIZE*5*2]
+ movq m6, m5
+ movq [rsp+8*3], m7 ; scratch qword 3
+ paddw m3, m2
+ paddw m2, m4
+ paddw m4, m0
+ movq m7, m3
+ psubw m3, m4
+ psllw m3, 2
+ psllw m7, 2
+ pmulhw m3, [pw_187E]
+ psllw m4, 2
+ pmulhw m7, [pw_22A3]
+ psllw m2, 2
+ pmulhw m4, [pw_539F]
+ paddw m5, m1
+ pmulhw m2, [pw_2D41]
+ psubw m6, m1
+ paddw m7, m3
+ movq [rsp+8], m5 ; scratch qword 1
+ paddw m4, m3
+ movq m3, [thrq+3*16+%2]
+ movq m1, m0
+ movq [rsp+8*2], m6 ; scratch qword 2
+ psubw m1, m2
+ paddw m0, m2
+ movq m5, m1
+ movq m2, [thrq+5*16+%2]
+ psubw m1, m7
+ paddw m5, m7
+ psubw m1, m3
+ movq m7, [thrq+16+%2]
+ psubw m5, m2
+ movq m6, m0
+ paddw m0, m4
+ paddusw m1, m3
+ psubw m6, m4
+ movq m4, [thrq+7*16+%2]
+ psubw m0, m7
+ psubw m6, m4
+ paddusw m5, m2
+ paddusw m6, m4
+ paddw m1, m3
+ paddw m5, m2
+ paddw m6, m4
+ psubusw m1, m3
+ psubusw m5, m2
+ psubusw m6, m4
+ ; OR m1, m5 and m6 together and test whether any bit is set
+ movq m4, m1
+ por m4, m5
+ paddusw m0, m7
+ por m4, m6
+ paddw m0, m7
+ packssdw m4, m4
+ psubusw m0, m7
+ movd tmpd, m4
+ or tmpd, tmpd
+ jnz %1
+ ; fall-through: m1, m5 and m6 are all zero, so only m0 and the four rsp
+ ; scratch qwords contribute to the output rows
+ movq m4, [rsp]
+ movq m1, m0
+ pmulhw m0, [pw_3642]
+ movq m2, m1
+ movq m5, [outq+DCTSIZE*0*2]
+ movq m3, m2
+ pmulhw m1, [pw_2441]
+ paddw m5, m4
+ movq m6, [rsp+8]
+ psraw m3, 2
+ pmulhw m2, [pw_0CBB]
+ psubw m4, m3
+ movq m7, [outq+DCTSIZE*1*2]
+ paddw m5, m3
+ movq [outq+DCTSIZE*7*2], m4
+ paddw m7, m6
+ movq m3, [rsp+8*2]
+ psubw m6, m0
+ movq m4, [outq+DCTSIZE*2*2]
+ paddw m7, m0
+ movq [outq], m5
+ paddw m4, m3
+ movq [outq+DCTSIZE*6*2], m6
+ psubw m3, m1
+ movq m5, [outq+DCTSIZE*5*2]
+ paddw m4, m1
+ movq m6, [outq+DCTSIZE*3*2]
+ paddw m5, m3
+ movq m0, [rsp+8*3]
+ add srcq, 8+%3
+ movq [outq+DCTSIZE*1*2], m7
+ paddw m6, m0
+ movq [outq+DCTSIZE*2*2], m4
+ psubw m0, m2
+ movq m7, [outq+DCTSIZE*4*2]
+ paddw m6, m2
+ movq [outq+DCTSIZE*5*2], m5
+ paddw m7, m0
+ movq [outq+DCTSIZE*3*2], m6
+ movq [outq+DCTSIZE*4*2], m7
+ add outq, 8+%3
+%endmacro
+
+; COLUMN_IDCT [extra_advance]
+;
+; %1 extra bytes added to the 8-byte advance of srcq and outq (default 0)
+;
+; Continuation of COLUMN_FDCT for the case where its zero test failed.
+; Inputs: m0, m1, m5, m6 as left by COLUMN_FDCT, plus the four scratch qwords
+; at [rsp] .. [rsp+8*3]. The results are written to the eight rows at
+; [outq+DCTSIZE*k*2], k = 0..7; rows 0-5 are read and added to first, rows 6
+; and 7 are overwritten. srcq and outq are then advanced by 8+%1 bytes.
+; All of m0-m7 are clobbered. The instruction order is hand-interleaved; do
+; not reorder.
+%macro COLUMN_IDCT 0-1 0
+ movq m3, m5
+ psubw m5, m1
+ psllw m5, 1
+ paddw m3, m1
+ movq m2, m0
+ psubw m0, m6
+ movq m1, m5
+ psllw m0, 1
+ pmulhw m1, [pw_AC62]
+ paddw m5, m0
+ pmulhw m5, [pw_3B21]
+ paddw m2, m6
+ pmulhw m0, [pw_22A3]
+ movq m7, m2
+ movq m4, [rsp] ; scratch qword 0
+ psubw m2, m3
+ psllw m2, 1
+ paddw m7, m3
+ pmulhw m2, [pw_2D41]
+ movq m6, m4
+ psraw m7, 2
+ paddw m4, [outq]
+ psubw m6, m7
+ movq m3, [rsp+8] ; scratch qword 1
+ paddw m4, m7
+ movq [outq+DCTSIZE*7*2], m6
+ paddw m1, m5
+ movq [outq], m4
+ psubw m1, m7
+ movq m7, [rsp+8*2] ; scratch qword 2
+ psubw m0, m5
+ movq m6, [rsp+8*3] ; scratch qword 3
+ movq m5, m3
+ paddw m3, [outq+DCTSIZE*1*2]
+ psubw m5, m1
+ psubw m2, m1
+ paddw m3, m1
+ movq [outq+DCTSIZE*6*2], m5
+ movq m4, m7
+ paddw m7, [outq+DCTSIZE*2*2]
+ psubw m4, m2
+ paddw m4, [outq+DCTSIZE*5*2]
+ paddw m7, m2
+ movq [outq+DCTSIZE*1*2], m3
+ paddw m0, m2
+ movq [outq+DCTSIZE*2*2], m7
+ movq m1, m6
+ paddw m6, [outq+DCTSIZE*4*2]
+ psubw m1, m0
+ paddw m1, [outq+DCTSIZE*3*2]
+ paddw m6, m0
+ movq [outq+DCTSIZE*5*2], m4
+ add srcq, 8+%1
+ movq [outq+DCTSIZE*4*2], m6
+ movq [outq+DCTSIZE*3*2], m1
+ add outq, 8+%1
+%endmacro
+
+;void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
+;
+; Runs COLUMN_FDCT (and COLUMN_IDCT whenever the FDCT's zero test fails) on
+; groups of four columns, two groups per loop iteration:
+; - first group: thr offset 0, src/out advance by 8 bytes
+; - second group: thr offset 8, src/out advance by 8+16 = 24 bytes
+; cnt is decreased by 2 per iteration and the loop runs while cnt > 0; the
+; body always executes at least once. 32 bytes of stack are reserved for the
+; four scratch qwords the macros keep at [rsp].
+cglobal column_fidct, 4, 5, 0, 32, thr, src, out, cnt, tmp
+.fdct1:
+ COLUMN_FDCT .idct1
+ jmp .fdct2 ; fast path taken: skip the IDCT for this group
+
+.idct1:
+ COLUMN_IDCT
+
+.fdct2:
+ COLUMN_FDCT .idct2, 8, 16
+ sub cntd, 2
+ jg .fdct1
+ RET
+
+.idct2:
+ COLUMN_IDCT 16
+ sub cntd, 2
+ jg .fdct1
+ RET
+
+;void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
+;
+; Per iteration: reads four rows of eight words from workspace
+; ([srcq+DCTSIZE*k*2] and +8, k = 0..3), transposes them with punpck*,
+; and accumulates four words into each of eight output rows
+; (dst + r*output_stride, r = 0..7) after adding pw_4 and shifting right by 3.
+; Then workspace advances by DCTSIZE*2*4 bytes and output by 8 bytes.
+; output_stride is given in int16 elements and doubled to bytes on entry.
+; cnt is the iteration count; the body always executes at least once.
+; 16 bytes of stack hold two scratch qwords at [rsp] and [rsp+8].
+; The instruction order is hand-interleaved; do not reorder.
+cglobal row_idct, 4, 5, 0, 16, src, dst, stride, cnt, stride3
+ add strideq, strideq ; stride in bytes
+ lea stride3q, [strideq+strideq*2] ; 3 * stride
+.loop:
+ movq m0, [srcq+DCTSIZE*0*2]
+ movq m1, [srcq+DCTSIZE*1*2]
+ movq m4, m0
+ movq m2, [srcq+DCTSIZE*2*2]
+ punpcklwd m0, m1
+ movq m3, [srcq+DCTSIZE*3*2]
+ punpckhwd m4, m1
+ movq m7, m2
+ punpcklwd m2, m3
+ movq m6, m0
+ punpckldq m0, m2
+ punpckhdq m6, m2
+ movq m5, m0
+ punpckhwd m7, m3
+ psubw m0, m6
+ pmulhw m0, [pw_5A82]
+ movq m2, m4
+ punpckldq m4, m7
+ paddw m5, m6
+ punpckhdq m2, m7
+ movq m1, m4
+ psllw m0, 2
+ paddw m4, m2
+ movq m3, [srcq+DCTSIZE*0*2+8]
+ psubw m1, m2
+ movq m2, [srcq+DCTSIZE*1*2+8]
+ psubw m0, m5
+ movq m6, m4
+ paddw m4, m5
+ psubw m6, m5
+ movq m7, m1
+ movq m5, [srcq+DCTSIZE*2*2+8]
+ paddw m1, m0
+ movq [rsp], m4 ; scratch qword 0
+ movq m4, m3
+ movq [rsp+8], m6 ; scratch qword 1
+ punpcklwd m3, m2
+ movq m6, [srcq+DCTSIZE*3*2+8]
+ punpckhwd m4, m2
+ movq m2, m5
+ punpcklwd m5, m6
+ psubw m7, m0
+ punpckhwd m2, m6
+ movq m0, m3
+ punpckldq m3, m5
+ punpckhdq m0, m5
+ movq m5, m4
+ movq m6, m3
+ punpckldq m4, m2
+ psubw m3, m0
+ punpckhdq m5, m2
+ paddw m6, m0
+ movq m2, m4
+ movq m0, m3
+ psubw m4, m5
+ pmulhw m0, [pw_AC62]
+ paddw m3, m4
+ pmulhw m3, [pw_3B21]
+ paddw m2, m5
+ pmulhw m4, [pw_22A3]
+ movq m5, m2
+ psubw m2, m6
+ paddw m5, m6
+ pmulhw m2, [pw_2D41]
+ paddw m0, m3
+ psllw m0, 3
+ psubw m4, m3
+ movq m6, [rsp]
+ movq m3, m1
+ psllw m4, 3
+ psubw m0, m5
+ psllw m2, 3
+ paddw m1, m0
+ psubw m2, m0
+ psubw m3, m0
+ paddw m4, m2
+ movq m0, m7
+ paddw m7, m2
+ psubw m0, m2
+ movq m2, [pw_4] ; bias added before every psraw 3 below
+ psubw m6, m5
+ paddw m5, [rsp]
+ paddw m1, m2
+ paddw m5, m2
+ psraw m1, 3
+ paddw m7, m2
+ psraw m5, 3
+ paddw m5, [dstq] ; output rows 0..2
+ psraw m7, 3
+ paddw m1, [dstq+strideq*1]
+ paddw m0, m2
+ paddw m7, [dstq+strideq*2]
+ paddw m3, m2
+ movq [dstq], m5
+ paddw m6, m2
+ movq [dstq+strideq*1], m1
+ psraw m0, 3
+ movq [dstq+strideq*2], m7
+ add dstq, stride3q ; dstq now points at output row 3
+ movq m5, [rsp+8]
+ psraw m3, 3
+ paddw m0, [dstq+strideq*2] ; output row 5
+ psubw m5, m4
+ paddw m3, [dstq+stride3q*1] ; output row 6
+ psraw m6, 3
+ paddw m4, [rsp+8]
+ paddw m5, m2
+ paddw m6, [dstq+strideq*4] ; output row 7
+ paddw m4, m2
+ movq [dstq+strideq*2], m0
+ psraw m5, 3
+ paddw m5, [dstq] ; output row 3
+ psraw m4, 3
+ paddw m4, [dstq+strideq*1] ; output row 4
+ add srcq, DCTSIZE*2*4
+ movq [dstq+stride3q*1], m3
+ movq [dstq+strideq*4], m6
+ movq [dstq], m5
+ movq [dstq+strideq*1], m4
+ sub dstq, stride3q ; back to output row 0 ...
+ add dstq, 8 ; ... and on to the next four columns
+ dec cntd ; named alias of r3d, as used by row_fdct
+ jnz .loop
+ RET
+
+;void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
+;
+; Per iteration: loads four bytes from each of eight consecutive pixel rows
+; (pixels + r*line_size, r = 0..7), widens them to words, and writes four
+; rows of eight words to data ([srcq+DCTSIZE*k*2] and +8, k = 0..3; "src" is
+; the name given to the data argument here). Then pixels advances by 4 bytes
+; and data by DCTSIZE*4*2 bytes.
+; cnt is the iteration count; the body always executes at least once.
+; 16 bytes of stack hold two scratch qwords at [rsp] and [rsp+8].
+; The instruction order is hand-interleaved; do not reorder.
+cglobal row_fdct, 4, 5, 0, 16, src, pix, stride, cnt, stride3
+ lea stride3q, [strideq+strideq*2] ; 3 * line_size
+.loop:
+ movd m0, [pixq] ; pixel row 0
+ pxor m7, m7
+ movd m1, [pixq+strideq*1] ; pixel row 1
+ punpcklbw m0, m7
+ movd m2, [pixq+strideq*2] ; pixel row 2
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ add pixq,stride3q ; pixq now points at pixel row 3
+ movq m5, m0
+ movd m3, [pixq+strideq*4] ; pixel row 7
+ movq m6, m1
+ movd m4, [pixq+stride3q*1] ; pixel row 6
+ punpcklbw m3, m7
+ psubw m5, m3
+ punpcklbw m4, m7
+ paddw m0, m3
+ psubw m6, m4
+ movd m3, [pixq+strideq*2] ; pixel row 5
+ paddw m1, m4
+ movq [rsp], m5 ; scratch: row0 - row7
+ punpcklbw m3, m7
+ movq [rsp+8], m6 ; scratch: row1 - row6
+ movq m4, m2
+ movd m5, [pixq] ; pixel row 3
+ paddw m2, m3
+ movd m6, [pixq+strideq*1] ; pixel row 4
+ punpcklbw m5, m7
+ psubw m4, m3
+ punpcklbw m6, m7
+ movq m3, m5
+ paddw m5, m6
+ psubw m3, m6
+ movq m6, m0
+ movq m7, m1
+ psubw m0, m5
+ psubw m1, m2
+ paddw m7, m2
+ paddw m1, m0
+ movq m2, m7
+ psllw m1, 2
+ paddw m6, m5
+ pmulhw m1, [pw_2D41]
+ paddw m7, m6
+ psubw m6, m2
+ movq m5, m0
+ movq m2, m7
+ punpcklwd m7, m6
+ paddw m0, m1
+ punpckhwd m2, m6
+ psubw m5, m1
+ movq m6, m0
+ movq m1, [rsp+8]
+ punpcklwd m0, m5
+ punpckhwd m6, m5
+ movq m5, m0
+ punpckldq m0, m7
+ paddw m3, m4
+ punpckhdq m5, m7
+ movq m7, m6
+ movq [srcq+DCTSIZE*0*2], m0
+ punpckldq m6, m2
+ movq [srcq+DCTSIZE*1*2], m5
+ punpckhdq m7, m2
+ movq [srcq+DCTSIZE*2*2], m6
+ paddw m4, m1
+ movq [srcq+DCTSIZE*3*2], m7
+ psllw m3, 2
+ movq m2, [rsp]
+ psllw m4, 2
+ pmulhw m4, [pw_2D41]
+ paddw m1, m2
+ psllw m1, 2
+ movq m0, m3
+ pmulhw m0, [pw_22A3]
+ psubw m3, m1
+ pmulhw m3, [pw_187E]
+ movq m5, m2
+ pmulhw m1, [pw_539F]
+ psubw m2, m4
+ paddw m5, m4
+ movq m6, m2
+ paddw m0, m3
+ movq m7, m5
+ paddw m2, m0
+ psubw m6, m0
+ movq m4, m2
+ paddw m1, m3
+ punpcklwd m2, m6
+ paddw m5, m1
+ punpckhwd m4, m6
+ psubw m7, m1
+ movq m6, m5
+ punpcklwd m5, m7
+ punpckhwd m6, m7
+ movq m7, m2
+ punpckldq m2, m5
+ sub pixq, stride3q ; back to pixel row 0 ...
+ punpckhdq m7, m5
+ movq m5, m4
+ movq [srcq+DCTSIZE*0*2+8], m2
+ punpckldq m4, m6
+ movq [srcq+DCTSIZE*1*2+8], m7
+ punpckhdq m5, m6
+ movq [srcq+DCTSIZE*2*2+8], m4
+ add pixq, 4 ; ... and on to the next four columns
+ movq [srcq+DCTSIZE*3*2+8], m5
+ add srcq, DCTSIZE*4*2
+ dec cntd
+ jnz .loop
+ RET
diff --git a/libavfilter/x86/vf_fspp_init.c b/libavfilter/x86/vf_fspp_init.c
new file mode 100644
index 0000000000..8e00317cb7
--- /dev/null
+++ b/libavfilter/x86/vf_fspp_init.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/vf_fspp.h"
+
+void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
+ ptrdiff_t dst_stride, ptrdiff_t src_stride,
+ ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
+void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
+ ptrdiff_t dst_stride, ptrdiff_t src_stride,
+ ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale);
+void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
+void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
+void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
+void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
+
+/**
+ * Point the FSPP function table at the external MMX implementations when
+ * EXTERNAL_MMX() accepts the detected CPU flags; otherwise leave s untouched.
+ */
+av_cold void ff_fspp_init_x86(FSPPContext *s)
+{
+    const int flags = av_get_cpu_flags();
+
+    if (!EXTERNAL_MMX(flags))
+        return;
+
+    s->store_slice  = ff_store_slice_mmx;
+    s->store_slice2 = ff_store_slice2_mmx;
+    s->mul_thrmat   = ff_mul_thrmat_mmx;
+    s->column_fidct = ff_column_fidct_mmx;
+    s->row_idct     = ff_row_idct_mmx;
+    s->row_fdct     = ff_row_fdct_mmx;
+}
diff --git a/libavfilter/x86/vf_gradfun.asm b/libavfilter/x86/vf_gradfun.asm
index 00fcb166fb..3581f89fe8 100644
--- a/libavfilter/x86/vf_gradfun.asm
+++ b/libavfilter/x86/vf_gradfun.asm
@@ -1,20 +1,20 @@
;******************************************************************************
;* x86-optimized functions for gradfun filter
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavfilter/x86/vf_gradfun_init.c b/libavfilter/x86/vf_gradfun_init.c
index 3f23bf6799..c638a05e87 100644
--- a/libavfilter/x86/vf_gradfun_init.c
+++ b/libavfilter/x86/vf_gradfun_init.c
@@ -1,20 +1,20 @@
/*
* Copyright (C) 2009 Loren Merritt <lorenm@u.washington.edu>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -26,29 +26,29 @@
#include "libavutil/x86/cpu.h"
#include "libavfilter/gradfun.h"
-void ff_gradfun_filter_line_mmxext(intptr_t x, uint8_t *dst, uint8_t *src,
- uint16_t *dc, int thresh,
+void ff_gradfun_filter_line_mmxext(intptr_t x, uint8_t *dst, const uint8_t *src,
+ const uint16_t *dc, int thresh,
const uint16_t *dithers);
-
-void ff_gradfun_filter_line_ssse3(intptr_t x, uint8_t *dst, uint8_t *src,
- uint16_t *dc, int thresh,
+void ff_gradfun_filter_line_ssse3(intptr_t x, uint8_t *dst, const uint8_t *src,
+ const uint16_t *dc, int thresh,
const uint16_t *dithers);
void ff_gradfun_blur_line_movdqa_sse2(intptr_t x, uint16_t *buf,
- uint16_t *buf1, uint16_t *dc,
- uint8_t *src1, uint8_t *src2);
+ const uint16_t *buf1, uint16_t *dc,
+ const uint8_t *src1, const uint8_t *src2);
void ff_gradfun_blur_line_movdqu_sse2(intptr_t x, uint16_t *buf,
- uint16_t *buf1, uint16_t *dc,
- uint8_t *src1, uint8_t *src2);
+ const uint16_t *buf1, uint16_t *dc,
+ const uint8_t *src1, const uint8_t *src2);
#if HAVE_YASM
-static void gradfun_filter_line(uint8_t *dst, uint8_t *src, uint16_t *dc,
- int width, int thresh, const uint16_t *dithers,
- int alignment)
+static void gradfun_filter_line_mmxext(uint8_t *dst, const uint8_t *src,
+ const uint16_t *dc,
+ int width, int thresh,
+ const uint16_t *dithers)
{
intptr_t x;
- if (width & alignment) {
- x = width & ~alignment;
+ if (width & 3) {
+ x = width & ~3;
ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2,
width - x, thresh, dithers);
width = x;
@@ -58,22 +58,25 @@ static void gradfun_filter_line(uint8_t *dst, uint8_t *src, uint16_t *dc,
thresh, dithers);
}
-static void gradfun_filter_line_mmxext(uint8_t *dst, uint8_t *src, uint16_t *dc,
- int width, int thresh,
- const uint16_t *dithers)
-{
- gradfun_filter_line(dst, src, dc, width, thresh, dithers, 3);
-}
-
-static void gradfun_filter_line_ssse3(uint8_t *dst, uint8_t *src, uint16_t *dc,
+static void gradfun_filter_line_ssse3(uint8_t *dst, const uint8_t *src, const uint16_t *dc,
int width, int thresh,
const uint16_t *dithers)
{
- gradfun_filter_line(dst, src, dc, width, thresh, dithers, 7);
+ intptr_t x;
+ if (width & 7) {
+ // could be 10% faster if I somehow eliminated this
+ x = width & ~7;
+ ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2,
+ width - x, thresh, dithers);
+ width = x;
+ }
+ x = -width;
+ ff_gradfun_filter_line_ssse3(x, dst + width, src + width, dc + width / 2,
+ thresh, dithers);
}
-static void gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, uint16_t *buf1,
- uint8_t *src, int src_linesize, int width)
+static void gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, const uint16_t *buf1,
+ const uint8_t *src, int src_linesize, int width)
{
intptr_t x = -2 * width;
if (((intptr_t) src | src_linesize) & 15)
diff --git a/libavfilter/x86/vf_hqdn3d.asm b/libavfilter/x86/vf_hqdn3d.asm
index 02632a1f09..961127e670 100644
--- a/libavfilter/x86/vf_hqdn3d.asm
+++ b/libavfilter/x86/vf_hqdn3d.asm
@@ -1,20 +1,20 @@
;******************************************************************************
;* Copyright (c) 2012 Loren Merritt
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
diff --git a/libavfilter/x86/vf_hqdn3d_init.c b/libavfilter/x86/vf_hqdn3d_init.c
index 06f9e00ec9..b63916b674 100644
--- a/libavfilter/x86/vf_hqdn3d_init.c
+++ b/libavfilter/x86/vf_hqdn3d_init.c
@@ -1,18 +1,20 @@
/*
- * This file is part of Libav.
+ * Copyright (c) 2012 Loren Merritt
*
- * Libav is free software; you can redistribute it and/or modify
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
- * with Libav; if not, write to the Free Software Foundation, Inc.,
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
diff --git a/libavfilter/x86/vf_idet.asm b/libavfilter/x86/vf_idet.asm
new file mode 100644
index 0000000000..007e63deb9
--- /dev/null
+++ b/libavfilter/x86/vf_idet.asm
@@ -0,0 +1,170 @@
+;*****************************************************************************
+;* x86-optimized functions for idet filter
+;*
+;* Copyright (C) 2014 Pascal Massimino (pascal.massimino@gmail.com)
+;* Copyright (c) 2014 Neil Birkbeck (birkbeck@google.com)
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_TEXT
+
+; 8-bit MMX implementation: sums |a[i] + c[i] - 2*b[i]| over the line, 8 bytes per iteration using 16-bit word arithmetic (%1 = cpu suffix).
+%macro IDET_FILTER_LINE 1
+INIT_MMX %1
+cglobal idet_filter_line, 4, 5, 0, a, b, c, width, index
+ xor indexq, indexq ; index = 0 (byte offset into the three lines)
+%define m_zero m2
+%define m_sum m5
+ pxor m_sum, m_sum ; dword accumulators = 0
+ pxor m_zero, m_zero ; zero source for the unpacks below
+
+.loop:
+ movu m0, [aq + indexq*1] ; 8 bytes of a
+ punpckhbw m1, m0, m_zero ; m1 = high 4 bytes of a widened to words
+ punpcklbw m0, m_zero ; m0 = low 4 bytes of a widened to words
+
+ movu m3, [cq + indexq*1] ; 8 bytes of c, widened the same way into m4:m3
+ punpckhbw m4, m3, m_zero
+ punpcklbw m3, m_zero
+
+ paddsw m1, m4 ; m1:m0 = a + c
+ paddsw m0, m3
+
+ movu m3, [bq + indexq*1] ; 8 bytes of b, widened into m4:m3
+ punpckhbw m4, m3, m_zero
+ punpcklbw m3, m_zero
+
+ paddw m4, m4 ; m4:m3 = 2*b
+ paddw m3, m3
+ psubsw m1, m4 ; m1:m0 = a + c - 2*b
+ psubsw m0, m3
+
+ ABS2 m1, m0, m4, m3 ; x86util macro: m1 = |m1|, m0 = |m0| (m4, m3 are scratch)
+
+ paddw m0, m1 ; add the high-half and low-half word results
+ punpckhwd m1, m0, m_zero ; widen the word sums to dwords before accumulating
+ punpcklwd m0, m_zero
+
+ paddd m0, m1
+ paddd m_sum, m0
+
+ add indexq, 0x8 ; advance 8 bytes
+ CMP widthd, indexd
+ jg .loop ; signed compare: continue while width > index (body runs at least once)
+
+ HADDD m_sum, m0 ; x86util macro: horizontal add of the dword accumulators (m0 is scratch)
+ movd eax, m_sum ; return the total in eax
+ RET
+%endmacro
+
+%if ARCH_X86_32
+IDET_FILTER_LINE mmxext
+IDET_FILTER_LINE mmx
+%endif
+
+;******************************************************************************
+; 16bit implementation that does 4/8-pixels at a time
+
+%macro PABS_DIFF_WD 3 ; a, b, junk; output = a (unsigned word |a - b|, widened and pair-summed into dwords); b and junk are clobbered
+ psubusw %3, %2, %1 ; junk = saturate(b - a)
+ psubusw %1, %2 ; a = saturate(a - b)
+ por %1, %3 ; a = |a - b| (one of the two saturated differences is always zero)
+
+ mova %2, %1 ; b now holds a copy of |a - b|
+ punpcklwd %1, m_zero ; low words -> dwords (m_zero must be defined by the caller)
+ punpckhwd %2, m_zero ; high words -> dwords
+ paddd %1, %2 ; a = dword sums of |a - b|
+%endmacro
+
+%macro IDET_FILTER_LINE_16BIT 1 ; %1=increment (4 or 8 words); sums |A + C - 2*B| over 16-bit samples
+cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index
+ xor indexq, indexq ; index = 0, counted in 16-bit samples (addresses scale it by 2)
+%define m_zero m1
+%define m_sum m0
+ pxor m_sum, m_sum
+ pxor m_zero, m_zero
+
+.loop_16bit:
+ movu m2, [bq + indexq * 2] ; B
+ movu m3, [aq + indexq * 2] ; A
+ mova m6, m2 ; second copy of B for the bc difference
+ psubusw m5, m2, m3 ; ba = saturate(B - A)
+
+ movu m4, [cq + indexq * 2] ; C
+ add indexq, %1 ; advance one vector of samples
+ psubusw m3, m2 ; ab = saturate(A - B)
+ CMP indexd, widthd ; flags are consumed by the jl at the bottom; only SIMD ops follow
+
+ psubusw m6, m4 ; bc = saturate(B - C)
+ psubusw m4, m2 ; cb = saturate(C - B)
+
+ PABS_DIFF_WD m3, m6, m7 ; |ab - bc|
+ PABS_DIFF_WD m5, m4, m7 ; |ba - cb|
+ paddd m_sum, m3 ; |ab - bc| + |ba - cb| equals |A + C - 2*B| per sample
+ paddd m_sum, m5
+ jl .loop_16bit ; signed: loop while index < width (body runs at least once)
+
+ HADDD m_sum, m2 ; x86util macro: horizontal add of the dword accumulators (m2 is scratch)
+ movd eax, m_sum ; return the total in eax
+ RET
+%endmacro
+
+INIT_XMM sse2
+IDET_FILTER_LINE_16BIT 8
+%if ARCH_X86_32
+INIT_MMX mmx
+IDET_FILTER_LINE_16BIT 4
+%endif
+
+;******************************************************************************
+; SSE2 8-bit implementation that does 16-bytes at a time; returns the sum of |A + C - 2*B| over the line in eax:
+
+INIT_XMM sse2
+cglobal idet_filter_line, 4, 6, 7, a, b, c, width, index, total
+ xor indexq, indexq ; index = 0 (byte offset); the named register 'total' is not used in this body
+ pxor m0, m0 ; accumulator for |ab - bc|
+ pxor m1, m1 ; accumulator for |ba - cb|
+
+.sse2_loop:
+ movu m2, [bq + indexq*1] ; B
+ movu m3, [aq + indexq*1] ; A
+ mova m6, m2 ; second copy of B for the bc difference
+ mova m4, m3 ; copy of A, because m3 is reloaded with C below
+ psubusb m5, m2, m3 ; ba = saturate(B - A)
+
+ movu m3, [cq + indexq*1] ; C
+ add indexq, 0x10 ; advance 16 bytes
+ psubusb m4, m2 ; ab = saturate(A - B)
+ CMP indexd, widthd ; flags are consumed by the jl at the bottom; only SIMD ops follow
+
+ psubusb m6, m3 ; bc = saturate(B - C)
+ psubusb m3, m2 ; cb = saturate(C - B)
+
+ psadbw m4, m6 ; |ab - bc|, summed per 8-byte half into the two qword lanes
+ paddq m0, m4
+ psadbw m5, m3 ; |ba - cb|, summed per 8-byte half into the two qword lanes
+ paddq m1, m5
+ jl .sse2_loop ; signed: loop while index < width (body runs at least once)
+
+ paddq m0, m1 ; combine the two accumulators
+ movhlps m1, m0 ; bring the high qword down
+ paddq m0, m1 ; low qword = grand total
+ movd eax, m0 ; return the low 32 bits
+ RET
diff --git a/libavfilter/x86/vf_idet_init.c b/libavfilter/x86/vf_idet_init.c
new file mode 100644
index 0000000000..1147ca8ba8
--- /dev/null
+++ b/libavfilter/x86/vf_idet_init.c
@@ -0,0 +1,87 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/vf_idet.h"
+
+#if HAVE_YASM
+
+/* Declares the asm entry ff_idet_filter_line_KIND() and a static wrapper idet_filter_line_KIND() that runs the asm on the largest multiple of SPAN pixels and ff_idet_filter_line_c() on the left-over tail; SPAN must be a power of two. FUNC_MAIN_DECL_16bit below is the same for 16-bit samples. */
+#define FUNC_MAIN_DECL(KIND, SPAN) \
+int ff_idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b, \
+ const uint8_t *c, int w); \
+static int idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b, \
+ const uint8_t *c, int w) { \
+ int sum = 0; \
+ const int left_over = w & (SPAN - 1); \
+ w -= left_over; \
+ if (w > 0) \
+ sum += ff_idet_filter_line_##KIND(a, b, c, w); \
+ if (left_over > 0) \
+ sum += ff_idet_filter_line_c(a + w, b + w, c + w, left_over); \
+ return sum; \
+}
+
+
+#define FUNC_MAIN_DECL_16bit(KIND, SPAN) \
+int ff_idet_filter_line_16bit_##KIND(const uint16_t *a, const uint16_t *b, \
+ const uint16_t *c, int w); \
+static int idet_filter_line_16bit_##KIND(const uint16_t *a, const uint16_t *b, \
+ const uint16_t *c, int w) { \
+ int sum = 0; \
+ const int left_over = w & (SPAN - 1); \
+ w -= left_over; \
+ if (w > 0) \
+ sum += ff_idet_filter_line_16bit_##KIND(a, b, c, w); \
+ if (left_over > 0) \
+ sum += ff_idet_filter_line_c_16bit(a + w, b + w, c + w, left_over); \
+ return sum; \
+}
+
+FUNC_MAIN_DECL(sse2, 16) /* SPAN = 16 pixels per call granularity */
+FUNC_MAIN_DECL_16bit(sse2, 8) /* SPAN = 8 16-bit samples */
+#if ARCH_X86_32
+FUNC_MAIN_DECL(mmx, 8) /* SPAN = 8 pixels */
+FUNC_MAIN_DECL(mmxext, 8) /* SPAN = 8 pixels */
+FUNC_MAIN_DECL_16bit(mmx, 4) /* SPAN = 4 16-bit samples */
+#endif
+
+#endif
+av_cold void ff_idet_init_x86(IDETContext *idet, int for_16b) /* installs the best available x86 filter_line; for_16b != 0 selects the 16-bit sample variants */
+{
+#if HAVE_YASM
+ const int cpu_flags = av_get_cpu_flags();
+
+#if ARCH_X86_32
+ if (EXTERNAL_MMX(cpu_flags)) {
+ idet->filter_line = for_16b ? (ff_idet_filter_func)idet_filter_line_16bit_mmx : idet_filter_line_mmx;
+ }
+ if (EXTERNAL_MMXEXT(cpu_flags)) { // only the 8-bit path has an MMXEXT wrapper; 16-bit reuses the MMX one
+ idet->filter_line = for_16b ? (ff_idet_filter_func)idet_filter_line_16bit_mmx : idet_filter_line_mmxext;
+ }
+#endif // ARCH_X86_32
+
+ if (EXTERNAL_SSE2(cpu_flags)) { // checked last, so SSE2 overrides the MMX/MMXEXT choices
+ idet->filter_line = for_16b ? (ff_idet_filter_func)idet_filter_line_16bit_sse2 : idet_filter_line_sse2;
+ }
+#endif // HAVE_YASM
+}
diff --git a/libavfilter/x86/vf_interlace.asm b/libavfilter/x86/vf_interlace.asm
index 85811da8d1..ce3dd81311 100644
--- a/libavfilter/x86/vf_interlace.asm
+++ b/libavfilter/x86/vf_interlace.asm
@@ -4,20 +4,20 @@
;* Copyright (C) 2014 Kieran Kunhya <kierank@obe.tv>
;* Copyright (c) 2014 Michael Niedermayer <michaelni@gmx.at>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or modify
+;* FFmpeg is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License along
-;* with Libav; if not, write to the Free Software Foundation, Inc.,
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
;******************************************************************************
diff --git a/libavfilter/x86/vf_interlace_init.c b/libavfilter/x86/vf_interlace_init.c
index 231ab85a1c..68ee47d9bc 100644
--- a/libavfilter/x86/vf_interlace_init.c
+++ b/libavfilter/x86/vf_interlace_init.c
@@ -1,20 +1,20 @@
/*
* Copyright (C) 2014 Kieran Kunhya <kierank@obe.tv>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or modify
+ * FFmpeg is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
- * with Libav; if not, write to the Free Software Foundation, Inc.,
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
diff --git a/libavfilter/x86/vf_noise.c b/libavfilter/x86/vf_noise.c
new file mode 100644
index 0000000000..0a86cb084b
--- /dev/null
+++ b/libavfilter/x86/vf_noise.c
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2013 Paul B Mahol
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/x86/cpu.h"
+#include "libavutil/x86/asm.h"
+#include "libavfilter/vf_noise.h"
+
+#if HAVE_INLINE_ASM
+static void line_noise_mmx(uint8_t *dst, const uint8_t *src, /* dst[i] = clip_uint8(src[i] + noise[shift + i]) for i in [0, len) */
+ const int8_t *noise, int len, int shift)
+{
+ x86_reg mmx_len= len & (~7); /* part of the line handled 8 bytes at a time by the MMX loop */
+ noise += shift;
+
+ if (mmx_len) __asm__ volatile( /* skip for len < 8: the loop tests its counter only after the body, so it would touch 8 bytes regardless */
+ "mov %3, %%"REG_a" \n\t" /* REG_a = -mmx_len, counts up to 0 */
+ "pcmpeqb %%mm7, %%mm7 \n\t"
+ "psllw $15, %%mm7 \n\t"
+ "packsswb %%mm7, %%mm7 \n\t" /* mm7 = 0x80 in every byte */
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movq (%0, %%"REG_a"), %%mm0 \n\t" /* 8 source pixels */
+ "movq (%1, %%"REG_a"), %%mm1 \n\t" /* 8 noise values */
+ "pxor %%mm7, %%mm0 \n\t" /* bias unsigned pixels into signed range */
+ "paddsb %%mm1, %%mm0 \n\t" /* signed saturating add of the noise */
+ "pxor %%mm7, %%mm0 \n\t" /* back to unsigned */
+ "movq %%mm0, (%2, %%"REG_a") \n\t"
+ "add $8, %%"REG_a" \n\t"
+ " js 1b \n\t"
+ :: "r" (src+mmx_len), "r" (noise+mmx_len), "r" (dst+mmx_len), "g" (-mmx_len)
+ : "%"REG_a, "memory" /* the asm reads and writes memory that is not listed as an operand */
+ );
+ if (mmx_len != len) /* remaining 1..7 pixels in C; noise was already advanced by shift, hence shift 0 */
+ ff_line_noise_c(dst+mmx_len, src+mmx_len, noise+mmx_len, len-mmx_len, 0);
+}
+
+#if HAVE_6REGS
+static void line_noise_avg_mmx(uint8_t *dst, const uint8_t *src, /* blends src with the byte-wise sum of the three noise lines shift[0..2] */
+ int len, const int8_t * const *shift)
+{
+ x86_reg mmx_len = len & (~7); /* part of the line handled 8 bytes at a time by the MMX loop */
+
+ if (mmx_len) __asm__ volatile( /* skip for len < 8: the loop tests its counter only after the body, so it would touch 8 bytes regardless */
+ "mov %5, %%"REG_a" \n\t" /* REG_a = -mmx_len, counts up to 0 */
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movq (%1, %%"REG_a"), %%mm1 \n\t"
+ "movq (%0, %%"REG_a"), %%mm0 \n\t" /* mm0 = 8 source pixels */
+ "paddb (%2, %%"REG_a"), %%mm1 \n\t"
+ "paddb (%3, %%"REG_a"), %%mm1 \n\t" /* mm1 = shift[0] + shift[1] + shift[2] (wrapping byte adds) */
+ "movq %%mm0, %%mm2 \n\t"
+ "movq %%mm1, %%mm3 \n\t"
+ "punpcklbw %%mm0, %%mm0 \n\t" /* each byte duplicated into both halves of a word */
+ "punpckhbw %%mm2, %%mm2 \n\t"
+ "punpcklbw %%mm1, %%mm1 \n\t"
+ "punpckhbw %%mm3, %%mm3 \n\t"
+ "pmulhw %%mm0, %%mm1 \n\t" /* high 16 bits of the signed products src * noise */
+ "pmulhw %%mm2, %%mm3 \n\t"
+ "paddw %%mm1, %%mm1 \n\t" /* doubled */
+ "paddw %%mm3, %%mm3 \n\t"
+ "paddw %%mm0, %%mm1 \n\t" /* plus the expanded source */
+ "paddw %%mm2, %%mm3 \n\t"
+ "psrlw $8, %%mm1 \n\t" /* back to 8-bit range */
+ "psrlw $8, %%mm3 \n\t"
+ "packuswb %%mm3, %%mm1 \n\t"
+ "movq %%mm1, (%4, %%"REG_a") \n\t"
+ "add $8, %%"REG_a" \n\t"
+ " js 1b \n\t"
+ :: "r" (src+mmx_len), "r" (shift[0]+mmx_len), "r" (shift[1]+mmx_len), "r" (shift[2]+mmx_len),
+ "r" (dst+mmx_len), "g" (-mmx_len)
+ : "%"REG_a, "memory" /* the asm reads and writes memory that is not listed as an operand */
+ );
+
+ if (mmx_len != len){ /* remaining 1..7 pixels in C, with the noise pointers advanced past the MMX part */
+ const int8_t *shift2[3] = { shift[0]+mmx_len, shift[1]+mmx_len, shift[2]+mmx_len };
+ ff_line_noise_avg_c(dst+mmx_len, src+mmx_len, len-mmx_len, shift2);
+ }
+}
+#endif /* HAVE_6REGS */
+
+static void line_noise_mmxext(uint8_t *dst, const uint8_t *src, /* same result as line_noise_mmx, but stores with movntq */
+ const int8_t *noise, int len, int shift)
+{
+ x86_reg mmx_len = len & (~7); /* part of the line handled 8 bytes at a time by the MMX loop */
+ noise += shift;
+
+ if (mmx_len) __asm__ volatile( /* skip for len < 8: the loop tests its counter only after the body, so it would touch 8 bytes regardless */
+ "mov %3, %%"REG_a" \n\t" /* REG_a = -mmx_len, counts up to 0 */
+ "pcmpeqb %%mm7, %%mm7 \n\t"
+ "psllw $15, %%mm7 \n\t"
+ "packsswb %%mm7, %%mm7 \n\t" /* mm7 = 0x80 in every byte */
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movq (%0, %%"REG_a"), %%mm0 \n\t" /* 8 source pixels */
+ "movq (%1, %%"REG_a"), %%mm1 \n\t" /* 8 noise values */
+ "pxor %%mm7, %%mm0 \n\t" /* bias unsigned pixels into signed range */
+ "paddsb %%mm1, %%mm0 \n\t" /* signed saturating add of the noise */
+ "pxor %%mm7, %%mm0 \n\t" /* back to unsigned */
+ "movntq %%mm0, (%2, %%"REG_a") \n\t" /* non-temporal store */
+ "add $8, %%"REG_a" \n\t"
+ " js 1b \n\t"
+ :: "r" (src+mmx_len), "r" (noise+mmx_len), "r" (dst+mmx_len), "g" (-mmx_len)
+ : "%"REG_a, "memory" /* the asm reads and writes memory that is not listed as an operand */
+ );
+ if (mmx_len != len) /* remaining 1..7 pixels in C; noise was already advanced by shift, hence shift 0 */
+ ff_line_noise_c(dst+mmx_len, src+mmx_len, noise+mmx_len, len-mmx_len, 0);
+}
+#endif /* HAVE_INLINE_ASM */
+
+av_cold void ff_noise_init_x86(NoiseContext *n) /* installs the inline-asm line functions supported by the running CPU */
+{
+#if HAVE_INLINE_ASM
+ int cpu_flags = av_get_cpu_flags();
+
+ if (INLINE_MMX(cpu_flags)) {
+ n->line_noise = line_noise_mmx;
+#if HAVE_6REGS
+ n->line_noise_avg = line_noise_avg_mmx; /* only defined when HAVE_6REGS is set; the asm uses six operands plus REG_a */
+#endif
+ }
+ if (INLINE_MMXEXT(cpu_flags)) { /* checked after MMX, so it overrides line_noise only */
+ n->line_noise = line_noise_mmxext;
+ }
+#endif
+}
diff --git a/libavfilter/x86/vf_pp7.asm b/libavfilter/x86/vf_pp7.asm
new file mode 100644
index 0000000000..7b3e5cf5e3
--- /dev/null
+++ b/libavfilter/x86/vf_pp7.asm
@@ -0,0 +1,57 @@
+;*****************************************************************************
+;* x86-optimized functions for pp7 filter
+;*
+;* Copyright (c) 2005 Michael Niedermayer <michaelni@gmx.at>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+INIT_MMX mmx
+
+;void ff_pp7_dctB_mmx(int16_t *dst, int16_t *src) -- reads 7 rows of 4 words (s0..s6, mmsize bytes apart), writes 4 rows
+cglobal pp7_dctB, 2, 2, 0, dst, src
+ movq m0, [srcq]
+ movq m1, [srcq+mmsize*1]
+ paddw m0, [srcq+mmsize*6] ; m0 = p = s0 + s6
+ paddw m1, [srcq+mmsize*5] ; m1 = q = s1 + s5
+ movq m2, [srcq+mmsize*2]
+ movq m3, [srcq+mmsize*3]
+ paddw m2, [srcq+mmsize*4] ; m2 = r = s2 + s4
+ paddw m3, m3 ; m3 = t = 2*s3
+ movq m4, m3
+ psubw m3, m0 ; m3 = e = t - p
+ paddw m4, m0 ; m4 = f = t + p
+ movq m0, m2
+ psubw m2, m1 ; m2 = g = r - q
+ paddw m0, m1 ; m0 = h = r + q
+ movq m1, m4
+ psubw m4, m0 ; m4 = f - h
+ paddw m1, m0 ; m1 = f + h
+ movq m0, m3 ; m0 = e
+ psubw m3, m2
+ psubw m3, m2 ; m3 = e - 2*g
+ paddw m2, m0
+ paddw m2, m0 ; m2 = g + 2*e
+ movq [dstq], m1 ; row 0 = f + h
+ movq [dstq+mmsize*2], m4 ; row 2 = f - h
+ movq [dstq+mmsize*1], m2 ; row 1 = g + 2*e
+ movq [dstq+mmsize*3], m3 ; row 3 = e - 2*g
+ RET
diff --git a/libavfilter/x86/vf_pp7_init.c b/libavfilter/x86/vf_pp7_init.c
new file mode 100644
index 0000000000..165b0dd5d0
--- /dev/null
+++ b/libavfilter/x86/vf_pp7_init.c
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2005 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/vf_pp7.h"
+
+void ff_pp7_dctB_mmx(int16_t *dst, int16_t *src); /* implemented in vf_pp7.asm */
+
+av_cold void ff_pp7_init_x86(PP7Context *p) /* replaces p->dctB with the MMX version when the CPU supports it */
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_MMX(cpu_flags))
+ p->dctB = ff_pp7_dctB_mmx;
+}
diff --git a/libavfilter/x86/vf_pullup.asm b/libavfilter/x86/vf_pullup.asm
new file mode 100644
index 0000000000..d3a195511e
--- /dev/null
+++ b/libavfilter/x86/vf_pullup.asm
@@ -0,0 +1,178 @@
+;*****************************************************************************
+;* x86-optimized functions for pullup filter
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_TEXT
+
+INIT_MMX mmx
+cglobal pullup_filter_diff, 3, 5, 8, first, second, size ; returns in eax the sum of |first - second| over 4 rows of 8 bytes, rows 'size' bytes apart
+ mov r3, 4 ; row counter
+ pxor m4, m4 ; word accumulators
+ pxor m7, m7 ; zero for the unpacks
+
+.loop:
+ movq m0, [firstq]
+ movq m2, [firstq] ; two copies of the row of first
+ add firstq, sizeq
+ movq m1, [secondq]
+ add secondq, sizeq
+ psubusb m2, m1 ; saturate(first - second)
+ psubusb m1, m0 ; saturate(second - first)
+ movq m0, m2
+ movq m3, m1
+ punpcklbw m0, m7 ; widen both one-sided differences to words
+ punpcklbw m1, m7
+ punpckhbw m2, m7
+ punpckhbw m3, m7
+ paddw m4, m0 ; accumulate; the two one-sided differences together form |first - second|
+ paddw m4, m1
+ paddw m4, m2
+ paddw m4, m3
+
+ dec r3
+ jnz .loop
+
+ movq m3, m4
+ punpcklwd m4, m7 ; widen the four word sums to dwords
+ punpckhwd m3, m7
+ paddd m3, m4
+ movd eax, m3 ; low dword
+ psrlq m3, 32
+ movd r4d, m3 ; high dword
+ add eax, r4d ; total
+ RET
+
+INIT_MMX mmx
+cglobal pullup_filter_comb, 3, 5, 8, first, second, size ; returns in eax a sum over 4 rows of 8 bytes of |2*first - (second_above + second)| + |2*second - (first + first_below)|
+ mov r3, 4 ; row counter
+ pxor m6, m6 ; word accumulators
+ pxor m7, m7 ; zero for the unpacks
+ sub secondq, sizeq ; secondq now points one row above; [secondq+sizeq] is the current row of second
+
+.loop:
+ movq m0, [firstq] ; low 4 bytes: |2*first - (second_above + second)|
+ movq m1, [secondq]
+ punpcklbw m0, m7
+ movq m2, [secondq+sizeq]
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ paddw m0, m0
+ paddw m1, m2
+ movq m2, m0
+ psubusw m0, m1 ; the two saturated one-sided differences add up to the absolute difference
+ psubusw m1, m2
+ paddw m6, m0
+ paddw m6, m1
+
+ movq m0, [firstq] ; same as above for the high 4 bytes
+ movq m1, [secondq]
+ punpckhbw m0, m7
+ movq m2, [secondq+sizeq]
+ punpckhbw m1, m7
+ punpckhbw m2, m7
+ paddw m0, m0
+ paddw m1, m2
+ movq m2, m0
+ psubusw m0, m1
+ psubusw m1, m2
+ paddw m6, m0
+ paddw m6, m1
+
+ movq m0, [secondq+sizeq] ; low 4 bytes: |2*second - (first + first_below)|
+ movq m1, [firstq]
+ punpcklbw m0, m7
+ movq m2, [firstq+sizeq]
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ paddw m0, m0
+ paddw m1, m2
+ movq m2, m0
+ psubusw m0, m1
+ psubusw m1, m2
+ paddw m6, m0
+ paddw m6, m1
+
+ movq m0, [secondq+sizeq] ; same as above for the high 4 bytes
+ movq m1, [firstq]
+ punpckhbw m0, m7
+ movq m2, [firstq+sizeq]
+ punpckhbw m1, m7
+ punpckhbw m2, m7
+ paddw m0, m0
+ paddw m1, m2
+ movq m2, m0
+ psubusw m0, m1
+ psubusw m1, m2
+ paddw m6, m0
+ paddw m6, m1
+
+ add firstq, sizeq ; next row in both planes
+ add secondq, sizeq
+ dec r3
+ jnz .loop
+
+ movq m5, m6
+ punpcklwd m6, m7 ; widen the four word sums to dwords
+ punpckhwd m5, m7
+ paddd m5, m6
+ movd eax, m5 ; low dword
+ psrlq m5, 32
+ movd r4d, m5 ; high dword
+ add eax, r4d ; total
+ RET
+
+INIT_MMX mmx
+cglobal pullup_filter_var, 3, 5, 8, first, second, size ; returns in eax 4 * sum of |first[row] - first[row+1]| over 3 row pairs of 8 bytes; 'second' is never read in this body
+ mov r3, 3 ; three adjacent row pairs
+ pxor m4, m4 ; word accumulators
+ pxor m7, m7 ; zero for the unpacks
+
+.loop:
+ movq m0, [firstq]
+ movq m2, [firstq] ; two copies of the current row
+ movq m1, [firstq+sizeq] ; the row below
+ add firstq, sizeq
+ psubusb m2, m1 ; saturate(row - below)
+ psubusb m1, m0 ; saturate(below - row)
+ movq m0, m2
+ movq m3, m1
+ punpcklbw m0, m7 ; widen both one-sided differences to words
+ punpcklbw m1, m7
+ punpckhbw m2, m7
+ punpckhbw m3, m7
+ paddw m4, m0 ; accumulate; the two one-sided differences together form the absolute difference
+ paddw m4, m1
+ paddw m4, m2
+ paddw m4, m3
+
+ dec r3
+ jnz .loop
+
+ movq m3, m4
+ punpcklwd m4, m7 ; widen the four word sums to dwords
+ punpckhwd m3, m7
+ paddd m3, m4
+ movd eax, m3 ; low dword
+ psrlq m3, 32
+ movd r4d, m3 ; high dword
+ add eax, r4d ; total
+ shl eax, 2 ; result is scaled by 4
+ RET
diff --git a/libavfilter/x86/vf_pullup_init.c b/libavfilter/x86/vf_pullup_init.c
new file mode 100644
index 0000000000..5b36b68e51
--- /dev/null
+++ b/libavfilter/x86/vf_pullup_init.c
@@ -0,0 +1,41 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/vf_pullup.h"
+
+int ff_pullup_filter_diff_mmx(const uint8_t *a, const uint8_t *b, ptrdiff_t s); /* the three prototypes are implemented in vf_pullup.asm; s is the row stride in bytes */
+int ff_pullup_filter_comb_mmx(const uint8_t *a, const uint8_t *b, ptrdiff_t s);
+int ff_pullup_filter_var_mmx (const uint8_t *a, const uint8_t *b, ptrdiff_t s);
+
+av_cold void ff_pullup_init_x86(PullupContext *s) /* installs the MMX diff/comb/var metrics when the CPU supports them */
+{
+#if HAVE_YASM
+ int cpu_flags = av_get_cpu_flags();
+
+ if (EXTERNAL_MMX(cpu_flags)) {
+ s->diff = ff_pullup_filter_diff_mmx;
+ s->comb = ff_pullup_filter_comb_mmx;
+ s->var = ff_pullup_filter_var_mmx;
+ }
+#endif
+}
diff --git a/libavfilter/x86/vf_spp.c b/libavfilter/x86/vf_spp.c
new file mode 100644
index 0000000000..1cfb9e81f7
--- /dev/null
+++ b/libavfilter/x86/vf_spp.c
@@ -0,0 +1,235 @@
+/*
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavfilter/vf_spp.h"
+
+#if HAVE_MMX_INLINE
+static void hardthresh_mmx(int16_t dst[64], const int16_t src[64], /* hard threshold: 0 where |src| <= threshold1, else (src + 4) >> 3, stored in interleaved order */
+ int qp, const uint8_t *permutation) /* permutation is not referenced in this body */
+{
+ int bias = 0; //FIXME
+ unsigned int threshold1;
+
+ threshold1 = qp * ((1<<4) - bias) - 1; // 16*qp - 1 while bias is 0
+
+#define REQUANT_CORE(dst0, dst1, dst2, dst3, src0, src1, src2, src3) \
+ "movq " #src0 ", %%mm0 \n" \
+ "movq " #src1 ", %%mm1 \n" \
+ "movq " #src2 ", %%mm2 \n" \
+ "movq " #src3 ", %%mm3 \n" \
+ "psubw %%mm4, %%mm0 \n" \
+ "psubw %%mm4, %%mm1 \n" \
+ "psubw %%mm4, %%mm2 \n" \
+ "psubw %%mm4, %%mm3 \n" \
+ "paddusw %%mm5, %%mm0 \n" \
+ "paddusw %%mm5, %%mm1 \n" \
+ "paddusw %%mm5, %%mm2 \n" \
+ "paddusw %%mm5, %%mm3 \n" \
+ "paddw %%mm6, %%mm0 \n" \
+ "paddw %%mm6, %%mm1 \n" \
+ "paddw %%mm6, %%mm2 \n" \
+ "paddw %%mm6, %%mm3 \n" \
+ "psubusw %%mm6, %%mm0 \n" \
+ "psubusw %%mm6, %%mm1 \n" \
+ "psubusw %%mm6, %%mm2 \n" \
+ "psubusw %%mm6, %%mm3 \n" \
+ "psraw $3, %%mm0 \n" \
+ "psraw $3, %%mm1 \n" \
+ "psraw $3, %%mm2 \n" \
+ "psraw $3, %%mm3 \n" \
+ \
+ "movq %%mm0, %%mm7 \n" \
+ "punpcklwd %%mm2, %%mm0 \n" /*A*/ \
+ "punpckhwd %%mm2, %%mm7 \n" /*C*/ \
+ "movq %%mm1, %%mm2 \n" \
+ "punpcklwd %%mm3, %%mm1 \n" /*B*/ \
+ "punpckhwd %%mm3, %%mm2 \n" /*D*/ \
+ "movq %%mm0, %%mm3 \n" \
+ "punpcklwd %%mm1, %%mm0 \n" /*A*/ \
+ "punpckhwd %%mm7, %%mm3 \n" /*C*/ \
+ "punpcklwd %%mm2, %%mm7 \n" /*B*/ \
+ "punpckhwd %%mm2, %%mm1 \n" /*D*/ \
+ \
+ "movq %%mm0, " #dst0 " \n" \
+ "movq %%mm7, " #dst1 " \n" \
+ "movq %%mm3, " #dst2 " \n" \
+ "movq %%mm1, " #dst3 " \n"
+
+ __asm__ volatile(
+ "movd %2, %%mm4 \n" // mm4 = threshold1 + 1
+ "movd %3, %%mm5 \n" // mm5 = threshold1 + 5
+ "movd %4, %%mm6 \n" // mm6 = threshold1 - 4
+ "packssdw %%mm4, %%mm4 \n" // two packssdw passes broadcast each value to all four words
+ "packssdw %%mm5, %%mm5 \n"
+ "packssdw %%mm6, %%mm6 \n"
+ "packssdw %%mm4, %%mm4 \n"
+ "packssdw %%mm5, %%mm5 \n"
+ "packssdw %%mm6, %%mm6 \n"
+ REQUANT_CORE( (%1), 8(%1), 16(%1), 24(%1), (%0), 8(%0), 64(%0), 72(%0))
+ REQUANT_CORE(32(%1), 40(%1), 48(%1), 56(%1),16(%0),24(%0), 48(%0), 56(%0))
+ REQUANT_CORE(64(%1), 72(%1), 80(%1), 88(%1),32(%0),40(%0), 96(%0),104(%0))
+ REQUANT_CORE(96(%1),104(%1),112(%1),120(%1),80(%0),88(%0),112(%0),120(%0))
+ : : "r" (src), "r" (dst), "g" (threshold1+1), "g" (threshold1+5), "g" (threshold1-4) //FIXME maybe more accurate than needed?
+ ); // NOTE(review): the asm stores to dst but declares no "memory" clobber; confirm the dst[0] store below cannot be reordered across it
+ dst[0] = (src[0] + 4) >> 3; // DC coefficient bypasses the threshold: rounded shift only
+}
+
+static void softthresh_mmx(int16_t dst[64], const int16_t src[64], /* soft threshold: magnitudes are reduced by threshold1 (saturating at 0), then (x + 4) >> 3, stored in interleaved order */
+ int qp, const uint8_t *permutation) /* permutation is not referenced in this body */
+{
+ int bias = 0; //FIXME
+ unsigned int threshold1;
+
+ threshold1 = qp*((1<<4) - bias) - 1; // 16*qp - 1 while bias is 0
+
+#undef REQUANT_CORE
+#define REQUANT_CORE(dst0, dst1, dst2, dst3, src0, src1, src2, src3) \
+ "movq " #src0 ", %%mm0 \n" \
+ "movq " #src1 ", %%mm1 \n" \
+ "pxor %%mm6, %%mm6 \n" \
+ "pxor %%mm7, %%mm7 \n" \
+ "pcmpgtw %%mm0, %%mm6 \n" \
+ "pcmpgtw %%mm1, %%mm7 \n" \
+ "pxor %%mm6, %%mm0 \n" \
+ "pxor %%mm7, %%mm1 \n" \
+ "psubusw %%mm4, %%mm0 \n" \
+ "psubusw %%mm4, %%mm1 \n" \
+ "pxor %%mm6, %%mm0 \n" \
+ "pxor %%mm7, %%mm1 \n" \
+ "movq " #src2 ", %%mm2 \n" \
+ "movq " #src3 ", %%mm3 \n" \
+ "pxor %%mm6, %%mm6 \n" \
+ "pxor %%mm7, %%mm7 \n" \
+ "pcmpgtw %%mm2, %%mm6 \n" \
+ "pcmpgtw %%mm3, %%mm7 \n" \
+ "pxor %%mm6, %%mm2 \n" \
+ "pxor %%mm7, %%mm3 \n" \
+ "psubusw %%mm4, %%mm2 \n" \
+ "psubusw %%mm4, %%mm3 \n" \
+ "pxor %%mm6, %%mm2 \n" \
+ "pxor %%mm7, %%mm3 \n" \
+ \
+ "paddsw %%mm5, %%mm0 \n" \
+ "paddsw %%mm5, %%mm1 \n" \
+ "paddsw %%mm5, %%mm2 \n" \
+ "paddsw %%mm5, %%mm3 \n" \
+ "psraw $3, %%mm0 \n" \
+ "psraw $3, %%mm1 \n" \
+ "psraw $3, %%mm2 \n" \
+ "psraw $3, %%mm3 \n" \
+ \
+ "movq %%mm0, %%mm7 \n" \
+ "punpcklwd %%mm2, %%mm0 \n" /*A*/ \
+ "punpckhwd %%mm2, %%mm7 \n" /*C*/ \
+ "movq %%mm1, %%mm2 \n" \
+ "punpcklwd %%mm3, %%mm1 \n" /*B*/ \
+ "punpckhwd %%mm3, %%mm2 \n" /*D*/ \
+ "movq %%mm0, %%mm3 \n" \
+ "punpcklwd %%mm1, %%mm0 \n" /*A*/ \
+ "punpckhwd %%mm7, %%mm3 \n" /*C*/ \
+ "punpcklwd %%mm2, %%mm7 \n" /*B*/ \
+ "punpckhwd %%mm2, %%mm1 \n" /*D*/ \
+ \
+ "movq %%mm0, " #dst0 " \n" \
+ "movq %%mm7, " #dst1 " \n" \
+ "movq %%mm3, " #dst2 " \n" \
+ "movq %%mm1, " #dst3 " \n"
+
+ __asm__ volatile(
+ "movd %2, %%mm4 \n" // mm4 = threshold1
+ "movd %3, %%mm5 \n" // mm5 = 4 (rounding term added before the >> 3)
+ "packssdw %%mm4, %%mm4 \n" // two packssdw passes broadcast each value to all four words
+ "packssdw %%mm5, %%mm5 \n"
+ "packssdw %%mm4, %%mm4 \n"
+ "packssdw %%mm5, %%mm5 \n"
+ REQUANT_CORE( (%1), 8(%1), 16(%1), 24(%1), (%0), 8(%0), 64(%0), 72(%0))
+ REQUANT_CORE(32(%1), 40(%1), 48(%1), 56(%1),16(%0),24(%0), 48(%0), 56(%0))
+ REQUANT_CORE(64(%1), 72(%1), 80(%1), 88(%1),32(%0),40(%0), 96(%0),104(%0))
+ REQUANT_CORE(96(%1),104(%1),112(%1),120(%1),80(%0),88(%0),112(%0),120(%0))
+ : : "r" (src), "r" (dst), "g" (threshold1), "rm" (4) //FIXME maybe more accurate than needed?
+ ); // NOTE(review): the asm stores to dst but declares no "memory" clobber; confirm the dst[0] store below cannot be reordered across it
+
+ dst[0] = (src[0] + 4) >> 3; // DC coefficient bypasses the threshold: rounded shift only
+}
+
+static void store_slice_mmx(uint8_t *dst, const int16_t *src, /* per row: dst = clip_uint8((src + (dither[y] >> log2_scale)) >> (MAX_LEVEL - log2_scale)) */
+ int dst_stride, int src_stride,
+ int width, int height, int log2_scale,
+ const uint8_t dither[8][8])
+{
+ int y;
+
+ for (y = 0; y < height; y++) { // NOTE(review): dither[y] is indexed with the raw row number; confirm callers keep height <= 8
+ uint8_t *dst1 = dst;
+ const int16_t *src1 = src;
+ __asm__ volatile(
+ "movq (%3), %%mm3 \n" // 8 dither bytes for this row
+ "movq (%3), %%mm4 \n"
+ "movd %4, %%mm2 \n" // mm2 = log2_scale
+ "pxor %%mm0, %%mm0 \n"
+ "punpcklbw %%mm0, %%mm3 \n" // mm3 = dither[0..3] as words
+ "punpckhbw %%mm0, %%mm4 \n" // mm4 = dither[4..7] as words
+ "psraw %%mm2, %%mm3 \n" // scale the dither down by log2_scale
+ "psraw %%mm2, %%mm4 \n"
+ "movd %5, %%mm2 \n" // mm2 = MAX_LEVEL - log2_scale, the final shift
+ "1: \n"
+ "movq (%0), %%mm0 \n" // 8 source words
+ "movq 8(%0), %%mm1 \n"
+ "paddw %%mm3, %%mm0 \n"
+ "paddw %%mm4, %%mm1 \n"
+ "psraw %%mm2, %%mm0 \n"
+ "psraw %%mm2, %%mm1 \n"
+ "packuswb %%mm1, %%mm0 \n" // saturate to unsigned bytes
+ "movq %%mm0, (%1) \n"
+ "add $16, %0 \n"
+ "add $8, %1 \n"
+ "cmp %2, %1 \n"
+ " jb 1b \n" // loop while dst1 < dst + width; always stores whole groups of 8 bytes, at least one per row
+ : "+r" (src1), "+r"(dst1)
+ : "r"(dst + width), "r"(dither[y]), "g"(log2_scale), "g"(MAX_LEVEL - log2_scale)
+ );
+ src += src_stride; // strides are applied in elements of each pointer's type
+ dst += dst_stride;
+ }
+}
+
+#endif /* HAVE_MMX_INLINE */
+
+av_cold void ff_spp_init_x86(SPPContext *s) /* installs the MMX store_slice and, for <= 8-bit content, the MMX requantizer matching s->mode */
+{
+#if HAVE_MMX_INLINE
+ int cpu_flags = av_get_cpu_flags();
+
+ if (cpu_flags & AV_CPU_FLAG_MMX) {
+ s->store_slice = store_slice_mmx;
+ if (av_get_int(s->dct, "bits_per_sample", NULL) <= 8) { // requantizers are only replaced when the dct context reports at most 8 bits per sample
+ switch (s->mode) { // any other mode keeps the requantize pointer set by the caller
+ case 0: s->requantize = hardthresh_mmx; break;
+ case 1: s->requantize = softthresh_mmx; break;
+ }
+ }
+ }
+#endif
+}
diff --git a/libavfilter/x86/vf_tinterlace_init.c b/libavfilter/x86/vf_tinterlace_init.c
new file mode 100644
index 0000000000..ddb0cced36
--- /dev/null
+++ b/libavfilter/x86/vf_tinterlace_init.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2014 Kieran Kunhya <kierank@obe.tv>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/internal.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+
+#include "libavfilter/tinterlace.h"
+
+void ff_lowpass_line_sse2(uint8_t *dstp, ptrdiff_t linesize,
+ const uint8_t *srcp,
+ const uint8_t *srcp_above,
+ const uint8_t *srcp_below);
+void ff_lowpass_line_avx (uint8_t *dstp, ptrdiff_t linesize,
+ const uint8_t *srcp,
+ const uint8_t *srcp_above,
+ const uint8_t *srcp_below);
+
+/**
+ * Select the fastest available x86 lowpass_line implementation.
+ *
+ * AVX is preferred over SSE2; if neither is usable the context is left
+ * untouched.
+ */
+av_cold void ff_tinterlace_init_x86(TInterlaceContext *s)
+{
+    const int flags = av_get_cpu_flags();
+
+    if (EXTERNAL_AVX(flags)) {
+        s->lowpass_line = ff_lowpass_line_avx;
+    } else if (EXTERNAL_SSE2(flags)) {
+        s->lowpass_line = ff_lowpass_line_sse2;
+    }
+}
diff --git a/libavfilter/x86/vf_yadif.asm b/libavfilter/x86/vf_yadif.asm
index 3d8b2bc180..a29620ce55 100644
--- a/libavfilter/x86/vf_yadif.asm
+++ b/libavfilter/x86/vf_yadif.asm
@@ -4,20 +4,20 @@
;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
;*
-;* Libav is free software; you can redistribute it and/or
+;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with Libav; if not, write to the Free Software
+;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
@@ -39,11 +39,7 @@ SECTION .text
pavgb m5, m3
pand m4, [pb_1]
psubusb m5, m4
-%if mmsize == 16
- psrldq m5, 1
-%else
- psrlq m5, 8
-%endif
+ RSHIFT m5, 1
punpcklbw m5, m7
mova m4, m2
psubusb m2, m3
@@ -51,13 +47,8 @@ SECTION .text
pmaxub m2, m3
mova m3, m2
mova m4, m2
-%if mmsize == 16
- psrldq m3, 1
- psrldq m4, 2
-%else
- psrlq m3, 8
- psrlq m4, 16
-%endif
+ RSHIFT m3, 1
+ RSHIFT m4, 2
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
@@ -90,17 +81,17 @@ SECTION .text
%endmacro
%macro LOAD 2
- movh m%1, %2
- punpcklbw m%1, m7
+ movh %1, %2
+ punpcklbw %1, m7
%endmacro
%macro FILTER 3
.loop%1:
pxor m7, m7
- LOAD 0, [curq+t1]
- LOAD 1, [curq+t0]
- LOAD 2, [%2]
- LOAD 3, [%3]
+ LOAD m0, [curq+t1]
+ LOAD m1, [curq+t0]
+ LOAD m2, [%2]
+ LOAD m3, [%3]
mova m4, m3
paddw m3, m2
psraw m3, 1
@@ -109,8 +100,8 @@ SECTION .text
mova [rsp+32], m1
psubw m2, m4
ABS1 m2, m4
- LOAD 3, [prevq+t1]
- LOAD 4, [prevq+t0]
+ LOAD m3, [prevq+t1]
+ LOAD m4, [prevq+t0]
psubw m3, m0
psubw m4, m1
ABS1 m3, m5
@@ -119,8 +110,8 @@ SECTION .text
psrlw m2, 1
psrlw m3, 1
pmaxsw m2, m3
- LOAD 3, [nextq+t1]
- LOAD 4, [nextq+t0]
+ LOAD m3, [nextq+t1]
+ LOAD m4, [nextq+t0]
psubw m3, m0
psubw m4, m1
ABS1 m3, m5
@@ -166,10 +157,10 @@ SECTION .text
mova m6, [rsp+48]
cmp DWORD r8m, 2
jge .end%1
- LOAD 2, [%2+t1*2]
- LOAD 4, [%3+t1*2]
- LOAD 3, [%2+t0*2]
- LOAD 5, [%3+t0*2]
+ LOAD m2, [%2+t1*2]
+ LOAD m4, [%3+t1*2]
+ LOAD m3, [%2+t0*2]
+ LOAD m5, [%3+t0*2]
paddw m2, m4
paddw m3, m5
psrlw m2, 1
@@ -220,8 +211,6 @@ cglobal yadif_filter_line, 4, 6, 8, 80, dst, prev, cur, next, w, prefs, \
cglobal yadif_filter_line, 4, 7, 8, 80, dst, prev, cur, next, w, prefs, \
mrefs, parity, mode
%endif
- cmp DWORD wm, 0
- jle .ret
%if ARCH_X86_32
mov r4, r5mp
mov r5, r6mp
diff --git a/libavfilter/x86/vf_yadif_init.c b/libavfilter/x86/vf_yadif_init.c
index 510a02394c..1460a642c3 100644
--- a/libavfilter/x86/vf_yadif_init.c
+++ b/libavfilter/x86/vf_yadif_init.c
@@ -1,26 +1,25 @@
/*
* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
-#include "libavutil/internal.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
@@ -36,16 +35,63 @@ void ff_yadif_filter_line_ssse3(void *dst, void *prev, void *cur,
void *next, int w, int prefs,
int mrefs, int parity, int mode);
+void ff_yadif_filter_line_16bit_mmxext(void *dst, void *prev, void *cur,
+ void *next, int w, int prefs,
+ int mrefs, int parity, int mode);
+void ff_yadif_filter_line_16bit_sse2(void *dst, void *prev, void *cur,
+ void *next, int w, int prefs,
+ int mrefs, int parity, int mode);
+void ff_yadif_filter_line_16bit_ssse3(void *dst, void *prev, void *cur,
+ void *next, int w, int prefs,
+ int mrefs, int parity, int mode);
+void ff_yadif_filter_line_16bit_sse4(void *dst, void *prev, void *cur,
+ void *next, int w, int prefs,
+ int mrefs, int parity, int mode);
+
+void ff_yadif_filter_line_10bit_mmxext(void *dst, void *prev, void *cur,
+ void *next, int w, int prefs,
+ int mrefs, int parity, int mode);
+void ff_yadif_filter_line_10bit_sse2(void *dst, void *prev, void *cur,
+ void *next, int w, int prefs,
+ int mrefs, int parity, int mode);
+void ff_yadif_filter_line_10bit_ssse3(void *dst, void *prev, void *cur,
+ void *next, int w, int prefs,
+ int mrefs, int parity, int mode);
+
av_cold void ff_yadif_init_x86(YADIFContext *yadif) /* pick the filter_line kernel matching the sample depth and CPU features */
{
int cpu_flags = av_get_cpu_flags();
+ int bit_depth = (!yadif->csp) ? 8 /* no descriptor set: treat as 8-bit */
+ : yadif->csp->comp[0].depth_minus1 + 1;
+ if (bit_depth >= 15) { /* 15 bits and up: *_16bit kernels */
+#if ARCH_X86_32
+ if (EXTERNAL_MMXEXT(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_16bit_mmxext;
+#endif /* ARCH_X86_32 */
+ if (EXTERNAL_SSE2(cpu_flags)) /* each later match overrides the earlier one */
+ yadif->filter_line = ff_yadif_filter_line_16bit_sse2;
+ if (EXTERNAL_SSSE3(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_16bit_ssse3;
+ if (EXTERNAL_SSE4(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_16bit_sse4;
+ } else if ( bit_depth >= 9 && bit_depth <= 14) { /* 9..14 bits: *_10bit kernels (no SSE4 variant is declared) */
+#if ARCH_X86_32
+ if (EXTERNAL_MMXEXT(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_10bit_mmxext;
+#endif /* ARCH_X86_32 */
+ if (EXTERNAL_SSE2(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_10bit_sse2;
+ if (EXTERNAL_SSSE3(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_10bit_ssse3;
+ } else { /* 8 bits or fewer: the original byte kernels */
#if ARCH_X86_32
- if (EXTERNAL_MMXEXT(cpu_flags))
- yadif->filter_line = ff_yadif_filter_line_mmxext;
+ if (EXTERNAL_MMXEXT(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_mmxext;
#endif /* ARCH_X86_32 */
- if (EXTERNAL_SSE2(cpu_flags))
- yadif->filter_line = ff_yadif_filter_line_sse2;
- if (EXTERNAL_SSSE3(cpu_flags))
- yadif->filter_line = ff_yadif_filter_line_ssse3;
+ if (EXTERNAL_SSE2(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_sse2;
+ if (EXTERNAL_SSSE3(cpu_flags))
+ yadif->filter_line = ff_yadif_filter_line_ssse3;
+ }
}
diff --git a/libavfilter/x86/yadif-10.asm b/libavfilter/x86/yadif-10.asm
new file mode 100644
index 0000000000..8853e0d2c7
--- /dev/null
+++ b/libavfilter/x86/yadif-10.asm
@@ -0,0 +1,255 @@
+;*****************************************************************************
+;* x86-optimized functions for yadif filter
+;*
+;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
+;* Copyright (c) 2011-2013 James Darnley <james.darnley@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_1: times 8 dw 1
+
+SECTION .text
+
+%macro PMAXUW 2 ; %1 = max(%1, %2) per unsigned word
+%if cpuflag(sse4)
+ pmaxuw %1, %2
+%else
+ psubusw %1, %2 ; %1 = %1 - %2, saturating at 0
+ paddusw %1, %2 ; adding %2 back leaves the larger of the two inputs
+%endif
+%endmacro
+
+%macro CHECK 2 ; compare the curq+t1 line at pixel offset %1 with the curq+t0 line at offset %2
+ movu m2, [curq+t1+%1*2] ; samples are 2 bytes wide, hence *2
+ movu m3, [curq+t0+%2*2]
+ mova m4, m2
+ mova m5, m2
+ pxor m4, m3
+ pavgw m5, m3 ; average of the two lines, rounded up
+ pand m4, [pw_1] ; low bit of the xor is set exactly where pavgw rounded up
+ psubusw m5, m4 ; m5 = (a + b) >> 1
+ RSHIFT m5, 2 ; shift the register down by one word (2 bytes)
+ mova m4, m2
+ psubusw m2, m3
+ psubusw m3, m4
+ PMAXUW m2, m3 ; m2 = |a - b| per word
+ mova m3, m2
+ mova m4, m2
+ RSHIFT m3, 2 ; differences one word further along
+ RSHIFT m4, 4 ; differences two words further along
+ paddw m2, m3
+ paddw m2, m4 ; out: m2 = sum of three adjacent absolute differences, m5 = averaged pixels; m3/m4 clobbered
+%endmacro
+
+%macro CHECK1 0 ; take the CHECK candidate (score m2, pixels m5) wherever it beats the score in m0
+ mova m3, m0
+ pcmpgtw m3, m2 ; m3 = mask of words where m0 > m2 (signed)
+ pminsw m0, m2 ; m0 = lower of the two scores
+ mova m6, m3 ; keep the mask in m6 for the following CHECK2
+ pand m5, m3
+ pandn m3, m1
+ por m3, m5
+ mova m1, m3 ; m1 = m5 where the mask is set, previous m1 elsewhere
+%endmacro
+
+; %macro CHECK2 0
+; paddw m6, [pw_1]
+; psllw m6, 14
+; paddsw m2, m6
+; mova m3, m0
+; pcmpgtw m3, m2
+; pminsw m0, m2
+; pand m5, m3
+; pandn m3, m1
+; por m3, m5
+; mova m1, m3
+; %endmacro
+
+; This version of CHECK2 is required for 14-bit samples. The left-shift trick
+; in the old code is not large enough to correctly select pixels or scores.
+
+%macro CHECK2 0 ; as CHECK1, but a candidate is accepted only where the mask in m6 is also set
+ mova m3, m0 ; remember the current scores
+ pcmpgtw m0, m2 ; m0 = mask of words where the current score > m2 (signed)
+ pand m0, m6 ; restrict it to the words already selected by m6
+ mova m6, m0
+ pand m5, m6
+ pand m2, m0
+ pandn m6, m1
+ pandn m0, m3
+ por m6, m5 ; m6 = m5 where selected, previous m1 elsewhere
+ por m0, m2 ; m0 = m2 where selected, previous score elsewhere
+ mova m1, m6
+%endmacro
+
+%macro LOAD 2 ; %1 = register, %2 = memory operand
+ movu %1, %2 ; whole-register load via movu, no widening
+%endmacro
+
+%macro FILTER 3 ; %1 = label suffix, %2/%3 = pointers to the two fields that are averaged temporally
+.loop%1:
+ pxor m7, m7
+ LOAD m0, [curq+t1] ; m0 = c (cur line at t1); NOTE(review): t0/t1 presumably hold prefs/mrefs as set up in YADIF - confirm against DECLARE_REG_TMP
+ LOAD m1, [curq+t0] ; m1 = e (cur line at t0)
+ LOAD m2, [%2]
+ LOAD m3, [%3]
+ mova m4, m3
+ paddw m3, m2
+ psraw m3, 1 ; m3 = d = ([%2] + [%3]) >> 1
+ mova [rsp+ 0], m0 ; spill c
+ mova [rsp+16], m3 ; spill d
+ mova [rsp+32], m1 ; spill e
+ psubw m2, m4
+ ABS1 m2, m4 ; m2 = |[%2] - [%3]| (ABS1/ABS2 presumably the x86util.asm absolute-value helpers)
+ LOAD m3, [prevq+t1]
+ LOAD m4, [prevq+t0]
+ psubw m3, m0
+ psubw m4, m1
+ ABS2 m3, m4, m5, m6
+ paddw m3, m4 ; |prev[t1] - c| + |prev[t0] - e|
+ psrlw m2, 1
+ psrlw m3, 1
+ pmaxsw m2, m3
+ LOAD m3, [nextq+t1]
+ LOAD m4, [nextq+t0]
+ psubw m3, m0
+ psubw m4, m1
+ ABS2 m3, m4, m5, m6
+ paddw m3, m4 ; |next[t1] - c| + |next[t0] - e|
+ psrlw m3, 1
+ pmaxsw m2, m3 ; m2 = largest of the three halved differences
+ mova [rsp+48], m2 ; spill it as the clamp range "diff"
+
+ paddw m1, m0
+ paddw m0, m0
+ psubw m0, m1 ; m0 = c - e
+ psrlw m1, 1 ; m1 = (c + e) >> 1, the initial prediction
+ ABS1 m0, m2 ; m0 = |c - e|
+
+ movu m2, [curq+t1-1*2] ; same two lines, one pixel to the left
+ movu m3, [curq+t0-1*2]
+ mova m4, m2
+ psubusw m2, m3
+ psubusw m3, m4
+ PMAXUW m2, m3 ; absolute differences starting at pixel -1
+ mova m3, m2
+ RSHIFT m3, 4 ; the same differences two pixels further along (pixel +1)
+ paddw m0, m2
+ paddw m0, m3
+ psubw m0, [pw_1] ; m0 = initial score = |c-e| + left diff + right diff - 1
+
+ CHECK -2, 0 ; each CHECK/CHECK1|2 pair may replace score m0 and prediction m1
+ CHECK1
+ CHECK -3, 1
+ CHECK2
+ CHECK 0, -2
+ CHECK1
+ CHECK 1, -3
+ CHECK2
+
+ mova m6, [rsp+48] ; m6 = diff
+ cmp DWORD r8m, 2 ; r8m is the ninth argument, named mode in the cglobal line
+ jge .end%1 ; mode >= 2: skip the extra range widening below
+ LOAD m2, [%2+t1*2]
+ LOAD m4, [%3+t1*2]
+ LOAD m3, [%2+t0*2]
+ LOAD m5, [%3+t0*2]
+ paddw m2, m4
+ paddw m3, m5
+ psrlw m2, 1 ; m2 = b = average of the two fields at 2*t1
+ psrlw m3, 1 ; m3 = f = average of the two fields at 2*t0
+ mova m4, [rsp+ 0] ; c
+ mova m5, [rsp+16] ; d
+ mova m7, [rsp+32] ; e
+ psubw m2, m4 ; b - c
+ psubw m3, m7 ; f - e
+ mova m0, m5
+ psubw m5, m4 ; d - c
+ psubw m0, m7 ; d - e
+ mova m4, m2
+ pminsw m2, m3
+ pmaxsw m3, m4
+ pmaxsw m2, m5
+ pminsw m3, m5
+ pmaxsw m2, m0 ; m2 = max(d-e, d-c, min(b-c, f-e))
+ pminsw m3, m0 ; m3 = min(d-e, d-c, max(b-c, f-e))
+ pxor m4, m4
+ pmaxsw m6, m3
+ psubw m4, m2
+ pmaxsw m6, m4 ; diff = max(diff, m3, -m2)
+
+.end%1:
+ mova m2, [rsp+16]
+ mova m3, m2
+ psubw m2, m6 ; d - diff
+ paddw m3, m6 ; d + diff
+ pmaxsw m1, m2
+ pminsw m1, m3 ; clamp the prediction to [d - diff, d + diff]
+
+ movu [dstq], m1 ; a whole register is stored...
+ add dstq, mmsize-4 ; ...but the pointers advance two pixels less; presumably the top two words are invalid after the RSHIFTs - confirm
+ add prevq, mmsize-4
+ add curq, mmsize-4
+ add nextq, mmsize-4
+ sub DWORD r4m, mmsize/2-2 ; r4m is the fifth argument (w); count down by the pixels consumed
+ jg .loop%1
+%endmacro
+
+%macro YADIF 0 ; emit yadif_filter_line_10bit for the instruction set selected by the preceding INIT_*
+%if ARCH_X86_32
+cglobal yadif_filter_line_10bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
+ prefs, mrefs, parity, mode
+%else
+cglobal yadif_filter_line_10bit, 4, 7, 8, 80, dst, prev, cur, next, w, \
+ prefs, mrefs, parity, mode
+%endif
+%if ARCH_X86_32
+ mov r4, r5mp ; r4 = prefs (sixth argument)
+ mov r5, r6mp ; r5 = mrefs (seventh argument)
+ DECLARE_REG_TMP 4,5
+%else
+ movsxd r5, DWORD r5m ; sign-extend the 32-bit prefs for use in addresses
+ movsxd r6, DWORD r6m ; likewise for mrefs
+ DECLARE_REG_TMP 5,6
+%endif
+
+ cmp DWORD paritym, 0
+ je .parity0
+ FILTER 1, prevq, curq ; parity != 0: temporal pair is prev/cur
+ jmp .ret
+
+.parity0:
+ FILTER 0, curq, nextq ; parity == 0: temporal pair is cur/next
+
+.ret:
+ RET
+%endmacro
+
+INIT_XMM ssse3 ; one copy of the function per instruction set
+YADIF
+INIT_XMM sse2
+YADIF
+%if ARCH_X86_32
+INIT_MMX mmxext ; the MMX-register build is only assembled for 32-bit x86
+YADIF
+%endif
diff --git a/libavfilter/x86/yadif-16.asm b/libavfilter/x86/yadif-16.asm
new file mode 100644
index 0000000000..79d127dfaa
--- /dev/null
+++ b/libavfilter/x86/yadif-16.asm
@@ -0,0 +1,317 @@
+;*****************************************************************************
+;* x86-optimized functions for yadif filter
+;*
+;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com>
+;* Copyright (c) 2011-2013 James Darnley <james.darnley@gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_1: times 8 dw 1
+pw_8000: times 8 dw 0x8000
+pd_1: times 4 dd 1
+pd_8000: times 4 dd 0x8000
+
+SECTION .text
+
+%macro PABS 2 ; %1 = |%1| per signed dword; %2 = scratch register (only written without SSSE3)
+%if cpuflag(ssse3)
+ pabsd %1, %1
+%else
+ pxor %2, %2
+ pcmpgtd %2, %1 ; %2 = all ones where %1 < 0
+ pxor %1, %2 ; flip the bits of the negative lanes...
+ psubd %1, %2 ; ...and add 1: two's-complement negation
+%endif
+%endmacro
+
+%macro PACK 1 ; pack the signed dwords of %1 to words with unsigned saturation
+%if cpuflag(sse4)
+ packusdw %1, %1
+%else
+ psubd %1, [pd_8000] ; bias by -0x8000 so the unsigned word range maps onto the signed one
+ packssdw %1, %1 ; signed saturating pack
+ paddw %1, [pw_8000] ; remove the bias again (wraps per word)
+%endif
+%endmacro
+
+%macro PMINSD 3 ; %1 = min(%1, %2) per signed dword; %3 = scratch register (only written without SSE4)
+%if cpuflag(sse4)
+ pminsd %1, %2
+%else
+ mova %3, %2
+ pcmpgtd %3, %1 ; %3 = mask of lanes where %2 > %1
+ pand %1, %3 ; keep %1 in those lanes
+ pandn %3, %2 ; take %2 in the others
+ por %1, %3
+%endif
+%endmacro
+
+%macro PMAXSD 3 ; %1 = max(%1, %2) per signed dword; %3 = scratch register (only written without SSE4)
+%if cpuflag(sse4)
+ pmaxsd %1, %2
+%else
+ mova %3, %1
+ pcmpgtd %3, %2 ; %3 = mask of lanes where %1 > %2
+ pand %1, %3 ; keep %1 in those lanes
+ pandn %3, %2 ; take %2 in the others
+ por %1, %3
+%endif
+%endmacro
+
+%macro PMAXUW 2 ; %1 = max(%1, %2) per unsigned word
+%if cpuflag(sse4)
+ pmaxuw %1, %2
+%else
+ psubusw %1, %2 ; %1 = %1 - %2, saturating at 0
+ paddusw %1, %2 ; adding %2 back leaves the larger of the two inputs
+%endif
+%endmacro
+
+%macro CHECK 2 ; compare the curq+t1 line at pixel offset %1 with the curq+t0 line at offset %2; relies on m7 being zero
+ movu m2, [curq+t1+%1*2] ; samples are 2 bytes wide, hence *2
+ movu m3, [curq+t0+%2*2]
+ mova m4, m2
+ mova m5, m2
+ pxor m4, m3
+ pavgw m5, m3 ; average of the two lines, rounded up
+ pand m4, [pw_1] ; low bit of the xor is set exactly where pavgw rounded up
+ psubusw m5, m4 ; m5 = (a + b) >> 1
+ RSHIFT m5, 2 ; shift the register down by one word (2 bytes)
+ punpcklwd m5, m7 ; widen the low words to dwords
+ mova m4, m2
+ psubusw m2, m3
+ psubusw m3, m4
+ PMAXUW m2, m3 ; m2 = |a - b| per word
+ mova m3, m2
+ mova m4, m2
+ RSHIFT m3, 2 ; differences one word further along
+ RSHIFT m4, 4 ; differences two words further along
+ punpcklwd m2, m7
+ punpcklwd m3, m7
+ punpcklwd m4, m7
+ paddd m2, m3
+ paddd m2, m4 ; out: m2 = dword sum of three adjacent absolute differences, m5 = averaged pixels; m3/m4 clobbered
+%endmacro
+
+%macro CHECK1 0 ; take the CHECK candidate (score m2, pixels m5) wherever it beats the score in m0
+ mova m3, m0
+ pcmpgtd m3, m2 ; m3 = mask of dwords where m0 > m2 (signed)
+ PMINSD m0, m2, m6 ; m0 = lower of the two scores; m6 is only scratch here
+ mova m6, m3 ; keep the mask in m6 for the following CHECK2
+ pand m5, m3
+ pandn m3, m1
+ por m3, m5
+ mova m1, m3 ; m1 = m5 where the mask is set, previous m1 elsewhere
+%endmacro
+
+%macro CHECK2 0 ; as CHECK1, but the candidate score is inflated wherever the mask in m6 is clear
+ paddd m6, [pd_1] ; mask lanes -1 / 0 become 0 / 1
+ pslld m6, 30 ; ...and then 0 / 1<<30
+ paddd m2, m6 ; add that penalty to the candidate score
+ mova m3, m0
+ pcmpgtd m3, m2 ; m3 = mask of dwords where m0 > m2 (signed)
+ PMINSD m0, m2, m4 ; m0 = lower of the two scores; m4 is scratch
+ pand m5, m3
+ pandn m3, m1
+ por m3, m5
+ mova m1, m3 ; m1 = m5 where the mask is set, previous m1 elsewhere
+%endmacro
+
+; This version of CHECK2 has 3 fewer instructions on sets older than SSE4 but I
+; am not sure whether it is any faster. A rewrite or refactor of the filter
+; code should make it possible to eliminate the move instruction at the end. It
+; exists to satisfy the expectation that the "score" values are in m1.
+
+; %macro CHECK2 0
+; mova m3, m0
+; pcmpgtd m0, m2
+; pand m0, m6
+; mova m6, m0
+; pand m5, m6
+; pand m2, m0
+; pandn m6, m1
+; pandn m0, m3
+; por m6, m5
+; por m0, m2
+; mova m1, m6
+; %endmacro
+
+%macro LOAD 2 ; %1 = register, %2 = memory operand; relies on m7 being zero
+ movh %1, %2 ; load half a register of 16-bit samples
+ punpcklwd %1, m7 ; interleave with zero: zero-extend the words to dwords
+%endmacro
+
+%macro FILTER 3 ; %1 = label suffix, %2/%3 = pointers to the two fields that are averaged temporally
+.loop%1:
+ pxor m7, m7 ; m7 = 0 for LOAD and CHECK (re-zeroed every iteration because it is reused as scratch below)
+ LOAD m0, [curq+t1] ; m0 = c (cur line at t1); NOTE(review): t0/t1 presumably hold prefs/mrefs as set up in YADIF - confirm against DECLARE_REG_TMP
+ LOAD m1, [curq+t0] ; m1 = e (cur line at t0)
+ LOAD m2, [%2]
+ LOAD m3, [%3]
+ mova m4, m3
+ paddd m3, m2
+ psrad m3, 1 ; m3 = d = ([%2] + [%3]) >> 1
+ mova [rsp+ 0], m0 ; spill c
+ mova [rsp+16], m3 ; spill d
+ mova [rsp+32], m1 ; spill e
+ psubd m2, m4
+ PABS m2, m4 ; m2 = |[%2] - [%3]|
+ LOAD m3, [prevq+t1]
+ LOAD m4, [prevq+t0]
+ psubd m3, m0
+ psubd m4, m1
+ PABS m3, m5
+ PABS m4, m5
+ paddd m3, m4 ; |prev[t1] - c| + |prev[t0] - e|
+ psrld m2, 1
+ psrld m3, 1
+ PMAXSD m2, m3, m6
+ LOAD m3, [nextq+t1]
+ LOAD m4, [nextq+t0]
+ psubd m3, m0
+ psubd m4, m1
+ PABS m3, m5
+ PABS m4, m5
+ paddd m3, m4 ; |next[t1] - c| + |next[t0] - e|
+ psrld m3, 1
+ PMAXSD m2, m3, m6 ; m2 = largest of the three halved differences
+ mova [rsp+48], m2 ; spill it as the clamp range "diff"
+
+ paddd m1, m0
+ paddd m0, m0
+ psubd m0, m1 ; m0 = c - e
+ psrld m1, 1 ; m1 = (c + e) >> 1, the initial prediction
+ PABS m0, m2 ; m0 = |c - e|
+
+ movu m2, [curq+t1-1*2] ; same two lines, one pixel to the left
+ movu m3, [curq+t0-1*2]
+ mova m4, m2
+ psubusw m2, m3
+ psubusw m3, m4
+ PMAXUW m2, m3 ; absolute word differences starting at pixel -1
+ mova m3, m2
+ RSHIFT m3, 4 ; the same differences two pixels further along (pixel +1)
+ punpcklwd m2, m7 ; widen both to dwords
+ punpcklwd m3, m7
+ paddd m0, m2
+ paddd m0, m3
+ psubd m0, [pd_1] ; m0 = initial score = |c-e| + left diff + right diff - 1
+
+ CHECK -2, 0 ; each CHECK/CHECK1|2 pair may replace score m0 and prediction m1
+ CHECK1
+ CHECK -3, 1
+ CHECK2
+ CHECK 0, -2
+ CHECK1
+ CHECK 1, -3
+ CHECK2
+
+ mova m6, [rsp+48] ; m6 = diff
+ cmp DWORD r8m, 2 ; r8m is the ninth argument, named mode in the cglobal line
+ jge .end%1 ; mode >= 2: skip the extra range widening below
+ LOAD m2, [%2+t1*2]
+ LOAD m4, [%3+t1*2]
+ LOAD m3, [%2+t0*2]
+ LOAD m5, [%3+t0*2]
+ paddd m2, m4
+ paddd m3, m5
+ psrld m2, 1 ; m2 = b = average of the two fields at 2*t1
+ psrld m3, 1 ; m3 = f = average of the two fields at 2*t0
+ mova m4, [rsp+ 0] ; c
+ mova m5, [rsp+16] ; d
+ mova m7, [rsp+32] ; e (m7 no longer zero from here on)
+ psubd m2, m4 ; b - c
+ psubd m3, m7 ; f - e
+ mova m0, m5
+ psubd m5, m4 ; d - c
+ psubd m0, m7 ; d - e
+ mova m4, m2
+ PMINSD m2, m3, m7 ; m7 is free now and serves as the scratch register
+ PMAXSD m3, m4, m7
+ PMAXSD m2, m5, m7
+ PMINSD m3, m5, m7
+ PMAXSD m2, m0, m7 ; m2 = max(d-e, d-c, min(b-c, f-e))
+ PMINSD m3, m0, m7 ; m3 = min(d-e, d-c, max(b-c, f-e))
+ pxor m4, m4
+ PMAXSD m6, m3, m7
+ psubd m4, m2
+ PMAXSD m6, m4, m7 ; diff = max(diff, m3, -m2)
+
+.end%1:
+ mova m2, [rsp+16]
+ mova m3, m2
+ psubd m2, m6 ; d - diff
+ paddd m3, m6 ; d + diff
+ PMAXSD m1, m2, m7
+ PMINSD m1, m3, m7 ; clamp the prediction to [d - diff, d + diff]
+ PACK m1 ; back to 16-bit samples
+
+ movh [dstq], m1 ; store half a register = mmsize/4 pixels
+ add dstq, mmsize/2
+ add prevq, mmsize/2
+ add curq, mmsize/2
+ add nextq, mmsize/2
+ sub DWORD r4m, mmsize/4 ; r4m is the fifth argument (w); count down by the pixels written
+ jg .loop%1
+%endmacro
+
+%macro YADIF 0 ; emit yadif_filter_line_16bit for the instruction set selected by the preceding INIT_*
+%if ARCH_X86_32
+cglobal yadif_filter_line_16bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
+ prefs, mrefs, parity, mode
+%else
+cglobal yadif_filter_line_16bit, 4, 7, 8, 80, dst, prev, cur, next, w, \
+ prefs, mrefs, parity, mode
+%endif
+%if ARCH_X86_32
+ mov r4, r5mp ; r4 = prefs (sixth argument)
+ mov r5, r6mp ; r5 = mrefs (seventh argument)
+ DECLARE_REG_TMP 4,5
+%else
+ movsxd r5, DWORD r5m ; sign-extend the 32-bit prefs for use in addresses
+ movsxd r6, DWORD r6m ; likewise for mrefs
+ DECLARE_REG_TMP 5,6
+%endif
+
+ cmp DWORD paritym, 0
+ je .parity0
+ FILTER 1, prevq, curq ; parity != 0: temporal pair is prev/cur
+ jmp .ret
+
+.parity0:
+ FILTER 0, curq, nextq ; parity == 0: temporal pair is cur/next
+
+.ret:
+ RET
+%endmacro
+
+INIT_XMM sse4 ; one copy of the function per instruction set
+YADIF
+INIT_XMM ssse3
+YADIF
+INIT_XMM sse2
+YADIF
+%if ARCH_X86_32
+INIT_MMX mmxext ; the MMX-register build is only assembled for 32-bit x86
+YADIF
+%endif