From bcc45d6348174241e1ec7ce9f8129bebbde4bde6 Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Wed, 8 Aug 2012 02:12:17 +0200 Subject: x86: avcodec: Drop silly "_mmx" suffixes from filenames --- libavcodec/x86/Makefile | 16 +- libavcodec/x86/cavsdsp.c | 492 ++++++++++++++ libavcodec/x86/cavsdsp_mmx.c | 492 -------------- libavcodec/x86/dsputil_mmx.c | 2 +- libavcodec/x86/fdct.c | 585 +++++++++++++++++ libavcodec/x86/fdct_mmx.c | 585 ----------------- libavcodec/x86/fft.asm | 1105 +++++++++++++++++++++++++++++++ libavcodec/x86/fft_mmx.asm | 1105 ------------------------------- libavcodec/x86/h264_qpel.c | 1291 +++++++++++++++++++++++++++++++++++++ libavcodec/x86/h264_qpel_mmx.c | 1291 ------------------------------------- libavcodec/x86/lpc.c | 154 +++++ libavcodec/x86/lpc_mmx.c | 154 ----- libavcodec/x86/motion_est.c | 469 ++++++++++++++ libavcodec/x86/motion_est_mmx.c | 469 -------------- libavcodec/x86/mpegaudiodec.c | 261 ++++++++ libavcodec/x86/mpegaudiodec_mmx.c | 261 -------- libavcodec/x86/simple_idct.c | 1169 +++++++++++++++++++++++++++++++++ libavcodec/x86/simple_idct_mmx.c | 1169 --------------------------------- libavcodec/x86/snowdsp.c | 902 ++++++++++++++++++++++++++ libavcodec/x86/snowdsp_mmx.c | 902 -------------------------- 20 files changed, 6437 insertions(+), 6437 deletions(-) create mode 100644 libavcodec/x86/cavsdsp.c delete mode 100644 libavcodec/x86/cavsdsp_mmx.c create mode 100644 libavcodec/x86/fdct.c delete mode 100644 libavcodec/x86/fdct_mmx.c create mode 100644 libavcodec/x86/fft.asm delete mode 100644 libavcodec/x86/fft_mmx.asm create mode 100644 libavcodec/x86/h264_qpel.c delete mode 100644 libavcodec/x86/h264_qpel_mmx.c create mode 100644 libavcodec/x86/lpc.c delete mode 100644 libavcodec/x86/lpc_mmx.c create mode 100644 libavcodec/x86/motion_est.c delete mode 100644 libavcodec/x86/motion_est_mmx.c create mode 100644 libavcodec/x86/mpegaudiodec.c delete mode 100644 libavcodec/x86/mpegaudiodec_mmx.c create mode 100644 libavcodec/x86/simple_idct.c delete mode 100644 libavcodec/x86/simple_idct_mmx.c create mode 100644 libavcodec/x86/snowdsp.c delete mode 100644 libavcodec/x86/snowdsp_mmx.c diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 8582e9ceae..43e1a3afb4 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -4,24 +4,24 @@ OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o MMX-OBJS += x86/dsputil_mmx.o \ - x86/fdct_mmx.o \ + x86/fdct.o \ x86/fmtconvert_init.o \ x86/idct_mmx_xvid.o \ x86/idct_sse2_xvid.o \ - x86/motion_est_mmx.o \ - x86/simple_idct_mmx.o \ + x86/motion_est.o \ + x86/simple_idct.o \ MMX-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp_init.o MMX-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp_init.o -MMX-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp_mmx.o +MMX-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o MMX-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o -MMX-OBJS-$(CONFIG_DWT) += x86/snowdsp_mmx.o +MMX-OBJS-$(CONFIG_DWT) += x86/snowdsp.o MMX-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_mmx.o MMX-OBJS-$(CONFIG_FFT) += x86/fft_init.o MMX-OBJS-$(CONFIG_H264DSP) += x86/h264dsp_init.o MMX-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o -MMX-OBJS-$(CONFIG_LPC) += x86/lpc_mmx.o -MMX-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodec_mmx.o +MMX-OBJS-$(CONFIG_LPC) += x86/lpc.o +MMX-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodec.o MMX-OBJS-$(CONFIG_MPEGVIDEO) += x86/mpegvideo.o MMX-OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoenc.o MMX-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o @@ -38,7 
+38,7 @@ YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o YASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o YASM-OBJS-$(CONFIG_DCT) += x86/dct32.o YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc.o -YASM-OBJS-$(CONFIG_FFT) += x86/fft_mmx.o +YASM-OBJS-$(CONFIG_FFT) += x86/fft.o YASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \ x86/h264_chromamc_10bit.o YASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock.o \ diff --git a/libavcodec/x86/cavsdsp.c b/libavcodec/x86/cavsdsp.c new file mode 100644 index 0000000000..e94003956f --- /dev/null +++ b/libavcodec/x86/cavsdsp.c @@ -0,0 +1,492 @@ +/* + * Chinese AVS video (AVS1-P2, JiZhun profile) decoder. + * Copyright (c) 2006 Stefan Gehrer + * + * MMX-optimized DSP functions, based on H.264 optimizations by + * Michael Niedermayer and Loren Merritt + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/common.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/asm.h" +#include "libavcodec/dsputil.h" +#include "libavcodec/cavsdsp.h" +#include "dsputil_mmx.h" + +#if HAVE_INLINE_ASM + +/* in/out: mma=mma+mmb, mmb=mmb-mma */ +#define SUMSUB_BA( a, b ) \ + "paddw "#b", "#a" \n\t"\ + "paddw "#b", "#b" \n\t"\ + "psubw "#a", "#b" \n\t" + +/***************************************************************************** + * + * inverse transform + * + ****************************************************************************/ + +static inline void cavs_idct8_1d(int16_t *block, uint64_t bias) +{ + __asm__ volatile( + "movq 112(%0), %%mm4 \n\t" /* mm4 = src7 */ + "movq 16(%0), %%mm5 \n\t" /* mm5 = src1 */ + "movq 80(%0), %%mm2 \n\t" /* mm2 = src5 */ + "movq 48(%0), %%mm7 \n\t" /* mm7 = src3 */ + "movq %%mm4, %%mm0 \n\t" + "movq %%mm5, %%mm3 \n\t" + "movq %%mm2, %%mm6 \n\t" + "movq %%mm7, %%mm1 \n\t" + + "paddw %%mm4, %%mm4 \n\t" /* mm4 = 2*src7 */ + "paddw %%mm3, %%mm3 \n\t" /* mm3 = 2*src1 */ + "paddw %%mm6, %%mm6 \n\t" /* mm6 = 2*src5 */ + "paddw %%mm1, %%mm1 \n\t" /* mm1 = 2*src3 */ + "paddw %%mm4, %%mm0 \n\t" /* mm0 = 3*src7 */ + "paddw %%mm3, %%mm5 \n\t" /* mm5 = 3*src1 */ + "paddw %%mm6, %%mm2 \n\t" /* mm2 = 3*src5 */ + "paddw %%mm1, %%mm7 \n\t" /* mm7 = 3*src3 */ + "psubw %%mm4, %%mm5 \n\t" /* mm5 = 3*src1 - 2*src7 = a0 */ + "paddw %%mm6, %%mm7 \n\t" /* mm7 = 3*src3 + 2*src5 = a1 */ + "psubw %%mm2, %%mm1 \n\t" /* mm1 = 2*src3 - 3*src5 = a2 */ + "paddw %%mm0, %%mm3 \n\t" /* mm3 = 2*src1 + 3*src7 = a3 */ + + "movq %%mm5, %%mm4 \n\t" + "movq %%mm7, %%mm6 \n\t" + "movq %%mm3, %%mm0 \n\t" + "movq %%mm1, %%mm2 \n\t" + SUMSUB_BA( %%mm7, %%mm5 ) /* mm7 = a0 + a1 mm5 = a0 - a1 */ + "paddw %%mm3, %%mm7 \n\t" /* mm7 = a0 + a1 + a3 */ + "paddw %%mm1, %%mm5 \n\t" /* mm5 = a0 - a1 + a2 */ + "paddw %%mm7, %%mm7 \n\t" + "paddw %%mm5, %%mm5 \n\t" + "paddw %%mm6, %%mm7 \n\t" /* mm7 = b4 */ + "paddw %%mm4, %%mm5 \n\t" /* mm5 = b5 */ + + SUMSUB_BA( 
%%mm1, %%mm3 ) /* mm1 = a3 + a2 mm3 = a3 - a2 */ + "psubw %%mm1, %%mm4 \n\t" /* mm4 = a0 - a2 - a3 */ + "movq %%mm4, %%mm1 \n\t" /* mm1 = a0 - a2 - a3 */ + "psubw %%mm6, %%mm3 \n\t" /* mm3 = a3 - a2 - a1 */ + "paddw %%mm1, %%mm1 \n\t" + "paddw %%mm3, %%mm3 \n\t" + "psubw %%mm2, %%mm1 \n\t" /* mm1 = b7 */ + "paddw %%mm0, %%mm3 \n\t" /* mm3 = b6 */ + + "movq 32(%0), %%mm2 \n\t" /* mm2 = src2 */ + "movq 96(%0), %%mm6 \n\t" /* mm6 = src6 */ + "movq %%mm2, %%mm4 \n\t" + "movq %%mm6, %%mm0 \n\t" + "psllw $2, %%mm4 \n\t" /* mm4 = 4*src2 */ + "psllw $2, %%mm6 \n\t" /* mm6 = 4*src6 */ + "paddw %%mm4, %%mm2 \n\t" /* mm2 = 5*src2 */ + "paddw %%mm6, %%mm0 \n\t" /* mm0 = 5*src6 */ + "paddw %%mm2, %%mm2 \n\t" + "paddw %%mm0, %%mm0 \n\t" + "psubw %%mm0, %%mm4 \n\t" /* mm4 = 4*src2 - 10*src6 = a7 */ + "paddw %%mm2, %%mm6 \n\t" /* mm6 = 4*src6 + 10*src2 = a6 */ + + "movq (%0), %%mm2 \n\t" /* mm2 = src0 */ + "movq 64(%0), %%mm0 \n\t" /* mm0 = src4 */ + SUMSUB_BA( %%mm0, %%mm2 ) /* mm0 = src0+src4 mm2 = src0-src4 */ + "psllw $3, %%mm0 \n\t" + "psllw $3, %%mm2 \n\t" + "paddw %1, %%mm0 \n\t" /* add rounding bias */ + "paddw %1, %%mm2 \n\t" /* add rounding bias */ + + SUMSUB_BA( %%mm6, %%mm0 ) /* mm6 = a4 + a6 mm0 = a4 - a6 */ + SUMSUB_BA( %%mm4, %%mm2 ) /* mm4 = a5 + a7 mm2 = a5 - a7 */ + SUMSUB_BA( %%mm7, %%mm6 ) /* mm7 = dst0 mm6 = dst7 */ + SUMSUB_BA( %%mm5, %%mm4 ) /* mm5 = dst1 mm4 = dst6 */ + SUMSUB_BA( %%mm3, %%mm2 ) /* mm3 = dst2 mm2 = dst5 */ + SUMSUB_BA( %%mm1, %%mm0 ) /* mm1 = dst3 mm0 = dst4 */ + :: "r"(block), "m"(bias) + ); +} + +static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) +{ + int i; + DECLARE_ALIGNED(8, int16_t, b2)[64]; + + for(i=0; i<2; i++){ + DECLARE_ALIGNED(8, uint64_t, tmp); + + cavs_idct8_1d(block+4*i, ff_pw_4.a); + + __asm__ volatile( + "psraw $3, %%mm7 \n\t" + "psraw $3, %%mm6 \n\t" + "psraw $3, %%mm5 \n\t" + "psraw $3, %%mm4 \n\t" + "psraw $3, %%mm3 \n\t" + "psraw $3, %%mm2 \n\t" + "psraw $3, %%mm1 \n\t" + "psraw $3, %%mm0 \n\t" + "movq %%mm7, %0 \n\t" + TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 ) + "movq %%mm0, 8(%1) \n\t" + "movq %%mm6, 24(%1) \n\t" + "movq %%mm7, 40(%1) \n\t" + "movq %%mm4, 56(%1) \n\t" + "movq %0, %%mm7 \n\t" + TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 ) + "movq %%mm7, (%1) \n\t" + "movq %%mm1, 16(%1) \n\t" + "movq %%mm0, 32(%1) \n\t" + "movq %%mm3, 48(%1) \n\t" + : "=m"(tmp) + : "r"(b2+32*i) + : "memory" + ); + } + + for(i=0; i<2; i++){ + cavs_idct8_1d(b2+4*i, ff_pw_64.a); + + __asm__ volatile( + "psraw $7, %%mm7 \n\t" + "psraw $7, %%mm6 \n\t" + "psraw $7, %%mm5 \n\t" + "psraw $7, %%mm4 \n\t" + "psraw $7, %%mm3 \n\t" + "psraw $7, %%mm2 \n\t" + "psraw $7, %%mm1 \n\t" + "psraw $7, %%mm0 \n\t" + "movq %%mm7, (%0) \n\t" + "movq %%mm5, 16(%0) \n\t" + "movq %%mm3, 32(%0) \n\t" + "movq %%mm1, 48(%0) \n\t" + "movq %%mm0, 64(%0) \n\t" + "movq %%mm2, 80(%0) \n\t" + "movq %%mm4, 96(%0) \n\t" + "movq %%mm6, 112(%0) \n\t" + :: "r"(b2+4*i) + : "memory" + ); + } + + ff_add_pixels_clamped_mmx(b2, dst, stride); +} + +/***************************************************************************** + * + * motion compensation + * + ****************************************************************************/ + +/* vertical filter [-1 -2 96 42 -7 0] */ +#define QPEL_CAVSV1(A,B,C,D,E,F,OP,MUL2) \ + "movd (%0), "#F" \n\t"\ + "movq "#C", %%mm6 \n\t"\ + "pmullw %5, %%mm6 \n\t"\ + "movq "#D", %%mm7 \n\t"\ + "pmullw "MANGLE(MUL2)", %%mm7\n\t"\ + "psllw $3, "#E" \n\t"\ + "psubw "#E", %%mm6 \n\t"\ + "psraw $3, "#E" \n\t"\ + "paddw %%mm7, %%mm6 \n\t"\ + 
"paddw "#E", %%mm6 \n\t"\ + "paddw "#B", "#B" \n\t"\ + "pxor %%mm7, %%mm7 \n\t"\ + "add %2, %0 \n\t"\ + "punpcklbw %%mm7, "#F" \n\t"\ + "psubw "#B", %%mm6 \n\t"\ + "psraw $1, "#B" \n\t"\ + "psubw "#A", %%mm6 \n\t"\ + "paddw %4, %%mm6 \n\t"\ + "psraw $7, %%mm6 \n\t"\ + "packuswb %%mm6, %%mm6 \n\t"\ + OP(%%mm6, (%1), A, d) \ + "add %3, %1 \n\t" + +/* vertical filter [ 0 -1 5 5 -1 0] */ +#define QPEL_CAVSV2(A,B,C,D,E,F,OP,MUL2) \ + "movd (%0), "#F" \n\t"\ + "movq "#C", %%mm6 \n\t"\ + "paddw "#D", %%mm6 \n\t"\ + "pmullw %5, %%mm6 \n\t"\ + "add %2, %0 \n\t"\ + "punpcklbw %%mm7, "#F" \n\t"\ + "psubw "#B", %%mm6 \n\t"\ + "psubw "#E", %%mm6 \n\t"\ + "paddw %4, %%mm6 \n\t"\ + "psraw $3, %%mm6 \n\t"\ + "packuswb %%mm6, %%mm6 \n\t"\ + OP(%%mm6, (%1), A, d) \ + "add %3, %1 \n\t" + +/* vertical filter [ 0 -7 42 96 -2 -1] */ +#define QPEL_CAVSV3(A,B,C,D,E,F,OP,MUL2) \ + "movd (%0), "#F" \n\t"\ + "movq "#C", %%mm6 \n\t"\ + "pmullw "MANGLE(MUL2)", %%mm6\n\t"\ + "movq "#D", %%mm7 \n\t"\ + "pmullw %5, %%mm7 \n\t"\ + "psllw $3, "#B" \n\t"\ + "psubw "#B", %%mm6 \n\t"\ + "psraw $3, "#B" \n\t"\ + "paddw %%mm7, %%mm6 \n\t"\ + "paddw "#B", %%mm6 \n\t"\ + "paddw "#E", "#E" \n\t"\ + "pxor %%mm7, %%mm7 \n\t"\ + "add %2, %0 \n\t"\ + "punpcklbw %%mm7, "#F" \n\t"\ + "psubw "#E", %%mm6 \n\t"\ + "psraw $1, "#E" \n\t"\ + "psubw "#F", %%mm6 \n\t"\ + "paddw %4, %%mm6 \n\t"\ + "psraw $7, %%mm6 \n\t"\ + "packuswb %%mm6, %%mm6 \n\t"\ + OP(%%mm6, (%1), A, d) \ + "add %3, %1 \n\t" + + +#define QPEL_CAVSVNUM(VOP,OP,ADD,MUL1,MUL2)\ + int w= 2;\ + src -= 2*srcStride;\ + \ + while(w--){\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "movd (%0), %%mm0 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm1 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm2 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm3 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm4 \n\t"\ + "add %2, %0 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpcklbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm4 \n\t"\ + VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\ + VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\ + VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\ + VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\ + VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\ + VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\ + VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\ + VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\ + \ + : "+a"(src), "+c"(dst)\ + : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\ + : "memory"\ + );\ + if(h==16){\ + __asm__ volatile(\ + VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\ + VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\ + VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\ + VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\ + VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\ + VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\ + VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\ + VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\ + \ + : "+a"(src), "+c"(dst)\ + : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\ + : "memory"\ + );\ + }\ + src += 4-(h+5)*srcStride;\ + dst += 4-h*dstStride;\ + } + +#define QPEL_CAVS(OPNAME, OP, MMX)\ +static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + int h=8;\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "movq %5, %%mm6 \n\t"\ + "1: 
\n\t"\ + "movq (%0), %%mm0 \n\t"\ + "movq 1(%0), %%mm2 \n\t"\ + "movq %%mm0, %%mm1 \n\t"\ + "movq %%mm2, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpckhbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpckhbw %%mm7, %%mm3 \n\t"\ + "paddw %%mm2, %%mm0 \n\t"\ + "paddw %%mm3, %%mm1 \n\t"\ + "pmullw %%mm6, %%mm0 \n\t"\ + "pmullw %%mm6, %%mm1 \n\t"\ + "movq -1(%0), %%mm2 \n\t"\ + "movq 2(%0), %%mm4 \n\t"\ + "movq %%mm2, %%mm3 \n\t"\ + "movq %%mm4, %%mm5 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpckhbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm4 \n\t"\ + "punpckhbw %%mm7, %%mm5 \n\t"\ + "paddw %%mm4, %%mm2 \n\t"\ + "paddw %%mm3, %%mm5 \n\t"\ + "psubw %%mm2, %%mm0 \n\t"\ + "psubw %%mm5, %%mm1 \n\t"\ + "movq %6, %%mm5 \n\t"\ + "paddw %%mm5, %%mm0 \n\t"\ + "paddw %%mm5, %%mm1 \n\t"\ + "psraw $3, %%mm0 \n\t"\ + "psraw $3, %%mm1 \n\t"\ + "packuswb %%mm1, %%mm0 \n\t"\ + OP(%%mm0, (%1),%%mm5, q) \ + "add %3, %0 \n\t"\ + "add %4, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a"(src), "+c"(dst), "+m"(h)\ + : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_4)\ + : "memory"\ + );\ +}\ +\ +static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ + QPEL_CAVSVNUM(QPEL_CAVSV1,OP,ff_pw_64,ff_pw_96,ff_pw_42) \ +}\ +\ +static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ + QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_5) \ +}\ +\ +static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ + QPEL_CAVSVNUM(QPEL_CAVSV3,OP,ff_pw_64,ff_pw_96,ff_pw_42) \ +}\ +\ +static void OPNAME ## cavs_qpel8_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 8);\ +}\ +static void OPNAME ## cavs_qpel16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 16);\ + OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ +}\ +\ +static void OPNAME ## cavs_qpel8_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 8);\ +}\ +static void OPNAME ## cavs_qpel16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 16);\ + OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ +}\ +\ +static void OPNAME ## cavs_qpel8_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 8);\ +}\ +static void OPNAME ## cavs_qpel16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 16);\ + OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ +}\ +\ +static void OPNAME ## cavs_qpel16_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\ + OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\ + src += 8*srcStride;\ + dst += 8*dstStride;\ + OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\ + OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\ +}\ + +#define CAVS_MC(OPNAME, 
SIZE, MMX) \ +static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## cavs_qpel ## SIZE ## _h_ ## MMX(dst, src, stride, stride);\ +}\ +\ +static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## cavs_qpel ## SIZE ## _v1_ ## MMX(dst, src, stride, stride);\ +}\ +\ +static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## cavs_qpel ## SIZE ## _v2_ ## MMX(dst, src, stride, stride);\ +}\ +\ +static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## cavs_qpel ## SIZE ## _v3_ ## MMX(dst, src, stride, stride);\ +}\ + +#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t" +#define AVG_3DNOW_OP(a,b,temp, size) \ +"mov" #size " " #b ", " #temp " \n\t"\ +"pavgusb " #temp ", " #a " \n\t"\ +"mov" #size " " #a ", " #b " \n\t" +#define AVG_MMX2_OP(a,b,temp, size) \ +"mov" #size " " #b ", " #temp " \n\t"\ +"pavgb " #temp ", " #a " \n\t"\ +"mov" #size " " #a ", " #b " \n\t" + +QPEL_CAVS(put_, PUT_OP, 3dnow) +QPEL_CAVS(avg_, AVG_3DNOW_OP, 3dnow) +QPEL_CAVS(put_, PUT_OP, mmx2) +QPEL_CAVS(avg_, AVG_MMX2_OP, mmx2) + +CAVS_MC(put_, 8, 3dnow) +CAVS_MC(put_, 16,3dnow) +CAVS_MC(avg_, 8, 3dnow) +CAVS_MC(avg_, 16,3dnow) +CAVS_MC(put_, 8, mmx2) +CAVS_MC(put_, 16,mmx2) +CAVS_MC(avg_, 8, mmx2) +CAVS_MC(avg_, 16,mmx2) + +static void ff_cavsdsp_init_mmx2(CAVSDSPContext* c, AVCodecContext *avctx) { +#define dspfunc(PFX, IDX, NUM) \ + c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmx2; \ + c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_mmx2; \ + c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_mmx2; \ + c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_mmx2; \ + c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_mmx2; \ + + dspfunc(put_cavs_qpel, 0, 16); + dspfunc(put_cavs_qpel, 1, 8); + dspfunc(avg_cavs_qpel, 0, 16); + dspfunc(avg_cavs_qpel, 1, 8); +#undef dspfunc + c->cavs_idct8_add = cavs_idct8_add_mmx; +} + +static void ff_cavsdsp_init_3dnow(CAVSDSPContext* c, AVCodecContext *avctx) { +#define dspfunc(PFX, IDX, NUM) \ + c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmx2; \ + c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_3dnow; \ + c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_3dnow; \ + c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_3dnow; \ + c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_3dnow; \ + + dspfunc(put_cavs_qpel, 0, 16); + dspfunc(put_cavs_qpel, 1, 8); + dspfunc(avg_cavs_qpel, 0, 16); + dspfunc(avg_cavs_qpel, 1, 8); +#undef dspfunc + c->cavs_idct8_add = cavs_idct8_add_mmx; +} + +#endif /* HAVE_INLINE_ASM */ + +void ff_cavsdsp_init_mmx(CAVSDSPContext *c, AVCodecContext *avctx) +{ + int mm_flags = av_get_cpu_flags(); + +#if HAVE_INLINE_ASM + if (mm_flags & AV_CPU_FLAG_MMXEXT) ff_cavsdsp_init_mmx2(c, avctx); + if (mm_flags & AV_CPU_FLAG_3DNOW) ff_cavsdsp_init_3dnow(c, avctx); +#endif /* HAVE_INLINE_ASM */ +} diff --git a/libavcodec/x86/cavsdsp_mmx.c b/libavcodec/x86/cavsdsp_mmx.c deleted file mode 100644 index e94003956f..0000000000 --- a/libavcodec/x86/cavsdsp_mmx.c +++ /dev/null @@ -1,492 +0,0 @@ -/* - * Chinese AVS video (AVS1-P2, JiZhun profile) decoder. 
- * Copyright (c) 2006 Stefan Gehrer - * - * MMX-optimized DSP functions, based on H.264 optimizations by - * Michael Niedermayer and Loren Merritt - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/common.h" -#include "libavutil/cpu.h" -#include "libavutil/x86/asm.h" -#include "libavcodec/dsputil.h" -#include "libavcodec/cavsdsp.h" -#include "dsputil_mmx.h" - -#if HAVE_INLINE_ASM - -/* in/out: mma=mma+mmb, mmb=mmb-mma */ -#define SUMSUB_BA( a, b ) \ - "paddw "#b", "#a" \n\t"\ - "paddw "#b", "#b" \n\t"\ - "psubw "#a", "#b" \n\t" - -/***************************************************************************** - * - * inverse transform - * - ****************************************************************************/ - -static inline void cavs_idct8_1d(int16_t *block, uint64_t bias) -{ - __asm__ volatile( - "movq 112(%0), %%mm4 \n\t" /* mm4 = src7 */ - "movq 16(%0), %%mm5 \n\t" /* mm5 = src1 */ - "movq 80(%0), %%mm2 \n\t" /* mm2 = src5 */ - "movq 48(%0), %%mm7 \n\t" /* mm7 = src3 */ - "movq %%mm4, %%mm0 \n\t" - "movq %%mm5, %%mm3 \n\t" - "movq %%mm2, %%mm6 \n\t" - "movq %%mm7, %%mm1 \n\t" - - "paddw %%mm4, %%mm4 \n\t" /* mm4 = 2*src7 */ - "paddw %%mm3, %%mm3 \n\t" /* mm3 = 2*src1 */ - "paddw %%mm6, %%mm6 \n\t" /* mm6 = 2*src5 */ - "paddw %%mm1, %%mm1 \n\t" /* mm1 = 2*src3 */ - "paddw %%mm4, %%mm0 \n\t" /* mm0 = 3*src7 */ - "paddw %%mm3, %%mm5 \n\t" /* mm5 = 3*src1 */ - "paddw %%mm6, %%mm2 \n\t" /* mm2 = 3*src5 */ - "paddw %%mm1, %%mm7 \n\t" /* mm7 = 3*src3 */ - "psubw %%mm4, %%mm5 \n\t" /* mm5 = 3*src1 - 2*src7 = a0 */ - "paddw %%mm6, %%mm7 \n\t" /* mm7 = 3*src3 + 2*src5 = a1 */ - "psubw %%mm2, %%mm1 \n\t" /* mm1 = 2*src3 - 3*src5 = a2 */ - "paddw %%mm0, %%mm3 \n\t" /* mm3 = 2*src1 + 3*src7 = a3 */ - - "movq %%mm5, %%mm4 \n\t" - "movq %%mm7, %%mm6 \n\t" - "movq %%mm3, %%mm0 \n\t" - "movq %%mm1, %%mm2 \n\t" - SUMSUB_BA( %%mm7, %%mm5 ) /* mm7 = a0 + a1 mm5 = a0 - a1 */ - "paddw %%mm3, %%mm7 \n\t" /* mm7 = a0 + a1 + a3 */ - "paddw %%mm1, %%mm5 \n\t" /* mm5 = a0 - a1 + a2 */ - "paddw %%mm7, %%mm7 \n\t" - "paddw %%mm5, %%mm5 \n\t" - "paddw %%mm6, %%mm7 \n\t" /* mm7 = b4 */ - "paddw %%mm4, %%mm5 \n\t" /* mm5 = b5 */ - - SUMSUB_BA( %%mm1, %%mm3 ) /* mm1 = a3 + a2 mm3 = a3 - a2 */ - "psubw %%mm1, %%mm4 \n\t" /* mm4 = a0 - a2 - a3 */ - "movq %%mm4, %%mm1 \n\t" /* mm1 = a0 - a2 - a3 */ - "psubw %%mm6, %%mm3 \n\t" /* mm3 = a3 - a2 - a1 */ - "paddw %%mm1, %%mm1 \n\t" - "paddw %%mm3, %%mm3 \n\t" - "psubw %%mm2, %%mm1 \n\t" /* mm1 = b7 */ - "paddw %%mm0, %%mm3 \n\t" /* mm3 = b6 */ - - "movq 32(%0), %%mm2 \n\t" /* mm2 = src2 */ - "movq 96(%0), %%mm6 \n\t" /* mm6 = src6 */ - "movq %%mm2, %%mm4 \n\t" - "movq %%mm6, %%mm0 \n\t" - "psllw $2, %%mm4 \n\t" /* mm4 = 4*src2 */ - "psllw $2, %%mm6 \n\t" /* mm6 = 4*src6 */ - "paddw %%mm4, %%mm2 \n\t" /* mm2 = 5*src2 */ - "paddw 
%%mm6, %%mm0 \n\t" /* mm0 = 5*src6 */ - "paddw %%mm2, %%mm2 \n\t" - "paddw %%mm0, %%mm0 \n\t" - "psubw %%mm0, %%mm4 \n\t" /* mm4 = 4*src2 - 10*src6 = a7 */ - "paddw %%mm2, %%mm6 \n\t" /* mm6 = 4*src6 + 10*src2 = a6 */ - - "movq (%0), %%mm2 \n\t" /* mm2 = src0 */ - "movq 64(%0), %%mm0 \n\t" /* mm0 = src4 */ - SUMSUB_BA( %%mm0, %%mm2 ) /* mm0 = src0+src4 mm2 = src0-src4 */ - "psllw $3, %%mm0 \n\t" - "psllw $3, %%mm2 \n\t" - "paddw %1, %%mm0 \n\t" /* add rounding bias */ - "paddw %1, %%mm2 \n\t" /* add rounding bias */ - - SUMSUB_BA( %%mm6, %%mm0 ) /* mm6 = a4 + a6 mm0 = a4 - a6 */ - SUMSUB_BA( %%mm4, %%mm2 ) /* mm4 = a5 + a7 mm2 = a5 - a7 */ - SUMSUB_BA( %%mm7, %%mm6 ) /* mm7 = dst0 mm6 = dst7 */ - SUMSUB_BA( %%mm5, %%mm4 ) /* mm5 = dst1 mm4 = dst6 */ - SUMSUB_BA( %%mm3, %%mm2 ) /* mm3 = dst2 mm2 = dst5 */ - SUMSUB_BA( %%mm1, %%mm0 ) /* mm1 = dst3 mm0 = dst4 */ - :: "r"(block), "m"(bias) - ); -} - -static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) -{ - int i; - DECLARE_ALIGNED(8, int16_t, b2)[64]; - - for(i=0; i<2; i++){ - DECLARE_ALIGNED(8, uint64_t, tmp); - - cavs_idct8_1d(block+4*i, ff_pw_4.a); - - __asm__ volatile( - "psraw $3, %%mm7 \n\t" - "psraw $3, %%mm6 \n\t" - "psraw $3, %%mm5 \n\t" - "psraw $3, %%mm4 \n\t" - "psraw $3, %%mm3 \n\t" - "psraw $3, %%mm2 \n\t" - "psraw $3, %%mm1 \n\t" - "psraw $3, %%mm0 \n\t" - "movq %%mm7, %0 \n\t" - TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 ) - "movq %%mm0, 8(%1) \n\t" - "movq %%mm6, 24(%1) \n\t" - "movq %%mm7, 40(%1) \n\t" - "movq %%mm4, 56(%1) \n\t" - "movq %0, %%mm7 \n\t" - TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 ) - "movq %%mm7, (%1) \n\t" - "movq %%mm1, 16(%1) \n\t" - "movq %%mm0, 32(%1) \n\t" - "movq %%mm3, 48(%1) \n\t" - : "=m"(tmp) - : "r"(b2+32*i) - : "memory" - ); - } - - for(i=0; i<2; i++){ - cavs_idct8_1d(b2+4*i, ff_pw_64.a); - - __asm__ volatile( - "psraw $7, %%mm7 \n\t" - "psraw $7, %%mm6 \n\t" - "psraw $7, %%mm5 \n\t" - "psraw $7, %%mm4 \n\t" - "psraw $7, %%mm3 \n\t" - "psraw $7, %%mm2 \n\t" - "psraw $7, %%mm1 \n\t" - "psraw $7, %%mm0 \n\t" - "movq %%mm7, (%0) \n\t" - "movq %%mm5, 16(%0) \n\t" - "movq %%mm3, 32(%0) \n\t" - "movq %%mm1, 48(%0) \n\t" - "movq %%mm0, 64(%0) \n\t" - "movq %%mm2, 80(%0) \n\t" - "movq %%mm4, 96(%0) \n\t" - "movq %%mm6, 112(%0) \n\t" - :: "r"(b2+4*i) - : "memory" - ); - } - - ff_add_pixels_clamped_mmx(b2, dst, stride); -} - -/***************************************************************************** - * - * motion compensation - * - ****************************************************************************/ - -/* vertical filter [-1 -2 96 42 -7 0] */ -#define QPEL_CAVSV1(A,B,C,D,E,F,OP,MUL2) \ - "movd (%0), "#F" \n\t"\ - "movq "#C", %%mm6 \n\t"\ - "pmullw %5, %%mm6 \n\t"\ - "movq "#D", %%mm7 \n\t"\ - "pmullw "MANGLE(MUL2)", %%mm7\n\t"\ - "psllw $3, "#E" \n\t"\ - "psubw "#E", %%mm6 \n\t"\ - "psraw $3, "#E" \n\t"\ - "paddw %%mm7, %%mm6 \n\t"\ - "paddw "#E", %%mm6 \n\t"\ - "paddw "#B", "#B" \n\t"\ - "pxor %%mm7, %%mm7 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, "#F" \n\t"\ - "psubw "#B", %%mm6 \n\t"\ - "psraw $1, "#B" \n\t"\ - "psubw "#A", %%mm6 \n\t"\ - "paddw %4, %%mm6 \n\t"\ - "psraw $7, %%mm6 \n\t"\ - "packuswb %%mm6, %%mm6 \n\t"\ - OP(%%mm6, (%1), A, d) \ - "add %3, %1 \n\t" - -/* vertical filter [ 0 -1 5 5 -1 0] */ -#define QPEL_CAVSV2(A,B,C,D,E,F,OP,MUL2) \ - "movd (%0), "#F" \n\t"\ - "movq "#C", %%mm6 \n\t"\ - "paddw "#D", %%mm6 \n\t"\ - "pmullw %5, %%mm6 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, "#F" \n\t"\ - "psubw "#B", %%mm6 \n\t"\ - "psubw "#E", %%mm6 
\n\t"\ - "paddw %4, %%mm6 \n\t"\ - "psraw $3, %%mm6 \n\t"\ - "packuswb %%mm6, %%mm6 \n\t"\ - OP(%%mm6, (%1), A, d) \ - "add %3, %1 \n\t" - -/* vertical filter [ 0 -7 42 96 -2 -1] */ -#define QPEL_CAVSV3(A,B,C,D,E,F,OP,MUL2) \ - "movd (%0), "#F" \n\t"\ - "movq "#C", %%mm6 \n\t"\ - "pmullw "MANGLE(MUL2)", %%mm6\n\t"\ - "movq "#D", %%mm7 \n\t"\ - "pmullw %5, %%mm7 \n\t"\ - "psllw $3, "#B" \n\t"\ - "psubw "#B", %%mm6 \n\t"\ - "psraw $3, "#B" \n\t"\ - "paddw %%mm7, %%mm6 \n\t"\ - "paddw "#B", %%mm6 \n\t"\ - "paddw "#E", "#E" \n\t"\ - "pxor %%mm7, %%mm7 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, "#F" \n\t"\ - "psubw "#E", %%mm6 \n\t"\ - "psraw $1, "#E" \n\t"\ - "psubw "#F", %%mm6 \n\t"\ - "paddw %4, %%mm6 \n\t"\ - "psraw $7, %%mm6 \n\t"\ - "packuswb %%mm6, %%mm6 \n\t"\ - OP(%%mm6, (%1), A, d) \ - "add %3, %1 \n\t" - - -#define QPEL_CAVSVNUM(VOP,OP,ADD,MUL1,MUL2)\ - int w= 2;\ - src -= 2*srcStride;\ - \ - while(w--){\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movd (%0), %%mm0 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm1 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm2 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm3 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm4 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\ - VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\ - VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\ - VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\ - VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\ - VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\ - VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\ - VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\ - \ - : "+a"(src), "+c"(dst)\ - : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\ - : "memory"\ - );\ - if(h==16){\ - __asm__ volatile(\ - VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\ - VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\ - VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\ - VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\ - VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\ - VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\ - VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\ - VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\ - \ - : "+a"(src), "+c"(dst)\ - : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\ - : "memory"\ - );\ - }\ - src += 4-(h+5)*srcStride;\ - dst += 4-h*dstStride;\ - } - -#define QPEL_CAVS(OPNAME, OP, MMX)\ -static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - int h=8;\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movq %5, %%mm6 \n\t"\ - "1: \n\t"\ - "movq (%0), %%mm0 \n\t"\ - "movq 1(%0), %%mm2 \n\t"\ - "movq %%mm0, %%mm1 \n\t"\ - "movq %%mm2, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpckhbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "paddw %%mm3, %%mm1 \n\t"\ - "pmullw %%mm6, %%mm0 \n\t"\ - "pmullw %%mm6, %%mm1 \n\t"\ - "movq -1(%0), %%mm2 \n\t"\ - "movq 2(%0), %%mm4 \n\t"\ - "movq %%mm2, %%mm3 \n\t"\ - "movq %%mm4, %%mm5 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - "punpckhbw %%mm7, %%mm5 \n\t"\ - "paddw %%mm4, %%mm2 \n\t"\ - 
"paddw %%mm3, %%mm5 \n\t"\ - "psubw %%mm2, %%mm0 \n\t"\ - "psubw %%mm5, %%mm1 \n\t"\ - "movq %6, %%mm5 \n\t"\ - "paddw %%mm5, %%mm0 \n\t"\ - "paddw %%mm5, %%mm1 \n\t"\ - "psraw $3, %%mm0 \n\t"\ - "psraw $3, %%mm1 \n\t"\ - "packuswb %%mm1, %%mm0 \n\t"\ - OP(%%mm0, (%1),%%mm5, q) \ - "add %3, %0 \n\t"\ - "add %4, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(src), "+c"(dst), "+m"(h)\ - : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_4)\ - : "memory"\ - );\ -}\ -\ -static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ - QPEL_CAVSVNUM(QPEL_CAVSV1,OP,ff_pw_64,ff_pw_96,ff_pw_42) \ -}\ -\ -static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ - QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_5) \ -}\ -\ -static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ - QPEL_CAVSVNUM(QPEL_CAVSV3,OP,ff_pw_64,ff_pw_96,ff_pw_42) \ -}\ -\ -static void OPNAME ## cavs_qpel8_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 8);\ -}\ -static void OPNAME ## cavs_qpel16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 16);\ - OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ -}\ -\ -static void OPNAME ## cavs_qpel8_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 8);\ -}\ -static void OPNAME ## cavs_qpel16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 16);\ - OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ -}\ -\ -static void OPNAME ## cavs_qpel8_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 8);\ -}\ -static void OPNAME ## cavs_qpel16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 16);\ - OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ -}\ -\ -static void OPNAME ## cavs_qpel16_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\ - OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\ - src += 8*srcStride;\ - dst += 8*dstStride;\ - OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\ - OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\ -}\ - -#define CAVS_MC(OPNAME, SIZE, MMX) \ -static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## cavs_qpel ## SIZE ## _h_ ## MMX(dst, src, stride, stride);\ -}\ -\ -static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## cavs_qpel ## SIZE ## _v1_ ## MMX(dst, src, stride, stride);\ -}\ -\ -static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## cavs_qpel ## SIZE ## _v2_ ## MMX(dst, src, stride, stride);\ -}\ -\ -static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t 
*dst, uint8_t *src, int stride){\ - OPNAME ## cavs_qpel ## SIZE ## _v3_ ## MMX(dst, src, stride, stride);\ -}\ - -#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t" -#define AVG_3DNOW_OP(a,b,temp, size) \ -"mov" #size " " #b ", " #temp " \n\t"\ -"pavgusb " #temp ", " #a " \n\t"\ -"mov" #size " " #a ", " #b " \n\t" -#define AVG_MMX2_OP(a,b,temp, size) \ -"mov" #size " " #b ", " #temp " \n\t"\ -"pavgb " #temp ", " #a " \n\t"\ -"mov" #size " " #a ", " #b " \n\t" - -QPEL_CAVS(put_, PUT_OP, 3dnow) -QPEL_CAVS(avg_, AVG_3DNOW_OP, 3dnow) -QPEL_CAVS(put_, PUT_OP, mmx2) -QPEL_CAVS(avg_, AVG_MMX2_OP, mmx2) - -CAVS_MC(put_, 8, 3dnow) -CAVS_MC(put_, 16,3dnow) -CAVS_MC(avg_, 8, 3dnow) -CAVS_MC(avg_, 16,3dnow) -CAVS_MC(put_, 8, mmx2) -CAVS_MC(put_, 16,mmx2) -CAVS_MC(avg_, 8, mmx2) -CAVS_MC(avg_, 16,mmx2) - -static void ff_cavsdsp_init_mmx2(CAVSDSPContext* c, AVCodecContext *avctx) { -#define dspfunc(PFX, IDX, NUM) \ - c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmx2; \ - c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_mmx2; \ - c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_mmx2; \ - c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_mmx2; \ - c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_mmx2; \ - - dspfunc(put_cavs_qpel, 0, 16); - dspfunc(put_cavs_qpel, 1, 8); - dspfunc(avg_cavs_qpel, 0, 16); - dspfunc(avg_cavs_qpel, 1, 8); -#undef dspfunc - c->cavs_idct8_add = cavs_idct8_add_mmx; -} - -static void ff_cavsdsp_init_3dnow(CAVSDSPContext* c, AVCodecContext *avctx) { -#define dspfunc(PFX, IDX, NUM) \ - c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmx2; \ - c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_3dnow; \ - c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_3dnow; \ - c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_3dnow; \ - c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_3dnow; \ - - dspfunc(put_cavs_qpel, 0, 16); - dspfunc(put_cavs_qpel, 1, 8); - dspfunc(avg_cavs_qpel, 0, 16); - dspfunc(avg_cavs_qpel, 1, 8); -#undef dspfunc - c->cavs_idct8_add = cavs_idct8_add_mmx; -} - -#endif /* HAVE_INLINE_ASM */ - -void ff_cavsdsp_init_mmx(CAVSDSPContext *c, AVCodecContext *avctx) -{ - int mm_flags = av_get_cpu_flags(); - -#if HAVE_INLINE_ASM - if (mm_flags & AV_CPU_FLAG_MMXEXT) ff_cavsdsp_init_mmx2(c, avctx); - if (mm_flags & AV_CPU_FLAG_3DNOW) ff_cavsdsp_init_3dnow(c, avctx); -#endif /* HAVE_INLINE_ASM */ -} diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 93f9db8299..c1f54ac64f 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -2081,7 +2081,7 @@ PREFETCH(prefetch_3dnow, prefetch) #endif /* HAVE_INLINE_ASM */ -#include "h264_qpel_mmx.c" +#include "h264_qpel.c" void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src, int stride, int h, int x, int y); diff --git a/libavcodec/x86/fdct.c b/libavcodec/x86/fdct.c new file mode 100644 index 0000000000..f9bd3f2508 --- /dev/null +++ b/libavcodec/x86/fdct.c @@ -0,0 +1,585 @@ +/* + * MMX optimized forward DCT + * The gcc porting is Copyright (c) 2001 Fabrice Bellard. + * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer + * SSE2 optimization is Copyright (c) 2004 Denes Balatoni. 
+ * + * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT + * + * Intel Application Note AP-922 - fast, precise implementation of DCT + * http://developer.intel.com/vtune/cbts/appnotes.htm + * + * Also of inspiration: + * a page about fdct at http://www.geocities.com/ssavekar/dct.htm + * Skal's fdct at http://skal.planet-d.net/coding/dct.html + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/common.h" +#include "libavutil/x86/asm.h" +#include "libavcodec/dsputil.h" + +#if HAVE_INLINE_ASM + +////////////////////////////////////////////////////////////////////// +// +// constants for the forward DCT +// ----------------------------- +// +// Be sure to check that your compiler is aligning all constants to QWORD +// (8-byte) memory boundaries! Otherwise the unaligned memory access will +// severely stall MMX execution. +// +////////////////////////////////////////////////////////////////////// + +#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy +#define SHIFT_FRW_COL BITS_FRW_ACC +#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3) +#define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1)) +//#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1)) + +#define X8(x) x,x,x,x,x,x,x,x + +//concatenated table, for forward DCT transformation +DECLARE_ALIGNED(16, static const int16_t, fdct_tg_all_16)[24] = { + X8(13036), // tg * (2<<16) + 0.5 + X8(27146), // tg * (2<<16) + 0.5 + X8(-21746) // tg * (2<<16) + 0.5 +}; + +DECLARE_ALIGNED(16, static const int16_t, ocos_4_16)[8] = { + X8(23170) //cos * (2<<15) + 0.5 +}; + +DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) }; + +DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW }; + +static struct +{ + DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4]; +} fdct_r_row_sse2 = +{{ + RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW +}}; +//DECLARE_ALIGNED(16, static const long, fdct_r_row_sse2)[4] = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW}; + +DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = { // forward_dct coeff table + 16384, 16384, 22725, 19266, + 16384, 16384, 12873, 4520, + 21407, 8867, 19266, -4520, + -8867, -21407, -22725, -12873, + 16384, -16384, 12873, -22725, + -16384, 16384, 4520, 19266, + 8867, -21407, 4520, -12873, + 21407, -8867, 19266, -22725, + + 22725, 22725, 31521, 26722, + 22725, 22725, 17855, 6270, + 29692, 12299, 26722, -6270, + -12299, -29692, -31521, -17855, + 22725, -22725, 17855, -31521, + -22725, 22725, 6270, 26722, + 12299, -29692, 6270, -17855, + 29692, -12299, 26722, -31521, + + 21407, 21407, 29692, 25172, + 21407, 21407, 16819, 5906, + 27969, 11585, 25172, -5906, + -11585, -27969, -29692, -16819, + 21407, -21407, 16819, -29692, + -21407, 21407, 5906, 25172, + 11585, -27969, 5906, -16819, + 27969, -11585, 25172, -29692, + + 19266, 19266, 26722, 
22654, + 19266, 19266, 15137, 5315, + 25172, 10426, 22654, -5315, + -10426, -25172, -26722, -15137, + 19266, -19266, 15137, -26722, + -19266, 19266, 5315, 22654, + 10426, -25172, 5315, -15137, + 25172, -10426, 22654, -26722, + + 16384, 16384, 22725, 19266, + 16384, 16384, 12873, 4520, + 21407, 8867, 19266, -4520, + -8867, -21407, -22725, -12873, + 16384, -16384, 12873, -22725, + -16384, 16384, 4520, 19266, + 8867, -21407, 4520, -12873, + 21407, -8867, 19266, -22725, + + 19266, 19266, 26722, 22654, + 19266, 19266, 15137, 5315, + 25172, 10426, 22654, -5315, + -10426, -25172, -26722, -15137, + 19266, -19266, 15137, -26722, + -19266, 19266, 5315, 22654, + 10426, -25172, 5315, -15137, + 25172, -10426, 22654, -26722, + + 21407, 21407, 29692, 25172, + 21407, 21407, 16819, 5906, + 27969, 11585, 25172, -5906, + -11585, -27969, -29692, -16819, + 21407, -21407, 16819, -29692, + -21407, 21407, 5906, 25172, + 11585, -27969, 5906, -16819, + 27969, -11585, 25172, -29692, + + 22725, 22725, 31521, 26722, + 22725, 22725, 17855, 6270, + 29692, 12299, 26722, -6270, + -12299, -29692, -31521, -17855, + 22725, -22725, 17855, -31521, + -22725, 22725, 6270, 26722, + 12299, -29692, 6270, -17855, + 29692, -12299, 26722, -31521, +}; + +static struct +{ + DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256]; +} tab_frw_01234567_sse2 = +{{ +//DECLARE_ALIGNED(16, static const int16_t, tab_frw_01234567_sse2)[] = { // forward_dct coeff table +#define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \ + C4, C4, C5, C7, C2, C6, C3, -C7, \ + -C4, C4, C7, C3, C6, -C2, C7, -C5, \ + C4, -C4, C5, -C1, C2, -C6, C3, -C1, +// c1..c7 * cos(pi/4) * 2^15 +#define C1 22725 +#define C2 21407 +#define C3 19266 +#define C4 16384 +#define C5 12873 +#define C6 8867 +#define C7 4520 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 31521 +#define C2 29692 +#define C3 26722 +#define C4 22725 +#define C5 17855 +#define C6 12299 +#define C7 6270 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 29692 +#define C2 27969 +#define C3 25172 +#define C4 21407 +#define C5 16819 +#define C6 11585 +#define C7 5906 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 26722 +#define C2 25172 +#define C3 22654 +#define C4 19266 +#define C5 15137 +#define C6 10426 +#define C7 5315 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 22725 +#define C2 21407 +#define C3 19266 +#define C4 16384 +#define C5 12873 +#define C6 8867 +#define C7 4520 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 26722 +#define C2 25172 +#define C3 22654 +#define C4 19266 +#define C5 15137 +#define C6 10426 +#define C7 5315 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 29692 +#define C2 27969 +#define C3 25172 +#define C4 21407 +#define C5 16819 +#define C6 11585 +#define C7 5906 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 31521 +#define C2 29692 +#define C3 26722 +#define C4 22725 +#define C5 17855 +#define C6 12299 +#define C7 6270 +TABLE_SSE2 +}}; + +#define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long + +#define FDCT_COL(cpu, mm, mov)\ +static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\ +{\ + __asm__ volatile (\ + #mov" 16(%0), %%"#mm"0 \n\t" \ + #mov" 
96(%0), %%"#mm"1 \n\t" \ + #mov" %%"#mm"0, %%"#mm"2 \n\t" \ + #mov" 32(%0), %%"#mm"3 \n\t" \ + "paddsw %%"#mm"1, %%"#mm"0 \n\t" \ + #mov" 80(%0), %%"#mm"4 \n\t" \ + "psllw $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \ + #mov" (%0), %%"#mm"5 \n\t" \ + "paddsw %%"#mm"3, %%"#mm"4 \n\t" \ + "paddsw 112(%0), %%"#mm"5 \n\t" \ + "psllw $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \ + #mov" %%"#mm"0, %%"#mm"6 \n\t" \ + "psubsw %%"#mm"1, %%"#mm"2 \n\t" \ + #mov" 16(%1), %%"#mm"1 \n\t" \ + "psubsw %%"#mm"4, %%"#mm"0 \n\t" \ + #mov" 48(%0), %%"#mm"7 \n\t" \ + "pmulhw %%"#mm"0, %%"#mm"1 \n\t" \ + "paddsw 64(%0), %%"#mm"7 \n\t" \ + "psllw $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \ + "paddsw %%"#mm"4, %%"#mm"6 \n\t" \ + "psllw $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \ + #mov" %%"#mm"5, %%"#mm"4 \n\t" \ + "psubsw %%"#mm"7, %%"#mm"5 \n\t" \ + "paddsw %%"#mm"5, %%"#mm"1 \n\t" \ + "paddsw %%"#mm"7, %%"#mm"4 \n\t" \ + "por (%2), %%"#mm"1 \n\t" \ + "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \ + "pmulhw 16(%1), %%"#mm"5 \n\t" \ + #mov" %%"#mm"4, %%"#mm"7 \n\t" \ + "psubsw 80(%0), %%"#mm"3 \n\t" \ + "psubsw %%"#mm"6, %%"#mm"4 \n\t" \ + #mov" %%"#mm"1, 32(%3) \n\t" \ + "paddsw %%"#mm"6, %%"#mm"7 \n\t" \ + #mov" 48(%0), %%"#mm"1 \n\t" \ + "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \ + "psubsw 64(%0), %%"#mm"1 \n\t" \ + #mov" %%"#mm"2, %%"#mm"6 \n\t" \ + #mov" %%"#mm"4, 64(%3) \n\t" \ + "paddsw %%"#mm"3, %%"#mm"2 \n\t" \ + "pmulhw (%4), %%"#mm"2 \n\t" \ + "psubsw %%"#mm"3, %%"#mm"6 \n\t" \ + "pmulhw (%4), %%"#mm"6 \n\t" \ + "psubsw %%"#mm"0, %%"#mm"5 \n\t" \ + "por (%2), %%"#mm"5 \n\t" \ + "psllw $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \ + "por (%2), %%"#mm"2 \n\t" \ + #mov" %%"#mm"1, %%"#mm"4 \n\t" \ + #mov" (%0), %%"#mm"3 \n\t" \ + "paddsw %%"#mm"6, %%"#mm"1 \n\t" \ + "psubsw 112(%0), %%"#mm"3 \n\t" \ + "psubsw %%"#mm"6, %%"#mm"4 \n\t" \ + #mov" (%1), %%"#mm"0 \n\t" \ + "psllw $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \ + #mov" 32(%1), %%"#mm"6 \n\t" \ + "pmulhw %%"#mm"1, %%"#mm"0 \n\t" \ + #mov" %%"#mm"7, (%3) \n\t" \ + "pmulhw %%"#mm"4, %%"#mm"6 \n\t" \ + #mov" %%"#mm"5, 96(%3) \n\t" \ + #mov" %%"#mm"3, %%"#mm"7 \n\t" \ + #mov" 32(%1), %%"#mm"5 \n\t" \ + "psubsw %%"#mm"2, %%"#mm"7 \n\t" \ + "paddsw %%"#mm"2, %%"#mm"3 \n\t" \ + "pmulhw %%"#mm"7, %%"#mm"5 \n\t" \ + "paddsw %%"#mm"3, %%"#mm"0 \n\t" \ + "paddsw %%"#mm"4, %%"#mm"6 \n\t" \ + "pmulhw (%1), %%"#mm"3 \n\t" \ + "por (%2), %%"#mm"0 \n\t" \ + "paddsw %%"#mm"7, %%"#mm"5 \n\t" \ + "psubsw %%"#mm"6, %%"#mm"7 \n\t" \ + #mov" %%"#mm"0, 16(%3) \n\t" \ + "paddsw %%"#mm"4, %%"#mm"5 \n\t" \ + #mov" %%"#mm"7, 48(%3) \n\t" \ + "psubsw %%"#mm"1, %%"#mm"3 \n\t" \ + #mov" %%"#mm"5, 80(%3) \n\t" \ + #mov" %%"#mm"3, 112(%3) \n\t" \ + : \ + : "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \ + "r" (out + offset), "r" (ocos_4_16)); \ +} + +FDCT_COL(mmx, mm, movq) +FDCT_COL(sse2, xmm, movdqa) + +static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out) +{ + __asm__ volatile( +#define FDCT_ROW_SSE2_H1(i,t) \ + "movq " #i "(%0), %%xmm2 \n\t" \ + "movq " #i "+8(%0), %%xmm0 \n\t" \ + "movdqa " #t "+32(%1), %%xmm3 \n\t" \ + "movdqa " #t "+48(%1), %%xmm7 \n\t" \ + "movdqa " #t "(%1), %%xmm4 \n\t" \ + "movdqa " #t "+16(%1), %%xmm5 \n\t" + +#define FDCT_ROW_SSE2_H2(i,t) \ + "movq " #i "(%0), %%xmm2 \n\t" \ + "movq " #i "+8(%0), %%xmm0 \n\t" \ + "movdqa " #t "+32(%1), %%xmm3 \n\t" \ + "movdqa " #t "+48(%1), %%xmm7 \n\t" + +#define FDCT_ROW_SSE2(i) \ + "movq %%xmm2, %%xmm1 \n\t" \ + "pshuflw $27, %%xmm0, %%xmm0 \n\t" \ + "paddsw %%xmm0, %%xmm1 \n\t" \ + "psubsw %%xmm0, %%xmm2 \n\t" \ 
+ "punpckldq %%xmm2, %%xmm1 \n\t" \ + "pshufd $78, %%xmm1, %%xmm2 \n\t" \ + "pmaddwd %%xmm2, %%xmm3 \n\t" \ + "pmaddwd %%xmm1, %%xmm7 \n\t" \ + "pmaddwd %%xmm5, %%xmm2 \n\t" \ + "pmaddwd %%xmm4, %%xmm1 \n\t" \ + "paddd %%xmm7, %%xmm3 \n\t" \ + "paddd %%xmm2, %%xmm1 \n\t" \ + "paddd %%xmm6, %%xmm3 \n\t" \ + "paddd %%xmm6, %%xmm1 \n\t" \ + "psrad %3, %%xmm3 \n\t" \ + "psrad %3, %%xmm1 \n\t" \ + "packssdw %%xmm3, %%xmm1 \n\t" \ + "movdqa %%xmm1, " #i "(%4) \n\t" + + "movdqa (%2), %%xmm6 \n\t" + FDCT_ROW_SSE2_H1(0,0) + FDCT_ROW_SSE2(0) + FDCT_ROW_SSE2_H2(64,0) + FDCT_ROW_SSE2(64) + + FDCT_ROW_SSE2_H1(16,64) + FDCT_ROW_SSE2(16) + FDCT_ROW_SSE2_H2(112,64) + FDCT_ROW_SSE2(112) + + FDCT_ROW_SSE2_H1(32,128) + FDCT_ROW_SSE2(32) + FDCT_ROW_SSE2_H2(96,128) + FDCT_ROW_SSE2(96) + + FDCT_ROW_SSE2_H1(48,192) + FDCT_ROW_SSE2(48) + FDCT_ROW_SSE2_H2(80,192) + FDCT_ROW_SSE2(80) + : + : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2), + "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out) + XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7") + ); +} + +static av_always_inline void fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table) +{ + __asm__ volatile ( + "pshufw $0x1B, 8(%0), %%mm5 \n\t" + "movq (%0), %%mm0 \n\t" + "movq %%mm0, %%mm1 \n\t" + "paddsw %%mm5, %%mm0 \n\t" + "psubsw %%mm5, %%mm1 \n\t" + "movq %%mm0, %%mm2 \n\t" + "punpckldq %%mm1, %%mm0 \n\t" + "punpckhdq %%mm1, %%mm2 \n\t" + "movq (%1), %%mm1 \n\t" + "movq 8(%1), %%mm3 \n\t" + "movq 16(%1), %%mm4 \n\t" + "movq 24(%1), %%mm5 \n\t" + "movq 32(%1), %%mm6 \n\t" + "movq 40(%1), %%mm7 \n\t" + "pmaddwd %%mm0, %%mm1 \n\t" + "pmaddwd %%mm2, %%mm3 \n\t" + "pmaddwd %%mm0, %%mm4 \n\t" + "pmaddwd %%mm2, %%mm5 \n\t" + "pmaddwd %%mm0, %%mm6 \n\t" + "pmaddwd %%mm2, %%mm7 \n\t" + "pmaddwd 48(%1), %%mm0 \n\t" + "pmaddwd 56(%1), %%mm2 \n\t" + "paddd %%mm1, %%mm3 \n\t" + "paddd %%mm4, %%mm5 \n\t" + "paddd %%mm6, %%mm7 \n\t" + "paddd %%mm0, %%mm2 \n\t" + "movq (%2), %%mm0 \n\t" + "paddd %%mm0, %%mm3 \n\t" + "paddd %%mm0, %%mm5 \n\t" + "paddd %%mm0, %%mm7 \n\t" + "paddd %%mm0, %%mm2 \n\t" + "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t" + "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t" + "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t" + "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t" + "packssdw %%mm5, %%mm3 \n\t" + "packssdw %%mm2, %%mm7 \n\t" + "movq %%mm3, (%3) \n\t" + "movq %%mm7, 8(%3) \n\t" + : + : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out)); +} + +static av_always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table) +{ + //FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...) 
+ __asm__ volatile( + "movd 12(%0), %%mm1 \n\t" + "punpcklwd 8(%0), %%mm1 \n\t" + "movq %%mm1, %%mm2 \n\t" + "psrlq $0x20, %%mm1 \n\t" + "movq 0(%0), %%mm0 \n\t" + "punpcklwd %%mm2, %%mm1 \n\t" + "movq %%mm0, %%mm5 \n\t" + "paddsw %%mm1, %%mm0 \n\t" + "psubsw %%mm1, %%mm5 \n\t" + "movq %%mm0, %%mm2 \n\t" + "punpckldq %%mm5, %%mm0 \n\t" + "punpckhdq %%mm5, %%mm2 \n\t" + "movq 0(%1), %%mm1 \n\t" + "movq 8(%1), %%mm3 \n\t" + "movq 16(%1), %%mm4 \n\t" + "movq 24(%1), %%mm5 \n\t" + "movq 32(%1), %%mm6 \n\t" + "movq 40(%1), %%mm7 \n\t" + "pmaddwd %%mm0, %%mm1 \n\t" + "pmaddwd %%mm2, %%mm3 \n\t" + "pmaddwd %%mm0, %%mm4 \n\t" + "pmaddwd %%mm2, %%mm5 \n\t" + "pmaddwd %%mm0, %%mm6 \n\t" + "pmaddwd %%mm2, %%mm7 \n\t" + "pmaddwd 48(%1), %%mm0 \n\t" + "pmaddwd 56(%1), %%mm2 \n\t" + "paddd %%mm1, %%mm3 \n\t" + "paddd %%mm4, %%mm5 \n\t" + "paddd %%mm6, %%mm7 \n\t" + "paddd %%mm0, %%mm2 \n\t" + "movq (%2), %%mm0 \n\t" + "paddd %%mm0, %%mm3 \n\t" + "paddd %%mm0, %%mm5 \n\t" + "paddd %%mm0, %%mm7 \n\t" + "paddd %%mm0, %%mm2 \n\t" + "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t" + "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t" + "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t" + "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t" + "packssdw %%mm5, %%mm3 \n\t" + "packssdw %%mm2, %%mm7 \n\t" + "movq %%mm3, 0(%3) \n\t" + "movq %%mm7, 8(%3) \n\t" + : + : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out)); +} + +void ff_fdct_mmx(int16_t *block) +{ + DECLARE_ALIGNED(8, int64_t, align_tmp)[16]; + int16_t * block1= (int16_t*)align_tmp; + const int16_t *table= tab_frw_01234567; + int i; + + fdct_col_mmx(block, block1, 0); + fdct_col_mmx(block, block1, 4); + + for(i=8;i>0;i--) { + fdct_row_mmx(block1, block, table); + block1 += 8; + table += 32; + block += 8; + } +} + +void ff_fdct_mmx2(int16_t *block) +{ + DECLARE_ALIGNED(8, int64_t, align_tmp)[16]; + int16_t *block1= (int16_t*)align_tmp; + const int16_t *table= tab_frw_01234567; + int i; + + fdct_col_mmx(block, block1, 0); + fdct_col_mmx(block, block1, 4); + + for(i=8;i>0;i--) { + fdct_row_mmx2(block1, block, table); + block1 += 8; + table += 32; + block += 8; + } +} + +void ff_fdct_sse2(int16_t *block) +{ + DECLARE_ALIGNED(16, int64_t, align_tmp)[16]; + int16_t * const block1= (int16_t*)align_tmp; + + fdct_col_sse2(block, block1, 0); + fdct_row_sse2(block1, block); +} + +#endif /* HAVE_INLINE_ASM */ diff --git a/libavcodec/x86/fdct_mmx.c b/libavcodec/x86/fdct_mmx.c deleted file mode 100644 index f9bd3f2508..0000000000 --- a/libavcodec/x86/fdct_mmx.c +++ /dev/null @@ -1,585 +0,0 @@ -/* - * MMX optimized forward DCT - * The gcc porting is Copyright (c) 2001 Fabrice Bellard. - * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer - * SSE2 optimization is Copyright (c) 2004 Denes Balatoni. - * - * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT - * - * Intel Application Note AP-922 - fast, precise implementation of DCT - * http://developer.intel.com/vtune/cbts/appnotes.htm - * - * Also of inspiration: - * a page about fdct at http://www.geocities.com/ssavekar/dct.htm - * Skal's fdct at http://skal.planet-d.net/coding/dct.html - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/common.h" -#include "libavutil/x86/asm.h" -#include "libavcodec/dsputil.h" - -#if HAVE_INLINE_ASM - -////////////////////////////////////////////////////////////////////// -// -// constants for the forward DCT -// ----------------------------- -// -// Be sure to check that your compiler is aligning all constants to QWORD -// (8-byte) memory boundaries! Otherwise the unaligned memory access will -// severely stall MMX execution. -// -////////////////////////////////////////////////////////////////////// - -#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy -#define SHIFT_FRW_COL BITS_FRW_ACC -#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3) -#define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1)) -//#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1)) - -#define X8(x) x,x,x,x,x,x,x,x - -//concatenated table, for forward DCT transformation -DECLARE_ALIGNED(16, static const int16_t, fdct_tg_all_16)[24] = { - X8(13036), // tg * (2<<16) + 0.5 - X8(27146), // tg * (2<<16) + 0.5 - X8(-21746) // tg * (2<<16) + 0.5 -}; - -DECLARE_ALIGNED(16, static const int16_t, ocos_4_16)[8] = { - X8(23170) //cos * (2<<15) + 0.5 -}; - -DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) }; - -DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW }; - -static struct -{ - DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4]; -} fdct_r_row_sse2 = -{{ - RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW -}}; -//DECLARE_ALIGNED(16, static const long, fdct_r_row_sse2)[4] = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW}; - -DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = { // forward_dct coeff table - 16384, 16384, 22725, 19266, - 16384, 16384, 12873, 4520, - 21407, 8867, 19266, -4520, - -8867, -21407, -22725, -12873, - 16384, -16384, 12873, -22725, - -16384, 16384, 4520, 19266, - 8867, -21407, 4520, -12873, - 21407, -8867, 19266, -22725, - - 22725, 22725, 31521, 26722, - 22725, 22725, 17855, 6270, - 29692, 12299, 26722, -6270, - -12299, -29692, -31521, -17855, - 22725, -22725, 17855, -31521, - -22725, 22725, 6270, 26722, - 12299, -29692, 6270, -17855, - 29692, -12299, 26722, -31521, - - 21407, 21407, 29692, 25172, - 21407, 21407, 16819, 5906, - 27969, 11585, 25172, -5906, - -11585, -27969, -29692, -16819, - 21407, -21407, 16819, -29692, - -21407, 21407, 5906, 25172, - 11585, -27969, 5906, -16819, - 27969, -11585, 25172, -29692, - - 19266, 19266, 26722, 22654, - 19266, 19266, 15137, 5315, - 25172, 10426, 22654, -5315, - -10426, -25172, -26722, -15137, - 19266, -19266, 15137, -26722, - -19266, 19266, 5315, 22654, - 10426, -25172, 5315, -15137, - 25172, -10426, 22654, -26722, - - 16384, 16384, 22725, 19266, - 16384, 16384, 12873, 4520, - 21407, 8867, 19266, -4520, - -8867, -21407, -22725, -12873, - 16384, -16384, 12873, -22725, - -16384, 16384, 4520, 19266, - 8867, -21407, 4520, -12873, - 21407, -8867, 19266, -22725, - - 19266, 19266, 26722, 22654, - 19266, 19266, 15137, 5315, - 25172, 10426, 22654, -5315, - -10426, -25172, -26722, -15137, - 19266, -19266, 15137, -26722, - -19266, 19266, 
5315, 22654, - 10426, -25172, 5315, -15137, - 25172, -10426, 22654, -26722, - - 21407, 21407, 29692, 25172, - 21407, 21407, 16819, 5906, - 27969, 11585, 25172, -5906, - -11585, -27969, -29692, -16819, - 21407, -21407, 16819, -29692, - -21407, 21407, 5906, 25172, - 11585, -27969, 5906, -16819, - 27969, -11585, 25172, -29692, - - 22725, 22725, 31521, 26722, - 22725, 22725, 17855, 6270, - 29692, 12299, 26722, -6270, - -12299, -29692, -31521, -17855, - 22725, -22725, 17855, -31521, - -22725, 22725, 6270, 26722, - 12299, -29692, 6270, -17855, - 29692, -12299, 26722, -31521, -}; - -static struct -{ - DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256]; -} tab_frw_01234567_sse2 = -{{ -//DECLARE_ALIGNED(16, static const int16_t, tab_frw_01234567_sse2)[] = { // forward_dct coeff table -#define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \ - C4, C4, C5, C7, C2, C6, C3, -C7, \ - -C4, C4, C7, C3, C6, -C2, C7, -C5, \ - C4, -C4, C5, -C1, C2, -C6, C3, -C1, -// c1..c7 * cos(pi/4) * 2^15 -#define C1 22725 -#define C2 21407 -#define C3 19266 -#define C4 16384 -#define C5 12873 -#define C6 8867 -#define C7 4520 -TABLE_SSE2 - -#undef C1 -#undef C2 -#undef C3 -#undef C4 -#undef C5 -#undef C6 -#undef C7 -#define C1 31521 -#define C2 29692 -#define C3 26722 -#define C4 22725 -#define C5 17855 -#define C6 12299 -#define C7 6270 -TABLE_SSE2 - -#undef C1 -#undef C2 -#undef C3 -#undef C4 -#undef C5 -#undef C6 -#undef C7 -#define C1 29692 -#define C2 27969 -#define C3 25172 -#define C4 21407 -#define C5 16819 -#define C6 11585 -#define C7 5906 -TABLE_SSE2 - -#undef C1 -#undef C2 -#undef C3 -#undef C4 -#undef C5 -#undef C6 -#undef C7 -#define C1 26722 -#define C2 25172 -#define C3 22654 -#define C4 19266 -#define C5 15137 -#define C6 10426 -#define C7 5315 -TABLE_SSE2 - -#undef C1 -#undef C2 -#undef C3 -#undef C4 -#undef C5 -#undef C6 -#undef C7 -#define C1 22725 -#define C2 21407 -#define C3 19266 -#define C4 16384 -#define C5 12873 -#define C6 8867 -#define C7 4520 -TABLE_SSE2 - -#undef C1 -#undef C2 -#undef C3 -#undef C4 -#undef C5 -#undef C6 -#undef C7 -#define C1 26722 -#define C2 25172 -#define C3 22654 -#define C4 19266 -#define C5 15137 -#define C6 10426 -#define C7 5315 -TABLE_SSE2 - -#undef C1 -#undef C2 -#undef C3 -#undef C4 -#undef C5 -#undef C6 -#undef C7 -#define C1 29692 -#define C2 27969 -#define C3 25172 -#define C4 21407 -#define C5 16819 -#define C6 11585 -#define C7 5906 -TABLE_SSE2 - -#undef C1 -#undef C2 -#undef C3 -#undef C4 -#undef C5 -#undef C6 -#undef C7 -#define C1 31521 -#define C2 29692 -#define C3 26722 -#define C4 22725 -#define C5 17855 -#define C6 12299 -#define C7 6270 -TABLE_SSE2 -}}; - -#define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long - -#define FDCT_COL(cpu, mm, mov)\ -static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\ -{\ - __asm__ volatile (\ - #mov" 16(%0), %%"#mm"0 \n\t" \ - #mov" 96(%0), %%"#mm"1 \n\t" \ - #mov" %%"#mm"0, %%"#mm"2 \n\t" \ - #mov" 32(%0), %%"#mm"3 \n\t" \ - "paddsw %%"#mm"1, %%"#mm"0 \n\t" \ - #mov" 80(%0), %%"#mm"4 \n\t" \ - "psllw $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \ - #mov" (%0), %%"#mm"5 \n\t" \ - "paddsw %%"#mm"3, %%"#mm"4 \n\t" \ - "paddsw 112(%0), %%"#mm"5 \n\t" \ - "psllw $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \ - #mov" %%"#mm"0, %%"#mm"6 \n\t" \ - "psubsw %%"#mm"1, %%"#mm"2 \n\t" \ - #mov" 16(%1), %%"#mm"1 \n\t" \ - "psubsw %%"#mm"4, %%"#mm"0 \n\t" \ - #mov" 48(%0), %%"#mm"7 \n\t" \ - "pmulhw %%"#mm"0, %%"#mm"1 \n\t" \ - "paddsw 64(%0), %%"#mm"7 \n\t" \ - "psllw $"S(SHIFT_FRW_COL)", %%"#mm"5 
\n\t" \ - "paddsw %%"#mm"4, %%"#mm"6 \n\t" \ - "psllw $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \ - #mov" %%"#mm"5, %%"#mm"4 \n\t" \ - "psubsw %%"#mm"7, %%"#mm"5 \n\t" \ - "paddsw %%"#mm"5, %%"#mm"1 \n\t" \ - "paddsw %%"#mm"7, %%"#mm"4 \n\t" \ - "por (%2), %%"#mm"1 \n\t" \ - "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \ - "pmulhw 16(%1), %%"#mm"5 \n\t" \ - #mov" %%"#mm"4, %%"#mm"7 \n\t" \ - "psubsw 80(%0), %%"#mm"3 \n\t" \ - "psubsw %%"#mm"6, %%"#mm"4 \n\t" \ - #mov" %%"#mm"1, 32(%3) \n\t" \ - "paddsw %%"#mm"6, %%"#mm"7 \n\t" \ - #mov" 48(%0), %%"#mm"1 \n\t" \ - "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \ - "psubsw 64(%0), %%"#mm"1 \n\t" \ - #mov" %%"#mm"2, %%"#mm"6 \n\t" \ - #mov" %%"#mm"4, 64(%3) \n\t" \ - "paddsw %%"#mm"3, %%"#mm"2 \n\t" \ - "pmulhw (%4), %%"#mm"2 \n\t" \ - "psubsw %%"#mm"3, %%"#mm"6 \n\t" \ - "pmulhw (%4), %%"#mm"6 \n\t" \ - "psubsw %%"#mm"0, %%"#mm"5 \n\t" \ - "por (%2), %%"#mm"5 \n\t" \ - "psllw $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \ - "por (%2), %%"#mm"2 \n\t" \ - #mov" %%"#mm"1, %%"#mm"4 \n\t" \ - #mov" (%0), %%"#mm"3 \n\t" \ - "paddsw %%"#mm"6, %%"#mm"1 \n\t" \ - "psubsw 112(%0), %%"#mm"3 \n\t" \ - "psubsw %%"#mm"6, %%"#mm"4 \n\t" \ - #mov" (%1), %%"#mm"0 \n\t" \ - "psllw $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \ - #mov" 32(%1), %%"#mm"6 \n\t" \ - "pmulhw %%"#mm"1, %%"#mm"0 \n\t" \ - #mov" %%"#mm"7, (%3) \n\t" \ - "pmulhw %%"#mm"4, %%"#mm"6 \n\t" \ - #mov" %%"#mm"5, 96(%3) \n\t" \ - #mov" %%"#mm"3, %%"#mm"7 \n\t" \ - #mov" 32(%1), %%"#mm"5 \n\t" \ - "psubsw %%"#mm"2, %%"#mm"7 \n\t" \ - "paddsw %%"#mm"2, %%"#mm"3 \n\t" \ - "pmulhw %%"#mm"7, %%"#mm"5 \n\t" \ - "paddsw %%"#mm"3, %%"#mm"0 \n\t" \ - "paddsw %%"#mm"4, %%"#mm"6 \n\t" \ - "pmulhw (%1), %%"#mm"3 \n\t" \ - "por (%2), %%"#mm"0 \n\t" \ - "paddsw %%"#mm"7, %%"#mm"5 \n\t" \ - "psubsw %%"#mm"6, %%"#mm"7 \n\t" \ - #mov" %%"#mm"0, 16(%3) \n\t" \ - "paddsw %%"#mm"4, %%"#mm"5 \n\t" \ - #mov" %%"#mm"7, 48(%3) \n\t" \ - "psubsw %%"#mm"1, %%"#mm"3 \n\t" \ - #mov" %%"#mm"5, 80(%3) \n\t" \ - #mov" %%"#mm"3, 112(%3) \n\t" \ - : \ - : "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \ - "r" (out + offset), "r" (ocos_4_16)); \ -} - -FDCT_COL(mmx, mm, movq) -FDCT_COL(sse2, xmm, movdqa) - -static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out) -{ - __asm__ volatile( -#define FDCT_ROW_SSE2_H1(i,t) \ - "movq " #i "(%0), %%xmm2 \n\t" \ - "movq " #i "+8(%0), %%xmm0 \n\t" \ - "movdqa " #t "+32(%1), %%xmm3 \n\t" \ - "movdqa " #t "+48(%1), %%xmm7 \n\t" \ - "movdqa " #t "(%1), %%xmm4 \n\t" \ - "movdqa " #t "+16(%1), %%xmm5 \n\t" - -#define FDCT_ROW_SSE2_H2(i,t) \ - "movq " #i "(%0), %%xmm2 \n\t" \ - "movq " #i "+8(%0), %%xmm0 \n\t" \ - "movdqa " #t "+32(%1), %%xmm3 \n\t" \ - "movdqa " #t "+48(%1), %%xmm7 \n\t" - -#define FDCT_ROW_SSE2(i) \ - "movq %%xmm2, %%xmm1 \n\t" \ - "pshuflw $27, %%xmm0, %%xmm0 \n\t" \ - "paddsw %%xmm0, %%xmm1 \n\t" \ - "psubsw %%xmm0, %%xmm2 \n\t" \ - "punpckldq %%xmm2, %%xmm1 \n\t" \ - "pshufd $78, %%xmm1, %%xmm2 \n\t" \ - "pmaddwd %%xmm2, %%xmm3 \n\t" \ - "pmaddwd %%xmm1, %%xmm7 \n\t" \ - "pmaddwd %%xmm5, %%xmm2 \n\t" \ - "pmaddwd %%xmm4, %%xmm1 \n\t" \ - "paddd %%xmm7, %%xmm3 \n\t" \ - "paddd %%xmm2, %%xmm1 \n\t" \ - "paddd %%xmm6, %%xmm3 \n\t" \ - "paddd %%xmm6, %%xmm1 \n\t" \ - "psrad %3, %%xmm3 \n\t" \ - "psrad %3, %%xmm1 \n\t" \ - "packssdw %%xmm3, %%xmm1 \n\t" \ - "movdqa %%xmm1, " #i "(%4) \n\t" - - "movdqa (%2), %%xmm6 \n\t" - FDCT_ROW_SSE2_H1(0,0) - FDCT_ROW_SSE2(0) - FDCT_ROW_SSE2_H2(64,0) - FDCT_ROW_SSE2(64) - - FDCT_ROW_SSE2_H1(16,64) - FDCT_ROW_SSE2(16) - 
FDCT_ROW_SSE2_H2(112,64) - FDCT_ROW_SSE2(112) - - FDCT_ROW_SSE2_H1(32,128) - FDCT_ROW_SSE2(32) - FDCT_ROW_SSE2_H2(96,128) - FDCT_ROW_SSE2(96) - - FDCT_ROW_SSE2_H1(48,192) - FDCT_ROW_SSE2(48) - FDCT_ROW_SSE2_H2(80,192) - FDCT_ROW_SSE2(80) - : - : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2), - "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out) - XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7") - ); -} - -static av_always_inline void fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table) -{ - __asm__ volatile ( - "pshufw $0x1B, 8(%0), %%mm5 \n\t" - "movq (%0), %%mm0 \n\t" - "movq %%mm0, %%mm1 \n\t" - "paddsw %%mm5, %%mm0 \n\t" - "psubsw %%mm5, %%mm1 \n\t" - "movq %%mm0, %%mm2 \n\t" - "punpckldq %%mm1, %%mm0 \n\t" - "punpckhdq %%mm1, %%mm2 \n\t" - "movq (%1), %%mm1 \n\t" - "movq 8(%1), %%mm3 \n\t" - "movq 16(%1), %%mm4 \n\t" - "movq 24(%1), %%mm5 \n\t" - "movq 32(%1), %%mm6 \n\t" - "movq 40(%1), %%mm7 \n\t" - "pmaddwd %%mm0, %%mm1 \n\t" - "pmaddwd %%mm2, %%mm3 \n\t" - "pmaddwd %%mm0, %%mm4 \n\t" - "pmaddwd %%mm2, %%mm5 \n\t" - "pmaddwd %%mm0, %%mm6 \n\t" - "pmaddwd %%mm2, %%mm7 \n\t" - "pmaddwd 48(%1), %%mm0 \n\t" - "pmaddwd 56(%1), %%mm2 \n\t" - "paddd %%mm1, %%mm3 \n\t" - "paddd %%mm4, %%mm5 \n\t" - "paddd %%mm6, %%mm7 \n\t" - "paddd %%mm0, %%mm2 \n\t" - "movq (%2), %%mm0 \n\t" - "paddd %%mm0, %%mm3 \n\t" - "paddd %%mm0, %%mm5 \n\t" - "paddd %%mm0, %%mm7 \n\t" - "paddd %%mm0, %%mm2 \n\t" - "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t" - "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t" - "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t" - "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t" - "packssdw %%mm5, %%mm3 \n\t" - "packssdw %%mm2, %%mm7 \n\t" - "movq %%mm3, (%3) \n\t" - "movq %%mm7, 8(%3) \n\t" - : - : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out)); -} - -static av_always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table) -{ - //FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...) 
- __asm__ volatile( - "movd 12(%0), %%mm1 \n\t" - "punpcklwd 8(%0), %%mm1 \n\t" - "movq %%mm1, %%mm2 \n\t" - "psrlq $0x20, %%mm1 \n\t" - "movq 0(%0), %%mm0 \n\t" - "punpcklwd %%mm2, %%mm1 \n\t" - "movq %%mm0, %%mm5 \n\t" - "paddsw %%mm1, %%mm0 \n\t" - "psubsw %%mm1, %%mm5 \n\t" - "movq %%mm0, %%mm2 \n\t" - "punpckldq %%mm5, %%mm0 \n\t" - "punpckhdq %%mm5, %%mm2 \n\t" - "movq 0(%1), %%mm1 \n\t" - "movq 8(%1), %%mm3 \n\t" - "movq 16(%1), %%mm4 \n\t" - "movq 24(%1), %%mm5 \n\t" - "movq 32(%1), %%mm6 \n\t" - "movq 40(%1), %%mm7 \n\t" - "pmaddwd %%mm0, %%mm1 \n\t" - "pmaddwd %%mm2, %%mm3 \n\t" - "pmaddwd %%mm0, %%mm4 \n\t" - "pmaddwd %%mm2, %%mm5 \n\t" - "pmaddwd %%mm0, %%mm6 \n\t" - "pmaddwd %%mm2, %%mm7 \n\t" - "pmaddwd 48(%1), %%mm0 \n\t" - "pmaddwd 56(%1), %%mm2 \n\t" - "paddd %%mm1, %%mm3 \n\t" - "paddd %%mm4, %%mm5 \n\t" - "paddd %%mm6, %%mm7 \n\t" - "paddd %%mm0, %%mm2 \n\t" - "movq (%2), %%mm0 \n\t" - "paddd %%mm0, %%mm3 \n\t" - "paddd %%mm0, %%mm5 \n\t" - "paddd %%mm0, %%mm7 \n\t" - "paddd %%mm0, %%mm2 \n\t" - "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t" - "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t" - "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t" - "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t" - "packssdw %%mm5, %%mm3 \n\t" - "packssdw %%mm2, %%mm7 \n\t" - "movq %%mm3, 0(%3) \n\t" - "movq %%mm7, 8(%3) \n\t" - : - : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out)); -} - -void ff_fdct_mmx(int16_t *block) -{ - DECLARE_ALIGNED(8, int64_t, align_tmp)[16]; - int16_t * block1= (int16_t*)align_tmp; - const int16_t *table= tab_frw_01234567; - int i; - - fdct_col_mmx(block, block1, 0); - fdct_col_mmx(block, block1, 4); - - for(i=8;i>0;i--) { - fdct_row_mmx(block1, block, table); - block1 += 8; - table += 32; - block += 8; - } -} - -void ff_fdct_mmx2(int16_t *block) -{ - DECLARE_ALIGNED(8, int64_t, align_tmp)[16]; - int16_t *block1= (int16_t*)align_tmp; - const int16_t *table= tab_frw_01234567; - int i; - - fdct_col_mmx(block, block1, 0); - fdct_col_mmx(block, block1, 4); - - for(i=8;i>0;i--) { - fdct_row_mmx2(block1, block, table); - block1 += 8; - table += 32; - block += 8; - } -} - -void ff_fdct_sse2(int16_t *block) -{ - DECLARE_ALIGNED(16, int64_t, align_tmp)[16]; - int16_t * const block1= (int16_t*)align_tmp; - - fdct_col_sse2(block, block1, 0); - fdct_row_sse2(block1, block); -} - -#endif /* HAVE_INLINE_ASM */ diff --git a/libavcodec/x86/fft.asm b/libavcodec/x86/fft.asm new file mode 100644 index 0000000000..645253cbd3 --- /dev/null +++ b/libavcodec/x86/fft.asm @@ -0,0 +1,1105 @@ +;****************************************************************************** +;* FFT transform with SSE/3DNow optimizations +;* Copyright (c) 2008 Loren Merritt +;* Copyright (c) 2011 Vitor Sessak +;* +;* This algorithm (though not any of the implementation details) is +;* based on libdjbfft by D. J. Bernstein. +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +; These functions are not individually interchangeable with the C versions. +; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results +; in blocks as conventient to the vector size. +; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively) + +%include "x86inc.asm" +%include "x86util.asm" + +%if ARCH_X86_64 +%define pointer resq +%else +%define pointer resd +%endif + +struc FFTContext + .nbits: resd 1 + .reverse: resd 1 + .revtab: pointer 1 + .tmpbuf: pointer 1 + .mdctsize: resd 1 + .mdctbits: resd 1 + .tcos: pointer 1 + .tsin: pointer 1 + .fftperm: pointer 1 + .fftcalc: pointer 1 + .imdctcalc:pointer 1 + .imdcthalf:pointer 1 +endstruc + +SECTION_RODATA + +%define M_SQRT1_2 0.70710678118654752440 +%define M_COS_PI_1_8 0.923879532511287 +%define M_COS_PI_3_8 0.38268343236509 + +align 32 +ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8 +ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8 + +ps_root2: times 8 dd M_SQRT1_2 +ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 +ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0 + +perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01 +perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03 +ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2 +ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31 +ps_m1m1m1m1: times 4 dd 1<<31 +ps_m1p1: dd 1<<31, 0 + +%assign i 16 +%rep 13 +cextern cos_ %+ i +%assign i i<<1 +%endrep + +%if ARCH_X86_64 + %define pointer dq +%else + %define pointer dd +%endif + +%macro IF0 1+ +%endmacro +%macro IF1 1+ + %1 +%endmacro + +SECTION_TEXT + +%macro T2_3DNOW 4 ; z0, z1, mem0, mem1 + mova %1, %3 + mova %2, %1 + pfadd %1, %4 + pfsub %2, %4 +%endmacro + +%macro T4_3DNOW 6 ; z0, z1, z2, z3, tmp0, tmp1 + mova %5, %3 + pfsub %3, %4 + pfadd %5, %4 ; {t6,t5} + pxor %3, [ps_m1p1] ; {t8,t7} + mova %6, %1 + PSWAPD %3, %3 + pfadd %1, %5 ; {r0,i0} + pfsub %6, %5 ; {r2,i2} + mova %4, %2 + pfadd %2, %3 ; {r1,i1} + pfsub %4, %3 ; {r3,i3} + SWAP %3, %6 +%endmacro + +; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6} +; %2 = {r1,i1,r3,i3,r5,i5,r7,i7} +; %3, %4, %5 tmp +; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3} +; %2 = {r4,r5,r6,r7,i4,i5,i6,i7} +%macro T8_AVX 5 + vsubps %5, %1, %2 ; v = %1 - %2 + vaddps %3, %1, %2 ; w = %1 + %2 + vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1 + vpermilps %2, %2, [perm1] + vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6} + vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5} + vsubps %4, %5, %1 ; s = r - q + vaddps %1, %5, %1 ; u = r + q + vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8} + vshufps %5, %4, %1, 0xbb + vshufps %3, %4, %1, 0xee + vperm2f128 %3, %3, %5, 0x13 + vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {1,1,-1,-1,1,-1,-1,-1} + vshufps %2, %1, %4, 0xdd + vshufps %1, %1, %4, 0x88 + vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4} + vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7} + vsubps %5, %1, %3 + vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8} + vsubps %2, %4, %1 ; %2 = v - w + vaddps %1, %4, %1 ; %1 = v + w +%endmacro + +; In SSE 
mode do one fft4 transforms +; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3} +; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} +; +; In AVX mode do two fft4 transforms +; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7} +; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7} +%macro T4_SSE 3 + subps %3, %1, %2 ; {t3,t4,-t8,t7} + addps %1, %1, %2 ; {t1,t2,t6,t5} + xorps %3, %3, [ps_p1p1m1p1] + shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8} + shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4} + subps %3, %1, %2 ; {r2,i2,r3,i3} + addps %1, %1, %2 ; {r0,i0,r1,i1} + shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3} + shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3} +%endmacro + +; In SSE mode do one FFT8 +; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7} +; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %1={r4,r5,r6,r7} %2={i4,i5,i6,i7} +; +; In AVX mode do two FFT8 +; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11} +; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15} +; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11} +; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15} +%macro T8_SSE 6 + addps %6, %3, %4 ; {t1,t2,t3,t4} + subps %3, %3, %4 ; {r5,i5,r7,i7} + shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7} + mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7} + mulps %4, %4, [ps_root2] + addps %3, %3, %4 ; {t8,t7,ta,t9} + shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta} + shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8} + subps %3, %6, %4 ; {t6,t5,tc,tb} + addps %6, %6, %4 ; {t1,t2,t9,ta} + shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc} + shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb} + subps %3, %1, %6 ; {r4,r5,r6,r7} + addps %1, %1, %6 ; {r0,r1,r2,r3} + subps %4, %2, %5 ; {i4,i5,i6,i7} + addps %2, %2, %5 ; {i0,i1,i2,i3} +%endmacro + +; scheduled for cpu-bound sizes +%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim +IF%1 mova m4, Z(4) +IF%1 mova m5, Z(5) + mova m0, %2 ; wre + mova m1, %3 ; wim + mulps m2, m4, m0 ; r2*wre +IF%1 mova m6, Z2(6) + mulps m3, m5, m1 ; i2*wim +IF%1 mova m7, Z2(7) + mulps m4, m4, m1 ; r2*wim + mulps m5, m5, m0 ; i2*wre + addps m2, m2, m3 ; r2*wre + i2*wim + mulps m3, m1, m7 ; i3*wim + subps m5, m5, m4 ; i2*wre - r2*wim + mulps m1, m1, m6 ; r3*wim + mulps m4, m0, m6 ; r3*wre + mulps m0, m0, m7 ; i3*wre + subps m4, m4, m3 ; r3*wre - i3*wim + mova m3, Z(0) + addps m0, m0, m1 ; i3*wre + r3*wim + subps m1, m4, m2 ; t3 + addps m4, m4, m2 ; t5 + subps m3, m3, m4 ; r2 + addps m4, m4, Z(0) ; r0 + mova m6, Z(2) + mova Z(4), m3 + mova Z(0), m4 + subps m3, m5, m0 ; t4 + subps m4, m6, m3 ; r3 + addps m3, m3, m6 ; r1 + mova Z2(6), m4 + mova Z(2), m3 + mova m2, Z(3) + addps m3, m5, m0 ; t6 + subps m2, m2, m1 ; i3 + mova m7, Z(1) + addps m1, m1, Z(3) ; i1 + mova Z2(7), m2 + mova Z(3), m1 + subps m4, m7, m3 ; i2 + addps m3, m3, m7 ; i0 + mova Z(5), m4 + mova Z(1), m3 +%endmacro + +; scheduled to avoid store->load aliasing +%macro PASS_BIG 1 ; (!interleave) + mova m4, Z(4) ; r2 + mova m5, Z(5) ; i2 + mova m0, [wq] ; wre + mova m1, [wq+o1q] ; wim + mulps m2, m4, m0 ; r2*wre + mova m6, Z2(6) ; r3 + mulps m3, m5, m1 ; i2*wim + mova m7, Z2(7) ; i3 + mulps m4, m4, m1 ; r2*wim + mulps m5, m5, m0 ; i2*wre + addps m2, m2, m3 ; r2*wre + i2*wim + mulps m3, m1, m7 ; i3*wim + mulps m1, m1, m6 ; r3*wim + subps m5, m5, m4 ; i2*wre - r2*wim + mulps m4, m0, m6 ; r3*wre + mulps m0, m0, m7 ; i3*wre + subps m4, m4, m3 ; r3*wre - i3*wim + mova m3, Z(0) + addps m0, m0, m1 ; i3*wre + r3*wim + subps m1, m4, m2 ; t3 + addps m4, m4, m2 ; t5 + subps m3, m3, m4 ; r2 + addps m4, m4, Z(0) ; r0 + 
mova m6, Z(2) + mova Z(4), m3 + mova Z(0), m4 + subps m3, m5, m0 ; t4 + subps m4, m6, m3 ; r3 + addps m3, m3, m6 ; r1 +IF%1 mova Z2(6), m4 +IF%1 mova Z(2), m3 + mova m2, Z(3) + addps m5, m5, m0 ; t6 + subps m2, m2, m1 ; i3 + mova m7, Z(1) + addps m1, m1, Z(3) ; i1 +IF%1 mova Z2(7), m2 +IF%1 mova Z(3), m1 + subps m6, m7, m5 ; i2 + addps m5, m5, m7 ; i0 +IF%1 mova Z(5), m6 +IF%1 mova Z(1), m5 +%if %1==0 + INTERL m1, m3, m7, Z, 2 + INTERL m2, m4, m0, Z2, 6 + + mova m1, Z(0) + mova m2, Z(4) + + INTERL m5, m1, m3, Z, 0 + INTERL m6, m2, m7, Z, 4 +%endif +%endmacro + +%macro PUNPCK 3 + mova %3, %1 + punpckldq %1, %2 + punpckhdq %3, %2 +%endmacro + +%define Z(x) [r0+mmsize*x] +%define Z2(x) [r0+mmsize*x] +%define ZH(x) [r0+mmsize*x+mmsize/2] + +INIT_YMM avx + +%if HAVE_AVX +align 16 +fft8_avx: + mova m0, Z(0) + mova m1, Z(1) + T8_AVX m0, m1, m2, m3, m4 + mova Z(0), m0 + mova Z(1), m1 + ret + + +align 16 +fft16_avx: + mova m2, Z(2) + mova m3, Z(3) + T4_SSE m2, m3, m7 + + mova m0, Z(0) + mova m1, Z(1) + T8_AVX m0, m1, m4, m5, m7 + + mova m4, [ps_cos16_1] + mova m5, [ps_cos16_2] + vmulps m6, m2, m4 + vmulps m7, m3, m5 + vaddps m7, m7, m6 + vmulps m2, m2, m5 + vmulps m3, m3, m4 + vsubps m3, m3, m2 + vblendps m2, m7, m3, 0xf0 + vperm2f128 m3, m7, m3, 0x21 + vaddps m4, m2, m3 + vsubps m2, m3, m2 + vperm2f128 m2, m2, m2, 0x01 + vsubps m3, m1, m2 + vaddps m1, m1, m2 + vsubps m5, m0, m4 + vaddps m0, m0, m4 + vextractf128 Z(0), m0, 0 + vextractf128 ZH(0), m1, 0 + vextractf128 Z(1), m0, 1 + vextractf128 ZH(1), m1, 1 + vextractf128 Z(2), m5, 0 + vextractf128 ZH(2), m3, 0 + vextractf128 Z(3), m5, 1 + vextractf128 ZH(3), m3, 1 + ret + +align 16 +fft32_avx: + call fft16_avx + + mova m0, Z(4) + mova m1, Z(5) + + T4_SSE m0, m1, m4 + + mova m2, Z(6) + mova m3, Z(7) + + T8_SSE m0, m1, m2, m3, m4, m6 + ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11} + ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15} + + vperm2f128 m4, m0, m2, 0x20 + vperm2f128 m5, m1, m3, 0x20 + vperm2f128 m6, m0, m2, 0x31 + vperm2f128 m7, m1, m3, 0x31 + + PASS_SMALL 0, [cos_32], [cos_32+32] + + ret + +fft32_interleave_avx: + call fft32_avx + mov r2d, 32 +.deint_loop: + mova m2, Z(0) + mova m3, Z(1) + vunpcklps m0, m2, m3 + vunpckhps m1, m2, m3 + vextractf128 Z(0), m0, 0 + vextractf128 ZH(0), m1, 0 + vextractf128 Z(1), m0, 1 + vextractf128 ZH(1), m1, 1 + add r0, mmsize*2 + sub r2d, mmsize/4 + jg .deint_loop + ret +%endif + +INIT_XMM sse + +align 16 +fft4_avx: +fft4_sse: + mova m0, Z(0) + mova m1, Z(1) + T4_SSE m0, m1, m2 + mova Z(0), m0 + mova Z(1), m1 + ret + +align 16 +fft8_sse: + mova m0, Z(0) + mova m1, Z(1) + T4_SSE m0, m1, m2 + mova m2, Z(2) + mova m3, Z(3) + T8_SSE m0, m1, m2, m3, m4, m5 + mova Z(0), m0 + mova Z(1), m1 + mova Z(2), m2 + mova Z(3), m3 + ret + +align 16 +fft16_sse: + mova m0, Z(0) + mova m1, Z(1) + T4_SSE m0, m1, m2 + mova m2, Z(2) + mova m3, Z(3) + T8_SSE m0, m1, m2, m3, m4, m5 + mova m4, Z(4) + mova m5, Z(5) + mova Z(0), m0 + mova Z(1), m1 + mova Z(2), m2 + mova Z(3), m3 + T4_SSE m4, m5, m6 + mova m6, Z2(6) + mova m7, Z2(7) + T4_SSE m6, m7, m0 + PASS_SMALL 0, [cos_16], [cos_16+16] + ret + + +%macro FFT48_3DNOW 0 +align 16 +fft4 %+ SUFFIX: + T2_3DNOW m0, m1, Z(0), Z(1) + mova m2, Z(2) + mova m3, Z(3) + T4_3DNOW m0, m1, m2, m3, m4, m5 + PUNPCK m0, m1, m4 + PUNPCK m2, m3, m5 + mova Z(0), m0 + mova Z(1), m4 + mova Z(2), m2 + mova Z(3), m5 + ret + +align 16 +fft8 %+ SUFFIX: + T2_3DNOW m0, m1, Z(0), Z(1) + mova m2, Z(2) + mova m3, Z(3) + T4_3DNOW m0, m1, m2, m3, m4, m5 + mova Z(0), m0 + mova 
Z(2), m2 + T2_3DNOW m4, m5, Z(4), Z(5) + T2_3DNOW m6, m7, Z2(6), Z2(7) + PSWAPD m0, m5 + PSWAPD m2, m7 + pxor m0, [ps_m1p1] + pxor m2, [ps_m1p1] + pfsub m5, m0 + pfadd m7, m2 + pfmul m5, [ps_root2] + pfmul m7, [ps_root2] + T4_3DNOW m1, m3, m5, m7, m0, m2 + mova Z(5), m5 + mova Z2(7), m7 + mova m0, Z(0) + mova m2, Z(2) + T4_3DNOW m0, m2, m4, m6, m5, m7 + PUNPCK m0, m1, m5 + PUNPCK m2, m3, m7 + mova Z(0), m0 + mova Z(1), m5 + mova Z(2), m2 + mova Z(3), m7 + PUNPCK m4, Z(5), m5 + PUNPCK m6, Z2(7), m7 + mova Z(4), m4 + mova Z(5), m5 + mova Z2(6), m6 + mova Z2(7), m7 + ret +%endmacro + +%if ARCH_X86_32 +%macro PSWAPD 2 +%if cpuflag(3dnowext) + pswapd %1, %2 +%elifidn %1, %2 + movd [r0+12], %1 + punpckhdq %1, [r0+8] +%else + movq %1, %2 + psrlq %1, 32 + punpckldq %1, %2 +%endif +%endmacro + +INIT_MMX 3dnowext +FFT48_3DNOW + +INIT_MMX 3dnow +FFT48_3DNOW +%endif + +%define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)] +%define Z2(x) [zcq + o3q + mmsize*(x&1)] +%define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2] +%define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2] + +%macro DECL_PASS 2+ ; name, payload +align 16 +%1: +DEFINE_ARGS zc, w, n, o1, o3 + lea o3q, [nq*3] + lea o1q, [nq*8] + shl o3q, 4 +.loop: + %2 + add zcq, mmsize*2 + add wq, mmsize + sub nd, mmsize/8 + jg .loop + rep ret +%endmacro + +%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs + lea r2, [dispatch_tab%1] + mov r2, [r2 + (%2q-2)*gprsize] +%ifdef PIC + lea r3, [$$] + add r2, r3 +%endif + call r2 +%endmacro ; FFT_DISPATCH + +INIT_YMM avx + +%if HAVE_AVX +%macro INTERL_AVX 5 + vunpckhps %3, %2, %1 + vunpcklps %2, %2, %1 + vextractf128 %4(%5), %2, 0 + vextractf128 %4 %+ H(%5), %3, 0 + vextractf128 %4(%5 + 1), %2, 1 + vextractf128 %4 %+ H(%5 + 1), %3, 1 +%endmacro + +%define INTERL INTERL_AVX + +DECL_PASS pass_avx, PASS_BIG 1 +DECL_PASS pass_interleave_avx, PASS_BIG 0 + +cglobal fft_calc, 2,5,8 + mov r3d, [r0 + FFTContext.nbits] + mov r0, r1 + mov r1, r3 + FFT_DISPATCH _interleave %+ SUFFIX, r1 + REP_RET + +%endif + +INIT_XMM sse + +%macro INTERL_SSE 5 + mova %3, %2 + unpcklps %2, %1 + unpckhps %3, %1 + mova %4(%5), %2 + mova %4(%5+1), %3 +%endmacro + +%define INTERL INTERL_SSE + +DECL_PASS pass_sse, PASS_BIG 1 +DECL_PASS pass_interleave_sse, PASS_BIG 0 + +%macro FFT_CALC_FUNC 0 +cglobal fft_calc, 2,5,8 + mov r3d, [r0 + FFTContext.nbits] + PUSH r1 + PUSH r3 + mov r0, r1 + mov r1, r3 + FFT_DISPATCH _interleave %+ SUFFIX, r1 + POP rcx + POP r4 + cmp rcx, 3+(mmsize/16) + jg .end + mov r2, -1 + add rcx, 3 + shl r2, cl + sub r4, r2 +.loop: +%if mmsize == 8 + PSWAPD m0, [r4 + r2 + 4] + mova [r4 + r2 + 4], m0 +%else + movaps xmm0, [r4 + r2] + movaps xmm1, xmm0 + unpcklps xmm0, [r4 + r2 + 16] + unpckhps xmm1, [r4 + r2 + 16] + movaps [r4 + r2], xmm0 + movaps [r4 + r2 + 16], xmm1 +%endif + add r2, mmsize*2 + jl .loop +.end: +%if cpuflag(3dnow) + femms + RET +%else + REP_RET +%endif +%endmacro + +%if ARCH_X86_32 +INIT_MMX 3dnow +FFT_CALC_FUNC +INIT_MMX 3dnowext +FFT_CALC_FUNC +%endif +INIT_XMM sse +FFT_CALC_FUNC + +cglobal fft_permute, 2,7,1 + mov r4, [r0 + FFTContext.revtab] + mov r5, [r0 + FFTContext.tmpbuf] + mov ecx, [r0 + FFTContext.nbits] + mov r2, 1 + shl r2, cl + xor r0, r0 +%if ARCH_X86_32 + mov r1, r1m +%endif +.loop: + movaps xmm0, [r1 + 8*r0] + movzx r6, word [r4 + 2*r0] + movzx r3, word [r4 + 2*r0 + 2] + movlps [r5 + 8*r6], xmm0 + movhps [r5 + 8*r3], xmm0 + add r0, 2 + cmp r0, r2 + jl .loop + shl r2, 3 + add r1, r2 + add r5, r2 + neg r2 +; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B +.loopcopy: + movaps xmm0, [r5 + r2] + 
movaps xmm1, [r5 + r2 + 16] + movaps [r1 + r2], xmm0 + movaps [r1 + r2 + 16], xmm1 + add r2, 32 + jl .loopcopy + REP_RET + +%macro IMDCT_CALC_FUNC 0 +cglobal imdct_calc, 3,5,3 + mov r3d, [r0 + FFTContext.mdctsize] + mov r4, [r0 + FFTContext.imdcthalf] + add r1, r3 + PUSH r3 + PUSH r1 +%if ARCH_X86_32 + push r2 + push r1 + push r0 +%else + sub rsp, 8 +%endif + call r4 +%if ARCH_X86_32 + add esp, 12 +%else + add rsp, 8 +%endif + POP r1 + POP r3 + lea r0, [r1 + 2*r3] + mov r2, r3 + sub r3, mmsize + neg r2 + mova m2, [ps_m1m1m1m1] +.loop: +%if mmsize == 8 + PSWAPD m0, [r1 + r3] + PSWAPD m1, [r0 + r2] + pxor m0, m2 +%else + mova m0, [r1 + r3] + mova m1, [r0 + r2] + shufps m0, m0, 0x1b + shufps m1, m1, 0x1b + xorps m0, m2 +%endif + mova [r0 + r3], m1 + mova [r1 + r2], m0 + sub r3, mmsize + add r2, mmsize + jl .loop +%if cpuflag(3dnow) + femms + RET +%else + REP_RET +%endif +%endmacro + +%if ARCH_X86_32 +INIT_MMX 3dnow +IMDCT_CALC_FUNC +INIT_MMX 3dnowext +IMDCT_CALC_FUNC +%endif + +INIT_XMM sse +IMDCT_CALC_FUNC + +%if ARCH_X86_32 +INIT_MMX 3dnow +%define mulps pfmul +%define addps pfadd +%define subps pfsub +%define unpcklps punpckldq +%define unpckhps punpckhdq +DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q] +DECL_PASS pass_interleave_3dnow, PASS_BIG 0 +%define pass_3dnowext pass_3dnow +%define pass_interleave_3dnowext pass_interleave_3dnow +%endif + +%ifdef PIC +%define SECTION_REL - $$ +%else +%define SECTION_REL +%endif + +%macro DECL_FFT 1-2 ; nbits, suffix +%ifidn %0, 1 +%xdefine fullsuffix SUFFIX +%else +%xdefine fullsuffix %2 %+ SUFFIX +%endif +%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL +%if %1>=5 +%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL +%endif +%if %1>=6 +%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL +%endif + +%assign n 1<<%1 +%rep 17-%1 +%assign n2 n/2 +%assign n4 n/4 +%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL + +align 16 +fft %+ n %+ fullsuffix: + call fft %+ n2 %+ SUFFIX + add r0, n*4 - (n&(-2<<%1)) + call fft %+ n4 %+ SUFFIX + add r0, n*2 - (n2&(-2<<%1)) + call fft %+ n4 %+ SUFFIX + sub r0, n*6 + (n2&(-2<<%1)) + lea r1, [cos_ %+ n] + mov r2d, n4/2 + jmp pass %+ fullsuffix + +%assign n n*2 +%endrep +%undef n + +align 8 +dispatch_tab %+ fullsuffix: pointer list_of_fft +%endmacro ; DECL_FFT + +%if HAVE_AVX +INIT_YMM avx +DECL_FFT 6 +DECL_FFT 6, _interleave +%endif +INIT_XMM sse +DECL_FFT 5 +DECL_FFT 5, _interleave +%if ARCH_X86_32 +INIT_MMX 3dnow +DECL_FFT 4 +DECL_FFT 4, _interleave +INIT_MMX 3dnowext +DECL_FFT 4 +DECL_FFT 4, _interleave +%endif + +INIT_XMM sse +%undef mulps +%undef addps +%undef subps +%undef unpcklps +%undef unpckhps + +%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8 +%if mmsize == 8 ; j*2+2-n4, n4-2-j*2, input+n4, tcos+n8, tsin+n8 + PSWAPD m0, [%3+%2*4] + movq m2, [%3+%1*4-8] + movq m3, m0 + punpckldq m0, m2 + punpckhdq m2, m3 + movd m1, [%4+%1*2-4] ; tcos[j] + movd m3, [%4+%2*2] ; tcos[n4-j-1] + punpckldq m1, [%5+%1*2-4] ; tsin[j] + punpckldq m3, [%5+%2*2] ; tsin[n4-j-1] + + mova m4, m0 + PSWAPD m5, m1 + pfmul m0, m1 + pfmul m4, m5 + mova m6, m2 + PSWAPD m5, m3 + pfmul m2, m3 + pfmul m6, m5 +%if cpuflag(3dnowext) + pfpnacc m0, m4 + pfpnacc m2, m6 +%else + SBUTTERFLY dq, 0, 4, 1 + SBUTTERFLY dq, 2, 6, 3 + pxor m4, m7 + pxor m6, m7 + pfadd m0, m4 + pfadd m2, m6 +%endif +%else + movaps xmm0, [%3+%2*4] + movaps xmm1, [%3+%1*4-0x10] + movaps xmm2, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm1, xmm2, 0x77 + movlps xmm4, [%4+%2*2] + movlps xmm5, 
[%5+%2*2+0x0] + movhps xmm4, [%4+%1*2-0x8] + movhps xmm5, [%5+%1*2-0x8] + movaps xmm2, xmm0 + movaps xmm3, xmm1 + mulps xmm0, xmm5 + mulps xmm1, xmm4 + mulps xmm2, xmm4 + mulps xmm3, xmm5 + subps xmm1, xmm0 + addps xmm2, xmm3 + movaps xmm0, xmm1 + unpcklps xmm1, xmm2 + unpckhps xmm0, xmm2 +%endif +%endmacro + +%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5 + mulps m6, %3, [%5+%1] + mulps m7, %2, [%5+%1] + mulps %2, %2, [%6+%1] + mulps %3, %3, [%6+%1] + subps %2, %2, m6 + addps %3, %3, m7 +%endmacro + +%macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8 +.post: + vmovaps ymm1, [%3+%1*2] + vmovaps ymm0, [%3+%1*2+0x20] + vmovaps ymm3, [%3+%2*2] + vmovaps ymm2, [%3+%2*2+0x20] + + CMUL %1, ymm0, ymm1, %3, %4, %5 + CMUL %2, ymm2, ymm3, %3, %4, %5 + vshufps ymm1, ymm1, ymm1, 0x1b + vshufps ymm3, ymm3, ymm3, 0x1b + vperm2f128 ymm1, ymm1, ymm1, 0x01 + vperm2f128 ymm3, ymm3, ymm3, 0x01 + vunpcklps ymm6, ymm2, ymm1 + vunpckhps ymm4, ymm2, ymm1 + vunpcklps ymm7, ymm0, ymm3 + vunpckhps ymm5, ymm0, ymm3 + + vextractf128 [%3+%1*2], ymm7, 0 + vextractf128 [%3+%1*2+0x10], ymm5, 0 + vextractf128 [%3+%1*2+0x20], ymm7, 1 + vextractf128 [%3+%1*2+0x30], ymm5, 1 + + vextractf128 [%3+%2*2], ymm6, 0 + vextractf128 [%3+%2*2+0x10], ymm4, 0 + vextractf128 [%3+%2*2+0x20], ymm6, 1 + vextractf128 [%3+%2*2+0x30], ymm4, 1 + sub %2, 0x20 + add %1, 0x20 + jl .post +%endmacro + +%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8 +.post: + movaps xmm1, [%3+%1*2] + movaps xmm0, [%3+%1*2+0x10] + CMUL %1, xmm0, xmm1, %3, %4, %5 + movaps xmm5, [%3+%2*2] + movaps xmm4, [%3+%2*2+0x10] + CMUL %2, xmm4, xmm5, %3, %4, %5 + shufps xmm1, xmm1, 0x1b + shufps xmm5, xmm5, 0x1b + movaps xmm6, xmm4 + unpckhps xmm4, xmm1 + unpcklps xmm6, xmm1 + movaps xmm2, xmm0 + unpcklps xmm0, xmm5 + unpckhps xmm2, xmm5 + movaps [%3+%2*2], xmm6 + movaps [%3+%2*2+0x10], xmm4 + movaps [%3+%1*2], xmm0 + movaps [%3+%1*2+0x10], xmm2 + sub %2, 0x10 + add %1, 0x10 + jl .post +%endmacro + +%macro CMUL_3DNOW 6 + mova m6, [%1+%2*2] + mova %3, [%1+%2*2+8] + mova %4, m6 + mova m7, %3 + pfmul m6, [%5+%2] + pfmul %3, [%6+%2] + pfmul %4, [%6+%2] + pfmul m7, [%5+%2] + pfsub %3, m6 + pfadd %4, m7 +%endmacro + +%macro POSROTATESHUF_3DNOW 5 ;j, k, z+n8, tcos+n8, tsin+n8 +.post: + CMUL_3DNOW %3, %1, m0, m1, %4, %5 + CMUL_3DNOW %3, %2, m2, m3, %4, %5 + movd [%3+%1*2+ 0], m0 + movd [%3+%2*2+12], m1 + movd [%3+%2*2+ 0], m2 + movd [%3+%1*2+12], m3 + psrlq m0, 32 + psrlq m1, 32 + psrlq m2, 32 + psrlq m3, 32 + movd [%3+%1*2+ 8], m0 + movd [%3+%2*2+ 4], m1 + movd [%3+%2*2+ 8], m2 + movd [%3+%1*2+ 4], m3 + sub %2, 8 + add %1, 8 + jl .post +%endmacro + +%macro DECL_IMDCT 1 +cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input +%if ARCH_X86_64 +%define rrevtab r7 +%define rtcos r8 +%define rtsin r9 +%else +%define rrevtab r6 +%define rtsin r6 +%define rtcos r5 +%endif + mov r3d, [r0+FFTContext.mdctsize] + add r2, r3 + shr r3, 1 + mov rtcos, [r0+FFTContext.tcos] + mov rtsin, [r0+FFTContext.tsin] + add rtcos, r3 + add rtsin, r3 +%if ARCH_X86_64 == 0 + push rtcos + push rtsin +%endif + shr r3, 1 + mov rrevtab, [r0+FFTContext.revtab] + add rrevtab, r3 +%if ARCH_X86_64 == 0 + push rrevtab +%endif + +%if mmsize == 8 + sub r3, 2 +%else + sub r3, 4 +%endif +%if ARCH_X86_64 || mmsize == 8 + xor r4, r4 + sub r4, r3 +%endif +%if notcpuflag(3dnowext) && mmsize == 8 + movd m7, [ps_m1m1m1m1] +%endif +.pre: +%if ARCH_X86_64 == 0 +;unspill +%if mmsize != 8 + xor r4, r4 + sub r4, r3 +%endif + mov rtcos, [esp+8] + mov rtsin, [esp+4] +%endif + + PREROTATER r4, r3, r2, rtcos, 
rtsin +%if mmsize == 8 + mov r6, [esp] ; rrevtab = ptr+n8 + movzx r5, word [rrevtab+r4-2] ; rrevtab[j] + movzx r6, word [rrevtab+r3] ; rrevtab[n4-j-1] + mova [r1+r5*8], m0 + mova [r1+r6*8], m2 + add r4, 2 + sub r3, 2 +%else +%if ARCH_X86_64 + movzx r5, word [rrevtab+r4-4] + movzx r6, word [rrevtab+r4-2] + movzx r10, word [rrevtab+r3] + movzx r11, word [rrevtab+r3+2] + movlps [r1+r5 *8], xmm0 + movhps [r1+r6 *8], xmm0 + movlps [r1+r10*8], xmm1 + movhps [r1+r11*8], xmm1 + add r4, 4 +%else + mov r6, [esp] + movzx r5, word [r6+r4-4] + movzx r4, word [r6+r4-2] + movlps [r1+r5*8], xmm0 + movhps [r1+r4*8], xmm0 + movzx r5, word [r6+r3] + movzx r4, word [r6+r3+2] + movlps [r1+r5*8], xmm1 + movhps [r1+r4*8], xmm1 +%endif + sub r3, 4 +%endif + jns .pre + + mov r5, r0 + mov r6, r1 + mov r0, r1 + mov r1d, [r5+FFTContext.nbits] + + FFT_DISPATCH SUFFIX, r1 + + mov r0d, [r5+FFTContext.mdctsize] + add r6, r0 + shr r0, 1 +%if ARCH_X86_64 == 0 +%define rtcos r2 +%define rtsin r3 + mov rtcos, [esp+8] + mov rtsin, [esp+4] +%endif + neg r0 + mov r1, -mmsize + sub r1, r0 + %1 r0, r1, r6, rtcos, rtsin +%if ARCH_X86_64 == 0 + add esp, 12 +%endif +%if mmsize == 8 + femms +%endif + RET +%endmacro + +DECL_IMDCT POSROTATESHUF + +%if ARCH_X86_32 +INIT_MMX 3dnow +DECL_IMDCT POSROTATESHUF_3DNOW + +INIT_MMX 3dnowext +DECL_IMDCT POSROTATESHUF_3DNOW +%endif + +INIT_YMM avx + +%if HAVE_AVX +DECL_IMDCT POSROTATESHUF_AVX +%endif diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm deleted file mode 100644 index 645253cbd3..0000000000 --- a/libavcodec/x86/fft_mmx.asm +++ /dev/null @@ -1,1105 +0,0 @@ -;****************************************************************************** -;* FFT transform with SSE/3DNow optimizations -;* Copyright (c) 2008 Loren Merritt -;* Copyright (c) 2011 Vitor Sessak -;* -;* This algorithm (though not any of the implementation details) is -;* based on libdjbfft by D. J. Bernstein. -;* -;* This file is part of Libav. -;* -;* Libav is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* Libav is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. -;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;****************************************************************************** - -; These functions are not individually interchangeable with the C versions. -; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results -; in blocks as conventient to the vector size. -; i.e. 
{4x real, 4x imaginary, 4x real, ...} (or 2x respectively) - -%include "x86inc.asm" -%include "x86util.asm" - -%if ARCH_X86_64 -%define pointer resq -%else -%define pointer resd -%endif - -struc FFTContext - .nbits: resd 1 - .reverse: resd 1 - .revtab: pointer 1 - .tmpbuf: pointer 1 - .mdctsize: resd 1 - .mdctbits: resd 1 - .tcos: pointer 1 - .tsin: pointer 1 - .fftperm: pointer 1 - .fftcalc: pointer 1 - .imdctcalc:pointer 1 - .imdcthalf:pointer 1 -endstruc - -SECTION_RODATA - -%define M_SQRT1_2 0.70710678118654752440 -%define M_COS_PI_1_8 0.923879532511287 -%define M_COS_PI_3_8 0.38268343236509 - -align 32 -ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8 -ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8 - -ps_root2: times 8 dd M_SQRT1_2 -ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 -ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0 - -perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01 -perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03 -ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2 -ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31 -ps_m1m1m1m1: times 4 dd 1<<31 -ps_m1p1: dd 1<<31, 0 - -%assign i 16 -%rep 13 -cextern cos_ %+ i -%assign i i<<1 -%endrep - -%if ARCH_X86_64 - %define pointer dq -%else - %define pointer dd -%endif - -%macro IF0 1+ -%endmacro -%macro IF1 1+ - %1 -%endmacro - -SECTION_TEXT - -%macro T2_3DNOW 4 ; z0, z1, mem0, mem1 - mova %1, %3 - mova %2, %1 - pfadd %1, %4 - pfsub %2, %4 -%endmacro - -%macro T4_3DNOW 6 ; z0, z1, z2, z3, tmp0, tmp1 - mova %5, %3 - pfsub %3, %4 - pfadd %5, %4 ; {t6,t5} - pxor %3, [ps_m1p1] ; {t8,t7} - mova %6, %1 - PSWAPD %3, %3 - pfadd %1, %5 ; {r0,i0} - pfsub %6, %5 ; {r2,i2} - mova %4, %2 - pfadd %2, %3 ; {r1,i1} - pfsub %4, %3 ; {r3,i3} - SWAP %3, %6 -%endmacro - -; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6} -; %2 = {r1,i1,r3,i3,r5,i5,r7,i7} -; %3, %4, %5 tmp -; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3} -; %2 = {r4,r5,r6,r7,i4,i5,i6,i7} -%macro T8_AVX 5 - vsubps %5, %1, %2 ; v = %1 - %2 - vaddps %3, %1, %2 ; w = %1 + %2 - vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1 - vpermilps %2, %2, [perm1] - vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6} - vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5} - vsubps %4, %5, %1 ; s = r - q - vaddps %1, %5, %1 ; u = r + q - vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8} - vshufps %5, %4, %1, 0xbb - vshufps %3, %4, %1, 0xee - vperm2f128 %3, %3, %5, 0x13 - vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {1,1,-1,-1,1,-1,-1,-1} - vshufps %2, %1, %4, 0xdd - vshufps %1, %1, %4, 0x88 - vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4} - vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7} - vsubps %5, %1, %3 - vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8} - vsubps %2, %4, %1 ; %2 = v - w - vaddps %1, %4, %1 ; %1 = v + w -%endmacro - -; In SSE mode do one fft4 transforms -; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3} -; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} -; -; In AVX mode do two fft4 transforms -; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7} -; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7} -%macro T4_SSE 3 - subps %3, %1, %2 ; {t3,t4,-t8,t7} - addps %1, %1, %2 ; {t1,t2,t6,t5} - xorps %3, %3, [ps_p1p1m1p1] - shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8} - shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4} - subps %3, %1, %2 ; 
{r2,i2,r3,i3} - addps %1, %1, %2 ; {r0,i0,r1,i1} - shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3} - shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3} -%endmacro - -; In SSE mode do one FFT8 -; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7} -; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %1={r4,r5,r6,r7} %2={i4,i5,i6,i7} -; -; In AVX mode do two FFT8 -; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11} -; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15} -; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11} -; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15} -%macro T8_SSE 6 - addps %6, %3, %4 ; {t1,t2,t3,t4} - subps %3, %3, %4 ; {r5,i5,r7,i7} - shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7} - mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7} - mulps %4, %4, [ps_root2] - addps %3, %3, %4 ; {t8,t7,ta,t9} - shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta} - shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8} - subps %3, %6, %4 ; {t6,t5,tc,tb} - addps %6, %6, %4 ; {t1,t2,t9,ta} - shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc} - shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb} - subps %3, %1, %6 ; {r4,r5,r6,r7} - addps %1, %1, %6 ; {r0,r1,r2,r3} - subps %4, %2, %5 ; {i4,i5,i6,i7} - addps %2, %2, %5 ; {i0,i1,i2,i3} -%endmacro - -; scheduled for cpu-bound sizes -%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim -IF%1 mova m4, Z(4) -IF%1 mova m5, Z(5) - mova m0, %2 ; wre - mova m1, %3 ; wim - mulps m2, m4, m0 ; r2*wre -IF%1 mova m6, Z2(6) - mulps m3, m5, m1 ; i2*wim -IF%1 mova m7, Z2(7) - mulps m4, m4, m1 ; r2*wim - mulps m5, m5, m0 ; i2*wre - addps m2, m2, m3 ; r2*wre + i2*wim - mulps m3, m1, m7 ; i3*wim - subps m5, m5, m4 ; i2*wre - r2*wim - mulps m1, m1, m6 ; r3*wim - mulps m4, m0, m6 ; r3*wre - mulps m0, m0, m7 ; i3*wre - subps m4, m4, m3 ; r3*wre - i3*wim - mova m3, Z(0) - addps m0, m0, m1 ; i3*wre + r3*wim - subps m1, m4, m2 ; t3 - addps m4, m4, m2 ; t5 - subps m3, m3, m4 ; r2 - addps m4, m4, Z(0) ; r0 - mova m6, Z(2) - mova Z(4), m3 - mova Z(0), m4 - subps m3, m5, m0 ; t4 - subps m4, m6, m3 ; r3 - addps m3, m3, m6 ; r1 - mova Z2(6), m4 - mova Z(2), m3 - mova m2, Z(3) - addps m3, m5, m0 ; t6 - subps m2, m2, m1 ; i3 - mova m7, Z(1) - addps m1, m1, Z(3) ; i1 - mova Z2(7), m2 - mova Z(3), m1 - subps m4, m7, m3 ; i2 - addps m3, m3, m7 ; i0 - mova Z(5), m4 - mova Z(1), m3 -%endmacro - -; scheduled to avoid store->load aliasing -%macro PASS_BIG 1 ; (!interleave) - mova m4, Z(4) ; r2 - mova m5, Z(5) ; i2 - mova m0, [wq] ; wre - mova m1, [wq+o1q] ; wim - mulps m2, m4, m0 ; r2*wre - mova m6, Z2(6) ; r3 - mulps m3, m5, m1 ; i2*wim - mova m7, Z2(7) ; i3 - mulps m4, m4, m1 ; r2*wim - mulps m5, m5, m0 ; i2*wre - addps m2, m2, m3 ; r2*wre + i2*wim - mulps m3, m1, m7 ; i3*wim - mulps m1, m1, m6 ; r3*wim - subps m5, m5, m4 ; i2*wre - r2*wim - mulps m4, m0, m6 ; r3*wre - mulps m0, m0, m7 ; i3*wre - subps m4, m4, m3 ; r3*wre - i3*wim - mova m3, Z(0) - addps m0, m0, m1 ; i3*wre + r3*wim - subps m1, m4, m2 ; t3 - addps m4, m4, m2 ; t5 - subps m3, m3, m4 ; r2 - addps m4, m4, Z(0) ; r0 - mova m6, Z(2) - mova Z(4), m3 - mova Z(0), m4 - subps m3, m5, m0 ; t4 - subps m4, m6, m3 ; r3 - addps m3, m3, m6 ; r1 -IF%1 mova Z2(6), m4 -IF%1 mova Z(2), m3 - mova m2, Z(3) - addps m5, m5, m0 ; t6 - subps m2, m2, m1 ; i3 - mova m7, Z(1) - addps m1, m1, Z(3) ; i1 -IF%1 mova Z2(7), m2 -IF%1 mova Z(3), m1 - subps m6, m7, m5 ; i2 - addps m5, m5, m7 ; i0 -IF%1 mova Z(5), m6 -IF%1 mova Z(1), m5 -%if %1==0 - INTERL m1, m3, m7, Z, 2 - INTERL m2, m4, m0, Z2, 6 - - mova m1, Z(0) - mova m2, Z(4) - - INTERL m5, 
m1, m3, Z, 0 - INTERL m6, m2, m7, Z, 4 -%endif -%endmacro - -%macro PUNPCK 3 - mova %3, %1 - punpckldq %1, %2 - punpckhdq %3, %2 -%endmacro - -%define Z(x) [r0+mmsize*x] -%define Z2(x) [r0+mmsize*x] -%define ZH(x) [r0+mmsize*x+mmsize/2] - -INIT_YMM avx - -%if HAVE_AVX -align 16 -fft8_avx: - mova m0, Z(0) - mova m1, Z(1) - T8_AVX m0, m1, m2, m3, m4 - mova Z(0), m0 - mova Z(1), m1 - ret - - -align 16 -fft16_avx: - mova m2, Z(2) - mova m3, Z(3) - T4_SSE m2, m3, m7 - - mova m0, Z(0) - mova m1, Z(1) - T8_AVX m0, m1, m4, m5, m7 - - mova m4, [ps_cos16_1] - mova m5, [ps_cos16_2] - vmulps m6, m2, m4 - vmulps m7, m3, m5 - vaddps m7, m7, m6 - vmulps m2, m2, m5 - vmulps m3, m3, m4 - vsubps m3, m3, m2 - vblendps m2, m7, m3, 0xf0 - vperm2f128 m3, m7, m3, 0x21 - vaddps m4, m2, m3 - vsubps m2, m3, m2 - vperm2f128 m2, m2, m2, 0x01 - vsubps m3, m1, m2 - vaddps m1, m1, m2 - vsubps m5, m0, m4 - vaddps m0, m0, m4 - vextractf128 Z(0), m0, 0 - vextractf128 ZH(0), m1, 0 - vextractf128 Z(1), m0, 1 - vextractf128 ZH(1), m1, 1 - vextractf128 Z(2), m5, 0 - vextractf128 ZH(2), m3, 0 - vextractf128 Z(3), m5, 1 - vextractf128 ZH(3), m3, 1 - ret - -align 16 -fft32_avx: - call fft16_avx - - mova m0, Z(4) - mova m1, Z(5) - - T4_SSE m0, m1, m4 - - mova m2, Z(6) - mova m3, Z(7) - - T8_SSE m0, m1, m2, m3, m4, m6 - ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11} - ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15} - - vperm2f128 m4, m0, m2, 0x20 - vperm2f128 m5, m1, m3, 0x20 - vperm2f128 m6, m0, m2, 0x31 - vperm2f128 m7, m1, m3, 0x31 - - PASS_SMALL 0, [cos_32], [cos_32+32] - - ret - -fft32_interleave_avx: - call fft32_avx - mov r2d, 32 -.deint_loop: - mova m2, Z(0) - mova m3, Z(1) - vunpcklps m0, m2, m3 - vunpckhps m1, m2, m3 - vextractf128 Z(0), m0, 0 - vextractf128 ZH(0), m1, 0 - vextractf128 Z(1), m0, 1 - vextractf128 ZH(1), m1, 1 - add r0, mmsize*2 - sub r2d, mmsize/4 - jg .deint_loop - ret -%endif - -INIT_XMM sse - -align 16 -fft4_avx: -fft4_sse: - mova m0, Z(0) - mova m1, Z(1) - T4_SSE m0, m1, m2 - mova Z(0), m0 - mova Z(1), m1 - ret - -align 16 -fft8_sse: - mova m0, Z(0) - mova m1, Z(1) - T4_SSE m0, m1, m2 - mova m2, Z(2) - mova m3, Z(3) - T8_SSE m0, m1, m2, m3, m4, m5 - mova Z(0), m0 - mova Z(1), m1 - mova Z(2), m2 - mova Z(3), m3 - ret - -align 16 -fft16_sse: - mova m0, Z(0) - mova m1, Z(1) - T4_SSE m0, m1, m2 - mova m2, Z(2) - mova m3, Z(3) - T8_SSE m0, m1, m2, m3, m4, m5 - mova m4, Z(4) - mova m5, Z(5) - mova Z(0), m0 - mova Z(1), m1 - mova Z(2), m2 - mova Z(3), m3 - T4_SSE m4, m5, m6 - mova m6, Z2(6) - mova m7, Z2(7) - T4_SSE m6, m7, m0 - PASS_SMALL 0, [cos_16], [cos_16+16] - ret - - -%macro FFT48_3DNOW 0 -align 16 -fft4 %+ SUFFIX: - T2_3DNOW m0, m1, Z(0), Z(1) - mova m2, Z(2) - mova m3, Z(3) - T4_3DNOW m0, m1, m2, m3, m4, m5 - PUNPCK m0, m1, m4 - PUNPCK m2, m3, m5 - mova Z(0), m0 - mova Z(1), m4 - mova Z(2), m2 - mova Z(3), m5 - ret - -align 16 -fft8 %+ SUFFIX: - T2_3DNOW m0, m1, Z(0), Z(1) - mova m2, Z(2) - mova m3, Z(3) - T4_3DNOW m0, m1, m2, m3, m4, m5 - mova Z(0), m0 - mova Z(2), m2 - T2_3DNOW m4, m5, Z(4), Z(5) - T2_3DNOW m6, m7, Z2(6), Z2(7) - PSWAPD m0, m5 - PSWAPD m2, m7 - pxor m0, [ps_m1p1] - pxor m2, [ps_m1p1] - pfsub m5, m0 - pfadd m7, m2 - pfmul m5, [ps_root2] - pfmul m7, [ps_root2] - T4_3DNOW m1, m3, m5, m7, m0, m2 - mova Z(5), m5 - mova Z2(7), m7 - mova m0, Z(0) - mova m2, Z(2) - T4_3DNOW m0, m2, m4, m6, m5, m7 - PUNPCK m0, m1, m5 - PUNPCK m2, m3, m7 - mova Z(0), m0 - mova Z(1), m5 - mova Z(2), m2 - mova Z(3), m7 - PUNPCK m4, Z(5), m5 - PUNPCK m6, Z2(7), m7 - mova 
Z(4), m4 - mova Z(5), m5 - mova Z2(6), m6 - mova Z2(7), m7 - ret -%endmacro - -%if ARCH_X86_32 -%macro PSWAPD 2 -%if cpuflag(3dnowext) - pswapd %1, %2 -%elifidn %1, %2 - movd [r0+12], %1 - punpckhdq %1, [r0+8] -%else - movq %1, %2 - psrlq %1, 32 - punpckldq %1, %2 -%endif -%endmacro - -INIT_MMX 3dnowext -FFT48_3DNOW - -INIT_MMX 3dnow -FFT48_3DNOW -%endif - -%define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)] -%define Z2(x) [zcq + o3q + mmsize*(x&1)] -%define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2] -%define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2] - -%macro DECL_PASS 2+ ; name, payload -align 16 -%1: -DEFINE_ARGS zc, w, n, o1, o3 - lea o3q, [nq*3] - lea o1q, [nq*8] - shl o3q, 4 -.loop: - %2 - add zcq, mmsize*2 - add wq, mmsize - sub nd, mmsize/8 - jg .loop - rep ret -%endmacro - -%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs - lea r2, [dispatch_tab%1] - mov r2, [r2 + (%2q-2)*gprsize] -%ifdef PIC - lea r3, [$$] - add r2, r3 -%endif - call r2 -%endmacro ; FFT_DISPATCH - -INIT_YMM avx - -%if HAVE_AVX -%macro INTERL_AVX 5 - vunpckhps %3, %2, %1 - vunpcklps %2, %2, %1 - vextractf128 %4(%5), %2, 0 - vextractf128 %4 %+ H(%5), %3, 0 - vextractf128 %4(%5 + 1), %2, 1 - vextractf128 %4 %+ H(%5 + 1), %3, 1 -%endmacro - -%define INTERL INTERL_AVX - -DECL_PASS pass_avx, PASS_BIG 1 -DECL_PASS pass_interleave_avx, PASS_BIG 0 - -cglobal fft_calc, 2,5,8 - mov r3d, [r0 + FFTContext.nbits] - mov r0, r1 - mov r1, r3 - FFT_DISPATCH _interleave %+ SUFFIX, r1 - REP_RET - -%endif - -INIT_XMM sse - -%macro INTERL_SSE 5 - mova %3, %2 - unpcklps %2, %1 - unpckhps %3, %1 - mova %4(%5), %2 - mova %4(%5+1), %3 -%endmacro - -%define INTERL INTERL_SSE - -DECL_PASS pass_sse, PASS_BIG 1 -DECL_PASS pass_interleave_sse, PASS_BIG 0 - -%macro FFT_CALC_FUNC 0 -cglobal fft_calc, 2,5,8 - mov r3d, [r0 + FFTContext.nbits] - PUSH r1 - PUSH r3 - mov r0, r1 - mov r1, r3 - FFT_DISPATCH _interleave %+ SUFFIX, r1 - POP rcx - POP r4 - cmp rcx, 3+(mmsize/16) - jg .end - mov r2, -1 - add rcx, 3 - shl r2, cl - sub r4, r2 -.loop: -%if mmsize == 8 - PSWAPD m0, [r4 + r2 + 4] - mova [r4 + r2 + 4], m0 -%else - movaps xmm0, [r4 + r2] - movaps xmm1, xmm0 - unpcklps xmm0, [r4 + r2 + 16] - unpckhps xmm1, [r4 + r2 + 16] - movaps [r4 + r2], xmm0 - movaps [r4 + r2 + 16], xmm1 -%endif - add r2, mmsize*2 - jl .loop -.end: -%if cpuflag(3dnow) - femms - RET -%else - REP_RET -%endif -%endmacro - -%if ARCH_X86_32 -INIT_MMX 3dnow -FFT_CALC_FUNC -INIT_MMX 3dnowext -FFT_CALC_FUNC -%endif -INIT_XMM sse -FFT_CALC_FUNC - -cglobal fft_permute, 2,7,1 - mov r4, [r0 + FFTContext.revtab] - mov r5, [r0 + FFTContext.tmpbuf] - mov ecx, [r0 + FFTContext.nbits] - mov r2, 1 - shl r2, cl - xor r0, r0 -%if ARCH_X86_32 - mov r1, r1m -%endif -.loop: - movaps xmm0, [r1 + 8*r0] - movzx r6, word [r4 + 2*r0] - movzx r3, word [r4 + 2*r0 + 2] - movlps [r5 + 8*r6], xmm0 - movhps [r5 + 8*r3], xmm0 - add r0, 2 - cmp r0, r2 - jl .loop - shl r2, 3 - add r1, r2 - add r5, r2 - neg r2 -; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B -.loopcopy: - movaps xmm0, [r5 + r2] - movaps xmm1, [r5 + r2 + 16] - movaps [r1 + r2], xmm0 - movaps [r1 + r2 + 16], xmm1 - add r2, 32 - jl .loopcopy - REP_RET - -%macro IMDCT_CALC_FUNC 0 -cglobal imdct_calc, 3,5,3 - mov r3d, [r0 + FFTContext.mdctsize] - mov r4, [r0 + FFTContext.imdcthalf] - add r1, r3 - PUSH r3 - PUSH r1 -%if ARCH_X86_32 - push r2 - push r1 - push r0 -%else - sub rsp, 8 -%endif - call r4 -%if ARCH_X86_32 - add esp, 12 -%else - add rsp, 8 -%endif - POP r1 - POP r3 - lea r0, [r1 + 2*r3] - mov r2, r3 - sub r3, mmsize - neg r2 - 
mova m2, [ps_m1m1m1m1] -.loop: -%if mmsize == 8 - PSWAPD m0, [r1 + r3] - PSWAPD m1, [r0 + r2] - pxor m0, m2 -%else - mova m0, [r1 + r3] - mova m1, [r0 + r2] - shufps m0, m0, 0x1b - shufps m1, m1, 0x1b - xorps m0, m2 -%endif - mova [r0 + r3], m1 - mova [r1 + r2], m0 - sub r3, mmsize - add r2, mmsize - jl .loop -%if cpuflag(3dnow) - femms - RET -%else - REP_RET -%endif -%endmacro - -%if ARCH_X86_32 -INIT_MMX 3dnow -IMDCT_CALC_FUNC -INIT_MMX 3dnowext -IMDCT_CALC_FUNC -%endif - -INIT_XMM sse -IMDCT_CALC_FUNC - -%if ARCH_X86_32 -INIT_MMX 3dnow -%define mulps pfmul -%define addps pfadd -%define subps pfsub -%define unpcklps punpckldq -%define unpckhps punpckhdq -DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q] -DECL_PASS pass_interleave_3dnow, PASS_BIG 0 -%define pass_3dnowext pass_3dnow -%define pass_interleave_3dnowext pass_interleave_3dnow -%endif - -%ifdef PIC -%define SECTION_REL - $$ -%else -%define SECTION_REL -%endif - -%macro DECL_FFT 1-2 ; nbits, suffix -%ifidn %0, 1 -%xdefine fullsuffix SUFFIX -%else -%xdefine fullsuffix %2 %+ SUFFIX -%endif -%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL -%if %1>=5 -%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL -%endif -%if %1>=6 -%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL -%endif - -%assign n 1<<%1 -%rep 17-%1 -%assign n2 n/2 -%assign n4 n/4 -%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL - -align 16 -fft %+ n %+ fullsuffix: - call fft %+ n2 %+ SUFFIX - add r0, n*4 - (n&(-2<<%1)) - call fft %+ n4 %+ SUFFIX - add r0, n*2 - (n2&(-2<<%1)) - call fft %+ n4 %+ SUFFIX - sub r0, n*6 + (n2&(-2<<%1)) - lea r1, [cos_ %+ n] - mov r2d, n4/2 - jmp pass %+ fullsuffix - -%assign n n*2 -%endrep -%undef n - -align 8 -dispatch_tab %+ fullsuffix: pointer list_of_fft -%endmacro ; DECL_FFT - -%if HAVE_AVX -INIT_YMM avx -DECL_FFT 6 -DECL_FFT 6, _interleave -%endif -INIT_XMM sse -DECL_FFT 5 -DECL_FFT 5, _interleave -%if ARCH_X86_32 -INIT_MMX 3dnow -DECL_FFT 4 -DECL_FFT 4, _interleave -INIT_MMX 3dnowext -DECL_FFT 4 -DECL_FFT 4, _interleave -%endif - -INIT_XMM sse -%undef mulps -%undef addps -%undef subps -%undef unpcklps -%undef unpckhps - -%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8 -%if mmsize == 8 ; j*2+2-n4, n4-2-j*2, input+n4, tcos+n8, tsin+n8 - PSWAPD m0, [%3+%2*4] - movq m2, [%3+%1*4-8] - movq m3, m0 - punpckldq m0, m2 - punpckhdq m2, m3 - movd m1, [%4+%1*2-4] ; tcos[j] - movd m3, [%4+%2*2] ; tcos[n4-j-1] - punpckldq m1, [%5+%1*2-4] ; tsin[j] - punpckldq m3, [%5+%2*2] ; tsin[n4-j-1] - - mova m4, m0 - PSWAPD m5, m1 - pfmul m0, m1 - pfmul m4, m5 - mova m6, m2 - PSWAPD m5, m3 - pfmul m2, m3 - pfmul m6, m5 -%if cpuflag(3dnowext) - pfpnacc m0, m4 - pfpnacc m2, m6 -%else - SBUTTERFLY dq, 0, 4, 1 - SBUTTERFLY dq, 2, 6, 3 - pxor m4, m7 - pxor m6, m7 - pfadd m0, m4 - pfadd m2, m6 -%endif -%else - movaps xmm0, [%3+%2*4] - movaps xmm1, [%3+%1*4-0x10] - movaps xmm2, xmm0 - shufps xmm0, xmm1, 0x88 - shufps xmm1, xmm2, 0x77 - movlps xmm4, [%4+%2*2] - movlps xmm5, [%5+%2*2+0x0] - movhps xmm4, [%4+%1*2-0x8] - movhps xmm5, [%5+%1*2-0x8] - movaps xmm2, xmm0 - movaps xmm3, xmm1 - mulps xmm0, xmm5 - mulps xmm1, xmm4 - mulps xmm2, xmm4 - mulps xmm3, xmm5 - subps xmm1, xmm0 - addps xmm2, xmm3 - movaps xmm0, xmm1 - unpcklps xmm1, xmm2 - unpckhps xmm0, xmm2 -%endif -%endmacro - -%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5 - mulps m6, %3, [%5+%1] - mulps m7, %2, [%5+%1] - mulps %2, %2, [%6+%1] - mulps %3, %3, [%6+%1] - subps %2, %2, m6 - addps %3, %3, m7 -%endmacro - -%macro 
POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8 -.post: - vmovaps ymm1, [%3+%1*2] - vmovaps ymm0, [%3+%1*2+0x20] - vmovaps ymm3, [%3+%2*2] - vmovaps ymm2, [%3+%2*2+0x20] - - CMUL %1, ymm0, ymm1, %3, %4, %5 - CMUL %2, ymm2, ymm3, %3, %4, %5 - vshufps ymm1, ymm1, ymm1, 0x1b - vshufps ymm3, ymm3, ymm3, 0x1b - vperm2f128 ymm1, ymm1, ymm1, 0x01 - vperm2f128 ymm3, ymm3, ymm3, 0x01 - vunpcklps ymm6, ymm2, ymm1 - vunpckhps ymm4, ymm2, ymm1 - vunpcklps ymm7, ymm0, ymm3 - vunpckhps ymm5, ymm0, ymm3 - - vextractf128 [%3+%1*2], ymm7, 0 - vextractf128 [%3+%1*2+0x10], ymm5, 0 - vextractf128 [%3+%1*2+0x20], ymm7, 1 - vextractf128 [%3+%1*2+0x30], ymm5, 1 - - vextractf128 [%3+%2*2], ymm6, 0 - vextractf128 [%3+%2*2+0x10], ymm4, 0 - vextractf128 [%3+%2*2+0x20], ymm6, 1 - vextractf128 [%3+%2*2+0x30], ymm4, 1 - sub %2, 0x20 - add %1, 0x20 - jl .post -%endmacro - -%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8 -.post: - movaps xmm1, [%3+%1*2] - movaps xmm0, [%3+%1*2+0x10] - CMUL %1, xmm0, xmm1, %3, %4, %5 - movaps xmm5, [%3+%2*2] - movaps xmm4, [%3+%2*2+0x10] - CMUL %2, xmm4, xmm5, %3, %4, %5 - shufps xmm1, xmm1, 0x1b - shufps xmm5, xmm5, 0x1b - movaps xmm6, xmm4 - unpckhps xmm4, xmm1 - unpcklps xmm6, xmm1 - movaps xmm2, xmm0 - unpcklps xmm0, xmm5 - unpckhps xmm2, xmm5 - movaps [%3+%2*2], xmm6 - movaps [%3+%2*2+0x10], xmm4 - movaps [%3+%1*2], xmm0 - movaps [%3+%1*2+0x10], xmm2 - sub %2, 0x10 - add %1, 0x10 - jl .post -%endmacro - -%macro CMUL_3DNOW 6 - mova m6, [%1+%2*2] - mova %3, [%1+%2*2+8] - mova %4, m6 - mova m7, %3 - pfmul m6, [%5+%2] - pfmul %3, [%6+%2] - pfmul %4, [%6+%2] - pfmul m7, [%5+%2] - pfsub %3, m6 - pfadd %4, m7 -%endmacro - -%macro POSROTATESHUF_3DNOW 5 ;j, k, z+n8, tcos+n8, tsin+n8 -.post: - CMUL_3DNOW %3, %1, m0, m1, %4, %5 - CMUL_3DNOW %3, %2, m2, m3, %4, %5 - movd [%3+%1*2+ 0], m0 - movd [%3+%2*2+12], m1 - movd [%3+%2*2+ 0], m2 - movd [%3+%1*2+12], m3 - psrlq m0, 32 - psrlq m1, 32 - psrlq m2, 32 - psrlq m3, 32 - movd [%3+%1*2+ 8], m0 - movd [%3+%2*2+ 4], m1 - movd [%3+%2*2+ 8], m2 - movd [%3+%1*2+ 4], m3 - sub %2, 8 - add %1, 8 - jl .post -%endmacro - -%macro DECL_IMDCT 1 -cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input -%if ARCH_X86_64 -%define rrevtab r7 -%define rtcos r8 -%define rtsin r9 -%else -%define rrevtab r6 -%define rtsin r6 -%define rtcos r5 -%endif - mov r3d, [r0+FFTContext.mdctsize] - add r2, r3 - shr r3, 1 - mov rtcos, [r0+FFTContext.tcos] - mov rtsin, [r0+FFTContext.tsin] - add rtcos, r3 - add rtsin, r3 -%if ARCH_X86_64 == 0 - push rtcos - push rtsin -%endif - shr r3, 1 - mov rrevtab, [r0+FFTContext.revtab] - add rrevtab, r3 -%if ARCH_X86_64 == 0 - push rrevtab -%endif - -%if mmsize == 8 - sub r3, 2 -%else - sub r3, 4 -%endif -%if ARCH_X86_64 || mmsize == 8 - xor r4, r4 - sub r4, r3 -%endif -%if notcpuflag(3dnowext) && mmsize == 8 - movd m7, [ps_m1m1m1m1] -%endif -.pre: -%if ARCH_X86_64 == 0 -;unspill -%if mmsize != 8 - xor r4, r4 - sub r4, r3 -%endif - mov rtcos, [esp+8] - mov rtsin, [esp+4] -%endif - - PREROTATER r4, r3, r2, rtcos, rtsin -%if mmsize == 8 - mov r6, [esp] ; rrevtab = ptr+n8 - movzx r5, word [rrevtab+r4-2] ; rrevtab[j] - movzx r6, word [rrevtab+r3] ; rrevtab[n4-j-1] - mova [r1+r5*8], m0 - mova [r1+r6*8], m2 - add r4, 2 - sub r3, 2 -%else -%if ARCH_X86_64 - movzx r5, word [rrevtab+r4-4] - movzx r6, word [rrevtab+r4-2] - movzx r10, word [rrevtab+r3] - movzx r11, word [rrevtab+r3+2] - movlps [r1+r5 *8], xmm0 - movhps [r1+r6 *8], xmm0 - movlps [r1+r10*8], xmm1 - movhps [r1+r11*8], xmm1 - add r4, 4 -%else - mov r6, 
[esp] - movzx r5, word [r6+r4-4] - movzx r4, word [r6+r4-2] - movlps [r1+r5*8], xmm0 - movhps [r1+r4*8], xmm0 - movzx r5, word [r6+r3] - movzx r4, word [r6+r3+2] - movlps [r1+r5*8], xmm1 - movhps [r1+r4*8], xmm1 -%endif - sub r3, 4 -%endif - jns .pre - - mov r5, r0 - mov r6, r1 - mov r0, r1 - mov r1d, [r5+FFTContext.nbits] - - FFT_DISPATCH SUFFIX, r1 - - mov r0d, [r5+FFTContext.mdctsize] - add r6, r0 - shr r0, 1 -%if ARCH_X86_64 == 0 -%define rtcos r2 -%define rtsin r3 - mov rtcos, [esp+8] - mov rtsin, [esp+4] -%endif - neg r0 - mov r1, -mmsize - sub r1, r0 - %1 r0, r1, r6, rtcos, rtsin -%if ARCH_X86_64 == 0 - add esp, 12 -%endif -%if mmsize == 8 - femms -%endif - RET -%endmacro - -DECL_IMDCT POSROTATESHUF - -%if ARCH_X86_32 -INIT_MMX 3dnow -DECL_IMDCT POSROTATESHUF_3DNOW - -INIT_MMX 3dnowext -DECL_IMDCT POSROTATESHUF_3DNOW -%endif - -INIT_YMM avx - -%if HAVE_AVX -DECL_IMDCT POSROTATESHUF_AVX -%endif diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c new file mode 100644 index 0000000000..fc1635de8b --- /dev/null +++ b/libavcodec/x86/h264_qpel.c @@ -0,0 +1,1291 @@ +/* + * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt + * Copyright (c) 2011 Daniel Kang + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "dsputil_mmx.h" + +#if HAVE_INLINE_ASM + +/***********************************/ +/* motion compensation */ + +#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\ + "mov"#q" "#C", "#T" \n\t"\ + "mov"#d" (%0), "#F" \n\t"\ + "paddw "#D", "#T" \n\t"\ + "psllw $2, "#T" \n\t"\ + "psubw "#B", "#T" \n\t"\ + "psubw "#E", "#T" \n\t"\ + "punpcklbw "#Z", "#F" \n\t"\ + "pmullw "MANGLE(ff_pw_5)", "#T"\n\t"\ + "paddw "MANGLE(ff_pw_16)", "#A"\n\t"\ + "add %2, %0 \n\t"\ + "paddw "#F", "#A" \n\t"\ + "paddw "#A", "#T" \n\t"\ + "psraw $5, "#T" \n\t"\ + "packuswb "#T", "#T" \n\t"\ + OP(T, (%1), A, d)\ + "add %3, %1 \n\t" + +#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\ + "mov"#q" "#C", "#T" \n\t"\ + "mov"#d" (%0), "#F" \n\t"\ + "paddw "#D", "#T" \n\t"\ + "psllw $2, "#T" \n\t"\ + "paddw "MANGLE(ff_pw_16)", "#A"\n\t"\ + "psubw "#B", "#T" \n\t"\ + "psubw "#E", "#T" \n\t"\ + "punpcklbw "#Z", "#F" \n\t"\ + "pmullw "MANGLE(ff_pw_5)", "#T"\n\t"\ + "paddw "#F", "#A" \n\t"\ + "add %2, %0 \n\t"\ + "paddw "#A", "#T" \n\t"\ + "mov"#q" "#T", "#OF"(%1) \n\t" + +#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q) +#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q) +#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa) +#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa) + + +#define QPEL_H264(OPNAME, OP, MMX)\ +static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + int h=4;\ +\ 
+ __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "movq "MANGLE(ff_pw_5) ", %%mm4\n\t"\ + "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\ + "1: \n\t"\ + "movd -1(%0), %%mm1 \n\t"\ + "movd (%0), %%mm2 \n\t"\ + "movd 1(%0), %%mm3 \n\t"\ + "movd 2(%0), %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpcklbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "paddw %%mm0, %%mm1 \n\t"\ + "paddw %%mm3, %%mm2 \n\t"\ + "movd -2(%0), %%mm0 \n\t"\ + "movd 3(%0), %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm3 \n\t"\ + "paddw %%mm3, %%mm0 \n\t"\ + "psllw $2, %%mm2 \n\t"\ + "psubw %%mm1, %%mm2 \n\t"\ + "pmullw %%mm4, %%mm2 \n\t"\ + "paddw %%mm5, %%mm0 \n\t"\ + "paddw %%mm2, %%mm0 \n\t"\ + "psraw $5, %%mm0 \n\t"\ + "packuswb %%mm0, %%mm0 \n\t"\ + OP(%%mm0, (%1),%%mm6, d)\ + "add %3, %0 \n\t"\ + "add %4, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a"(src), "+c"(dst), "+g"(h)\ + : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\ + : "memory"\ + );\ +}\ +static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ + int h=4;\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "movq %0, %%mm4 \n\t"\ + "movq %1, %%mm5 \n\t"\ + :: "m"(ff_pw_5), "m"(ff_pw_16)\ + );\ + do{\ + __asm__ volatile(\ + "movd -1(%0), %%mm1 \n\t"\ + "movd (%0), %%mm2 \n\t"\ + "movd 1(%0), %%mm3 \n\t"\ + "movd 2(%0), %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpcklbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "paddw %%mm0, %%mm1 \n\t"\ + "paddw %%mm3, %%mm2 \n\t"\ + "movd -2(%0), %%mm0 \n\t"\ + "movd 3(%0), %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm3 \n\t"\ + "paddw %%mm3, %%mm0 \n\t"\ + "psllw $2, %%mm2 \n\t"\ + "psubw %%mm1, %%mm2 \n\t"\ + "pmullw %%mm4, %%mm2 \n\t"\ + "paddw %%mm5, %%mm0 \n\t"\ + "paddw %%mm2, %%mm0 \n\t"\ + "movd (%2), %%mm3 \n\t"\ + "psraw $5, %%mm0 \n\t"\ + "packuswb %%mm0, %%mm0 \n\t"\ + PAVGB" %%mm3, %%mm0 \n\t"\ + OP(%%mm0, (%1),%%mm6, d)\ + "add %4, %0 \n\t"\ + "add %4, %1 \n\t"\ + "add %3, %2 \n\t"\ + : "+a"(src), "+c"(dst), "+d"(src2)\ + : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\ + : "memory"\ + );\ + }while(--h);\ +}\ +static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + src -= 2*srcStride;\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "movd (%0), %%mm0 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm1 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm2 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm3 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm4 \n\t"\ + "add %2, %0 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpcklbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm4 \n\t"\ + QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ + QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ + QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ + QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ + \ + : "+a"(src), "+c"(dst)\ + : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ + : "memory"\ + );\ +}\ +static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ + int h=4;\ + int w=3;\ + src -= 2*srcStride+2;\ + while(w--){\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "movd (%0), %%mm0 \n\t"\ + "add 
%2, %0 \n\t"\ + "movd (%0), %%mm1 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm2 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm3 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm4 \n\t"\ + "add %2, %0 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpcklbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm4 \n\t"\ + QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\ + QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\ + QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\ + QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\ + \ + : "+a"(src)\ + : "c"(tmp), "S"((x86_reg)srcStride)\ + : "memory"\ + );\ + tmp += 4;\ + src += 4 - 9*srcStride;\ + }\ + tmp -= 3*4;\ + __asm__ volatile(\ + "1: \n\t"\ + "movq (%0), %%mm0 \n\t"\ + "paddw 10(%0), %%mm0 \n\t"\ + "movq 2(%0), %%mm1 \n\t"\ + "paddw 8(%0), %%mm1 \n\t"\ + "movq 4(%0), %%mm2 \n\t"\ + "paddw 6(%0), %%mm2 \n\t"\ + "psubw %%mm1, %%mm0 \n\t"/*a-b (abccba)*/\ + "psraw $2, %%mm0 \n\t"/*(a-b)/4 */\ + "psubw %%mm1, %%mm0 \n\t"/*(a-b)/4-b */\ + "paddsw %%mm2, %%mm0 \n\t"\ + "psraw $2, %%mm0 \n\t"/*((a-b)/4-b+c)/4 */\ + "paddw %%mm2, %%mm0 \n\t"/*(a-5*b+20*c)/16 */\ + "psraw $6, %%mm0 \n\t"\ + "packuswb %%mm0, %%mm0 \n\t"\ + OP(%%mm0, (%1),%%mm7, d)\ + "add $24, %0 \n\t"\ + "add %3, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a"(tmp), "+c"(dst), "+g"(h)\ + : "S"((x86_reg)dstStride)\ + : "memory"\ + );\ +}\ +\ +static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + int h=8;\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\ + "1: \n\t"\ + "movq (%0), %%mm0 \n\t"\ + "movq 1(%0), %%mm2 \n\t"\ + "movq %%mm0, %%mm1 \n\t"\ + "movq %%mm2, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpckhbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpckhbw %%mm7, %%mm3 \n\t"\ + "paddw %%mm2, %%mm0 \n\t"\ + "paddw %%mm3, %%mm1 \n\t"\ + "psllw $2, %%mm0 \n\t"\ + "psllw $2, %%mm1 \n\t"\ + "movq -1(%0), %%mm2 \n\t"\ + "movq 2(%0), %%mm4 \n\t"\ + "movq %%mm2, %%mm3 \n\t"\ + "movq %%mm4, %%mm5 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpckhbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm4 \n\t"\ + "punpckhbw %%mm7, %%mm5 \n\t"\ + "paddw %%mm4, %%mm2 \n\t"\ + "paddw %%mm3, %%mm5 \n\t"\ + "psubw %%mm2, %%mm0 \n\t"\ + "psubw %%mm5, %%mm1 \n\t"\ + "pmullw %%mm6, %%mm0 \n\t"\ + "pmullw %%mm6, %%mm1 \n\t"\ + "movd -2(%0), %%mm2 \n\t"\ + "movd 7(%0), %%mm5 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpcklbw %%mm7, %%mm5 \n\t"\ + "paddw %%mm3, %%mm2 \n\t"\ + "paddw %%mm5, %%mm4 \n\t"\ + "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\ + "paddw %%mm5, %%mm2 \n\t"\ + "paddw %%mm5, %%mm4 \n\t"\ + "paddw %%mm2, %%mm0 \n\t"\ + "paddw %%mm4, %%mm1 \n\t"\ + "psraw $5, %%mm0 \n\t"\ + "psraw $5, %%mm1 \n\t"\ + "packuswb %%mm1, %%mm0 \n\t"\ + OP(%%mm0, (%1),%%mm5, q)\ + "add %3, %0 \n\t"\ + "add %4, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a"(src), "+c"(dst), "+g"(h)\ + : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\ + : "memory"\ + );\ +}\ +\ +static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ + int h=8;\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\ + "1: \n\t"\ + "movq (%0), %%mm0 \n\t"\ + "movq 1(%0), %%mm2 \n\t"\ + "movq %%mm0, %%mm1 \n\t"\ + "movq %%mm2, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpckhbw %%mm7, 
%%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpckhbw %%mm7, %%mm3 \n\t"\ + "paddw %%mm2, %%mm0 \n\t"\ + "paddw %%mm3, %%mm1 \n\t"\ + "psllw $2, %%mm0 \n\t"\ + "psllw $2, %%mm1 \n\t"\ + "movq -1(%0), %%mm2 \n\t"\ + "movq 2(%0), %%mm4 \n\t"\ + "movq %%mm2, %%mm3 \n\t"\ + "movq %%mm4, %%mm5 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpckhbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm4 \n\t"\ + "punpckhbw %%mm7, %%mm5 \n\t"\ + "paddw %%mm4, %%mm2 \n\t"\ + "paddw %%mm3, %%mm5 \n\t"\ + "psubw %%mm2, %%mm0 \n\t"\ + "psubw %%mm5, %%mm1 \n\t"\ + "pmullw %%mm6, %%mm0 \n\t"\ + "pmullw %%mm6, %%mm1 \n\t"\ + "movd -2(%0), %%mm2 \n\t"\ + "movd 7(%0), %%mm5 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpcklbw %%mm7, %%mm5 \n\t"\ + "paddw %%mm3, %%mm2 \n\t"\ + "paddw %%mm5, %%mm4 \n\t"\ + "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\ + "paddw %%mm5, %%mm2 \n\t"\ + "paddw %%mm5, %%mm4 \n\t"\ + "paddw %%mm2, %%mm0 \n\t"\ + "paddw %%mm4, %%mm1 \n\t"\ + "psraw $5, %%mm0 \n\t"\ + "psraw $5, %%mm1 \n\t"\ + "movq (%2), %%mm4 \n\t"\ + "packuswb %%mm1, %%mm0 \n\t"\ + PAVGB" %%mm4, %%mm0 \n\t"\ + OP(%%mm0, (%1),%%mm5, q)\ + "add %5, %0 \n\t"\ + "add %5, %1 \n\t"\ + "add %4, %2 \n\t"\ + "decl %3 \n\t"\ + "jg 1b \n\t"\ + : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\ + : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\ + : "memory"\ + );\ +}\ +\ +static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ + int w= 2;\ + src -= 2*srcStride;\ + \ + while(w--){\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "movd (%0), %%mm0 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm1 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm2 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm3 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm4 \n\t"\ + "add %2, %0 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpcklbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm4 \n\t"\ + QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ + QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ + QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ + QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ + QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ + QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ + QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ + QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ + "cmpl $16, %4 \n\t"\ + "jne 2f \n\t"\ + QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ + QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ + QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ + QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ + QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ + QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ + QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ + QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ + "2: \n\t"\ + \ + : "+a"(src), "+c"(dst)\ + : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "rm"(h)\ + : "memory"\ + );\ + src += 4-(h+5)*srcStride;\ + dst += 4-h*dstStride;\ + }\ +}\ +static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\ + int w = (size+8)>>2;\ + src -= 2*srcStride+2;\ + while(w--){\ + __asm__ volatile(\ + "pxor %%mm7, %%mm7 \n\t"\ + "movd (%0), %%mm0 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm1 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm2 \n\t"\ 
+ "add %2, %0 \n\t"\ + "movd (%0), %%mm3 \n\t"\ + "add %2, %0 \n\t"\ + "movd (%0), %%mm4 \n\t"\ + "add %2, %0 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpcklbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm4 \n\t"\ + QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\ + QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\ + QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\ + QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\ + QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\ + QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\ + QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\ + QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\ + "cmpl $16, %3 \n\t"\ + "jne 2f \n\t"\ + QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\ + QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\ + QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\ + QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\ + QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\ + QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\ + QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\ + QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\ + "2: \n\t"\ + : "+a"(src)\ + : "c"(tmp), "S"((x86_reg)srcStride), "rm"(size)\ + : "memory"\ + );\ + tmp += 4;\ + src += 4 - (size+5)*srcStride;\ + }\ +}\ +static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ + int w = size>>4;\ + do{\ + int h = size;\ + __asm__ volatile(\ + "1: \n\t"\ + "movq (%0), %%mm0 \n\t"\ + "movq 8(%0), %%mm3 \n\t"\ + "movq 2(%0), %%mm1 \n\t"\ + "movq 10(%0), %%mm4 \n\t"\ + "paddw %%mm4, %%mm0 \n\t"\ + "paddw %%mm3, %%mm1 \n\t"\ + "paddw 18(%0), %%mm3 \n\t"\ + "paddw 16(%0), %%mm4 \n\t"\ + "movq 4(%0), %%mm2 \n\t"\ + "movq 12(%0), %%mm5 \n\t"\ + "paddw 6(%0), %%mm2 \n\t"\ + "paddw 14(%0), %%mm5 \n\t"\ + "psubw %%mm1, %%mm0 \n\t"\ + "psubw %%mm4, %%mm3 \n\t"\ + "psraw $2, %%mm0 \n\t"\ + "psraw $2, %%mm3 \n\t"\ + "psubw %%mm1, %%mm0 \n\t"\ + "psubw %%mm4, %%mm3 \n\t"\ + "paddsw %%mm2, %%mm0 \n\t"\ + "paddsw %%mm5, %%mm3 \n\t"\ + "psraw $2, %%mm0 \n\t"\ + "psraw $2, %%mm3 \n\t"\ + "paddw %%mm2, %%mm0 \n\t"\ + "paddw %%mm5, %%mm3 \n\t"\ + "psraw $6, %%mm0 \n\t"\ + "psraw $6, %%mm3 \n\t"\ + "packuswb %%mm3, %%mm0 \n\t"\ + OP(%%mm0, (%1),%%mm7, q)\ + "add $48, %0 \n\t"\ + "add %3, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a"(tmp), "+c"(dst), "+g"(h)\ + : "S"((x86_reg)dstStride)\ + : "memory"\ + );\ + tmp += 8 - size*24;\ + dst += 8 - size*dstStride;\ + }while(w--);\ +}\ +\ +static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ +}\ +static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ + OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ +}\ +\ +static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ + src += 8*srcStride;\ + dst += 
8*dstStride;\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ +}\ +\ +static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ + src += 8*dstStride;\ + dst += 8*dstStride;\ + src2 += 8*src2Stride;\ + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ +}\ +\ +static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ + put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\ + OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ +}\ +static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ + OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\ +}\ +\ +static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ + OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\ +}\ +\ +static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ +{\ + __asm__ volatile(\ + "movq (%1), %%mm0 \n\t"\ + "movq 24(%1), %%mm1 \n\t"\ + "psraw $5, %%mm0 \n\t"\ + "psraw $5, %%mm1 \n\t"\ + "packuswb %%mm0, %%mm0 \n\t"\ + "packuswb %%mm1, %%mm1 \n\t"\ + PAVGB" (%0), %%mm0 \n\t"\ + PAVGB" (%0,%3), %%mm1 \n\t"\ + OP(%%mm0, (%2), %%mm4, d)\ + OP(%%mm1, (%2,%4), %%mm5, d)\ + "lea (%0,%3,2), %0 \n\t"\ + "lea (%2,%4,2), %2 \n\t"\ + "movq 48(%1), %%mm0 \n\t"\ + "movq 72(%1), %%mm1 \n\t"\ + "psraw $5, %%mm0 \n\t"\ + "psraw $5, %%mm1 \n\t"\ + "packuswb %%mm0, %%mm0 \n\t"\ + "packuswb %%mm1, %%mm1 \n\t"\ + PAVGB" (%0), %%mm0 \n\t"\ + PAVGB" (%0,%3), %%mm1 \n\t"\ + OP(%%mm0, (%2), %%mm4, d)\ + OP(%%mm1, (%2,%4), %%mm5, d)\ + :"+a"(src8), "+c"(src16), "+d"(dst)\ + :"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\ + :"memory");\ +}\ +static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ +{\ + do{\ + __asm__ volatile(\ + "movq (%1), %%mm0 \n\t"\ + "movq 8(%1), %%mm1 \n\t"\ + "movq 48(%1), %%mm2 \n\t"\ + "movq 8+48(%1), %%mm3 \n\t"\ + "psraw $5, %%mm0 \n\t"\ + "psraw $5, %%mm1 \n\t"\ + "psraw $5, %%mm2 \n\t"\ + "psraw $5, %%mm3 \n\t"\ + "packuswb %%mm1, %%mm0 \n\t"\ + "packuswb %%mm3, %%mm2 \n\t"\ + PAVGB" (%0), %%mm0 \n\t"\ + PAVGB" (%0,%3), %%mm2 \n\t"\ + OP(%%mm0, (%2), %%mm5, q)\ + OP(%%mm2, (%2,%4), %%mm5, q)\ + ::"a"(src8), "c"(src16), "d"(dst),\ + "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\ + :"memory");\ + src8 += 2L*src8Stride;\ + src16 += 48;\ + dst += 2L*dstStride;\ + }while(h-=2);\ +}\ +static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ +{\ + OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\ + OPNAME ## pixels8_l2_shift5_ 
## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\ +}\ + + +#if ARCH_X86_64 +#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ +static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ + int h=16;\ + __asm__ volatile(\ + "pxor %%xmm15, %%xmm15 \n\t"\ + "movdqa %6, %%xmm14 \n\t"\ + "movdqa %7, %%xmm13 \n\t"\ + "1: \n\t"\ + "lddqu 6(%0), %%xmm1 \n\t"\ + "lddqu -2(%0), %%xmm7 \n\t"\ + "movdqa %%xmm1, %%xmm0 \n\t"\ + "punpckhbw %%xmm15, %%xmm1 \n\t"\ + "punpcklbw %%xmm15, %%xmm0 \n\t"\ + "punpcklbw %%xmm15, %%xmm7 \n\t"\ + "movdqa %%xmm1, %%xmm2 \n\t"\ + "movdqa %%xmm0, %%xmm6 \n\t"\ + "movdqa %%xmm1, %%xmm3 \n\t"\ + "movdqa %%xmm0, %%xmm8 \n\t"\ + "movdqa %%xmm1, %%xmm4 \n\t"\ + "movdqa %%xmm0, %%xmm9 \n\t"\ + "movdqa %%xmm0, %%xmm12 \n\t"\ + "movdqa %%xmm1, %%xmm11 \n\t"\ + "palignr $10,%%xmm0, %%xmm11\n\t"\ + "palignr $10,%%xmm7, %%xmm12\n\t"\ + "palignr $2, %%xmm0, %%xmm4 \n\t"\ + "palignr $2, %%xmm7, %%xmm9 \n\t"\ + "palignr $4, %%xmm0, %%xmm3 \n\t"\ + "palignr $4, %%xmm7, %%xmm8 \n\t"\ + "palignr $6, %%xmm0, %%xmm2 \n\t"\ + "palignr $6, %%xmm7, %%xmm6 \n\t"\ + "paddw %%xmm0 ,%%xmm11 \n\t"\ + "palignr $8, %%xmm0, %%xmm1 \n\t"\ + "palignr $8, %%xmm7, %%xmm0 \n\t"\ + "paddw %%xmm12,%%xmm7 \n\t"\ + "paddw %%xmm3, %%xmm2 \n\t"\ + "paddw %%xmm8, %%xmm6 \n\t"\ + "paddw %%xmm4, %%xmm1 \n\t"\ + "paddw %%xmm9, %%xmm0 \n\t"\ + "psllw $2, %%xmm2 \n\t"\ + "psllw $2, %%xmm6 \n\t"\ + "psubw %%xmm1, %%xmm2 \n\t"\ + "psubw %%xmm0, %%xmm6 \n\t"\ + "paddw %%xmm13,%%xmm11 \n\t"\ + "paddw %%xmm13,%%xmm7 \n\t"\ + "pmullw %%xmm14,%%xmm2 \n\t"\ + "pmullw %%xmm14,%%xmm6 \n\t"\ + "lddqu (%2), %%xmm3 \n\t"\ + "paddw %%xmm11,%%xmm2 \n\t"\ + "paddw %%xmm7, %%xmm6 \n\t"\ + "psraw $5, %%xmm2 \n\t"\ + "psraw $5, %%xmm6 \n\t"\ + "packuswb %%xmm2,%%xmm6 \n\t"\ + "pavgb %%xmm3, %%xmm6 \n\t"\ + OP(%%xmm6, (%1), %%xmm4, dqa)\ + "add %5, %0 \n\t"\ + "add %5, %1 \n\t"\ + "add %4, %2 \n\t"\ + "decl %3 \n\t"\ + "jg 1b \n\t"\ + : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\ + : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ + "m"(ff_pw_5), "m"(ff_pw_16)\ + : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , \ + "%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" , \ + "%xmm8" , "%xmm9" , "%xmm10", "%xmm11", \ + "%xmm12", "%xmm13", "%xmm14", "%xmm15",)\ + "memory"\ + );\ +} +#else // ARCH_X86_64 +#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ +static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ + src += 8*dstStride;\ + dst += 8*dstStride;\ + src2 += 8*src2Stride;\ + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ + OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ +} +#endif // ARCH_X86_64 + +#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\ +static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ + int h=8;\ + __asm__ volatile(\ + "pxor %%xmm7, %%xmm7 \n\t"\ + "movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\ + "1: \n\t"\ + "lddqu -2(%0), %%xmm1 \n\t"\ + "movdqa %%xmm1, %%xmm0 \n\t"\ + "punpckhbw %%xmm7, %%xmm1 \n\t"\ + "punpcklbw %%xmm7, %%xmm0 \n\t"\ + "movdqa %%xmm1, %%xmm2 \n\t"\ + "movdqa %%xmm1, %%xmm3 \n\t"\ + "movdqa %%xmm1, %%xmm4 \n\t"\ 
+ "movdqa %%xmm1, %%xmm5 \n\t"\ + "palignr $2, %%xmm0, %%xmm4 \n\t"\ + "palignr $4, %%xmm0, %%xmm3 \n\t"\ + "palignr $6, %%xmm0, %%xmm2 \n\t"\ + "palignr $8, %%xmm0, %%xmm1 \n\t"\ + "palignr $10,%%xmm0, %%xmm5 \n\t"\ + "paddw %%xmm5, %%xmm0 \n\t"\ + "paddw %%xmm3, %%xmm2 \n\t"\ + "paddw %%xmm4, %%xmm1 \n\t"\ + "psllw $2, %%xmm2 \n\t"\ + "movq (%2), %%xmm3 \n\t"\ + "psubw %%xmm1, %%xmm2 \n\t"\ + "paddw "MANGLE(ff_pw_16)", %%xmm0\n\t"\ + "pmullw %%xmm6, %%xmm2 \n\t"\ + "paddw %%xmm0, %%xmm2 \n\t"\ + "psraw $5, %%xmm2 \n\t"\ + "packuswb %%xmm2, %%xmm2 \n\t"\ + "pavgb %%xmm3, %%xmm2 \n\t"\ + OP(%%xmm2, (%1), %%xmm4, q)\ + "add %5, %0 \n\t"\ + "add %5, %1 \n\t"\ + "add %4, %2 \n\t"\ + "decl %3 \n\t"\ + "jg 1b \n\t"\ + : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\ + : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\ + : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \ + "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\ + "memory"\ + );\ +}\ +QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ +\ +static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + int h=8;\ + __asm__ volatile(\ + "pxor %%xmm7, %%xmm7 \n\t"\ + "movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\ + "1: \n\t"\ + "lddqu -2(%0), %%xmm1 \n\t"\ + "movdqa %%xmm1, %%xmm0 \n\t"\ + "punpckhbw %%xmm7, %%xmm1 \n\t"\ + "punpcklbw %%xmm7, %%xmm0 \n\t"\ + "movdqa %%xmm1, %%xmm2 \n\t"\ + "movdqa %%xmm1, %%xmm3 \n\t"\ + "movdqa %%xmm1, %%xmm4 \n\t"\ + "movdqa %%xmm1, %%xmm5 \n\t"\ + "palignr $2, %%xmm0, %%xmm4 \n\t"\ + "palignr $4, %%xmm0, %%xmm3 \n\t"\ + "palignr $6, %%xmm0, %%xmm2 \n\t"\ + "palignr $8, %%xmm0, %%xmm1 \n\t"\ + "palignr $10,%%xmm0, %%xmm5 \n\t"\ + "paddw %%xmm5, %%xmm0 \n\t"\ + "paddw %%xmm3, %%xmm2 \n\t"\ + "paddw %%xmm4, %%xmm1 \n\t"\ + "psllw $2, %%xmm2 \n\t"\ + "psubw %%xmm1, %%xmm2 \n\t"\ + "paddw "MANGLE(ff_pw_16)", %%xmm0\n\t"\ + "pmullw %%xmm6, %%xmm2 \n\t"\ + "paddw %%xmm0, %%xmm2 \n\t"\ + "psraw $5, %%xmm2 \n\t"\ + "packuswb %%xmm2, %%xmm2 \n\t"\ + OP(%%xmm2, (%1), %%xmm4, q)\ + "add %3, %0 \n\t"\ + "add %4, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a"(src), "+c"(dst), "+g"(h)\ + : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride)\ + : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \ + "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\ + "memory"\ + );\ +}\ +static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ + src += 8*srcStride;\ + dst += 8*dstStride;\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ + OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ +}\ + +#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\ +static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ + src -= 2*srcStride;\ + \ + __asm__ volatile(\ + "pxor %%xmm7, %%xmm7 \n\t"\ + "movq (%0), %%xmm0 \n\t"\ + "add %2, %0 \n\t"\ + "movq (%0), %%xmm1 \n\t"\ + "add %2, %0 \n\t"\ + "movq (%0), %%xmm2 \n\t"\ + "add %2, %0 \n\t"\ + "movq (%0), %%xmm3 \n\t"\ + "add %2, %0 \n\t"\ + "movq (%0), %%xmm4 \n\t"\ + "add %2, %0 \n\t"\ + "punpcklbw %%xmm7, %%xmm0 \n\t"\ + "punpcklbw %%xmm7, %%xmm1 \n\t"\ + "punpcklbw %%xmm7, %%xmm2 \n\t"\ + "punpcklbw %%xmm7, %%xmm3 \n\t"\ + "punpcklbw %%xmm7, %%xmm4 \n\t"\ + QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ + QPEL_H264V_XMM(%%xmm1, 
%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ + QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ + QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ + QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\ + QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\ + QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ + QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ + "cmpl $16, %4 \n\t"\ + "jne 2f \n\t"\ + QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ + QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ + QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\ + QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\ + QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ + QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ + QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ + QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ + "2: \n\t"\ + \ + : "+a"(src), "+c"(dst)\ + : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "rm"(h)\ + : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \ + "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\ + "memory"\ + );\ +}\ +static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ +}\ +static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ + OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ + OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ +} + +static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){ + int w = (size+8)>>3; + src -= 2*srcStride+2; + while(w--){ + __asm__ volatile( + "pxor %%xmm7, %%xmm7 \n\t" + "movq (%0), %%xmm0 \n\t" + "add %2, %0 \n\t" + "movq (%0), %%xmm1 \n\t" + "add %2, %0 \n\t" + "movq (%0), %%xmm2 \n\t" + "add %2, %0 \n\t" + "movq (%0), %%xmm3 \n\t" + "add %2, %0 \n\t" + "movq (%0), %%xmm4 \n\t" + "add %2, %0 \n\t" + "punpcklbw %%xmm7, %%xmm0 \n\t" + "punpcklbw %%xmm7, %%xmm1 \n\t" + "punpcklbw %%xmm7, %%xmm2 \n\t" + "punpcklbw %%xmm7, %%xmm3 \n\t" + "punpcklbw %%xmm7, %%xmm4 \n\t" + QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48) + QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48) + QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48) + QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48) + QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48) + QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48) + QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48) + QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48) + "cmpl $16, %3 \n\t" + "jne 2f \n\t" + QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48) + QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48) + QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48) + QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48) + QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48) + QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48) + QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 
%%xmm1, 14*48) + QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48) + "2: \n\t" + : "+a"(src) + : "c"(tmp), "S"((x86_reg)srcStride), "rm"(size) + : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7",) + "memory" + ); + tmp += 8; + src += 8 - (size+5)*srcStride; + } +} + +#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\ +static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ + int h = size;\ + if(size == 16){\ + __asm__ volatile(\ + "1: \n\t"\ + "movdqa 32(%0), %%xmm4 \n\t"\ + "movdqa 16(%0), %%xmm5 \n\t"\ + "movdqa (%0), %%xmm7 \n\t"\ + "movdqa %%xmm4, %%xmm3 \n\t"\ + "movdqa %%xmm4, %%xmm2 \n\t"\ + "movdqa %%xmm4, %%xmm1 \n\t"\ + "movdqa %%xmm4, %%xmm0 \n\t"\ + "palignr $10, %%xmm5, %%xmm0 \n\t"\ + "palignr $8, %%xmm5, %%xmm1 \n\t"\ + "palignr $6, %%xmm5, %%xmm2 \n\t"\ + "palignr $4, %%xmm5, %%xmm3 \n\t"\ + "palignr $2, %%xmm5, %%xmm4 \n\t"\ + "paddw %%xmm5, %%xmm0 \n\t"\ + "paddw %%xmm4, %%xmm1 \n\t"\ + "paddw %%xmm3, %%xmm2 \n\t"\ + "movdqa %%xmm5, %%xmm6 \n\t"\ + "movdqa %%xmm5, %%xmm4 \n\t"\ + "movdqa %%xmm5, %%xmm3 \n\t"\ + "palignr $8, %%xmm7, %%xmm4 \n\t"\ + "palignr $2, %%xmm7, %%xmm6 \n\t"\ + "palignr $10, %%xmm7, %%xmm3 \n\t"\ + "paddw %%xmm6, %%xmm4 \n\t"\ + "movdqa %%xmm5, %%xmm6 \n\t"\ + "palignr $6, %%xmm7, %%xmm5 \n\t"\ + "palignr $4, %%xmm7, %%xmm6 \n\t"\ + "paddw %%xmm7, %%xmm3 \n\t"\ + "paddw %%xmm6, %%xmm5 \n\t"\ + \ + "psubw %%xmm1, %%xmm0 \n\t"\ + "psubw %%xmm4, %%xmm3 \n\t"\ + "psraw $2, %%xmm0 \n\t"\ + "psraw $2, %%xmm3 \n\t"\ + "psubw %%xmm1, %%xmm0 \n\t"\ + "psubw %%xmm4, %%xmm3 \n\t"\ + "paddw %%xmm2, %%xmm0 \n\t"\ + "paddw %%xmm5, %%xmm3 \n\t"\ + "psraw $2, %%xmm0 \n\t"\ + "psraw $2, %%xmm3 \n\t"\ + "paddw %%xmm2, %%xmm0 \n\t"\ + "paddw %%xmm5, %%xmm3 \n\t"\ + "psraw $6, %%xmm0 \n\t"\ + "psraw $6, %%xmm3 \n\t"\ + "packuswb %%xmm0, %%xmm3 \n\t"\ + OP(%%xmm3, (%1), %%xmm7, dqa)\ + "add $48, %0 \n\t"\ + "add %3, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a"(tmp), "+c"(dst), "+g"(h)\ + : "S"((x86_reg)dstStride)\ + : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \ + "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\ + "memory"\ + );\ + }else{\ + __asm__ volatile(\ + "1: \n\t"\ + "movdqa 16(%0), %%xmm1 \n\t"\ + "movdqa (%0), %%xmm0 \n\t"\ + "movdqa %%xmm1, %%xmm2 \n\t"\ + "movdqa %%xmm1, %%xmm3 \n\t"\ + "movdqa %%xmm1, %%xmm4 \n\t"\ + "movdqa %%xmm1, %%xmm5 \n\t"\ + "palignr $10, %%xmm0, %%xmm5 \n\t"\ + "palignr $8, %%xmm0, %%xmm4 \n\t"\ + "palignr $6, %%xmm0, %%xmm3 \n\t"\ + "palignr $4, %%xmm0, %%xmm2 \n\t"\ + "palignr $2, %%xmm0, %%xmm1 \n\t"\ + "paddw %%xmm5, %%xmm0 \n\t"\ + "paddw %%xmm4, %%xmm1 \n\t"\ + "paddw %%xmm3, %%xmm2 \n\t"\ + "psubw %%xmm1, %%xmm0 \n\t"\ + "psraw $2, %%xmm0 \n\t"\ + "psubw %%xmm1, %%xmm0 \n\t"\ + "paddw %%xmm2, %%xmm0 \n\t"\ + "psraw $2, %%xmm0 \n\t"\ + "paddw %%xmm2, %%xmm0 \n\t"\ + "psraw $6, %%xmm0 \n\t"\ + "packuswb %%xmm0, %%xmm0 \n\t"\ + OP(%%xmm0, (%1), %%xmm7, q)\ + "add $48, %0 \n\t"\ + "add %3, %1 \n\t"\ + "decl %2 \n\t"\ + " jnz 1b \n\t"\ + : "+a"(tmp), "+c"(dst), "+g"(h)\ + : "S"((x86_reg)dstStride)\ + : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \ + "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\ + "memory"\ + );\ + }\ +} + +#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\ +static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ + put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, 
srcStride, size);\ + OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ +}\ +static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ + OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\ +}\ +static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ + OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\ +}\ + +#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2 +#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2 +#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2 +#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2 +#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2 +#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2 +#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2 +#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2 + +#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2 +#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2 +#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2 +#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2 +#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2 +#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2 +#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2 +#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2 + +#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2 +#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2 +#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2 +#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2 + +#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2 +#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2 +#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2 +#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2 + +#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2 +#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2 + +#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \ +H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\ +H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\ +H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\ +H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\ + +static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){ + put_pixels16_sse2(dst, src, stride, 16); +} +static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){ + avg_pixels16_sse2(dst, src, stride, 16); +} +#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2 +#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2 + +#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \ +static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\ +}\ + +#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \ +static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## 
_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\ +}\ + +#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \ +static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\ +}\ + +#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \ +static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ + put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\ + OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ + uint8_t * const halfHV= temp;\ + int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ + assert(((int)temp & 7) == 0);\ + put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ + uint8_t * const halfHV= temp;\ + int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ + assert(((int)temp & 7) == 0);\ + put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ + OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## 
MMX(dst, src+stride, halfHV, stride, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ + uint8_t * const halfHV= temp;\ + int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ + assert(((int)temp & 7) == 0);\ + put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\ +}\ +\ +static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ + uint8_t * const halfHV= temp;\ + int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ + assert(((int)temp & 7) == 0);\ + put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ + OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\ +}\ + +#define H264_MC_4816(MMX)\ +H264_MC(put_, 4, MMX, 8)\ +H264_MC(put_, 8, MMX, 8)\ +H264_MC(put_, 16,MMX, 8)\ +H264_MC(avg_, 4, MMX, 8)\ +H264_MC(avg_, 8, MMX, 8)\ +H264_MC(avg_, 16,MMX, 8)\ + +#define H264_MC_816(QPEL, XMM)\ +QPEL(put_, 8, XMM, 16)\ +QPEL(put_, 16,XMM, 16)\ +QPEL(avg_, 8, XMM, 16)\ +QPEL(avg_, 16,XMM, 16)\ + +#define PAVGB "pavgusb" +QPEL_H264(put_, PUT_OP, 3dnow) +QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow) +#undef PAVGB +#define PAVGB "pavgb" +QPEL_H264(put_, PUT_OP, mmx2) +QPEL_H264(avg_, AVG_MMX2_OP, mmx2) +QPEL_H264_V_XMM(put_, PUT_OP, sse2) +QPEL_H264_V_XMM(avg_, AVG_MMX2_OP, sse2) +QPEL_H264_HV_XMM(put_, PUT_OP, sse2) +QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, sse2) +#if HAVE_SSSE3 +QPEL_H264_H_XMM(put_, PUT_OP, ssse3) +QPEL_H264_H_XMM(avg_, AVG_MMX2_OP, ssse3) +QPEL_H264_HV2_XMM(put_, PUT_OP, ssse3) +QPEL_H264_HV2_XMM(avg_, AVG_MMX2_OP, ssse3) +QPEL_H264_HV_XMM(put_, PUT_OP, ssse3) +QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, ssse3) +#endif +#undef PAVGB + +H264_MC_4816(3dnow) +H264_MC_4816(mmx2) +H264_MC_816(H264_MC_V, sse2) +H264_MC_816(H264_MC_HV, sse2) +#if HAVE_SSSE3 +H264_MC_816(H264_MC_H, ssse3) +H264_MC_816(H264_MC_HV, ssse3) +#endif + +#endif /* HAVE_INLINE_ASM */ + +//10bit +#define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \ +void ff_ ## OP ## _h264_qpel ## NUM ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT \ + (uint8_t *dst, uint8_t *src, int stride); + +#define LUMA_MC_ALL(DEPTH, TYPE, OPT) \ + LUMA_MC_OP(put, 4, DEPTH, TYPE, OPT) \ + LUMA_MC_OP(avg, 4, DEPTH, TYPE, OPT) \ + LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \ + LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \ + LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \ + LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT) + +#define LUMA_MC_816(DEPTH, TYPE, OPT) \ + LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \ + LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \ + LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \ + LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT) + +LUMA_MC_ALL(10, mc00, mmxext) +LUMA_MC_ALL(10, mc10, mmxext) +LUMA_MC_ALL(10, mc20, mmxext) +LUMA_MC_ALL(10, mc30, mmxext) +LUMA_MC_ALL(10, mc01, mmxext) +LUMA_MC_ALL(10, mc11, mmxext) +LUMA_MC_ALL(10, mc21, mmxext) +LUMA_MC_ALL(10, mc31, mmxext) +LUMA_MC_ALL(10, mc02, mmxext) +LUMA_MC_ALL(10, mc12, mmxext) +LUMA_MC_ALL(10, mc22, mmxext) +LUMA_MC_ALL(10, mc32, mmxext) +LUMA_MC_ALL(10, mc03, mmxext) +LUMA_MC_ALL(10, mc13, mmxext) +LUMA_MC_ALL(10, mc23, mmxext) +LUMA_MC_ALL(10, mc33, mmxext) + +LUMA_MC_816(10, mc00, sse2) +LUMA_MC_816(10, mc10, sse2) +LUMA_MC_816(10, mc10, sse2_cache64) +LUMA_MC_816(10, mc10, ssse3_cache64) +LUMA_MC_816(10, 
mc20, sse2) +LUMA_MC_816(10, mc20, sse2_cache64) +LUMA_MC_816(10, mc20, ssse3_cache64) +LUMA_MC_816(10, mc30, sse2) +LUMA_MC_816(10, mc30, sse2_cache64) +LUMA_MC_816(10, mc30, ssse3_cache64) +LUMA_MC_816(10, mc01, sse2) +LUMA_MC_816(10, mc11, sse2) +LUMA_MC_816(10, mc21, sse2) +LUMA_MC_816(10, mc31, sse2) +LUMA_MC_816(10, mc02, sse2) +LUMA_MC_816(10, mc12, sse2) +LUMA_MC_816(10, mc22, sse2) +LUMA_MC_816(10, mc32, sse2) +LUMA_MC_816(10, mc03, sse2) +LUMA_MC_816(10, mc13, sse2) +LUMA_MC_816(10, mc23, sse2) +LUMA_MC_816(10, mc33, sse2) + +#define QPEL16_OPMC(OP, MC, MMX)\ +void ff_ ## OP ## _h264_qpel16_ ## MC ## _10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\ + ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\ + src += 8*stride;\ + dst += 8*stride;\ + ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\ + ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\ +} + +#define QPEL16_OP(MC, MMX)\ +QPEL16_OPMC(put, MC, MMX)\ +QPEL16_OPMC(avg, MC, MMX) + +#define QPEL16(MMX)\ +QPEL16_OP(mc00, MMX)\ +QPEL16_OP(mc01, MMX)\ +QPEL16_OP(mc02, MMX)\ +QPEL16_OP(mc03, MMX)\ +QPEL16_OP(mc10, MMX)\ +QPEL16_OP(mc11, MMX)\ +QPEL16_OP(mc12, MMX)\ +QPEL16_OP(mc13, MMX)\ +QPEL16_OP(mc20, MMX)\ +QPEL16_OP(mc21, MMX)\ +QPEL16_OP(mc22, MMX)\ +QPEL16_OP(mc23, MMX)\ +QPEL16_OP(mc30, MMX)\ +QPEL16_OP(mc31, MMX)\ +QPEL16_OP(mc32, MMX)\ +QPEL16_OP(mc33, MMX) + +#if ARCH_X86_32 && HAVE_YASM // ARCH_X86_64 implies sse2+ +QPEL16(mmxext) +#endif diff --git a/libavcodec/x86/h264_qpel_mmx.c b/libavcodec/x86/h264_qpel_mmx.c deleted file mode 100644 index fc1635de8b..0000000000 --- a/libavcodec/x86/h264_qpel_mmx.c +++ /dev/null @@ -1,1291 +0,0 @@ -/* - * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt - * Copyright (c) 2011 Daniel Kang - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "dsputil_mmx.h" - -#if HAVE_INLINE_ASM - -/***********************************/ -/* motion compensation */ - -#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\ - "mov"#q" "#C", "#T" \n\t"\ - "mov"#d" (%0), "#F" \n\t"\ - "paddw "#D", "#T" \n\t"\ - "psllw $2, "#T" \n\t"\ - "psubw "#B", "#T" \n\t"\ - "psubw "#E", "#T" \n\t"\ - "punpcklbw "#Z", "#F" \n\t"\ - "pmullw "MANGLE(ff_pw_5)", "#T"\n\t"\ - "paddw "MANGLE(ff_pw_16)", "#A"\n\t"\ - "add %2, %0 \n\t"\ - "paddw "#F", "#A" \n\t"\ - "paddw "#A", "#T" \n\t"\ - "psraw $5, "#T" \n\t"\ - "packuswb "#T", "#T" \n\t"\ - OP(T, (%1), A, d)\ - "add %3, %1 \n\t" - -#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\ - "mov"#q" "#C", "#T" \n\t"\ - "mov"#d" (%0), "#F" \n\t"\ - "paddw "#D", "#T" \n\t"\ - "psllw $2, "#T" \n\t"\ - "paddw "MANGLE(ff_pw_16)", "#A"\n\t"\ - "psubw "#B", "#T" \n\t"\ - "psubw "#E", "#T" \n\t"\ - "punpcklbw "#Z", "#F" \n\t"\ - "pmullw "MANGLE(ff_pw_5)", "#T"\n\t"\ - "paddw "#F", "#A" \n\t"\ - "add %2, %0 \n\t"\ - "paddw "#A", "#T" \n\t"\ - "mov"#q" "#T", "#OF"(%1) \n\t" - -#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q) -#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q) -#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa) -#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa) - - -#define QPEL_H264(OPNAME, OP, MMX)\ -static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - int h=4;\ -\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movq "MANGLE(ff_pw_5) ", %%mm4\n\t"\ - "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\ - "1: \n\t"\ - "movd -1(%0), %%mm1 \n\t"\ - "movd (%0), %%mm2 \n\t"\ - "movd 1(%0), %%mm3 \n\t"\ - "movd 2(%0), %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "paddw %%mm0, %%mm1 \n\t"\ - "paddw %%mm3, %%mm2 \n\t"\ - "movd -2(%0), %%mm0 \n\t"\ - "movd 3(%0), %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "paddw %%mm3, %%mm0 \n\t"\ - "psllw $2, %%mm2 \n\t"\ - "psubw %%mm1, %%mm2 \n\t"\ - "pmullw %%mm4, %%mm2 \n\t"\ - "paddw %%mm5, %%mm0 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "packuswb %%mm0, %%mm0 \n\t"\ - OP(%%mm0, (%1),%%mm6, d)\ - "add %3, %0 \n\t"\ - "add %4, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(src), "+c"(dst), "+g"(h)\ - : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\ - : "memory"\ - );\ -}\ -static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ - int h=4;\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movq %0, %%mm4 \n\t"\ - "movq %1, %%mm5 \n\t"\ - :: "m"(ff_pw_5), "m"(ff_pw_16)\ - );\ - do{\ - __asm__ volatile(\ - "movd -1(%0), %%mm1 \n\t"\ - "movd (%0), %%mm2 \n\t"\ - "movd 1(%0), %%mm3 \n\t"\ - "movd 2(%0), %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "paddw %%mm0, %%mm1 \n\t"\ - "paddw %%mm3, %%mm2 \n\t"\ - "movd -2(%0), %%mm0 \n\t"\ - "movd 3(%0), %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpcklbw %%mm7, 
%%mm3 \n\t"\ - "paddw %%mm3, %%mm0 \n\t"\ - "psllw $2, %%mm2 \n\t"\ - "psubw %%mm1, %%mm2 \n\t"\ - "pmullw %%mm4, %%mm2 \n\t"\ - "paddw %%mm5, %%mm0 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "movd (%2), %%mm3 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "packuswb %%mm0, %%mm0 \n\t"\ - PAVGB" %%mm3, %%mm0 \n\t"\ - OP(%%mm0, (%1),%%mm6, d)\ - "add %4, %0 \n\t"\ - "add %4, %1 \n\t"\ - "add %3, %2 \n\t"\ - : "+a"(src), "+c"(dst), "+d"(src2)\ - : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\ - : "memory"\ - );\ - }while(--h);\ -}\ -static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - src -= 2*srcStride;\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movd (%0), %%mm0 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm1 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm2 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm3 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm4 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ - QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ - QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ - QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ - \ - : "+a"(src), "+c"(dst)\ - : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ - : "memory"\ - );\ -}\ -static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ - int h=4;\ - int w=3;\ - src -= 2*srcStride+2;\ - while(w--){\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movd (%0), %%mm0 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm1 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm2 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm3 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm4 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\ - QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\ - QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\ - QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\ - \ - : "+a"(src)\ - : "c"(tmp), "S"((x86_reg)srcStride)\ - : "memory"\ - );\ - tmp += 4;\ - src += 4 - 9*srcStride;\ - }\ - tmp -= 3*4;\ - __asm__ volatile(\ - "1: \n\t"\ - "movq (%0), %%mm0 \n\t"\ - "paddw 10(%0), %%mm0 \n\t"\ - "movq 2(%0), %%mm1 \n\t"\ - "paddw 8(%0), %%mm1 \n\t"\ - "movq 4(%0), %%mm2 \n\t"\ - "paddw 6(%0), %%mm2 \n\t"\ - "psubw %%mm1, %%mm0 \n\t"/*a-b (abccba)*/\ - "psraw $2, %%mm0 \n\t"/*(a-b)/4 */\ - "psubw %%mm1, %%mm0 \n\t"/*(a-b)/4-b */\ - "paddsw %%mm2, %%mm0 \n\t"\ - "psraw $2, %%mm0 \n\t"/*((a-b)/4-b+c)/4 */\ - "paddw %%mm2, %%mm0 \n\t"/*(a-5*b+20*c)/16 */\ - "psraw $6, %%mm0 \n\t"\ - "packuswb %%mm0, %%mm0 \n\t"\ - OP(%%mm0, (%1),%%mm7, d)\ - "add $24, %0 \n\t"\ - "add %3, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(tmp), "+c"(dst), "+g"(h)\ - : "S"((x86_reg)dstStride)\ - : "memory"\ - );\ -}\ -\ -static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - int h=8;\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\ - "1: \n\t"\ - "movq (%0), %%mm0 \n\t"\ - "movq 
1(%0), %%mm2 \n\t"\ - "movq %%mm0, %%mm1 \n\t"\ - "movq %%mm2, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpckhbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "paddw %%mm3, %%mm1 \n\t"\ - "psllw $2, %%mm0 \n\t"\ - "psllw $2, %%mm1 \n\t"\ - "movq -1(%0), %%mm2 \n\t"\ - "movq 2(%0), %%mm4 \n\t"\ - "movq %%mm2, %%mm3 \n\t"\ - "movq %%mm4, %%mm5 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - "punpckhbw %%mm7, %%mm5 \n\t"\ - "paddw %%mm4, %%mm2 \n\t"\ - "paddw %%mm3, %%mm5 \n\t"\ - "psubw %%mm2, %%mm0 \n\t"\ - "psubw %%mm5, %%mm1 \n\t"\ - "pmullw %%mm6, %%mm0 \n\t"\ - "pmullw %%mm6, %%mm1 \n\t"\ - "movd -2(%0), %%mm2 \n\t"\ - "movd 7(%0), %%mm5 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm5 \n\t"\ - "paddw %%mm3, %%mm2 \n\t"\ - "paddw %%mm5, %%mm4 \n\t"\ - "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\ - "paddw %%mm5, %%mm2 \n\t"\ - "paddw %%mm5, %%mm4 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "paddw %%mm4, %%mm1 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "psraw $5, %%mm1 \n\t"\ - "packuswb %%mm1, %%mm0 \n\t"\ - OP(%%mm0, (%1),%%mm5, q)\ - "add %3, %0 \n\t"\ - "add %4, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(src), "+c"(dst), "+g"(h)\ - : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\ - : "memory"\ - );\ -}\ -\ -static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ - int h=8;\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movq "MANGLE(ff_pw_5)", %%mm6\n\t"\ - "1: \n\t"\ - "movq (%0), %%mm0 \n\t"\ - "movq 1(%0), %%mm2 \n\t"\ - "movq %%mm0, %%mm1 \n\t"\ - "movq %%mm2, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpckhbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "paddw %%mm3, %%mm1 \n\t"\ - "psllw $2, %%mm0 \n\t"\ - "psllw $2, %%mm1 \n\t"\ - "movq -1(%0), %%mm2 \n\t"\ - "movq 2(%0), %%mm4 \n\t"\ - "movq %%mm2, %%mm3 \n\t"\ - "movq %%mm4, %%mm5 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpckhbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - "punpckhbw %%mm7, %%mm5 \n\t"\ - "paddw %%mm4, %%mm2 \n\t"\ - "paddw %%mm3, %%mm5 \n\t"\ - "psubw %%mm2, %%mm0 \n\t"\ - "psubw %%mm5, %%mm1 \n\t"\ - "pmullw %%mm6, %%mm0 \n\t"\ - "pmullw %%mm6, %%mm1 \n\t"\ - "movd -2(%0), %%mm2 \n\t"\ - "movd 7(%0), %%mm5 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm5 \n\t"\ - "paddw %%mm3, %%mm2 \n\t"\ - "paddw %%mm5, %%mm4 \n\t"\ - "movq "MANGLE(ff_pw_16)", %%mm5\n\t"\ - "paddw %%mm5, %%mm2 \n\t"\ - "paddw %%mm5, %%mm4 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "paddw %%mm4, %%mm1 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "psraw $5, %%mm1 \n\t"\ - "movq (%2), %%mm4 \n\t"\ - "packuswb %%mm1, %%mm0 \n\t"\ - PAVGB" %%mm4, %%mm0 \n\t"\ - OP(%%mm0, (%1),%%mm5, q)\ - "add %5, %0 \n\t"\ - "add %5, %1 \n\t"\ - "add %4, %2 \n\t"\ - "decl %3 \n\t"\ - "jg 1b \n\t"\ - : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\ - : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\ - : "memory"\ - );\ -}\ -\ -static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ - int w= 2;\ - src -= 2*srcStride;\ - \ - while(w--){\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movd (%0), %%mm0 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm1 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm2 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm3 \n\t"\ - 
"add %2, %0 \n\t"\ - "movd (%0), %%mm4 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ - QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ - QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ - QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ - QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ - QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ - QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ - QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ - "cmpl $16, %4 \n\t"\ - "jne 2f \n\t"\ - QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ - QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ - QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ - QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ - QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ - QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\ - QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ - QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ - "2: \n\t"\ - \ - : "+a"(src), "+c"(dst)\ - : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "rm"(h)\ - : "memory"\ - );\ - src += 4-(h+5)*srcStride;\ - dst += 4-h*dstStride;\ - }\ -}\ -static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\ - int w = (size+8)>>2;\ - src -= 2*srcStride+2;\ - while(w--){\ - __asm__ volatile(\ - "pxor %%mm7, %%mm7 \n\t"\ - "movd (%0), %%mm0 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm1 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm2 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm3 \n\t"\ - "add %2, %0 \n\t"\ - "movd (%0), %%mm4 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm1 \n\t"\ - "punpcklbw %%mm7, %%mm2 \n\t"\ - "punpcklbw %%mm7, %%mm3 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\ - QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\ - QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\ - QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\ - QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\ - QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\ - QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\ - QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\ - "cmpl $16, %3 \n\t"\ - "jne 2f \n\t"\ - QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\ - QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\ - QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\ - QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\ - QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\ - QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\ - QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\ - QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\ - "2: \n\t"\ - : "+a"(src)\ - : "c"(tmp), "S"((x86_reg)srcStride), "rm"(size)\ - : "memory"\ - );\ - tmp += 4;\ - src += 4 - (size+5)*srcStride;\ - }\ -}\ -static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ - int w = size>>4;\ - do{\ - int h = size;\ - __asm__ volatile(\ - "1: \n\t"\ - "movq (%0), %%mm0 \n\t"\ - "movq 
8(%0), %%mm3 \n\t"\ - "movq 2(%0), %%mm1 \n\t"\ - "movq 10(%0), %%mm4 \n\t"\ - "paddw %%mm4, %%mm0 \n\t"\ - "paddw %%mm3, %%mm1 \n\t"\ - "paddw 18(%0), %%mm3 \n\t"\ - "paddw 16(%0), %%mm4 \n\t"\ - "movq 4(%0), %%mm2 \n\t"\ - "movq 12(%0), %%mm5 \n\t"\ - "paddw 6(%0), %%mm2 \n\t"\ - "paddw 14(%0), %%mm5 \n\t"\ - "psubw %%mm1, %%mm0 \n\t"\ - "psubw %%mm4, %%mm3 \n\t"\ - "psraw $2, %%mm0 \n\t"\ - "psraw $2, %%mm3 \n\t"\ - "psubw %%mm1, %%mm0 \n\t"\ - "psubw %%mm4, %%mm3 \n\t"\ - "paddsw %%mm2, %%mm0 \n\t"\ - "paddsw %%mm5, %%mm3 \n\t"\ - "psraw $2, %%mm0 \n\t"\ - "psraw $2, %%mm3 \n\t"\ - "paddw %%mm2, %%mm0 \n\t"\ - "paddw %%mm5, %%mm3 \n\t"\ - "psraw $6, %%mm0 \n\t"\ - "psraw $6, %%mm3 \n\t"\ - "packuswb %%mm3, %%mm0 \n\t"\ - OP(%%mm0, (%1),%%mm7, q)\ - "add $48, %0 \n\t"\ - "add %3, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(tmp), "+c"(dst), "+g"(h)\ - : "S"((x86_reg)dstStride)\ - : "memory"\ - );\ - tmp += 8 - size*24;\ - dst += 8 - size*dstStride;\ - }while(w--);\ -}\ -\ -static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ -}\ -static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ -}\ -\ -static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ - src += 8*srcStride;\ - dst += 8*dstStride;\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ -}\ -\ -static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ - src += 8*dstStride;\ - dst += 8*dstStride;\ - src2 += 8*src2Stride;\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ -}\ -\ -static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ - put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\ - OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ -}\ -static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ - OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\ -}\ -\ -static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ - OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\ -}\ -\ -static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ -{\ 
- __asm__ volatile(\ - "movq (%1), %%mm0 \n\t"\ - "movq 24(%1), %%mm1 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "psraw $5, %%mm1 \n\t"\ - "packuswb %%mm0, %%mm0 \n\t"\ - "packuswb %%mm1, %%mm1 \n\t"\ - PAVGB" (%0), %%mm0 \n\t"\ - PAVGB" (%0,%3), %%mm1 \n\t"\ - OP(%%mm0, (%2), %%mm4, d)\ - OP(%%mm1, (%2,%4), %%mm5, d)\ - "lea (%0,%3,2), %0 \n\t"\ - "lea (%2,%4,2), %2 \n\t"\ - "movq 48(%1), %%mm0 \n\t"\ - "movq 72(%1), %%mm1 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "psraw $5, %%mm1 \n\t"\ - "packuswb %%mm0, %%mm0 \n\t"\ - "packuswb %%mm1, %%mm1 \n\t"\ - PAVGB" (%0), %%mm0 \n\t"\ - PAVGB" (%0,%3), %%mm1 \n\t"\ - OP(%%mm0, (%2), %%mm4, d)\ - OP(%%mm1, (%2,%4), %%mm5, d)\ - :"+a"(src8), "+c"(src16), "+d"(dst)\ - :"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\ - :"memory");\ -}\ -static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ -{\ - do{\ - __asm__ volatile(\ - "movq (%1), %%mm0 \n\t"\ - "movq 8(%1), %%mm1 \n\t"\ - "movq 48(%1), %%mm2 \n\t"\ - "movq 8+48(%1), %%mm3 \n\t"\ - "psraw $5, %%mm0 \n\t"\ - "psraw $5, %%mm1 \n\t"\ - "psraw $5, %%mm2 \n\t"\ - "psraw $5, %%mm3 \n\t"\ - "packuswb %%mm1, %%mm0 \n\t"\ - "packuswb %%mm3, %%mm2 \n\t"\ - PAVGB" (%0), %%mm0 \n\t"\ - PAVGB" (%0,%3), %%mm2 \n\t"\ - OP(%%mm0, (%2), %%mm5, q)\ - OP(%%mm2, (%2,%4), %%mm5, q)\ - ::"a"(src8), "c"(src16), "d"(dst),\ - "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\ - :"memory");\ - src8 += 2L*src8Stride;\ - src16 += 48;\ - dst += 2L*dstStride;\ - }while(h-=2);\ -}\ -static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ -{\ - OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\ - OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\ -}\ - - -#if ARCH_X86_64 -#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ -static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ - int h=16;\ - __asm__ volatile(\ - "pxor %%xmm15, %%xmm15 \n\t"\ - "movdqa %6, %%xmm14 \n\t"\ - "movdqa %7, %%xmm13 \n\t"\ - "1: \n\t"\ - "lddqu 6(%0), %%xmm1 \n\t"\ - "lddqu -2(%0), %%xmm7 \n\t"\ - "movdqa %%xmm1, %%xmm0 \n\t"\ - "punpckhbw %%xmm15, %%xmm1 \n\t"\ - "punpcklbw %%xmm15, %%xmm0 \n\t"\ - "punpcklbw %%xmm15, %%xmm7 \n\t"\ - "movdqa %%xmm1, %%xmm2 \n\t"\ - "movdqa %%xmm0, %%xmm6 \n\t"\ - "movdqa %%xmm1, %%xmm3 \n\t"\ - "movdqa %%xmm0, %%xmm8 \n\t"\ - "movdqa %%xmm1, %%xmm4 \n\t"\ - "movdqa %%xmm0, %%xmm9 \n\t"\ - "movdqa %%xmm0, %%xmm12 \n\t"\ - "movdqa %%xmm1, %%xmm11 \n\t"\ - "palignr $10,%%xmm0, %%xmm11\n\t"\ - "palignr $10,%%xmm7, %%xmm12\n\t"\ - "palignr $2, %%xmm0, %%xmm4 \n\t"\ - "palignr $2, %%xmm7, %%xmm9 \n\t"\ - "palignr $4, %%xmm0, %%xmm3 \n\t"\ - "palignr $4, %%xmm7, %%xmm8 \n\t"\ - "palignr $6, %%xmm0, %%xmm2 \n\t"\ - "palignr $6, %%xmm7, %%xmm6 \n\t"\ - "paddw %%xmm0 ,%%xmm11 \n\t"\ - "palignr $8, %%xmm0, %%xmm1 \n\t"\ - "palignr $8, %%xmm7, %%xmm0 \n\t"\ - "paddw %%xmm12,%%xmm7 \n\t"\ - "paddw %%xmm3, %%xmm2 \n\t"\ - "paddw %%xmm8, %%xmm6 \n\t"\ - "paddw %%xmm4, %%xmm1 \n\t"\ - "paddw %%xmm9, %%xmm0 \n\t"\ - "psllw $2, %%xmm2 \n\t"\ - "psllw $2, %%xmm6 \n\t"\ - "psubw %%xmm1, %%xmm2 \n\t"\ - "psubw %%xmm0, %%xmm6 \n\t"\ - "paddw %%xmm13,%%xmm11 \n\t"\ - "paddw %%xmm13,%%xmm7 \n\t"\ - "pmullw %%xmm14,%%xmm2 \n\t"\ - "pmullw %%xmm14,%%xmm6 \n\t"\ - "lddqu (%2), %%xmm3 \n\t"\ - "paddw %%xmm11,%%xmm2 \n\t"\ - "paddw %%xmm7, 
%%xmm6 \n\t"\ - "psraw $5, %%xmm2 \n\t"\ - "psraw $5, %%xmm6 \n\t"\ - "packuswb %%xmm2,%%xmm6 \n\t"\ - "pavgb %%xmm3, %%xmm6 \n\t"\ - OP(%%xmm6, (%1), %%xmm4, dqa)\ - "add %5, %0 \n\t"\ - "add %5, %1 \n\t"\ - "add %4, %2 \n\t"\ - "decl %3 \n\t"\ - "jg 1b \n\t"\ - : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\ - : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ - "m"(ff_pw_5), "m"(ff_pw_16)\ - : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , \ - "%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" , \ - "%xmm8" , "%xmm9" , "%xmm10", "%xmm11", \ - "%xmm12", "%xmm13", "%xmm14", "%xmm15",)\ - "memory"\ - );\ -} -#else // ARCH_X86_64 -#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ -static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ - src += 8*dstStride;\ - dst += 8*dstStride;\ - src2 += 8*src2Stride;\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ - OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ -} -#endif // ARCH_X86_64 - -#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\ -static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ - int h=8;\ - __asm__ volatile(\ - "pxor %%xmm7, %%xmm7 \n\t"\ - "movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\ - "1: \n\t"\ - "lddqu -2(%0), %%xmm1 \n\t"\ - "movdqa %%xmm1, %%xmm0 \n\t"\ - "punpckhbw %%xmm7, %%xmm1 \n\t"\ - "punpcklbw %%xmm7, %%xmm0 \n\t"\ - "movdqa %%xmm1, %%xmm2 \n\t"\ - "movdqa %%xmm1, %%xmm3 \n\t"\ - "movdqa %%xmm1, %%xmm4 \n\t"\ - "movdqa %%xmm1, %%xmm5 \n\t"\ - "palignr $2, %%xmm0, %%xmm4 \n\t"\ - "palignr $4, %%xmm0, %%xmm3 \n\t"\ - "palignr $6, %%xmm0, %%xmm2 \n\t"\ - "palignr $8, %%xmm0, %%xmm1 \n\t"\ - "palignr $10,%%xmm0, %%xmm5 \n\t"\ - "paddw %%xmm5, %%xmm0 \n\t"\ - "paddw %%xmm3, %%xmm2 \n\t"\ - "paddw %%xmm4, %%xmm1 \n\t"\ - "psllw $2, %%xmm2 \n\t"\ - "movq (%2), %%xmm3 \n\t"\ - "psubw %%xmm1, %%xmm2 \n\t"\ - "paddw "MANGLE(ff_pw_16)", %%xmm0\n\t"\ - "pmullw %%xmm6, %%xmm2 \n\t"\ - "paddw %%xmm0, %%xmm2 \n\t"\ - "psraw $5, %%xmm2 \n\t"\ - "packuswb %%xmm2, %%xmm2 \n\t"\ - "pavgb %%xmm3, %%xmm2 \n\t"\ - OP(%%xmm2, (%1), %%xmm4, q)\ - "add %5, %0 \n\t"\ - "add %5, %1 \n\t"\ - "add %4, %2 \n\t"\ - "decl %3 \n\t"\ - "jg 1b \n\t"\ - : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\ - : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\ - : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \ - "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\ - "memory"\ - );\ -}\ -QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ -\ -static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - int h=8;\ - __asm__ volatile(\ - "pxor %%xmm7, %%xmm7 \n\t"\ - "movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\ - "1: \n\t"\ - "lddqu -2(%0), %%xmm1 \n\t"\ - "movdqa %%xmm1, %%xmm0 \n\t"\ - "punpckhbw %%xmm7, %%xmm1 \n\t"\ - "punpcklbw %%xmm7, %%xmm0 \n\t"\ - "movdqa %%xmm1, %%xmm2 \n\t"\ - "movdqa %%xmm1, %%xmm3 \n\t"\ - "movdqa %%xmm1, %%xmm4 \n\t"\ - "movdqa %%xmm1, %%xmm5 \n\t"\ - "palignr $2, %%xmm0, %%xmm4 \n\t"\ - "palignr $4, %%xmm0, %%xmm3 \n\t"\ - "palignr $6, %%xmm0, %%xmm2 \n\t"\ - "palignr $8, %%xmm0, %%xmm1 \n\t"\ - "palignr $10,%%xmm0, %%xmm5 \n\t"\ - "paddw %%xmm5, %%xmm0 \n\t"\ - "paddw %%xmm3, %%xmm2 \n\t"\ - "paddw 
%%xmm4, %%xmm1 \n\t"\ - "psllw $2, %%xmm2 \n\t"\ - "psubw %%xmm1, %%xmm2 \n\t"\ - "paddw "MANGLE(ff_pw_16)", %%xmm0\n\t"\ - "pmullw %%xmm6, %%xmm2 \n\t"\ - "paddw %%xmm0, %%xmm2 \n\t"\ - "psraw $5, %%xmm2 \n\t"\ - "packuswb %%xmm2, %%xmm2 \n\t"\ - OP(%%xmm2, (%1), %%xmm4, q)\ - "add %3, %0 \n\t"\ - "add %4, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(src), "+c"(dst), "+g"(h)\ - : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride)\ - : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \ - "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\ - "memory"\ - );\ -}\ -static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ - src += 8*srcStride;\ - dst += 8*dstStride;\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ - OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ -}\ - -#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\ -static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ - src -= 2*srcStride;\ - \ - __asm__ volatile(\ - "pxor %%xmm7, %%xmm7 \n\t"\ - "movq (%0), %%xmm0 \n\t"\ - "add %2, %0 \n\t"\ - "movq (%0), %%xmm1 \n\t"\ - "add %2, %0 \n\t"\ - "movq (%0), %%xmm2 \n\t"\ - "add %2, %0 \n\t"\ - "movq (%0), %%xmm3 \n\t"\ - "add %2, %0 \n\t"\ - "movq (%0), %%xmm4 \n\t"\ - "add %2, %0 \n\t"\ - "punpcklbw %%xmm7, %%xmm0 \n\t"\ - "punpcklbw %%xmm7, %%xmm1 \n\t"\ - "punpcklbw %%xmm7, %%xmm2 \n\t"\ - "punpcklbw %%xmm7, %%xmm3 \n\t"\ - "punpcklbw %%xmm7, %%xmm4 \n\t"\ - QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ - QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ - QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ - QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ - QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\ - QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\ - QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ - QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ - "cmpl $16, %4 \n\t"\ - "jne 2f \n\t"\ - QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ - QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ - QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\ - QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\ - QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ - QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\ - QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ - QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ - "2: \n\t"\ - \ - : "+a"(src), "+c"(dst)\ - : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "rm"(h)\ - : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \ - "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\ - "memory"\ - );\ -}\ -static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ -}\ -static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ - OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ - OPNAME ## h264_qpel8or16_v_lowpass_ ## 
MMX(dst+8, src+8, dstStride, srcStride, 16);\ -} - -static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){ - int w = (size+8)>>3; - src -= 2*srcStride+2; - while(w--){ - __asm__ volatile( - "pxor %%xmm7, %%xmm7 \n\t" - "movq (%0), %%xmm0 \n\t" - "add %2, %0 \n\t" - "movq (%0), %%xmm1 \n\t" - "add %2, %0 \n\t" - "movq (%0), %%xmm2 \n\t" - "add %2, %0 \n\t" - "movq (%0), %%xmm3 \n\t" - "add %2, %0 \n\t" - "movq (%0), %%xmm4 \n\t" - "add %2, %0 \n\t" - "punpcklbw %%xmm7, %%xmm0 \n\t" - "punpcklbw %%xmm7, %%xmm1 \n\t" - "punpcklbw %%xmm7, %%xmm2 \n\t" - "punpcklbw %%xmm7, %%xmm3 \n\t" - "punpcklbw %%xmm7, %%xmm4 \n\t" - QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48) - QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48) - QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48) - QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48) - QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48) - QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48) - QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48) - QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48) - "cmpl $16, %3 \n\t" - "jne 2f \n\t" - QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48) - QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48) - QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48) - QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48) - QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48) - QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48) - QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48) - QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48) - "2: \n\t" - : "+a"(src) - : "c"(tmp), "S"((x86_reg)srcStride), "rm"(size) - : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7",) - "memory" - ); - tmp += 8; - src += 8 - (size+5)*srcStride; - } -} - -#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\ -static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ - int h = size;\ - if(size == 16){\ - __asm__ volatile(\ - "1: \n\t"\ - "movdqa 32(%0), %%xmm4 \n\t"\ - "movdqa 16(%0), %%xmm5 \n\t"\ - "movdqa (%0), %%xmm7 \n\t"\ - "movdqa %%xmm4, %%xmm3 \n\t"\ - "movdqa %%xmm4, %%xmm2 \n\t"\ - "movdqa %%xmm4, %%xmm1 \n\t"\ - "movdqa %%xmm4, %%xmm0 \n\t"\ - "palignr $10, %%xmm5, %%xmm0 \n\t"\ - "palignr $8, %%xmm5, %%xmm1 \n\t"\ - "palignr $6, %%xmm5, %%xmm2 \n\t"\ - "palignr $4, %%xmm5, %%xmm3 \n\t"\ - "palignr $2, %%xmm5, %%xmm4 \n\t"\ - "paddw %%xmm5, %%xmm0 \n\t"\ - "paddw %%xmm4, %%xmm1 \n\t"\ - "paddw %%xmm3, %%xmm2 \n\t"\ - "movdqa %%xmm5, %%xmm6 \n\t"\ - "movdqa %%xmm5, %%xmm4 \n\t"\ - "movdqa %%xmm5, %%xmm3 \n\t"\ - "palignr $8, %%xmm7, %%xmm4 \n\t"\ - "palignr $2, %%xmm7, %%xmm6 \n\t"\ - "palignr $10, %%xmm7, %%xmm3 \n\t"\ - "paddw %%xmm6, %%xmm4 \n\t"\ - "movdqa %%xmm5, %%xmm6 \n\t"\ - "palignr $6, %%xmm7, %%xmm5 \n\t"\ - "palignr $4, %%xmm7, %%xmm6 \n\t"\ - "paddw %%xmm7, %%xmm3 \n\t"\ - "paddw %%xmm6, %%xmm5 \n\t"\ - \ - "psubw %%xmm1, %%xmm0 \n\t"\ - "psubw %%xmm4, %%xmm3 \n\t"\ - "psraw $2, %%xmm0 \n\t"\ - "psraw $2, %%xmm3 \n\t"\ - "psubw %%xmm1, %%xmm0 \n\t"\ - "psubw %%xmm4, %%xmm3 \n\t"\ - "paddw %%xmm2, %%xmm0 \n\t"\ - "paddw %%xmm5, %%xmm3 \n\t"\ - 
"psraw $2, %%xmm0 \n\t"\ - "psraw $2, %%xmm3 \n\t"\ - "paddw %%xmm2, %%xmm0 \n\t"\ - "paddw %%xmm5, %%xmm3 \n\t"\ - "psraw $6, %%xmm0 \n\t"\ - "psraw $6, %%xmm3 \n\t"\ - "packuswb %%xmm0, %%xmm3 \n\t"\ - OP(%%xmm3, (%1), %%xmm7, dqa)\ - "add $48, %0 \n\t"\ - "add %3, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(tmp), "+c"(dst), "+g"(h)\ - : "S"((x86_reg)dstStride)\ - : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \ - "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\ - "memory"\ - );\ - }else{\ - __asm__ volatile(\ - "1: \n\t"\ - "movdqa 16(%0), %%xmm1 \n\t"\ - "movdqa (%0), %%xmm0 \n\t"\ - "movdqa %%xmm1, %%xmm2 \n\t"\ - "movdqa %%xmm1, %%xmm3 \n\t"\ - "movdqa %%xmm1, %%xmm4 \n\t"\ - "movdqa %%xmm1, %%xmm5 \n\t"\ - "palignr $10, %%xmm0, %%xmm5 \n\t"\ - "palignr $8, %%xmm0, %%xmm4 \n\t"\ - "palignr $6, %%xmm0, %%xmm3 \n\t"\ - "palignr $4, %%xmm0, %%xmm2 \n\t"\ - "palignr $2, %%xmm0, %%xmm1 \n\t"\ - "paddw %%xmm5, %%xmm0 \n\t"\ - "paddw %%xmm4, %%xmm1 \n\t"\ - "paddw %%xmm3, %%xmm2 \n\t"\ - "psubw %%xmm1, %%xmm0 \n\t"\ - "psraw $2, %%xmm0 \n\t"\ - "psubw %%xmm1, %%xmm0 \n\t"\ - "paddw %%xmm2, %%xmm0 \n\t"\ - "psraw $2, %%xmm0 \n\t"\ - "paddw %%xmm2, %%xmm0 \n\t"\ - "psraw $6, %%xmm0 \n\t"\ - "packuswb %%xmm0, %%xmm0 \n\t"\ - OP(%%xmm0, (%1), %%xmm7, q)\ - "add $48, %0 \n\t"\ - "add %3, %1 \n\t"\ - "decl %2 \n\t"\ - " jnz 1b \n\t"\ - : "+a"(tmp), "+c"(dst), "+g"(h)\ - : "S"((x86_reg)dstStride)\ - : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \ - "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\ - "memory"\ - );\ - }\ -} - -#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\ -static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ - put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\ - OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ -}\ -static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ - OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\ -}\ -static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ - OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\ -}\ - -#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2 -#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2 -#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2 -#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2 -#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2 -#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2 -#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2 -#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2 - -#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2 -#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2 -#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2 -#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2 -#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2 -#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2 -#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2 -#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2 - -#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2 -#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2 -#define put_h264_qpel16_h_lowpass_l2_sse2 
put_h264_qpel16_h_lowpass_l2_mmx2 -#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2 - -#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2 -#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2 -#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2 -#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2 - -#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2 -#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2 - -#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \ -H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\ -H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\ -H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\ -H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\ - -static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){ - put_pixels16_sse2(dst, src, stride, 16); -} -static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){ - avg_pixels16_sse2(dst, src, stride, 16); -} -#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2 -#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2 - -#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \ -static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\ -}\ - -#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \ -static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\ -}\ - -#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \ -static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\ -}\ - -#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \ -static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ -}\ -\ -static void 
OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\ - OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ - uint8_t * const halfHV= temp;\ - int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ - uint8_t * const halfHV= temp;\ - int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ - uint8_t * const halfHV= temp;\ - int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ - uint8_t * const halfHV= temp;\ - int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ - assert(((int)temp & 7) == 0);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\ -}\ - -#define H264_MC_4816(MMX)\ -H264_MC(put_, 4, MMX, 8)\ -H264_MC(put_, 8, MMX, 8)\ -H264_MC(put_, 16,MMX, 8)\ -H264_MC(avg_, 4, MMX, 8)\ -H264_MC(avg_, 8, MMX, 8)\ -H264_MC(avg_, 16,MMX, 8)\ - -#define H264_MC_816(QPEL, XMM)\ -QPEL(put_, 8, XMM, 16)\ -QPEL(put_, 16,XMM, 16)\ -QPEL(avg_, 8, XMM, 16)\ -QPEL(avg_, 16,XMM, 16)\ - -#define PAVGB "pavgusb" -QPEL_H264(put_, PUT_OP, 3dnow) -QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow) -#undef PAVGB -#define PAVGB "pavgb" -QPEL_H264(put_, PUT_OP, mmx2) -QPEL_H264(avg_, AVG_MMX2_OP, mmx2) -QPEL_H264_V_XMM(put_, PUT_OP, sse2) -QPEL_H264_V_XMM(avg_, AVG_MMX2_OP, sse2) -QPEL_H264_HV_XMM(put_, PUT_OP, sse2) -QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, sse2) -#if 
HAVE_SSSE3 -QPEL_H264_H_XMM(put_, PUT_OP, ssse3) -QPEL_H264_H_XMM(avg_, AVG_MMX2_OP, ssse3) -QPEL_H264_HV2_XMM(put_, PUT_OP, ssse3) -QPEL_H264_HV2_XMM(avg_, AVG_MMX2_OP, ssse3) -QPEL_H264_HV_XMM(put_, PUT_OP, ssse3) -QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, ssse3) -#endif -#undef PAVGB - -H264_MC_4816(3dnow) -H264_MC_4816(mmx2) -H264_MC_816(H264_MC_V, sse2) -H264_MC_816(H264_MC_HV, sse2) -#if HAVE_SSSE3 -H264_MC_816(H264_MC_H, ssse3) -H264_MC_816(H264_MC_HV, ssse3) -#endif - -#endif /* HAVE_INLINE_ASM */ - -//10bit -#define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \ -void ff_ ## OP ## _h264_qpel ## NUM ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT \ - (uint8_t *dst, uint8_t *src, int stride); - -#define LUMA_MC_ALL(DEPTH, TYPE, OPT) \ - LUMA_MC_OP(put, 4, DEPTH, TYPE, OPT) \ - LUMA_MC_OP(avg, 4, DEPTH, TYPE, OPT) \ - LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \ - LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \ - LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \ - LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT) - -#define LUMA_MC_816(DEPTH, TYPE, OPT) \ - LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \ - LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \ - LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \ - LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT) - -LUMA_MC_ALL(10, mc00, mmxext) -LUMA_MC_ALL(10, mc10, mmxext) -LUMA_MC_ALL(10, mc20, mmxext) -LUMA_MC_ALL(10, mc30, mmxext) -LUMA_MC_ALL(10, mc01, mmxext) -LUMA_MC_ALL(10, mc11, mmxext) -LUMA_MC_ALL(10, mc21, mmxext) -LUMA_MC_ALL(10, mc31, mmxext) -LUMA_MC_ALL(10, mc02, mmxext) -LUMA_MC_ALL(10, mc12, mmxext) -LUMA_MC_ALL(10, mc22, mmxext) -LUMA_MC_ALL(10, mc32, mmxext) -LUMA_MC_ALL(10, mc03, mmxext) -LUMA_MC_ALL(10, mc13, mmxext) -LUMA_MC_ALL(10, mc23, mmxext) -LUMA_MC_ALL(10, mc33, mmxext) - -LUMA_MC_816(10, mc00, sse2) -LUMA_MC_816(10, mc10, sse2) -LUMA_MC_816(10, mc10, sse2_cache64) -LUMA_MC_816(10, mc10, ssse3_cache64) -LUMA_MC_816(10, mc20, sse2) -LUMA_MC_816(10, mc20, sse2_cache64) -LUMA_MC_816(10, mc20, ssse3_cache64) -LUMA_MC_816(10, mc30, sse2) -LUMA_MC_816(10, mc30, sse2_cache64) -LUMA_MC_816(10, mc30, ssse3_cache64) -LUMA_MC_816(10, mc01, sse2) -LUMA_MC_816(10, mc11, sse2) -LUMA_MC_816(10, mc21, sse2) -LUMA_MC_816(10, mc31, sse2) -LUMA_MC_816(10, mc02, sse2) -LUMA_MC_816(10, mc12, sse2) -LUMA_MC_816(10, mc22, sse2) -LUMA_MC_816(10, mc32, sse2) -LUMA_MC_816(10, mc03, sse2) -LUMA_MC_816(10, mc13, sse2) -LUMA_MC_816(10, mc23, sse2) -LUMA_MC_816(10, mc33, sse2) - -#define QPEL16_OPMC(OP, MC, MMX)\ -void ff_ ## OP ## _h264_qpel16_ ## MC ## _10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ - ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\ - ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\ - src += 8*stride;\ - dst += 8*stride;\ - ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\ - ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\ -} - -#define QPEL16_OP(MC, MMX)\ -QPEL16_OPMC(put, MC, MMX)\ -QPEL16_OPMC(avg, MC, MMX) - -#define QPEL16(MMX)\ -QPEL16_OP(mc00, MMX)\ -QPEL16_OP(mc01, MMX)\ -QPEL16_OP(mc02, MMX)\ -QPEL16_OP(mc03, MMX)\ -QPEL16_OP(mc10, MMX)\ -QPEL16_OP(mc11, MMX)\ -QPEL16_OP(mc12, MMX)\ -QPEL16_OP(mc13, MMX)\ -QPEL16_OP(mc20, MMX)\ -QPEL16_OP(mc21, MMX)\ -QPEL16_OP(mc22, MMX)\ -QPEL16_OP(mc23, MMX)\ -QPEL16_OP(mc30, MMX)\ -QPEL16_OP(mc31, MMX)\ -QPEL16_OP(mc32, MMX)\ -QPEL16_OP(mc33, MMX) - -#if ARCH_X86_32 && HAVE_YASM // ARCH_X86_64 implies sse2+ -QPEL16(mmxext) -#endif diff --git a/libavcodec/x86/lpc.c b/libavcodec/x86/lpc.c new file mode 100644 index 0000000000..82f77612f2 --- /dev/null +++ b/libavcodec/x86/lpc.c @@ 
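/*
 * For reference (illustrative, not part of the moved h264_qpel code): the
 * put/avg_h264_qpel*_h/v_lowpass kernels above all implement H.264's 6-tap
 * half-sample filter (1, -5, 20, 20, -5, 1) with +16 rounding and a shift
 * by 5; most quarter-sample (mc##) positions are then formed by averaging
 * two such results, or a result and the source.  A scalar sketch of the
 * horizontal case follows; the function and helper names are illustrative.
 */
#include <stdint.h>

static uint8_t clip_uint8_ref(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : v;
}

/* Horizontal 6-tap half-sample lowpass, scalar form of *_h_lowpass above. */
static void h264_qpel_h_lowpass_ref(uint8_t *dst, const uint8_t *src,
                                    int dstStride, int srcStride,
                                    int width, int height)
{
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            int v = (src[x - 2] + src[x + 3])
                  -  5 * (src[x - 1] + src[x + 2])
                  + 20 * (src[x    ] + src[x + 1]);
            dst[x] = clip_uint8_ref((v + 16) >> 5);
        }
        src += srcStride;
        dst += dstStride;
    }
}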
-0,0 +1,154 @@ +/* + * MMX optimized LPC DSP utils + * Copyright (c) 2007 Loren Merritt + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/x86/asm.h" +#include "libavutil/cpu.h" +#include "libavutil/internal.h" +#include "libavcodec/lpc.h" + +#if HAVE_INLINE_ASM + +static void lpc_apply_welch_window_sse2(const int32_t *data, int len, + double *w_data) +{ + double c = 2.0 / (len-1.0); + int n2 = len>>1; + x86_reg i = -n2*sizeof(int32_t); + x86_reg j = n2*sizeof(int32_t); + __asm__ volatile( + "movsd %4, %%xmm7 \n\t" + "movapd "MANGLE(ff_pd_1)", %%xmm6 \n\t" + "movapd "MANGLE(ff_pd_2)", %%xmm5 \n\t" + "movlhps %%xmm7, %%xmm7 \n\t" + "subpd %%xmm5, %%xmm7 \n\t" + "addsd %%xmm6, %%xmm7 \n\t" + "test $1, %5 \n\t" + "jz 2f \n\t" +#define WELCH(MOVPD, offset)\ + "1: \n\t"\ + "movapd %%xmm7, %%xmm1 \n\t"\ + "mulpd %%xmm1, %%xmm1 \n\t"\ + "movapd %%xmm6, %%xmm0 \n\t"\ + "subpd %%xmm1, %%xmm0 \n\t"\ + "pshufd $0x4e, %%xmm0, %%xmm1 \n\t"\ + "cvtpi2pd (%3,%0), %%xmm2 \n\t"\ + "cvtpi2pd "#offset"*4(%3,%1), %%xmm3 \n\t"\ + "mulpd %%xmm0, %%xmm2 \n\t"\ + "mulpd %%xmm1, %%xmm3 \n\t"\ + "movapd %%xmm2, (%2,%0,2) \n\t"\ + MOVPD" %%xmm3, "#offset"*8(%2,%1,2) \n\t"\ + "subpd %%xmm5, %%xmm7 \n\t"\ + "sub $8, %1 \n\t"\ + "add $8, %0 \n\t"\ + "jl 1b \n\t"\ + + WELCH("movupd", -1) + "jmp 3f \n\t" + "2: \n\t" + WELCH("movapd", -2) + "3: \n\t" + :"+&r"(i), "+&r"(j) + :"r"(w_data+n2), "r"(data+n2), "m"(c), "r"(len) + XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm5", "%xmm6", "%xmm7") + ); +#undef WELCH +} + +static void lpc_compute_autocorr_sse2(const double *data, int len, int lag, + double *autoc) +{ + int j; + + if((x86_reg)data & 15) + data++; + + for(j=0; jlpc_apply_welch_window = lpc_apply_welch_window_sse2; + c->lpc_compute_autocorr = lpc_compute_autocorr_sse2; + } +#endif /* HAVE_INLINE_ASM */ +} diff --git a/libavcodec/x86/lpc_mmx.c b/libavcodec/x86/lpc_mmx.c deleted file mode 100644 index 82f77612f2..0000000000 --- a/libavcodec/x86/lpc_mmx.c +++ /dev/null @@ -1,154 +0,0 @@ -/* - * MMX optimized LPC DSP utils - * Copyright (c) 2007 Loren Merritt - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
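/*
 * For reference (illustrative, not part of the moved LPC code): the file
 * above windows the input samples with a Welch window and then computes
 * autocorrelation coefficients for the LPC solver.  The textbook Welch
 * window is w(n) = 1 - (2n/(N-1) - 1)^2; the SSE2 routine walks from both
 * ends of the buffer towards the middle, two samples per side per
 * iteration, and handles odd lengths separately.  A plain scalar form of
 * the autocorrelation it feeds (the name is illustrative; any
 * regularization and the exact lag range follow the caller):
 */
static void autocorr_ref(const double *data, int len, int lag, double *autoc)
{
    int i, j;

    for (j = 0; j <= lag; j++) {
        double sum = 0.0;
        for (i = j; i < len; i++)
            sum += data[i] * data[i - j];
        autoc[j] = sum;
    }
}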
- * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/x86/asm.h" -#include "libavutil/cpu.h" -#include "libavutil/internal.h" -#include "libavcodec/lpc.h" - -#if HAVE_INLINE_ASM - -static void lpc_apply_welch_window_sse2(const int32_t *data, int len, - double *w_data) -{ - double c = 2.0 / (len-1.0); - int n2 = len>>1; - x86_reg i = -n2*sizeof(int32_t); - x86_reg j = n2*sizeof(int32_t); - __asm__ volatile( - "movsd %4, %%xmm7 \n\t" - "movapd "MANGLE(ff_pd_1)", %%xmm6 \n\t" - "movapd "MANGLE(ff_pd_2)", %%xmm5 \n\t" - "movlhps %%xmm7, %%xmm7 \n\t" - "subpd %%xmm5, %%xmm7 \n\t" - "addsd %%xmm6, %%xmm7 \n\t" - "test $1, %5 \n\t" - "jz 2f \n\t" -#define WELCH(MOVPD, offset)\ - "1: \n\t"\ - "movapd %%xmm7, %%xmm1 \n\t"\ - "mulpd %%xmm1, %%xmm1 \n\t"\ - "movapd %%xmm6, %%xmm0 \n\t"\ - "subpd %%xmm1, %%xmm0 \n\t"\ - "pshufd $0x4e, %%xmm0, %%xmm1 \n\t"\ - "cvtpi2pd (%3,%0), %%xmm2 \n\t"\ - "cvtpi2pd "#offset"*4(%3,%1), %%xmm3 \n\t"\ - "mulpd %%xmm0, %%xmm2 \n\t"\ - "mulpd %%xmm1, %%xmm3 \n\t"\ - "movapd %%xmm2, (%2,%0,2) \n\t"\ - MOVPD" %%xmm3, "#offset"*8(%2,%1,2) \n\t"\ - "subpd %%xmm5, %%xmm7 \n\t"\ - "sub $8, %1 \n\t"\ - "add $8, %0 \n\t"\ - "jl 1b \n\t"\ - - WELCH("movupd", -1) - "jmp 3f \n\t" - "2: \n\t" - WELCH("movapd", -2) - "3: \n\t" - :"+&r"(i), "+&r"(j) - :"r"(w_data+n2), "r"(data+n2), "m"(c), "r"(len) - XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm5", "%xmm6", "%xmm7") - ); -#undef WELCH -} - -static void lpc_compute_autocorr_sse2(const double *data, int len, int lag, - double *autoc) -{ - int j; - - if((x86_reg)data & 15) - data++; - - for(j=0; jlpc_apply_welch_window = lpc_apply_welch_window_sse2; - c->lpc_compute_autocorr = lpc_compute_autocorr_sse2; - } -#endif /* HAVE_INLINE_ASM */ -} diff --git a/libavcodec/x86/motion_est.c b/libavcodec/x86/motion_est.c new file mode 100644 index 0000000000..6eb44d4b2d --- /dev/null +++ b/libavcodec/x86/motion_est.c @@ -0,0 +1,469 @@ +/* + * MMX optimized motion estimation + * Copyright (c) 2001 Fabrice Bellard + * Copyright (c) 2002-2004 Michael Niedermayer + * + * mostly by Michael Niedermayer + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/internal.h" +#include "libavutil/mem.h" +#include "libavutil/x86/asm.h" +#include "libavcodec/dsputil.h" +#include "dsputil_mmx.h" + +#if HAVE_INLINE_ASM + +DECLARE_ASM_CONST(8, uint64_t, round_tab)[3]={ +0x0000000000000000ULL, +0x0001000100010001ULL, +0x0002000200020002ULL, +}; + +DECLARE_ASM_CONST(8, uint64_t, bone)= 0x0101010101010101LL; + +static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) +{ + x86_reg len= -(stride*h); + __asm__ volatile( + ".p2align 4 \n\t" + "1: \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + "movq (%2, %%"REG_a"), %%mm2 \n\t" + "movq (%2, %%"REG_a"), %%mm4 \n\t" + "add %3, %%"REG_a" \n\t" + "psubusb %%mm0, %%mm2 \n\t" + "psubusb %%mm4, %%mm0 \n\t" + "movq (%1, %%"REG_a"), %%mm1 \n\t" + "movq (%2, %%"REG_a"), %%mm3 \n\t" + "movq (%2, %%"REG_a"), %%mm5 \n\t" + "psubusb %%mm1, %%mm3 \n\t" + "psubusb %%mm5, %%mm1 \n\t" + "por %%mm2, %%mm0 \n\t" + "por %%mm1, %%mm3 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm3, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm3 \n\t" + "punpckhbw %%mm7, %%mm2 \n\t" + "paddw %%mm1, %%mm0 \n\t" + "paddw %%mm3, %%mm2 \n\t" + "paddw %%mm2, %%mm0 \n\t" + "paddw %%mm0, %%mm6 \n\t" + "add %3, %%"REG_a" \n\t" + " js 1b \n\t" + : "+a" (len) + : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg)stride) + ); +} + +static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) +{ + __asm__ volatile( + ".p2align 4 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "psadbw (%2), %%mm0 \n\t" + "psadbw (%2, %3), %%mm1 \n\t" + "paddw %%mm0, %%mm6 \n\t" + "paddw %%mm1, %%mm6 \n\t" + "lea (%1,%3,2), %1 \n\t" + "lea (%2,%3,2), %2 \n\t" + "sub $2, %0 \n\t" + " jg 1b \n\t" + : "+r" (h), "+r" (blk1), "+r" (blk2) + : "r" ((x86_reg)stride) + ); +} + +static int sad16_sse2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) +{ + int ret; + __asm__ volatile( + "pxor %%xmm2, %%xmm2 \n\t" + ".p2align 4 \n\t" + "1: \n\t" + "movdqu (%1), %%xmm0 \n\t" + "movdqu (%1, %4), %%xmm1 \n\t" + "psadbw (%2), %%xmm0 \n\t" + "psadbw (%2, %4), %%xmm1 \n\t" + "paddw %%xmm0, %%xmm2 \n\t" + "paddw %%xmm1, %%xmm2 \n\t" + "lea (%1,%4,2), %1 \n\t" + "lea (%2,%4,2), %2 \n\t" + "sub $2, %0 \n\t" + " jg 1b \n\t" + "movhlps %%xmm2, %%xmm0 \n\t" + "paddw %%xmm0, %%xmm2 \n\t" + "movd %%xmm2, %3 \n\t" + : "+r" (h), "+r" (blk1), "+r" (blk2), "=r"(ret) + : "r" ((x86_reg)stride) + ); + return ret; +} + +static inline void sad8_x2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) +{ + __asm__ volatile( + ".p2align 4 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "pavgb 1(%1), %%mm0 \n\t" + "pavgb 1(%1, %3), %%mm1 \n\t" + "psadbw (%2), %%mm0 \n\t" + "psadbw (%2, %3), %%mm1 \n\t" + "paddw %%mm0, %%mm6 \n\t" + "paddw %%mm1, %%mm6 \n\t" + "lea (%1,%3,2), %1 \n\t" + "lea (%2,%3,2), %2 \n\t" + "sub $2, %0 \n\t" + " jg 1b \n\t" + : "+r" (h), "+r" (blk1), "+r" (blk2) + : "r" ((x86_reg)stride) + ); +} + +static inline void sad8_y2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) +{ + __asm__ volatile( + "movq (%1), %%mm0 \n\t" + "add %3, %1 \n\t" + ".p2align 4 \n\t" + "1: \n\t" + "movq (%1), %%mm1 \n\t" + "movq (%1, %3), %%mm2 \n\t" + "pavgb %%mm1, %%mm0 \n\t" + "pavgb %%mm2, %%mm1 \n\t" + "psadbw 
(%2), %%mm0 \n\t" + "psadbw (%2, %3), %%mm1 \n\t" + "paddw %%mm0, %%mm6 \n\t" + "paddw %%mm1, %%mm6 \n\t" + "movq %%mm2, %%mm0 \n\t" + "lea (%1,%3,2), %1 \n\t" + "lea (%2,%3,2), %2 \n\t" + "sub $2, %0 \n\t" + " jg 1b \n\t" + : "+r" (h), "+r" (blk1), "+r" (blk2) + : "r" ((x86_reg)stride) + ); +} + +static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) +{ + __asm__ volatile( + "movq "MANGLE(bone)", %%mm5 \n\t" + "movq (%1), %%mm0 \n\t" + "pavgb 1(%1), %%mm0 \n\t" + "add %3, %1 \n\t" + ".p2align 4 \n\t" + "1: \n\t" + "movq (%1), %%mm1 \n\t" + "movq (%1,%3), %%mm2 \n\t" + "pavgb 1(%1), %%mm1 \n\t" + "pavgb 1(%1,%3), %%mm2 \n\t" + "psubusb %%mm5, %%mm1 \n\t" + "pavgb %%mm1, %%mm0 \n\t" + "pavgb %%mm2, %%mm1 \n\t" + "psadbw (%2), %%mm0 \n\t" + "psadbw (%2,%3), %%mm1 \n\t" + "paddw %%mm0, %%mm6 \n\t" + "paddw %%mm1, %%mm6 \n\t" + "movq %%mm2, %%mm0 \n\t" + "lea (%1,%3,2), %1 \n\t" + "lea (%2,%3,2), %2 \n\t" + "sub $2, %0 \n\t" + " jg 1b \n\t" + : "+r" (h), "+r" (blk1), "+r" (blk2) + : "r" ((x86_reg)stride) + ); +} + +static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h) +{ + x86_reg len= -(stride*h); + __asm__ volatile( + ".p2align 4 \n\t" + "1: \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + "movq (%2, %%"REG_a"), %%mm1 \n\t" + "movq (%1, %%"REG_a"), %%mm2 \n\t" + "movq (%2, %%"REG_a"), %%mm3 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "punpckhbw %%mm7, %%mm2 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + "paddw %%mm0, %%mm1 \n\t" + "paddw %%mm2, %%mm3 \n\t" + "movq (%3, %%"REG_a"), %%mm4 \n\t" + "movq (%3, %%"REG_a"), %%mm2 \n\t" + "paddw %%mm5, %%mm1 \n\t" + "paddw %%mm5, %%mm3 \n\t" + "psrlw $1, %%mm1 \n\t" + "psrlw $1, %%mm3 \n\t" + "packuswb %%mm3, %%mm1 \n\t" + "psubusb %%mm1, %%mm4 \n\t" + "psubusb %%mm2, %%mm1 \n\t" + "por %%mm4, %%mm1 \n\t" + "movq %%mm1, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "paddw %%mm1, %%mm0 \n\t" + "paddw %%mm0, %%mm6 \n\t" + "add %4, %%"REG_a" \n\t" + " js 1b \n\t" + : "+a" (len) + : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((x86_reg)stride) + ); +} + +static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) +{ + x86_reg len= -(stride*h); + __asm__ volatile( + "movq (%1, %%"REG_a"), %%mm0 \n\t" + "movq 1(%1, %%"REG_a"), %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + "paddw %%mm2, %%mm0 \n\t" + "paddw %%mm3, %%mm1 \n\t" + ".p2align 4 \n\t" + "1: \n\t" + "movq (%2, %%"REG_a"), %%mm2 \n\t" + "movq 1(%2, %%"REG_a"), %%mm4 \n\t" + "movq %%mm2, %%mm3 \n\t" + "movq %%mm4, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpckhbw %%mm7, %%mm5 \n\t" + "paddw %%mm4, %%mm2 \n\t" + "paddw %%mm5, %%mm3 \n\t" + "movq 16+"MANGLE(round_tab)", %%mm5 \n\t" + "paddw %%mm2, %%mm0 \n\t" + "paddw %%mm3, %%mm1 \n\t" + "paddw %%mm5, %%mm0 \n\t" + "paddw %%mm5, %%mm1 \n\t" + "movq (%3, %%"REG_a"), %%mm4 \n\t" + "movq (%3, %%"REG_a"), %%mm5 \n\t" + "psrlw $2, %%mm0 \n\t" + "psrlw $2, %%mm1 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "psubusb %%mm0, %%mm4 \n\t" + "psubusb %%mm5, %%mm0 \n\t" + "por %%mm4, %%mm0 \n\t" + "movq %%mm0, %%mm4 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpckhbw %%mm7, %%mm4 \n\t" + "paddw %%mm0, %%mm6 \n\t" + "paddw %%mm4, %%mm6 \n\t" + "movq %%mm2, %%mm0 \n\t" + "movq %%mm3, %%mm1 \n\t" + "add %4, 
%%"REG_a" \n\t" + " js 1b \n\t" + : "+a" (len) + : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((x86_reg)stride) + ); +} + +static inline int sum_mmx(void) +{ + int ret; + __asm__ volatile( + "movq %%mm6, %%mm0 \n\t" + "psrlq $32, %%mm6 \n\t" + "paddw %%mm0, %%mm6 \n\t" + "movq %%mm6, %%mm0 \n\t" + "psrlq $16, %%mm6 \n\t" + "paddw %%mm0, %%mm6 \n\t" + "movd %%mm6, %0 \n\t" + : "=r" (ret) + ); + return ret&0xFFFF; +} + +static inline int sum_mmx2(void) +{ + int ret; + __asm__ volatile( + "movd %%mm6, %0 \n\t" + : "=r" (ret) + ); + return ret; +} + +static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) +{ + sad8_2_mmx(blk1, blk1+1, blk2, stride, h); +} +static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) +{ + sad8_2_mmx(blk1, blk1+stride, blk2, stride, h); +} + + +#define PIX_SAD(suf)\ +static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ +{\ + assert(h==8);\ + __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ + "pxor %%mm6, %%mm6 \n\t":);\ +\ + sad8_1_ ## suf(blk1, blk2, stride, 8);\ +\ + return sum_ ## suf();\ +}\ +static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ +{\ + assert(h==8);\ + __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ + "pxor %%mm6, %%mm6 \n\t"\ + "movq %0, %%mm5 \n\t"\ + :: "m"(round_tab[1]) \ + );\ +\ + sad8_x2a_ ## suf(blk1, blk2, stride, 8);\ +\ + return sum_ ## suf();\ +}\ +\ +static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ +{\ + assert(h==8);\ + __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ + "pxor %%mm6, %%mm6 \n\t"\ + "movq %0, %%mm5 \n\t"\ + :: "m"(round_tab[1]) \ + );\ +\ + sad8_y2a_ ## suf(blk1, blk2, stride, 8);\ +\ + return sum_ ## suf();\ +}\ +\ +static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ +{\ + assert(h==8);\ + __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ + "pxor %%mm6, %%mm6 \n\t"\ + ::);\ +\ + sad8_4_ ## suf(blk1, blk2, stride, 8);\ +\ + return sum_ ## suf();\ +}\ +\ +static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ +{\ + __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ + "pxor %%mm6, %%mm6 \n\t":);\ +\ + sad8_1_ ## suf(blk1 , blk2 , stride, h);\ + sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\ +\ + return sum_ ## suf();\ +}\ +static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ +{\ + __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ + "pxor %%mm6, %%mm6 \n\t"\ + "movq %0, %%mm5 \n\t"\ + :: "m"(round_tab[1]) \ + );\ +\ + sad8_x2a_ ## suf(blk1 , blk2 , stride, h);\ + sad8_x2a_ ## suf(blk1+8, blk2+8, stride, h);\ +\ + return sum_ ## suf();\ +}\ +static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ +{\ + __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ + "pxor %%mm6, %%mm6 \n\t"\ + "movq %0, %%mm5 \n\t"\ + :: "m"(round_tab[1]) \ + );\ +\ + sad8_y2a_ ## suf(blk1 , blk2 , stride, h);\ + sad8_y2a_ ## suf(blk1+8, blk2+8, stride, h);\ +\ + return sum_ ## suf();\ +}\ +static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ +{\ + __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ + "pxor %%mm6, %%mm6 \n\t"\ + ::);\ +\ + sad8_4_ ## suf(blk1 , blk2 , stride, h);\ + sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\ +\ + return sum_ ## suf();\ +}\ + +PIX_SAD(mmx) +PIX_SAD(mmx2) + +#endif /* HAVE_INLINE_ASM */ + +void ff_dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx) +{ +#if HAVE_INLINE_ASM + int mm_flags = av_get_cpu_flags(); + + if (mm_flags & 
AV_CPU_FLAG_MMX) {
+        c->pix_abs[0][0] = sad16_mmx;
+        c->pix_abs[0][1] = sad16_x2_mmx;
+        c->pix_abs[0][2] = sad16_y2_mmx;
+        c->pix_abs[0][3] = sad16_xy2_mmx;
+        c->pix_abs[1][0] = sad8_mmx;
+        c->pix_abs[1][1] = sad8_x2_mmx;
+        c->pix_abs[1][2] = sad8_y2_mmx;
+        c->pix_abs[1][3] = sad8_xy2_mmx;
+
+        c->sad[0]= sad16_mmx;
+        c->sad[1]= sad8_mmx;
+    }
+    if (mm_flags & AV_CPU_FLAG_MMXEXT) {
+        c->pix_abs[0][0] = sad16_mmx2;
+        c->pix_abs[1][0] = sad8_mmx2;
+
+        c->sad[0]= sad16_mmx2;
+        c->sad[1]= sad8_mmx2;
+
+        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+            c->pix_abs[0][1] = sad16_x2_mmx2;
+            c->pix_abs[0][2] = sad16_y2_mmx2;
+            c->pix_abs[0][3] = sad16_xy2_mmx2;
+            c->pix_abs[1][1] = sad8_x2_mmx2;
+            c->pix_abs[1][2] = sad8_y2_mmx2;
+            c->pix_abs[1][3] = sad8_xy2_mmx2;
+        }
+    }
+    if ((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
+        c->sad[0]= sad16_sse2;
+    }
+#endif /* HAVE_INLINE_ASM */
+}
diff --git a/libavcodec/x86/motion_est_mmx.c b/libavcodec/x86/motion_est_mmx.c
deleted file mode 100644
index 6eb44d4b2d..0000000000
--- a/libavcodec/x86/motion_est_mmx.c
+++ /dev/null
@@ -1,469 +0,0 @@
-/*
- * MMX optimized motion estimation
- * Copyright (c) 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer
- *
- * mostly by Michael Niedermayer
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
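/*
 * For reference (illustrative, not part of the moved motion_est code): the
 * pix_abs/sad entries initialized above compute sums of absolute
 * differences between a source block and a reference block, optionally
 * against a half-pel interpolated reference.  A scalar form of the basic
 * 16-wide case and of the "x2" variant, which compares against the rounded
 * average of two horizontally adjacent reference pixels (names are
 * illustrative):
 */
#include <stdint.h>
#include <stdlib.h>

static int sad16_ref(const uint8_t *src, const uint8_t *ref, int stride, int h)
{
    int x, y, sum = 0;
    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(src[x] - ref[x]);
        src += stride;
        ref += stride;
    }
    return sum;
}

static int sad16_x2_ref(const uint8_t *src, const uint8_t *ref, int stride, int h)
{
    int x, y, sum = 0;
    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            sum += abs(src[x] - ((ref[x] + ref[x + 1] + 1) >> 1));
        src += stride;
        ref += stride;
    }
    return sum;
}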
- * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/internal.h" -#include "libavutil/mem.h" -#include "libavutil/x86/asm.h" -#include "libavcodec/dsputil.h" -#include "dsputil_mmx.h" - -#if HAVE_INLINE_ASM - -DECLARE_ASM_CONST(8, uint64_t, round_tab)[3]={ -0x0000000000000000ULL, -0x0001000100010001ULL, -0x0002000200020002ULL, -}; - -DECLARE_ASM_CONST(8, uint64_t, bone)= 0x0101010101010101LL; - -static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) -{ - x86_reg len= -(stride*h); - __asm__ volatile( - ".p2align 4 \n\t" - "1: \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "movq (%2, %%"REG_a"), %%mm2 \n\t" - "movq (%2, %%"REG_a"), %%mm4 \n\t" - "add %3, %%"REG_a" \n\t" - "psubusb %%mm0, %%mm2 \n\t" - "psubusb %%mm4, %%mm0 \n\t" - "movq (%1, %%"REG_a"), %%mm1 \n\t" - "movq (%2, %%"REG_a"), %%mm3 \n\t" - "movq (%2, %%"REG_a"), %%mm5 \n\t" - "psubusb %%mm1, %%mm3 \n\t" - "psubusb %%mm5, %%mm1 \n\t" - "por %%mm2, %%mm0 \n\t" - "por %%mm1, %%mm3 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm3, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm3 \n\t" - "punpckhbw %%mm7, %%mm2 \n\t" - "paddw %%mm1, %%mm0 \n\t" - "paddw %%mm3, %%mm2 \n\t" - "paddw %%mm2, %%mm0 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "add %3, %%"REG_a" \n\t" - " js 1b \n\t" - : "+a" (len) - : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg)stride) - ); -} - -static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) -{ - __asm__ volatile( - ".p2align 4 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "psadbw (%2), %%mm0 \n\t" - "psadbw (%2, %3), %%mm1 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "paddw %%mm1, %%mm6 \n\t" - "lea (%1,%3,2), %1 \n\t" - "lea (%2,%3,2), %2 \n\t" - "sub $2, %0 \n\t" - " jg 1b \n\t" - : "+r" (h), "+r" (blk1), "+r" (blk2) - : "r" ((x86_reg)stride) - ); -} - -static int sad16_sse2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) -{ - int ret; - __asm__ volatile( - "pxor %%xmm2, %%xmm2 \n\t" - ".p2align 4 \n\t" - "1: \n\t" - "movdqu (%1), %%xmm0 \n\t" - "movdqu (%1, %4), %%xmm1 \n\t" - "psadbw (%2), %%xmm0 \n\t" - "psadbw (%2, %4), %%xmm1 \n\t" - "paddw %%xmm0, %%xmm2 \n\t" - "paddw %%xmm1, %%xmm2 \n\t" - "lea (%1,%4,2), %1 \n\t" - "lea (%2,%4,2), %2 \n\t" - "sub $2, %0 \n\t" - " jg 1b \n\t" - "movhlps %%xmm2, %%xmm0 \n\t" - "paddw %%xmm0, %%xmm2 \n\t" - "movd %%xmm2, %3 \n\t" - : "+r" (h), "+r" (blk1), "+r" (blk2), "=r"(ret) - : "r" ((x86_reg)stride) - ); - return ret; -} - -static inline void sad8_x2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) -{ - __asm__ volatile( - ".p2align 4 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "pavgb 1(%1), %%mm0 \n\t" - "pavgb 1(%1, %3), %%mm1 \n\t" - "psadbw (%2), %%mm0 \n\t" - "psadbw (%2, %3), %%mm1 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "paddw %%mm1, %%mm6 \n\t" - "lea (%1,%3,2), %1 \n\t" - "lea (%2,%3,2), %2 \n\t" - "sub $2, %0 \n\t" - " jg 1b \n\t" - : "+r" (h), "+r" (blk1), "+r" (blk2) - : "r" ((x86_reg)stride) - ); -} - -static inline void sad8_y2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) -{ - __asm__ volatile( - "movq (%1), %%mm0 \n\t" - "add %3, %1 \n\t" - ".p2align 4 \n\t" - "1: \n\t" - "movq (%1), %%mm1 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "pavgb %%mm1, %%mm0 \n\t" - "pavgb %%mm2, %%mm1 \n\t" - "psadbw 
(%2), %%mm0 \n\t" - "psadbw (%2, %3), %%mm1 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "paddw %%mm1, %%mm6 \n\t" - "movq %%mm2, %%mm0 \n\t" - "lea (%1,%3,2), %1 \n\t" - "lea (%2,%3,2), %2 \n\t" - "sub $2, %0 \n\t" - " jg 1b \n\t" - : "+r" (h), "+r" (blk1), "+r" (blk2) - : "r" ((x86_reg)stride) - ); -} - -static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) -{ - __asm__ volatile( - "movq "MANGLE(bone)", %%mm5 \n\t" - "movq (%1), %%mm0 \n\t" - "pavgb 1(%1), %%mm0 \n\t" - "add %3, %1 \n\t" - ".p2align 4 \n\t" - "1: \n\t" - "movq (%1), %%mm1 \n\t" - "movq (%1,%3), %%mm2 \n\t" - "pavgb 1(%1), %%mm1 \n\t" - "pavgb 1(%1,%3), %%mm2 \n\t" - "psubusb %%mm5, %%mm1 \n\t" - "pavgb %%mm1, %%mm0 \n\t" - "pavgb %%mm2, %%mm1 \n\t" - "psadbw (%2), %%mm0 \n\t" - "psadbw (%2,%3), %%mm1 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "paddw %%mm1, %%mm6 \n\t" - "movq %%mm2, %%mm0 \n\t" - "lea (%1,%3,2), %1 \n\t" - "lea (%2,%3,2), %2 \n\t" - "sub $2, %0 \n\t" - " jg 1b \n\t" - : "+r" (h), "+r" (blk1), "+r" (blk2) - : "r" ((x86_reg)stride) - ); -} - -static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h) -{ - x86_reg len= -(stride*h); - __asm__ volatile( - ".p2align 4 \n\t" - "1: \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "movq (%2, %%"REG_a"), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm2 \n\t" - "movq (%2, %%"REG_a"), %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "paddw %%mm0, %%mm1 \n\t" - "paddw %%mm2, %%mm3 \n\t" - "movq (%3, %%"REG_a"), %%mm4 \n\t" - "movq (%3, %%"REG_a"), %%mm2 \n\t" - "paddw %%mm5, %%mm1 \n\t" - "paddw %%mm5, %%mm3 \n\t" - "psrlw $1, %%mm1 \n\t" - "psrlw $1, %%mm3 \n\t" - "packuswb %%mm3, %%mm1 \n\t" - "psubusb %%mm1, %%mm4 \n\t" - "psubusb %%mm2, %%mm1 \n\t" - "por %%mm4, %%mm1 \n\t" - "movq %%mm1, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "paddw %%mm1, %%mm0 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "add %4, %%"REG_a" \n\t" - " js 1b \n\t" - : "+a" (len) - : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((x86_reg)stride) - ); -} - -static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) -{ - x86_reg len= -(stride*h); - __asm__ volatile( - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "movq 1(%1, %%"REG_a"), %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "paddw %%mm2, %%mm0 \n\t" - "paddw %%mm3, %%mm1 \n\t" - ".p2align 4 \n\t" - "1: \n\t" - "movq (%2, %%"REG_a"), %%mm2 \n\t" - "movq 1(%2, %%"REG_a"), %%mm4 \n\t" - "movq %%mm2, %%mm3 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddw %%mm4, %%mm2 \n\t" - "paddw %%mm5, %%mm3 \n\t" - "movq 16+"MANGLE(round_tab)", %%mm5 \n\t" - "paddw %%mm2, %%mm0 \n\t" - "paddw %%mm3, %%mm1 \n\t" - "paddw %%mm5, %%mm0 \n\t" - "paddw %%mm5, %%mm1 \n\t" - "movq (%3, %%"REG_a"), %%mm4 \n\t" - "movq (%3, %%"REG_a"), %%mm5 \n\t" - "psrlw $2, %%mm0 \n\t" - "psrlw $2, %%mm1 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "psubusb %%mm0, %%mm4 \n\t" - "psubusb %%mm5, %%mm0 \n\t" - "por %%mm4, %%mm0 \n\t" - "movq %%mm0, %%mm4 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpckhbw %%mm7, %%mm4 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "paddw %%mm4, %%mm6 \n\t" - "movq %%mm2, %%mm0 \n\t" - "movq %%mm3, %%mm1 \n\t" - "add %4, 
%%"REG_a" \n\t" - " js 1b \n\t" - : "+a" (len) - : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((x86_reg)stride) - ); -} - -static inline int sum_mmx(void) -{ - int ret; - __asm__ volatile( - "movq %%mm6, %%mm0 \n\t" - "psrlq $32, %%mm6 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "movq %%mm6, %%mm0 \n\t" - "psrlq $16, %%mm6 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "movd %%mm6, %0 \n\t" - : "=r" (ret) - ); - return ret&0xFFFF; -} - -static inline int sum_mmx2(void) -{ - int ret; - __asm__ volatile( - "movd %%mm6, %0 \n\t" - : "=r" (ret) - ); - return ret; -} - -static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) -{ - sad8_2_mmx(blk1, blk1+1, blk2, stride, h); -} -static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) -{ - sad8_2_mmx(blk1, blk1+stride, blk2, stride, h); -} - - -#define PIX_SAD(suf)\ -static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ -{\ - assert(h==8);\ - __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ - "pxor %%mm6, %%mm6 \n\t":);\ -\ - sad8_1_ ## suf(blk1, blk2, stride, 8);\ -\ - return sum_ ## suf();\ -}\ -static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ -{\ - assert(h==8);\ - __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - "movq %0, %%mm5 \n\t"\ - :: "m"(round_tab[1]) \ - );\ -\ - sad8_x2a_ ## suf(blk1, blk2, stride, 8);\ -\ - return sum_ ## suf();\ -}\ -\ -static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ -{\ - assert(h==8);\ - __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - "movq %0, %%mm5 \n\t"\ - :: "m"(round_tab[1]) \ - );\ -\ - sad8_y2a_ ## suf(blk1, blk2, stride, 8);\ -\ - return sum_ ## suf();\ -}\ -\ -static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ -{\ - assert(h==8);\ - __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - ::);\ -\ - sad8_4_ ## suf(blk1, blk2, stride, 8);\ -\ - return sum_ ## suf();\ -}\ -\ -static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ -{\ - __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ - "pxor %%mm6, %%mm6 \n\t":);\ -\ - sad8_1_ ## suf(blk1 , blk2 , stride, h);\ - sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\ -\ - return sum_ ## suf();\ -}\ -static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ -{\ - __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - "movq %0, %%mm5 \n\t"\ - :: "m"(round_tab[1]) \ - );\ -\ - sad8_x2a_ ## suf(blk1 , blk2 , stride, h);\ - sad8_x2a_ ## suf(blk1+8, blk2+8, stride, h);\ -\ - return sum_ ## suf();\ -}\ -static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ -{\ - __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - "movq %0, %%mm5 \n\t"\ - :: "m"(round_tab[1]) \ - );\ -\ - sad8_y2a_ ## suf(blk1 , blk2 , stride, h);\ - sad8_y2a_ ## suf(blk1+8, blk2+8, stride, h);\ -\ - return sum_ ## suf();\ -}\ -static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ -{\ - __asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - ::);\ -\ - sad8_4_ ## suf(blk1 , blk2 , stride, h);\ - sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\ -\ - return sum_ ## suf();\ -}\ - -PIX_SAD(mmx) -PIX_SAD(mmx2) - -#endif /* HAVE_INLINE_ASM */ - -void ff_dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx) -{ -#if HAVE_INLINE_ASM - int mm_flags = av_get_cpu_flags(); - - if (mm_flags & 
AV_CPU_FLAG_MMX) { - c->pix_abs[0][0] = sad16_mmx; - c->pix_abs[0][1] = sad16_x2_mmx; - c->pix_abs[0][2] = sad16_y2_mmx; - c->pix_abs[0][3] = sad16_xy2_mmx; - c->pix_abs[1][0] = sad8_mmx; - c->pix_abs[1][1] = sad8_x2_mmx; - c->pix_abs[1][2] = sad8_y2_mmx; - c->pix_abs[1][3] = sad8_xy2_mmx; - - c->sad[0]= sad16_mmx; - c->sad[1]= sad8_mmx; - } - if (mm_flags & AV_CPU_FLAG_MMXEXT) { - c->pix_abs[0][0] = sad16_mmx2; - c->pix_abs[1][0] = sad8_mmx2; - - c->sad[0]= sad16_mmx2; - c->sad[1]= sad8_mmx2; - - if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->pix_abs[0][1] = sad16_x2_mmx2; - c->pix_abs[0][2] = sad16_y2_mmx2; - c->pix_abs[0][3] = sad16_xy2_mmx2; - c->pix_abs[1][1] = sad8_x2_mmx2; - c->pix_abs[1][2] = sad8_y2_mmx2; - c->pix_abs[1][3] = sad8_xy2_mmx2; - } - } - if ((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW) && avctx->codec_id != AV_CODEC_ID_SNOW) { - c->sad[0]= sad16_sse2; - } -#endif /* HAVE_INLINE_ASM */ -} diff --git a/libavcodec/x86/mpegaudiodec.c b/libavcodec/x86/mpegaudiodec.c new file mode 100644 index 0000000000..701ae75138 --- /dev/null +++ b/libavcodec/x86/mpegaudiodec.c @@ -0,0 +1,261 @@ +/* + * MMX optimized MP3 decoding functions + * Copyright (c) 2010 Vitor Sessak + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/cpu.h" +#include "libavutil/x86/asm.h" +#include "libavcodec/dsputil.h" +#include "libavcodec/mpegaudiodsp.h" + +void ff_imdct36_float_sse(float *out, float *buf, float *in, float *win); +void ff_imdct36_float_sse2(float *out, float *buf, float *in, float *win); +void ff_imdct36_float_sse3(float *out, float *buf, float *in, float *win); +void ff_imdct36_float_ssse3(float *out, float *buf, float *in, float *win); +void ff_imdct36_float_avx(float *out, float *buf, float *in, float *win); +void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win, + float *tmpbuf); +void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win, + float *tmpbuf); + +DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40]; + +#if HAVE_INLINE_ASM + +#define MACS(rt, ra, rb) rt+=(ra)*(rb) +#define MLSS(rt, ra, rb) rt-=(ra)*(rb) + +#define SUM8(op, sum, w, p) \ +{ \ + op(sum, (w)[0 * 64], (p)[0 * 64]); \ + op(sum, (w)[1 * 64], (p)[1 * 64]); \ + op(sum, (w)[2 * 64], (p)[2 * 64]); \ + op(sum, (w)[3 * 64], (p)[3 * 64]); \ + op(sum, (w)[4 * 64], (p)[4 * 64]); \ + op(sum, (w)[5 * 64], (p)[5 * 64]); \ + op(sum, (w)[6 * 64], (p)[6 * 64]); \ + op(sum, (w)[7 * 64], (p)[7 * 64]); \ +} + +static void apply_window(const float *buf, const float *win1, + const float *win2, float *sum1, float *sum2, int len) +{ + x86_reg count = - 4*len; + const float *win1a = win1+len; + const float *win2a = win2+len; + const float *bufa = buf+len; + float *sum1a = sum1+len; + float *sum2a = sum2+len; + + +#define MULT(a, b) \ + 
"movaps " #a "(%1,%0), %%xmm1 \n\t" \ + "movaps " #a "(%3,%0), %%xmm2 \n\t" \ + "mulps %%xmm2, %%xmm1 \n\t" \ + "subps %%xmm1, %%xmm0 \n\t" \ + "mulps " #b "(%2,%0), %%xmm2 \n\t" \ + "subps %%xmm2, %%xmm4 \n\t" \ + + __asm__ volatile( + "1: \n\t" + "xorps %%xmm0, %%xmm0 \n\t" + "xorps %%xmm4, %%xmm4 \n\t" + + MULT( 0, 0) + MULT( 256, 64) + MULT( 512, 128) + MULT( 768, 192) + MULT(1024, 256) + MULT(1280, 320) + MULT(1536, 384) + MULT(1792, 448) + + "movaps %%xmm0, (%4,%0) \n\t" + "movaps %%xmm4, (%5,%0) \n\t" + "add $16, %0 \n\t" + "jl 1b \n\t" + :"+&r"(count) + :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a) + ); + +#undef MULT +} + +static void apply_window_mp3(float *in, float *win, int *unused, float *out, + int incr) +{ + LOCAL_ALIGNED_16(float, suma, [17]); + LOCAL_ALIGNED_16(float, sumb, [17]); + LOCAL_ALIGNED_16(float, sumc, [17]); + LOCAL_ALIGNED_16(float, sumd, [17]); + + float sum; + + /* copy to avoid wrap */ + __asm__ volatile( + "movaps 0(%0), %%xmm0 \n\t" \ + "movaps 16(%0), %%xmm1 \n\t" \ + "movaps 32(%0), %%xmm2 \n\t" \ + "movaps 48(%0), %%xmm3 \n\t" \ + "movaps %%xmm0, 0(%1) \n\t" \ + "movaps %%xmm1, 16(%1) \n\t" \ + "movaps %%xmm2, 32(%1) \n\t" \ + "movaps %%xmm3, 48(%1) \n\t" \ + "movaps 64(%0), %%xmm0 \n\t" \ + "movaps 80(%0), %%xmm1 \n\t" \ + "movaps 96(%0), %%xmm2 \n\t" \ + "movaps 112(%0), %%xmm3 \n\t" \ + "movaps %%xmm0, 64(%1) \n\t" \ + "movaps %%xmm1, 80(%1) \n\t" \ + "movaps %%xmm2, 96(%1) \n\t" \ + "movaps %%xmm3, 112(%1) \n\t" + ::"r"(in), "r"(in+512) + :"memory" + ); + + apply_window(in + 16, win , win + 512, suma, sumc, 16); + apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16); + + SUM8(MACS, suma[0], win + 32, in + 48); + + sumc[ 0] = 0; + sumb[16] = 0; + sumd[16] = 0; + +#define SUMS(suma, sumb, sumc, sumd, out1, out2) \ + "movups " #sumd "(%4), %%xmm0 \n\t" \ + "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \ + "subps " #suma "(%1), %%xmm0 \n\t" \ + "movaps %%xmm0," #out1 "(%0) \n\t" \ +\ + "movups " #sumc "(%3), %%xmm0 \n\t" \ + "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \ + "addps " #sumb "(%2), %%xmm0 \n\t" \ + "movaps %%xmm0," #out2 "(%0) \n\t" + + if (incr == 1) { + __asm__ volatile( + SUMS( 0, 48, 4, 52, 0, 112) + SUMS(16, 32, 20, 36, 16, 96) + SUMS(32, 16, 36, 20, 32, 80) + SUMS(48, 0, 52, 4, 48, 64) + + :"+&r"(out) + :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0]) + :"memory" + ); + out += 16*incr; + } else { + int j; + float *out2 = out + 32 * incr; + out[0 ] = -suma[ 0]; + out += incr; + out2 -= incr; + for(j=1;j<16;j++) { + *out = -suma[ j] + sumd[16-j]; + *out2 = sumb[16-j] + sumc[ j]; + out += incr; + out2 -= incr; + } + } + + sum = 0; + SUM8(MLSS, sum, win + 16 + 32, in + 32); + *out = sum; +} + +#endif /* HAVE_INLINE_ASM */ + +#define DECL_IMDCT_BLOCKS(CPU1, CPU2) \ +static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \ + int count, int switch_point, int block_type) \ +{ \ + int align_end = count - (count & 3); \ + int j; \ + for (j = 0; j < align_end; j+= 4) { \ + LOCAL_ALIGNED_16(float, tmpbuf, [1024]); \ + float *win = mdct_win_sse[switch_point && j < 4][block_type]; \ + /* apply window & overlap with previous buffer */ \ + \ + /* select window */ \ + ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf); \ + in += 4*18; \ + buf += 4*18; \ + out += 4; \ + } \ + for (; j < count; j++) { \ + /* apply window & overlap with previous buffer */ \ + \ + /* select window */ \ + int win_idx = (switch_point && j < 2) ? 
0 : block_type; \ + float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))]; \ + \ + ff_imdct36_float_ ## CPU1(out, buf, in, win); \ + \ + in += 18; \ + buf++; \ + out++; \ + } \ +} + +DECL_IMDCT_BLOCKS(sse,sse) +DECL_IMDCT_BLOCKS(sse2,sse) +DECL_IMDCT_BLOCKS(sse3,sse) +DECL_IMDCT_BLOCKS(ssse3,sse) +DECL_IMDCT_BLOCKS(avx,avx) + +void ff_mpadsp_init_mmx(MPADSPContext *s) +{ + int mm_flags = av_get_cpu_flags(); + + int i, j; + for (j = 0; j < 4; j++) { + for (i = 0; i < 40; i ++) { + mdct_win_sse[0][j][4*i ] = ff_mdct_win_float[j ][i]; + mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i]; + mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j ][i]; + mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i]; + mdct_win_sse[1][j][4*i ] = ff_mdct_win_float[0 ][i]; + mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4 ][i]; + mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j ][i]; + mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i]; + } + } + +#if HAVE_INLINE_ASM + if (mm_flags & AV_CPU_FLAG_SSE2) { + s->apply_window_float = apply_window_mp3; + } +#endif /* HAVE_INLINE_ASM */ +#if HAVE_YASM + if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) { + s->imdct36_blocks_float = imdct36_blocks_avx; +#if HAVE_SSE + } else if (mm_flags & AV_CPU_FLAG_SSSE3) { + s->imdct36_blocks_float = imdct36_blocks_ssse3; + } else if (mm_flags & AV_CPU_FLAG_SSE3) { + s->imdct36_blocks_float = imdct36_blocks_sse3; + } else if (mm_flags & AV_CPU_FLAG_SSE2) { + s->imdct36_blocks_float = imdct36_blocks_sse2; + } else if (mm_flags & AV_CPU_FLAG_SSE) { + s->imdct36_blocks_float = imdct36_blocks_sse; +#endif /* HAVE_SSE */ + } +#endif /* HAVE_YASM */ +} diff --git a/libavcodec/x86/mpegaudiodec_mmx.c b/libavcodec/x86/mpegaudiodec_mmx.c deleted file mode 100644 index 701ae75138..0000000000 --- a/libavcodec/x86/mpegaudiodec_mmx.c +++ /dev/null @@ -1,261 +0,0 @@ -/* - * MMX optimized MP3 decoding functions - * Copyright (c) 2010 Vitor Sessak - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
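The window lookup in DECL_IMDCT_BLOCKS above uses a small branch-free idiom: for odd j, -(j & 1) is -1 (all bits set), so 4 & -(j & 1) evaluates to 4 and selects the second group of entries in ff_mdct_win_float; for even j it evaluates to 0. A standalone sketch of just that arithmetic (illustrative only):

    #include <assert.h>

    /* 4 & -(j & 1): adds 4 to the window index on odd iterations, 0 on even
     * ones, without a branch. -(j & 1) is 0 for even j and ~0 for odd j. */
    static int win_offset(int j)
    {
        return 4 & -(j & 1);
    }

    int main(void)
    {
        assert(win_offset(0) == 0);
        assert(win_offset(1) == 4);
        assert(win_offset(2) == 0);
        assert(win_offset(3) == 4);
        return 0;
    }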
- * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/cpu.h" -#include "libavutil/x86/asm.h" -#include "libavcodec/dsputil.h" -#include "libavcodec/mpegaudiodsp.h" - -void ff_imdct36_float_sse(float *out, float *buf, float *in, float *win); -void ff_imdct36_float_sse2(float *out, float *buf, float *in, float *win); -void ff_imdct36_float_sse3(float *out, float *buf, float *in, float *win); -void ff_imdct36_float_ssse3(float *out, float *buf, float *in, float *win); -void ff_imdct36_float_avx(float *out, float *buf, float *in, float *win); -void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win, - float *tmpbuf); -void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win, - float *tmpbuf); - -DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40]; - -#if HAVE_INLINE_ASM - -#define MACS(rt, ra, rb) rt+=(ra)*(rb) -#define MLSS(rt, ra, rb) rt-=(ra)*(rb) - -#define SUM8(op, sum, w, p) \ -{ \ - op(sum, (w)[0 * 64], (p)[0 * 64]); \ - op(sum, (w)[1 * 64], (p)[1 * 64]); \ - op(sum, (w)[2 * 64], (p)[2 * 64]); \ - op(sum, (w)[3 * 64], (p)[3 * 64]); \ - op(sum, (w)[4 * 64], (p)[4 * 64]); \ - op(sum, (w)[5 * 64], (p)[5 * 64]); \ - op(sum, (w)[6 * 64], (p)[6 * 64]); \ - op(sum, (w)[7 * 64], (p)[7 * 64]); \ -} - -static void apply_window(const float *buf, const float *win1, - const float *win2, float *sum1, float *sum2, int len) -{ - x86_reg count = - 4*len; - const float *win1a = win1+len; - const float *win2a = win2+len; - const float *bufa = buf+len; - float *sum1a = sum1+len; - float *sum2a = sum2+len; - - -#define MULT(a, b) \ - "movaps " #a "(%1,%0), %%xmm1 \n\t" \ - "movaps " #a "(%3,%0), %%xmm2 \n\t" \ - "mulps %%xmm2, %%xmm1 \n\t" \ - "subps %%xmm1, %%xmm0 \n\t" \ - "mulps " #b "(%2,%0), %%xmm2 \n\t" \ - "subps %%xmm2, %%xmm4 \n\t" \ - - __asm__ volatile( - "1: \n\t" - "xorps %%xmm0, %%xmm0 \n\t" - "xorps %%xmm4, %%xmm4 \n\t" - - MULT( 0, 0) - MULT( 256, 64) - MULT( 512, 128) - MULT( 768, 192) - MULT(1024, 256) - MULT(1280, 320) - MULT(1536, 384) - MULT(1792, 448) - - "movaps %%xmm0, (%4,%0) \n\t" - "movaps %%xmm4, (%5,%0) \n\t" - "add $16, %0 \n\t" - "jl 1b \n\t" - :"+&r"(count) - :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a) - ); - -#undef MULT -} - -static void apply_window_mp3(float *in, float *win, int *unused, float *out, - int incr) -{ - LOCAL_ALIGNED_16(float, suma, [17]); - LOCAL_ALIGNED_16(float, sumb, [17]); - LOCAL_ALIGNED_16(float, sumc, [17]); - LOCAL_ALIGNED_16(float, sumd, [17]); - - float sum; - - /* copy to avoid wrap */ - __asm__ volatile( - "movaps 0(%0), %%xmm0 \n\t" \ - "movaps 16(%0), %%xmm1 \n\t" \ - "movaps 32(%0), %%xmm2 \n\t" \ - "movaps 48(%0), %%xmm3 \n\t" \ - "movaps %%xmm0, 0(%1) \n\t" \ - "movaps %%xmm1, 16(%1) \n\t" \ - "movaps %%xmm2, 32(%1) \n\t" \ - "movaps %%xmm3, 48(%1) \n\t" \ - "movaps 64(%0), %%xmm0 \n\t" \ - "movaps 80(%0), %%xmm1 \n\t" \ - "movaps 96(%0), %%xmm2 \n\t" \ - "movaps 112(%0), %%xmm3 \n\t" \ - "movaps %%xmm0, 64(%1) \n\t" \ - "movaps %%xmm1, 80(%1) \n\t" \ - "movaps %%xmm2, 96(%1) \n\t" \ - "movaps %%xmm3, 112(%1) \n\t" - ::"r"(in), "r"(in+512) - :"memory" - ); - - apply_window(in + 16, win , win + 512, suma, sumc, 16); - apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16); - - SUM8(MACS, suma[0], win + 32, in + 48); - - sumc[ 0] = 0; - sumb[16] = 0; - sumd[16] = 0; - -#define SUMS(suma, 
sumb, sumc, sumd, out1, out2) \ - "movups " #sumd "(%4), %%xmm0 \n\t" \ - "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \ - "subps " #suma "(%1), %%xmm0 \n\t" \ - "movaps %%xmm0," #out1 "(%0) \n\t" \ -\ - "movups " #sumc "(%3), %%xmm0 \n\t" \ - "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \ - "addps " #sumb "(%2), %%xmm0 \n\t" \ - "movaps %%xmm0," #out2 "(%0) \n\t" - - if (incr == 1) { - __asm__ volatile( - SUMS( 0, 48, 4, 52, 0, 112) - SUMS(16, 32, 20, 36, 16, 96) - SUMS(32, 16, 36, 20, 32, 80) - SUMS(48, 0, 52, 4, 48, 64) - - :"+&r"(out) - :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0]) - :"memory" - ); - out += 16*incr; - } else { - int j; - float *out2 = out + 32 * incr; - out[0 ] = -suma[ 0]; - out += incr; - out2 -= incr; - for(j=1;j<16;j++) { - *out = -suma[ j] + sumd[16-j]; - *out2 = sumb[16-j] + sumc[ j]; - out += incr; - out2 -= incr; - } - } - - sum = 0; - SUM8(MLSS, sum, win + 16 + 32, in + 32); - *out = sum; -} - -#endif /* HAVE_INLINE_ASM */ - -#define DECL_IMDCT_BLOCKS(CPU1, CPU2) \ -static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \ - int count, int switch_point, int block_type) \ -{ \ - int align_end = count - (count & 3); \ - int j; \ - for (j = 0; j < align_end; j+= 4) { \ - LOCAL_ALIGNED_16(float, tmpbuf, [1024]); \ - float *win = mdct_win_sse[switch_point && j < 4][block_type]; \ - /* apply window & overlap with previous buffer */ \ - \ - /* select window */ \ - ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf); \ - in += 4*18; \ - buf += 4*18; \ - out += 4; \ - } \ - for (; j < count; j++) { \ - /* apply window & overlap with previous buffer */ \ - \ - /* select window */ \ - int win_idx = (switch_point && j < 2) ? 0 : block_type; \ - float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))]; \ - \ - ff_imdct36_float_ ## CPU1(out, buf, in, win); \ - \ - in += 18; \ - buf++; \ - out++; \ - } \ -} - -DECL_IMDCT_BLOCKS(sse,sse) -DECL_IMDCT_BLOCKS(sse2,sse) -DECL_IMDCT_BLOCKS(sse3,sse) -DECL_IMDCT_BLOCKS(ssse3,sse) -DECL_IMDCT_BLOCKS(avx,avx) - -void ff_mpadsp_init_mmx(MPADSPContext *s) -{ - int mm_flags = av_get_cpu_flags(); - - int i, j; - for (j = 0; j < 4; j++) { - for (i = 0; i < 40; i ++) { - mdct_win_sse[0][j][4*i ] = ff_mdct_win_float[j ][i]; - mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i]; - mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j ][i]; - mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i]; - mdct_win_sse[1][j][4*i ] = ff_mdct_win_float[0 ][i]; - mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4 ][i]; - mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j ][i]; - mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i]; - } - } - -#if HAVE_INLINE_ASM - if (mm_flags & AV_CPU_FLAG_SSE2) { - s->apply_window_float = apply_window_mp3; - } -#endif /* HAVE_INLINE_ASM */ -#if HAVE_YASM - if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) { - s->imdct36_blocks_float = imdct36_blocks_avx; -#if HAVE_SSE - } else if (mm_flags & AV_CPU_FLAG_SSSE3) { - s->imdct36_blocks_float = imdct36_blocks_ssse3; - } else if (mm_flags & AV_CPU_FLAG_SSE3) { - s->imdct36_blocks_float = imdct36_blocks_sse3; - } else if (mm_flags & AV_CPU_FLAG_SSE2) { - s->imdct36_blocks_float = imdct36_blocks_sse2; - } else if (mm_flags & AV_CPU_FLAG_SSE) { - s->imdct36_blocks_float = imdct36_blocks_sse; -#endif /* HAVE_SSE */ - } -#endif /* HAVE_YASM */ -} diff --git a/libavcodec/x86/simple_idct.c b/libavcodec/x86/simple_idct.c new file mode 100644 index 0000000000..0e809337d3 --- /dev/null +++ b/libavcodec/x86/simple_idct.c @@ -0,0 +1,1169 @@ +/* + * Simple IDCT MMX + 
* + * Copyright (c) 2001, 2002 Michael Niedermayer + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "libavcodec/dsputil.h" +#include "libavcodec/simple_idct.h" +#include "libavutil/internal.h" +#include "libavutil/mem.h" +#include "dsputil_mmx.h" + +#if HAVE_INLINE_ASM + +/* +23170.475006 +22725.260826 +21406.727617 +19265.545870 +16384.000000 +12872.826198 +8866.956905 +4520.335430 +*/ +#define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5 +#define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 + +#define ROW_SHIFT 11 +#define COL_SHIFT 20 // 6 + +DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL; +DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL; + +DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= { + 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, +// 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0, +// 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16), + 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, + // the 1 = ((1<<(COL_SHIFT-1))/C4)< - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
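The C0..C7 constants above are the scaled 8-point DCT basis cosines described by their comments: cos(i*M_PI/16) * sqrt(2) * 2^14, rounded to the nearest integer, with C4 deliberately stored one below the rounded value (16383 instead of 16384, per its "- 0.5" comment). A throwaway sketch that reproduces the table, not part of the patch:

    #include <math.h>
    #include <stdio.h>

    /* Prints 23170, 22725, 21407, 19266, 16384, 12873, 8867, 4520, matching
     * the C0..C7 defines (the patch then uses 16383 for C4). */
    int main(void)
    {
        int i;
        for (i = 0; i < 8; i++) {
            double c = cos(i * M_PI / 16.0) * sqrt(2.0) * (1 << 14);
            printf("C%d = %d (%.6f)\n", i, (int)(c + 0.5), c);
        }
        return 0;
    }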
- * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "libavcodec/dsputil.h" -#include "libavcodec/simple_idct.h" -#include "libavutil/internal.h" -#include "libavutil/mem.h" -#include "dsputil_mmx.h" - -#if HAVE_INLINE_ASM - -/* -23170.475006 -22725.260826 -21406.727617 -19265.545870 -16384.000000 -12872.826198 -8866.956905 -4520.335430 -*/ -#define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5 -#define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 - -#define ROW_SHIFT 11 -#define COL_SHIFT 20 // 6 - -DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL; -DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL; - -DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= { - 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, -// 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0, -// 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16), - 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, - // the 1 = ((1<<(COL_SHIFT-1))/C4)< + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/cpu.h" +#include "libavutil/x86/asm.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/snow.h" +#include "libavcodec/dwt.h" +#include "dsputil_mmx.h" + +#if HAVE_INLINE_ASM + +static void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, IDWTELEM *temp, int width){ + const int w2= (width+1)>>1; + const int w_l= (width>>1); + const int w_r= w2 - 1; + int i; + + { // Lift 0 + IDWTELEM * const ref = b + w2 - 1; + IDWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice + // (the first time erroneously), we allow the SSE2 code to run an extra pass. + // The savings in code and time are well worth having to store this value and + // calculate b[0] correctly afterwards. 
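The horizontal and vertical compose routines in this file implement the inverse 9/7 lifting; the SIMD loops replicate the same four per-sample lifting steps that the scalar fallback applies to the unaligned leading elements. A scalar sketch of one full pass, assuming the IDWTELEM type and the W_AM..W_DS lifting constants from the DWT/Snow headers (the arithmetic below is copied from the fallback loops in the patch; the wrapper function itself is illustrative only):

    /* Scalar form of the four 9/7 lifting steps vectorized by the
     * SSE2/MMX vertical compose functions below. */
    static void vertical_compose97i_scalar(IDWTELEM *b0, IDWTELEM *b1,
                                           IDWTELEM *b2, IDWTELEM *b3,
                                           IDWTELEM *b4, IDWTELEM *b5,
                                           int width)
    {
        int i;
        for (i = 0; i < width; i++) {
            b4[i] -= (W_DM * (b3[i] + b5[i]) + W_DO) >> W_DS;
            b3[i] -= (W_CM * (b2[i] + b4[i]) + W_CO) >> W_CS;
            b2[i] += (W_BM * (b1[i] + b3[i]) + 4 * b2[i] + W_BO) >> W_BS;
            b1[i] += (W_AM * (b0[i] + b2[i]) + W_AO) >> W_AS;
        }
    }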
+ + i = 0; + __asm__ volatile( + "pcmpeqd %%xmm7, %%xmm7 \n\t" + "pcmpeqd %%xmm3, %%xmm3 \n\t" + "psllw $1, %%xmm3 \n\t" + "paddw %%xmm7, %%xmm3 \n\t" + "psllw $13, %%xmm3 \n\t" + ::); + for(; i>W_DS); + } + + { // Lift 1 + IDWTELEM * const dst = b+w2; + + i = 0; + for(; (((x86_reg)&dst[i]) & 0x1F) && i> W_BS); + } + + { // Lift 3 + IDWTELEM * const src = b+w2; + + i = 0; + for(; (((x86_reg)&temp[i]) & 0x1F) && i>W_AS); + } + for(; i>1]; + b[i] = b[i>>1]; + } + for (i-=62; i>=0; i-=64){ + __asm__ volatile( + "movdqa (%1), %%xmm0 \n\t" + "movdqa 16(%1), %%xmm2 \n\t" + "movdqa 32(%1), %%xmm4 \n\t" + "movdqa 48(%1), %%xmm6 \n\t" + "movdqa (%1), %%xmm1 \n\t" + "movdqa 16(%1), %%xmm3 \n\t" + "movdqa 32(%1), %%xmm5 \n\t" + "movdqa 48(%1), %%xmm7 \n\t" + "punpcklwd (%2), %%xmm0 \n\t" + "punpcklwd 16(%2), %%xmm2 \n\t" + "punpcklwd 32(%2), %%xmm4 \n\t" + "punpcklwd 48(%2), %%xmm6 \n\t" + "movdqa %%xmm0, (%0) \n\t" + "movdqa %%xmm2, 32(%0) \n\t" + "movdqa %%xmm4, 64(%0) \n\t" + "movdqa %%xmm6, 96(%0) \n\t" + "punpckhwd (%2), %%xmm1 \n\t" + "punpckhwd 16(%2), %%xmm3 \n\t" + "punpckhwd 32(%2), %%xmm5 \n\t" + "punpckhwd 48(%2), %%xmm7 \n\t" + "movdqa %%xmm1, 16(%0) \n\t" + "movdqa %%xmm3, 48(%0) \n\t" + "movdqa %%xmm5, 80(%0) \n\t" + "movdqa %%xmm7, 112(%0) \n\t" + :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1]) + : "memory" + ); + } + } +} + +static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int width){ + const int w2= (width+1)>>1; + const int w_l= (width>>1); + const int w_r= w2 - 1; + int i; + + { // Lift 0 + IDWTELEM * const ref = b + w2 - 1; + + i = 1; + b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS); + __asm__ volatile( + "pcmpeqw %%mm7, %%mm7 \n\t" + "pcmpeqw %%mm3, %%mm3 \n\t" + "psllw $1, %%mm3 \n\t" + "paddw %%mm7, %%mm3 \n\t" + "psllw $13, %%mm3 \n\t" + ::); + for(; i> W_BS); + __asm__ volatile( + "psllw $15, %%mm7 \n\t" + "pcmpeqw %%mm6, %%mm6 \n\t" + "psrlw $13, %%mm6 \n\t" + "paddw %%mm7, %%mm6 \n\t" + ::); + for(; i>1]; + b[i] = b[i>>1]; + } + for (i-=30; i>=0; i-=32){ + __asm__ volatile( + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm2 \n\t" + "movq 16(%1), %%mm4 \n\t" + "movq 24(%1), %%mm6 \n\t" + "movq (%1), %%mm1 \n\t" + "movq 8(%1), %%mm3 \n\t" + "movq 16(%1), %%mm5 \n\t" + "movq 24(%1), %%mm7 \n\t" + "punpcklwd (%2), %%mm0 \n\t" + "punpcklwd 8(%2), %%mm2 \n\t" + "punpcklwd 16(%2), %%mm4 \n\t" + "punpcklwd 24(%2), %%mm6 \n\t" + "movq %%mm0, (%0) \n\t" + "movq %%mm2, 16(%0) \n\t" + "movq %%mm4, 32(%0) \n\t" + "movq %%mm6, 48(%0) \n\t" + "punpckhwd (%2), %%mm1 \n\t" + "punpckhwd 8(%2), %%mm3 \n\t" + "punpckhwd 16(%2), %%mm5 \n\t" + "punpckhwd 24(%2), %%mm7 \n\t" + "movq %%mm1, 8(%0) \n\t" + "movq %%mm3, 24(%0) \n\t" + "movq %%mm5, 40(%0) \n\t" + "movq %%mm7, 56(%0) \n\t" + :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1]) + : "memory" + ); + } + } +} + +#if HAVE_7REGS +#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\ + ""op" ("r",%%"REG_d"), %%"t0" \n\t"\ + ""op" 16("r",%%"REG_d"), %%"t1" \n\t"\ + ""op" 32("r",%%"REG_d"), %%"t2" \n\t"\ + ""op" 48("r",%%"REG_d"), %%"t3" \n\t" + +#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\ + snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3) + +#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\ + snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3) + +#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\ + "psubw %%"s0", %%"t0" \n\t"\ + "psubw %%"s1", %%"t1" \n\t"\ + "psubw %%"s2", %%"t2" \n\t"\ + "psubw %%"s3", %%"t3" \n\t" + +#define 
snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\ + "movdqa %%"s0", ("w",%%"REG_d") \n\t"\ + "movdqa %%"s1", 16("w",%%"REG_d") \n\t"\ + "movdqa %%"s2", 32("w",%%"REG_d") \n\t"\ + "movdqa %%"s3", 48("w",%%"REG_d") \n\t" + +#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\ + "psraw $"n", %%"t0" \n\t"\ + "psraw $"n", %%"t1" \n\t"\ + "psraw $"n", %%"t2" \n\t"\ + "psraw $"n", %%"t3" \n\t" + +#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\ + "paddw %%"s0", %%"t0" \n\t"\ + "paddw %%"s1", %%"t1" \n\t"\ + "paddw %%"s2", %%"t2" \n\t"\ + "paddw %%"s3", %%"t3" \n\t" + +#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\ + "pmulhw %%"s0", %%"t0" \n\t"\ + "pmulhw %%"s1", %%"t1" \n\t"\ + "pmulhw %%"s2", %%"t2" \n\t"\ + "pmulhw %%"s3", %%"t3" \n\t" + +#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\ + "movdqa %%"s0", %%"t0" \n\t"\ + "movdqa %%"s1", %%"t1" \n\t"\ + "movdqa %%"s2", %%"t2" \n\t"\ + "movdqa %%"s3", %%"t3" \n\t" + +static void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){ + x86_reg i = width; + + while(i & 0x1F) + { + i--; + b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS; + b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS; + b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS; + b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS; + } + i+=i; + + __asm__ volatile ( + "jmp 2f \n\t" + "1: \n\t" + snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6") + + + "pcmpeqw %%xmm0, %%xmm0 \n\t" + "pcmpeqw %%xmm2, %%xmm2 \n\t" + "paddw %%xmm2, %%xmm2 \n\t" + "paddw %%xmm0, %%xmm2 \n\t" + "psllw $13, %%xmm2 \n\t" + snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7") + snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7") + snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7") + snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7") + snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7") + snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6") + + "pcmpeqw %%xmm7, %%xmm7 \n\t" + "pcmpeqw %%xmm5, %%xmm5 \n\t" + "psllw $15, %%xmm7 \n\t" + "psrlw $13, %%xmm5 \n\t" + "paddw %%xmm7, %%xmm5 \n\t" + snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6") + "movq (%2,%%"REG_d"), %%xmm1 \n\t" + "movq 8(%2,%%"REG_d"), %%xmm3 \n\t" + "paddw %%xmm7, %%xmm1 \n\t" + "paddw %%xmm7, %%xmm3 \n\t" + "pavgw %%xmm1, %%xmm0 \n\t" + "pavgw %%xmm3, %%xmm2 \n\t" + "movq 16(%2,%%"REG_d"), %%xmm1 \n\t" + "movq 24(%2,%%"REG_d"), %%xmm3 \n\t" + "paddw %%xmm7, %%xmm1 \n\t" + "paddw %%xmm7, %%xmm3 \n\t" + "pavgw %%xmm1, %%xmm4 \n\t" + "pavgw %%xmm3, %%xmm6 \n\t" + snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6") + + snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7") + 
snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6") + snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6") + + "2: \n\t" + "sub $64, %%"REG_d" \n\t" + "jge 1b \n\t" + :"+d"(i) + :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5)); +} + +#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\ + ""op" ("r",%%"REG_d"), %%"t0" \n\t"\ + ""op" 8("r",%%"REG_d"), %%"t1" \n\t"\ + ""op" 16("r",%%"REG_d"), %%"t2" \n\t"\ + ""op" 24("r",%%"REG_d"), %%"t3" \n\t" + +#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\ + snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3) + +#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\ + snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3) + +#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\ + "movq %%"s0", ("w",%%"REG_d") \n\t"\ + "movq %%"s1", 8("w",%%"REG_d") \n\t"\ + "movq %%"s2", 16("w",%%"REG_d") \n\t"\ + "movq %%"s3", 24("w",%%"REG_d") \n\t" + +#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\ + "movq %%"s0", %%"t0" \n\t"\ + "movq %%"s1", %%"t1" \n\t"\ + "movq %%"s2", %%"t2" \n\t"\ + "movq %%"s3", %%"t3" \n\t" + + +static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){ + x86_reg i = width; + while(i & 15) + { + i--; + b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS; + b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS; + b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS; + b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS; + } + i+=i; + __asm__ volatile( + "jmp 2f \n\t" + "1: \n\t" + + snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7") + snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7") + "pcmpeqw %%mm0, %%mm0 \n\t" + "pcmpeqw %%mm2, %%mm2 \n\t" + "paddw %%mm2, %%mm2 \n\t" + "paddw %%mm0, %%mm2 \n\t" + "psllw $13, %%mm2 \n\t" + snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7") + snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7") + snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7") + snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7") + snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7") + snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6") + "pcmpeqw %%mm7, %%mm7 \n\t" + "pcmpeqw %%mm5, %%mm5 \n\t" + "psllw $15, %%mm7 \n\t" + "psrlw $13, %%mm5 \n\t" + "paddw %%mm7, %%mm5 \n\t" + snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6") + "movq (%2,%%"REG_d"), %%mm1 \n\t" + "movq 8(%2,%%"REG_d"), %%mm3 \n\t" + "paddw %%mm7, %%mm1 \n\t" + "paddw %%mm7, %%mm3 \n\t" + "pavgw %%mm1, %%mm0 \n\t" + "pavgw %%mm3, %%mm2 \n\t" + "movq 16(%2,%%"REG_d"), %%mm1 \n\t" + "movq 24(%2,%%"REG_d"), %%mm3 \n\t" + "paddw %%mm7, %%mm1 \n\t" + "paddw %%mm7, %%mm3 \n\t" + "pavgw %%mm1, %%mm4 \n\t" + "pavgw %%mm3, %%mm6 \n\t" + snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6") + snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6") + + snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6") + 
snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") + snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6") + snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6") + snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6") + + "2: \n\t" + "sub $32, %%"REG_d" \n\t" + "jge 1b \n\t" + :"+d"(i) + :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5)); +} +#endif //HAVE_7REGS + +#define snow_inner_add_yblock_sse2_header \ + IDWTELEM * * dst_array = sb->line + src_y;\ + x86_reg tmp;\ + __asm__ volatile(\ + "mov %7, %%"REG_c" \n\t"\ + "mov %6, %2 \n\t"\ + "mov %4, %%"REG_S" \n\t"\ + "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\ + "pcmpeqd %%xmm3, %%xmm3 \n\t"\ + "psllw $15, %%xmm3 \n\t"\ + "psrlw $12, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\ + "1: \n\t"\ + "mov %1, %%"REG_D" \n\t"\ + "mov (%%"REG_D"), %%"REG_D" \n\t"\ + "add %3, %%"REG_D" \n\t" + +#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\ + "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ + "movq (%%"REG_d"), %%"out_reg1" \n\t"\ + "movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\ + "punpcklbw %%xmm7, %%"out_reg1" \n\t"\ + "punpcklbw %%xmm7, %%"out_reg2" \n\t"\ + "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\ + "movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\ + "punpcklbw %%xmm7, %%xmm0 \n\t"\ + "punpcklbw %%xmm7, %%xmm4 \n\t"\ + "pmullw %%xmm0, %%"out_reg1" \n\t"\ + "pmullw %%xmm4, %%"out_reg2" \n\t" + +#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\ + "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ + "movq (%%"REG_d"), %%"out_reg1" \n\t"\ + "movq 8(%%"REG_d"), %%"out_reg2" \n\t"\ + "punpcklbw %%xmm7, %%"out_reg1" \n\t"\ + "punpcklbw %%xmm7, %%"out_reg2" \n\t"\ + "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\ + "movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\ + "punpcklbw %%xmm7, %%xmm0 \n\t"\ + "punpcklbw %%xmm7, %%xmm4 \n\t"\ + "pmullw %%xmm0, %%"out_reg1" \n\t"\ + "pmullw %%xmm4, %%"out_reg2" \n\t" + +#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \ + snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\ + "paddusw %%xmm2, %%xmm1 \n\t"\ + "paddusw %%xmm6, %%xmm5 \n\t" + +#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \ + snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\ + "paddusw %%xmm2, %%xmm1 \n\t"\ + "paddusw %%xmm6, %%xmm5 \n\t" + +#define snow_inner_add_yblock_sse2_end_common1\ + "add $32, %%"REG_S" \n\t"\ + "add %%"REG_c", %0 \n\t"\ + "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\ + "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\ + "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\ + "add %%"REG_c", (%%"REG_a") \n\t" + +#define snow_inner_add_yblock_sse2_end_common2\ + "jnz 1b \n\t"\ + :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\ + :\ + "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\ + "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); + +#define snow_inner_add_yblock_sse2_end_8\ + "sal $1, %%"REG_c" \n\t"\ + "addl $"PTR_SIZE"*2, %1 \n\t"\ + snow_inner_add_yblock_sse2_end_common1\ + "sar $1, %%"REG_c" \n\t"\ + "sub $2, %2 \n\t"\ + snow_inner_add_yblock_sse2_end_common2 + +#define snow_inner_add_yblock_sse2_end_16\ + "addl $"PTR_SIZE"*1, %1 \n\t"\ + snow_inner_add_yblock_sse2_end_common1\ + "dec %2 \n\t"\ + snow_inner_add_yblock_sse2_end_common2 + +static void 
inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, + int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ +snow_inner_add_yblock_sse2_header +snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0") +snow_inner_add_yblock_sse2_accum_8("2", "8") +snow_inner_add_yblock_sse2_accum_8("1", "128") +snow_inner_add_yblock_sse2_accum_8("0", "136") + + "mov %0, %%"REG_d" \n\t" + "movdqa (%%"REG_D"), %%xmm0 \n\t" + "movdqa %%xmm1, %%xmm2 \n\t" + + "punpckhwd %%xmm7, %%xmm1 \n\t" + "punpcklwd %%xmm7, %%xmm2 \n\t" + "paddd %%xmm2, %%xmm0 \n\t" + "movdqa 16(%%"REG_D"), %%xmm2 \n\t" + "paddd %%xmm1, %%xmm2 \n\t" + "paddd %%xmm3, %%xmm0 \n\t" + "paddd %%xmm3, %%xmm2 \n\t" + + "mov %1, %%"REG_D" \n\t" + "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t" + "add %3, %%"REG_D" \n\t" + + "movdqa (%%"REG_D"), %%xmm4 \n\t" + "movdqa %%xmm5, %%xmm6 \n\t" + "punpckhwd %%xmm7, %%xmm5 \n\t" + "punpcklwd %%xmm7, %%xmm6 \n\t" + "paddd %%xmm6, %%xmm4 \n\t" + "movdqa 16(%%"REG_D"), %%xmm6 \n\t" + "paddd %%xmm5, %%xmm6 \n\t" + "paddd %%xmm3, %%xmm4 \n\t" + "paddd %%xmm3, %%xmm6 \n\t" + + "psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */ + "psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */ + "packssdw %%xmm2, %%xmm0 \n\t" + "packuswb %%xmm7, %%xmm0 \n\t" + "movq %%xmm0, (%%"REG_d") \n\t" + + "psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */ + "psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */ + "packssdw %%xmm6, %%xmm4 \n\t" + "packuswb %%xmm7, %%xmm4 \n\t" + "movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t" +snow_inner_add_yblock_sse2_end_8 +} + +static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, + int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ +snow_inner_add_yblock_sse2_header +snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0") +snow_inner_add_yblock_sse2_accum_16("2", "16") +snow_inner_add_yblock_sse2_accum_16("1", "512") +snow_inner_add_yblock_sse2_accum_16("0", "528") + + "mov %0, %%"REG_d" \n\t" + "psrlw $4, %%xmm1 \n\t" + "psrlw $4, %%xmm5 \n\t" + "paddw (%%"REG_D"), %%xmm1 \n\t" + "paddw 16(%%"REG_D"), %%xmm5 \n\t" + "paddw %%xmm3, %%xmm1 \n\t" + "paddw %%xmm3, %%xmm5 \n\t" + "psraw $4, %%xmm1 \n\t" /* FRAC_BITS. */ + "psraw $4, %%xmm5 \n\t" /* FRAC_BITS. 
*/ + "packuswb %%xmm5, %%xmm1 \n\t" + + "movdqu %%xmm1, (%%"REG_d") \n\t" + +snow_inner_add_yblock_sse2_end_16 +} + +#define snow_inner_add_yblock_mmx_header \ + IDWTELEM * * dst_array = sb->line + src_y;\ + x86_reg tmp;\ + __asm__ volatile(\ + "mov %7, %%"REG_c" \n\t"\ + "mov %6, %2 \n\t"\ + "mov %4, %%"REG_S" \n\t"\ + "pxor %%mm7, %%mm7 \n\t" /* 0 */\ + "pcmpeqd %%mm3, %%mm3 \n\t"\ + "psllw $15, %%mm3 \n\t"\ + "psrlw $12, %%mm3 \n\t" /* FRAC_BITS >> 1 */\ + "1: \n\t"\ + "mov %1, %%"REG_D" \n\t"\ + "mov (%%"REG_D"), %%"REG_D" \n\t"\ + "add %3, %%"REG_D" \n\t" + +#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\ + "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ + "movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\ + "movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\ + "punpcklbw %%mm7, %%"out_reg1" \n\t"\ + "punpcklbw %%mm7, %%"out_reg2" \n\t"\ + "movd "s_offset"(%%"REG_S"), %%mm0 \n\t"\ + "movd "s_offset"+4(%%"REG_S"), %%mm4 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpcklbw %%mm7, %%mm4 \n\t"\ + "pmullw %%mm0, %%"out_reg1" \n\t"\ + "pmullw %%mm4, %%"out_reg2" \n\t" + +#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \ + snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\ + "paddusw %%mm2, %%mm1 \n\t"\ + "paddusw %%mm6, %%mm5 \n\t" + +#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\ + "mov %0, %%"REG_d" \n\t"\ + "psrlw $4, %%mm1 \n\t"\ + "psrlw $4, %%mm5 \n\t"\ + "paddw "read_offset"(%%"REG_D"), %%mm1 \n\t"\ + "paddw "read_offset"+8(%%"REG_D"), %%mm5 \n\t"\ + "paddw %%mm3, %%mm1 \n\t"\ + "paddw %%mm3, %%mm5 \n\t"\ + "psraw $4, %%mm1 \n\t"\ + "psraw $4, %%mm5 \n\t"\ + "packuswb %%mm5, %%mm1 \n\t"\ + "movq %%mm1, "write_offset"(%%"REG_d") \n\t" + +#define snow_inner_add_yblock_mmx_end(s_step)\ + "add $"s_step", %%"REG_S" \n\t"\ + "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\ + "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\ + "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\ + "add %%"REG_c", (%%"REG_a") \n\t"\ + "add"OPSIZE " $"PTR_SIZE"*1, %1 \n\t"\ + "add %%"REG_c", %0 \n\t"\ + "dec %2 \n\t"\ + "jnz 1b \n\t"\ + :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\ + :\ + "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\ + "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); + +static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, + int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ +snow_inner_add_yblock_mmx_header +snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0") +snow_inner_add_yblock_mmx_accum("2", "8", "0") +snow_inner_add_yblock_mmx_accum("1", "128", "0") +snow_inner_add_yblock_mmx_accum("0", "136", "0") +snow_inner_add_yblock_mmx_mix("0", "0") +snow_inner_add_yblock_mmx_end("16") +} + +static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, + int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ +snow_inner_add_yblock_mmx_header +snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0") +snow_inner_add_yblock_mmx_accum("2", "16", "0") +snow_inner_add_yblock_mmx_accum("1", "512", "0") +snow_inner_add_yblock_mmx_accum("0", "528", "0") +snow_inner_add_yblock_mmx_mix("0", "0") + +snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8") +snow_inner_add_yblock_mmx_accum("2", "24", "8") +snow_inner_add_yblock_mmx_accum("1", 
"520", "8") +snow_inner_add_yblock_mmx_accum("0", "536", "8") +snow_inner_add_yblock_mmx_mix("16", "8") +snow_inner_add_yblock_mmx_end("32") +} + +static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, + int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){ + + if (b_w == 16) + inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); + else if (b_w == 8 && obmc_stride == 16) { + if (!(b_h & 1)) + inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); + else + inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); + } else + ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); +} + +static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, + int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){ + if (b_w == 16) + inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); + else if (b_w == 8 && obmc_stride == 16) + inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); + else + ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); +} + +#endif /* HAVE_INLINE_ASM */ + +void ff_dwt_init_x86(DWTContext *c) +{ +#if HAVE_INLINE_ASM + int mm_flags = av_get_cpu_flags(); + + if (mm_flags & AV_CPU_FLAG_MMX) { + if(mm_flags & AV_CPU_FLAG_SSE2 & 0){ + c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2; +#if HAVE_7REGS + c->vertical_compose97i = ff_snow_vertical_compose97i_sse2; +#endif + c->inner_add_yblock = ff_snow_inner_add_yblock_sse2; + } + else{ + if (mm_flags & AV_CPU_FLAG_MMXEXT) { + c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx; +#if HAVE_7REGS + c->vertical_compose97i = ff_snow_vertical_compose97i_mmx; +#endif + } + c->inner_add_yblock = ff_snow_inner_add_yblock_mmx; + } + } +#endif /* HAVE_INLINE_ASM */ +} diff --git a/libavcodec/x86/snowdsp_mmx.c b/libavcodec/x86/snowdsp_mmx.c deleted file mode 100644 index fb190d8d8d..0000000000 --- a/libavcodec/x86/snowdsp_mmx.c +++ /dev/null @@ -1,902 +0,0 @@ -/* - * MMX and SSE2 optimized snow DSP utils - * Copyright (c) 2005-2006 Robert Edele - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/cpu.h" -#include "libavutil/x86/asm.h" -#include "libavcodec/avcodec.h" -#include "libavcodec/snow.h" -#include "libavcodec/dwt.h" -#include "dsputil_mmx.h" - -#if HAVE_INLINE_ASM - -static void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, IDWTELEM *temp, int width){ - const int w2= (width+1)>>1; - const int w_l= (width>>1); - const int w_r= w2 - 1; - int i; - - { // Lift 0 - IDWTELEM * const ref = b + w2 - 1; - IDWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice - // (the first time erroneously), we allow the SSE2 code to run an extra pass. - // The savings in code and time are well worth having to store this value and - // calculate b[0] correctly afterwards. - - i = 0; - __asm__ volatile( - "pcmpeqd %%xmm7, %%xmm7 \n\t" - "pcmpeqd %%xmm3, %%xmm3 \n\t" - "psllw $1, %%xmm3 \n\t" - "paddw %%xmm7, %%xmm3 \n\t" - "psllw $13, %%xmm3 \n\t" - ::); - for(; i>W_DS); - } - - { // Lift 1 - IDWTELEM * const dst = b+w2; - - i = 0; - for(; (((x86_reg)&dst[i]) & 0x1F) && i> W_BS); - } - - { // Lift 3 - IDWTELEM * const src = b+w2; - - i = 0; - for(; (((x86_reg)&temp[i]) & 0x1F) && i>W_AS); - } - for(; i>1]; - b[i] = b[i>>1]; - } - for (i-=62; i>=0; i-=64){ - __asm__ volatile( - "movdqa (%1), %%xmm0 \n\t" - "movdqa 16(%1), %%xmm2 \n\t" - "movdqa 32(%1), %%xmm4 \n\t" - "movdqa 48(%1), %%xmm6 \n\t" - "movdqa (%1), %%xmm1 \n\t" - "movdqa 16(%1), %%xmm3 \n\t" - "movdqa 32(%1), %%xmm5 \n\t" - "movdqa 48(%1), %%xmm7 \n\t" - "punpcklwd (%2), %%xmm0 \n\t" - "punpcklwd 16(%2), %%xmm2 \n\t" - "punpcklwd 32(%2), %%xmm4 \n\t" - "punpcklwd 48(%2), %%xmm6 \n\t" - "movdqa %%xmm0, (%0) \n\t" - "movdqa %%xmm2, 32(%0) \n\t" - "movdqa %%xmm4, 64(%0) \n\t" - "movdqa %%xmm6, 96(%0) \n\t" - "punpckhwd (%2), %%xmm1 \n\t" - "punpckhwd 16(%2), %%xmm3 \n\t" - "punpckhwd 32(%2), %%xmm5 \n\t" - "punpckhwd 48(%2), %%xmm7 \n\t" - "movdqa %%xmm1, 16(%0) \n\t" - "movdqa %%xmm3, 48(%0) \n\t" - "movdqa %%xmm5, 80(%0) \n\t" - "movdqa %%xmm7, 112(%0) \n\t" - :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1]) - : "memory" - ); - } - } -} - -static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int width){ - const int w2= (width+1)>>1; - const int w_l= (width>>1); - const int w_r= w2 - 1; - int i; - - { // Lift 0 - IDWTELEM * const ref = b + w2 - 1; - - i = 1; - b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS); - __asm__ volatile( - "pcmpeqw %%mm7, %%mm7 \n\t" - "pcmpeqw %%mm3, %%mm3 \n\t" - "psllw $1, %%mm3 \n\t" - "paddw %%mm7, %%mm3 \n\t" - "psllw $13, %%mm3 \n\t" - ::); - for(; i> W_BS); - __asm__ volatile( - "psllw $15, %%mm7 \n\t" - "pcmpeqw %%mm6, %%mm6 \n\t" - "psrlw $13, %%mm6 \n\t" - "paddw %%mm7, %%mm6 \n\t" - ::); - for(; i>1]; - b[i] = b[i>>1]; - } - for (i-=30; i>=0; i-=32){ - __asm__ volatile( - "movq (%1), %%mm0 \n\t" - "movq 8(%1), %%mm2 \n\t" - "movq 16(%1), %%mm4 \n\t" - "movq 24(%1), %%mm6 \n\t" - "movq (%1), %%mm1 \n\t" - "movq 8(%1), %%mm3 \n\t" - "movq 16(%1), %%mm5 \n\t" - "movq 24(%1), %%mm7 \n\t" - "punpcklwd (%2), %%mm0 \n\t" - "punpcklwd 8(%2), %%mm2 \n\t" - "punpcklwd 16(%2), %%mm4 \n\t" - "punpcklwd 24(%2), %%mm6 \n\t" - "movq %%mm0, (%0) \n\t" - "movq %%mm2, 16(%0) \n\t" - "movq %%mm4, 32(%0) \n\t" - "movq %%mm6, 48(%0) \n\t" - "punpckhwd (%2), %%mm1 \n\t" - "punpckhwd 8(%2), %%mm3 \n\t" - "punpckhwd 16(%2), %%mm5 
\n\t" - "punpckhwd 24(%2), %%mm7 \n\t" - "movq %%mm1, 8(%0) \n\t" - "movq %%mm3, 24(%0) \n\t" - "movq %%mm5, 40(%0) \n\t" - "movq %%mm7, 56(%0) \n\t" - :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1]) - : "memory" - ); - } - } -} - -#if HAVE_7REGS -#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\ - ""op" ("r",%%"REG_d"), %%"t0" \n\t"\ - ""op" 16("r",%%"REG_d"), %%"t1" \n\t"\ - ""op" 32("r",%%"REG_d"), %%"t2" \n\t"\ - ""op" 48("r",%%"REG_d"), %%"t3" \n\t" - -#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\ - snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3) - -#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\ - snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3) - -#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\ - "psubw %%"s0", %%"t0" \n\t"\ - "psubw %%"s1", %%"t1" \n\t"\ - "psubw %%"s2", %%"t2" \n\t"\ - "psubw %%"s3", %%"t3" \n\t" - -#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\ - "movdqa %%"s0", ("w",%%"REG_d") \n\t"\ - "movdqa %%"s1", 16("w",%%"REG_d") \n\t"\ - "movdqa %%"s2", 32("w",%%"REG_d") \n\t"\ - "movdqa %%"s3", 48("w",%%"REG_d") \n\t" - -#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\ - "psraw $"n", %%"t0" \n\t"\ - "psraw $"n", %%"t1" \n\t"\ - "psraw $"n", %%"t2" \n\t"\ - "psraw $"n", %%"t3" \n\t" - -#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\ - "paddw %%"s0", %%"t0" \n\t"\ - "paddw %%"s1", %%"t1" \n\t"\ - "paddw %%"s2", %%"t2" \n\t"\ - "paddw %%"s3", %%"t3" \n\t" - -#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\ - "pmulhw %%"s0", %%"t0" \n\t"\ - "pmulhw %%"s1", %%"t1" \n\t"\ - "pmulhw %%"s2", %%"t2" \n\t"\ - "pmulhw %%"s3", %%"t3" \n\t" - -#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\ - "movdqa %%"s0", %%"t0" \n\t"\ - "movdqa %%"s1", %%"t1" \n\t"\ - "movdqa %%"s2", %%"t2" \n\t"\ - "movdqa %%"s3", %%"t3" \n\t" - -static void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){ - x86_reg i = width; - - while(i & 0x1F) - { - i--; - b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS; - b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS; - b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS; - b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS; - } - i+=i; - - __asm__ volatile ( - "jmp 2f \n\t" - "1: \n\t" - snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6") - - - "pcmpeqw %%xmm0, %%xmm0 \n\t" - "pcmpeqw %%xmm2, %%xmm2 \n\t" - "paddw %%xmm2, %%xmm2 \n\t" - "paddw %%xmm0, %%xmm2 \n\t" - "psllw $13, %%xmm2 \n\t" - snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7") - snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7") - snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7") - snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7") - snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7") - snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6") - - "pcmpeqw %%xmm7, %%xmm7 \n\t" - "pcmpeqw %%xmm5, %%xmm5 \n\t" - "psllw $15, %%xmm7 \n\t" - "psrlw $13, %%xmm5 \n\t" - "paddw %%xmm7, %%xmm5 \n\t" - snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6") - "movq (%2,%%"REG_d"), %%xmm1 \n\t" - "movq 8(%2,%%"REG_d"), %%xmm3 \n\t" - "paddw 
%%xmm7, %%xmm1 \n\t" - "paddw %%xmm7, %%xmm3 \n\t" - "pavgw %%xmm1, %%xmm0 \n\t" - "pavgw %%xmm3, %%xmm2 \n\t" - "movq 16(%2,%%"REG_d"), %%xmm1 \n\t" - "movq 24(%2,%%"REG_d"), %%xmm3 \n\t" - "paddw %%xmm7, %%xmm1 \n\t" - "paddw %%xmm7, %%xmm3 \n\t" - "pavgw %%xmm1, %%xmm4 \n\t" - "pavgw %%xmm3, %%xmm6 \n\t" - snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6") - - snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7") - snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6") - snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6") - - "2: \n\t" - "sub $64, %%"REG_d" \n\t" - "jge 1b \n\t" - :"+d"(i) - :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5)); -} - -#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\ - ""op" ("r",%%"REG_d"), %%"t0" \n\t"\ - ""op" 8("r",%%"REG_d"), %%"t1" \n\t"\ - ""op" 16("r",%%"REG_d"), %%"t2" \n\t"\ - ""op" 24("r",%%"REG_d"), %%"t3" \n\t" - -#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\ - snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3) - -#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\ - snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3) - -#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\ - "movq %%"s0", ("w",%%"REG_d") \n\t"\ - "movq %%"s1", 8("w",%%"REG_d") \n\t"\ - "movq %%"s2", 16("w",%%"REG_d") \n\t"\ - "movq %%"s3", 24("w",%%"REG_d") \n\t" - -#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\ - "movq %%"s0", %%"t0" \n\t"\ - "movq %%"s1", %%"t1" \n\t"\ - "movq %%"s2", %%"t2" \n\t"\ - "movq %%"s3", %%"t3" \n\t" - - -static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){ - x86_reg i = width; - while(i & 15) - { - i--; - b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS; - b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS; - b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS; - b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS; - } - i+=i; - __asm__ volatile( - "jmp 2f \n\t" - "1: \n\t" - - snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7") - snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7") - "pcmpeqw %%mm0, %%mm0 \n\t" - "pcmpeqw %%mm2, %%mm2 \n\t" - "paddw %%mm2, %%mm2 \n\t" - "paddw %%mm0, %%mm2 \n\t" - "psllw $13, %%mm2 \n\t" - snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7") - snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7") - snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7") - snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7") - snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6") - snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7") - snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") - snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6") - "pcmpeqw %%mm7, %%mm7 \n\t" - "pcmpeqw %%mm5, %%mm5 \n\t" - "psllw $15, %%mm7 \n\t" - "psrlw $13, %%mm5 \n\t" - "paddw 
%%mm7, %%mm5 \n\t" - snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6") - "movq (%2,%%"REG_d"), %%mm1 \n\t" - "movq 8(%2,%%"REG_d"), %%mm3 \n\t" - "paddw %%mm7, %%mm1 \n\t" - "paddw %%mm7, %%mm3 \n\t" - "pavgw %%mm1, %%mm0 \n\t" - "pavgw %%mm3, %%mm2 \n\t" - "movq 16(%2,%%"REG_d"), %%mm1 \n\t" - "movq 24(%2,%%"REG_d"), %%mm3 \n\t" - "paddw %%mm7, %%mm1 \n\t" - "paddw %%mm7, %%mm3 \n\t" - "pavgw %%mm1, %%mm4 \n\t" - "pavgw %%mm3, %%mm6 \n\t" - snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6") - snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6") - snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6") - - snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6") - snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6") - snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6") - snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6") - snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") - snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6") - snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") - snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6") - snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6") - - "2: \n\t" - "sub $32, %%"REG_d" \n\t" - "jge 1b \n\t" - :"+d"(i) - :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5)); -} -#endif //HAVE_7REGS - -#define snow_inner_add_yblock_sse2_header \ - IDWTELEM * * dst_array = sb->line + src_y;\ - x86_reg tmp;\ - __asm__ volatile(\ - "mov %7, %%"REG_c" \n\t"\ - "mov %6, %2 \n\t"\ - "mov %4, %%"REG_S" \n\t"\ - "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\ - "pcmpeqd %%xmm3, %%xmm3 \n\t"\ - "psllw $15, %%xmm3 \n\t"\ - "psrlw $12, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\ - "1: \n\t"\ - "mov %1, %%"REG_D" \n\t"\ - "mov (%%"REG_D"), %%"REG_D" \n\t"\ - "add %3, %%"REG_D" \n\t" - -#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\ - "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ - "movq (%%"REG_d"), %%"out_reg1" \n\t"\ - "movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\ - "punpcklbw %%xmm7, %%"out_reg1" \n\t"\ - "punpcklbw %%xmm7, %%"out_reg2" \n\t"\ - "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\ - "movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\ - "punpcklbw %%xmm7, %%xmm0 \n\t"\ - "punpcklbw %%xmm7, %%xmm4 \n\t"\ - "pmullw %%xmm0, %%"out_reg1" \n\t"\ - "pmullw %%xmm4, %%"out_reg2" \n\t" - -#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\ - "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ - "movq (%%"REG_d"), %%"out_reg1" \n\t"\ - "movq 8(%%"REG_d"), %%"out_reg2" \n\t"\ - "punpcklbw %%xmm7, %%"out_reg1" \n\t"\ - "punpcklbw %%xmm7, %%"out_reg2" \n\t"\ - "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\ - "movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\ - "punpcklbw %%xmm7, %%xmm0 \n\t"\ - "punpcklbw %%xmm7, %%xmm4 \n\t"\ - "pmullw %%xmm0, %%"out_reg1" \n\t"\ - "pmullw %%xmm4, %%"out_reg2" \n\t" - -#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \ - snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\ - "paddusw %%xmm2, %%xmm1 \n\t"\ - "paddusw %%xmm6, %%xmm5 \n\t" - -#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \ - snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\ - "paddusw %%xmm2, %%xmm1 \n\t"\ - "paddusw %%xmm6, %%xmm5 \n\t" - -#define snow_inner_add_yblock_sse2_end_common1\ - "add $32, %%"REG_S" \n\t"\ - "add %%"REG_c", %0 \n\t"\ - "add %%"REG_c", 
"PTR_SIZE"*3(%%"REG_a");\n\t"\ - "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\ - "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\ - "add %%"REG_c", (%%"REG_a") \n\t" - -#define snow_inner_add_yblock_sse2_end_common2\ - "jnz 1b \n\t"\ - :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\ - :\ - "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\ - "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); - -#define snow_inner_add_yblock_sse2_end_8\ - "sal $1, %%"REG_c" \n\t"\ - "addl $"PTR_SIZE"*2, %1 \n\t"\ - snow_inner_add_yblock_sse2_end_common1\ - "sar $1, %%"REG_c" \n\t"\ - "sub $2, %2 \n\t"\ - snow_inner_add_yblock_sse2_end_common2 - -#define snow_inner_add_yblock_sse2_end_16\ - "addl $"PTR_SIZE"*1, %1 \n\t"\ - snow_inner_add_yblock_sse2_end_common1\ - "dec %2 \n\t"\ - snow_inner_add_yblock_sse2_end_common2 - -static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, - int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ -snow_inner_add_yblock_sse2_header -snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0") -snow_inner_add_yblock_sse2_accum_8("2", "8") -snow_inner_add_yblock_sse2_accum_8("1", "128") -snow_inner_add_yblock_sse2_accum_8("0", "136") - - "mov %0, %%"REG_d" \n\t" - "movdqa (%%"REG_D"), %%xmm0 \n\t" - "movdqa %%xmm1, %%xmm2 \n\t" - - "punpckhwd %%xmm7, %%xmm1 \n\t" - "punpcklwd %%xmm7, %%xmm2 \n\t" - "paddd %%xmm2, %%xmm0 \n\t" - "movdqa 16(%%"REG_D"), %%xmm2 \n\t" - "paddd %%xmm1, %%xmm2 \n\t" - "paddd %%xmm3, %%xmm0 \n\t" - "paddd %%xmm3, %%xmm2 \n\t" - - "mov %1, %%"REG_D" \n\t" - "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t" - "add %3, %%"REG_D" \n\t" - - "movdqa (%%"REG_D"), %%xmm4 \n\t" - "movdqa %%xmm5, %%xmm6 \n\t" - "punpckhwd %%xmm7, %%xmm5 \n\t" - "punpcklwd %%xmm7, %%xmm6 \n\t" - "paddd %%xmm6, %%xmm4 \n\t" - "movdqa 16(%%"REG_D"), %%xmm6 \n\t" - "paddd %%xmm5, %%xmm6 \n\t" - "paddd %%xmm3, %%xmm4 \n\t" - "paddd %%xmm3, %%xmm6 \n\t" - - "psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */ - "psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */ - "packssdw %%xmm2, %%xmm0 \n\t" - "packuswb %%xmm7, %%xmm0 \n\t" - "movq %%xmm0, (%%"REG_d") \n\t" - - "psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */ - "psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */ - "packssdw %%xmm6, %%xmm4 \n\t" - "packuswb %%xmm7, %%xmm4 \n\t" - "movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t" -snow_inner_add_yblock_sse2_end_8 -} - -static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, - int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ -snow_inner_add_yblock_sse2_header -snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0") -snow_inner_add_yblock_sse2_accum_16("2", "16") -snow_inner_add_yblock_sse2_accum_16("1", "512") -snow_inner_add_yblock_sse2_accum_16("0", "528") - - "mov %0, %%"REG_d" \n\t" - "psrlw $4, %%xmm1 \n\t" - "psrlw $4, %%xmm5 \n\t" - "paddw (%%"REG_D"), %%xmm1 \n\t" - "paddw 16(%%"REG_D"), %%xmm5 \n\t" - "paddw %%xmm3, %%xmm1 \n\t" - "paddw %%xmm3, %%xmm5 \n\t" - "psraw $4, %%xmm1 \n\t" /* FRAC_BITS. */ - "psraw $4, %%xmm5 \n\t" /* FRAC_BITS. 
*/ - "packuswb %%xmm5, %%xmm1 \n\t" - - "movdqu %%xmm1, (%%"REG_d") \n\t" - -snow_inner_add_yblock_sse2_end_16 -} - -#define snow_inner_add_yblock_mmx_header \ - IDWTELEM * * dst_array = sb->line + src_y;\ - x86_reg tmp;\ - __asm__ volatile(\ - "mov %7, %%"REG_c" \n\t"\ - "mov %6, %2 \n\t"\ - "mov %4, %%"REG_S" \n\t"\ - "pxor %%mm7, %%mm7 \n\t" /* 0 */\ - "pcmpeqd %%mm3, %%mm3 \n\t"\ - "psllw $15, %%mm3 \n\t"\ - "psrlw $12, %%mm3 \n\t" /* FRAC_BITS >> 1 */\ - "1: \n\t"\ - "mov %1, %%"REG_D" \n\t"\ - "mov (%%"REG_D"), %%"REG_D" \n\t"\ - "add %3, %%"REG_D" \n\t" - -#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\ - "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ - "movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\ - "movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\ - "punpcklbw %%mm7, %%"out_reg1" \n\t"\ - "punpcklbw %%mm7, %%"out_reg2" \n\t"\ - "movd "s_offset"(%%"REG_S"), %%mm0 \n\t"\ - "movd "s_offset"+4(%%"REG_S"), %%mm4 \n\t"\ - "punpcklbw %%mm7, %%mm0 \n\t"\ - "punpcklbw %%mm7, %%mm4 \n\t"\ - "pmullw %%mm0, %%"out_reg1" \n\t"\ - "pmullw %%mm4, %%"out_reg2" \n\t" - -#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \ - snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\ - "paddusw %%mm2, %%mm1 \n\t"\ - "paddusw %%mm6, %%mm5 \n\t" - -#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\ - "mov %0, %%"REG_d" \n\t"\ - "psrlw $4, %%mm1 \n\t"\ - "psrlw $4, %%mm5 \n\t"\ - "paddw "read_offset"(%%"REG_D"), %%mm1 \n\t"\ - "paddw "read_offset"+8(%%"REG_D"), %%mm5 \n\t"\ - "paddw %%mm3, %%mm1 \n\t"\ - "paddw %%mm3, %%mm5 \n\t"\ - "psraw $4, %%mm1 \n\t"\ - "psraw $4, %%mm5 \n\t"\ - "packuswb %%mm5, %%mm1 \n\t"\ - "movq %%mm1, "write_offset"(%%"REG_d") \n\t" - -#define snow_inner_add_yblock_mmx_end(s_step)\ - "add $"s_step", %%"REG_S" \n\t"\ - "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\ - "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\ - "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\ - "add %%"REG_c", (%%"REG_a") \n\t"\ - "add"OPSIZE " $"PTR_SIZE"*1, %1 \n\t"\ - "add %%"REG_c", %0 \n\t"\ - "dec %2 \n\t"\ - "jnz 1b \n\t"\ - :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\ - :\ - "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\ - "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); - -static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, - int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ -snow_inner_add_yblock_mmx_header -snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0") -snow_inner_add_yblock_mmx_accum("2", "8", "0") -snow_inner_add_yblock_mmx_accum("1", "128", "0") -snow_inner_add_yblock_mmx_accum("0", "136", "0") -snow_inner_add_yblock_mmx_mix("0", "0") -snow_inner_add_yblock_mmx_end("16") -} - -static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, - int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ -snow_inner_add_yblock_mmx_header -snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0") -snow_inner_add_yblock_mmx_accum("2", "16", "0") -snow_inner_add_yblock_mmx_accum("1", "512", "0") -snow_inner_add_yblock_mmx_accum("0", "528", "0") -snow_inner_add_yblock_mmx_mix("0", "0") - -snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8") -snow_inner_add_yblock_mmx_accum("2", "24", "8") -snow_inner_add_yblock_mmx_accum("1", 
"520", "8") -snow_inner_add_yblock_mmx_accum("0", "536", "8") -snow_inner_add_yblock_mmx_mix("16", "8") -snow_inner_add_yblock_mmx_end("32") -} - -static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, - int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){ - - if (b_w == 16) - inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); - else if (b_w == 8 && obmc_stride == 16) { - if (!(b_h & 1)) - inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); - else - inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); - } else - ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); -} - -static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, - int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){ - if (b_w == 16) - inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); - else if (b_w == 8 && obmc_stride == 16) - inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); - else - ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); -} - -#endif /* HAVE_INLINE_ASM */ - -void ff_dwt_init_x86(DWTContext *c) -{ -#if HAVE_INLINE_ASM - int mm_flags = av_get_cpu_flags(); - - if (mm_flags & AV_CPU_FLAG_MMX) { - if(mm_flags & AV_CPU_FLAG_SSE2 & 0){ - c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2; -#if HAVE_7REGS - c->vertical_compose97i = ff_snow_vertical_compose97i_sse2; -#endif - c->inner_add_yblock = ff_snow_inner_add_yblock_sse2; - } - else{ - if (mm_flags & AV_CPU_FLAG_MMXEXT) { - c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx; -#if HAVE_7REGS - c->vertical_compose97i = ff_snow_vertical_compose97i_mmx; -#endif - } - c->inner_add_yblock = ff_snow_inner_add_yblock_mmx; - } - } -#endif /* HAVE_INLINE_ASM */ -} -- cgit v1.2.3