From 2d60444331fca1910510038dd3817bea885c2367 Mon Sep 17 00:00:00 2001
From: Diego Biurrun
Date: Sat, 8 Feb 2014 02:59:58 +0100
Subject: dsputil: Split motion estimation compare bits off into their own context

---
 libavcodec/Makefile | 2 +-
 libavcodec/ac3enc.c | 5 +-
 libavcodec/ac3enc.h | 4 +-
 libavcodec/arm/Makefile | 4 +-
 libavcodec/arm/dsputil_armv6.S | 244 -------
 libavcodec/arm/dsputil_init_arm.c | 58 --
 libavcodec/arm/me_cmp_armv6.S | 244 +++++++
 libavcodec/arm/me_cmp_init_arm.c | 57 ++
 libavcodec/dsputil.c | 952 --------------------------
 libavcodec/dsputil.h | 85 ---
 libavcodec/dv.h | 2 +-
 libavcodec/dvenc.c | 10 +-
 libavcodec/error_resilience.c | 10 +-
 libavcodec/error_resilience.h | 4 +-
 libavcodec/h264.c | 8 +-
 libavcodec/h264.h | 4 +-
 libavcodec/h264_slice.c | 3 +-
 libavcodec/me_cmp.c | 942 ++++++++++++++++++++++
 libavcodec/me_cmp.h | 73 ++
 libavcodec/motion_est.c | 43 +-
 libavcodec/motion_est_template.c | 65 +-
 libavcodec/mpeg4videoenc.c | 2 +-
 libavcodec/mpegvideo.c | 4 +-
 libavcodec/mpegvideo.h | 4 +-
 libavcodec/mpegvideo_enc.c | 96 ++-
 libavcodec/mpegvideoencdsp.c | 2 +-
 libavcodec/ppc/Makefile | 2 +-
 libavcodec/ppc/dsputil_altivec.c | 767 ---------------------
 libavcodec/ppc/me_cmp.c | 767 +++++++++++++++++++++
 libavcodec/svq1enc.c | 10 +-
 libavcodec/svq1enc.h | 4 +-
 libavcodec/utils.c | 6 +-
 libavcodec/x86/Makefile | 4 +-
 libavcodec/x86/dsputil_init.c | 1321 -------------------------------------
 libavcodec/x86/dsputilenc.asm | 336 ----------
 libavcodec/x86/me_cmp.asm | 336 ++++++++++
 libavcodec/x86/me_cmp_init.c | 1321 +++++++++++++++++++++++++++++++++++++
 37 files changed, 3885 insertions(+), 3916 deletions(-)
 delete mode 100644 libavcodec/arm/dsputil_armv6.S
 delete mode 100644 libavcodec/arm/dsputil_init_arm.c
 create mode 100644 libavcodec/arm/me_cmp_armv6.S
 create mode 100644 libavcodec/arm/me_cmp_init_arm.c
 delete mode 100644 libavcodec/dsputil.c
 delete mode 100644 libavcodec/dsputil.h
 create mode 100644 libavcodec/me_cmp.c
 create mode 100644 libavcodec/me_cmp.h
 delete mode 100644 libavcodec/ppc/dsputil_altivec.c
 create mode 100644 libavcodec/ppc/me_cmp.c
 delete mode 100644 libavcodec/x86/dsputil_init.c
 delete mode 100644 libavcodec/x86/dsputilenc.asm
 create mode 100644 libavcodec/x86/me_cmp.asm
 create mode 100644 libavcodec/x86/me_cmp_init.c

diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 632b0dabc7..7da6b7e93e 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -35,7 +35,6 @@ OBJS-$(CONFIG_BLOCKDSP) += blockdsp.o
 OBJS-$(CONFIG_BSWAPDSP) += bswapdsp.o
 OBJS-$(CONFIG_CABAC) += cabac.o
 OBJS-$(CONFIG_DCT) += dct.o dct32_fixed.o dct32_float.o
-OBJS-$(CONFIG_DSPUTIL) += dsputil.o
 OBJS-$(CONFIG_DXVA2) += dxva2.o
 OBJS-$(CONFIG_ERROR_RESILIENCE) += error_resilience.o
 OBJS-$(CONFIG_FDCTDSP) += fdctdsp.o faandct.o \
@@ -60,6 +59,7 @@ OBJS-$(CONFIG_LIBXVID) += libxvid_rc.o
 OBJS-$(CONFIG_LPC) += lpc.o
 OBJS-$(CONFIG_LSP) += lsp.o
 OBJS-$(CONFIG_MDCT) += mdct_fixed.o mdct_float.o
+OBJS-$(CONFIG_ME_CMP) += me_cmp.o
 OBJS-$(CONFIG_MPEG_ER) += mpeg_er.o
 OBJS-$(CONFIG_MPEGAUDIO) += mpegaudio.o mpegaudiodata.o \
                             mpegaudiodecheader.o
diff --git a/libavcodec/ac3enc.c b/libavcodec/ac3enc.c
index c6dc141eea..5c02e7f9de 100644
--- a/libavcodec/ac3enc.c
+++ b/libavcodec/ac3enc.c
@@ -36,6 +36,7 @@
 #include "libavutil/internal.h"
 #include "libavutil/opt.h"
 #include "avcodec.h"
+#include "me_cmp.h"
 #include "put_bits.h"
 #include "audiodsp.h"
 #include "ac3dsp.h"
@@ -379,7 +380,7 @@ static void compute_exp_strategy(AC3EncodeContext *s)
exp_strategy[blk] = EXP_NEW; continue; } - exp_diff = s->dsp.sad[0](NULL, exp, exp - AC3_MAX_COEFS, 16, 16); + exp_diff = s->mecc.sad[0](NULL, exp, exp - AC3_MAX_COEFS, 16, 16); exp_strategy[blk] = EXP_REUSE; if (ch == CPL_CH && exp_diff > (EXP_DIFF_THRESHOLD * (s->blocks[blk].end_freq[ch] - s->start_freq[ch]) / AC3_MAX_COEFS)) exp_strategy[blk] = EXP_NEW; @@ -2482,7 +2483,7 @@ av_cold int ff_ac3_encode_init(AVCodecContext *avctx) goto init_fail; ff_audiodsp_init(&s->adsp); - ff_dsputil_init(&s->dsp, avctx); + ff_me_cmp_init(&s->mecc, avctx); ff_ac3dsp_init(&s->ac3dsp, avctx->flags & CODEC_FLAG_BITEXACT); dprint_options(s); diff --git a/libavcodec/ac3enc.h b/libavcodec/ac3enc.h index b8e8768913..30c15d0d2f 100644 --- a/libavcodec/ac3enc.h +++ b/libavcodec/ac3enc.h @@ -35,9 +35,9 @@ #include "ac3.h" #include "ac3dsp.h" #include "avcodec.h" -#include "dsputil.h" #include "fft.h" #include "mathops.h" +#include "me_cmp.h" #include "put_bits.h" #include "audiodsp.h" @@ -162,9 +162,9 @@ typedef struct AC3EncodeContext { AC3EncOptions options; ///< encoding options AVCodecContext *avctx; ///< parent AVCodecContext PutBitContext pb; ///< bitstream writer context - DSPContext dsp; AudioDSPContext adsp; AVFloatDSPContext fdsp; + MECmpContext mecc; AC3DSPContext ac3dsp; ///< AC-3 optimized functions FFTContext mdct; ///< FFT context for MDCT calculation const SampleType *mdct_window; ///< MDCT window function array diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile index aab39c85bc..742c3ee726 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile @@ -6,7 +6,6 @@ OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_init_arm.o \ arm/ac3dsp_arm.o OBJS-$(CONFIG_AUDIODSP) += arm/audiodsp_init_arm.o OBJS-$(CONFIG_BLOCKDSP) += arm/blockdsp_init_arm.o -OBJS-$(CONFIG_DSPUTIL) += arm/dsputil_init_arm.o OBJS-$(CONFIG_FFT) += arm/fft_init_arm.o \ arm/fft_fixed_init_arm.o OBJS-$(CONFIG_H264CHROMA) += arm/h264chroma_init_arm.o @@ -19,6 +18,7 @@ OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_arm.o \ arm/idctdsp_arm.o \ arm/jrevdct_arm.o \ arm/simple_idct_arm.o +OBJS-$(CONFIG_ME_CMP) += arm/me_cmp_init_arm.o OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_init_arm.o OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_arm.o OBJS-$(CONFIG_MPEGVIDEOENC) += arm/mpegvideoencdsp_init_arm.o @@ -53,13 +53,13 @@ ARMV5TE-OBJS-$(CONFIG_VIDEODSP) += arm/videodsp_init_armv5te.o \ ARMV5TE-OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_armv5te.o ARMV6-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_armv6.o -ARMV6-OBJS-$(CONFIG_DSPUTIL) += arm/dsputil_armv6.o ARMV6-OBJS-$(CONFIG_H264DSP) += arm/h264dsp_armv6.o ARMV6-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_armv6.o \ arm/hpeldsp_armv6.o ARMV6-OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_armv6.o \ arm/idctdsp_armv6.o \ arm/simple_idct_armv6.o +ARMV6-OBJS-$(CONFIG_ME_CMP) += arm/me_cmp_armv6.o ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_fixed_armv6.o ARMV6-OBJS-$(CONFIG_MPEGVIDEOENC) += arm/mpegvideoencdsp_armv6.o ARMV6-OBJS-$(CONFIG_PIXBLOCKDSP) += arm/pixblockdsp_armv6.o diff --git a/libavcodec/arm/dsputil_armv6.S b/libavcodec/arm/dsputil_armv6.S deleted file mode 100644 index 436e20dd25..0000000000 --- a/libavcodec/arm/dsputil_armv6.S +++ /dev/null @@ -1,244 +0,0 @@ -/* - * Copyright (c) 2009 Mans Rullgard - * - * This file is part of Libav. 
- * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - -function ff_pix_abs16_armv6, export=1 - ldr r0, [sp] - push {r4-r9, lr} - mov r12, #0 - mov lr, #0 - ldm r1, {r4-r7} - ldr r8, [r2] -1: - ldr r9, [r2, #4] - pld [r1, r3] - usada8 r12, r4, r8, r12 - ldr r8, [r2, #8] - pld [r2, r3] - usada8 lr, r5, r9, lr - ldr r9, [r2, #12] - usada8 r12, r6, r8, r12 - subs r0, r0, #1 - usada8 lr, r7, r9, lr - beq 2f - add r1, r1, r3 - ldm r1, {r4-r7} - add r2, r2, r3 - ldr r8, [r2] - b 1b -2: - add r0, r12, lr - pop {r4-r9, pc} -endfunc - -function ff_pix_abs16_x2_armv6, export=1 - ldr r12, [sp] - push {r4-r11, lr} - mov r0, #0 - mov lr, #1 - orr lr, lr, lr, lsl #8 - orr lr, lr, lr, lsl #16 -1: - ldr r8, [r2] - ldr r9, [r2, #4] - lsr r10, r8, #8 - ldr r4, [r1] - lsr r6, r9, #8 - orr r10, r10, r9, lsl #24 - ldr r5, [r2, #8] - eor r11, r8, r10 - uhadd8 r7, r8, r10 - orr r6, r6, r5, lsl #24 - and r11, r11, lr - uadd8 r7, r7, r11 - ldr r8, [r1, #4] - usada8 r0, r4, r7, r0 - eor r7, r9, r6 - lsr r10, r5, #8 - and r7, r7, lr - uhadd8 r4, r9, r6 - ldr r6, [r2, #12] - uadd8 r4, r4, r7 - pld [r1, r3] - orr r10, r10, r6, lsl #24 - usada8 r0, r8, r4, r0 - ldr r4, [r1, #8] - eor r11, r5, r10 - ldrb r7, [r2, #16] - and r11, r11, lr - uhadd8 r8, r5, r10 - ldr r5, [r1, #12] - uadd8 r8, r8, r11 - pld [r2, r3] - lsr r10, r6, #8 - usada8 r0, r4, r8, r0 - orr r10, r10, r7, lsl #24 - subs r12, r12, #1 - eor r11, r6, r10 - add r1, r1, r3 - uhadd8 r9, r6, r10 - and r11, r11, lr - uadd8 r9, r9, r11 - add r2, r2, r3 - usada8 r0, r5, r9, r0 - bgt 1b - - pop {r4-r11, pc} -endfunc - -.macro usad_y2 p0, p1, p2, p3, n0, n1, n2, n3 - ldr \n0, [r2] - eor \n1, \p0, \n0 - uhadd8 \p0, \p0, \n0 - and \n1, \n1, lr - ldr \n2, [r1] - uadd8 \p0, \p0, \n1 - ldr \n1, [r2, #4] - usada8 r0, \p0, \n2, r0 - pld [r1, r3] - eor \n3, \p1, \n1 - uhadd8 \p1, \p1, \n1 - and \n3, \n3, lr - ldr \p0, [r1, #4] - uadd8 \p1, \p1, \n3 - ldr \n2, [r2, #8] - usada8 r0, \p1, \p0, r0 - pld [r2, r3] - eor \p0, \p2, \n2 - uhadd8 \p2, \p2, \n2 - and \p0, \p0, lr - ldr \p1, [r1, #8] - uadd8 \p2, \p2, \p0 - ldr \n3, [r2, #12] - usada8 r0, \p2, \p1, r0 - eor \p1, \p3, \n3 - uhadd8 \p3, \p3, \n3 - and \p1, \p1, lr - ldr \p0, [r1, #12] - uadd8 \p3, \p3, \p1 - add r1, r1, r3 - usada8 r0, \p3, \p0, r0 - add r2, r2, r3 -.endm - -function ff_pix_abs16_y2_armv6, export=1 - pld [r1] - pld [r2] - ldr r12, [sp] - push {r4-r11, lr} - mov r0, #0 - mov lr, #1 - orr lr, lr, lr, lsl #8 - orr lr, lr, lr, lsl #16 - ldr r4, [r2] - ldr r5, [r2, #4] - ldr r6, [r2, #8] - ldr r7, [r2, #12] - add r2, r2, r3 -1: - usad_y2 r4, r5, r6, r7, r8, r9, r10, r11 - subs r12, r12, #2 - usad_y2 r8, r9, r10, r11, r4, r5, r6, r7 - bgt 1b - - pop {r4-r11, pc} -endfunc - -function ff_pix_abs8_armv6, export=1 - pld [r2, r3] - ldr r12, [sp] - push {r4-r9, lr} - mov r0, #0 - mov lr, #0 - ldrd_post r4, 
r5, r1, r3 -1: - subs r12, r12, #2 - ldr r7, [r2, #4] - ldr_post r6, r2, r3 - ldrd_post r8, r9, r1, r3 - usada8 r0, r4, r6, r0 - pld [r2, r3] - usada8 lr, r5, r7, lr - ldr r7, [r2, #4] - ldr_post r6, r2, r3 - beq 2f - ldrd_post r4, r5, r1, r3 - usada8 r0, r8, r6, r0 - pld [r2, r3] - usada8 lr, r9, r7, lr - b 1b -2: - usada8 r0, r8, r6, r0 - usada8 lr, r9, r7, lr - add r0, r0, lr - pop {r4-r9, pc} -endfunc - -function ff_sse16_armv6, export=1 - ldr r12, [sp] - push {r4-r9, lr} - mov r0, #0 -1: - ldrd r4, r5, [r1] - ldr r8, [r2] - uxtb16 lr, r4 - uxtb16 r4, r4, ror #8 - uxtb16 r9, r8 - uxtb16 r8, r8, ror #8 - ldr r7, [r2, #4] - usub16 lr, lr, r9 - usub16 r4, r4, r8 - smlad r0, lr, lr, r0 - uxtb16 r6, r5 - uxtb16 lr, r5, ror #8 - uxtb16 r8, r7 - uxtb16 r9, r7, ror #8 - smlad r0, r4, r4, r0 - ldrd r4, r5, [r1, #8] - usub16 r6, r6, r8 - usub16 r8, lr, r9 - ldr r7, [r2, #8] - smlad r0, r6, r6, r0 - uxtb16 lr, r4 - uxtb16 r4, r4, ror #8 - uxtb16 r9, r7 - uxtb16 r7, r7, ror #8 - smlad r0, r8, r8, r0 - ldr r8, [r2, #12] - usub16 lr, lr, r9 - usub16 r4, r4, r7 - smlad r0, lr, lr, r0 - uxtb16 r6, r5 - uxtb16 r5, r5, ror #8 - uxtb16 r9, r8 - uxtb16 r8, r8, ror #8 - smlad r0, r4, r4, r0 - usub16 r6, r6, r9 - usub16 r5, r5, r8 - smlad r0, r6, r6, r0 - add r1, r1, r3 - add r2, r2, r3 - subs r12, r12, #1 - smlad r0, r5, r5, r0 - bgt 1b - - pop {r4-r9, pc} -endfunc diff --git a/libavcodec/arm/dsputil_init_arm.c b/libavcodec/arm/dsputil_init_arm.c deleted file mode 100644 index f3fa61f171..0000000000 --- a/libavcodec/arm/dsputil_init_arm.c +++ /dev/null @@ -1,58 +0,0 @@ -/* - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/arm/cpu.h" -#include "libavcodec/avcodec.h" -#include "libavcodec/dsputil.h" -#include "libavcodec/mpegvideo.h" - -int ff_pix_abs16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2, - int line_size, int h); -int ff_pix_abs16_x2_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2, - int line_size, int h); -int ff_pix_abs16_y2_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2, - int line_size, int h); - -int ff_pix_abs8_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2, - int line_size, int h); - -int ff_sse16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2, - int line_size, int h); - - -av_cold void ff_dsputil_init_arm(DSPContext *c, AVCodecContext *avctx) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_armv6(cpu_flags)) { - c->pix_abs[0][0] = ff_pix_abs16_armv6; - c->pix_abs[0][1] = ff_pix_abs16_x2_armv6; - c->pix_abs[0][2] = ff_pix_abs16_y2_armv6; - - c->pix_abs[1][0] = ff_pix_abs8_armv6; - - c->sad[0] = ff_pix_abs16_armv6; - c->sad[1] = ff_pix_abs8_armv6; - - c->sse[0] = ff_sse16_armv6; - } -} diff --git a/libavcodec/arm/me_cmp_armv6.S b/libavcodec/arm/me_cmp_armv6.S new file mode 100644 index 0000000000..436e20dd25 --- /dev/null +++ b/libavcodec/arm/me_cmp_armv6.S @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +function ff_pix_abs16_armv6, export=1 + ldr r0, [sp] + push {r4-r9, lr} + mov r12, #0 + mov lr, #0 + ldm r1, {r4-r7} + ldr r8, [r2] +1: + ldr r9, [r2, #4] + pld [r1, r3] + usada8 r12, r4, r8, r12 + ldr r8, [r2, #8] + pld [r2, r3] + usada8 lr, r5, r9, lr + ldr r9, [r2, #12] + usada8 r12, r6, r8, r12 + subs r0, r0, #1 + usada8 lr, r7, r9, lr + beq 2f + add r1, r1, r3 + ldm r1, {r4-r7} + add r2, r2, r3 + ldr r8, [r2] + b 1b +2: + add r0, r12, lr + pop {r4-r9, pc} +endfunc + +function ff_pix_abs16_x2_armv6, export=1 + ldr r12, [sp] + push {r4-r11, lr} + mov r0, #0 + mov lr, #1 + orr lr, lr, lr, lsl #8 + orr lr, lr, lr, lsl #16 +1: + ldr r8, [r2] + ldr r9, [r2, #4] + lsr r10, r8, #8 + ldr r4, [r1] + lsr r6, r9, #8 + orr r10, r10, r9, lsl #24 + ldr r5, [r2, #8] + eor r11, r8, r10 + uhadd8 r7, r8, r10 + orr r6, r6, r5, lsl #24 + and r11, r11, lr + uadd8 r7, r7, r11 + ldr r8, [r1, #4] + usada8 r0, r4, r7, r0 + eor r7, r9, r6 + lsr r10, r5, #8 + and r7, r7, lr + uhadd8 r4, r9, r6 + ldr r6, [r2, #12] + uadd8 r4, r4, r7 + pld [r1, r3] + orr r10, r10, r6, lsl #24 + usada8 r0, r8, r4, r0 + ldr r4, [r1, #8] + eor r11, r5, r10 + ldrb r7, [r2, #16] + and r11, r11, lr + uhadd8 r8, r5, r10 + ldr r5, [r1, #12] + uadd8 r8, r8, r11 + pld [r2, r3] + lsr r10, r6, #8 + usada8 r0, r4, r8, r0 + orr r10, r10, r7, lsl #24 + subs r12, r12, #1 + eor r11, r6, r10 + add r1, r1, r3 + uhadd8 r9, r6, r10 + and r11, r11, lr + uadd8 r9, r9, r11 + add r2, r2, r3 + usada8 r0, r5, r9, r0 + bgt 1b + + pop {r4-r11, pc} +endfunc + +.macro usad_y2 p0, p1, p2, p3, n0, n1, n2, n3 + ldr \n0, [r2] + eor \n1, \p0, \n0 + uhadd8 \p0, \p0, \n0 + and \n1, \n1, lr + ldr \n2, [r1] + uadd8 \p0, \p0, \n1 + ldr \n1, [r2, #4] + usada8 r0, \p0, \n2, r0 + pld [r1, r3] + eor \n3, \p1, \n1 + uhadd8 \p1, \p1, \n1 + and \n3, \n3, lr + ldr \p0, [r1, #4] + uadd8 \p1, \p1, \n3 + ldr \n2, [r2, #8] + usada8 r0, \p1, \p0, r0 + pld [r2, r3] + eor \p0, \p2, \n2 + uhadd8 \p2, \p2, \n2 + and \p0, \p0, lr + ldr \p1, [r1, #8] + uadd8 \p2, \p2, \p0 + ldr \n3, [r2, #12] + usada8 r0, \p2, \p1, r0 + eor \p1, \p3, \n3 + uhadd8 \p3, \p3, \n3 + and \p1, \p1, lr + ldr \p0, [r1, #12] + uadd8 \p3, \p3, \p1 + add r1, r1, r3 + usada8 r0, \p3, \p0, r0 + add r2, r2, r3 +.endm + +function ff_pix_abs16_y2_armv6, export=1 + pld [r1] + pld [r2] + ldr r12, [sp] + push {r4-r11, lr} + mov r0, #0 + mov lr, #1 + orr lr, lr, lr, lsl #8 + orr lr, lr, lr, lsl #16 + ldr r4, [r2] + ldr r5, [r2, #4] + ldr r6, [r2, #8] + ldr r7, [r2, #12] + add r2, r2, r3 +1: + usad_y2 r4, r5, r6, r7, r8, r9, r10, r11 + subs r12, r12, #2 + usad_y2 r8, r9, r10, r11, r4, r5, r6, r7 + bgt 1b + + pop {r4-r11, pc} +endfunc + +function ff_pix_abs8_armv6, export=1 + pld [r2, r3] + ldr r12, [sp] + push {r4-r9, lr} + mov r0, #0 + mov lr, #0 + ldrd_post r4, r5, r1, r3 +1: + subs r12, r12, #2 + ldr r7, [r2, #4] + ldr_post r6, r2, r3 + ldrd_post r8, r9, r1, r3 + usada8 r0, r4, r6, r0 + pld [r2, r3] + usada8 lr, r5, r7, lr + ldr r7, [r2, #4] + ldr_post r6, r2, r3 + beq 2f + ldrd_post r4, r5, r1, r3 + usada8 r0, r8, r6, r0 + pld [r2, r3] + usada8 lr, r9, r7, lr + b 1b +2: + usada8 r0, r8, r6, r0 + usada8 lr, r9, r7, lr + add r0, r0, lr + pop {r4-r9, pc} +endfunc + +function ff_sse16_armv6, export=1 + ldr r12, [sp] + push {r4-r9, lr} + mov r0, #0 +1: + ldrd r4, r5, [r1] 
+ ldr r8, [r2] + uxtb16 lr, r4 + uxtb16 r4, r4, ror #8 + uxtb16 r9, r8 + uxtb16 r8, r8, ror #8 + ldr r7, [r2, #4] + usub16 lr, lr, r9 + usub16 r4, r4, r8 + smlad r0, lr, lr, r0 + uxtb16 r6, r5 + uxtb16 lr, r5, ror #8 + uxtb16 r8, r7 + uxtb16 r9, r7, ror #8 + smlad r0, r4, r4, r0 + ldrd r4, r5, [r1, #8] + usub16 r6, r6, r8 + usub16 r8, lr, r9 + ldr r7, [r2, #8] + smlad r0, r6, r6, r0 + uxtb16 lr, r4 + uxtb16 r4, r4, ror #8 + uxtb16 r9, r7 + uxtb16 r7, r7, ror #8 + smlad r0, r8, r8, r0 + ldr r8, [r2, #12] + usub16 lr, lr, r9 + usub16 r4, r4, r7 + smlad r0, lr, lr, r0 + uxtb16 r6, r5 + uxtb16 r5, r5, ror #8 + uxtb16 r9, r8 + uxtb16 r8, r8, ror #8 + smlad r0, r4, r4, r0 + usub16 r6, r6, r9 + usub16 r5, r5, r8 + smlad r0, r6, r6, r0 + add r1, r1, r3 + add r2, r2, r3 + subs r12, r12, #1 + smlad r0, r5, r5, r0 + bgt 1b + + pop {r4-r9, pc} +endfunc diff --git a/libavcodec/arm/me_cmp_init_arm.c b/libavcodec/arm/me_cmp_init_arm.c new file mode 100644 index 0000000000..819d901f90 --- /dev/null +++ b/libavcodec/arm/me_cmp_init_arm.c @@ -0,0 +1,57 @@ +/* + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/arm/cpu.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/me_cmp.h" +#include "libavcodec/mpegvideo.h" + +int ff_pix_abs16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2, + int line_size, int h); +int ff_pix_abs16_x2_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2, + int line_size, int h); +int ff_pix_abs16_y2_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2, + int line_size, int h); + +int ff_pix_abs8_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2, + int line_size, int h); + +int ff_sse16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2, + int line_size, int h); + +av_cold void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_armv6(cpu_flags)) { + c->pix_abs[0][0] = ff_pix_abs16_armv6; + c->pix_abs[0][1] = ff_pix_abs16_x2_armv6; + c->pix_abs[0][2] = ff_pix_abs16_y2_armv6; + + c->pix_abs[1][0] = ff_pix_abs8_armv6; + + c->sad[0] = ff_pix_abs16_armv6; + c->sad[1] = ff_pix_abs8_armv6; + + c->sse[0] = ff_sse16_armv6; + } +} diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c deleted file mode 100644 index ba71a99852..0000000000 --- a/libavcodec/dsputil.c +++ /dev/null @@ -1,952 +0,0 @@ -/* - * DSP utils - * Copyright (c) 2000, 2001 Fabrice Bellard - * Copyright (c) 2002-2004 Michael Niedermayer - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * DSP utils - */ - -#include "libavutil/attributes.h" -#include "avcodec.h" -#include "copy_block.h" -#include "dsputil.h" -#include "simple_idct.h" -#include "mpegvideo.h" -#include "config.h" - -uint32_t ff_square_tab[512] = { 0, }; - -static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, - int line_size, int h) -{ - int s = 0, i; - uint32_t *sq = ff_square_tab + 256; - - for (i = 0; i < h; i++) { - s += sq[pix1[0] - pix2[0]]; - s += sq[pix1[1] - pix2[1]]; - s += sq[pix1[2] - pix2[2]]; - s += sq[pix1[3] - pix2[3]]; - pix1 += line_size; - pix2 += line_size; - } - return s; -} - -static int sse8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, - int line_size, int h) -{ - int s = 0, i; - uint32_t *sq = ff_square_tab + 256; - - for (i = 0; i < h; i++) { - s += sq[pix1[0] - pix2[0]]; - s += sq[pix1[1] - pix2[1]]; - s += sq[pix1[2] - pix2[2]]; - s += sq[pix1[3] - pix2[3]]; - s += sq[pix1[4] - pix2[4]]; - s += sq[pix1[5] - pix2[5]]; - s += sq[pix1[6] - pix2[6]]; - s += sq[pix1[7] - pix2[7]]; - pix1 += line_size; - pix2 += line_size; - } - return s; -} - -static int sse16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, - int line_size, int h) -{ - int s = 0, i; - uint32_t *sq = ff_square_tab + 256; - - for (i = 0; i < h; i++) { - s += sq[pix1[0] - pix2[0]]; - s += sq[pix1[1] - pix2[1]]; - s += sq[pix1[2] - pix2[2]]; - s += sq[pix1[3] - pix2[3]]; - s += sq[pix1[4] - pix2[4]]; - s += sq[pix1[5] - pix2[5]]; - s += sq[pix1[6] - pix2[6]]; - s += sq[pix1[7] - pix2[7]]; - s += sq[pix1[8] - pix2[8]]; - s += sq[pix1[9] - pix2[9]]; - s += sq[pix1[10] - pix2[10]]; - s += sq[pix1[11] - pix2[11]]; - s += sq[pix1[12] - pix2[12]]; - s += sq[pix1[13] - pix2[13]]; - s += sq[pix1[14] - pix2[14]]; - s += sq[pix1[15] - pix2[15]]; - - pix1 += line_size; - pix2 += line_size; - } - return s; -} - -static int sum_abs_dctelem_c(int16_t *block) -{ - int sum = 0, i; - - for (i = 0; i < 64; i++) - sum += FFABS(block[i]); - return sum; -} - -#define avg2(a, b) ((a + b + 1) >> 1) -#define avg4(a, b, c, d) ((a + b + c + d + 2) >> 2) - -static inline int pix_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, - int line_size, int h) -{ - int s = 0, i; - - for (i = 0; i < h; i++) { - s += abs(pix1[0] - pix2[0]); - s += abs(pix1[1] - pix2[1]); - s += abs(pix1[2] - pix2[2]); - s += abs(pix1[3] - pix2[3]); - s += abs(pix1[4] - pix2[4]); - s += abs(pix1[5] - pix2[5]); - s += abs(pix1[6] - pix2[6]); - s += abs(pix1[7] - pix2[7]); - s += abs(pix1[8] - pix2[8]); - s += abs(pix1[9] - pix2[9]); - s += abs(pix1[10] - pix2[10]); - s += abs(pix1[11] - pix2[11]); - s += abs(pix1[12] - pix2[12]); - s += abs(pix1[13] - pix2[13]); - s += abs(pix1[14] - pix2[14]); - s += abs(pix1[15] - pix2[15]); - pix1 += line_size; - pix2 += line_size; - } - return s; -} - -static int pix_abs16_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, - int line_size, int h) -{ - int s = 0, i; - - for (i = 0; i < h; i++) { - s += abs(pix1[0] - avg2(pix2[0], pix2[1])); - s += abs(pix1[1] - avg2(pix2[1], pix2[2])); - s += abs(pix1[2] - avg2(pix2[2], 
pix2[3])); - s += abs(pix1[3] - avg2(pix2[3], pix2[4])); - s += abs(pix1[4] - avg2(pix2[4], pix2[5])); - s += abs(pix1[5] - avg2(pix2[5], pix2[6])); - s += abs(pix1[6] - avg2(pix2[6], pix2[7])); - s += abs(pix1[7] - avg2(pix2[7], pix2[8])); - s += abs(pix1[8] - avg2(pix2[8], pix2[9])); - s += abs(pix1[9] - avg2(pix2[9], pix2[10])); - s += abs(pix1[10] - avg2(pix2[10], pix2[11])); - s += abs(pix1[11] - avg2(pix2[11], pix2[12])); - s += abs(pix1[12] - avg2(pix2[12], pix2[13])); - s += abs(pix1[13] - avg2(pix2[13], pix2[14])); - s += abs(pix1[14] - avg2(pix2[14], pix2[15])); - s += abs(pix1[15] - avg2(pix2[15], pix2[16])); - pix1 += line_size; - pix2 += line_size; - } - return s; -} - -static int pix_abs16_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, - int line_size, int h) -{ - int s = 0, i; - uint8_t *pix3 = pix2 + line_size; - - for (i = 0; i < h; i++) { - s += abs(pix1[0] - avg2(pix2[0], pix3[0])); - s += abs(pix1[1] - avg2(pix2[1], pix3[1])); - s += abs(pix1[2] - avg2(pix2[2], pix3[2])); - s += abs(pix1[3] - avg2(pix2[3], pix3[3])); - s += abs(pix1[4] - avg2(pix2[4], pix3[4])); - s += abs(pix1[5] - avg2(pix2[5], pix3[5])); - s += abs(pix1[6] - avg2(pix2[6], pix3[6])); - s += abs(pix1[7] - avg2(pix2[7], pix3[7])); - s += abs(pix1[8] - avg2(pix2[8], pix3[8])); - s += abs(pix1[9] - avg2(pix2[9], pix3[9])); - s += abs(pix1[10] - avg2(pix2[10], pix3[10])); - s += abs(pix1[11] - avg2(pix2[11], pix3[11])); - s += abs(pix1[12] - avg2(pix2[12], pix3[12])); - s += abs(pix1[13] - avg2(pix2[13], pix3[13])); - s += abs(pix1[14] - avg2(pix2[14], pix3[14])); - s += abs(pix1[15] - avg2(pix2[15], pix3[15])); - pix1 += line_size; - pix2 += line_size; - pix3 += line_size; - } - return s; -} - -static int pix_abs16_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, - int line_size, int h) -{ - int s = 0, i; - uint8_t *pix3 = pix2 + line_size; - - for (i = 0; i < h; i++) { - s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1])); - s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2])); - s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3])); - s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4])); - s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5])); - s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6])); - s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7])); - s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8])); - s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9])); - s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10])); - s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11])); - s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12])); - s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13])); - s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14])); - s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15])); - s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16])); - pix1 += line_size; - pix2 += line_size; - pix3 += line_size; - } - return s; -} - -static inline int pix_abs8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, - int line_size, int h) -{ - int s = 0, i; - - for (i = 0; i < h; i++) { - s += abs(pix1[0] - pix2[0]); - s += abs(pix1[1] - pix2[1]); - s += abs(pix1[2] - pix2[2]); - s += abs(pix1[3] - pix2[3]); - s += abs(pix1[4] - pix2[4]); - s += abs(pix1[5] - pix2[5]); - s += abs(pix1[6] - pix2[6]); - s += abs(pix1[7] - pix2[7]); - pix1 += line_size; - pix2 += line_size; - } - return s; -} - -static int pix_abs8_x2_c(MpegEncContext *v, 
uint8_t *pix1, uint8_t *pix2, - int line_size, int h) -{ - int s = 0, i; - - for (i = 0; i < h; i++) { - s += abs(pix1[0] - avg2(pix2[0], pix2[1])); - s += abs(pix1[1] - avg2(pix2[1], pix2[2])); - s += abs(pix1[2] - avg2(pix2[2], pix2[3])); - s += abs(pix1[3] - avg2(pix2[3], pix2[4])); - s += abs(pix1[4] - avg2(pix2[4], pix2[5])); - s += abs(pix1[5] - avg2(pix2[5], pix2[6])); - s += abs(pix1[6] - avg2(pix2[6], pix2[7])); - s += abs(pix1[7] - avg2(pix2[7], pix2[8])); - pix1 += line_size; - pix2 += line_size; - } - return s; -} - -static int pix_abs8_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, - int line_size, int h) -{ - int s = 0, i; - uint8_t *pix3 = pix2 + line_size; - - for (i = 0; i < h; i++) { - s += abs(pix1[0] - avg2(pix2[0], pix3[0])); - s += abs(pix1[1] - avg2(pix2[1], pix3[1])); - s += abs(pix1[2] - avg2(pix2[2], pix3[2])); - s += abs(pix1[3] - avg2(pix2[3], pix3[3])); - s += abs(pix1[4] - avg2(pix2[4], pix3[4])); - s += abs(pix1[5] - avg2(pix2[5], pix3[5])); - s += abs(pix1[6] - avg2(pix2[6], pix3[6])); - s += abs(pix1[7] - avg2(pix2[7], pix3[7])); - pix1 += line_size; - pix2 += line_size; - pix3 += line_size; - } - return s; -} - -static int pix_abs8_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, - int line_size, int h) -{ - int s = 0, i; - uint8_t *pix3 = pix2 + line_size; - - for (i = 0; i < h; i++) { - s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1])); - s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2])); - s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3])); - s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4])); - s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5])); - s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6])); - s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7])); - s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8])); - pix1 += line_size; - pix2 += line_size; - pix3 += line_size; - } - return s; -} - -static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h) -{ - int score1 = 0, score2 = 0, x, y; - - for (y = 0; y < h; y++) { - for (x = 0; x < 16; x++) - score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]); - if (y + 1 < h) { - for (x = 0; x < 15; x++) - score2 += FFABS(s1[x] - s1[x + stride] - - s1[x + 1] + s1[x + stride + 1]) - - FFABS(s2[x] - s2[x + stride] - - s2[x + 1] + s2[x + stride + 1]); - } - s1 += stride; - s2 += stride; - } - - if (c) - return score1 + FFABS(score2) * c->avctx->nsse_weight; - else - return score1 + FFABS(score2) * 8; -} - -static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h) -{ - int score1 = 0, score2 = 0, x, y; - - for (y = 0; y < h; y++) { - for (x = 0; x < 8; x++) - score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]); - if (y + 1 < h) { - for (x = 0; x < 7; x++) - score2 += FFABS(s1[x] - s1[x + stride] - - s1[x + 1] + s1[x + stride + 1]) - - FFABS(s2[x] - s2[x + stride] - - s2[x + 1] + s2[x + stride + 1]); - } - s1 += stride; - s2 += stride; - } - - if (c) - return score1 + FFABS(score2) * c->avctx->nsse_weight; - else - return score1 + FFABS(score2) * 8; -} - -static int zero_cmp(MpegEncContext *s, uint8_t *a, uint8_t *b, - int stride, int h) -{ - return 0; -} - -void ff_set_cmp(DSPContext *c, me_cmp_func *cmp, int type) -{ - int i; - - memset(cmp, 0, sizeof(void *) * 6); - - for (i = 0; i < 6; i++) { - switch (type & 0xFF) { - case FF_CMP_SAD: - cmp[i] = c->sad[i]; - break; - case FF_CMP_SATD: - cmp[i] = c->hadamard8_diff[i]; - break; - case FF_CMP_SSE: - cmp[i] = c->sse[i]; - break; - case 
FF_CMP_DCT: - cmp[i] = c->dct_sad[i]; - break; - case FF_CMP_DCT264: - cmp[i] = c->dct264_sad[i]; - break; - case FF_CMP_DCTMAX: - cmp[i] = c->dct_max[i]; - break; - case FF_CMP_PSNR: - cmp[i] = c->quant_psnr[i]; - break; - case FF_CMP_BIT: - cmp[i] = c->bit[i]; - break; - case FF_CMP_RD: - cmp[i] = c->rd[i]; - break; - case FF_CMP_VSAD: - cmp[i] = c->vsad[i]; - break; - case FF_CMP_VSSE: - cmp[i] = c->vsse[i]; - break; - case FF_CMP_ZERO: - cmp[i] = zero_cmp; - break; - case FF_CMP_NSSE: - cmp[i] = c->nsse[i]; - break; - default: - av_log(NULL, AV_LOG_ERROR, - "internal error in cmp function selection\n"); - } - } -} - -#define BUTTERFLY2(o1, o2, i1, i2) \ - o1 = (i1) + (i2); \ - o2 = (i1) - (i2); - -#define BUTTERFLY1(x, y) \ - { \ - int a, b; \ - a = x; \ - b = y; \ - x = a + b; \ - y = a - b; \ - } - -#define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y))) - -static int hadamard8_diff8x8_c(MpegEncContext *s, uint8_t *dst, - uint8_t *src, int stride, int h) -{ - int i, temp[64], sum = 0; - - assert(h == 8); - - for (i = 0; i < 8; i++) { - // FIXME: try pointer walks - BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1], - src[stride * i + 0] - dst[stride * i + 0], - src[stride * i + 1] - dst[stride * i + 1]); - BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3], - src[stride * i + 2] - dst[stride * i + 2], - src[stride * i + 3] - dst[stride * i + 3]); - BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5], - src[stride * i + 4] - dst[stride * i + 4], - src[stride * i + 5] - dst[stride * i + 5]); - BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7], - src[stride * i + 6] - dst[stride * i + 6], - src[stride * i + 7] - dst[stride * i + 7]); - - BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]); - BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]); - BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]); - BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]); - - BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]); - BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]); - BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]); - BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]); - } - - for (i = 0; i < 8; i++) { - BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]); - BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]); - BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]); - BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]); - - BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]); - BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]); - BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]); - BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]); - - sum += BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) + - BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) + - BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) + - BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]); - } - return sum; -} - -static int hadamard8_intra8x8_c(MpegEncContext *s, uint8_t *src, - uint8_t *dummy, int stride, int h) -{ - int i, temp[64], sum = 0; - - assert(h == 8); - - for (i = 0; i < 8; i++) { - // FIXME: try pointer walks - BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1], - src[stride * i + 0], src[stride * i + 1]); - BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3], - src[stride * i + 2], src[stride * i + 3]); - BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5], - src[stride * i + 4], src[stride * i + 5]); - BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7], - src[stride * i + 6], src[stride * i + 7]); - - BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]); - BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]); - BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]); - BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]); - - BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]); - BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]); 
- BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]); - BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]); - } - - for (i = 0; i < 8; i++) { - BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]); - BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]); - BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]); - BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]); - - BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]); - BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]); - BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]); - BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]); - - sum += - BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) - + BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) - + BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) - + BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]); - } - - sum -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean - - return sum; -} - -static int dct_sad8x8_c(MpegEncContext *s, uint8_t *src1, - uint8_t *src2, int stride, int h) -{ - LOCAL_ALIGNED_16(int16_t, temp, [64]); - - assert(h == 8); - - s->pdsp.diff_pixels(temp, src1, src2, stride); - s->fdsp.fdct(temp); - return s->dsp.sum_abs_dctelem(temp); -} - -#if CONFIG_GPL -#define DCT8_1D \ - { \ - const int s07 = SRC(0) + SRC(7); \ - const int s16 = SRC(1) + SRC(6); \ - const int s25 = SRC(2) + SRC(5); \ - const int s34 = SRC(3) + SRC(4); \ - const int a0 = s07 + s34; \ - const int a1 = s16 + s25; \ - const int a2 = s07 - s34; \ - const int a3 = s16 - s25; \ - const int d07 = SRC(0) - SRC(7); \ - const int d16 = SRC(1) - SRC(6); \ - const int d25 = SRC(2) - SRC(5); \ - const int d34 = SRC(3) - SRC(4); \ - const int a4 = d16 + d25 + (d07 + (d07 >> 1)); \ - const int a5 = d07 - d34 - (d25 + (d25 >> 1)); \ - const int a6 = d07 + d34 - (d16 + (d16 >> 1)); \ - const int a7 = d16 - d25 + (d34 + (d34 >> 1)); \ - DST(0, a0 + a1); \ - DST(1, a4 + (a7 >> 2)); \ - DST(2, a2 + (a3 >> 1)); \ - DST(3, a5 + (a6 >> 2)); \ - DST(4, a0 - a1); \ - DST(5, a6 - (a5 >> 2)); \ - DST(6, (a2 >> 1) - a3); \ - DST(7, (a4 >> 2) - a7); \ - } - -static int dct264_sad8x8_c(MpegEncContext *s, uint8_t *src1, - uint8_t *src2, int stride, int h) -{ - int16_t dct[8][8]; - int i, sum = 0; - - s->pdsp.diff_pixels(dct[0], src1, src2, stride); - -#define SRC(x) dct[i][x] -#define DST(x, v) dct[i][x] = v - for (i = 0; i < 8; i++) - DCT8_1D -#undef SRC -#undef DST - -#define SRC(x) dct[x][i] -#define DST(x, v) sum += FFABS(v) - for (i = 0; i < 8; i++) - DCT8_1D -#undef SRC -#undef DST - return sum; -} -#endif - -static int dct_max8x8_c(MpegEncContext *s, uint8_t *src1, - uint8_t *src2, int stride, int h) -{ - LOCAL_ALIGNED_16(int16_t, temp, [64]); - int sum = 0, i; - - assert(h == 8); - - s->pdsp.diff_pixels(temp, src1, src2, stride); - s->fdsp.fdct(temp); - - for (i = 0; i < 64; i++) - sum = FFMAX(sum, FFABS(temp[i])); - - return sum; -} - -static int quant_psnr8x8_c(MpegEncContext *s, uint8_t *src1, - uint8_t *src2, int stride, int h) -{ - LOCAL_ALIGNED_16(int16_t, temp, [64 * 2]); - int16_t *const bak = temp + 64; - int sum = 0, i; - - assert(h == 8); - s->mb_intra = 0; - - s->pdsp.diff_pixels(temp, src1, src2, stride); - - memcpy(bak, temp, 64 * sizeof(int16_t)); - - s->block_last_index[0 /* FIXME */] = - s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i); - s->dct_unquantize_inter(s, temp, 0, s->qscale); - ff_simple_idct_8(temp); // FIXME - - for (i = 0; i < 64; i++) - sum += (temp[i] - bak[i]) * (temp[i] - bak[i]); - - return sum; -} - -static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2, - int stride, int h) -{ - const uint8_t *scantable = s->intra_scantable.permutated; - 
LOCAL_ALIGNED_16(int16_t, temp, [64]); - LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]); - LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]); - int i, last, run, bits, level, distortion, start_i; - const int esc_length = s->ac_esc_length; - uint8_t *length, *last_length; - - assert(h == 8); - - copy_block8(lsrc1, src1, 8, stride, 8); - copy_block8(lsrc2, src2, 8, stride, 8); - - s->pdsp.diff_pixels(temp, lsrc1, lsrc2, 8); - - s->block_last_index[0 /* FIXME */] = - last = - s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i); - - bits = 0; - - if (s->mb_intra) { - start_i = 1; - length = s->intra_ac_vlc_length; - last_length = s->intra_ac_vlc_last_length; - bits += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma - } else { - start_i = 0; - length = s->inter_ac_vlc_length; - last_length = s->inter_ac_vlc_last_length; - } - - if (last >= start_i) { - run = 0; - for (i = start_i; i < last; i++) { - int j = scantable[i]; - level = temp[j]; - - if (level) { - level += 64; - if ((level & (~127)) == 0) - bits += length[UNI_AC_ENC_INDEX(run, level)]; - else - bits += esc_length; - run = 0; - } else - run++; - } - i = scantable[last]; - - level = temp[i] + 64; - - assert(level - 64); - - if ((level & (~127)) == 0) { - bits += last_length[UNI_AC_ENC_INDEX(run, level)]; - } else - bits += esc_length; - } - - if (last >= 0) { - if (s->mb_intra) - s->dct_unquantize_intra(s, temp, 0, s->qscale); - else - s->dct_unquantize_inter(s, temp, 0, s->qscale); - } - - s->idsp.idct_add(lsrc2, 8, temp); - - distortion = s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8); - - return distortion + ((bits * s->qscale * s->qscale * 109 + 64) >> 7); -} - -static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2, - int stride, int h) -{ - const uint8_t *scantable = s->intra_scantable.permutated; - LOCAL_ALIGNED_16(int16_t, temp, [64]); - int i, last, run, bits, level, start_i; - const int esc_length = s->ac_esc_length; - uint8_t *length, *last_length; - - assert(h == 8); - - s->pdsp.diff_pixels(temp, src1, src2, stride); - - s->block_last_index[0 /* FIXME */] = - last = - s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i); - - bits = 0; - - if (s->mb_intra) { - start_i = 1; - length = s->intra_ac_vlc_length; - last_length = s->intra_ac_vlc_last_length; - bits += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma - } else { - start_i = 0; - length = s->inter_ac_vlc_length; - last_length = s->inter_ac_vlc_last_length; - } - - if (last >= start_i) { - run = 0; - for (i = start_i; i < last; i++) { - int j = scantable[i]; - level = temp[j]; - - if (level) { - level += 64; - if ((level & (~127)) == 0) - bits += length[UNI_AC_ENC_INDEX(run, level)]; - else - bits += esc_length; - run = 0; - } else - run++; - } - i = scantable[last]; - - level = temp[i] + 64; - - assert(level - 64); - - if ((level & (~127)) == 0) - bits += last_length[UNI_AC_ENC_INDEX(run, level)]; - else - bits += esc_length; - } - - return bits; -} - -#define VSAD_INTRA(size) \ -static int vsad_intra ## size ## _c(MpegEncContext *c, \ - uint8_t *s, uint8_t *dummy, \ - int stride, int h) \ -{ \ - int score = 0, x, y; \ - \ - for (y = 1; y < h; y++) { \ - for (x = 0; x < size; x += 4) { \ - score += FFABS(s[x] - s[x + stride]) + \ - FFABS(s[x + 1] - s[x + stride + 1]) + \ - FFABS(s[x + 2] - s[x + 2 + stride]) + \ - FFABS(s[x + 3] - s[x + 3 + stride]); \ - } \ - s += stride; \ - } \ - \ - return score; \ -} -VSAD_INTRA(8) -VSAD_INTRA(16) - -static int vsad16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, - int stride, int h) -{ - int score = 0, x, y; - - 
for (y = 1; y < h; y++) { - for (x = 0; x < 16; x++) - score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]); - s1 += stride; - s2 += stride; - } - - return score; -} - -#define SQ(a) ((a) * (a)) -#define VSSE_INTRA(size) \ -static int vsse_intra ## size ## _c(MpegEncContext *c, \ - uint8_t *s, uint8_t *dummy, \ - int stride, int h) \ -{ \ - int score = 0, x, y; \ - \ - for (y = 1; y < h; y++) { \ - for (x = 0; x < size; x += 4) { \ - score += SQ(s[x] - s[x + stride]) + \ - SQ(s[x + 1] - s[x + stride + 1]) + \ - SQ(s[x + 2] - s[x + stride + 2]) + \ - SQ(s[x + 3] - s[x + stride + 3]); \ - } \ - s += stride; \ - } \ - \ - return score; \ -} -VSSE_INTRA(8) -VSSE_INTRA(16) - -static int vsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, - int stride, int h) -{ - int score = 0, x, y; - - for (y = 1; y < h; y++) { - for (x = 0; x < 16; x++) - score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]); - s1 += stride; - s2 += stride; - } - - return score; -} - -#define WRAPPER8_16_SQ(name8, name16) \ -static int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src, \ - int stride, int h) \ -{ \ - int score = 0; \ - \ - score += name8(s, dst, src, stride, 8); \ - score += name8(s, dst + 8, src + 8, stride, 8); \ - if (h == 16) { \ - dst += 8 * stride; \ - src += 8 * stride; \ - score += name8(s, dst, src, stride, 8); \ - score += name8(s, dst + 8, src + 8, stride, 8); \ - } \ - return score; \ -} - -WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c) -WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c) -WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c) -#if CONFIG_GPL -WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c) -#endif -WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c) -WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c) -WRAPPER8_16_SQ(rd8x8_c, rd16_c) -WRAPPER8_16_SQ(bit8x8_c, bit16_c) - -/* init static data */ -av_cold void ff_dsputil_static_init(void) -{ - int i; - - for (i = 0; i < 512; i++) - ff_square_tab[i] = (i - 256) * (i - 256); -} - -av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx) -{ - c->sum_abs_dctelem = sum_abs_dctelem_c; - - /* TODO [0] 16 [1] 8 */ - c->pix_abs[0][0] = pix_abs16_c; - c->pix_abs[0][1] = pix_abs16_x2_c; - c->pix_abs[0][2] = pix_abs16_y2_c; - c->pix_abs[0][3] = pix_abs16_xy2_c; - c->pix_abs[1][0] = pix_abs8_c; - c->pix_abs[1][1] = pix_abs8_x2_c; - c->pix_abs[1][2] = pix_abs8_y2_c; - c->pix_abs[1][3] = pix_abs8_xy2_c; - -#define SET_CMP_FUNC(name) \ - c->name[0] = name ## 16_c; \ - c->name[1] = name ## 8x8_c; - - SET_CMP_FUNC(hadamard8_diff) - c->hadamard8_diff[4] = hadamard8_intra16_c; - c->hadamard8_diff[5] = hadamard8_intra8x8_c; - SET_CMP_FUNC(dct_sad) - SET_CMP_FUNC(dct_max) -#if CONFIG_GPL - SET_CMP_FUNC(dct264_sad) -#endif - c->sad[0] = pix_abs16_c; - c->sad[1] = pix_abs8_c; - c->sse[0] = sse16_c; - c->sse[1] = sse8_c; - c->sse[2] = sse4_c; - SET_CMP_FUNC(quant_psnr) - SET_CMP_FUNC(rd) - SET_CMP_FUNC(bit) - c->vsad[0] = vsad16_c; - c->vsad[4] = vsad_intra16_c; - c->vsad[5] = vsad_intra8_c; - c->vsse[0] = vsse16_c; - c->vsse[4] = vsse_intra16_c; - c->vsse[5] = vsse_intra8_c; - c->nsse[0] = nsse16_c; - c->nsse[1] = nsse8_c; - - if (ARCH_ARM) - ff_dsputil_init_arm(c, avctx); - if (ARCH_PPC) - ff_dsputil_init_ppc(c, avctx); - if (ARCH_X86) - ff_dsputil_init_x86(c, avctx); -} diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h deleted file mode 100644 index 350ce1c44a..0000000000 --- a/libavcodec/dsputil.h +++ /dev/null @@ -1,85 +0,0 @@ -/* - * DSP utils - * Copyright (c) 2000, 2001, 2002 Fabrice Bellard - * Copyright (c) 2002-2004 
Michael Niedermayer - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file - * DSP utils. - * Note, many functions in here may use MMX which trashes the FPU state, it is - * absolutely necessary to call emms_c() between DSP & float/double code. - */ - -#ifndef AVCODEC_DSPUTIL_H -#define AVCODEC_DSPUTIL_H - -#include "avcodec.h" - -extern uint32_t ff_square_tab[512]; - -struct MpegEncContext; -/* Motion estimation: - * h is limited to { width / 2, width, 2 * width }, - * but never larger than 16 and never smaller than 2. - * Although currently h < 4 is not used as functions with - * width < 8 are neither used nor implemented. */ -typedef int (*me_cmp_func)(struct MpegEncContext *c, - uint8_t *blk1 /* align width (8 or 16) */, - uint8_t *blk2 /* align 1 */, int line_size, int h); - -/** - * DSPContext. - */ -typedef struct DSPContext { - int (*sum_abs_dctelem)(int16_t *block /* align 16 */); - - me_cmp_func sad[6]; /* identical to pix_absAxA except additional void * */ - me_cmp_func sse[6]; - me_cmp_func hadamard8_diff[6]; - me_cmp_func dct_sad[6]; - me_cmp_func quant_psnr[6]; - me_cmp_func bit[6]; - me_cmp_func rd[6]; - me_cmp_func vsad[6]; - me_cmp_func vsse[6]; - me_cmp_func nsse[6]; - me_cmp_func dct_max[6]; - me_cmp_func dct264_sad[6]; - - me_cmp_func me_pre_cmp[6]; - me_cmp_func me_cmp[6]; - me_cmp_func me_sub_cmp[6]; - me_cmp_func mb_cmp[6]; - me_cmp_func ildct_cmp[6]; // only width 16 used - me_cmp_func frame_skip_cmp[6]; // only width 8 used - - me_cmp_func pix_abs[2][4]; -} DSPContext; - -void ff_dsputil_static_init(void); -void ff_dsputil_init(DSPContext *p, AVCodecContext *avctx); - -void ff_set_cmp(DSPContext *c, me_cmp_func *cmp, int type); - -void ff_dsputil_init_arm(DSPContext *c, AVCodecContext *avctx); -void ff_dsputil_init_ppc(DSPContext *c, AVCodecContext *avctx); -void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx); - -#endif /* AVCODEC_DSPUTIL_H */ diff --git a/libavcodec/dv.h b/libavcodec/dv.h index f162bb944f..019c15add9 100644 --- a/libavcodec/dv.h +++ b/libavcodec/dv.h @@ -28,7 +28,7 @@ #define AVCODEC_DV_H #include "avcodec.h" -#include "dsputil.h" +#include "me_cmp.h" #include "get_bits.h" #include "dv_profile.h" diff --git a/libavcodec/dvenc.c b/libavcodec/dvenc.c index 9f458e3e47..5031218b05 100644 --- a/libavcodec/dvenc.c +++ b/libavcodec/dvenc.c @@ -28,9 +28,9 @@ #include "libavutil/pixdesc.h" #include "config.h" #include "avcodec.h" -#include "dsputil.h" #include "fdctdsp.h" #include "internal.h" +#include "me_cmp.h" #include "pixblockdsp.h" #include "put_bits.h" #include "dv.h" @@ -40,8 +40,8 @@ static av_cold int dvvideo_encode_init(AVCodecContext *avctx) { DVVideoContext *s = avctx->priv_data; - DSPContext dsp; FDCTDSPContext fdsp; + MECmpContext mecc; PixblockDSPContext pdsp; int ret; @@ -65,13 +65,13 @@ static av_cold 
int dvvideo_encode_init(AVCodecContext *avctx) dv_vlc_map_tableinit(); - ff_dsputil_init(&dsp, avctx); ff_fdctdsp_init(&fdsp, avctx); + ff_me_cmp_init(&mecc, avctx); ff_pixblockdsp_init(&pdsp, avctx); - ff_set_cmp(&dsp, dsp.ildct_cmp, avctx->ildct_cmp); + ff_set_cmp(&mecc, mecc.ildct_cmp, avctx->ildct_cmp); s->get_pixels = pdsp.get_pixels; - s->ildct_cmp = dsp.ildct_cmp[5]; + s->ildct_cmp = mecc.ildct_cmp[5]; s->fdct[0] = fdsp.fdct; s->fdct[1] = fdsp.fdct248; diff --git a/libavcodec/error_resilience.c b/libavcodec/error_resilience.c index 6ab5a980da..b41474ad48 100644 --- a/libavcodec/error_resilience.c +++ b/libavcodec/error_resilience.c @@ -715,11 +715,11 @@ FF_ENABLE_DEPRECATION_WARNINGS } else { ff_thread_await_progress(s->last_pic.tf, mb_y, 0); } - is_intra_likely += s->dsp->sad[0](NULL, last_mb_ptr, mb_ptr, - linesize[0], 16); - is_intra_likely -= s->dsp->sad[0](NULL, last_mb_ptr, - last_mb_ptr + linesize[0] * 16, - linesize[0], 16); + is_intra_likely += s->mecc->sad[0](NULL, last_mb_ptr, mb_ptr, + linesize[0], 16); + is_intra_likely -= s->mecc->sad[0](NULL, last_mb_ptr, + last_mb_ptr + linesize[0] * 16, + linesize[0], 16); } else { if (IS_INTRA(s->cur_pic.mb_type[mb_xy])) is_intra_likely++; diff --git a/libavcodec/error_resilience.h b/libavcodec/error_resilience.h index 5171094a54..7b9ec1918e 100644 --- a/libavcodec/error_resilience.h +++ b/libavcodec/error_resilience.h @@ -23,7 +23,7 @@ #include #include "avcodec.h" -#include "dsputil.h" +#include "me_cmp.h" #include "thread.h" ///< current MB is the first after a resync marker @@ -52,7 +52,7 @@ typedef struct ERPicture { typedef struct ERContext { AVCodecContext *avctx; - DSPContext *dsp; + MECmpContext *mecc; int *mb_index2xy; int mb_num; diff --git a/libavcodec/h264.c b/libavcodec/h264.c index dcd2ad831e..ba30e5d38e 100644 --- a/libavcodec/h264.c +++ b/libavcodec/h264.c @@ -33,7 +33,6 @@ #include "internal.h" #include "cabac.h" #include "cabac_functions.h" -#include "dsputil.h" #include "error_resilience.h" #include "avcodec.h" #include "h264.h" @@ -42,6 +41,7 @@ #include "h264_mvpred.h" #include "golomb.h" #include "mathops.h" +#include "me_cmp.h" #include "mpegutils.h" #include "rectangle.h" #include "svq3.h" @@ -490,7 +490,7 @@ int ff_h264_context_init(H264Context *h) if (CONFIG_ERROR_RESILIENCE) { /* init ER */ er->avctx = h->avctx; - er->dsp = &h->dsp; + er->mecc = &h->mecc; er->decode_mb = h264_er_decode_mb; er->opaque = h; er->quarter_sample = 1; @@ -620,7 +620,7 @@ av_cold int ff_h264_decode_init(AVCodecContext *avctx) /* needed so that IDCT permutation is known early */ if (CONFIG_ERROR_RESILIENCE) - ff_dsputil_init(&h->dsp, h->avctx); + ff_me_cmp_init(&h->mecc, h->avctx); ff_videodsp_init(&h->vdsp, 8); memset(h->pps.scaling_matrix4, 16, 6 * 16 * sizeof(uint8_t)); @@ -1234,7 +1234,7 @@ int ff_h264_set_parameter_from_sps(H264Context *h) ff_h264_pred_init(&h->hpc, h->avctx->codec_id, h->sps.bit_depth_luma, h->sps.chroma_format_idc); if (CONFIG_ERROR_RESILIENCE) - ff_dsputil_init(&h->dsp, h->avctx); + ff_me_cmp_init(&h->mecc, h->avctx); ff_videodsp_init(&h->vdsp, h->sps.bit_depth_luma); } else { av_log(h->avctx, AV_LOG_ERROR, "Unsupported bit depth %d\n", diff --git a/libavcodec/h264.h b/libavcodec/h264.h index 39023dab63..cd4bf87690 100644 --- a/libavcodec/h264.h +++ b/libavcodec/h264.h @@ -30,13 +30,13 @@ #include "libavutil/intreadwrite.h" #include "cabac.h" -#include "dsputil.h" #include "error_resilience.h" #include "get_bits.h" #include "h264chroma.h" #include "h264dsp.h" #include "h264pred.h" #include "h264qpel.h" 
+#include "me_cmp.h" #include "mpegutils.h" #include "parser.h" #include "qpeldsp.h" @@ -302,7 +302,7 @@ typedef struct H264Picture { */ typedef struct H264Context { AVCodecContext *avctx; - DSPContext dsp; + MECmpContext mecc; VideoDSPContext vdsp; H264DSPContext h264dsp; H264ChromaContext h264chroma; diff --git a/libavcodec/h264_slice.c b/libavcodec/h264_slice.c index 1b9f56b488..ce62fbf68e 100644 --- a/libavcodec/h264_slice.c +++ b/libavcodec/h264_slice.c @@ -31,7 +31,6 @@ #include "internal.h" #include "cabac.h" #include "cabac_functions.h" -#include "dsputil.h" #include "error_resilience.h" #include "avcodec.h" #include "h264.h" @@ -1119,7 +1118,7 @@ static int h264_slice_header_init(H264Context *h, int reinit) if (!c) return AVERROR(ENOMEM); c->avctx = h->avctx; - c->dsp = h->dsp; + c->mecc = h->mecc; c->vdsp = h->vdsp; c->h264dsp = h->h264dsp; c->h264qpel = h->h264qpel; diff --git a/libavcodec/me_cmp.c b/libavcodec/me_cmp.c new file mode 100644 index 0000000000..9fcc93739a --- /dev/null +++ b/libavcodec/me_cmp.c @@ -0,0 +1,942 @@ +/* + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "avcodec.h" +#include "copy_block.h" +#include "simple_idct.h" +#include "me_cmp.h" +#include "mpegvideo.h" +#include "config.h" + +uint32_t ff_square_tab[512] = { 0, }; + +static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + int line_size, int h) +{ + int s = 0, i; + uint32_t *sq = ff_square_tab + 256; + + for (i = 0; i < h; i++) { + s += sq[pix1[0] - pix2[0]]; + s += sq[pix1[1] - pix2[1]]; + s += sq[pix1[2] - pix2[2]]; + s += sq[pix1[3] - pix2[3]]; + pix1 += line_size; + pix2 += line_size; + } + return s; +} + +static int sse8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + int line_size, int h) +{ + int s = 0, i; + uint32_t *sq = ff_square_tab + 256; + + for (i = 0; i < h; i++) { + s += sq[pix1[0] - pix2[0]]; + s += sq[pix1[1] - pix2[1]]; + s += sq[pix1[2] - pix2[2]]; + s += sq[pix1[3] - pix2[3]]; + s += sq[pix1[4] - pix2[4]]; + s += sq[pix1[5] - pix2[5]]; + s += sq[pix1[6] - pix2[6]]; + s += sq[pix1[7] - pix2[7]]; + pix1 += line_size; + pix2 += line_size; + } + return s; +} + +static int sse16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + int line_size, int h) +{ + int s = 0, i; + uint32_t *sq = ff_square_tab + 256; + + for (i = 0; i < h; i++) { + s += sq[pix1[0] - pix2[0]]; + s += sq[pix1[1] - pix2[1]]; + s += sq[pix1[2] - pix2[2]]; + s += sq[pix1[3] - pix2[3]]; + s += sq[pix1[4] - pix2[4]]; + s += sq[pix1[5] - pix2[5]]; + s += sq[pix1[6] - pix2[6]]; + s += sq[pix1[7] - pix2[7]]; + s += sq[pix1[8] - pix2[8]]; + s += sq[pix1[9] - pix2[9]]; + s += sq[pix1[10] - pix2[10]]; + s += sq[pix1[11] - pix2[11]]; + s += sq[pix1[12] - pix2[12]]; + s += sq[pix1[13] - pix2[13]]; + s += sq[pix1[14] - pix2[14]]; + s += sq[pix1[15] - 
pix2[15]]; + + pix1 += line_size; + pix2 += line_size; + } + return s; +} + +static int sum_abs_dctelem_c(int16_t *block) +{ + int sum = 0, i; + + for (i = 0; i < 64; i++) + sum += FFABS(block[i]); + return sum; +} + +#define avg2(a, b) ((a + b + 1) >> 1) +#define avg4(a, b, c, d) ((a + b + c + d + 2) >> 2) + +static inline int pix_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + int line_size, int h) +{ + int s = 0, i; + + for (i = 0; i < h; i++) { + s += abs(pix1[0] - pix2[0]); + s += abs(pix1[1] - pix2[1]); + s += abs(pix1[2] - pix2[2]); + s += abs(pix1[3] - pix2[3]); + s += abs(pix1[4] - pix2[4]); + s += abs(pix1[5] - pix2[5]); + s += abs(pix1[6] - pix2[6]); + s += abs(pix1[7] - pix2[7]); + s += abs(pix1[8] - pix2[8]); + s += abs(pix1[9] - pix2[9]); + s += abs(pix1[10] - pix2[10]); + s += abs(pix1[11] - pix2[11]); + s += abs(pix1[12] - pix2[12]); + s += abs(pix1[13] - pix2[13]); + s += abs(pix1[14] - pix2[14]); + s += abs(pix1[15] - pix2[15]); + pix1 += line_size; + pix2 += line_size; + } + return s; +} + +static int pix_abs16_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + int line_size, int h) +{ + int s = 0, i; + + for (i = 0; i < h; i++) { + s += abs(pix1[0] - avg2(pix2[0], pix2[1])); + s += abs(pix1[1] - avg2(pix2[1], pix2[2])); + s += abs(pix1[2] - avg2(pix2[2], pix2[3])); + s += abs(pix1[3] - avg2(pix2[3], pix2[4])); + s += abs(pix1[4] - avg2(pix2[4], pix2[5])); + s += abs(pix1[5] - avg2(pix2[5], pix2[6])); + s += abs(pix1[6] - avg2(pix2[6], pix2[7])); + s += abs(pix1[7] - avg2(pix2[7], pix2[8])); + s += abs(pix1[8] - avg2(pix2[8], pix2[9])); + s += abs(pix1[9] - avg2(pix2[9], pix2[10])); + s += abs(pix1[10] - avg2(pix2[10], pix2[11])); + s += abs(pix1[11] - avg2(pix2[11], pix2[12])); + s += abs(pix1[12] - avg2(pix2[12], pix2[13])); + s += abs(pix1[13] - avg2(pix2[13], pix2[14])); + s += abs(pix1[14] - avg2(pix2[14], pix2[15])); + s += abs(pix1[15] - avg2(pix2[15], pix2[16])); + pix1 += line_size; + pix2 += line_size; + } + return s; +} + +static int pix_abs16_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + int line_size, int h) +{ + int s = 0, i; + uint8_t *pix3 = pix2 + line_size; + + for (i = 0; i < h; i++) { + s += abs(pix1[0] - avg2(pix2[0], pix3[0])); + s += abs(pix1[1] - avg2(pix2[1], pix3[1])); + s += abs(pix1[2] - avg2(pix2[2], pix3[2])); + s += abs(pix1[3] - avg2(pix2[3], pix3[3])); + s += abs(pix1[4] - avg2(pix2[4], pix3[4])); + s += abs(pix1[5] - avg2(pix2[5], pix3[5])); + s += abs(pix1[6] - avg2(pix2[6], pix3[6])); + s += abs(pix1[7] - avg2(pix2[7], pix3[7])); + s += abs(pix1[8] - avg2(pix2[8], pix3[8])); + s += abs(pix1[9] - avg2(pix2[9], pix3[9])); + s += abs(pix1[10] - avg2(pix2[10], pix3[10])); + s += abs(pix1[11] - avg2(pix2[11], pix3[11])); + s += abs(pix1[12] - avg2(pix2[12], pix3[12])); + s += abs(pix1[13] - avg2(pix2[13], pix3[13])); + s += abs(pix1[14] - avg2(pix2[14], pix3[14])); + s += abs(pix1[15] - avg2(pix2[15], pix3[15])); + pix1 += line_size; + pix2 += line_size; + pix3 += line_size; + } + return s; +} + +static int pix_abs16_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + int line_size, int h) +{ + int s = 0, i; + uint8_t *pix3 = pix2 + line_size; + + for (i = 0; i < h; i++) { + s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1])); + s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2])); + s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3])); + s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4])); + s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5])); + s += abs(pix1[5] - 
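The pix_abs16_x2/_y2/_xy2 variants above are SADs against a half-pel interpolated reference, built from the avg2/avg4 macros with a round-to-nearest bias. A compact loop-based restatement (sad_hpel() is an illustrative helper; like the unrolled originals it reads one extra column and/or row of the reference block):

#include <stdint.h>
#include <stdlib.h>

#define AVG2(a, b)       (((a) + (b) + 1) >> 1)              /* round to nearest */
#define AVG4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)

/* SAD of a w x h block against a reference shifted by half a pixel
 * horizontally (dx), vertically (dy) or both, mirroring what the
 * pix_abs*_{x2,y2,xy2}_c kernels compute with unrolled loops. */
static int sad_hpel(const uint8_t *cur, const uint8_t *ref,
                    int stride, int w, int h, int dx, int dy)
{
    int s = 0;

    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            int p;
            if (dx && dy)
                p = AVG4(ref[x], ref[x + 1],
                         ref[x + stride], ref[x + stride + 1]);
            else if (dx)
                p = AVG2(ref[x], ref[x + 1]);
            else if (dy)
                p = AVG2(ref[x], ref[x + stride]);
            else
                p = ref[x];
            s += abs(cur[x] - p);
        }
        cur += stride;
        ref += stride;
    }
    return s;
}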
avg4(pix2[5], pix2[6], pix3[5], pix3[6])); + s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7])); + s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8])); + s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9])); + s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10])); + s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11])); + s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12])); + s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13])); + s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14])); + s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15])); + s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16])); + pix1 += line_size; + pix2 += line_size; + pix3 += line_size; + } + return s; +} + +static inline int pix_abs8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + int line_size, int h) +{ + int s = 0, i; + + for (i = 0; i < h; i++) { + s += abs(pix1[0] - pix2[0]); + s += abs(pix1[1] - pix2[1]); + s += abs(pix1[2] - pix2[2]); + s += abs(pix1[3] - pix2[3]); + s += abs(pix1[4] - pix2[4]); + s += abs(pix1[5] - pix2[5]); + s += abs(pix1[6] - pix2[6]); + s += abs(pix1[7] - pix2[7]); + pix1 += line_size; + pix2 += line_size; + } + return s; +} + +static int pix_abs8_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + int line_size, int h) +{ + int s = 0, i; + + for (i = 0; i < h; i++) { + s += abs(pix1[0] - avg2(pix2[0], pix2[1])); + s += abs(pix1[1] - avg2(pix2[1], pix2[2])); + s += abs(pix1[2] - avg2(pix2[2], pix2[3])); + s += abs(pix1[3] - avg2(pix2[3], pix2[4])); + s += abs(pix1[4] - avg2(pix2[4], pix2[5])); + s += abs(pix1[5] - avg2(pix2[5], pix2[6])); + s += abs(pix1[6] - avg2(pix2[6], pix2[7])); + s += abs(pix1[7] - avg2(pix2[7], pix2[8])); + pix1 += line_size; + pix2 += line_size; + } + return s; +} + +static int pix_abs8_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + int line_size, int h) +{ + int s = 0, i; + uint8_t *pix3 = pix2 + line_size; + + for (i = 0; i < h; i++) { + s += abs(pix1[0] - avg2(pix2[0], pix3[0])); + s += abs(pix1[1] - avg2(pix2[1], pix3[1])); + s += abs(pix1[2] - avg2(pix2[2], pix3[2])); + s += abs(pix1[3] - avg2(pix2[3], pix3[3])); + s += abs(pix1[4] - avg2(pix2[4], pix3[4])); + s += abs(pix1[5] - avg2(pix2[5], pix3[5])); + s += abs(pix1[6] - avg2(pix2[6], pix3[6])); + s += abs(pix1[7] - avg2(pix2[7], pix3[7])); + pix1 += line_size; + pix2 += line_size; + pix3 += line_size; + } + return s; +} + +static int pix_abs8_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + int line_size, int h) +{ + int s = 0, i; + uint8_t *pix3 = pix2 + line_size; + + for (i = 0; i < h; i++) { + s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1])); + s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2])); + s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3])); + s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4])); + s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5])); + s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6])); + s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7])); + s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8])); + pix1 += line_size; + pix2 += line_size; + pix3 += line_size; + } + return s; +} + +static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h) +{ + int score1 = 0, score2 = 0, x, y; + + for (y = 0; y < h; y++) { + for (x = 0; x < 16; x++) + score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]); + if (y + 1 < h) { + for (x = 0; x < 15; x++) + score2 += 
FFABS(s1[x] - s1[x + stride] - + s1[x + 1] + s1[x + stride + 1]) - + FFABS(s2[x] - s2[x + stride] - + s2[x + 1] + s2[x + stride + 1]); + } + s1 += stride; + s2 += stride; + } + + if (c) + return score1 + FFABS(score2) * c->avctx->nsse_weight; + else + return score1 + FFABS(score2) * 8; +} + +static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h) +{ + int score1 = 0, score2 = 0, x, y; + + for (y = 0; y < h; y++) { + for (x = 0; x < 8; x++) + score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]); + if (y + 1 < h) { + for (x = 0; x < 7; x++) + score2 += FFABS(s1[x] - s1[x + stride] - + s1[x + 1] + s1[x + stride + 1]) - + FFABS(s2[x] - s2[x + stride] - + s2[x + 1] + s2[x + stride + 1]); + } + s1 += stride; + s2 += stride; + } + + if (c) + return score1 + FFABS(score2) * c->avctx->nsse_weight; + else + return score1 + FFABS(score2) * 8; +} + +static int zero_cmp(MpegEncContext *s, uint8_t *a, uint8_t *b, + int stride, int h) +{ + return 0; +} + +void ff_set_cmp(MECmpContext *c, me_cmp_func *cmp, int type) +{ + int i; + + memset(cmp, 0, sizeof(void *) * 6); + + for (i = 0; i < 6; i++) { + switch (type & 0xFF) { + case FF_CMP_SAD: + cmp[i] = c->sad[i]; + break; + case FF_CMP_SATD: + cmp[i] = c->hadamard8_diff[i]; + break; + case FF_CMP_SSE: + cmp[i] = c->sse[i]; + break; + case FF_CMP_DCT: + cmp[i] = c->dct_sad[i]; + break; + case FF_CMP_DCT264: + cmp[i] = c->dct264_sad[i]; + break; + case FF_CMP_DCTMAX: + cmp[i] = c->dct_max[i]; + break; + case FF_CMP_PSNR: + cmp[i] = c->quant_psnr[i]; + break; + case FF_CMP_BIT: + cmp[i] = c->bit[i]; + break; + case FF_CMP_RD: + cmp[i] = c->rd[i]; + break; + case FF_CMP_VSAD: + cmp[i] = c->vsad[i]; + break; + case FF_CMP_VSSE: + cmp[i] = c->vsse[i]; + break; + case FF_CMP_ZERO: + cmp[i] = zero_cmp; + break; + case FF_CMP_NSSE: + cmp[i] = c->nsse[i]; + break; + default: + av_log(NULL, AV_LOG_ERROR, + "internal error in cmp function selection\n"); + } + } +} + +#define BUTTERFLY2(o1, o2, i1, i2) \ + o1 = (i1) + (i2); \ + o2 = (i1) - (i2); + +#define BUTTERFLY1(x, y) \ + { \ + int a, b; \ + a = x; \ + b = y; \ + x = a + b; \ + y = a - b; \ + } + +#define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y))) + +static int hadamard8_diff8x8_c(MpegEncContext *s, uint8_t *dst, + uint8_t *src, int stride, int h) +{ + int i, temp[64], sum = 0; + + assert(h == 8); + + for (i = 0; i < 8; i++) { + // FIXME: try pointer walks + BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1], + src[stride * i + 0] - dst[stride * i + 0], + src[stride * i + 1] - dst[stride * i + 1]); + BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3], + src[stride * i + 2] - dst[stride * i + 2], + src[stride * i + 3] - dst[stride * i + 3]); + BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5], + src[stride * i + 4] - dst[stride * i + 4], + src[stride * i + 5] - dst[stride * i + 5]); + BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7], + src[stride * i + 6] - dst[stride * i + 6], + src[stride * i + 7] - dst[stride * i + 7]); + + BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]); + BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]); + BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]); + BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]); + + BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]); + BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]); + BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]); + BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]); + } + + for (i = 0; i < 8; i++) { + BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]); + BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]); + BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]); + BUTTERFLY1(temp[8 * 6 + i], 
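nsse8_c/nsse16_c above compute a "noise preserving" SSE: the plain squared error plus a weighted penalty on how much the local 2x2 second-order gradients (texture) of the two blocks differ, so noise that keeps texture scores better than blurring. A readable restatement, with an explicit weight parameter standing in for avctx->nsse_weight (8 when no context is available), and nsse_block() as an illustrative name:

#include <stdint.h>
#include <stdlib.h>

static int nsse_block(const uint8_t *s1, const uint8_t *s2,
                      int stride, int w, int h, int weight)
{
    int sse = 0, grad = 0;

    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++)
            sse += (s1[x] - s2[x]) * (s1[x] - s2[x]);
        if (y + 1 < h) {
            /* difference of the 2x2 second-order gradients of both blocks */
            for (int x = 0; x + 1 < w; x++) {
                int g1 = s1[x] - s1[x + stride] - s1[x + 1] + s1[x + stride + 1];
                int g2 = s2[x] - s2[x + stride] - s2[x + 1] + s2[x + stride + 1];
                grad += abs(g1) - abs(g2);
            }
        }
        s1 += stride;
        s2 += stride;
    }
    return sse + abs(grad) * weight;
}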
temp[8 * 7 + i]); + + BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]); + BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]); + BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]); + BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]); + + sum += BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) + + BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) + + BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) + + BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]); + } + return sum; +} + +static int hadamard8_intra8x8_c(MpegEncContext *s, uint8_t *src, + uint8_t *dummy, int stride, int h) +{ + int i, temp[64], sum = 0; + + assert(h == 8); + + for (i = 0; i < 8; i++) { + // FIXME: try pointer walks + BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1], + src[stride * i + 0], src[stride * i + 1]); + BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3], + src[stride * i + 2], src[stride * i + 3]); + BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5], + src[stride * i + 4], src[stride * i + 5]); + BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7], + src[stride * i + 6], src[stride * i + 7]); + + BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]); + BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]); + BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]); + BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]); + + BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]); + BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]); + BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]); + BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]); + } + + for (i = 0; i < 8; i++) { + BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]); + BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]); + BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]); + BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]); + + BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]); + BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]); + BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]); + BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]); + + sum += + BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) + + BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) + + BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) + + BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]); + } + + sum -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean + + return sum; +} + +static int dct_sad8x8_c(MpegEncContext *s, uint8_t *src1, + uint8_t *src2, int stride, int h) +{ + LOCAL_ALIGNED_16(int16_t, temp, [64]); + + assert(h == 8); + + s->pdsp.diff_pixels(temp, src1, src2, stride); + s->fdsp.fdct(temp); + return s->mecc.sum_abs_dctelem(temp); +} + +#if CONFIG_GPL +#define DCT8_1D \ + { \ + const int s07 = SRC(0) + SRC(7); \ + const int s16 = SRC(1) + SRC(6); \ + const int s25 = SRC(2) + SRC(5); \ + const int s34 = SRC(3) + SRC(4); \ + const int a0 = s07 + s34; \ + const int a1 = s16 + s25; \ + const int a2 = s07 - s34; \ + const int a3 = s16 - s25; \ + const int d07 = SRC(0) - SRC(7); \ + const int d16 = SRC(1) - SRC(6); \ + const int d25 = SRC(2) - SRC(5); \ + const int d34 = SRC(3) - SRC(4); \ + const int a4 = d16 + d25 + (d07 + (d07 >> 1)); \ + const int a5 = d07 - d34 - (d25 + (d25 >> 1)); \ + const int a6 = d07 + d34 - (d16 + (d16 >> 1)); \ + const int a7 = d16 - d25 + (d34 + (d34 >> 1)); \ + DST(0, a0 + a1); \ + DST(1, a4 + (a7 >> 2)); \ + DST(2, a2 + (a3 >> 1)); \ + DST(3, a5 + (a6 >> 2)); \ + DST(4, a0 - a1); \ + DST(5, a6 - (a5 >> 2)); \ + DST(6, (a2 >> 1) - a3); \ + DST(7, (a4 >> 2) - a7); \ + } + +static int dct264_sad8x8_c(MpegEncContext *s, uint8_t *src1, + uint8_t *src2, int stride, int h) +{ + int16_t dct[8][8]; + int i, sum = 0; + + s->pdsp.diff_pixels(dct[0], src1, src2, stride); + +#define SRC(x) dct[i][x] +#define DST(x, v) dct[i][x] = v + for (i = 0; i < 8; i++) + 
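hadamard8_diff8x8_c above is an 8x8 SATD: it Hadamard-transforms the rows and columns of the difference block with fully unrolled butterflies and sums the absolute values of the result. The loop-based equivalent below produces the same total, since the absolute sum does not depend on coefficient ordering; wht8() and satd8x8() are illustrative names:

#include <stdint.h>
#include <stdlib.h>

/* In-place unnormalized 8-point Walsh-Hadamard transform
 * (the same three butterfly stages as the unrolled code). */
static void wht8(int v[8])
{
    for (int len = 1; len < 8; len <<= 1)
        for (int i = 0; i < 8; i += len << 1)
            for (int j = i; j < i + len; j++) {
                int a = v[j], b = v[j + len];
                v[j]       = a + b;
                v[j + len] = a - b;
            }
}

/* SATD of src - dst over an 8x8 block: transform rows, then columns,
 * then sum the absolute values of all 64 coefficients. */
static int satd8x8(const uint8_t *dst, const uint8_t *src, int stride)
{
    int m[8][8], col[8], sum = 0;

    for (int i = 0; i < 8; i++) {
        for (int j = 0; j < 8; j++)
            m[i][j] = src[i * stride + j] - dst[i * stride + j];
        wht8(m[i]);
    }
    for (int j = 0; j < 8; j++) {
        for (int i = 0; i < 8; i++)
            col[i] = m[i][j];
        wht8(col);
        for (int i = 0; i < 8; i++)
            sum += abs(col[i]);
    }
    return sum;
}

hadamard8_intra8x8_c that follows is the same transform applied to the source block alone, with the DC (mean) contribution subtracted off at the end.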
DCT8_1D +#undef SRC +#undef DST + +#define SRC(x) dct[x][i] +#define DST(x, v) sum += FFABS(v) + for (i = 0; i < 8; i++) + DCT8_1D +#undef SRC +#undef DST + return sum; +} +#endif + +static int dct_max8x8_c(MpegEncContext *s, uint8_t *src1, + uint8_t *src2, int stride, int h) +{ + LOCAL_ALIGNED_16(int16_t, temp, [64]); + int sum = 0, i; + + assert(h == 8); + + s->pdsp.diff_pixels(temp, src1, src2, stride); + s->fdsp.fdct(temp); + + for (i = 0; i < 64; i++) + sum = FFMAX(sum, FFABS(temp[i])); + + return sum; +} + +static int quant_psnr8x8_c(MpegEncContext *s, uint8_t *src1, + uint8_t *src2, int stride, int h) +{ + LOCAL_ALIGNED_16(int16_t, temp, [64 * 2]); + int16_t *const bak = temp + 64; + int sum = 0, i; + + assert(h == 8); + s->mb_intra = 0; + + s->pdsp.diff_pixels(temp, src1, src2, stride); + + memcpy(bak, temp, 64 * sizeof(int16_t)); + + s->block_last_index[0 /* FIXME */] = + s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i); + s->dct_unquantize_inter(s, temp, 0, s->qscale); + ff_simple_idct_8(temp); // FIXME + + for (i = 0; i < 64; i++) + sum += (temp[i] - bak[i]) * (temp[i] - bak[i]); + + return sum; +} + +static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2, + int stride, int h) +{ + const uint8_t *scantable = s->intra_scantable.permutated; + LOCAL_ALIGNED_16(int16_t, temp, [64]); + LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]); + LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]); + int i, last, run, bits, level, distortion, start_i; + const int esc_length = s->ac_esc_length; + uint8_t *length, *last_length; + + assert(h == 8); + + copy_block8(lsrc1, src1, 8, stride, 8); + copy_block8(lsrc2, src2, 8, stride, 8); + + s->pdsp.diff_pixels(temp, lsrc1, lsrc2, 8); + + s->block_last_index[0 /* FIXME */] = + last = + s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i); + + bits = 0; + + if (s->mb_intra) { + start_i = 1; + length = s->intra_ac_vlc_length; + last_length = s->intra_ac_vlc_last_length; + bits += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma + } else { + start_i = 0; + length = s->inter_ac_vlc_length; + last_length = s->inter_ac_vlc_last_length; + } + + if (last >= start_i) { + run = 0; + for (i = start_i; i < last; i++) { + int j = scantable[i]; + level = temp[j]; + + if (level) { + level += 64; + if ((level & (~127)) == 0) + bits += length[UNI_AC_ENC_INDEX(run, level)]; + else + bits += esc_length; + run = 0; + } else + run++; + } + i = scantable[last]; + + level = temp[i] + 64; + + assert(level - 64); + + if ((level & (~127)) == 0) { + bits += last_length[UNI_AC_ENC_INDEX(run, level)]; + } else + bits += esc_length; + } + + if (last >= 0) { + if (s->mb_intra) + s->dct_unquantize_intra(s, temp, 0, s->qscale); + else + s->dct_unquantize_inter(s, temp, 0, s->qscale); + } + + s->idsp.idct_add(lsrc2, 8, temp); + + distortion = s->mecc.sse[1](NULL, lsrc2, lsrc1, 8, 8); + + return distortion + ((bits * s->qscale * s->qscale * 109 + 64) >> 7); +} + +static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2, + int stride, int h) +{ + const uint8_t *scantable = s->intra_scantable.permutated; + LOCAL_ALIGNED_16(int16_t, temp, [64]); + int i, last, run, bits, level, start_i; + const int esc_length = s->ac_esc_length; + uint8_t *length, *last_length; + + assert(h == 8); + + s->pdsp.diff_pixels(temp, src1, src2, stride); + + s->block_last_index[0 /* FIXME */] = + last = + s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i); + + bits = 0; + + if (s->mb_intra) { + start_i = 1; + length = s->intra_ac_vlc_length; + last_length = 
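rd8x8_c above is a true rate-distortion metric: it quantizes the difference block, counts the VLC bits needed to code it, dequantizes and inverse-transforms it back, measures SSE against the original pixels, and folds the two into one score. Only that final combination step is restated here; rd_cost() is an illustrative name:

/* cost = D + lambda * R, with a fixed-point lambda of (109/128) * qscale^2,
 * i.e. the expression used at the end of rd8x8_c. */
static int rd_cost(int distortion, int bits, int qscale)
{
    return distortion + ((bits * qscale * qscale * 109 + 64) >> 7);
}

bit8x8_c, which follows, is the rate-only half of the same computation: it counts the VLC bits but skips reconstruction and distortion.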
s->intra_ac_vlc_last_length; + bits += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma + } else { + start_i = 0; + length = s->inter_ac_vlc_length; + last_length = s->inter_ac_vlc_last_length; + } + + if (last >= start_i) { + run = 0; + for (i = start_i; i < last; i++) { + int j = scantable[i]; + level = temp[j]; + + if (level) { + level += 64; + if ((level & (~127)) == 0) + bits += length[UNI_AC_ENC_INDEX(run, level)]; + else + bits += esc_length; + run = 0; + } else + run++; + } + i = scantable[last]; + + level = temp[i] + 64; + + assert(level - 64); + + if ((level & (~127)) == 0) + bits += last_length[UNI_AC_ENC_INDEX(run, level)]; + else + bits += esc_length; + } + + return bits; +} + +#define VSAD_INTRA(size) \ +static int vsad_intra ## size ## _c(MpegEncContext *c, \ + uint8_t *s, uint8_t *dummy, \ + int stride, int h) \ +{ \ + int score = 0, x, y; \ + \ + for (y = 1; y < h; y++) { \ + for (x = 0; x < size; x += 4) { \ + score += FFABS(s[x] - s[x + stride]) + \ + FFABS(s[x + 1] - s[x + stride + 1]) + \ + FFABS(s[x + 2] - s[x + 2 + stride]) + \ + FFABS(s[x + 3] - s[x + 3 + stride]); \ + } \ + s += stride; \ + } \ + \ + return score; \ +} +VSAD_INTRA(8) +VSAD_INTRA(16) + +static int vsad16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, + int stride, int h) +{ + int score = 0, x, y; + + for (y = 1; y < h; y++) { + for (x = 0; x < 16; x++) + score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]); + s1 += stride; + s2 += stride; + } + + return score; +} + +#define SQ(a) ((a) * (a)) +#define VSSE_INTRA(size) \ +static int vsse_intra ## size ## _c(MpegEncContext *c, \ + uint8_t *s, uint8_t *dummy, \ + int stride, int h) \ +{ \ + int score = 0, x, y; \ + \ + for (y = 1; y < h; y++) { \ + for (x = 0; x < size; x += 4) { \ + score += SQ(s[x] - s[x + stride]) + \ + SQ(s[x + 1] - s[x + stride + 1]) + \ + SQ(s[x + 2] - s[x + stride + 2]) + \ + SQ(s[x + 3] - s[x + stride + 3]); \ + } \ + s += stride; \ + } \ + \ + return score; \ +} +VSSE_INTRA(8) +VSSE_INTRA(16) + +static int vsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, + int stride, int h) +{ + int score = 0, x, y; + + for (y = 1; y < h; y++) { + for (x = 0; x < 16; x++) + score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]); + s1 += stride; + s2 += stride; + } + + return score; +} + +#define WRAPPER8_16_SQ(name8, name16) \ +static int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src, \ + int stride, int h) \ +{ \ + int score = 0; \ + \ + score += name8(s, dst, src, stride, 8); \ + score += name8(s, dst + 8, src + 8, stride, 8); \ + if (h == 16) { \ + dst += 8 * stride; \ + src += 8 * stride; \ + score += name8(s, dst, src, stride, 8); \ + score += name8(s, dst + 8, src + 8, stride, 8); \ + } \ + return score; \ +} + +WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c) +WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c) +WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c) +#if CONFIG_GPL +WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c) +#endif +WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c) +WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c) +WRAPPER8_16_SQ(rd8x8_c, rd16_c) +WRAPPER8_16_SQ(bit8x8_c, bit16_c) + +av_cold void ff_me_cmp_init_static(void) +{ + int i; + + for (i = 0; i < 512; i++) + ff_square_tab[i] = (i - 256) * (i - 256); +} + +av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx) +{ + c->sum_abs_dctelem = sum_abs_dctelem_c; + + /* TODO [0] 16 [1] 8 */ + c->pix_abs[0][0] = pix_abs16_c; + c->pix_abs[0][1] = pix_abs16_x2_c; + c->pix_abs[0][2] = pix_abs16_y2_c; + c->pix_abs[0][3] = 
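The WRAPPER8_16_SQ macro above builds the 16-wide versions of the transform-based metrics out of four 8x8 calls, so only the 8x8 kernels need dedicated (and later SIMD) implementations. A generic form of that wrapper, dropping the MpegEncContext argument for brevity (cmp8x8_fn and cmp16() are illustrative names):

#include <stdint.h>

typedef int (*cmp8x8_fn)(const uint8_t *dst, const uint8_t *src,
                         int stride, int h);

/* Score a 16-wide block (16x8 or 16x16) as two or four 8x8 sub-scores. */
static int cmp16(cmp8x8_fn cmp8x8, const uint8_t *dst, const uint8_t *src,
                 int stride, int h)
{
    int score = cmp8x8(dst, src, stride, 8) +
                cmp8x8(dst + 8, src + 8, stride, 8);

    if (h == 16) {
        dst += 8 * stride;
        src += 8 * stride;
        score += cmp8x8(dst, src, stride, 8) +
                 cmp8x8(dst + 8, src + 8, stride, 8);
    }
    return score;
}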
pix_abs16_xy2_c; + c->pix_abs[1][0] = pix_abs8_c; + c->pix_abs[1][1] = pix_abs8_x2_c; + c->pix_abs[1][2] = pix_abs8_y2_c; + c->pix_abs[1][3] = pix_abs8_xy2_c; + +#define SET_CMP_FUNC(name) \ + c->name[0] = name ## 16_c; \ + c->name[1] = name ## 8x8_c; + + SET_CMP_FUNC(hadamard8_diff) + c->hadamard8_diff[4] = hadamard8_intra16_c; + c->hadamard8_diff[5] = hadamard8_intra8x8_c; + SET_CMP_FUNC(dct_sad) + SET_CMP_FUNC(dct_max) +#if CONFIG_GPL + SET_CMP_FUNC(dct264_sad) +#endif + c->sad[0] = pix_abs16_c; + c->sad[1] = pix_abs8_c; + c->sse[0] = sse16_c; + c->sse[1] = sse8_c; + c->sse[2] = sse4_c; + SET_CMP_FUNC(quant_psnr) + SET_CMP_FUNC(rd) + SET_CMP_FUNC(bit) + c->vsad[0] = vsad16_c; + c->vsad[4] = vsad_intra16_c; + c->vsad[5] = vsad_intra8_c; + c->vsse[0] = vsse16_c; + c->vsse[4] = vsse_intra16_c; + c->vsse[5] = vsse_intra8_c; + c->nsse[0] = nsse16_c; + c->nsse[1] = nsse8_c; + + if (ARCH_ARM) + ff_me_cmp_init_arm(c, avctx); + if (ARCH_PPC) + ff_me_cmp_init_ppc(c, avctx); + if (ARCH_X86) + ff_me_cmp_init_x86(c, avctx); +} diff --git a/libavcodec/me_cmp.h b/libavcodec/me_cmp.h new file mode 100644 index 0000000000..05ae30b0c1 --- /dev/null +++ b/libavcodec/me_cmp.h @@ -0,0 +1,73 @@ +/* + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ME_CMP_H +#define AVCODEC_ME_CMP_H + +#include + +#include "avcodec.h" + +extern uint32_t ff_square_tab[512]; + +struct MpegEncContext; +/* Motion estimation: + * h is limited to { width / 2, width, 2 * width }, + * but never larger than 16 and never smaller than 2. + * Although currently h < 4 is not used as functions with + * width < 8 are neither used nor implemented. 
*/ +typedef int (*me_cmp_func)(struct MpegEncContext *c, + uint8_t *blk1 /* align width (8 or 16) */, + uint8_t *blk2 /* align 1 */, int line_size, int h); + +typedef struct MECmpContext { + int (*sum_abs_dctelem)(int16_t *block /* align 16 */); + + me_cmp_func sad[6]; /* identical to pix_absAxA except additional void * */ + me_cmp_func sse[6]; + me_cmp_func hadamard8_diff[6]; + me_cmp_func dct_sad[6]; + me_cmp_func quant_psnr[6]; + me_cmp_func bit[6]; + me_cmp_func rd[6]; + me_cmp_func vsad[6]; + me_cmp_func vsse[6]; + me_cmp_func nsse[6]; + me_cmp_func dct_max[6]; + me_cmp_func dct264_sad[6]; + + me_cmp_func me_pre_cmp[6]; + me_cmp_func me_cmp[6]; + me_cmp_func me_sub_cmp[6]; + me_cmp_func mb_cmp[6]; + me_cmp_func ildct_cmp[6]; // only width 16 used + me_cmp_func frame_skip_cmp[6]; // only width 8 used + + me_cmp_func pix_abs[2][4]; +} MECmpContext; + +void ff_me_cmp_init_static(void); + +void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx); +void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx); +void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx); +void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx); + +void ff_set_cmp(MECmpContext *c, me_cmp_func *cmp, int type); + +#endif /* AVCODEC_ME_CMP_H */ diff --git a/libavcodec/motion_est.c b/libavcodec/motion_est.c index a8a93b73e8..dee28f05ab 100644 --- a/libavcodec/motion_est.c +++ b/libavcodec/motion_est.c @@ -317,10 +317,10 @@ int ff_init_me(MpegEncContext *s){ av_log(s->avctx, AV_LOG_INFO, "ME_MAP size may be a little small for the selected diamond size\n"); } - ff_set_cmp(&s->dsp, s->dsp.me_pre_cmp, c->avctx->me_pre_cmp); - ff_set_cmp(&s->dsp, s->dsp.me_cmp, c->avctx->me_cmp); - ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, c->avctx->me_sub_cmp); - ff_set_cmp(&s->dsp, s->dsp.mb_cmp, c->avctx->mb_cmp); + ff_set_cmp(&s->mecc, s->mecc.me_pre_cmp, c->avctx->me_pre_cmp); + ff_set_cmp(&s->mecc, s->mecc.me_cmp, c->avctx->me_cmp); + ff_set_cmp(&s->mecc, s->mecc.me_sub_cmp, c->avctx->me_sub_cmp); + ff_set_cmp(&s->mecc, s->mecc.mb_cmp, c->avctx->mb_cmp); c->flags = get_flags(c, 0, c->avctx->me_cmp &FF_CMP_CHROMA); c->sub_flags= get_flags(c, 0, c->avctx->me_sub_cmp&FF_CMP_CHROMA); @@ -361,12 +361,10 @@ int ff_init_me(MpegEncContext *s){ /* 8x8 fullpel search would need a 4x4 chroma compare, which we do * not have yet, and even if we had, the motion estimation code * does not expect it. */ - if((c->avctx->me_cmp&FF_CMP_CHROMA)/* && !s->dsp.me_cmp[2]*/){ - s->dsp.me_cmp[2]= zero_cmp; - } - if((c->avctx->me_sub_cmp&FF_CMP_CHROMA) && !s->dsp.me_sub_cmp[2]){ - s->dsp.me_sub_cmp[2]= zero_cmp; - } + if ((c->avctx->me_cmp & FF_CMP_CHROMA) /* && !s->mecc.me_cmp[2] */) + s->mecc.me_cmp[2] = zero_cmp; + if ((c->avctx->me_sub_cmp & FF_CMP_CHROMA) && !s->mecc.me_sub_cmp[2]) + s->mecc.me_sub_cmp[2] = zero_cmp; c->hpel_put[2][0]= c->hpel_put[2][1]= c->hpel_put[2][2]= c->hpel_put[2][3]= zero_hpel; @@ -379,7 +377,7 @@ int ff_init_me(MpegEncContext *s){ #define CHECK_SAD_HALF_MV(suffix, x, y) \ {\ - d= s->dsp.pix_abs[size][(x?1:0)+(y?2:0)](NULL, pix, ptr+((x)>>1), stride, h);\ + d = s->mecc.pix_abs[size][(x ? 1 : 0) + (y ? 
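The header above is essentially the whole public surface of the new unit: one context struct of function-pointer tables plus the init and selection entry points. A hedged usage sketch follows; it only builds inside the libavcodec tree, score_mb() is an illustrative name, and passing NULL as the MpegEncContext argument is fine for plain SAD/SSE, as the other hunks in this patch do the same:

#include <stdint.h>
#include "avcodec.h"
#include "me_cmp.h"

/* Score one block pair with whichever C or platform-optimized kernels
 * ff_me_cmp_init() selected for this build. */
static int score_mb(AVCodecContext *avctx, uint8_t *cur, uint8_t *ref,
                    int stride, int *sse_out)
{
    MECmpContext mecc;

    ff_me_cmp_init_static();   /* fills ff_square_tab used by the sse kernels */
    ff_me_cmp_init(&mecc, avctx);

    /* Index 0 selects the 16-wide kernel, index 1 the 8-wide one. */
    *sse_out = mecc.sse[1](NULL, cur, ref, stride, 8);
    return mecc.sad[0](NULL, cur, ref, stride, 16);
}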
2 : 0)](NULL, pix, ptr + ((x) >> 1), stride, h); \ d += (mv_penalty[pen_x + x] + mv_penalty[pen_y + y])*penalty_factor;\ COPY3_IF_LT(dminh, d, dx, x, dy, y)\ } @@ -615,7 +613,7 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift) dmin4= c->sub_motion_search(s, &mx4, &my4, dmin4, block, block, size, h); - if(s->dsp.me_sub_cmp[0] != s->dsp.mb_cmp[0]){ + if (s->mecc.me_sub_cmp[0] != s->mecc.mb_cmp[0]) { int dxy; const int offset= ((block&1) + (block>>1)*stride)*8; uint8_t *dest_y = c->scratchpad + offset; @@ -657,8 +655,11 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift) if(same) return INT_MAX; - if(s->dsp.me_sub_cmp[0] != s->dsp.mb_cmp[0]){ - dmin_sum += s->dsp.mb_cmp[0](s, s->new_picture.f->data[0] + s->mb_x*16 + s->mb_y*16*stride, c->scratchpad, stride, 16); + if (s->mecc.me_sub_cmp[0] != s->mecc.mb_cmp[0]) { + dmin_sum += s->mecc.mb_cmp[0](s, + s->new_picture.f->data[0] + + s->mb_x * 16 + s->mb_y * 16 * stride, + c->scratchpad, stride, 16); } if(c->avctx->mb_cmp&FF_CMP_CHROMA){ @@ -680,8 +681,8 @@ static inline int h263_mv4_search(MpegEncContext *s, int mx, int my, int shift) s->hdsp.put_pixels_tab [1][dxy](c->scratchpad + 8, s->last_picture.f->data[2] + offset, s->uvlinesize, 8); } - dmin_sum += s->dsp.mb_cmp[1](s, s->new_picture.f->data[1] + s->mb_x*8 + s->mb_y*8*s->uvlinesize, c->scratchpad , s->uvlinesize, 8); - dmin_sum += s->dsp.mb_cmp[1](s, s->new_picture.f->data[2] + s->mb_x*8 + s->mb_y*8*s->uvlinesize, c->scratchpad+8, s->uvlinesize, 8); + dmin_sum += s->mecc.mb_cmp[1](s, s->new_picture.f->data[1] + s->mb_x * 8 + s->mb_y * 8 * s->uvlinesize, c->scratchpad, s->uvlinesize, 8); + dmin_sum += s->mecc.mb_cmp[1](s, s->new_picture.f->data[2] + s->mb_x * 8 + s->mb_y * 8 * s->uvlinesize, c->scratchpad + 8, s->uvlinesize, 8); } c->pred_x= mx; @@ -777,7 +778,7 @@ static int interlaced_search(MpegEncContext *s, int ref_index, mv_table[xy][0]= mx_i; mv_table[xy][1]= my_i; - if(s->dsp.me_sub_cmp[0] != s->dsp.mb_cmp[0]){ + if (s->mecc.me_sub_cmp[0] != s->mecc.mb_cmp[0]) { int dxy; //FIXME chroma ME @@ -789,7 +790,7 @@ static int interlaced_search(MpegEncContext *s, int ref_index, }else{ s->hdsp.put_pixels_tab [size][dxy](c->scratchpad, ref , stride, h); } - dmin= s->dsp.mb_cmp[size](s, c->src[block][0], c->scratchpad, stride, h); + dmin = s->mecc.mb_cmp[size](s, c->src[block][0], c->scratchpad, stride, h); dmin+= (mv_penalty[mx_i-c->pred_x] + mv_penalty[my_i-c->pred_y] + 1)*c->mb_penalty_factor; }else dmin+= c->mb_penalty_factor; //field_select bits @@ -940,7 +941,7 @@ void ff_estimate_p_frame_motion(MpegEncContext * s, /* At this point (mx,my) are full-pell and the relative displacement */ ppix = c->ref[0][0] + (my * s->linesize) + mx; - vard = s->dsp.sse[0](NULL, pix, ppix, s->linesize, 16); + vard = s->mecc.sse[0](NULL, pix, ppix, s->linesize, 16); pic->mc_mb_var[s->mb_stride * mb_y + mb_x] = (vard+128)>>8; c->mc_mb_var_sum_temp += (vard+128)>>8; @@ -1037,7 +1038,7 @@ void ff_estimate_p_frame_motion(MpegEncContext * s, *(uint32_t*)(&c->scratchpad[i*s->linesize+12]) = mean; } - intra_score= s->dsp.mb_cmp[0](s, c->scratchpad, pix, s->linesize, 16); + intra_score= s->mecc.mb_cmp[0](s, c->scratchpad, pix, s->linesize, 16); } intra_score += c->mb_penalty_factor*16; @@ -1237,7 +1238,7 @@ static inline int check_bidir_mv(MpegEncContext * s, fbmin = (mv_penalty_f[motion_fx-pred_fx] + mv_penalty_f[motion_fy-pred_fy])*c->mb_penalty_factor +(mv_penalty_b[motion_bx-pred_bx] + mv_penalty_b[motion_by-pred_by])*c->mb_penalty_factor - + 
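CHECK_SAD_HALF_MV above picks the right half-pel SAD out of pix_abs[size][] by encoding the sub-pixel offset in two bits. The mapping, spelled out (hpel_index() is just an illustrative restatement):

/* 0: integer position   1: horizontal half-pel
 * 2: vertical half-pel  3: diagonal half-pel */
static int hpel_index(int dx, int dy) /* 0 for integer, nonzero (±1 in the caller) for half-pel */
{
    return (dx ? 1 : 0) + (dy ? 2 : 0);
}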
s->dsp.mb_cmp[size](s, src_data[0], dest_y, stride, h); //FIXME new_pic + + s->mecc.mb_cmp[size](s, src_data[0], dest_y, stride, h); // FIXME new_pic if(c->avctx->mb_cmp&FF_CMP_CHROMA){ } diff --git a/libavcodec/motion_est_template.c b/libavcodec/motion_est_template.c index 575dadd269..01936c6a83 100644 --- a/libavcodec/motion_est_template.c +++ b/libavcodec/motion_est_template.c @@ -63,8 +63,8 @@ static int hpel_motion_search(MpegEncContext * s, //FIXME factorize - cmp_sub= s->dsp.me_sub_cmp[size]; - chroma_cmp_sub= s->dsp.me_sub_cmp[size+1]; + cmp_sub = s->mecc.me_sub_cmp[size]; + chroma_cmp_sub = s->mecc.me_sub_cmp[size + 1]; if(c->skip){ //FIXME move out of hpel? *mx_ptr = 0; @@ -166,7 +166,6 @@ static inline int get_mb_score(MpegEncContext *s, int mx, int my, int src_index, int ref_index, int size, int h, int add_rate) { -// const int check_luma= s->dsp.me_sub_cmp != s->dsp.mb_cmp; MotionEstContext * const c= &s->me; const int penalty_factor= c->mb_penalty_factor; const int flags= c->mb_flags; @@ -179,8 +178,8 @@ static inline int get_mb_score(MpegEncContext *s, int mx, int my, //FIXME factorize - cmp_sub= s->dsp.mb_cmp[size]; - chroma_cmp_sub= s->dsp.mb_cmp[size+1]; + cmp_sub = s->mecc.mb_cmp[size]; + chroma_cmp_sub = s->mecc.mb_cmp[size + 1]; // assert(!c->skip); // assert(c->avctx->me_sub_cmp != c->avctx->mb_cmp); @@ -226,12 +225,12 @@ static int qpel_motion_search(MpegEncContext * s, LOAD_COMMON int flags= c->sub_flags; - cmpf= s->dsp.me_cmp[size]; - chroma_cmpf= s->dsp.me_cmp[size+1]; //factorize FIXME + cmpf = s->mecc.me_cmp[size]; + chroma_cmpf = s->mecc.me_cmp[size + 1]; // FIXME: factorize //FIXME factorize - cmp_sub= s->dsp.me_sub_cmp[size]; - chroma_cmp_sub= s->dsp.me_sub_cmp[size+1]; + cmp_sub = s->mecc.me_sub_cmp[size]; + chroma_cmp_sub = s->mecc.me_sub_cmp[size + 1]; if(c->skip){ //FIXME somehow move up (benchmark) *mx_ptr = 0; @@ -427,8 +426,8 @@ static av_always_inline int small_diamond_search(MpegEncContext * s, int *best, LOAD_COMMON2 unsigned map_generation = c->map_generation; - cmpf= s->dsp.me_cmp[size]; - chroma_cmpf= s->dsp.me_cmp[size+1]; + cmpf = s->mecc.me_cmp[size]; + chroma_cmpf = s->mecc.me_cmp[size + 1]; { /* ensure that the best point is in the MAP as h/qpel refinement needs it */ const unsigned key = (best[1]<map_generation; - cmpf= s->dsp.me_cmp[size]; - chroma_cmpf= s->dsp.me_cmp[size+1]; + cmpf = s->mecc.me_cmp[size]; + chroma_cmpf = s->mecc.me_cmp[size + 1]; for(dia_size=1; dia_size<=4; dia_size++){ int dir; @@ -511,8 +510,8 @@ static int hex_search(MpegEncContext * s, int *best, int dmin, int x,y,d; const int dec= dia_size & (dia_size-1); - cmpf= s->dsp.me_cmp[size]; - chroma_cmpf= s->dsp.me_cmp[size+1]; + cmpf = s->mecc.me_cmp[size]; + chroma_cmpf = s->mecc.me_cmp[size + 1]; for(;dia_size; dia_size= dec ? dia_size-1 : dia_size>>1){ do{ @@ -548,8 +547,8 @@ static int l2s_dia_search(MpegEncContext * s, int *best, int dmin, static const int hex[8][2]={{-2, 0}, {-1,-1}, { 0,-2}, { 1,-1}, { 2, 0}, { 1, 1}, { 0, 2}, {-1, 1}}; - cmpf= s->dsp.me_cmp[size]; - chroma_cmpf= s->dsp.me_cmp[size+1]; + cmpf = s->mecc.me_cmp[size]; + chroma_cmpf = s->mecc.me_cmp[size + 1]; for(; dia_size; dia_size= dec ? 
dia_size-1 : dia_size>>1){ do{ @@ -587,8 +586,8 @@ static int umh_search(MpegEncContext * s, int *best, int dmin, {-2, 3}, { 0, 4}, { 2, 3}, {-2,-3}, { 0,-4}, { 2,-3},}; - cmpf= s->dsp.me_cmp[size]; - chroma_cmpf= s->dsp.me_cmp[size+1]; + cmpf = s->mecc.me_cmp[size]; + chroma_cmpf = s->mecc.me_cmp[size + 1]; x= best[0]; y= best[1]; @@ -630,8 +629,8 @@ static int full_search(MpegEncContext * s, int *best, int dmin, int x,y, d; const int dia_size= c->dia_size&0xFF; - cmpf= s->dsp.me_cmp[size]; - chroma_cmpf= s->dsp.me_cmp[size+1]; + cmpf = s->mecc.me_cmp[size]; + chroma_cmpf = s->mecc.me_cmp[size + 1]; for(y=FFMAX(-dia_size, ymin); y<=FFMIN(dia_size,ymax); y++){ for(x=FFMAX(-dia_size, xmin); x<=FFMIN(dia_size,xmax); x++){ @@ -694,8 +693,8 @@ static int sab_diamond_search(MpegEncContext * s, int *best, int dmin, LOAD_COMMON2 unsigned map_generation = c->map_generation; - cmpf= s->dsp.me_cmp[size]; - chroma_cmpf= s->dsp.me_cmp[size+1]; + cmpf = s->mecc.me_cmp[size]; + chroma_cmpf = s->mecc.me_cmp[size + 1]; /*Note jmap_generation; - cmpf= s->dsp.me_cmp[size]; - chroma_cmpf= s->dsp.me_cmp[size+1]; + cmpf = s->mecc.me_cmp[size]; + chroma_cmpf = s->mecc.me_cmp[size + 1]; for(dia_size=1; dia_size<=c->dia_size; dia_size++){ int dir, start, end; @@ -880,12 +879,12 @@ static av_always_inline int epzs_motion_search_internal(MpegEncContext * s, int if(c->pre_pass){ penalty_factor= c->pre_penalty_factor; - cmpf= s->dsp.me_pre_cmp[size]; - chroma_cmpf= s->dsp.me_pre_cmp[size+1]; + cmpf = s->mecc.me_pre_cmp[size]; + chroma_cmpf = s->mecc.me_pre_cmp[size + 1]; }else{ penalty_factor= c->penalty_factor; - cmpf= s->dsp.me_cmp[size]; - chroma_cmpf= s->dsp.me_cmp[size+1]; + cmpf = s->mecc.me_cmp[size]; + chroma_cmpf = s->mecc.me_cmp[size + 1]; } map_generation= update_map_generation(c); @@ -1009,8 +1008,8 @@ static int epzs_motion_search4(MpegEncContext * s, int flags= c->flags; LOAD_COMMON2 - cmpf= s->dsp.me_cmp[size]; - chroma_cmpf= s->dsp.me_cmp[size+1]; + cmpf = s->mecc.me_cmp[size]; + chroma_cmpf = s->mecc.me_cmp[size + 1]; map_generation= update_map_generation(c); @@ -1068,8 +1067,8 @@ static int epzs_motion_search2(MpegEncContext * s, int flags= c->flags; LOAD_COMMON2 - cmpf= s->dsp.me_cmp[size]; - chroma_cmpf= s->dsp.me_cmp[size+1]; + cmpf = s->mecc.me_cmp[size]; + chroma_cmpf = s->mecc.me_cmp[size + 1]; map_generation= update_map_generation(c); diff --git a/libavcodec/mpeg4videoenc.c b/libavcodec/mpeg4videoenc.c index f120932443..6b87ec7edb 100644 --- a/libavcodec/mpeg4videoenc.c +++ b/libavcodec/mpeg4videoenc.c @@ -689,7 +689,7 @@ void ff_mpeg4_encode_mb(MpegEncContext *s, int16_t block[6][64], b_pic = pic->f->data[0] + offset; if (!pic->shared) b_pic += INPLACE_OFFSET; - diff = s->dsp.sad[0](NULL, p_pic, b_pic, s->linesize, 16); + diff = s->mecc.sad[0](NULL, p_pic, b_pic, s->linesize, 16); if (diff > s->qscale * 70) { // FIXME check that 70 is optimal s->mb_skipped = 0; break; diff --git a/libavcodec/mpegvideo.c b/libavcodec/mpegvideo.c index a4a37d4931..e1daa2aee0 100644 --- a/libavcodec/mpegvideo.c +++ b/libavcodec/mpegvideo.c @@ -378,9 +378,9 @@ static void mpeg_er_decode_mb(void *opaque, int ref, int mv_dir, int mv_type, av_cold int ff_dct_common_init(MpegEncContext *s) { ff_blockdsp_init(&s->bdsp, s->avctx); - ff_dsputil_init(&s->dsp, s->avctx); ff_hpeldsp_init(&s->hdsp, s->avctx->flags); ff_idctdsp_init(&s->idsp, s->avctx); + ff_me_cmp_init(&s->mecc, s->avctx); ff_mpegvideodsp_init(&s->mdsp); ff_videodsp_init(&s->vdsp, s->avctx->bits_per_raw_sample); @@ -1051,7 +1051,7 @@ static int 
init_er(MpegEncContext *s) int i; er->avctx = s->avctx; - er->dsp = &s->dsp; + er->mecc = &s->mecc; er->mb_index2xy = s->mb_index2xy; er->mb_num = s->mb_num; diff --git a/libavcodec/mpegvideo.h b/libavcodec/mpegvideo.h index b498d50168..7dd4228c95 100644 --- a/libavcodec/mpegvideo.h +++ b/libavcodec/mpegvideo.h @@ -30,13 +30,13 @@ #include "avcodec.h" #include "blockdsp.h" -#include "dsputil.h" #include "error_resilience.h" #include "fdctdsp.h" #include "get_bits.h" #include "h263dsp.h" #include "hpeldsp.h" #include "idctdsp.h" +#include "me_cmp.h" #include "mpegvideodsp.h" #include "mpegvideoencdsp.h" #include "pixblockdsp.h" @@ -356,10 +356,10 @@ typedef struct MpegEncContext { int h263_long_vectors; ///< use horrible h263v1 long vector mode BlockDSPContext bdsp; - DSPContext dsp; ///< pointers for accelerated dsp functions FDCTDSPContext fdsp; HpelDSPContext hdsp; IDCTDSPContext idsp; + MECmpContext mecc; MpegVideoDSPContext mdsp; MpegvideoEncDSPContext mpvencdsp; PixblockDSPContext pdsp; diff --git a/libavcodec/mpegvideo_enc.c b/libavcodec/mpegvideo_enc.c index e2504c7b72..f0bf73e210 100644 --- a/libavcodec/mpegvideo_enc.c +++ b/libavcodec/mpegvideo_enc.c @@ -702,6 +702,7 @@ av_cold int ff_MPV_encode_init(AVCodecContext *avctx) ff_MPV_encode_init_x86(s); ff_fdctdsp_init(&s->fdsp, avctx); + ff_me_cmp_init(&s->mecc, avctx); ff_mpegvideoencdsp_init(&s->mpvencdsp, avctx); ff_pixblockdsp_init(&s->pdsp, avctx); ff_qpeldsp_init(&s->qdsp); @@ -744,8 +745,8 @@ av_cold int ff_MPV_encode_init(AVCodecContext *avctx) s->quant_precision = 5; - ff_set_cmp(&s->dsp, s->dsp.ildct_cmp, s->avctx->ildct_cmp); - ff_set_cmp(&s->dsp, s->dsp.frame_skip_cmp, s->avctx->frame_skip_cmp); + ff_set_cmp(&s->mecc, s->mecc.ildct_cmp, s->avctx->ildct_cmp); + ff_set_cmp(&s->mecc, s->mecc.frame_skip_cmp, s->avctx->frame_skip_cmp); if (CONFIG_H261_ENCODER && s->out_format == FMT_H261) ff_h261_encode_init(s); @@ -895,8 +896,8 @@ static int get_intra_count(MpegEncContext *s, uint8_t *src, for (y = 0; y < h; y += 16) { for (x = 0; x < w; x += 16) { int offset = x + y * stride; - int sad = s->dsp.sad[0](NULL, src + offset, ref + offset, stride, - 16); + int sad = s->mecc.sad[0](NULL, src + offset, ref + offset, + stride, 16); int mean = (s->mpvencdsp.pix_sum(src + offset, stride) + 128) >> 8; int sae = get_sae(src + offset, mean, stride); @@ -1053,7 +1054,7 @@ static int skip_check(MpegEncContext *s, Picture *p, Picture *ref) int off = p->shared ? 
0 : 16; uint8_t *dptr = p->f->data[plane] + 8 * (x + y * stride) + off; uint8_t *rptr = ref->f->data[plane] + 8 * (x + y * stride); - int v = s->dsp.frame_skip_cmp[1](s, dptr, rptr, stride, 8); + int v = s->mecc.frame_skip_cmp[1](s, dptr, rptr, stride, 8); switch (s->avctx->frame_skip_exp) { case 0: score = FFMAX(score, v); break; @@ -1923,16 +1924,15 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s, int progressive_score, interlaced_score; s->interlaced_dct = 0; - progressive_score = s->dsp.ildct_cmp[4](s, ptr_y, - NULL, wrap_y, 8) + - s->dsp.ildct_cmp[4](s, ptr_y + wrap_y * 8, - NULL, wrap_y, 8) - 400; + progressive_score = s->mecc.ildct_cmp[4](s, ptr_y, NULL, wrap_y, 8) + + s->mecc.ildct_cmp[4](s, ptr_y + wrap_y * 8, + NULL, wrap_y, 8) - 400; if (progressive_score > 0) { - interlaced_score = s->dsp.ildct_cmp[4](s, ptr_y, - NULL, wrap_y * 2, 8) + - s->dsp.ildct_cmp[4](s, ptr_y + wrap_y, - NULL, wrap_y * 2, 8); + interlaced_score = s->mecc.ildct_cmp[4](s, ptr_y, + NULL, wrap_y * 2, 8) + + s->mecc.ildct_cmp[4](s, ptr_y + wrap_y, + NULL, wrap_y * 2, 8); if (progressive_score > interlaced_score) { s->interlaced_dct = 1; @@ -1996,23 +1996,20 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s, int progressive_score, interlaced_score; s->interlaced_dct = 0; - progressive_score = s->dsp.ildct_cmp[0](s, dest_y, - ptr_y, wrap_y, - 8) + - s->dsp.ildct_cmp[0](s, dest_y + wrap_y * 8, - ptr_y + wrap_y * 8, wrap_y, - 8) - 400; + progressive_score = s->mecc.ildct_cmp[0](s, dest_y, ptr_y, wrap_y, 8) + + s->mecc.ildct_cmp[0](s, dest_y + wrap_y * 8, + ptr_y + wrap_y * 8, + wrap_y, 8) - 400; if (s->avctx->ildct_cmp == FF_CMP_VSSE) progressive_score -= 400; if (progressive_score > 0) { - interlaced_score = s->dsp.ildct_cmp[0](s, dest_y, - ptr_y, - wrap_y * 2, 8) + - s->dsp.ildct_cmp[0](s, dest_y + wrap_y, - ptr_y + wrap_y, - wrap_y * 2, 8); + interlaced_score = s->mecc.ildct_cmp[0](s, dest_y, ptr_y, + wrap_y * 2, 8) + + s->mecc.ildct_cmp[0](s, dest_y + wrap_y, + ptr_y + wrap_y, + wrap_y * 2, 8); if (progressive_score > interlaced_score) { s->interlaced_dct = 1; @@ -2049,33 +2046,28 @@ static av_always_inline void encode_mb_internal(MpegEncContext *s, if (s->current_picture.mc_mb_var[s->mb_stride * mb_y + mb_x] < 2 * s->qscale * s->qscale) { // FIXME optimize - if (s->dsp.sad[1](NULL, ptr_y , dest_y, - wrap_y, 8) < 20 * s->qscale) + if (s->mecc.sad[1](NULL, ptr_y, dest_y, wrap_y, 8) < 20 * s->qscale) skip_dct[0] = 1; - if (s->dsp.sad[1](NULL, ptr_y + 8, - dest_y + 8, wrap_y, 8) < 20 * s->qscale) + if (s->mecc.sad[1](NULL, ptr_y + 8, dest_y + 8, wrap_y, 8) < 20 * s->qscale) skip_dct[1] = 1; - if (s->dsp.sad[1](NULL, ptr_y + dct_offset, - dest_y + dct_offset, wrap_y, 8) < 20 * s->qscale) + if (s->mecc.sad[1](NULL, ptr_y + dct_offset, dest_y + dct_offset, + wrap_y, 8) < 20 * s->qscale) skip_dct[2] = 1; - if (s->dsp.sad[1](NULL, ptr_y + dct_offset + 8, - dest_y + dct_offset + 8, - wrap_y, 8) < 20 * s->qscale) + if (s->mecc.sad[1](NULL, ptr_y + dct_offset + 8, dest_y + dct_offset + 8, + wrap_y, 8) < 20 * s->qscale) skip_dct[3] = 1; - if (s->dsp.sad[1](NULL, ptr_cb, dest_cb, - wrap_c, 8) < 20 * s->qscale) + if (s->mecc.sad[1](NULL, ptr_cb, dest_cb, wrap_c, 8) < 20 * s->qscale) skip_dct[4] = 1; - if (s->dsp.sad[1](NULL, ptr_cr, dest_cr, - wrap_c, 8) < 20 * s->qscale) + if (s->mecc.sad[1](NULL, ptr_cr, dest_cr, wrap_c, 8) < 20 * s->qscale) skip_dct[5] = 1; if (!s->chroma_y_shift) { /* 422 */ - if (s->dsp.sad[1](NULL, ptr_cb + (dct_offset >> 1), - dest_cb + (dct_offset >> 1), - 
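The two hunks above are the interlaced-DCT decision: the macroblock is scored once with frame scanning (adjacent lines) and once with field scanning (every other line), using whichever metric avctx->ildct_cmp selected (ildct_cmp[4] for intra, ildct_cmp[0] for inter). The sketch below uses a plain vertical SAD as that metric and omits the extra VSSE bias; vsad() and use_interlaced_dct() are illustrative names:

#include <stdint.h>
#include <stdlib.h>

/* Sum of absolute differences between vertically adjacent lines. */
static int vsad(const uint8_t *p, int stride, int w, int h)
{
    int score = 0;
    for (int y = 1; y < h; y++)
        for (int x = 0; x < w; x++)
            score += abs(p[x + y * stride] - p[x + (y - 1) * stride]);
    return score;
}

/* Frame scan compares adjacent lines; field scan compares lines two apart.
 * The -400 bias favours frame DCT when the two scores are close. */
static int use_interlaced_dct(const uint8_t *mb, int wrap_y)
{
    int progressive = vsad(mb, wrap_y, 16, 8) +
                      vsad(mb + wrap_y * 8, wrap_y, 16, 8) - 400;
    int interlaced;

    if (progressive <= 0)
        return 0;
    interlaced = vsad(mb, wrap_y * 2, 16, 8) +          /* top field    */
                 vsad(mb + wrap_y, wrap_y * 2, 16, 8);  /* bottom field */
    return progressive > interlaced;
}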
wrap_c, 8) < 20 * s->qscale) + if (s->mecc.sad[1](NULL, ptr_cb + (dct_offset >> 1), + dest_cb + (dct_offset >> 1), + wrap_c, 8) < 20 * s->qscale) skip_dct[6] = 1; - if (s->dsp.sad[1](NULL, ptr_cr + (dct_offset >> 1), - dest_cr + (dct_offset >> 1), - wrap_c, 8) < 20 * s->qscale) + if (s->mecc.sad[1](NULL, ptr_cr + (dct_offset >> 1), + dest_cr + (dct_offset >> 1), + wrap_c, 8) < 20 * s->qscale) skip_dct[7] = 1; } } @@ -2340,9 +2332,9 @@ static int sse(MpegEncContext *s, uint8_t *src1, uint8_t *src2, int w, int h, in int x,y; if(w==16 && h==16) - return s->dsp.sse[0](NULL, src1, src2, stride, 16); + return s->mecc.sse[0](NULL, src1, src2, stride, 16); else if(w==8 && h==8) - return s->dsp.sse[1](NULL, src1, src2, stride, 8); + return s->mecc.sse[1](NULL, src1, src2, stride, 8); for(y=0; yavctx->mb_cmp == FF_CMP_NSSE){ - return s->dsp.nsse[0](s, s->new_picture.f->data[0] + s->mb_x*16 + s->mb_y*s->linesize*16, s->dest[0], s->linesize, 16) - +s->dsp.nsse[1](s, s->new_picture.f->data[1] + s->mb_x*8 + s->mb_y*s->uvlinesize*8,s->dest[1], s->uvlinesize, 8) - +s->dsp.nsse[1](s, s->new_picture.f->data[2] + s->mb_x*8 + s->mb_y*s->uvlinesize*8,s->dest[2], s->uvlinesize, 8); + return s->mecc.nsse[0](s, s->new_picture.f->data[0] + s->mb_x * 16 + s->mb_y * s->linesize * 16, s->dest[0], s->linesize, 16) + + s->mecc.nsse[1](s, s->new_picture.f->data[1] + s->mb_x * 8 + s->mb_y * s->uvlinesize * 8, s->dest[1], s->uvlinesize, 8) + + s->mecc.nsse[1](s, s->new_picture.f->data[2] + s->mb_x * 8 + s->mb_y * s->uvlinesize * 8, s->dest[2], s->uvlinesize, 8); }else{ - return s->dsp.sse[0](NULL, s->new_picture.f->data[0] + s->mb_x*16 + s->mb_y*s->linesize*16, s->dest[0], s->linesize, 16) - +s->dsp.sse[1](NULL, s->new_picture.f->data[1] + s->mb_x*8 + s->mb_y*s->uvlinesize*8,s->dest[1], s->uvlinesize, 8) - +s->dsp.sse[1](NULL, s->new_picture.f->data[2] + s->mb_x*8 + s->mb_y*s->uvlinesize*8,s->dest[2], s->uvlinesize, 8); + return s->mecc.sse[0](NULL, s->new_picture.f->data[0] + s->mb_x * 16 + s->mb_y * s->linesize * 16, s->dest[0], s->linesize, 16) + + s->mecc.sse[1](NULL, s->new_picture.f->data[1] + s->mb_x * 8 + s->mb_y * s->uvlinesize * 8, s->dest[1], s->uvlinesize, 8) + + s->mecc.sse[1](NULL, s->new_picture.f->data[2] + s->mb_x * 8 + s->mb_y * s->uvlinesize * 8, s->dest[2], s->uvlinesize, 8); } else return sse(s, s->new_picture.f->data[0] + s->mb_x*16 + s->mb_y*s->linesize*16, s->dest[0], w, h, s->linesize) diff --git a/libavcodec/mpegvideoencdsp.c b/libavcodec/mpegvideoencdsp.c index 8202034643..109bbe5da8 100644 --- a/libavcodec/mpegvideoencdsp.c +++ b/libavcodec/mpegvideoencdsp.c @@ -24,8 +24,8 @@ #include "libavutil/attributes.h" #include "libavutil/imgutils.h" #include "avcodec.h" -#include "dsputil.h" #include "imgconvert.h" +#include "me_cmp.h" #include "mpegvideoencdsp.h" static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], diff --git a/libavcodec/ppc/Makefile b/libavcodec/ppc/Makefile index eb9623b750..a75d9bf9a0 100644 --- a/libavcodec/ppc/Makefile +++ b/libavcodec/ppc/Makefile @@ -2,7 +2,6 @@ OBJS += ppc/fmtconvert_altivec.o \ OBJS-$(CONFIG_AUDIODSP) += ppc/audiodsp.o OBJS-$(CONFIG_BLOCKDSP) += ppc/blockdsp.o -OBJS-$(CONFIG_DSPUTIL) += ppc/dsputil_altivec.o OBJS-$(CONFIG_FFT) += ppc/fft_altivec.o OBJS-$(CONFIG_H264CHROMA) += ppc/h264chroma_init.o OBJS-$(CONFIG_H264DSP) += ppc/h264dsp.o @@ -11,6 +10,7 @@ OBJS-$(CONFIG_HPELDSP) += ppc/hpeldsp_altivec.o OBJS-$(CONFIG_HUFFYUVDSP) += ppc/huffyuvdsp_altivec.o OBJS-$(CONFIG_FDCTDSP) += ppc/fdctdsp.o OBJS-$(CONFIG_IDCTDSP) += ppc/idctdsp.o 
+OBJS-$(CONFIG_ME_CMP) += ppc/me_cmp.o OBJS-$(CONFIG_MPEGAUDIODSP) += ppc/mpegaudiodsp_altivec.o OBJS-$(CONFIG_MPEGVIDEO) += ppc/mpegvideo_altivec.o \ ppc/mpegvideodsp.o diff --git a/libavcodec/ppc/dsputil_altivec.c b/libavcodec/ppc/dsputil_altivec.c deleted file mode 100644 index 33f9b5ae13..0000000000 --- a/libavcodec/ppc/dsputil_altivec.c +++ /dev/null @@ -1,767 +0,0 @@ -/* - * Copyright (c) 2002 Brian Foley - * Copyright (c) 2002 Dieter Shirley - * Copyright (c) 2003-2004 Romain Dolbeau - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#if HAVE_ALTIVEC_H -#include -#endif - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/ppc/cpu.h" -#include "libavutil/ppc/types_altivec.h" -#include "libavutil/ppc/util_altivec.h" -#include "libavcodec/avcodec.h" -#include "libavcodec/dsputil.h" -#include "libavcodec/mpegvideo.h" - -#if HAVE_ALTIVEC -static int sad16_x2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, - int line_size, int h) -{ - int i, s = 0; - const vector unsigned char zero = - (const vector unsigned char) vec_splat_u8(0); - vector unsigned char perm1 = vec_lvsl(0, pix2); - vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1)); - vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); - vector signed int sumdiffs; - - for (i = 0; i < h; i++) { - /* Read unaligned pixels into our vectors. The vectors are as follows: - * pix1v: pix1[0] - pix1[15] - * pix2v: pix2[0] - pix2[15] pix2iv: pix2[1] - pix2[16] */ - vector unsigned char pix1v = vec_ld(0, pix1); - vector unsigned char pix2l = vec_ld(0, pix2); - vector unsigned char pix2r = vec_ld(16, pix2); - vector unsigned char pix2v = vec_perm(pix2l, pix2r, perm1); - vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2); - - /* Calculate the average vector. */ - vector unsigned char avgv = vec_avg(pix2v, pix2iv); - - /* Calculate a sum of abs differences vector. */ - vector unsigned char t5 = vec_sub(vec_max(pix1v, avgv), - vec_min(pix1v, avgv)); - - /* Add each 4 pixel group together and put 4 results into sad. */ - sad = vec_sum4s(t5, sad); - - pix1 += line_size; - pix2 += line_size; - } - /* Sum up the four partial sums, and put the result into s. 
*/ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); - sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, &s); - - return s; -} - -static int sad16_y2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, - int line_size, int h) -{ - int i, s = 0; - const vector unsigned char zero = - (const vector unsigned char) vec_splat_u8(0); - vector unsigned char perm = vec_lvsl(0, pix2); - vector unsigned char pix1v, pix3v, avgv, t5; - vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); - vector signed int sumdiffs; - uint8_t *pix3 = pix2 + line_size; - - /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one - * iteration becomes pix2 in the next iteration. We can use this - * fact to avoid a potentially expensive unaligned read, each - * time around the loop. - * Read unaligned pixels into our vectors. The vectors are as follows: - * pix2v: pix2[0] - pix2[15] - * Split the pixel vectors into shorts. */ - vector unsigned char pix2l = vec_ld(0, pix2); - vector unsigned char pix2r = vec_ld(15, pix2); - vector unsigned char pix2v = vec_perm(pix2l, pix2r, perm); - - for (i = 0; i < h; i++) { - /* Read unaligned pixels into our vectors. The vectors are as follows: - * pix1v: pix1[0] - pix1[15] - * pix3v: pix3[0] - pix3[15] */ - pix1v = vec_ld(0, pix1); - - pix2l = vec_ld(0, pix3); - pix2r = vec_ld(15, pix3); - pix3v = vec_perm(pix2l, pix2r, perm); - - /* Calculate the average vector. */ - avgv = vec_avg(pix2v, pix3v); - - /* Calculate a sum of abs differences vector. */ - t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); - - /* Add each 4 pixel group together and put 4 results into sad. */ - sad = vec_sum4s(t5, sad); - - pix1 += line_size; - pix2v = pix3v; - pix3 += line_size; - } - - /* Sum up the four partial sums, and put the result into s. */ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); - sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, &s); - return s; -} - -static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, - int line_size, int h) -{ - int i, s = 0; - uint8_t *pix3 = pix2 + line_size; - const vector unsigned char zero = - (const vector unsigned char) vec_splat_u8(0); - const vector unsigned short two = - (const vector unsigned short) vec_splat_u16(2); - vector unsigned char avgv, t5; - vector unsigned char perm1 = vec_lvsl(0, pix2); - vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1)); - vector unsigned char pix1v, pix3v, pix3iv; - vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv; - vector unsigned short avghv, avglv; - vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); - vector signed int sumdiffs; - - /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one - * iteration becomes pix2 in the next iteration. We can use this - * fact to avoid a potentially expensive unaligned read, as well - * as some splitting, and vector addition each time around the loop. - * Read unaligned pixels into our vectors. The vectors are as follows: - * pix2v: pix2[0] - pix2[15] pix2iv: pix2[1] - pix2[16] - * Split the pixel vectors into shorts. 
*/ - vector unsigned char pix2l = vec_ld(0, pix2); - vector unsigned char pix2r = vec_ld(16, pix2); - vector unsigned char pix2v = vec_perm(pix2l, pix2r, perm1); - vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2); - - vector unsigned short pix2hv = - (vector unsigned short) vec_mergeh(zero, pix2v); - vector unsigned short pix2lv = - (vector unsigned short) vec_mergel(zero, pix2v); - vector unsigned short pix2ihv = - (vector unsigned short) vec_mergeh(zero, pix2iv); - vector unsigned short pix2ilv = - (vector unsigned short) vec_mergel(zero, pix2iv); - vector unsigned short t1 = vec_add(pix2hv, pix2ihv); - vector unsigned short t2 = vec_add(pix2lv, pix2ilv); - vector unsigned short t3, t4; - - for (i = 0; i < h; i++) { - /* Read unaligned pixels into our vectors. The vectors are as follows: - * pix1v: pix1[0] - pix1[15] - * pix3v: pix3[0] - pix3[15] pix3iv: pix3[1] - pix3[16] */ - pix1v = vec_ld(0, pix1); - - pix2l = vec_ld(0, pix3); - pix2r = vec_ld(16, pix3); - pix3v = vec_perm(pix2l, pix2r, perm1); - pix3iv = vec_perm(pix2l, pix2r, perm2); - - /* Note that AltiVec does have vec_avg, but this works on vector pairs - * and rounds up. We could do avg(avg(a, b), avg(c, d)), but the - * rounding would mean that, for example, avg(3, 0, 0, 1) = 2, when - * it should be 1. Instead, we have to split the pixel vectors into - * vectors of shorts and do the averaging by hand. */ - - /* Split the pixel vectors into shorts. */ - pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v); - pix3lv = (vector unsigned short) vec_mergel(zero, pix3v); - pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv); - pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv); - - /* Do the averaging on them. */ - t3 = vec_add(pix3hv, pix3ihv); - t4 = vec_add(pix3lv, pix3ilv); - - avghv = vec_sr(vec_add(vec_add(t1, t3), two), two); - avglv = vec_sr(vec_add(vec_add(t2, t4), two), two); - - /* Pack the shorts back into a result. */ - avgv = vec_pack(avghv, avglv); - - /* Calculate a sum of abs differences vector. */ - t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); - - /* Add each 4 pixel group together and put 4 results into sad. */ - sad = vec_sum4s(t5, sad); - - pix1 += line_size; - pix3 += line_size; - /* Transfer the calculated values for pix3 into pix2. */ - t1 = t3; - t2 = t4; - } - /* Sum up the four partial sums, and put the result into s. */ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); - sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, &s); - - return s; -} - -static int sad16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, - int line_size, int h) -{ - int i, s; - const vector unsigned int zero = - (const vector unsigned int) vec_splat_u32(0); - vector unsigned char perm = vec_lvsl(0, pix2); - vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); - vector signed int sumdiffs; - - for (i = 0; i < h; i++) { - /* Read potentially unaligned pixels into t1 and t2. */ - vector unsigned char pix2l = vec_ld(0, pix2); - vector unsigned char pix2r = vec_ld(15, pix2); - vector unsigned char t1 = vec_ld(0, pix1); - vector unsigned char t2 = vec_perm(pix2l, pix2r, perm); - - /* Calculate a sum of abs differences vector. */ - vector unsigned char t3 = vec_max(t1, t2); - vector unsigned char t4 = vec_min(t1, t2); - vector unsigned char t5 = vec_sub(t3, t4); - - /* Add each 4 pixel group together and put 4 results into sad. 
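The AltiVec SAD/SSE kernels in the deleted file rely on the identity |a - b| = max(a, b) - min(a, b) for unsigned bytes, which vec_max/vec_min/vec_sub apply sixteen lanes at a time before vec_sum4s/vec_sums reduce the partial sums. The scalar form of that identity, for reference (absdiff_u8() is an illustrative name):

#include <stdint.h>
#include <stdio.h>

/* Absolute difference of two unsigned bytes without signed arithmetic. */
static uint8_t absdiff_u8(uint8_t a, uint8_t b)
{
    uint8_t hi = a > b ? a : b;
    uint8_t lo = a > b ? b : a;
    return hi - lo;
}

int main(void)
{
    printf("%u %u\n", absdiff_u8(3, 250), absdiff_u8(250, 3)); /* 247 247 */
    return 0;
}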
*/ - sad = vec_sum4s(t5, sad); - - pix1 += line_size; - pix2 += line_size; - } - - /* Sum up the four partial sums, and put the result into s. */ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); - sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, &s); - - return s; -} - -static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, - int line_size, int h) -{ - int i, s; - const vector unsigned int zero = - (const vector unsigned int) vec_splat_u32(0); - const vector unsigned char permclear = - (vector unsigned char) - { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 }; - vector unsigned char perm1 = vec_lvsl(0, pix1); - vector unsigned char perm2 = vec_lvsl(0, pix2); - vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); - vector signed int sumdiffs; - - for (i = 0; i < h; i++) { - /* Read potentially unaligned pixels into t1 and t2. - * Since we're reading 16 pixels, and actually only want 8, - * mask out the last 8 pixels. The 0s don't change the sum. */ - vector unsigned char pix1l = vec_ld(0, pix1); - vector unsigned char pix1r = vec_ld(7, pix1); - vector unsigned char pix2l = vec_ld(0, pix2); - vector unsigned char pix2r = vec_ld(7, pix2); - vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1), - permclear); - vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2), - permclear); - - /* Calculate a sum of abs differences vector. */ - vector unsigned char t3 = vec_max(t1, t2); - vector unsigned char t4 = vec_min(t1, t2); - vector unsigned char t5 = vec_sub(t3, t4); - - /* Add each 4 pixel group together and put 4 results into sad. */ - sad = vec_sum4s(t5, sad); - - pix1 += line_size; - pix2 += line_size; - } - - /* Sum up the four partial sums, and put the result into s. */ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); - sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, &s); - - return s; -} - -/* Sum of Squared Errors for an 8x8 block, AltiVec-enhanced. - * It's the sad8_altivec code above w/ squaring added. */ -static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, - int line_size, int h) -{ - int i, s; - const vector unsigned int zero = - (const vector unsigned int) vec_splat_u32(0); - const vector unsigned char permclear = - (vector unsigned char) - { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 }; - vector unsigned char perm1 = vec_lvsl(0, pix1); - vector unsigned char perm2 = vec_lvsl(0, pix2); - vector unsigned int sum = (vector unsigned int) vec_splat_u32(0); - vector signed int sumsqr; - - for (i = 0; i < h; i++) { - /* Read potentially unaligned pixels into t1 and t2. - * Since we're reading 16 pixels, and actually only want 8, - * mask out the last 8 pixels. The 0s don't change the sum. */ - vector unsigned char pix1l = vec_ld(0, pix1); - vector unsigned char pix1r = vec_ld(7, pix1); - vector unsigned char pix2l = vec_ld(0, pix2); - vector unsigned char pix2r = vec_ld(7, pix2); - vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1), - permclear); - vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2), - permclear); - - /* Since we want to use unsigned chars, we can take advantage - * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */ - - /* Calculate abs differences vector. */ - vector unsigned char t3 = vec_max(t1, t2); - vector unsigned char t4 = vec_min(t1, t2); - vector unsigned char t5 = vec_sub(t3, t4); - - /* Square the values and add them to our sum. 
*/ - sum = vec_msum(t5, t5, sum); - - pix1 += line_size; - pix2 += line_size; - } - - /* Sum up the four partial sums, and put the result into s. */ - sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); - sumsqr = vec_splat(sumsqr, 3); - vec_ste(sumsqr, 0, &s); - - return s; -} - -/* Sum of Squared Errors for a 16x16 block, AltiVec-enhanced. - * It's the sad16_altivec code above w/ squaring added. */ -static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, - int line_size, int h) -{ - int i, s; - const vector unsigned int zero = - (const vector unsigned int) vec_splat_u32(0); - vector unsigned char perm = vec_lvsl(0, pix2); - vector unsigned int sum = (vector unsigned int) vec_splat_u32(0); - vector signed int sumsqr; - - for (i = 0; i < h; i++) { - /* Read potentially unaligned pixels into t1 and t2. */ - vector unsigned char pix2l = vec_ld(0, pix2); - vector unsigned char pix2r = vec_ld(15, pix2); - vector unsigned char t1 = vec_ld(0, pix1); - vector unsigned char t2 = vec_perm(pix2l, pix2r, perm); - - /* Since we want to use unsigned chars, we can take advantage - * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */ - - /* Calculate abs differences vector. */ - vector unsigned char t3 = vec_max(t1, t2); - vector unsigned char t4 = vec_min(t1, t2); - vector unsigned char t5 = vec_sub(t3, t4); - - /* Square the values and add them to our sum. */ - sum = vec_msum(t5, t5, sum); - - pix1 += line_size; - pix2 += line_size; - } - - /* Sum up the four partial sums, and put the result into s. */ - sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); - sumsqr = vec_splat(sumsqr, 3); - vec_ste(sumsqr, 0, &s); - - return s; -} - -static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst, - uint8_t *src, int stride, int h) -{ - int sum; - register const vector unsigned char vzero = - (const vector unsigned char) vec_splat_u8(0); - register vector signed short temp0, temp1, temp2, temp3, temp4, - temp5, temp6, temp7; - { - register const vector signed short vprod1 = - (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 }; - register const vector signed short vprod2 = - (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 }; - register const vector signed short vprod3 = - (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 }; - register const vector unsigned char perm1 = - (const vector unsigned char) - { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, - 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D }; - register const vector unsigned char perm2 = - (const vector unsigned char) - { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, - 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B }; - register const vector unsigned char perm3 = - (const vector unsigned char) - { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 }; - -#define ONEITERBUTTERFLY(i, res) \ - { \ - register vector unsigned char src1 = vec_ld(stride * i, src); \ - register vector unsigned char src2 = vec_ld(stride * i + 15, src); \ - register vector unsigned char srcO = \ - vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ - register vector unsigned char dst1 = vec_ld(stride * i, dst); \ - register vector unsigned char dst2 = vec_ld(stride * i + 15, dst); \ - register vector unsigned char dstO = \ - vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ - \ - /* Promote the unsigned chars to signed shorts. */ \ - /* We're in the 8x8 function, we only care for the first 8. 
*/ \ - register vector signed short srcV = \ - (vector signed short) vec_mergeh((vector signed char) vzero, \ - (vector signed char) srcO); \ - register vector signed short dstV = \ - (vector signed short) vec_mergeh((vector signed char) vzero, \ - (vector signed char) dstO); \ - \ - /* subtractions inside the first butterfly */ \ - register vector signed short but0 = vec_sub(srcV, dstV); \ - register vector signed short op1 = vec_perm(but0, but0, perm1); \ - register vector signed short but1 = vec_mladd(but0, vprod1, op1); \ - register vector signed short op2 = vec_perm(but1, but1, perm2); \ - register vector signed short but2 = vec_mladd(but1, vprod2, op2); \ - register vector signed short op3 = vec_perm(but2, but2, perm3); \ - res = vec_mladd(but2, vprod3, op3); \ - } - ONEITERBUTTERFLY(0, temp0); - ONEITERBUTTERFLY(1, temp1); - ONEITERBUTTERFLY(2, temp2); - ONEITERBUTTERFLY(3, temp3); - ONEITERBUTTERFLY(4, temp4); - ONEITERBUTTERFLY(5, temp5); - ONEITERBUTTERFLY(6, temp6); - ONEITERBUTTERFLY(7, temp7); - } -#undef ONEITERBUTTERFLY - { - register vector signed int vsum; - register vector signed short line0 = vec_add(temp0, temp1); - register vector signed short line1 = vec_sub(temp0, temp1); - register vector signed short line2 = vec_add(temp2, temp3); - register vector signed short line3 = vec_sub(temp2, temp3); - register vector signed short line4 = vec_add(temp4, temp5); - register vector signed short line5 = vec_sub(temp4, temp5); - register vector signed short line6 = vec_add(temp6, temp7); - register vector signed short line7 = vec_sub(temp6, temp7); - - register vector signed short line0B = vec_add(line0, line2); - register vector signed short line2B = vec_sub(line0, line2); - register vector signed short line1B = vec_add(line1, line3); - register vector signed short line3B = vec_sub(line1, line3); - register vector signed short line4B = vec_add(line4, line6); - register vector signed short line6B = vec_sub(line4, line6); - register vector signed short line5B = vec_add(line5, line7); - register vector signed short line7B = vec_sub(line5, line7); - - register vector signed short line0C = vec_add(line0B, line4B); - register vector signed short line4C = vec_sub(line0B, line4B); - register vector signed short line1C = vec_add(line1B, line5B); - register vector signed short line5C = vec_sub(line1B, line5B); - register vector signed short line2C = vec_add(line2B, line6B); - register vector signed short line6C = vec_sub(line2B, line6B); - register vector signed short line3C = vec_add(line3B, line7B); - register vector signed short line7C = vec_sub(line3B, line7B); - - vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0)); - vsum = vec_sum4s(vec_abs(line1C), vsum); - vsum = vec_sum4s(vec_abs(line2C), vsum); - vsum = vec_sum4s(vec_abs(line3C), vsum); - vsum = vec_sum4s(vec_abs(line4C), vsum); - vsum = vec_sum4s(vec_abs(line5C), vsum); - vsum = vec_sum4s(vec_abs(line6C), vsum); - vsum = vec_sum4s(vec_abs(line7C), vsum); - vsum = vec_sums(vsum, (vector signed int) vzero); - vsum = vec_splat(vsum, 3); - vec_ste(vsum, 0, &sum); - } - return sum; -} - -/* - * 16x8 works with 16 elements; it allows to avoid replicating loads, and - * gives the compiler more room for scheduling. It's only used from - * inside hadamard8_diff16_altivec. - * - * Unfortunately, it seems gcc-3.3 is a bit dumb, and the compiled code has - * a LOT of spill code, it seems gcc (unlike xlc) cannot keep everything in - * registers by itself. The following code includes hand-made register - * allocation. 
It's not clean, but on a 7450 the resulting code is much faster - * (best case falls from 700+ cycles to 550). - * - * xlc doesn't add spill code, but it doesn't know how to schedule for the - * 7450, and its code isn't much faster than gcc-3.3 on the 7450 (but uses - * 25% fewer instructions...) - * - * On the 970, the hand-made RA is still a win (around 690 vs. around 780), - * but xlc goes to around 660 on the regular C code... - */ -static int hadamard8_diff16x8_altivec(MpegEncContext *s, uint8_t *dst, - uint8_t *src, int stride, int h) -{ - int sum; - register vector signed short - temp0 __asm__ ("v0"), - temp1 __asm__ ("v1"), - temp2 __asm__ ("v2"), - temp3 __asm__ ("v3"), - temp4 __asm__ ("v4"), - temp5 __asm__ ("v5"), - temp6 __asm__ ("v6"), - temp7 __asm__ ("v7"); - register vector signed short - temp0S __asm__ ("v8"), - temp1S __asm__ ("v9"), - temp2S __asm__ ("v10"), - temp3S __asm__ ("v11"), - temp4S __asm__ ("v12"), - temp5S __asm__ ("v13"), - temp6S __asm__ ("v14"), - temp7S __asm__ ("v15"); - register const vector unsigned char vzero __asm__ ("v31") = - (const vector unsigned char) vec_splat_u8(0); - { - register const vector signed short vprod1 __asm__ ("v16") = - (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 }; - - register const vector signed short vprod2 __asm__ ("v17") = - (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 }; - - register const vector signed short vprod3 __asm__ ("v18") = - (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 }; - - register const vector unsigned char perm1 __asm__ ("v19") = - (const vector unsigned char) - { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, - 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D }; - - register const vector unsigned char perm2 __asm__ ("v20") = - (const vector unsigned char) - { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, - 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B }; - - register const vector unsigned char perm3 __asm__ ("v21") = - (const vector unsigned char) - { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 }; - -#define ONEITERBUTTERFLY(i, res1, res2) \ - { \ - register vector unsigned char src1 __asm__ ("v22") = \ - vec_ld(stride * i, src); \ - register vector unsigned char src2 __asm__ ("v23") = \ - vec_ld(stride * i + 16, src); \ - register vector unsigned char srcO __asm__ ("v22") = \ - vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ - register vector unsigned char dst1 __asm__ ("v24") = \ - vec_ld(stride * i, dst); \ - register vector unsigned char dst2 __asm__ ("v25") = \ - vec_ld(stride * i + 16, dst); \ - register vector unsigned char dstO __asm__ ("v23") = \ - vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ - \ - /* Promote the unsigned chars to signed shorts. 
*/ \ - register vector signed short srcV __asm__ ("v24") = \ - (vector signed short) vec_mergeh((vector signed char) vzero, \ - (vector signed char) srcO); \ - register vector signed short dstV __asm__ ("v25") = \ - (vector signed short) vec_mergeh((vector signed char) vzero, \ - (vector signed char) dstO); \ - register vector signed short srcW __asm__ ("v26") = \ - (vector signed short) vec_mergel((vector signed char) vzero, \ - (vector signed char) srcO); \ - register vector signed short dstW __asm__ ("v27") = \ - (vector signed short) vec_mergel((vector signed char) vzero, \ - (vector signed char) dstO); \ - \ - /* subtractions inside the first butterfly */ \ - register vector signed short but0 __asm__ ("v28") = \ - vec_sub(srcV, dstV); \ - register vector signed short but0S __asm__ ("v29") = \ - vec_sub(srcW, dstW); \ - register vector signed short op1 __asm__ ("v30") = \ - vec_perm(but0, but0, perm1); \ - register vector signed short but1 __asm__ ("v22") = \ - vec_mladd(but0, vprod1, op1); \ - register vector signed short op1S __asm__ ("v23") = \ - vec_perm(but0S, but0S, perm1); \ - register vector signed short but1S __asm__ ("v24") = \ - vec_mladd(but0S, vprod1, op1S); \ - register vector signed short op2 __asm__ ("v25") = \ - vec_perm(but1, but1, perm2); \ - register vector signed short but2 __asm__ ("v26") = \ - vec_mladd(but1, vprod2, op2); \ - register vector signed short op2S __asm__ ("v27") = \ - vec_perm(but1S, but1S, perm2); \ - register vector signed short but2S __asm__ ("v28") = \ - vec_mladd(but1S, vprod2, op2S); \ - register vector signed short op3 __asm__ ("v29") = \ - vec_perm(but2, but2, perm3); \ - register vector signed short op3S __asm__ ("v30") = \ - vec_perm(but2S, but2S, perm3); \ - res1 = vec_mladd(but2, vprod3, op3); \ - res2 = vec_mladd(but2S, vprod3, op3S); \ - } - ONEITERBUTTERFLY(0, temp0, temp0S); - ONEITERBUTTERFLY(1, temp1, temp1S); - ONEITERBUTTERFLY(2, temp2, temp2S); - ONEITERBUTTERFLY(3, temp3, temp3S); - ONEITERBUTTERFLY(4, temp4, temp4S); - ONEITERBUTTERFLY(5, temp5, temp5S); - ONEITERBUTTERFLY(6, temp6, temp6S); - ONEITERBUTTERFLY(7, temp7, temp7S); - } -#undef ONEITERBUTTERFLY - { - register vector signed int vsum; - - register vector signed short line0 = vec_add(temp0, temp1); - register vector signed short line1 = vec_sub(temp0, temp1); - register vector signed short line2 = vec_add(temp2, temp3); - register vector signed short line3 = vec_sub(temp2, temp3); - register vector signed short line4 = vec_add(temp4, temp5); - register vector signed short line5 = vec_sub(temp4, temp5); - register vector signed short line6 = vec_add(temp6, temp7); - register vector signed short line7 = vec_sub(temp6, temp7); - - register vector signed short line0B = vec_add(line0, line2); - register vector signed short line2B = vec_sub(line0, line2); - register vector signed short line1B = vec_add(line1, line3); - register vector signed short line3B = vec_sub(line1, line3); - register vector signed short line4B = vec_add(line4, line6); - register vector signed short line6B = vec_sub(line4, line6); - register vector signed short line5B = vec_add(line5, line7); - register vector signed short line7B = vec_sub(line5, line7); - - register vector signed short line0C = vec_add(line0B, line4B); - register vector signed short line4C = vec_sub(line0B, line4B); - register vector signed short line1C = vec_add(line1B, line5B); - register vector signed short line5C = vec_sub(line1B, line5B); - register vector signed short line2C = vec_add(line2B, line6B); - register vector signed 
short line6C = vec_sub(line2B, line6B); - register vector signed short line3C = vec_add(line3B, line7B); - register vector signed short line7C = vec_sub(line3B, line7B); - - register vector signed short line0S = vec_add(temp0S, temp1S); - register vector signed short line1S = vec_sub(temp0S, temp1S); - register vector signed short line2S = vec_add(temp2S, temp3S); - register vector signed short line3S = vec_sub(temp2S, temp3S); - register vector signed short line4S = vec_add(temp4S, temp5S); - register vector signed short line5S = vec_sub(temp4S, temp5S); - register vector signed short line6S = vec_add(temp6S, temp7S); - register vector signed short line7S = vec_sub(temp6S, temp7S); - - register vector signed short line0BS = vec_add(line0S, line2S); - register vector signed short line2BS = vec_sub(line0S, line2S); - register vector signed short line1BS = vec_add(line1S, line3S); - register vector signed short line3BS = vec_sub(line1S, line3S); - register vector signed short line4BS = vec_add(line4S, line6S); - register vector signed short line6BS = vec_sub(line4S, line6S); - register vector signed short line5BS = vec_add(line5S, line7S); - register vector signed short line7BS = vec_sub(line5S, line7S); - - register vector signed short line0CS = vec_add(line0BS, line4BS); - register vector signed short line4CS = vec_sub(line0BS, line4BS); - register vector signed short line1CS = vec_add(line1BS, line5BS); - register vector signed short line5CS = vec_sub(line1BS, line5BS); - register vector signed short line2CS = vec_add(line2BS, line6BS); - register vector signed short line6CS = vec_sub(line2BS, line6BS); - register vector signed short line3CS = vec_add(line3BS, line7BS); - register vector signed short line7CS = vec_sub(line3BS, line7BS); - - vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0)); - vsum = vec_sum4s(vec_abs(line1C), vsum); - vsum = vec_sum4s(vec_abs(line2C), vsum); - vsum = vec_sum4s(vec_abs(line3C), vsum); - vsum = vec_sum4s(vec_abs(line4C), vsum); - vsum = vec_sum4s(vec_abs(line5C), vsum); - vsum = vec_sum4s(vec_abs(line6C), vsum); - vsum = vec_sum4s(vec_abs(line7C), vsum); - - vsum = vec_sum4s(vec_abs(line0CS), vsum); - vsum = vec_sum4s(vec_abs(line1CS), vsum); - vsum = vec_sum4s(vec_abs(line2CS), vsum); - vsum = vec_sum4s(vec_abs(line3CS), vsum); - vsum = vec_sum4s(vec_abs(line4CS), vsum); - vsum = vec_sum4s(vec_abs(line5CS), vsum); - vsum = vec_sum4s(vec_abs(line6CS), vsum); - vsum = vec_sum4s(vec_abs(line7CS), vsum); - vsum = vec_sums(vsum, (vector signed int) vzero); - vsum = vec_splat(vsum, 3); - vec_ste(vsum, 0, &sum); - } - return sum; -} - -static int hadamard8_diff16_altivec(MpegEncContext *s, uint8_t *dst, - uint8_t *src, int stride, int h) -{ - int score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8); - - if (h == 16) { - dst += 8 * stride; - src += 8 * stride; - score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8); - } - return score; -} -#endif /* HAVE_ALTIVEC */ - -av_cold void ff_dsputil_init_ppc(DSPContext *c, AVCodecContext *avctx) -{ -#if HAVE_ALTIVEC - if (!PPC_ALTIVEC(av_get_cpu_flags())) - return; - - c->pix_abs[0][1] = sad16_x2_altivec; - c->pix_abs[0][2] = sad16_y2_altivec; - c->pix_abs[0][3] = sad16_xy2_altivec; - c->pix_abs[0][0] = sad16_altivec; - c->pix_abs[1][0] = sad8_altivec; - - c->sad[0] = sad16_altivec; - c->sad[1] = sad8_altivec; - c->sse[0] = sse16_altivec; - c->sse[1] = sse8_altivec; - - c->hadamard8_diff[0] = hadamard8_diff16_altivec; - c->hadamard8_diff[1] = hadamard8_diff8x8_altivec; -#endif /* HAVE_ALTIVEC */ -} diff 
--git a/libavcodec/ppc/me_cmp.c b/libavcodec/ppc/me_cmp.c new file mode 100644 index 0000000000..88c7feaa7e --- /dev/null +++ b/libavcodec/ppc/me_cmp.c @@ -0,0 +1,767 @@ +/* + * Copyright (c) 2002 Brian Foley + * Copyright (c) 2002 Dieter Shirley + * Copyright (c) 2003-2004 Romain Dolbeau + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#if HAVE_ALTIVEC_H +#include +#endif + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/ppc/cpu.h" +#include "libavutil/ppc/types_altivec.h" +#include "libavutil/ppc/util_altivec.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/mpegvideo.h" +#include "libavcodec/me_cmp.h" + +#if HAVE_ALTIVEC +static int sad16_x2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + int line_size, int h) +{ + int i, s = 0; + const vector unsigned char zero = + (const vector unsigned char) vec_splat_u8(0); + vector unsigned char perm1 = vec_lvsl(0, pix2); + vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1)); + vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); + vector signed int sumdiffs; + + for (i = 0; i < h; i++) { + /* Read unaligned pixels into our vectors. The vectors are as follows: + * pix1v: pix1[0] - pix1[15] + * pix2v: pix2[0] - pix2[15] pix2iv: pix2[1] - pix2[16] */ + vector unsigned char pix1v = vec_ld(0, pix1); + vector unsigned char pix2l = vec_ld(0, pix2); + vector unsigned char pix2r = vec_ld(16, pix2); + vector unsigned char pix2v = vec_perm(pix2l, pix2r, perm1); + vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2); + + /* Calculate the average vector. */ + vector unsigned char avgv = vec_avg(pix2v, pix2iv); + + /* Calculate a sum of abs differences vector. */ + vector unsigned char t5 = vec_sub(vec_max(pix1v, avgv), + vec_min(pix1v, avgv)); + + /* Add each 4 pixel group together and put 4 results into sad. */ + sad = vec_sum4s(t5, sad); + + pix1 += line_size; + pix2 += line_size; + } + /* Sum up the four partial sums, and put the result into s. */ + sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); + sumdiffs = vec_splat(sumdiffs, 3); + vec_ste(sumdiffs, 0, &s); + + return s; +} + +static int sad16_y2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + int line_size, int h) +{ + int i, s = 0; + const vector unsigned char zero = + (const vector unsigned char) vec_splat_u8(0); + vector unsigned char perm = vec_lvsl(0, pix2); + vector unsigned char pix1v, pix3v, avgv, t5; + vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); + vector signed int sumdiffs; + uint8_t *pix3 = pix2 + line_size; + + /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one + * iteration becomes pix2 in the next iteration. 
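
As an aside for readers of the new AltiVec path: sad16_x2_altivec computes a 16-wide SAD against a horizontally half-pel interpolated reference, using the same round-up averaging as vec_avg, i.e. (a + b + 1) >> 1. A plain-C sketch of that comparison follows (sad16_x2_ref is an illustrative name, not code from this patch):

#include <stdint.h>
#include <stdlib.h>

/* Scalar sketch of a 16-wide SAD against the (x + 1/2) half-pel
 * reference, mirroring the round-up averaging of vec_avg. */
static int sad16_x2_ref(const uint8_t *pix1, const uint8_t *pix2,
                        int line_size, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 16; x++) {
            int avg = (pix2[x] + pix2[x + 1] + 1) >> 1;
            sum += abs(pix1[x] - avg);
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
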
We can use this + * fact to avoid a potentially expensive unaligned read, each + * time around the loop. + * Read unaligned pixels into our vectors. The vectors are as follows: + * pix2v: pix2[0] - pix2[15] + * Split the pixel vectors into shorts. */ + vector unsigned char pix2l = vec_ld(0, pix2); + vector unsigned char pix2r = vec_ld(15, pix2); + vector unsigned char pix2v = vec_perm(pix2l, pix2r, perm); + + for (i = 0; i < h; i++) { + /* Read unaligned pixels into our vectors. The vectors are as follows: + * pix1v: pix1[0] - pix1[15] + * pix3v: pix3[0] - pix3[15] */ + pix1v = vec_ld(0, pix1); + + pix2l = vec_ld(0, pix3); + pix2r = vec_ld(15, pix3); + pix3v = vec_perm(pix2l, pix2r, perm); + + /* Calculate the average vector. */ + avgv = vec_avg(pix2v, pix3v); + + /* Calculate a sum of abs differences vector. */ + t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); + + /* Add each 4 pixel group together and put 4 results into sad. */ + sad = vec_sum4s(t5, sad); + + pix1 += line_size; + pix2v = pix3v; + pix3 += line_size; + } + + /* Sum up the four partial sums, and put the result into s. */ + sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); + sumdiffs = vec_splat(sumdiffs, 3); + vec_ste(sumdiffs, 0, &s); + return s; +} + +static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + int line_size, int h) +{ + int i, s = 0; + uint8_t *pix3 = pix2 + line_size; + const vector unsigned char zero = + (const vector unsigned char) vec_splat_u8(0); + const vector unsigned short two = + (const vector unsigned short) vec_splat_u16(2); + vector unsigned char avgv, t5; + vector unsigned char perm1 = vec_lvsl(0, pix2); + vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1)); + vector unsigned char pix1v, pix3v, pix3iv; + vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv; + vector unsigned short avghv, avglv; + vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); + vector signed int sumdiffs; + + /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one + * iteration becomes pix2 in the next iteration. We can use this + * fact to avoid a potentially expensive unaligned read, as well + * as some splitting, and vector addition each time around the loop. + * Read unaligned pixels into our vectors. The vectors are as follows: + * pix2v: pix2[0] - pix2[15] pix2iv: pix2[1] - pix2[16] + * Split the pixel vectors into shorts. */ + vector unsigned char pix2l = vec_ld(0, pix2); + vector unsigned char pix2r = vec_ld(16, pix2); + vector unsigned char pix2v = vec_perm(pix2l, pix2r, perm1); + vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2); + + vector unsigned short pix2hv = + (vector unsigned short) vec_mergeh(zero, pix2v); + vector unsigned short pix2lv = + (vector unsigned short) vec_mergel(zero, pix2v); + vector unsigned short pix2ihv = + (vector unsigned short) vec_mergeh(zero, pix2iv); + vector unsigned short pix2ilv = + (vector unsigned short) vec_mergel(zero, pix2iv); + vector unsigned short t1 = vec_add(pix2hv, pix2ihv); + vector unsigned short t2 = vec_add(pix2lv, pix2ilv); + vector unsigned short t3, t4; + + for (i = 0; i < h; i++) { + /* Read unaligned pixels into our vectors. 
The vectors are as follows: + * pix1v: pix1[0] - pix1[15] + * pix3v: pix3[0] - pix3[15] pix3iv: pix3[1] - pix3[16] */ + pix1v = vec_ld(0, pix1); + + pix2l = vec_ld(0, pix3); + pix2r = vec_ld(16, pix3); + pix3v = vec_perm(pix2l, pix2r, perm1); + pix3iv = vec_perm(pix2l, pix2r, perm2); + + /* Note that AltiVec does have vec_avg, but this works on vector pairs + * and rounds up. We could do avg(avg(a, b), avg(c, d)), but the + * rounding would mean that, for example, avg(3, 0, 0, 1) = 2, when + * it should be 1. Instead, we have to split the pixel vectors into + * vectors of shorts and do the averaging by hand. */ + + /* Split the pixel vectors into shorts. */ + pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v); + pix3lv = (vector unsigned short) vec_mergel(zero, pix3v); + pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv); + pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv); + + /* Do the averaging on them. */ + t3 = vec_add(pix3hv, pix3ihv); + t4 = vec_add(pix3lv, pix3ilv); + + avghv = vec_sr(vec_add(vec_add(t1, t3), two), two); + avglv = vec_sr(vec_add(vec_add(t2, t4), two), two); + + /* Pack the shorts back into a result. */ + avgv = vec_pack(avghv, avglv); + + /* Calculate a sum of abs differences vector. */ + t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); + + /* Add each 4 pixel group together and put 4 results into sad. */ + sad = vec_sum4s(t5, sad); + + pix1 += line_size; + pix3 += line_size; + /* Transfer the calculated values for pix3 into pix2. */ + t1 = t3; + t2 = t4; + } + /* Sum up the four partial sums, and put the result into s. */ + sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); + sumdiffs = vec_splat(sumdiffs, 3); + vec_ste(sumdiffs, 0, &s); + + return s; +} + +static int sad16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + int line_size, int h) +{ + int i, s; + const vector unsigned int zero = + (const vector unsigned int) vec_splat_u32(0); + vector unsigned char perm = vec_lvsl(0, pix2); + vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); + vector signed int sumdiffs; + + for (i = 0; i < h; i++) { + /* Read potentially unaligned pixels into t1 and t2. */ + vector unsigned char pix2l = vec_ld(0, pix2); + vector unsigned char pix2r = vec_ld(15, pix2); + vector unsigned char t1 = vec_ld(0, pix1); + vector unsigned char t2 = vec_perm(pix2l, pix2r, perm); + + /* Calculate a sum of abs differences vector. */ + vector unsigned char t3 = vec_max(t1, t2); + vector unsigned char t4 = vec_min(t1, t2); + vector unsigned char t5 = vec_sub(t3, t4); + + /* Add each 4 pixel group together and put 4 results into sad. */ + sad = vec_sum4s(t5, sad); + + pix1 += line_size; + pix2 += line_size; + } + + /* Sum up the four partial sums, and put the result into s. 
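
To make the rounding caveat in sad16_xy2_altivec concrete: for the four neighbours 3, 0, 0, 1 the half-pel value should be (3 + 0 + 0 + 1 + 2) >> 2 = 1, but cascaded round-up averages give avg(avg(3, 0), avg(0, 1)) = avg(2, 1) = 2, which is why the code widens to shorts and computes (t1 + t3 + 2) >> 2 explicitly. A tiny stand-alone check of that arithmetic (illustrative only):

#include <stdio.h>

/* Round-up average, as vec_avg does per element. */
static int avg_up(int a, int b) { return (a + b + 1) >> 1; }

int main(void)
{
    int a = 3, b = 0, c = 0, d = 1;
    printf("%d\n", (a + b + c + d + 2) >> 2);           /* 1: what the code computes */
    printf("%d\n", avg_up(avg_up(a, b), avg_up(c, d))); /* 2: cascaded vec_avg result */
    return 0;
}
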
*/ + sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); + sumdiffs = vec_splat(sumdiffs, 3); + vec_ste(sumdiffs, 0, &s); + + return s; +} + +static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + int line_size, int h) +{ + int i, s; + const vector unsigned int zero = + (const vector unsigned int) vec_splat_u32(0); + const vector unsigned char permclear = + (vector unsigned char) + { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 }; + vector unsigned char perm1 = vec_lvsl(0, pix1); + vector unsigned char perm2 = vec_lvsl(0, pix2); + vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); + vector signed int sumdiffs; + + for (i = 0; i < h; i++) { + /* Read potentially unaligned pixels into t1 and t2. + * Since we're reading 16 pixels, and actually only want 8, + * mask out the last 8 pixels. The 0s don't change the sum. */ + vector unsigned char pix1l = vec_ld(0, pix1); + vector unsigned char pix1r = vec_ld(7, pix1); + vector unsigned char pix2l = vec_ld(0, pix2); + vector unsigned char pix2r = vec_ld(7, pix2); + vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1), + permclear); + vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2), + permclear); + + /* Calculate a sum of abs differences vector. */ + vector unsigned char t3 = vec_max(t1, t2); + vector unsigned char t4 = vec_min(t1, t2); + vector unsigned char t5 = vec_sub(t3, t4); + + /* Add each 4 pixel group together and put 4 results into sad. */ + sad = vec_sum4s(t5, sad); + + pix1 += line_size; + pix2 += line_size; + } + + /* Sum up the four partial sums, and put the result into s. */ + sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); + sumdiffs = vec_splat(sumdiffs, 3); + vec_ste(sumdiffs, 0, &s); + + return s; +} + +/* Sum of Squared Errors for an 8x8 block, AltiVec-enhanced. + * It's the sad8_altivec code above w/ squaring added. */ +static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + int line_size, int h) +{ + int i, s; + const vector unsigned int zero = + (const vector unsigned int) vec_splat_u32(0); + const vector unsigned char permclear = + (vector unsigned char) + { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 }; + vector unsigned char perm1 = vec_lvsl(0, pix1); + vector unsigned char perm2 = vec_lvsl(0, pix2); + vector unsigned int sum = (vector unsigned int) vec_splat_u32(0); + vector signed int sumsqr; + + for (i = 0; i < h; i++) { + /* Read potentially unaligned pixels into t1 and t2. + * Since we're reading 16 pixels, and actually only want 8, + * mask out the last 8 pixels. The 0s don't change the sum. */ + vector unsigned char pix1l = vec_ld(0, pix1); + vector unsigned char pix1r = vec_ld(7, pix1); + vector unsigned char pix2l = vec_ld(0, pix2); + vector unsigned char pix2r = vec_ld(7, pix2); + vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1), + permclear); + vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2), + permclear); + + /* Since we want to use unsigned chars, we can take advantage + * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */ + + /* Calculate abs differences vector. */ + vector unsigned char t3 = vec_max(t1, t2); + vector unsigned char t4 = vec_min(t1, t2); + vector unsigned char t5 = vec_sub(t3, t4); + + /* Square the values and add them to our sum. */ + sum = vec_msum(t5, t5, sum); + + pix1 += line_size; + pix2 += line_size; + } + + /* Sum up the four partial sums, and put the result into s. 
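
A scalar picture of the sad8/sse8 pair above may help: the max/min-and-subtract step yields the unsigned |a - b|, which is safe to square because |a - b|^2 == (a - b)^2. A minimal reference sketch (sse8_ref is an illustrative name, not code from this patch):

#include <stdint.h>

/* Scalar sketch of an 8-wide sum of squared errors.  Taking the
 * unsigned absolute difference first, as the vector code does with
 * max/min and subtract, does not change the squared result. */
static int sse8_ref(const uint8_t *pix1, const uint8_t *pix2,
                    int line_size, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 8; x++) {
            int d = pix1[x] > pix2[x] ? pix1[x] - pix2[x]
                                      : pix2[x] - pix1[x];
            sum += d * d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
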
*/ + sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); + sumsqr = vec_splat(sumsqr, 3); + vec_ste(sumsqr, 0, &s); + + return s; +} + +/* Sum of Squared Errors for a 16x16 block, AltiVec-enhanced. + * It's the sad16_altivec code above w/ squaring added. */ +static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + int line_size, int h) +{ + int i, s; + const vector unsigned int zero = + (const vector unsigned int) vec_splat_u32(0); + vector unsigned char perm = vec_lvsl(0, pix2); + vector unsigned int sum = (vector unsigned int) vec_splat_u32(0); + vector signed int sumsqr; + + for (i = 0; i < h; i++) { + /* Read potentially unaligned pixels into t1 and t2. */ + vector unsigned char pix2l = vec_ld(0, pix2); + vector unsigned char pix2r = vec_ld(15, pix2); + vector unsigned char t1 = vec_ld(0, pix1); + vector unsigned char t2 = vec_perm(pix2l, pix2r, perm); + + /* Since we want to use unsigned chars, we can take advantage + * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */ + + /* Calculate abs differences vector. */ + vector unsigned char t3 = vec_max(t1, t2); + vector unsigned char t4 = vec_min(t1, t2); + vector unsigned char t5 = vec_sub(t3, t4); + + /* Square the values and add them to our sum. */ + sum = vec_msum(t5, t5, sum); + + pix1 += line_size; + pix2 += line_size; + } + + /* Sum up the four partial sums, and put the result into s. */ + sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); + sumsqr = vec_splat(sumsqr, 3); + vec_ste(sumsqr, 0, &s); + + return s; +} + +static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst, + uint8_t *src, int stride, int h) +{ + int sum; + register const vector unsigned char vzero = + (const vector unsigned char) vec_splat_u8(0); + register vector signed short temp0, temp1, temp2, temp3, temp4, + temp5, temp6, temp7; + { + register const vector signed short vprod1 = + (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 }; + register const vector signed short vprod2 = + (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 }; + register const vector signed short vprod3 = + (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 }; + register const vector unsigned char perm1 = + (const vector unsigned char) + { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, + 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D }; + register const vector unsigned char perm2 = + (const vector unsigned char) + { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, + 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B }; + register const vector unsigned char perm3 = + (const vector unsigned char) + { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 }; + +#define ONEITERBUTTERFLY(i, res) \ + { \ + register vector unsigned char src1 = vec_ld(stride * i, src); \ + register vector unsigned char src2 = vec_ld(stride * i + 15, src); \ + register vector unsigned char srcO = \ + vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ + register vector unsigned char dst1 = vec_ld(stride * i, dst); \ + register vector unsigned char dst2 = vec_ld(stride * i + 15, dst); \ + register vector unsigned char dstO = \ + vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ + \ + /* Promote the unsigned chars to signed shorts. */ \ + /* We're in the 8x8 function, we only care for the first 8. 
*/ \ + register vector signed short srcV = \ + (vector signed short) vec_mergeh((vector signed char) vzero, \ + (vector signed char) srcO); \ + register vector signed short dstV = \ + (vector signed short) vec_mergeh((vector signed char) vzero, \ + (vector signed char) dstO); \ + \ + /* subtractions inside the first butterfly */ \ + register vector signed short but0 = vec_sub(srcV, dstV); \ + register vector signed short op1 = vec_perm(but0, but0, perm1); \ + register vector signed short but1 = vec_mladd(but0, vprod1, op1); \ + register vector signed short op2 = vec_perm(but1, but1, perm2); \ + register vector signed short but2 = vec_mladd(but1, vprod2, op2); \ + register vector signed short op3 = vec_perm(but2, but2, perm3); \ + res = vec_mladd(but2, vprod3, op3); \ + } + ONEITERBUTTERFLY(0, temp0); + ONEITERBUTTERFLY(1, temp1); + ONEITERBUTTERFLY(2, temp2); + ONEITERBUTTERFLY(3, temp3); + ONEITERBUTTERFLY(4, temp4); + ONEITERBUTTERFLY(5, temp5); + ONEITERBUTTERFLY(6, temp6); + ONEITERBUTTERFLY(7, temp7); + } +#undef ONEITERBUTTERFLY + { + register vector signed int vsum; + register vector signed short line0 = vec_add(temp0, temp1); + register vector signed short line1 = vec_sub(temp0, temp1); + register vector signed short line2 = vec_add(temp2, temp3); + register vector signed short line3 = vec_sub(temp2, temp3); + register vector signed short line4 = vec_add(temp4, temp5); + register vector signed short line5 = vec_sub(temp4, temp5); + register vector signed short line6 = vec_add(temp6, temp7); + register vector signed short line7 = vec_sub(temp6, temp7); + + register vector signed short line0B = vec_add(line0, line2); + register vector signed short line2B = vec_sub(line0, line2); + register vector signed short line1B = vec_add(line1, line3); + register vector signed short line3B = vec_sub(line1, line3); + register vector signed short line4B = vec_add(line4, line6); + register vector signed short line6B = vec_sub(line4, line6); + register vector signed short line5B = vec_add(line5, line7); + register vector signed short line7B = vec_sub(line5, line7); + + register vector signed short line0C = vec_add(line0B, line4B); + register vector signed short line4C = vec_sub(line0B, line4B); + register vector signed short line1C = vec_add(line1B, line5B); + register vector signed short line5C = vec_sub(line1B, line5B); + register vector signed short line2C = vec_add(line2B, line6B); + register vector signed short line6C = vec_sub(line2B, line6B); + register vector signed short line3C = vec_add(line3B, line7B); + register vector signed short line7C = vec_sub(line3B, line7B); + + vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0)); + vsum = vec_sum4s(vec_abs(line1C), vsum); + vsum = vec_sum4s(vec_abs(line2C), vsum); + vsum = vec_sum4s(vec_abs(line3C), vsum); + vsum = vec_sum4s(vec_abs(line4C), vsum); + vsum = vec_sum4s(vec_abs(line5C), vsum); + vsum = vec_sum4s(vec_abs(line6C), vsum); + vsum = vec_sum4s(vec_abs(line7C), vsum); + vsum = vec_sums(vsum, (vector signed int) vzero); + vsum = vec_splat(vsum, 3); + vec_ste(vsum, 0, &sum); + } + return sum; +} + +/* + * 16x8 works with 16 elements; it allows to avoid replicating loads, and + * gives the compiler more room for scheduling. It's only used from + * inside hadamard8_diff16_altivec. + * + * Unfortunately, it seems gcc-3.3 is a bit dumb, and the compiled code has + * a LOT of spill code, it seems gcc (unlike xlc) cannot keep everything in + * registers by itself. The following code includes hand-made register + * allocation. 
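
Both hadamard8_diff variants in this file compute a SATD: apply an unnormalized 8x8 Walsh-Hadamard transform to the src/dst difference block and sum the absolute coefficients; the register-allocation games only change how that is scheduled. A compact scalar model follows (illustrative; it should agree with the vector code up to the ordering of coefficients, which the absolute sum does not depend on; wht8 and hadamard8_diff_ref are made-up names):

#include <stdint.h>
#include <stdlib.h>

/* Unnormalized 8-point Walsh-Hadamard transform, in place. */
static void wht8(int v[8])
{
    for (int len = 1; len < 8; len <<= 1)
        for (int i = 0; i < 8; i += 2 * len)
            for (int j = i; j < i + len; j++) {
                int a = v[j], b = v[j + len];
                v[j]       = a + b;
                v[j + len] = a - b;
            }
}

/* Scalar model of hadamard8_diff8x8: 2-D WHT of (src - dst), then the
 * sum of absolute transform coefficients. */
static int hadamard8_diff_ref(const uint8_t *dst, const uint8_t *src,
                              int stride)
{
    int m[8][8], col[8], sum = 0;

    for (int i = 0; i < 8; i++)
        for (int j = 0; j < 8; j++)
            m[i][j] = src[i * stride + j] - dst[i * stride + j];

    for (int i = 0; i < 8; i++)                 /* transform rows    */
        wht8(m[i]);
    for (int j = 0; j < 8; j++) {               /* transform columns */
        for (int i = 0; i < 8; i++)
            col[i] = m[i][j];
        wht8(col);
        for (int i = 0; i < 8; i++)
            m[i][j] = col[i];
    }

    for (int i = 0; i < 8; i++)
        for (int j = 0; j < 8; j++)
            sum += abs(m[i][j]);
    return sum;
}
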
It's not clean, but on a 7450 the resulting code is much faster + * (best case falls from 700+ cycles to 550). + * + * xlc doesn't add spill code, but it doesn't know how to schedule for the + * 7450, and its code isn't much faster than gcc-3.3 on the 7450 (but uses + * 25% fewer instructions...) + * + * On the 970, the hand-made RA is still a win (around 690 vs. around 780), + * but xlc goes to around 660 on the regular C code... + */ +static int hadamard8_diff16x8_altivec(MpegEncContext *s, uint8_t *dst, + uint8_t *src, int stride, int h) +{ + int sum; + register vector signed short + temp0 __asm__ ("v0"), + temp1 __asm__ ("v1"), + temp2 __asm__ ("v2"), + temp3 __asm__ ("v3"), + temp4 __asm__ ("v4"), + temp5 __asm__ ("v5"), + temp6 __asm__ ("v6"), + temp7 __asm__ ("v7"); + register vector signed short + temp0S __asm__ ("v8"), + temp1S __asm__ ("v9"), + temp2S __asm__ ("v10"), + temp3S __asm__ ("v11"), + temp4S __asm__ ("v12"), + temp5S __asm__ ("v13"), + temp6S __asm__ ("v14"), + temp7S __asm__ ("v15"); + register const vector unsigned char vzero __asm__ ("v31") = + (const vector unsigned char) vec_splat_u8(0); + { + register const vector signed short vprod1 __asm__ ("v16") = + (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 }; + + register const vector signed short vprod2 __asm__ ("v17") = + (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 }; + + register const vector signed short vprod3 __asm__ ("v18") = + (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 }; + + register const vector unsigned char perm1 __asm__ ("v19") = + (const vector unsigned char) + { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, + 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D }; + + register const vector unsigned char perm2 __asm__ ("v20") = + (const vector unsigned char) + { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, + 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B }; + + register const vector unsigned char perm3 __asm__ ("v21") = + (const vector unsigned char) + { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 }; + +#define ONEITERBUTTERFLY(i, res1, res2) \ + { \ + register vector unsigned char src1 __asm__ ("v22") = \ + vec_ld(stride * i, src); \ + register vector unsigned char src2 __asm__ ("v23") = \ + vec_ld(stride * i + 16, src); \ + register vector unsigned char srcO __asm__ ("v22") = \ + vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ + register vector unsigned char dst1 __asm__ ("v24") = \ + vec_ld(stride * i, dst); \ + register vector unsigned char dst2 __asm__ ("v25") = \ + vec_ld(stride * i + 16, dst); \ + register vector unsigned char dstO __asm__ ("v23") = \ + vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ + \ + /* Promote the unsigned chars to signed shorts. 
*/ \ + register vector signed short srcV __asm__ ("v24") = \ + (vector signed short) vec_mergeh((vector signed char) vzero, \ + (vector signed char) srcO); \ + register vector signed short dstV __asm__ ("v25") = \ + (vector signed short) vec_mergeh((vector signed char) vzero, \ + (vector signed char) dstO); \ + register vector signed short srcW __asm__ ("v26") = \ + (vector signed short) vec_mergel((vector signed char) vzero, \ + (vector signed char) srcO); \ + register vector signed short dstW __asm__ ("v27") = \ + (vector signed short) vec_mergel((vector signed char) vzero, \ + (vector signed char) dstO); \ + \ + /* subtractions inside the first butterfly */ \ + register vector signed short but0 __asm__ ("v28") = \ + vec_sub(srcV, dstV); \ + register vector signed short but0S __asm__ ("v29") = \ + vec_sub(srcW, dstW); \ + register vector signed short op1 __asm__ ("v30") = \ + vec_perm(but0, but0, perm1); \ + register vector signed short but1 __asm__ ("v22") = \ + vec_mladd(but0, vprod1, op1); \ + register vector signed short op1S __asm__ ("v23") = \ + vec_perm(but0S, but0S, perm1); \ + register vector signed short but1S __asm__ ("v24") = \ + vec_mladd(but0S, vprod1, op1S); \ + register vector signed short op2 __asm__ ("v25") = \ + vec_perm(but1, but1, perm2); \ + register vector signed short but2 __asm__ ("v26") = \ + vec_mladd(but1, vprod2, op2); \ + register vector signed short op2S __asm__ ("v27") = \ + vec_perm(but1S, but1S, perm2); \ + register vector signed short but2S __asm__ ("v28") = \ + vec_mladd(but1S, vprod2, op2S); \ + register vector signed short op3 __asm__ ("v29") = \ + vec_perm(but2, but2, perm3); \ + register vector signed short op3S __asm__ ("v30") = \ + vec_perm(but2S, but2S, perm3); \ + res1 = vec_mladd(but2, vprod3, op3); \ + res2 = vec_mladd(but2S, vprod3, op3S); \ + } + ONEITERBUTTERFLY(0, temp0, temp0S); + ONEITERBUTTERFLY(1, temp1, temp1S); + ONEITERBUTTERFLY(2, temp2, temp2S); + ONEITERBUTTERFLY(3, temp3, temp3S); + ONEITERBUTTERFLY(4, temp4, temp4S); + ONEITERBUTTERFLY(5, temp5, temp5S); + ONEITERBUTTERFLY(6, temp6, temp6S); + ONEITERBUTTERFLY(7, temp7, temp7S); + } +#undef ONEITERBUTTERFLY + { + register vector signed int vsum; + + register vector signed short line0 = vec_add(temp0, temp1); + register vector signed short line1 = vec_sub(temp0, temp1); + register vector signed short line2 = vec_add(temp2, temp3); + register vector signed short line3 = vec_sub(temp2, temp3); + register vector signed short line4 = vec_add(temp4, temp5); + register vector signed short line5 = vec_sub(temp4, temp5); + register vector signed short line6 = vec_add(temp6, temp7); + register vector signed short line7 = vec_sub(temp6, temp7); + + register vector signed short line0B = vec_add(line0, line2); + register vector signed short line2B = vec_sub(line0, line2); + register vector signed short line1B = vec_add(line1, line3); + register vector signed short line3B = vec_sub(line1, line3); + register vector signed short line4B = vec_add(line4, line6); + register vector signed short line6B = vec_sub(line4, line6); + register vector signed short line5B = vec_add(line5, line7); + register vector signed short line7B = vec_sub(line5, line7); + + register vector signed short line0C = vec_add(line0B, line4B); + register vector signed short line4C = vec_sub(line0B, line4B); + register vector signed short line1C = vec_add(line1B, line5B); + register vector signed short line5C = vec_sub(line1B, line5B); + register vector signed short line2C = vec_add(line2B, line6B); + register vector signed 
short line6C = vec_sub(line2B, line6B); + register vector signed short line3C = vec_add(line3B, line7B); + register vector signed short line7C = vec_sub(line3B, line7B); + + register vector signed short line0S = vec_add(temp0S, temp1S); + register vector signed short line1S = vec_sub(temp0S, temp1S); + register vector signed short line2S = vec_add(temp2S, temp3S); + register vector signed short line3S = vec_sub(temp2S, temp3S); + register vector signed short line4S = vec_add(temp4S, temp5S); + register vector signed short line5S = vec_sub(temp4S, temp5S); + register vector signed short line6S = vec_add(temp6S, temp7S); + register vector signed short line7S = vec_sub(temp6S, temp7S); + + register vector signed short line0BS = vec_add(line0S, line2S); + register vector signed short line2BS = vec_sub(line0S, line2S); + register vector signed short line1BS = vec_add(line1S, line3S); + register vector signed short line3BS = vec_sub(line1S, line3S); + register vector signed short line4BS = vec_add(line4S, line6S); + register vector signed short line6BS = vec_sub(line4S, line6S); + register vector signed short line5BS = vec_add(line5S, line7S); + register vector signed short line7BS = vec_sub(line5S, line7S); + + register vector signed short line0CS = vec_add(line0BS, line4BS); + register vector signed short line4CS = vec_sub(line0BS, line4BS); + register vector signed short line1CS = vec_add(line1BS, line5BS); + register vector signed short line5CS = vec_sub(line1BS, line5BS); + register vector signed short line2CS = vec_add(line2BS, line6BS); + register vector signed short line6CS = vec_sub(line2BS, line6BS); + register vector signed short line3CS = vec_add(line3BS, line7BS); + register vector signed short line7CS = vec_sub(line3BS, line7BS); + + vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0)); + vsum = vec_sum4s(vec_abs(line1C), vsum); + vsum = vec_sum4s(vec_abs(line2C), vsum); + vsum = vec_sum4s(vec_abs(line3C), vsum); + vsum = vec_sum4s(vec_abs(line4C), vsum); + vsum = vec_sum4s(vec_abs(line5C), vsum); + vsum = vec_sum4s(vec_abs(line6C), vsum); + vsum = vec_sum4s(vec_abs(line7C), vsum); + + vsum = vec_sum4s(vec_abs(line0CS), vsum); + vsum = vec_sum4s(vec_abs(line1CS), vsum); + vsum = vec_sum4s(vec_abs(line2CS), vsum); + vsum = vec_sum4s(vec_abs(line3CS), vsum); + vsum = vec_sum4s(vec_abs(line4CS), vsum); + vsum = vec_sum4s(vec_abs(line5CS), vsum); + vsum = vec_sum4s(vec_abs(line6CS), vsum); + vsum = vec_sum4s(vec_abs(line7CS), vsum); + vsum = vec_sums(vsum, (vector signed int) vzero); + vsum = vec_splat(vsum, 3); + vec_ste(vsum, 0, &sum); + } + return sum; +} + +static int hadamard8_diff16_altivec(MpegEncContext *s, uint8_t *dst, + uint8_t *src, int stride, int h) +{ + int score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8); + + if (h == 16) { + dst += 8 * stride; + src += 8 * stride; + score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8); + } + return score; +} +#endif /* HAVE_ALTIVEC */ + +av_cold void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx) +{ +#if HAVE_ALTIVEC + if (!PPC_ALTIVEC(av_get_cpu_flags())) + return; + + c->pix_abs[0][1] = sad16_x2_altivec; + c->pix_abs[0][2] = sad16_y2_altivec; + c->pix_abs[0][3] = sad16_xy2_altivec; + c->pix_abs[0][0] = sad16_altivec; + c->pix_abs[1][0] = sad8_altivec; + + c->sad[0] = sad16_altivec; + c->sad[1] = sad8_altivec; + c->sse[0] = sse16_altivec; + c->sse[1] = sse8_altivec; + + c->hadamard8_diff[0] = hadamard8_diff16_altivec; + c->hadamard8_diff[1] = hadamard8_diff8x8_altivec; +#endif /* HAVE_ALTIVEC */ +} diff 
--git a/libavcodec/svq1enc.c b/libavcodec/svq1enc.c index 5e3cc55efe..506ee9be9e 100644 --- a/libavcodec/svq1enc.c +++ b/libavcodec/svq1enc.c @@ -27,8 +27,8 @@ */ #include "avcodec.h" -#include "dsputil.h" #include "hpeldsp.h" +#include "me_cmp.h" #include "mpegvideo.h" #include "h263.h" #include "internal.h" @@ -306,7 +306,7 @@ static int svq1_encode_plane(SVQ1EncContext *s, int plane, s->m.current_picture.motion_val[0] = s->motion_val8[plane] + 2; s->m.p_mv_table = s->motion_val16[plane] + s->m.mb_stride + 1; - s->m.dsp = s->dsp; // move + s->m.mecc = s->mecc; // move ff_init_me(&s->m); s->m.me.dia_size = s->avctx->dia_size; @@ -431,8 +431,8 @@ static int svq1_encode_plane(SVQ1EncContext *s, int plane, best = score[1] <= score[0]; vlc = ff_svq1_block_type_vlc[SVQ1_BLOCK_SKIP]; - score[2] = s->dsp.sse[0](NULL, src + 16 * x, ref, - stride, 16); + score[2] = s->mecc.sse[0](NULL, src + 16 * x, ref, + stride, 16); score[2] += vlc[1] * lambda; if (score[2] < score[best] && mx == 0 && my == 0) { best = 2; @@ -509,8 +509,8 @@ static av_cold int svq1_encode_init(AVCodecContext *avctx) SVQ1EncContext *const s = avctx->priv_data; int ret; - ff_dsputil_init(&s->dsp, avctx); ff_hpeldsp_init(&s->hdsp, avctx->flags); + ff_me_cmp_init(&s->mecc, avctx); ff_mpegvideoencdsp_init(&s->m.mpvencdsp, avctx); avctx->coded_frame = av_frame_alloc(); diff --git a/libavcodec/svq1enc.h b/libavcodec/svq1enc.h index 1fe2815f19..516e875657 100644 --- a/libavcodec/svq1enc.h +++ b/libavcodec/svq1enc.h @@ -25,9 +25,9 @@ #include "libavutil/frame.h" #include "avcodec.h" -#include "dsputil.h" #include "get_bits.h" #include "hpeldsp.h" +#include "me_cmp.h" #include "mpegvideo.h" #include "put_bits.h" @@ -37,7 +37,7 @@ typedef struct SVQ1EncContext { * of MpegEncContext, so this will be removed then. 
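
The svq1enc hunk above also shows the caller-side pattern introduced by this split: embed an MECmpContext, fill it once with ff_me_cmp_init(), and call the comparison functions through it. A minimal sketch of that usage, where MyEncContext, my_enc_init and my_block_cost are made-up names for illustration:

#include <stdint.h>
#include "avcodec.h"
#include "me_cmp.h"

/* Hypothetical encoder context fragment holding the comparison table. */
typedef struct MyEncContext {
    MECmpContext mecc;
} MyEncContext;

static void my_enc_init(MyEncContext *s, AVCodecContext *avctx)
{
    /* Fills s->mecc with the C implementations plus any arch-specific
     * overrides (such as the AltiVec ones registered above). */
    ff_me_cmp_init(&s->mecc, avctx);
}

static int my_block_cost(MyEncContext *s, uint8_t *cur, uint8_t *ref,
                         int stride)
{
    /* Index 0 selects the 16x16 variant, index 1 the 8x8 one; the first
     * argument may be NULL when no MpegEncContext is involved. */
    return s->mecc.sad[0](NULL, cur, ref, stride, 16);
}
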
*/ MpegEncContext m; AVCodecContext *avctx; - DSPContext dsp; + MECmpContext mecc; HpelDSPContext hdsp; AVFrame *current_picture; AVFrame *last_picture; diff --git a/libavcodec/utils.c b/libavcodec/utils.c index 3af56e9f46..2abc37696f 100644 --- a/libavcodec/utils.c +++ b/libavcodec/utils.c @@ -39,8 +39,8 @@ #include "libavutil/samplefmt.h" #include "libavutil/dict.h" #include "avcodec.h" -#include "dsputil.h" #include "libavutil/opt.h" +#include "me_cmp.h" #include "mpegvideo.h" #include "thread.h" #include "internal.h" @@ -100,8 +100,8 @@ static av_cold void avcodec_init(void) return; initialized = 1; - if (CONFIG_DSPUTIL) - ff_dsputil_static_init(); + if (CONFIG_ME_CMP) + ff_me_cmp_init_static(); } int av_codec_is_encoder(const AVCodec *codec) diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index c8804bb37c..3e9b41c890 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -6,7 +6,6 @@ OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp_init.o OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp.o OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp_init.o OBJS-$(CONFIG_DCT) += x86/dct_init.o -OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_init.o OBJS-$(CONFIG_FDCTDSP) += x86/fdctdsp_init.o OBJS-$(CONFIG_FFT) += x86/fft_init.o OBJS-$(CONFIG_H263DSP) += x86/h263dsp_init.o @@ -19,6 +18,7 @@ OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp_init.o OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp_mmx.o OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp_init.o OBJS-$(CONFIG_LPC) += x86/lpc.o +OBJS-$(CONFIG_ME_CMP) += x86/me_cmp_init.o OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodsp.o OBJS-$(CONFIG_MPEGVIDEO) += x86/mpegvideo.o \ x86/mpegvideodsp.o @@ -70,7 +70,6 @@ YASM-OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp.o YASM-OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp.o YASM-OBJS-$(CONFIG_DCT) += x86/dct32.o YASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o -YASM-OBJS-$(CONFIG_DSPUTIL) += x86/dsputilenc.o YASM-OBJS-$(CONFIG_FFT) += x86/fft.o YASM-OBJS-$(CONFIG_H263DSP) += x86/h263_loopfilter.o YASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \ @@ -90,6 +89,7 @@ YASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_8bit.o \ YASM-OBJS-$(CONFIG_HPELDSP) += x86/fpel.o \ x86/hpeldsp.o YASM-OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp.o +YASM-OBJS-$(CONFIG_ME_CMP) += x86/me_cmp.o YASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o YASM-OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoencdsp.o YASM-OBJS-$(CONFIG_PIXBLOCKDSP) += x86/pixblockdsp.o diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c deleted file mode 100644 index 1303757dd2..0000000000 --- a/libavcodec/x86/dsputil_init.c +++ /dev/null @@ -1,1321 +0,0 @@ -/* - * MMX optimized DSP utils - * Copyright (c) 2000, 2001 Fabrice Bellard - * Copyright (c) 2002-2004 Michael Niedermayer - * - * MMX optimization by Nick Kurshev - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/x86/asm.h" -#include "libavutil/x86/cpu.h" -#include "libavcodec/dsputil.h" -#include "libavcodec/mpegvideo.h" - -#if HAVE_INLINE_ASM - -static int sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, - int line_size, int h) -{ - int tmp; - - __asm__ volatile ( - "movl %4, %%ecx \n" - "shr $1, %%ecx \n" - "pxor %%mm0, %%mm0 \n" /* mm0 = 0 */ - "pxor %%mm7, %%mm7 \n" /* mm7 holds the sum */ - "1: \n" - "movq (%0), %%mm1 \n" /* mm1 = pix1[0][0 - 7] */ - "movq (%1), %%mm2 \n" /* mm2 = pix2[0][0 - 7] */ - "movq (%0, %3), %%mm3 \n" /* mm3 = pix1[1][0 - 7] */ - "movq (%1, %3), %%mm4 \n" /* mm4 = pix2[1][0 - 7] */ - - /* todo: mm1-mm2, mm3-mm4 */ - /* algo: subtract mm1 from mm2 with saturation and vice versa */ - /* OR the results to get absolute difference */ - "movq %%mm1, %%mm5 \n" - "movq %%mm3, %%mm6 \n" - "psubusb %%mm2, %%mm1 \n" - "psubusb %%mm4, %%mm3 \n" - "psubusb %%mm5, %%mm2 \n" - "psubusb %%mm6, %%mm4 \n" - - "por %%mm1, %%mm2 \n" - "por %%mm3, %%mm4 \n" - - /* now convert to 16-bit vectors so we can square them */ - "movq %%mm2, %%mm1 \n" - "movq %%mm4, %%mm3 \n" - - "punpckhbw %%mm0, %%mm2 \n" - "punpckhbw %%mm0, %%mm4 \n" - "punpcklbw %%mm0, %%mm1 \n" /* mm1 now spread over (mm1, mm2) */ - "punpcklbw %%mm0, %%mm3 \n" /* mm4 now spread over (mm3, mm4) */ - - "pmaddwd %%mm2, %%mm2 \n" - "pmaddwd %%mm4, %%mm4 \n" - "pmaddwd %%mm1, %%mm1 \n" - "pmaddwd %%mm3, %%mm3 \n" - - "lea (%0, %3, 2), %0 \n" /* pix1 += 2 * line_size */ - "lea (%1, %3, 2), %1 \n" /* pix2 += 2 * line_size */ - - "paddd %%mm2, %%mm1 \n" - "paddd %%mm4, %%mm3 \n" - "paddd %%mm1, %%mm7 \n" - "paddd %%mm3, %%mm7 \n" - - "decl %%ecx \n" - "jnz 1b \n" - - "movq %%mm7, %%mm1 \n" - "psrlq $32, %%mm7 \n" /* shift hi dword to lo */ - "paddd %%mm7, %%mm1 \n" - "movd %%mm1, %2 \n" - : "+r" (pix1), "+r" (pix2), "=r" (tmp) - : "r" ((x86_reg) line_size), "m" (h) - : "%ecx"); - - return tmp; -} - -static int sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, - int line_size, int h) -{ - int tmp; - - __asm__ volatile ( - "movl %4, %%ecx\n" - "pxor %%mm0, %%mm0\n" /* mm0 = 0 */ - "pxor %%mm7, %%mm7\n" /* mm7 holds the sum */ - "1:\n" - "movq (%0), %%mm1\n" /* mm1 = pix1[0 - 7] */ - "movq (%1), %%mm2\n" /* mm2 = pix2[0 - 7] */ - "movq 8(%0), %%mm3\n" /* mm3 = pix1[8 - 15] */ - "movq 8(%1), %%mm4\n" /* mm4 = pix2[8 - 15] */ - - /* todo: mm1-mm2, mm3-mm4 */ - /* algo: subtract mm1 from mm2 with saturation and vice versa */ - /* OR the results to get absolute difference */ - "movq %%mm1, %%mm5\n" - "movq %%mm3, %%mm6\n" - "psubusb %%mm2, %%mm1\n" - "psubusb %%mm4, %%mm3\n" - "psubusb %%mm5, %%mm2\n" - "psubusb %%mm6, %%mm4\n" - - "por %%mm1, %%mm2\n" - "por %%mm3, %%mm4\n" - - /* now convert to 16-bit vectors so we can square them */ - "movq %%mm2, %%mm1\n" - "movq %%mm4, %%mm3\n" - - "punpckhbw %%mm0, %%mm2\n" - "punpckhbw %%mm0, %%mm4\n" - "punpcklbw %%mm0, %%mm1\n" /* mm1 now spread over (mm1, mm2) */ - "punpcklbw %%mm0, %%mm3\n" /* mm4 now spread over (mm3, mm4) */ - - "pmaddwd %%mm2, %%mm2\n" - "pmaddwd %%mm4, %%mm4\n" - "pmaddwd %%mm1, %%mm1\n" - "pmaddwd %%mm3, %%mm3\n" - - "add %3, %0\n" - "add %3, %1\n" - - "paddd %%mm2, %%mm1\n" - "paddd %%mm4, %%mm3\n" - "paddd %%mm1, %%mm7\n" - "paddd %%mm3, %%mm7\n" - - 
"decl %%ecx\n" - "jnz 1b\n" - - "movq %%mm7, %%mm1\n" - "psrlq $32, %%mm7\n" /* shift hi dword to lo */ - "paddd %%mm7, %%mm1\n" - "movd %%mm1, %2\n" - : "+r" (pix1), "+r" (pix2), "=r" (tmp) - : "r" ((x86_reg) line_size), "m" (h) - : "%ecx"); - - return tmp; -} - -static int hf_noise8_mmx(uint8_t *pix1, int line_size, int h) -{ - int tmp; - - __asm__ volatile ( - "movl %3, %%ecx\n" - "pxor %%mm7, %%mm7\n" - "pxor %%mm6, %%mm6\n" - - "movq (%0), %%mm0\n" - "movq %%mm0, %%mm1\n" - "psllq $8, %%mm0\n" - "psrlq $8, %%mm1\n" - "psrlq $8, %%mm0\n" - "movq %%mm0, %%mm2\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7, %%mm0\n" - "punpcklbw %%mm7, %%mm1\n" - "punpckhbw %%mm7, %%mm2\n" - "punpckhbw %%mm7, %%mm3\n" - "psubw %%mm1, %%mm0\n" - "psubw %%mm3, %%mm2\n" - - "add %2, %0\n" - - "movq (%0), %%mm4\n" - "movq %%mm4, %%mm1\n" - "psllq $8, %%mm4\n" - "psrlq $8, %%mm1\n" - "psrlq $8, %%mm4\n" - "movq %%mm4, %%mm5\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7, %%mm4\n" - "punpcklbw %%mm7, %%mm1\n" - "punpckhbw %%mm7, %%mm5\n" - "punpckhbw %%mm7, %%mm3\n" - "psubw %%mm1, %%mm4\n" - "psubw %%mm3, %%mm5\n" - "psubw %%mm4, %%mm0\n" - "psubw %%mm5, %%mm2\n" - "pxor %%mm3, %%mm3\n" - "pxor %%mm1, %%mm1\n" - "pcmpgtw %%mm0, %%mm3\n\t" - "pcmpgtw %%mm2, %%mm1\n\t" - "pxor %%mm3, %%mm0\n" - "pxor %%mm1, %%mm2\n" - "psubw %%mm3, %%mm0\n" - "psubw %%mm1, %%mm2\n" - "paddw %%mm0, %%mm2\n" - "paddw %%mm2, %%mm6\n" - - "add %2, %0\n" - "1:\n" - - "movq (%0), %%mm0\n" - "movq %%mm0, %%mm1\n" - "psllq $8, %%mm0\n" - "psrlq $8, %%mm1\n" - "psrlq $8, %%mm0\n" - "movq %%mm0, %%mm2\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7, %%mm0\n" - "punpcklbw %%mm7, %%mm1\n" - "punpckhbw %%mm7, %%mm2\n" - "punpckhbw %%mm7, %%mm3\n" - "psubw %%mm1, %%mm0\n" - "psubw %%mm3, %%mm2\n" - "psubw %%mm0, %%mm4\n" - "psubw %%mm2, %%mm5\n" - "pxor %%mm3, %%mm3\n" - "pxor %%mm1, %%mm1\n" - "pcmpgtw %%mm4, %%mm3\n\t" - "pcmpgtw %%mm5, %%mm1\n\t" - "pxor %%mm3, %%mm4\n" - "pxor %%mm1, %%mm5\n" - "psubw %%mm3, %%mm4\n" - "psubw %%mm1, %%mm5\n" - "paddw %%mm4, %%mm5\n" - "paddw %%mm5, %%mm6\n" - - "add %2, %0\n" - - "movq (%0), %%mm4\n" - "movq %%mm4, %%mm1\n" - "psllq $8, %%mm4\n" - "psrlq $8, %%mm1\n" - "psrlq $8, %%mm4\n" - "movq %%mm4, %%mm5\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7, %%mm4\n" - "punpcklbw %%mm7, %%mm1\n" - "punpckhbw %%mm7, %%mm5\n" - "punpckhbw %%mm7, %%mm3\n" - "psubw %%mm1, %%mm4\n" - "psubw %%mm3, %%mm5\n" - "psubw %%mm4, %%mm0\n" - "psubw %%mm5, %%mm2\n" - "pxor %%mm3, %%mm3\n" - "pxor %%mm1, %%mm1\n" - "pcmpgtw %%mm0, %%mm3\n\t" - "pcmpgtw %%mm2, %%mm1\n\t" - "pxor %%mm3, %%mm0\n" - "pxor %%mm1, %%mm2\n" - "psubw %%mm3, %%mm0\n" - "psubw %%mm1, %%mm2\n" - "paddw %%mm0, %%mm2\n" - "paddw %%mm2, %%mm6\n" - - "add %2, %0\n" - "subl $2, %%ecx\n" - " jnz 1b\n" - - "movq %%mm6, %%mm0\n" - "punpcklwd %%mm7, %%mm0\n" - "punpckhwd %%mm7, %%mm6\n" - "paddd %%mm0, %%mm6\n" - - "movq %%mm6, %%mm0\n" - "psrlq $32, %%mm6\n" - "paddd %%mm6, %%mm0\n" - "movd %%mm0, %1\n" - : "+r" (pix1), "=r" (tmp) - : "r" ((x86_reg) line_size), "g" (h - 2) - : "%ecx"); - - return tmp; -} - -static int hf_noise16_mmx(uint8_t *pix1, int line_size, int h) -{ - int tmp; - uint8_t *pix = pix1; - - __asm__ volatile ( - "movl %3, %%ecx\n" - "pxor %%mm7, %%mm7\n" - "pxor %%mm6, %%mm6\n" - - "movq (%0), %%mm0\n" - "movq 1(%0), %%mm1\n" - "movq %%mm0, %%mm2\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7, %%mm0\n" - "punpcklbw %%mm7, %%mm1\n" - "punpckhbw %%mm7, %%mm2\n" - "punpckhbw %%mm7, %%mm3\n" - "psubw %%mm1, %%mm0\n" - "psubw %%mm3, %%mm2\n" - - "add 
%2, %0\n" - - "movq (%0), %%mm4\n" - "movq 1(%0), %%mm1\n" - "movq %%mm4, %%mm5\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7, %%mm4\n" - "punpcklbw %%mm7, %%mm1\n" - "punpckhbw %%mm7, %%mm5\n" - "punpckhbw %%mm7, %%mm3\n" - "psubw %%mm1, %%mm4\n" - "psubw %%mm3, %%mm5\n" - "psubw %%mm4, %%mm0\n" - "psubw %%mm5, %%mm2\n" - "pxor %%mm3, %%mm3\n" - "pxor %%mm1, %%mm1\n" - "pcmpgtw %%mm0, %%mm3\n\t" - "pcmpgtw %%mm2, %%mm1\n\t" - "pxor %%mm3, %%mm0\n" - "pxor %%mm1, %%mm2\n" - "psubw %%mm3, %%mm0\n" - "psubw %%mm1, %%mm2\n" - "paddw %%mm0, %%mm2\n" - "paddw %%mm2, %%mm6\n" - - "add %2, %0\n" - "1:\n" - - "movq (%0), %%mm0\n" - "movq 1(%0), %%mm1\n" - "movq %%mm0, %%mm2\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7, %%mm0\n" - "punpcklbw %%mm7, %%mm1\n" - "punpckhbw %%mm7, %%mm2\n" - "punpckhbw %%mm7, %%mm3\n" - "psubw %%mm1, %%mm0\n" - "psubw %%mm3, %%mm2\n" - "psubw %%mm0, %%mm4\n" - "psubw %%mm2, %%mm5\n" - "pxor %%mm3, %%mm3\n" - "pxor %%mm1, %%mm1\n" - "pcmpgtw %%mm4, %%mm3\n\t" - "pcmpgtw %%mm5, %%mm1\n\t" - "pxor %%mm3, %%mm4\n" - "pxor %%mm1, %%mm5\n" - "psubw %%mm3, %%mm4\n" - "psubw %%mm1, %%mm5\n" - "paddw %%mm4, %%mm5\n" - "paddw %%mm5, %%mm6\n" - - "add %2, %0\n" - - "movq (%0), %%mm4\n" - "movq 1(%0), %%mm1\n" - "movq %%mm4, %%mm5\n" - "movq %%mm1, %%mm3\n" - "punpcklbw %%mm7, %%mm4\n" - "punpcklbw %%mm7, %%mm1\n" - "punpckhbw %%mm7, %%mm5\n" - "punpckhbw %%mm7, %%mm3\n" - "psubw %%mm1, %%mm4\n" - "psubw %%mm3, %%mm5\n" - "psubw %%mm4, %%mm0\n" - "psubw %%mm5, %%mm2\n" - "pxor %%mm3, %%mm3\n" - "pxor %%mm1, %%mm1\n" - "pcmpgtw %%mm0, %%mm3\n\t" - "pcmpgtw %%mm2, %%mm1\n\t" - "pxor %%mm3, %%mm0\n" - "pxor %%mm1, %%mm2\n" - "psubw %%mm3, %%mm0\n" - "psubw %%mm1, %%mm2\n" - "paddw %%mm0, %%mm2\n" - "paddw %%mm2, %%mm6\n" - - "add %2, %0\n" - "subl $2, %%ecx\n" - " jnz 1b\n" - - "movq %%mm6, %%mm0\n" - "punpcklwd %%mm7, %%mm0\n" - "punpckhwd %%mm7, %%mm6\n" - "paddd %%mm0, %%mm6\n" - - "movq %%mm6, %%mm0\n" - "psrlq $32, %%mm6\n" - "paddd %%mm6, %%mm0\n" - "movd %%mm0, %1\n" - : "+r" (pix1), "=r" (tmp) - : "r" ((x86_reg) line_size), "g" (h - 2) - : "%ecx"); - - return tmp + hf_noise8_mmx(pix + 8, line_size, h); -} - -static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2, - int line_size, int h) -{ - int score1, score2; - - if (c) - score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h); - else - score1 = sse16_mmx(c, pix1, pix2, line_size, h); - score2 = hf_noise16_mmx(pix1, line_size, h) - - hf_noise16_mmx(pix2, line_size, h); - - if (c) - return score1 + FFABS(score2) * c->avctx->nsse_weight; - else - return score1 + FFABS(score2) * 8; -} - -static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2, - int line_size, int h) -{ - int score1 = sse8_mmx(c, pix1, pix2, line_size, h); - int score2 = hf_noise8_mmx(pix1, line_size, h) - - hf_noise8_mmx(pix2, line_size, h); - - if (c) - return score1 + FFABS(score2) * c->avctx->nsse_weight; - else - return score1 + FFABS(score2) * 8; -} - -static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy, - int line_size, int h) -{ - int tmp; - - assert((((int) pix) & 7) == 0); - assert((line_size & 7) == 0); - -#define SUM(in0, in1, out0, out1) \ - "movq (%0), %%mm2\n" \ - "movq 8(%0), %%mm3\n" \ - "add %2,%0\n" \ - "movq %%mm2, " #out0 "\n" \ - "movq %%mm3, " #out1 "\n" \ - "psubusb " #in0 ", %%mm2\n" \ - "psubusb " #in1 ", %%mm3\n" \ - "psubusb " #out0 ", " #in0 "\n" \ - "psubusb " #out1 ", " #in1 "\n" \ - "por %%mm2, " #in0 "\n" \ - "por %%mm3, " #in1 "\n" \ - "movq " #in0 ", %%mm2\n" \ - "movq " #in1 
", %%mm3\n" \ - "punpcklbw %%mm7, " #in0 "\n" \ - "punpcklbw %%mm7, " #in1 "\n" \ - "punpckhbw %%mm7, %%mm2\n" \ - "punpckhbw %%mm7, %%mm3\n" \ - "paddw " #in1 ", " #in0 "\n" \ - "paddw %%mm3, %%mm2\n" \ - "paddw %%mm2, " #in0 "\n" \ - "paddw " #in0 ", %%mm6\n" - - - __asm__ volatile ( - "movl %3, %%ecx\n" - "pxor %%mm6, %%mm6\n" - "pxor %%mm7, %%mm7\n" - "movq (%0), %%mm0\n" - "movq 8(%0), %%mm1\n" - "add %2, %0\n" - "jmp 2f\n" - "1:\n" - - SUM(%%mm4, %%mm5, %%mm0, %%mm1) - "2:\n" - SUM(%%mm0, %%mm1, %%mm4, %%mm5) - - "subl $2, %%ecx\n" - "jnz 1b\n" - - "movq %%mm6, %%mm0\n" - "psrlq $32, %%mm6\n" - "paddw %%mm6, %%mm0\n" - "movq %%mm0, %%mm6\n" - "psrlq $16, %%mm0\n" - "paddw %%mm6, %%mm0\n" - "movd %%mm0, %1\n" - : "+r" (pix), "=r" (tmp) - : "r" ((x86_reg) line_size), "m" (h) - : "%ecx"); - - return tmp & 0xFFFF; -} -#undef SUM - -static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy, - int line_size, int h) -{ - int tmp; - - assert((((int) pix) & 7) == 0); - assert((line_size & 7) == 0); - -#define SUM(in0, in1, out0, out1) \ - "movq (%0), " #out0 "\n" \ - "movq 8(%0), " #out1 "\n" \ - "add %2, %0\n" \ - "psadbw " #out0 ", " #in0 "\n" \ - "psadbw " #out1 ", " #in1 "\n" \ - "paddw " #in1 ", " #in0 "\n" \ - "paddw " #in0 ", %%mm6\n" - - __asm__ volatile ( - "movl %3, %%ecx\n" - "pxor %%mm6, %%mm6\n" - "pxor %%mm7, %%mm7\n" - "movq (%0), %%mm0\n" - "movq 8(%0), %%mm1\n" - "add %2, %0\n" - "jmp 2f\n" - "1:\n" - - SUM(%%mm4, %%mm5, %%mm0, %%mm1) - "2:\n" - SUM(%%mm0, %%mm1, %%mm4, %%mm5) - - "subl $2, %%ecx\n" - "jnz 1b\n" - - "movd %%mm6, %1\n" - : "+r" (pix), "=r" (tmp) - : "r" ((x86_reg) line_size), "m" (h) - : "%ecx"); - - return tmp; -} -#undef SUM - -static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, - int line_size, int h) -{ - int tmp; - - assert((((int) pix1) & 7) == 0); - assert((((int) pix2) & 7) == 0); - assert((line_size & 7) == 0); - -#define SUM(in0, in1, out0, out1) \ - "movq (%0), %%mm2\n" \ - "movq (%1), " #out0 "\n" \ - "movq 8(%0), %%mm3\n" \ - "movq 8(%1), " #out1 "\n" \ - "add %3, %0\n" \ - "add %3, %1\n" \ - "psubb " #out0 ", %%mm2\n" \ - "psubb " #out1 ", %%mm3\n" \ - "pxor %%mm7, %%mm2\n" \ - "pxor %%mm7, %%mm3\n" \ - "movq %%mm2, " #out0 "\n" \ - "movq %%mm3, " #out1 "\n" \ - "psubusb " #in0 ", %%mm2\n" \ - "psubusb " #in1 ", %%mm3\n" \ - "psubusb " #out0 ", " #in0 "\n" \ - "psubusb " #out1 ", " #in1 "\n" \ - "por %%mm2, " #in0 "\n" \ - "por %%mm3, " #in1 "\n" \ - "movq " #in0 ", %%mm2\n" \ - "movq " #in1 ", %%mm3\n" \ - "punpcklbw %%mm7, " #in0 "\n" \ - "punpcklbw %%mm7, " #in1 "\n" \ - "punpckhbw %%mm7, %%mm2\n" \ - "punpckhbw %%mm7, %%mm3\n" \ - "paddw " #in1 ", " #in0 "\n" \ - "paddw %%mm3, %%mm2\n" \ - "paddw %%mm2, " #in0 "\n" \ - "paddw " #in0 ", %%mm6\n" - - - __asm__ volatile ( - "movl %4, %%ecx\n" - "pxor %%mm6, %%mm6\n" - "pcmpeqw %%mm7, %%mm7\n" - "psllw $15, %%mm7\n" - "packsswb %%mm7, %%mm7\n" - "movq (%0), %%mm0\n" - "movq (%1), %%mm2\n" - "movq 8(%0), %%mm1\n" - "movq 8(%1), %%mm3\n" - "add %3, %0\n" - "add %3, %1\n" - "psubb %%mm2, %%mm0\n" - "psubb %%mm3, %%mm1\n" - "pxor %%mm7, %%mm0\n" - "pxor %%mm7, %%mm1\n" - "jmp 2f\n" - "1:\n" - - SUM(%%mm4, %%mm5, %%mm0, %%mm1) - "2:\n" - SUM(%%mm0, %%mm1, %%mm4, %%mm5) - - "subl $2, %%ecx\n" - "jnz 1b\n" - - "movq %%mm6, %%mm0\n" - "psrlq $32, %%mm6\n" - "paddw %%mm6, %%mm0\n" - "movq %%mm0, %%mm6\n" - "psrlq $16, %%mm0\n" - "paddw %%mm6, %%mm0\n" - "movd %%mm0, %2\n" - : "+r" (pix1), "+r" (pix2), "=r" (tmp) - : "r" ((x86_reg) line_size), "m" (h) - : "%ecx"); - 
- return tmp & 0x7FFF; -} -#undef SUM - -static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, - int line_size, int h) -{ - int tmp; - - assert((((int) pix1) & 7) == 0); - assert((((int) pix2) & 7) == 0); - assert((line_size & 7) == 0); - -#define SUM(in0, in1, out0, out1) \ - "movq (%0), " #out0 "\n" \ - "movq (%1), %%mm2\n" \ - "movq 8(%0), " #out1 "\n" \ - "movq 8(%1), %%mm3\n" \ - "add %3, %0\n" \ - "add %3, %1\n" \ - "psubb %%mm2, " #out0 "\n" \ - "psubb %%mm3, " #out1 "\n" \ - "pxor %%mm7, " #out0 "\n" \ - "pxor %%mm7, " #out1 "\n" \ - "psadbw " #out0 ", " #in0 "\n" \ - "psadbw " #out1 ", " #in1 "\n" \ - "paddw " #in1 ", " #in0 "\n" \ - "paddw " #in0 ", %%mm6\n " - - __asm__ volatile ( - "movl %4, %%ecx\n" - "pxor %%mm6, %%mm6\n" - "pcmpeqw %%mm7, %%mm7\n" - "psllw $15, %%mm7\n" - "packsswb %%mm7, %%mm7\n" - "movq (%0), %%mm0\n" - "movq (%1), %%mm2\n" - "movq 8(%0), %%mm1\n" - "movq 8(%1), %%mm3\n" - "add %3, %0\n" - "add %3, %1\n" - "psubb %%mm2, %%mm0\n" - "psubb %%mm3, %%mm1\n" - "pxor %%mm7, %%mm0\n" - "pxor %%mm7, %%mm1\n" - "jmp 2f\n" - "1:\n" - - SUM(%%mm4, %%mm5, %%mm0, %%mm1) - "2:\n" - SUM(%%mm0, %%mm1, %%mm4, %%mm5) - - "subl $2, %%ecx\n" - "jnz 1b\n" - - "movd %%mm6, %2\n" - : "+r" (pix1), "+r" (pix2), "=r" (tmp) - : "r" ((x86_reg) line_size), "m" (h) - : "%ecx"); - - return tmp; -} -#undef SUM - -#define MMABS_MMX(a,z) \ - "pxor " #z ", " #z " \n\t" \ - "pcmpgtw " #a ", " #z " \n\t" \ - "pxor " #z ", " #a " \n\t" \ - "psubw " #z ", " #a " \n\t" - -#define MMABS_MMXEXT(a, z) \ - "pxor " #z ", " #z " \n\t" \ - "psubw " #a ", " #z " \n\t" \ - "pmaxsw " #z ", " #a " \n\t" - -#define MMABS_SSSE3(a,z) \ - "pabsw " #a ", " #a " \n\t" - -#define MMABS_SUM(a,z, sum) \ - MMABS(a,z) \ - "paddusw " #a ", " #sum " \n\t" - -/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get - * up to about 100k on extreme inputs. But that's very unlikely to occur in - * natural video, and it's even more unlikely to not have any alternative - * mvs/modes with lower cost. 
*/ -#define HSUM_MMX(a, t, dst) \ - "movq " #a ", " #t " \n\t" \ - "psrlq $32, " #a " \n\t" \ - "paddusw " #t ", " #a " \n\t" \ - "movq " #a ", " #t " \n\t" \ - "psrlq $16, " #a " \n\t" \ - "paddusw " #t ", " #a " \n\t" \ - "movd " #a ", " #dst " \n\t" \ - -#define HSUM_MMXEXT(a, t, dst) \ - "pshufw $0x0E, " #a ", " #t " \n\t" \ - "paddusw " #t ", " #a " \n\t" \ - "pshufw $0x01, " #a ", " #t " \n\t" \ - "paddusw " #t ", " #a " \n\t" \ - "movd " #a ", " #dst " \n\t" \ - -#define HSUM_SSE2(a, t, dst) \ - "movhlps " #a ", " #t " \n\t" \ - "paddusw " #t ", " #a " \n\t" \ - "pshuflw $0x0E, " #a ", " #t " \n\t" \ - "paddusw " #t ", " #a " \n\t" \ - "pshuflw $0x01, " #a ", " #t " \n\t" \ - "paddusw " #t ", " #a " \n\t" \ - "movd " #a ", " #dst " \n\t" \ - -#define DCT_SAD4(m, mm, o) \ - "mov"#m" "#o" + 0(%1), " #mm "2 \n\t" \ - "mov"#m" "#o" + 16(%1), " #mm "3 \n\t" \ - "mov"#m" "#o" + 32(%1), " #mm "4 \n\t" \ - "mov"#m" "#o" + 48(%1), " #mm "5 \n\t" \ - MMABS_SUM(mm ## 2, mm ## 6, mm ## 0) \ - MMABS_SUM(mm ## 3, mm ## 7, mm ## 1) \ - MMABS_SUM(mm ## 4, mm ## 6, mm ## 0) \ - MMABS_SUM(mm ## 5, mm ## 7, mm ## 1) \ - -#define DCT_SAD_MMX \ - "pxor %%mm0, %%mm0 \n\t" \ - "pxor %%mm1, %%mm1 \n\t" \ - DCT_SAD4(q, %%mm, 0) \ - DCT_SAD4(q, %%mm, 8) \ - DCT_SAD4(q, %%mm, 64) \ - DCT_SAD4(q, %%mm, 72) \ - "paddusw %%mm1, %%mm0 \n\t" \ - HSUM(%%mm0, %%mm1, %0) - -#define DCT_SAD_SSE2 \ - "pxor %%xmm0, %%xmm0 \n\t" \ - "pxor %%xmm1, %%xmm1 \n\t" \ - DCT_SAD4(dqa, %%xmm, 0) \ - DCT_SAD4(dqa, %%xmm, 64) \ - "paddusw %%xmm1, %%xmm0 \n\t" \ - HSUM(%%xmm0, %%xmm1, %0) - -#define DCT_SAD_FUNC(cpu) \ -static int sum_abs_dctelem_ ## cpu(int16_t *block) \ -{ \ - int sum; \ - __asm__ volatile ( \ - DCT_SAD \ - :"=r"(sum) \ - :"r"(block)); \ - return sum & 0xFFFF; \ -} - -#define DCT_SAD DCT_SAD_MMX -#define HSUM(a, t, dst) HSUM_MMX(a, t, dst) -#define MMABS(a, z) MMABS_MMX(a, z) -DCT_SAD_FUNC(mmx) -#undef MMABS -#undef HSUM - -#define HSUM(a, t, dst) HSUM_MMXEXT(a, t, dst) -#define MMABS(a, z) MMABS_MMXEXT(a, z) -DCT_SAD_FUNC(mmxext) -#undef HSUM -#undef DCT_SAD - -#define DCT_SAD DCT_SAD_SSE2 -#define HSUM(a, t, dst) HSUM_SSE2(a, t, dst) -DCT_SAD_FUNC(sse2) -#undef MMABS - -#if HAVE_SSSE3_INLINE -#define MMABS(a, z) MMABS_SSSE3(a, z) -DCT_SAD_FUNC(ssse3) -#undef MMABS -#endif -#undef HSUM -#undef DCT_SAD - - -DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = { - 0x0000000000000000ULL, - 0x0001000100010001ULL, - 0x0002000200020002ULL, -}; - -DECLARE_ASM_CONST(8, uint64_t, bone) = 0x0101010101010101LL; - -static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) -{ - x86_reg len = -(stride * h); - __asm__ volatile ( - ".p2align 4 \n\t" - "1: \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "movq (%2, %%"REG_a"), %%mm2 \n\t" - "movq (%2, %%"REG_a"), %%mm4 \n\t" - "add %3, %%"REG_a" \n\t" - "psubusb %%mm0, %%mm2 \n\t" - "psubusb %%mm4, %%mm0 \n\t" - "movq (%1, %%"REG_a"), %%mm1 \n\t" - "movq (%2, %%"REG_a"), %%mm3 \n\t" - "movq (%2, %%"REG_a"), %%mm5 \n\t" - "psubusb %%mm1, %%mm3 \n\t" - "psubusb %%mm5, %%mm1 \n\t" - "por %%mm2, %%mm0 \n\t" - "por %%mm1, %%mm3 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm3, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm3 \n\t" - "punpckhbw %%mm7, %%mm2 \n\t" - "paddw %%mm1, %%mm0 \n\t" - "paddw %%mm3, %%mm2 \n\t" - "paddw %%mm2, %%mm0 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "add %3, %%"REG_a" \n\t" - " js 1b \n\t" - : "+a" (len) - : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg) stride)); -} - -static inline void 
sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2, - int stride, int h) -{ - __asm__ volatile ( - ".p2align 4 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "psadbw (%2), %%mm0 \n\t" - "psadbw (%2, %3), %%mm1 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "paddw %%mm1, %%mm6 \n\t" - "lea (%1,%3,2), %1 \n\t" - "lea (%2,%3,2), %2 \n\t" - "sub $2, %0 \n\t" - " jg 1b \n\t" - : "+r" (h), "+r" (blk1), "+r" (blk2) - : "r" ((x86_reg) stride)); -} - -static int sad16_sse2(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1, - int stride, int h) -{ - int ret; - __asm__ volatile ( - "pxor %%xmm2, %%xmm2 \n\t" - ".p2align 4 \n\t" - "1: \n\t" - "movdqu (%1), %%xmm0 \n\t" - "movdqu (%1, %4), %%xmm1 \n\t" - "psadbw (%2), %%xmm0 \n\t" - "psadbw (%2, %4), %%xmm1 \n\t" - "paddw %%xmm0, %%xmm2 \n\t" - "paddw %%xmm1, %%xmm2 \n\t" - "lea (%1,%4,2), %1 \n\t" - "lea (%2,%4,2), %2 \n\t" - "sub $2, %0 \n\t" - " jg 1b \n\t" - "movhlps %%xmm2, %%xmm0 \n\t" - "paddw %%xmm0, %%xmm2 \n\t" - "movd %%xmm2, %3 \n\t" - : "+r" (h), "+r" (blk1), "+r" (blk2), "=r" (ret) - : "r" ((x86_reg) stride)); - return ret; -} - -static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2, - int stride, int h) -{ - __asm__ volatile ( - ".p2align 4 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "pavgb 1(%1), %%mm0 \n\t" - "pavgb 1(%1, %3), %%mm1 \n\t" - "psadbw (%2), %%mm0 \n\t" - "psadbw (%2, %3), %%mm1 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "paddw %%mm1, %%mm6 \n\t" - "lea (%1,%3,2), %1 \n\t" - "lea (%2,%3,2), %2 \n\t" - "sub $2, %0 \n\t" - " jg 1b \n\t" - : "+r" (h), "+r" (blk1), "+r" (blk2) - : "r" ((x86_reg) stride)); -} - -static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2, - int stride, int h) -{ - __asm__ volatile ( - "movq (%1), %%mm0 \n\t" - "add %3, %1 \n\t" - ".p2align 4 \n\t" - "1: \n\t" - "movq (%1), %%mm1 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "pavgb %%mm1, %%mm0 \n\t" - "pavgb %%mm2, %%mm1 \n\t" - "psadbw (%2), %%mm0 \n\t" - "psadbw (%2, %3), %%mm1 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "paddw %%mm1, %%mm6 \n\t" - "movq %%mm2, %%mm0 \n\t" - "lea (%1,%3,2), %1 \n\t" - "lea (%2,%3,2), %2 \n\t" - "sub $2, %0 \n\t" - " jg 1b \n\t" - : "+r" (h), "+r" (blk1), "+r" (blk2) - : "r" ((x86_reg) stride)); -} - -static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2, - int stride, int h) -{ - __asm__ volatile ( - "movq "MANGLE(bone)", %%mm5 \n\t" - "movq (%1), %%mm0 \n\t" - "pavgb 1(%1), %%mm0 \n\t" - "add %3, %1 \n\t" - ".p2align 4 \n\t" - "1: \n\t" - "movq (%1), %%mm1 \n\t" - "movq (%1,%3), %%mm2 \n\t" - "pavgb 1(%1), %%mm1 \n\t" - "pavgb 1(%1,%3), %%mm2 \n\t" - "psubusb %%mm5, %%mm1 \n\t" - "pavgb %%mm1, %%mm0 \n\t" - "pavgb %%mm2, %%mm1 \n\t" - "psadbw (%2), %%mm0 \n\t" - "psadbw (%2,%3), %%mm1 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "paddw %%mm1, %%mm6 \n\t" - "movq %%mm2, %%mm0 \n\t" - "lea (%1,%3,2), %1 \n\t" - "lea (%2,%3,2), %2 \n\t" - "sub $2, %0 \n\t" - " jg 1b \n\t" - : "+r" (h), "+r" (blk1), "+r" (blk2) - : "r" ((x86_reg) stride)); -} - -static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, - int stride, int h) -{ - x86_reg len = -(stride * h); - __asm__ volatile ( - ".p2align 4 \n\t" - "1: \n\t" - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "movq (%2, %%"REG_a"), %%mm1 \n\t" - "movq (%1, %%"REG_a"), %%mm2 \n\t" - "movq (%2, %%"REG_a"), %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "paddw %%mm0, %%mm1 \n\t" - "paddw %%mm2, %%mm3 \n\t" - "movq (%3, %%"REG_a"), %%mm4 
\n\t" - "movq (%3, %%"REG_a"), %%mm2 \n\t" - "paddw %%mm5, %%mm1 \n\t" - "paddw %%mm5, %%mm3 \n\t" - "psrlw $1, %%mm1 \n\t" - "psrlw $1, %%mm3 \n\t" - "packuswb %%mm3, %%mm1 \n\t" - "psubusb %%mm1, %%mm4 \n\t" - "psubusb %%mm2, %%mm1 \n\t" - "por %%mm4, %%mm1 \n\t" - "movq %%mm1, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "paddw %%mm1, %%mm0 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "add %4, %%"REG_a" \n\t" - " js 1b \n\t" - : "+a" (len) - : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len), - "r" ((x86_reg) stride)); -} - -static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) -{ - x86_reg len = -(stride * h); - __asm__ volatile ( - "movq (%1, %%"REG_a"), %%mm0 \n\t" - "movq 1(%1, %%"REG_a"), %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "paddw %%mm2, %%mm0 \n\t" - "paddw %%mm3, %%mm1 \n\t" - ".p2align 4 \n\t" - "1: \n\t" - "movq (%2, %%"REG_a"), %%mm2 \n\t" - "movq 1(%2, %%"REG_a"), %%mm4 \n\t" - "movq %%mm2, %%mm3 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddw %%mm4, %%mm2 \n\t" - "paddw %%mm5, %%mm3 \n\t" - "movq 16+"MANGLE(round_tab)", %%mm5 \n\t" - "paddw %%mm2, %%mm0 \n\t" - "paddw %%mm3, %%mm1 \n\t" - "paddw %%mm5, %%mm0 \n\t" - "paddw %%mm5, %%mm1 \n\t" - "movq (%3, %%"REG_a"), %%mm4 \n\t" - "movq (%3, %%"REG_a"), %%mm5 \n\t" - "psrlw $2, %%mm0 \n\t" - "psrlw $2, %%mm1 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "psubusb %%mm0, %%mm4 \n\t" - "psubusb %%mm5, %%mm0 \n\t" - "por %%mm4, %%mm0 \n\t" - "movq %%mm0, %%mm4 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpckhbw %%mm7, %%mm4 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "paddw %%mm4, %%mm6 \n\t" - "movq %%mm2, %%mm0 \n\t" - "movq %%mm3, %%mm1 \n\t" - "add %4, %%"REG_a" \n\t" - " js 1b \n\t" - : "+a" (len) - : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), - "r" ((x86_reg) stride)); -} - -static inline int sum_mmx(void) -{ - int ret; - __asm__ volatile ( - "movq %%mm6, %%mm0 \n\t" - "psrlq $32, %%mm6 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "movq %%mm6, %%mm0 \n\t" - "psrlq $16, %%mm6 \n\t" - "paddw %%mm0, %%mm6 \n\t" - "movd %%mm6, %0 \n\t" - : "=r" (ret)); - return ret & 0xFFFF; -} - -static inline int sum_mmxext(void) -{ - int ret; - __asm__ volatile ( - "movd %%mm6, %0 \n\t" - : "=r" (ret)); - return ret; -} - -static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) -{ - sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h); -} - -static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) -{ - sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h); -} - -#define PIX_SAD(suf) \ -static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \ - uint8_t *blk1, int stride, int h) \ -{ \ - assert(h == 8); \ - __asm__ volatile ( \ - "pxor %%mm7, %%mm7 \n\t" \ - "pxor %%mm6, %%mm6 \n\t" \ - :); \ - \ - sad8_1_ ## suf(blk1, blk2, stride, 8); \ - \ - return sum_ ## suf(); \ -} \ - \ -static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ - uint8_t *blk1, int stride, int h) \ -{ \ - assert(h == 8); \ - __asm__ volatile ( \ - "pxor %%mm7, %%mm7 \n\t" \ - "pxor %%mm6, %%mm6 \n\t" \ - "movq %0, %%mm5 \n\t" \ - :: "m" (round_tab[1])); \ - \ - sad8_x2a_ ## suf(blk1, blk2, stride, 8); \ - \ - return sum_ ## suf(); \ -} \ - \ -static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, 
\ - uint8_t *blk1, int stride, int h) \ -{ \ - assert(h == 8); \ - __asm__ volatile ( \ - "pxor %%mm7, %%mm7 \n\t" \ - "pxor %%mm6, %%mm6 \n\t" \ - "movq %0, %%mm5 \n\t" \ - :: "m" (round_tab[1])); \ - \ - sad8_y2a_ ## suf(blk1, blk2, stride, 8); \ - \ - return sum_ ## suf(); \ -} \ - \ -static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ - uint8_t *blk1, int stride, int h) \ -{ \ - assert(h == 8); \ - __asm__ volatile ( \ - "pxor %%mm7, %%mm7 \n\t" \ - "pxor %%mm6, %%mm6 \n\t" \ - ::); \ - \ - sad8_4_ ## suf(blk1, blk2, stride, 8); \ - \ - return sum_ ## suf(); \ -} \ - \ -static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2, \ - uint8_t *blk1, int stride, int h) \ -{ \ - __asm__ volatile ( \ - "pxor %%mm7, %%mm7 \n\t" \ - "pxor %%mm6, %%mm6 \n\t" \ - :); \ - \ - sad8_1_ ## suf(blk1, blk2, stride, h); \ - sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ - \ - return sum_ ## suf(); \ -} \ - \ -static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ - uint8_t *blk1, int stride, int h) \ -{ \ - __asm__ volatile ( \ - "pxor %%mm7, %%mm7 \n\t" \ - "pxor %%mm6, %%mm6 \n\t" \ - "movq %0, %%mm5 \n\t" \ - :: "m" (round_tab[1])); \ - \ - sad8_x2a_ ## suf(blk1, blk2, stride, h); \ - sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ - \ - return sum_ ## suf(); \ -} \ - \ -static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ - uint8_t *blk1, int stride, int h) \ -{ \ - __asm__ volatile ( \ - "pxor %%mm7, %%mm7 \n\t" \ - "pxor %%mm6, %%mm6 \n\t" \ - "movq %0, %%mm5 \n\t" \ - :: "m" (round_tab[1])); \ - \ - sad8_y2a_ ## suf(blk1, blk2, stride, h); \ - sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ - \ - return sum_ ## suf(); \ -} \ - \ -static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ - uint8_t *blk1, int stride, int h) \ -{ \ - __asm__ volatile ( \ - "pxor %%mm7, %%mm7 \n\t" \ - "pxor %%mm6, %%mm6 \n\t" \ - ::); \ - \ - sad8_4_ ## suf(blk1, blk2, stride, h); \ - sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ - \ - return sum_ ## suf(); \ -} \ - -PIX_SAD(mmx) -PIX_SAD(mmxext) - -#endif /* HAVE_INLINE_ASM */ - -int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, - int line_size, int h); - -#define hadamard_func(cpu) \ - int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \ - uint8_t *src2, int stride, int h); \ - int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \ - uint8_t *src2, int stride, int h); - -hadamard_func(mmx) -hadamard_func(mmxext) -hadamard_func(sse2) -hadamard_func(ssse3) - -av_cold void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx) -{ - int cpu_flags = av_get_cpu_flags(); - -#if HAVE_INLINE_ASM - if (INLINE_MMX(cpu_flags)) { - c->sum_abs_dctelem = sum_abs_dctelem_mmx; - - c->pix_abs[0][0] = sad16_mmx; - c->pix_abs[0][1] = sad16_x2_mmx; - c->pix_abs[0][2] = sad16_y2_mmx; - c->pix_abs[0][3] = sad16_xy2_mmx; - c->pix_abs[1][0] = sad8_mmx; - c->pix_abs[1][1] = sad8_x2_mmx; - c->pix_abs[1][2] = sad8_y2_mmx; - c->pix_abs[1][3] = sad8_xy2_mmx; - - c->sad[0] = sad16_mmx; - c->sad[1] = sad8_mmx; - - c->sse[0] = sse16_mmx; - c->sse[1] = sse8_mmx; - c->vsad[4] = vsad_intra16_mmx; - - c->nsse[0] = nsse16_mmx; - c->nsse[1] = nsse8_mmx; - - if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { - c->vsad[0] = vsad16_mmx; - } - } - - if (INLINE_MMXEXT(cpu_flags)) { - c->sum_abs_dctelem = sum_abs_dctelem_mmxext; - - c->vsad[4] = vsad_intra16_mmxext; - - c->pix_abs[0][0] = sad16_mmxext; - c->pix_abs[1][0] = sad8_mmxext; - - c->sad[0] = sad16_mmxext; - c->sad[1] = sad8_mmxext; - - if (!(avctx->flags & 
CODEC_FLAG_BITEXACT)) { - c->pix_abs[0][1] = sad16_x2_mmxext; - c->pix_abs[0][2] = sad16_y2_mmxext; - c->pix_abs[0][3] = sad16_xy2_mmxext; - c->pix_abs[1][1] = sad8_x2_mmxext; - c->pix_abs[1][2] = sad8_y2_mmxext; - c->pix_abs[1][3] = sad8_xy2_mmxext; - - c->vsad[0] = vsad16_mmxext; - } - } - - if (INLINE_SSE2(cpu_flags)) { - c->sum_abs_dctelem = sum_abs_dctelem_sse2; - } - - if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_3DNOW)) { - c->sad[0] = sad16_sse2; - } - -#if HAVE_SSSE3_INLINE - if (INLINE_SSSE3(cpu_flags)) { - c->sum_abs_dctelem = sum_abs_dctelem_ssse3; - } -#endif -#endif /* HAVE_INLINE_ASM */ - - if (EXTERNAL_MMX(cpu_flags)) { - c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx; - c->hadamard8_diff[1] = ff_hadamard8_diff_mmx; - } - - if (EXTERNAL_MMXEXT(cpu_flags)) { - c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext; - c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext; - } - - if (EXTERNAL_SSE2(cpu_flags)) { - c->sse[0] = ff_sse16_sse2; - -#if HAVE_ALIGNED_STACK - c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2; - c->hadamard8_diff[1] = ff_hadamard8_diff_sse2; -#endif - } - - if (EXTERNAL_SSSE3(cpu_flags) && HAVE_ALIGNED_STACK) { - c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3; - c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3; - } -} diff --git a/libavcodec/x86/dsputilenc.asm b/libavcodec/x86/dsputilenc.asm deleted file mode 100644 index 8d989c26f8..0000000000 --- a/libavcodec/x86/dsputilenc.asm +++ /dev/null @@ -1,336 +0,0 @@ -;***************************************************************************** -;* MMX optimized DSP utils -;***************************************************************************** -;* Copyright (c) 2000, 2001 Fabrice Bellard -;* Copyright (c) 2002-2004 Michael Niedermayer -;* -;* This file is part of Libav. -;* -;* Libav is free software; you can redistribute it and/or -;* modify it under the terms of the GNU Lesser General Public -;* License as published by the Free Software Foundation; either -;* version 2.1 of the License, or (at your option) any later version. -;* -;* Libav is distributed in the hope that it will be useful, -;* but WITHOUT ANY WARRANTY; without even the implied warranty of -;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -;* Lesser General Public License for more details. 
-;* -;* You should have received a copy of the GNU Lesser General Public -;* License along with Libav; if not, write to the Free Software -;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -;***************************************************************************** - -%include "libavutil/x86/x86util.asm" - -SECTION .text - -%macro DIFF_PIXELS_1 4 - movh %1, %3 - movh %2, %4 - punpcklbw %2, %1 - punpcklbw %1, %1 - psubw %1, %2 -%endmacro - -; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3 -; %6=temporary storage location -; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64) -%macro DIFF_PIXELS_8 6 - DIFF_PIXELS_1 m0, m7, [%1 +%3], [%2 +%3] - DIFF_PIXELS_1 m1, m7, [%1+%4 +%3], [%2+%4 +%3] - DIFF_PIXELS_1 m2, m7, [%1+%4*2+%3], [%2+%4*2+%3] - add %1, %5 - add %2, %5 - DIFF_PIXELS_1 m3, m7, [%1 +%3], [%2 +%3] - DIFF_PIXELS_1 m4, m7, [%1+%4 +%3], [%2+%4 +%3] - DIFF_PIXELS_1 m5, m7, [%1+%4*2+%3], [%2+%4*2+%3] - DIFF_PIXELS_1 m6, m7, [%1+%5 +%3], [%2+%5 +%3] -%ifdef m8 - DIFF_PIXELS_1 m7, m8, [%1+%4*4+%3], [%2+%4*4+%3] -%else - mova [%6], m0 - DIFF_PIXELS_1 m7, m0, [%1+%4*4+%3], [%2+%4*4+%3] - mova m0, [%6] -%endif - sub %1, %5 - sub %2, %5 -%endmacro - -%macro HADAMARD8 0 - SUMSUB_BADC w, 0, 1, 2, 3 - SUMSUB_BADC w, 4, 5, 6, 7 - SUMSUB_BADC w, 0, 2, 1, 3 - SUMSUB_BADC w, 4, 6, 5, 7 - SUMSUB_BADC w, 0, 4, 1, 5 - SUMSUB_BADC w, 2, 6, 3, 7 -%endmacro - -%macro ABS1_SUM 3 - ABS1 %1, %2 - paddusw %3, %1 -%endmacro - -%macro ABS2_SUM 6 - ABS2 %1, %2, %3, %4 - paddusw %5, %1 - paddusw %6, %2 -%endmacro - -%macro ABS_SUM_8x8_64 1 - ABS2 m0, m1, m8, m9 - ABS2_SUM m2, m3, m8, m9, m0, m1 - ABS2_SUM m4, m5, m8, m9, m0, m1 - ABS2_SUM m6, m7, m8, m9, m0, m1 - paddusw m0, m1 -%endmacro - -%macro ABS_SUM_8x8_32 1 - mova [%1], m7 - ABS1 m0, m7 - ABS1 m1, m7 - ABS1_SUM m2, m7, m0 - ABS1_SUM m3, m7, m1 - ABS1_SUM m4, m7, m0 - ABS1_SUM m5, m7, m1 - ABS1_SUM m6, m7, m0 - mova m2, [%1] - ABS1_SUM m2, m7, m1 - paddusw m0, m1 -%endmacro - -; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to -; about 100k on extreme inputs. But that's very unlikely to occur in natural video, -; and it's even more unlikely to not have any alternative mvs/modes with lower cost. 
-%macro HSUM 3 -%if cpuflag(sse2) - movhlps %2, %1 - paddusw %1, %2 - pshuflw %2, %1, 0xE - paddusw %1, %2 - pshuflw %2, %1, 0x1 - paddusw %1, %2 - movd %3, %1 -%elif cpuflag(mmxext) - pshufw %2, %1, 0xE - paddusw %1, %2 - pshufw %2, %1, 0x1 - paddusw %1, %2 - movd %3, %1 -%elif cpuflag(mmx) - mova %2, %1 - psrlq %1, 32 - paddusw %1, %2 - mova %2, %1 - psrlq %1, 16 - paddusw %1, %2 - movd %3, %1 -%endif -%endmacro - -%macro STORE4 5 - mova [%1+mmsize*0], %2 - mova [%1+mmsize*1], %3 - mova [%1+mmsize*2], %4 - mova [%1+mmsize*3], %5 -%endmacro - -%macro LOAD4 5 - mova %2, [%1+mmsize*0] - mova %3, [%1+mmsize*1] - mova %4, [%1+mmsize*2] - mova %5, [%1+mmsize*3] -%endmacro - -%macro hadamard8_16_wrapper 2 -cglobal hadamard8_diff, 4, 4, %1 -%ifndef m8 - %assign pad %2*mmsize-(4+stack_offset&(mmsize-1)) - SUB rsp, pad -%endif - call hadamard8x8_diff %+ SUFFIX -%ifndef m8 - ADD rsp, pad -%endif - RET - -cglobal hadamard8_diff16, 5, 6, %1 -%ifndef m8 - %assign pad %2*mmsize-(4+stack_offset&(mmsize-1)) - SUB rsp, pad -%endif - - call hadamard8x8_diff %+ SUFFIX - mov r5d, eax - - add r1, 8 - add r2, 8 - call hadamard8x8_diff %+ SUFFIX - add r5d, eax - - cmp r4d, 16 - jne .done - - lea r1, [r1+r3*8-8] - lea r2, [r2+r3*8-8] - call hadamard8x8_diff %+ SUFFIX - add r5d, eax - - add r1, 8 - add r2, 8 - call hadamard8x8_diff %+ SUFFIX - add r5d, eax - -.done: - mov eax, r5d -%ifndef m8 - ADD rsp, pad -%endif - RET -%endmacro - -%macro HADAMARD8_DIFF 0-1 -%if cpuflag(sse2) -hadamard8x8_diff %+ SUFFIX: - lea r0, [r3*3] - DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize - HADAMARD8 -%if ARCH_X86_64 - TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 -%else - TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize] -%endif - HADAMARD8 - ABS_SUM_8x8 rsp+gprsize - HSUM m0, m1, eax - and eax, 0xFFFF - ret - -hadamard8_16_wrapper %1, 3 -%elif cpuflag(mmx) -ALIGN 16 -; int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, -; uint8_t *src2, int stride, int h) -; r0 = void *s = unused, int h = unused (always 8) -; note how r1, r2 and r3 are not clobbered in this function, so 16x16 -; can simply call this 2x2x (and that's why we access rsp+gprsize -; everywhere, which is rsp of calling func -hadamard8x8_diff %+ SUFFIX: - lea r0, [r3*3] - - ; first 4x8 pixels - DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize+0x60 - HADAMARD8 - mova [rsp+gprsize+0x60], m7 - TRANSPOSE4x4W 0, 1, 2, 3, 7 - STORE4 rsp+gprsize, m0, m1, m2, m3 - mova m7, [rsp+gprsize+0x60] - TRANSPOSE4x4W 4, 5, 6, 7, 0 - STORE4 rsp+gprsize+0x40, m4, m5, m6, m7 - - ; second 4x8 pixels - DIFF_PIXELS_8 r1, r2, 4, r3, r0, rsp+gprsize+0x60 - HADAMARD8 - mova [rsp+gprsize+0x60], m7 - TRANSPOSE4x4W 0, 1, 2, 3, 7 - STORE4 rsp+gprsize+0x20, m0, m1, m2, m3 - mova m7, [rsp+gprsize+0x60] - TRANSPOSE4x4W 4, 5, 6, 7, 0 - - LOAD4 rsp+gprsize+0x40, m0, m1, m2, m3 - HADAMARD8 - ABS_SUM_8x8_32 rsp+gprsize+0x60 - mova [rsp+gprsize+0x60], m0 - - LOAD4 rsp+gprsize , m0, m1, m2, m3 - LOAD4 rsp+gprsize+0x20, m4, m5, m6, m7 - HADAMARD8 - ABS_SUM_8x8_32 rsp+gprsize - paddusw m0, [rsp+gprsize+0x60] - - HSUM m0, m1, eax - and rax, 0xFFFF - ret - -hadamard8_16_wrapper 0, 14 -%endif -%endmacro - -INIT_MMX mmx -HADAMARD8_DIFF - -INIT_MMX mmxext -HADAMARD8_DIFF - -INIT_XMM sse2 -%if ARCH_X86_64 -%define ABS_SUM_8x8 ABS_SUM_8x8_64 -%else -%define ABS_SUM_8x8 ABS_SUM_8x8_32 -%endif -HADAMARD8_DIFF 10 - -INIT_XMM ssse3 -%define ABS_SUM_8x8 ABS_SUM_8x8_64 -HADAMARD8_DIFF 9 - -INIT_XMM sse2 -; int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, -; int line_size, int h); 
-cglobal sse16, 5, 5, 8 - shr r4d, 1 - pxor m0, m0 ; mm0 = 0 - pxor m7, m7 ; mm7 holds the sum - -.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned - movu m1, [r1 ] ; mm1 = pix1[0][0-15] - movu m2, [r2 ] ; mm2 = pix2[0][0-15] - movu m3, [r1+r3] ; mm3 = pix1[1][0-15] - movu m4, [r2+r3] ; mm4 = pix2[1][0-15] - - ; todo: mm1-mm2, mm3-mm4 - ; algo: subtract mm1 from mm2 with saturation and vice versa - ; OR the result to get the absolute difference - mova m5, m1 - mova m6, m3 - psubusb m1, m2 - psubusb m3, m4 - psubusb m2, m5 - psubusb m4, m6 - - por m2, m1 - por m4, m3 - - ; now convert to 16-bit vectors so we can square them - mova m1, m2 - mova m3, m4 - - punpckhbw m2, m0 - punpckhbw m4, m0 - punpcklbw m1, m0 ; mm1 not spread over (mm1,mm2) - punpcklbw m3, m0 ; mm4 not spread over (mm3,mm4) - - pmaddwd m2, m2 - pmaddwd m4, m4 - pmaddwd m1, m1 - pmaddwd m3, m3 - - lea r1, [r1+r3*2] ; pix1 += 2*line_size - lea r2, [r2+r3*2] ; pix2 += 2*line_size - - paddd m1, m2 - paddd m3, m4 - paddd m7, m1 - paddd m7, m3 - - dec r4 - jnz .next2lines - - mova m1, m7 - psrldq m7, 8 ; shift hi qword to lo - paddd m7, m1 - mova m1, m7 - psrldq m7, 4 ; shift hi dword to lo - paddd m7, m1 - movd eax, m7 ; return value - RET diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm new file mode 100644 index 0000000000..1a87f37b39 --- /dev/null +++ b/libavcodec/x86/me_cmp.asm @@ -0,0 +1,336 @@ +;***************************************************************************** +;* SIMD-optimized motion compensation estimation +;***************************************************************************** +;* Copyright (c) 2000, 2001 Fabrice Bellard +;* Copyright (c) 2002-2004 Michael Niedermayer +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;***************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION .text + +%macro DIFF_PIXELS_1 4 + movh %1, %3 + movh %2, %4 + punpcklbw %2, %1 + punpcklbw %1, %1 + psubw %1, %2 +%endmacro + +; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3 +; %6=temporary storage location +; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64) +%macro DIFF_PIXELS_8 6 + DIFF_PIXELS_1 m0, m7, [%1 +%3], [%2 +%3] + DIFF_PIXELS_1 m1, m7, [%1+%4 +%3], [%2+%4 +%3] + DIFF_PIXELS_1 m2, m7, [%1+%4*2+%3], [%2+%4*2+%3] + add %1, %5 + add %2, %5 + DIFF_PIXELS_1 m3, m7, [%1 +%3], [%2 +%3] + DIFF_PIXELS_1 m4, m7, [%1+%4 +%3], [%2+%4 +%3] + DIFF_PIXELS_1 m5, m7, [%1+%4*2+%3], [%2+%4*2+%3] + DIFF_PIXELS_1 m6, m7, [%1+%5 +%3], [%2+%5 +%3] +%ifdef m8 + DIFF_PIXELS_1 m7, m8, [%1+%4*4+%3], [%2+%4*4+%3] +%else + mova [%6], m0 + DIFF_PIXELS_1 m7, m0, [%1+%4*4+%3], [%2+%4*4+%3] + mova m0, [%6] +%endif + sub %1, %5 + sub %2, %5 +%endmacro + +%macro HADAMARD8 0 + SUMSUB_BADC w, 0, 1, 2, 3 + SUMSUB_BADC w, 4, 5, 6, 7 + SUMSUB_BADC w, 0, 2, 1, 3 + SUMSUB_BADC w, 4, 6, 5, 7 + SUMSUB_BADC w, 0, 4, 1, 5 + SUMSUB_BADC w, 2, 6, 3, 7 +%endmacro + +%macro ABS1_SUM 3 + ABS1 %1, %2 + paddusw %3, %1 +%endmacro + +%macro ABS2_SUM 6 + ABS2 %1, %2, %3, %4 + paddusw %5, %1 + paddusw %6, %2 +%endmacro + +%macro ABS_SUM_8x8_64 1 + ABS2 m0, m1, m8, m9 + ABS2_SUM m2, m3, m8, m9, m0, m1 + ABS2_SUM m4, m5, m8, m9, m0, m1 + ABS2_SUM m6, m7, m8, m9, m0, m1 + paddusw m0, m1 +%endmacro + +%macro ABS_SUM_8x8_32 1 + mova [%1], m7 + ABS1 m0, m7 + ABS1 m1, m7 + ABS1_SUM m2, m7, m0 + ABS1_SUM m3, m7, m1 + ABS1_SUM m4, m7, m0 + ABS1_SUM m5, m7, m1 + ABS1_SUM m6, m7, m0 + mova m2, [%1] + ABS1_SUM m2, m7, m1 + paddusw m0, m1 +%endmacro + +; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to +; about 100k on extreme inputs. But that's very unlikely to occur in natural video, +; and it's even more unlikely to not have any alternative mvs/modes with lower cost. 
+%macro HSUM 3 +%if cpuflag(sse2) + movhlps %2, %1 + paddusw %1, %2 + pshuflw %2, %1, 0xE + paddusw %1, %2 + pshuflw %2, %1, 0x1 + paddusw %1, %2 + movd %3, %1 +%elif cpuflag(mmxext) + pshufw %2, %1, 0xE + paddusw %1, %2 + pshufw %2, %1, 0x1 + paddusw %1, %2 + movd %3, %1 +%elif cpuflag(mmx) + mova %2, %1 + psrlq %1, 32 + paddusw %1, %2 + mova %2, %1 + psrlq %1, 16 + paddusw %1, %2 + movd %3, %1 +%endif +%endmacro + +%macro STORE4 5 + mova [%1+mmsize*0], %2 + mova [%1+mmsize*1], %3 + mova [%1+mmsize*2], %4 + mova [%1+mmsize*3], %5 +%endmacro + +%macro LOAD4 5 + mova %2, [%1+mmsize*0] + mova %3, [%1+mmsize*1] + mova %4, [%1+mmsize*2] + mova %5, [%1+mmsize*3] +%endmacro + +%macro hadamard8_16_wrapper 2 +cglobal hadamard8_diff, 4, 4, %1 +%ifndef m8 + %assign pad %2*mmsize-(4+stack_offset&(mmsize-1)) + SUB rsp, pad +%endif + call hadamard8x8_diff %+ SUFFIX +%ifndef m8 + ADD rsp, pad +%endif + RET + +cglobal hadamard8_diff16, 5, 6, %1 +%ifndef m8 + %assign pad %2*mmsize-(4+stack_offset&(mmsize-1)) + SUB rsp, pad +%endif + + call hadamard8x8_diff %+ SUFFIX + mov r5d, eax + + add r1, 8 + add r2, 8 + call hadamard8x8_diff %+ SUFFIX + add r5d, eax + + cmp r4d, 16 + jne .done + + lea r1, [r1+r3*8-8] + lea r2, [r2+r3*8-8] + call hadamard8x8_diff %+ SUFFIX + add r5d, eax + + add r1, 8 + add r2, 8 + call hadamard8x8_diff %+ SUFFIX + add r5d, eax + +.done: + mov eax, r5d +%ifndef m8 + ADD rsp, pad +%endif + RET +%endmacro + +%macro HADAMARD8_DIFF 0-1 +%if cpuflag(sse2) +hadamard8x8_diff %+ SUFFIX: + lea r0, [r3*3] + DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize + HADAMARD8 +%if ARCH_X86_64 + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 +%else + TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize] +%endif + HADAMARD8 + ABS_SUM_8x8 rsp+gprsize + HSUM m0, m1, eax + and eax, 0xFFFF + ret + +hadamard8_16_wrapper %1, 3 +%elif cpuflag(mmx) +ALIGN 16 +; int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, +; uint8_t *src2, int stride, int h) +; r0 = void *s = unused, int h = unused (always 8) +; note how r1, r2 and r3 are not clobbered in this function, so 16x16 +; can simply call this 2x2x (and that's why we access rsp+gprsize +; everywhere, which is rsp of calling func +hadamard8x8_diff %+ SUFFIX: + lea r0, [r3*3] + + ; first 4x8 pixels + DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize+0x60 + HADAMARD8 + mova [rsp+gprsize+0x60], m7 + TRANSPOSE4x4W 0, 1, 2, 3, 7 + STORE4 rsp+gprsize, m0, m1, m2, m3 + mova m7, [rsp+gprsize+0x60] + TRANSPOSE4x4W 4, 5, 6, 7, 0 + STORE4 rsp+gprsize+0x40, m4, m5, m6, m7 + + ; second 4x8 pixels + DIFF_PIXELS_8 r1, r2, 4, r3, r0, rsp+gprsize+0x60 + HADAMARD8 + mova [rsp+gprsize+0x60], m7 + TRANSPOSE4x4W 0, 1, 2, 3, 7 + STORE4 rsp+gprsize+0x20, m0, m1, m2, m3 + mova m7, [rsp+gprsize+0x60] + TRANSPOSE4x4W 4, 5, 6, 7, 0 + + LOAD4 rsp+gprsize+0x40, m0, m1, m2, m3 + HADAMARD8 + ABS_SUM_8x8_32 rsp+gprsize+0x60 + mova [rsp+gprsize+0x60], m0 + + LOAD4 rsp+gprsize , m0, m1, m2, m3 + LOAD4 rsp+gprsize+0x20, m4, m5, m6, m7 + HADAMARD8 + ABS_SUM_8x8_32 rsp+gprsize + paddusw m0, [rsp+gprsize+0x60] + + HSUM m0, m1, eax + and rax, 0xFFFF + ret + +hadamard8_16_wrapper 0, 14 +%endif +%endmacro + +INIT_MMX mmx +HADAMARD8_DIFF + +INIT_MMX mmxext +HADAMARD8_DIFF + +INIT_XMM sse2 +%if ARCH_X86_64 +%define ABS_SUM_8x8 ABS_SUM_8x8_64 +%else +%define ABS_SUM_8x8 ABS_SUM_8x8_32 +%endif +HADAMARD8_DIFF 10 + +INIT_XMM ssse3 +%define ABS_SUM_8x8 ABS_SUM_8x8_64 +HADAMARD8_DIFF 9 + +INIT_XMM sse2 +; int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, +; int line_size, int h); 
+cglobal sse16, 5, 5, 8 + shr r4d, 1 + pxor m0, m0 ; mm0 = 0 + pxor m7, m7 ; mm7 holds the sum + +.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned + movu m1, [r1 ] ; mm1 = pix1[0][0-15] + movu m2, [r2 ] ; mm2 = pix2[0][0-15] + movu m3, [r1+r3] ; mm3 = pix1[1][0-15] + movu m4, [r2+r3] ; mm4 = pix2[1][0-15] + + ; todo: mm1-mm2, mm3-mm4 + ; algo: subtract mm1 from mm2 with saturation and vice versa + ; OR the result to get the absolute difference + mova m5, m1 + mova m6, m3 + psubusb m1, m2 + psubusb m3, m4 + psubusb m2, m5 + psubusb m4, m6 + + por m2, m1 + por m4, m3 + + ; now convert to 16-bit vectors so we can square them + mova m1, m2 + mova m3, m4 + + punpckhbw m2, m0 + punpckhbw m4, m0 + punpcklbw m1, m0 ; mm1 not spread over (mm1,mm2) + punpcklbw m3, m0 ; mm4 not spread over (mm3,mm4) + + pmaddwd m2, m2 + pmaddwd m4, m4 + pmaddwd m1, m1 + pmaddwd m3, m3 + + lea r1, [r1+r3*2] ; pix1 += 2*line_size + lea r2, [r2+r3*2] ; pix2 += 2*line_size + + paddd m1, m2 + paddd m3, m4 + paddd m7, m1 + paddd m7, m3 + + dec r4 + jnz .next2lines + + mova m1, m7 + psrldq m7, 8 ; shift hi qword to lo + paddd m7, m1 + mova m1, m7 + psrldq m7, 4 ; shift hi dword to lo + paddd m7, m1 + movd eax, m7 ; return value + RET diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c new file mode 100644 index 0000000000..e93b67b053 --- /dev/null +++ b/libavcodec/x86/me_cmp_init.c @@ -0,0 +1,1321 @@ +/* + * SIMD-optimized motion estimation + * Copyright (c) 2000, 2001 Fabrice Bellard + * Copyright (c) 2002-2004 Michael Niedermayer + * + * MMX optimization by Nick Kurshev + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/me_cmp.h" +#include "libavcodec/mpegvideo.h" + +#if HAVE_INLINE_ASM + +static int sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + int line_size, int h) +{ + int tmp; + + __asm__ volatile ( + "movl %4, %%ecx \n" + "shr $1, %%ecx \n" + "pxor %%mm0, %%mm0 \n" /* mm0 = 0 */ + "pxor %%mm7, %%mm7 \n" /* mm7 holds the sum */ + "1: \n" + "movq (%0), %%mm1 \n" /* mm1 = pix1[0][0 - 7] */ + "movq (%1), %%mm2 \n" /* mm2 = pix2[0][0 - 7] */ + "movq (%0, %3), %%mm3 \n" /* mm3 = pix1[1][0 - 7] */ + "movq (%1, %3), %%mm4 \n" /* mm4 = pix2[1][0 - 7] */ + + /* todo: mm1-mm2, mm3-mm4 */ + /* algo: subtract mm1 from mm2 with saturation and vice versa */ + /* OR the results to get absolute difference */ + "movq %%mm1, %%mm5 \n" + "movq %%mm3, %%mm6 \n" + "psubusb %%mm2, %%mm1 \n" + "psubusb %%mm4, %%mm3 \n" + "psubusb %%mm5, %%mm2 \n" + "psubusb %%mm6, %%mm4 \n" + + "por %%mm1, %%mm2 \n" + "por %%mm3, %%mm4 \n" + + /* now convert to 16-bit vectors so we can square them */ + "movq %%mm2, %%mm1 \n" + "movq %%mm4, %%mm3 \n" + + "punpckhbw %%mm0, %%mm2 \n" + "punpckhbw %%mm0, %%mm4 \n" + "punpcklbw %%mm0, %%mm1 \n" /* mm1 now spread over (mm1, mm2) */ + "punpcklbw %%mm0, %%mm3 \n" /* mm4 now spread over (mm3, mm4) */ + + "pmaddwd %%mm2, %%mm2 \n" + "pmaddwd %%mm4, %%mm4 \n" + "pmaddwd %%mm1, %%mm1 \n" + "pmaddwd %%mm3, %%mm3 \n" + + "lea (%0, %3, 2), %0 \n" /* pix1 += 2 * line_size */ + "lea (%1, %3, 2), %1 \n" /* pix2 += 2 * line_size */ + + "paddd %%mm2, %%mm1 \n" + "paddd %%mm4, %%mm3 \n" + "paddd %%mm1, %%mm7 \n" + "paddd %%mm3, %%mm7 \n" + + "decl %%ecx \n" + "jnz 1b \n" + + "movq %%mm7, %%mm1 \n" + "psrlq $32, %%mm7 \n" /* shift hi dword to lo */ + "paddd %%mm7, %%mm1 \n" + "movd %%mm1, %2 \n" + : "+r" (pix1), "+r" (pix2), "=r" (tmp) + : "r" ((x86_reg) line_size), "m" (h) + : "%ecx"); + + return tmp; +} + +static int sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + int line_size, int h) +{ + int tmp; + + __asm__ volatile ( + "movl %4, %%ecx\n" + "pxor %%mm0, %%mm0\n" /* mm0 = 0 */ + "pxor %%mm7, %%mm7\n" /* mm7 holds the sum */ + "1:\n" + "movq (%0), %%mm1\n" /* mm1 = pix1[0 - 7] */ + "movq (%1), %%mm2\n" /* mm2 = pix2[0 - 7] */ + "movq 8(%0), %%mm3\n" /* mm3 = pix1[8 - 15] */ + "movq 8(%1), %%mm4\n" /* mm4 = pix2[8 - 15] */ + + /* todo: mm1-mm2, mm3-mm4 */ + /* algo: subtract mm1 from mm2 with saturation and vice versa */ + /* OR the results to get absolute difference */ + "movq %%mm1, %%mm5\n" + "movq %%mm3, %%mm6\n" + "psubusb %%mm2, %%mm1\n" + "psubusb %%mm4, %%mm3\n" + "psubusb %%mm5, %%mm2\n" + "psubusb %%mm6, %%mm4\n" + + "por %%mm1, %%mm2\n" + "por %%mm3, %%mm4\n" + + /* now convert to 16-bit vectors so we can square them */ + "movq %%mm2, %%mm1\n" + "movq %%mm4, %%mm3\n" + + "punpckhbw %%mm0, %%mm2\n" + "punpckhbw %%mm0, %%mm4\n" + "punpcklbw %%mm0, %%mm1\n" /* mm1 now spread over (mm1, mm2) */ + "punpcklbw %%mm0, %%mm3\n" /* mm4 now spread over (mm3, mm4) */ + + "pmaddwd %%mm2, %%mm2\n" + "pmaddwd %%mm4, %%mm4\n" + "pmaddwd %%mm1, %%mm1\n" + "pmaddwd %%mm3, %%mm3\n" + + "add %3, %0\n" + "add %3, %1\n" + + "paddd %%mm2, %%mm1\n" + "paddd %%mm4, %%mm3\n" + "paddd %%mm1, %%mm7\n" + "paddd %%mm3, %%mm7\n" + + 
"decl %%ecx\n" + "jnz 1b\n" + + "movq %%mm7, %%mm1\n" + "psrlq $32, %%mm7\n" /* shift hi dword to lo */ + "paddd %%mm7, %%mm1\n" + "movd %%mm1, %2\n" + : "+r" (pix1), "+r" (pix2), "=r" (tmp) + : "r" ((x86_reg) line_size), "m" (h) + : "%ecx"); + + return tmp; +} + +static int hf_noise8_mmx(uint8_t *pix1, int line_size, int h) +{ + int tmp; + + __asm__ volatile ( + "movl %3, %%ecx\n" + "pxor %%mm7, %%mm7\n" + "pxor %%mm6, %%mm6\n" + + "movq (%0), %%mm0\n" + "movq %%mm0, %%mm1\n" + "psllq $8, %%mm0\n" + "psrlq $8, %%mm1\n" + "psrlq $8, %%mm0\n" + "movq %%mm0, %%mm2\n" + "movq %%mm1, %%mm3\n" + "punpcklbw %%mm7, %%mm0\n" + "punpcklbw %%mm7, %%mm1\n" + "punpckhbw %%mm7, %%mm2\n" + "punpckhbw %%mm7, %%mm3\n" + "psubw %%mm1, %%mm0\n" + "psubw %%mm3, %%mm2\n" + + "add %2, %0\n" + + "movq (%0), %%mm4\n" + "movq %%mm4, %%mm1\n" + "psllq $8, %%mm4\n" + "psrlq $8, %%mm1\n" + "psrlq $8, %%mm4\n" + "movq %%mm4, %%mm5\n" + "movq %%mm1, %%mm3\n" + "punpcklbw %%mm7, %%mm4\n" + "punpcklbw %%mm7, %%mm1\n" + "punpckhbw %%mm7, %%mm5\n" + "punpckhbw %%mm7, %%mm3\n" + "psubw %%mm1, %%mm4\n" + "psubw %%mm3, %%mm5\n" + "psubw %%mm4, %%mm0\n" + "psubw %%mm5, %%mm2\n" + "pxor %%mm3, %%mm3\n" + "pxor %%mm1, %%mm1\n" + "pcmpgtw %%mm0, %%mm3\n\t" + "pcmpgtw %%mm2, %%mm1\n\t" + "pxor %%mm3, %%mm0\n" + "pxor %%mm1, %%mm2\n" + "psubw %%mm3, %%mm0\n" + "psubw %%mm1, %%mm2\n" + "paddw %%mm0, %%mm2\n" + "paddw %%mm2, %%mm6\n" + + "add %2, %0\n" + "1:\n" + + "movq (%0), %%mm0\n" + "movq %%mm0, %%mm1\n" + "psllq $8, %%mm0\n" + "psrlq $8, %%mm1\n" + "psrlq $8, %%mm0\n" + "movq %%mm0, %%mm2\n" + "movq %%mm1, %%mm3\n" + "punpcklbw %%mm7, %%mm0\n" + "punpcklbw %%mm7, %%mm1\n" + "punpckhbw %%mm7, %%mm2\n" + "punpckhbw %%mm7, %%mm3\n" + "psubw %%mm1, %%mm0\n" + "psubw %%mm3, %%mm2\n" + "psubw %%mm0, %%mm4\n" + "psubw %%mm2, %%mm5\n" + "pxor %%mm3, %%mm3\n" + "pxor %%mm1, %%mm1\n" + "pcmpgtw %%mm4, %%mm3\n\t" + "pcmpgtw %%mm5, %%mm1\n\t" + "pxor %%mm3, %%mm4\n" + "pxor %%mm1, %%mm5\n" + "psubw %%mm3, %%mm4\n" + "psubw %%mm1, %%mm5\n" + "paddw %%mm4, %%mm5\n" + "paddw %%mm5, %%mm6\n" + + "add %2, %0\n" + + "movq (%0), %%mm4\n" + "movq %%mm4, %%mm1\n" + "psllq $8, %%mm4\n" + "psrlq $8, %%mm1\n" + "psrlq $8, %%mm4\n" + "movq %%mm4, %%mm5\n" + "movq %%mm1, %%mm3\n" + "punpcklbw %%mm7, %%mm4\n" + "punpcklbw %%mm7, %%mm1\n" + "punpckhbw %%mm7, %%mm5\n" + "punpckhbw %%mm7, %%mm3\n" + "psubw %%mm1, %%mm4\n" + "psubw %%mm3, %%mm5\n" + "psubw %%mm4, %%mm0\n" + "psubw %%mm5, %%mm2\n" + "pxor %%mm3, %%mm3\n" + "pxor %%mm1, %%mm1\n" + "pcmpgtw %%mm0, %%mm3\n\t" + "pcmpgtw %%mm2, %%mm1\n\t" + "pxor %%mm3, %%mm0\n" + "pxor %%mm1, %%mm2\n" + "psubw %%mm3, %%mm0\n" + "psubw %%mm1, %%mm2\n" + "paddw %%mm0, %%mm2\n" + "paddw %%mm2, %%mm6\n" + + "add %2, %0\n" + "subl $2, %%ecx\n" + " jnz 1b\n" + + "movq %%mm6, %%mm0\n" + "punpcklwd %%mm7, %%mm0\n" + "punpckhwd %%mm7, %%mm6\n" + "paddd %%mm0, %%mm6\n" + + "movq %%mm6, %%mm0\n" + "psrlq $32, %%mm6\n" + "paddd %%mm6, %%mm0\n" + "movd %%mm0, %1\n" + : "+r" (pix1), "=r" (tmp) + : "r" ((x86_reg) line_size), "g" (h - 2) + : "%ecx"); + + return tmp; +} + +static int hf_noise16_mmx(uint8_t *pix1, int line_size, int h) +{ + int tmp; + uint8_t *pix = pix1; + + __asm__ volatile ( + "movl %3, %%ecx\n" + "pxor %%mm7, %%mm7\n" + "pxor %%mm6, %%mm6\n" + + "movq (%0), %%mm0\n" + "movq 1(%0), %%mm1\n" + "movq %%mm0, %%mm2\n" + "movq %%mm1, %%mm3\n" + "punpcklbw %%mm7, %%mm0\n" + "punpcklbw %%mm7, %%mm1\n" + "punpckhbw %%mm7, %%mm2\n" + "punpckhbw %%mm7, %%mm3\n" + "psubw %%mm1, %%mm0\n" + "psubw %%mm3, %%mm2\n" + + "add 
%2, %0\n" + + "movq (%0), %%mm4\n" + "movq 1(%0), %%mm1\n" + "movq %%mm4, %%mm5\n" + "movq %%mm1, %%mm3\n" + "punpcklbw %%mm7, %%mm4\n" + "punpcklbw %%mm7, %%mm1\n" + "punpckhbw %%mm7, %%mm5\n" + "punpckhbw %%mm7, %%mm3\n" + "psubw %%mm1, %%mm4\n" + "psubw %%mm3, %%mm5\n" + "psubw %%mm4, %%mm0\n" + "psubw %%mm5, %%mm2\n" + "pxor %%mm3, %%mm3\n" + "pxor %%mm1, %%mm1\n" + "pcmpgtw %%mm0, %%mm3\n\t" + "pcmpgtw %%mm2, %%mm1\n\t" + "pxor %%mm3, %%mm0\n" + "pxor %%mm1, %%mm2\n" + "psubw %%mm3, %%mm0\n" + "psubw %%mm1, %%mm2\n" + "paddw %%mm0, %%mm2\n" + "paddw %%mm2, %%mm6\n" + + "add %2, %0\n" + "1:\n" + + "movq (%0), %%mm0\n" + "movq 1(%0), %%mm1\n" + "movq %%mm0, %%mm2\n" + "movq %%mm1, %%mm3\n" + "punpcklbw %%mm7, %%mm0\n" + "punpcklbw %%mm7, %%mm1\n" + "punpckhbw %%mm7, %%mm2\n" + "punpckhbw %%mm7, %%mm3\n" + "psubw %%mm1, %%mm0\n" + "psubw %%mm3, %%mm2\n" + "psubw %%mm0, %%mm4\n" + "psubw %%mm2, %%mm5\n" + "pxor %%mm3, %%mm3\n" + "pxor %%mm1, %%mm1\n" + "pcmpgtw %%mm4, %%mm3\n\t" + "pcmpgtw %%mm5, %%mm1\n\t" + "pxor %%mm3, %%mm4\n" + "pxor %%mm1, %%mm5\n" + "psubw %%mm3, %%mm4\n" + "psubw %%mm1, %%mm5\n" + "paddw %%mm4, %%mm5\n" + "paddw %%mm5, %%mm6\n" + + "add %2, %0\n" + + "movq (%0), %%mm4\n" + "movq 1(%0), %%mm1\n" + "movq %%mm4, %%mm5\n" + "movq %%mm1, %%mm3\n" + "punpcklbw %%mm7, %%mm4\n" + "punpcklbw %%mm7, %%mm1\n" + "punpckhbw %%mm7, %%mm5\n" + "punpckhbw %%mm7, %%mm3\n" + "psubw %%mm1, %%mm4\n" + "psubw %%mm3, %%mm5\n" + "psubw %%mm4, %%mm0\n" + "psubw %%mm5, %%mm2\n" + "pxor %%mm3, %%mm3\n" + "pxor %%mm1, %%mm1\n" + "pcmpgtw %%mm0, %%mm3\n\t" + "pcmpgtw %%mm2, %%mm1\n\t" + "pxor %%mm3, %%mm0\n" + "pxor %%mm1, %%mm2\n" + "psubw %%mm3, %%mm0\n" + "psubw %%mm1, %%mm2\n" + "paddw %%mm0, %%mm2\n" + "paddw %%mm2, %%mm6\n" + + "add %2, %0\n" + "subl $2, %%ecx\n" + " jnz 1b\n" + + "movq %%mm6, %%mm0\n" + "punpcklwd %%mm7, %%mm0\n" + "punpckhwd %%mm7, %%mm6\n" + "paddd %%mm0, %%mm6\n" + + "movq %%mm6, %%mm0\n" + "psrlq $32, %%mm6\n" + "paddd %%mm6, %%mm0\n" + "movd %%mm0, %1\n" + : "+r" (pix1), "=r" (tmp) + : "r" ((x86_reg) line_size), "g" (h - 2) + : "%ecx"); + + return tmp + hf_noise8_mmx(pix + 8, line_size, h); +} + +static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2, + int line_size, int h) +{ + int score1, score2; + + if (c) + score1 = c->mecc.sse[0](c, pix1, pix2, line_size, h); + else + score1 = sse16_mmx(c, pix1, pix2, line_size, h); + score2 = hf_noise16_mmx(pix1, line_size, h) - + hf_noise16_mmx(pix2, line_size, h); + + if (c) + return score1 + FFABS(score2) * c->avctx->nsse_weight; + else + return score1 + FFABS(score2) * 8; +} + +static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2, + int line_size, int h) +{ + int score1 = sse8_mmx(c, pix1, pix2, line_size, h); + int score2 = hf_noise8_mmx(pix1, line_size, h) - + hf_noise8_mmx(pix2, line_size, h); + + if (c) + return score1 + FFABS(score2) * c->avctx->nsse_weight; + else + return score1 + FFABS(score2) * 8; +} + +static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy, + int line_size, int h) +{ + int tmp; + + assert((((int) pix) & 7) == 0); + assert((line_size & 7) == 0); + +#define SUM(in0, in1, out0, out1) \ + "movq (%0), %%mm2\n" \ + "movq 8(%0), %%mm3\n" \ + "add %2,%0\n" \ + "movq %%mm2, " #out0 "\n" \ + "movq %%mm3, " #out1 "\n" \ + "psubusb " #in0 ", %%mm2\n" \ + "psubusb " #in1 ", %%mm3\n" \ + "psubusb " #out0 ", " #in0 "\n" \ + "psubusb " #out1 ", " #in1 "\n" \ + "por %%mm2, " #in0 "\n" \ + "por %%mm3, " #in1 "\n" \ + "movq " #in0 ", %%mm2\n" \ + "movq " #in1 
", %%mm3\n" \ + "punpcklbw %%mm7, " #in0 "\n" \ + "punpcklbw %%mm7, " #in1 "\n" \ + "punpckhbw %%mm7, %%mm2\n" \ + "punpckhbw %%mm7, %%mm3\n" \ + "paddw " #in1 ", " #in0 "\n" \ + "paddw %%mm3, %%mm2\n" \ + "paddw %%mm2, " #in0 "\n" \ + "paddw " #in0 ", %%mm6\n" + + + __asm__ volatile ( + "movl %3, %%ecx\n" + "pxor %%mm6, %%mm6\n" + "pxor %%mm7, %%mm7\n" + "movq (%0), %%mm0\n" + "movq 8(%0), %%mm1\n" + "add %2, %0\n" + "jmp 2f\n" + "1:\n" + + SUM(%%mm4, %%mm5, %%mm0, %%mm1) + "2:\n" + SUM(%%mm0, %%mm1, %%mm4, %%mm5) + + "subl $2, %%ecx\n" + "jnz 1b\n" + + "movq %%mm6, %%mm0\n" + "psrlq $32, %%mm6\n" + "paddw %%mm6, %%mm0\n" + "movq %%mm0, %%mm6\n" + "psrlq $16, %%mm0\n" + "paddw %%mm6, %%mm0\n" + "movd %%mm0, %1\n" + : "+r" (pix), "=r" (tmp) + : "r" ((x86_reg) line_size), "m" (h) + : "%ecx"); + + return tmp & 0xFFFF; +} +#undef SUM + +static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy, + int line_size, int h) +{ + int tmp; + + assert((((int) pix) & 7) == 0); + assert((line_size & 7) == 0); + +#define SUM(in0, in1, out0, out1) \ + "movq (%0), " #out0 "\n" \ + "movq 8(%0), " #out1 "\n" \ + "add %2, %0\n" \ + "psadbw " #out0 ", " #in0 "\n" \ + "psadbw " #out1 ", " #in1 "\n" \ + "paddw " #in1 ", " #in0 "\n" \ + "paddw " #in0 ", %%mm6\n" + + __asm__ volatile ( + "movl %3, %%ecx\n" + "pxor %%mm6, %%mm6\n" + "pxor %%mm7, %%mm7\n" + "movq (%0), %%mm0\n" + "movq 8(%0), %%mm1\n" + "add %2, %0\n" + "jmp 2f\n" + "1:\n" + + SUM(%%mm4, %%mm5, %%mm0, %%mm1) + "2:\n" + SUM(%%mm0, %%mm1, %%mm4, %%mm5) + + "subl $2, %%ecx\n" + "jnz 1b\n" + + "movd %%mm6, %1\n" + : "+r" (pix), "=r" (tmp) + : "r" ((x86_reg) line_size), "m" (h) + : "%ecx"); + + return tmp; +} +#undef SUM + +static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + int line_size, int h) +{ + int tmp; + + assert((((int) pix1) & 7) == 0); + assert((((int) pix2) & 7) == 0); + assert((line_size & 7) == 0); + +#define SUM(in0, in1, out0, out1) \ + "movq (%0), %%mm2\n" \ + "movq (%1), " #out0 "\n" \ + "movq 8(%0), %%mm3\n" \ + "movq 8(%1), " #out1 "\n" \ + "add %3, %0\n" \ + "add %3, %1\n" \ + "psubb " #out0 ", %%mm2\n" \ + "psubb " #out1 ", %%mm3\n" \ + "pxor %%mm7, %%mm2\n" \ + "pxor %%mm7, %%mm3\n" \ + "movq %%mm2, " #out0 "\n" \ + "movq %%mm3, " #out1 "\n" \ + "psubusb " #in0 ", %%mm2\n" \ + "psubusb " #in1 ", %%mm3\n" \ + "psubusb " #out0 ", " #in0 "\n" \ + "psubusb " #out1 ", " #in1 "\n" \ + "por %%mm2, " #in0 "\n" \ + "por %%mm3, " #in1 "\n" \ + "movq " #in0 ", %%mm2\n" \ + "movq " #in1 ", %%mm3\n" \ + "punpcklbw %%mm7, " #in0 "\n" \ + "punpcklbw %%mm7, " #in1 "\n" \ + "punpckhbw %%mm7, %%mm2\n" \ + "punpckhbw %%mm7, %%mm3\n" \ + "paddw " #in1 ", " #in0 "\n" \ + "paddw %%mm3, %%mm2\n" \ + "paddw %%mm2, " #in0 "\n" \ + "paddw " #in0 ", %%mm6\n" + + + __asm__ volatile ( + "movl %4, %%ecx\n" + "pxor %%mm6, %%mm6\n" + "pcmpeqw %%mm7, %%mm7\n" + "psllw $15, %%mm7\n" + "packsswb %%mm7, %%mm7\n" + "movq (%0), %%mm0\n" + "movq (%1), %%mm2\n" + "movq 8(%0), %%mm1\n" + "movq 8(%1), %%mm3\n" + "add %3, %0\n" + "add %3, %1\n" + "psubb %%mm2, %%mm0\n" + "psubb %%mm3, %%mm1\n" + "pxor %%mm7, %%mm0\n" + "pxor %%mm7, %%mm1\n" + "jmp 2f\n" + "1:\n" + + SUM(%%mm4, %%mm5, %%mm0, %%mm1) + "2:\n" + SUM(%%mm0, %%mm1, %%mm4, %%mm5) + + "subl $2, %%ecx\n" + "jnz 1b\n" + + "movq %%mm6, %%mm0\n" + "psrlq $32, %%mm6\n" + "paddw %%mm6, %%mm0\n" + "movq %%mm0, %%mm6\n" + "psrlq $16, %%mm0\n" + "paddw %%mm6, %%mm0\n" + "movd %%mm0, %2\n" + : "+r" (pix1), "+r" (pix2), "=r" (tmp) + : "r" ((x86_reg) line_size), "m" (h) + : "%ecx"); + 
+ return tmp & 0x7FFF; +} +#undef SUM + +static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + int line_size, int h) +{ + int tmp; + + assert((((int) pix1) & 7) == 0); + assert((((int) pix2) & 7) == 0); + assert((line_size & 7) == 0); + +#define SUM(in0, in1, out0, out1) \ + "movq (%0), " #out0 "\n" \ + "movq (%1), %%mm2\n" \ + "movq 8(%0), " #out1 "\n" \ + "movq 8(%1), %%mm3\n" \ + "add %3, %0\n" \ + "add %3, %1\n" \ + "psubb %%mm2, " #out0 "\n" \ + "psubb %%mm3, " #out1 "\n" \ + "pxor %%mm7, " #out0 "\n" \ + "pxor %%mm7, " #out1 "\n" \ + "psadbw " #out0 ", " #in0 "\n" \ + "psadbw " #out1 ", " #in1 "\n" \ + "paddw " #in1 ", " #in0 "\n" \ + "paddw " #in0 ", %%mm6\n " + + __asm__ volatile ( + "movl %4, %%ecx\n" + "pxor %%mm6, %%mm6\n" + "pcmpeqw %%mm7, %%mm7\n" + "psllw $15, %%mm7\n" + "packsswb %%mm7, %%mm7\n" + "movq (%0), %%mm0\n" + "movq (%1), %%mm2\n" + "movq 8(%0), %%mm1\n" + "movq 8(%1), %%mm3\n" + "add %3, %0\n" + "add %3, %1\n" + "psubb %%mm2, %%mm0\n" + "psubb %%mm3, %%mm1\n" + "pxor %%mm7, %%mm0\n" + "pxor %%mm7, %%mm1\n" + "jmp 2f\n" + "1:\n" + + SUM(%%mm4, %%mm5, %%mm0, %%mm1) + "2:\n" + SUM(%%mm0, %%mm1, %%mm4, %%mm5) + + "subl $2, %%ecx\n" + "jnz 1b\n" + + "movd %%mm6, %2\n" + : "+r" (pix1), "+r" (pix2), "=r" (tmp) + : "r" ((x86_reg) line_size), "m" (h) + : "%ecx"); + + return tmp; +} +#undef SUM + +#define MMABS_MMX(a,z) \ + "pxor " #z ", " #z " \n\t" \ + "pcmpgtw " #a ", " #z " \n\t" \ + "pxor " #z ", " #a " \n\t" \ + "psubw " #z ", " #a " \n\t" + +#define MMABS_MMXEXT(a, z) \ + "pxor " #z ", " #z " \n\t" \ + "psubw " #a ", " #z " \n\t" \ + "pmaxsw " #z ", " #a " \n\t" + +#define MMABS_SSSE3(a,z) \ + "pabsw " #a ", " #a " \n\t" + +#define MMABS_SUM(a,z, sum) \ + MMABS(a,z) \ + "paddusw " #a ", " #sum " \n\t" + +/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get + * up to about 100k on extreme inputs. But that's very unlikely to occur in + * natural video, and it's even more unlikely to not have any alternative + * mvs/modes with lower cost. 
*/ +#define HSUM_MMX(a, t, dst) \ + "movq " #a ", " #t " \n\t" \ + "psrlq $32, " #a " \n\t" \ + "paddusw " #t ", " #a " \n\t" \ + "movq " #a ", " #t " \n\t" \ + "psrlq $16, " #a " \n\t" \ + "paddusw " #t ", " #a " \n\t" \ + "movd " #a ", " #dst " \n\t" \ + +#define HSUM_MMXEXT(a, t, dst) \ + "pshufw $0x0E, " #a ", " #t " \n\t" \ + "paddusw " #t ", " #a " \n\t" \ + "pshufw $0x01, " #a ", " #t " \n\t" \ + "paddusw " #t ", " #a " \n\t" \ + "movd " #a ", " #dst " \n\t" \ + +#define HSUM_SSE2(a, t, dst) \ + "movhlps " #a ", " #t " \n\t" \ + "paddusw " #t ", " #a " \n\t" \ + "pshuflw $0x0E, " #a ", " #t " \n\t" \ + "paddusw " #t ", " #a " \n\t" \ + "pshuflw $0x01, " #a ", " #t " \n\t" \ + "paddusw " #t ", " #a " \n\t" \ + "movd " #a ", " #dst " \n\t" \ + +#define DCT_SAD4(m, mm, o) \ + "mov"#m" "#o" + 0(%1), " #mm "2 \n\t" \ + "mov"#m" "#o" + 16(%1), " #mm "3 \n\t" \ + "mov"#m" "#o" + 32(%1), " #mm "4 \n\t" \ + "mov"#m" "#o" + 48(%1), " #mm "5 \n\t" \ + MMABS_SUM(mm ## 2, mm ## 6, mm ## 0) \ + MMABS_SUM(mm ## 3, mm ## 7, mm ## 1) \ + MMABS_SUM(mm ## 4, mm ## 6, mm ## 0) \ + MMABS_SUM(mm ## 5, mm ## 7, mm ## 1) \ + +#define DCT_SAD_MMX \ + "pxor %%mm0, %%mm0 \n\t" \ + "pxor %%mm1, %%mm1 \n\t" \ + DCT_SAD4(q, %%mm, 0) \ + DCT_SAD4(q, %%mm, 8) \ + DCT_SAD4(q, %%mm, 64) \ + DCT_SAD4(q, %%mm, 72) \ + "paddusw %%mm1, %%mm0 \n\t" \ + HSUM(%%mm0, %%mm1, %0) + +#define DCT_SAD_SSE2 \ + "pxor %%xmm0, %%xmm0 \n\t" \ + "pxor %%xmm1, %%xmm1 \n\t" \ + DCT_SAD4(dqa, %%xmm, 0) \ + DCT_SAD4(dqa, %%xmm, 64) \ + "paddusw %%xmm1, %%xmm0 \n\t" \ + HSUM(%%xmm0, %%xmm1, %0) + +#define DCT_SAD_FUNC(cpu) \ +static int sum_abs_dctelem_ ## cpu(int16_t *block) \ +{ \ + int sum; \ + __asm__ volatile ( \ + DCT_SAD \ + :"=r"(sum) \ + :"r"(block)); \ + return sum & 0xFFFF; \ +} + +#define DCT_SAD DCT_SAD_MMX +#define HSUM(a, t, dst) HSUM_MMX(a, t, dst) +#define MMABS(a, z) MMABS_MMX(a, z) +DCT_SAD_FUNC(mmx) +#undef MMABS +#undef HSUM + +#define HSUM(a, t, dst) HSUM_MMXEXT(a, t, dst) +#define MMABS(a, z) MMABS_MMXEXT(a, z) +DCT_SAD_FUNC(mmxext) +#undef HSUM +#undef DCT_SAD + +#define DCT_SAD DCT_SAD_SSE2 +#define HSUM(a, t, dst) HSUM_SSE2(a, t, dst) +DCT_SAD_FUNC(sse2) +#undef MMABS + +#if HAVE_SSSE3_INLINE +#define MMABS(a, z) MMABS_SSSE3(a, z) +DCT_SAD_FUNC(ssse3) +#undef MMABS +#endif +#undef HSUM +#undef DCT_SAD + + +DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = { + 0x0000000000000000ULL, + 0x0001000100010001ULL, + 0x0002000200020002ULL, +}; + +DECLARE_ASM_CONST(8, uint64_t, bone) = 0x0101010101010101LL; + +static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) +{ + x86_reg len = -(stride * h); + __asm__ volatile ( + ".p2align 4 \n\t" + "1: \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + "movq (%2, %%"REG_a"), %%mm2 \n\t" + "movq (%2, %%"REG_a"), %%mm4 \n\t" + "add %3, %%"REG_a" \n\t" + "psubusb %%mm0, %%mm2 \n\t" + "psubusb %%mm4, %%mm0 \n\t" + "movq (%1, %%"REG_a"), %%mm1 \n\t" + "movq (%2, %%"REG_a"), %%mm3 \n\t" + "movq (%2, %%"REG_a"), %%mm5 \n\t" + "psubusb %%mm1, %%mm3 \n\t" + "psubusb %%mm5, %%mm1 \n\t" + "por %%mm2, %%mm0 \n\t" + "por %%mm1, %%mm3 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm3, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm3 \n\t" + "punpckhbw %%mm7, %%mm2 \n\t" + "paddw %%mm1, %%mm0 \n\t" + "paddw %%mm3, %%mm2 \n\t" + "paddw %%mm2, %%mm0 \n\t" + "paddw %%mm0, %%mm6 \n\t" + "add %3, %%"REG_a" \n\t" + " js 1b \n\t" + : "+a" (len) + : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg) stride)); +} + +static inline void 
sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2, + int stride, int h) +{ + __asm__ volatile ( + ".p2align 4 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "psadbw (%2), %%mm0 \n\t" + "psadbw (%2, %3), %%mm1 \n\t" + "paddw %%mm0, %%mm6 \n\t" + "paddw %%mm1, %%mm6 \n\t" + "lea (%1,%3,2), %1 \n\t" + "lea (%2,%3,2), %2 \n\t" + "sub $2, %0 \n\t" + " jg 1b \n\t" + : "+r" (h), "+r" (blk1), "+r" (blk2) + : "r" ((x86_reg) stride)); +} + +static int sad16_sse2(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1, + int stride, int h) +{ + int ret; + __asm__ volatile ( + "pxor %%xmm2, %%xmm2 \n\t" + ".p2align 4 \n\t" + "1: \n\t" + "movdqu (%1), %%xmm0 \n\t" + "movdqu (%1, %4), %%xmm1 \n\t" + "psadbw (%2), %%xmm0 \n\t" + "psadbw (%2, %4), %%xmm1 \n\t" + "paddw %%xmm0, %%xmm2 \n\t" + "paddw %%xmm1, %%xmm2 \n\t" + "lea (%1,%4,2), %1 \n\t" + "lea (%2,%4,2), %2 \n\t" + "sub $2, %0 \n\t" + " jg 1b \n\t" + "movhlps %%xmm2, %%xmm0 \n\t" + "paddw %%xmm0, %%xmm2 \n\t" + "movd %%xmm2, %3 \n\t" + : "+r" (h), "+r" (blk1), "+r" (blk2), "=r" (ret) + : "r" ((x86_reg) stride)); + return ret; +} + +static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2, + int stride, int h) +{ + __asm__ volatile ( + ".p2align 4 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "pavgb 1(%1), %%mm0 \n\t" + "pavgb 1(%1, %3), %%mm1 \n\t" + "psadbw (%2), %%mm0 \n\t" + "psadbw (%2, %3), %%mm1 \n\t" + "paddw %%mm0, %%mm6 \n\t" + "paddw %%mm1, %%mm6 \n\t" + "lea (%1,%3,2), %1 \n\t" + "lea (%2,%3,2), %2 \n\t" + "sub $2, %0 \n\t" + " jg 1b \n\t" + : "+r" (h), "+r" (blk1), "+r" (blk2) + : "r" ((x86_reg) stride)); +} + +static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2, + int stride, int h) +{ + __asm__ volatile ( + "movq (%1), %%mm0 \n\t" + "add %3, %1 \n\t" + ".p2align 4 \n\t" + "1: \n\t" + "movq (%1), %%mm1 \n\t" + "movq (%1, %3), %%mm2 \n\t" + "pavgb %%mm1, %%mm0 \n\t" + "pavgb %%mm2, %%mm1 \n\t" + "psadbw (%2), %%mm0 \n\t" + "psadbw (%2, %3), %%mm1 \n\t" + "paddw %%mm0, %%mm6 \n\t" + "paddw %%mm1, %%mm6 \n\t" + "movq %%mm2, %%mm0 \n\t" + "lea (%1,%3,2), %1 \n\t" + "lea (%2,%3,2), %2 \n\t" + "sub $2, %0 \n\t" + " jg 1b \n\t" + : "+r" (h), "+r" (blk1), "+r" (blk2) + : "r" ((x86_reg) stride)); +} + +static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2, + int stride, int h) +{ + __asm__ volatile ( + "movq "MANGLE(bone)", %%mm5 \n\t" + "movq (%1), %%mm0 \n\t" + "pavgb 1(%1), %%mm0 \n\t" + "add %3, %1 \n\t" + ".p2align 4 \n\t" + "1: \n\t" + "movq (%1), %%mm1 \n\t" + "movq (%1,%3), %%mm2 \n\t" + "pavgb 1(%1), %%mm1 \n\t" + "pavgb 1(%1,%3), %%mm2 \n\t" + "psubusb %%mm5, %%mm1 \n\t" + "pavgb %%mm1, %%mm0 \n\t" + "pavgb %%mm2, %%mm1 \n\t" + "psadbw (%2), %%mm0 \n\t" + "psadbw (%2,%3), %%mm1 \n\t" + "paddw %%mm0, %%mm6 \n\t" + "paddw %%mm1, %%mm6 \n\t" + "movq %%mm2, %%mm0 \n\t" + "lea (%1,%3,2), %1 \n\t" + "lea (%2,%3,2), %2 \n\t" + "sub $2, %0 \n\t" + " jg 1b \n\t" + : "+r" (h), "+r" (blk1), "+r" (blk2) + : "r" ((x86_reg) stride)); +} + +static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, + int stride, int h) +{ + x86_reg len = -(stride * h); + __asm__ volatile ( + ".p2align 4 \n\t" + "1: \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + "movq (%2, %%"REG_a"), %%mm1 \n\t" + "movq (%1, %%"REG_a"), %%mm2 \n\t" + "movq (%2, %%"REG_a"), %%mm3 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "punpckhbw %%mm7, %%mm2 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + "paddw %%mm0, %%mm1 \n\t" + "paddw %%mm2, %%mm3 \n\t" + "movq (%3, %%"REG_a"), %%mm4 
\n\t" + "movq (%3, %%"REG_a"), %%mm2 \n\t" + "paddw %%mm5, %%mm1 \n\t" + "paddw %%mm5, %%mm3 \n\t" + "psrlw $1, %%mm1 \n\t" + "psrlw $1, %%mm3 \n\t" + "packuswb %%mm3, %%mm1 \n\t" + "psubusb %%mm1, %%mm4 \n\t" + "psubusb %%mm2, %%mm1 \n\t" + "por %%mm4, %%mm1 \n\t" + "movq %%mm1, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "paddw %%mm1, %%mm0 \n\t" + "paddw %%mm0, %%mm6 \n\t" + "add %4, %%"REG_a" \n\t" + " js 1b \n\t" + : "+a" (len) + : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len), + "r" ((x86_reg) stride)); +} + +static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) +{ + x86_reg len = -(stride * h); + __asm__ volatile ( + "movq (%1, %%"REG_a"), %%mm0 \n\t" + "movq 1(%1, %%"REG_a"), %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + "paddw %%mm2, %%mm0 \n\t" + "paddw %%mm3, %%mm1 \n\t" + ".p2align 4 \n\t" + "1: \n\t" + "movq (%2, %%"REG_a"), %%mm2 \n\t" + "movq 1(%2, %%"REG_a"), %%mm4 \n\t" + "movq %%mm2, %%mm3 \n\t" + "movq %%mm4, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpckhbw %%mm7, %%mm5 \n\t" + "paddw %%mm4, %%mm2 \n\t" + "paddw %%mm5, %%mm3 \n\t" + "movq 16+"MANGLE(round_tab)", %%mm5 \n\t" + "paddw %%mm2, %%mm0 \n\t" + "paddw %%mm3, %%mm1 \n\t" + "paddw %%mm5, %%mm0 \n\t" + "paddw %%mm5, %%mm1 \n\t" + "movq (%3, %%"REG_a"), %%mm4 \n\t" + "movq (%3, %%"REG_a"), %%mm5 \n\t" + "psrlw $2, %%mm0 \n\t" + "psrlw $2, %%mm1 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "psubusb %%mm0, %%mm4 \n\t" + "psubusb %%mm5, %%mm0 \n\t" + "por %%mm4, %%mm0 \n\t" + "movq %%mm0, %%mm4 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpckhbw %%mm7, %%mm4 \n\t" + "paddw %%mm0, %%mm6 \n\t" + "paddw %%mm4, %%mm6 \n\t" + "movq %%mm2, %%mm0 \n\t" + "movq %%mm3, %%mm1 \n\t" + "add %4, %%"REG_a" \n\t" + " js 1b \n\t" + : "+a" (len) + : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), + "r" ((x86_reg) stride)); +} + +static inline int sum_mmx(void) +{ + int ret; + __asm__ volatile ( + "movq %%mm6, %%mm0 \n\t" + "psrlq $32, %%mm6 \n\t" + "paddw %%mm0, %%mm6 \n\t" + "movq %%mm6, %%mm0 \n\t" + "psrlq $16, %%mm6 \n\t" + "paddw %%mm0, %%mm6 \n\t" + "movd %%mm6, %0 \n\t" + : "=r" (ret)); + return ret & 0xFFFF; +} + +static inline int sum_mmxext(void) +{ + int ret; + __asm__ volatile ( + "movd %%mm6, %0 \n\t" + : "=r" (ret)); + return ret; +} + +static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) +{ + sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h); +} + +static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) +{ + sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h); +} + +#define PIX_SAD(suf) \ +static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \ + uint8_t *blk1, int stride, int h) \ +{ \ + assert(h == 8); \ + __asm__ volatile ( \ + "pxor %%mm7, %%mm7 \n\t" \ + "pxor %%mm6, %%mm6 \n\t" \ + :); \ + \ + sad8_1_ ## suf(blk1, blk2, stride, 8); \ + \ + return sum_ ## suf(); \ +} \ + \ +static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ + uint8_t *blk1, int stride, int h) \ +{ \ + assert(h == 8); \ + __asm__ volatile ( \ + "pxor %%mm7, %%mm7 \n\t" \ + "pxor %%mm6, %%mm6 \n\t" \ + "movq %0, %%mm5 \n\t" \ + :: "m" (round_tab[1])); \ + \ + sad8_x2a_ ## suf(blk1, blk2, stride, 8); \ + \ + return sum_ ## suf(); \ +} \ + \ +static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, 
\ + uint8_t *blk1, int stride, int h) \ +{ \ + assert(h == 8); \ + __asm__ volatile ( \ + "pxor %%mm7, %%mm7 \n\t" \ + "pxor %%mm6, %%mm6 \n\t" \ + "movq %0, %%mm5 \n\t" \ + :: "m" (round_tab[1])); \ + \ + sad8_y2a_ ## suf(blk1, blk2, stride, 8); \ + \ + return sum_ ## suf(); \ +} \ + \ +static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ + uint8_t *blk1, int stride, int h) \ +{ \ + assert(h == 8); \ + __asm__ volatile ( \ + "pxor %%mm7, %%mm7 \n\t" \ + "pxor %%mm6, %%mm6 \n\t" \ + ::); \ + \ + sad8_4_ ## suf(blk1, blk2, stride, 8); \ + \ + return sum_ ## suf(); \ +} \ + \ +static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2, \ + uint8_t *blk1, int stride, int h) \ +{ \ + __asm__ volatile ( \ + "pxor %%mm7, %%mm7 \n\t" \ + "pxor %%mm6, %%mm6 \n\t" \ + :); \ + \ + sad8_1_ ## suf(blk1, blk2, stride, h); \ + sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ + \ + return sum_ ## suf(); \ +} \ + \ +static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ + uint8_t *blk1, int stride, int h) \ +{ \ + __asm__ volatile ( \ + "pxor %%mm7, %%mm7 \n\t" \ + "pxor %%mm6, %%mm6 \n\t" \ + "movq %0, %%mm5 \n\t" \ + :: "m" (round_tab[1])); \ + \ + sad8_x2a_ ## suf(blk1, blk2, stride, h); \ + sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ + \ + return sum_ ## suf(); \ +} \ + \ +static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ + uint8_t *blk1, int stride, int h) \ +{ \ + __asm__ volatile ( \ + "pxor %%mm7, %%mm7 \n\t" \ + "pxor %%mm6, %%mm6 \n\t" \ + "movq %0, %%mm5 \n\t" \ + :: "m" (round_tab[1])); \ + \ + sad8_y2a_ ## suf(blk1, blk2, stride, h); \ + sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ + \ + return sum_ ## suf(); \ +} \ + \ +static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ + uint8_t *blk1, int stride, int h) \ +{ \ + __asm__ volatile ( \ + "pxor %%mm7, %%mm7 \n\t" \ + "pxor %%mm6, %%mm6 \n\t" \ + ::); \ + \ + sad8_4_ ## suf(blk1, blk2, stride, h); \ + sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ + \ + return sum_ ## suf(); \ +} \ + +PIX_SAD(mmx) +PIX_SAD(mmxext) + +#endif /* HAVE_INLINE_ASM */ + +int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, + int line_size, int h); + +#define hadamard_func(cpu) \ + int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \ + uint8_t *src2, int stride, int h); \ + int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \ + uint8_t *src2, int stride, int h); + +hadamard_func(mmx) +hadamard_func(mmxext) +hadamard_func(sse2) +hadamard_func(ssse3) + +av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx) +{ + int cpu_flags = av_get_cpu_flags(); + +#if HAVE_INLINE_ASM + if (INLINE_MMX(cpu_flags)) { + c->sum_abs_dctelem = sum_abs_dctelem_mmx; + + c->pix_abs[0][0] = sad16_mmx; + c->pix_abs[0][1] = sad16_x2_mmx; + c->pix_abs[0][2] = sad16_y2_mmx; + c->pix_abs[0][3] = sad16_xy2_mmx; + c->pix_abs[1][0] = sad8_mmx; + c->pix_abs[1][1] = sad8_x2_mmx; + c->pix_abs[1][2] = sad8_y2_mmx; + c->pix_abs[1][3] = sad8_xy2_mmx; + + c->sad[0] = sad16_mmx; + c->sad[1] = sad8_mmx; + + c->sse[0] = sse16_mmx; + c->sse[1] = sse8_mmx; + c->vsad[4] = vsad_intra16_mmx; + + c->nsse[0] = nsse16_mmx; + c->nsse[1] = nsse8_mmx; + + if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { + c->vsad[0] = vsad16_mmx; + } + } + + if (INLINE_MMXEXT(cpu_flags)) { + c->sum_abs_dctelem = sum_abs_dctelem_mmxext; + + c->vsad[4] = vsad_intra16_mmxext; + + c->pix_abs[0][0] = sad16_mmxext; + c->pix_abs[1][0] = sad8_mmxext; + + c->sad[0] = sad16_mmxext; + c->sad[1] = sad8_mmxext; + + if (!(avctx->flags & 
CODEC_FLAG_BITEXACT)) {
+            c->pix_abs[0][1] = sad16_x2_mmxext;
+            c->pix_abs[0][2] = sad16_y2_mmxext;
+            c->pix_abs[0][3] = sad16_xy2_mmxext;
+            c->pix_abs[1][1] = sad8_x2_mmxext;
+            c->pix_abs[1][2] = sad8_y2_mmxext;
+            c->pix_abs[1][3] = sad8_xy2_mmxext;
+
+            c->vsad[0] = vsad16_mmxext;
+        }
+    }
+
+    if (INLINE_SSE2(cpu_flags)) {
+        c->sum_abs_dctelem = sum_abs_dctelem_sse2;
+    }
+
+    if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_3DNOW)) {
+        c->sad[0] = sad16_sse2;
+    }
+
+#if HAVE_SSSE3_INLINE
+    if (INLINE_SSSE3(cpu_flags)) {
+        c->sum_abs_dctelem = sum_abs_dctelem_ssse3;
+    }
+#endif
+#endif /* HAVE_INLINE_ASM */
+
+    if (EXTERNAL_MMX(cpu_flags)) {
+        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
+        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
+    }
+
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
+        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
+    }
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->sse[0] = ff_sse16_sse2;
+
+#if HAVE_ALIGNED_STACK
+        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
+        c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
+#endif
+    }
+
+    if (EXTERNAL_SSSE3(cpu_flags) && HAVE_ALIGNED_STACK) {
+        c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
+        c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
+    }
+}
--
cgit v1.2.3
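A note on the vsad kernels above: vsad_intra16 measures vertical activity inside a single block (the sum of absolute differences between vertically adjacent pixels), while vsad16 applies the same measure to the residual between two blocks. The plain-MMX version biases the signed byte differences by 128 (the packed 0x80 constant built in %mm7) so they fit unsigned-byte arithmetic, and the MMXEXT version collapses the reduction into psadbw. The following is a scalar sketch of the two metrics, assuming the same semantics as the C reference implementations in me_cmp.c; the function names and the unused context/dummy parameters are illustrative only, not part of the patch.

#include <stdint.h>
#include <stdlib.h>

/* Vertical SAD within one block: activity between adjacent rows. */
static int vsad_intra16_scalar(void *ctx, uint8_t *pix, uint8_t *dummy,
                               int stride, int h)
{
    int score = 0;
    for (int y = 0; y < h - 1; y++) {
        for (int x = 0; x < 16; x++)
            score += abs(pix[x] - pix[x + stride]);
        pix += stride;
    }
    return score;
}

/* Vertical SAD of the residual between two blocks. */
static int vsad16_scalar(void *ctx, uint8_t *pix1, uint8_t *pix2,
                         int stride, int h)
{
    int score = 0;
    for (int y = 0; y < h - 1; y++) {
        for (int x = 0; x < 16; x++)
            score += abs((pix1[x] - pix2[x]) -
                         (pix1[x + stride] - pix2[x + stride]));
        pix1 += stride;
        pix2 += stride;
    }
    return score;
}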
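The DCT_SAD_* macros compute sum_abs_dctelem, the sum of the absolute values of the 64 16-bit coefficients of a transformed 8x8 block; because the reduction uses saturating paddusw, the result can clip, which is what the FIXME above acknowledges. A scalar sketch of the metric (illustrative, not the installed implementation):

#include <stdint.h>
#include <stdlib.h>

/* Sum of |coefficient| over an 8x8 block of 16-bit transform coefficients. */
static int sum_abs_dctelem_scalar(int16_t *block)
{
    int sum = 0;
    for (int i = 0; i < 64; i++)
        sum += abs(block[i]);
    return sum;
}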
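The sad8_* / sad16_* kernels implement the plain sum of absolute differences that motion estimation spends most of its time evaluating: the MMX variants accumulate unpacked 16-bit sums in %mm6 and reduce them with sum_mmx(), while the MMXEXT and SSE2 variants rely on psadbw. Below is a scalar sketch of the full-pel 16-wide case, using the same pointer/stride/height interface as the me_cmp_func pointers installed by ff_me_cmp_init_x86(); the parameter names are illustrative.

#include <stdint.h>
#include <stdlib.h>

static int sad16_scalar(void *ctx, uint8_t *cur, uint8_t *ref,
                        int stride, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 16; x++)
            sum += abs(cur[x] - ref[x]);
        cur += stride;
        ref += stride;
    }
    return sum;
}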
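The _x2, _y2 and _xy2 variants wired into pix_abs[.][1..3] evaluate the same SAD against a half-pel shifted reference: averaged with the right neighbour, with the row below, or with both, round_tab supplying the rounding constants used by the MMX averaging. A scalar sketch of the horizontal (x2) case, under the same assumptions as the sketches above:

#include <stdint.h>
#include <stdlib.h>

static int sad16_x2_scalar(void *ctx, uint8_t *cur, uint8_t *ref,
                           int stride, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++) {
        /* reference averaged with its right neighbour, rounded up */
        for (int x = 0; x < 16; x++)
            sum += abs(cur[x] - ((ref[x] + ref[x + 1] + 1) >> 1));
        cur += stride;
        ref += stride;
    }
    return sum;
}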
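Codecs do not call these symbols directly: ff_me_cmp_init() fills an MECmpContext with the C implementations and, on x86, lets ff_me_cmp_init_x86() override individual function pointers according to av_get_cpu_flags(). A hedged usage sketch follows; the helper function is hypothetical, and NULL is passed for the MpegEncContext argument because the SAD kernels above never dereference it.

#include <stdint.h>
#include "me_cmp.h"   /* assumes compilation inside libavcodec */

/* Hypothetical helper: SAD between two 16x16 blocks using whatever
 * implementation the running CPU supports. */
static int block_sad16(AVCodecContext *avctx,
                       uint8_t *cur, uint8_t *ref, int stride)
{
    MECmpContext mecc;
    ff_me_cmp_init(&mecc, avctx);   /* C defaults, then x86 overrides */
    return mecc.sad[0](NULL, cur, ref, stride, 16);
}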