From 33c752be513d09d9dd498beac02c39522d671888 Mon Sep 17 00:00:00 2001
From: James Almer
Date: Fri, 19 Sep 2014 19:16:45 -0300
Subject: x86/me_cmp: port mmxext vsad functions to yasm

Also add mmxext versions of vsad8 and vsad_intra8, and sse2 versions of
vsad16 and vsad_intra16.
Since vsad8 and vsad16 are not bitexact, they are accordingly marked as
approximate.

Reviewed-by: Michael Niedermayer
Signed-off-by: James Almer
---
Review notes (not part of the commit message):
 - line_size is declared int, so both new macros now sign-extend it with
   movsxdifnidn before lsizeq is used in address arithmetic; the removed
   inline asm obtained this through its (x86_reg) cast.
 - The .loop labels gained a trailing colon.
 - Trailing comments were added to the new asm lines.
 - The "index" hashes below still describe the originally posted blobs.

 libavcodec/x86/me_cmp.asm    | 163 +++++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/me_cmp_init.c | 133 ++++++-----------------------------
 2 files changed, 183 insertions(+), 113 deletions(-)

diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index b657642c41..95b99c48fd 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -26,6 +26,7 @@
 SECTION_RODATA
 
 cextern pb_1
+cextern pb_80
 
 SECTION .text
 
@@ -772,3 +773,165 @@ SAD_APPROX_XY2 8
 SAD_APPROX_XY2 16
 INIT_XMM sse2
 SAD_APPROX_XY2 16
+
+;--------------------------------------------------------------------
+;int ff_vsad_intra(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+;                  int line_size, int h);
+;--------------------------------------------------------------------
+; %1 = 8/16
+%macro VSAD_INTRA 1
+cglobal vsad_intra%1, 5, 5, 3, v, pix1, pix2, lsize, h
+    movsxdifnidn lsizeq, lsized      ; line_size is an int: sign-extend before 64-bit addressing
+    mova      m0, [pix1q]            ; row 0
+%if %1 == mmsize
+    mova      m2, [pix1q+lsizeq]     ; row 1, kept in m2 for the first loop comparison
+    psadbw    m0, m2                 ; m0 = SAD(row 0, row 1); m0 is the running total
+%else
+    mova      m2, [pix1q+lsizeq]     ; row 1, bytes 0-7
+    mova      m3, [pix1q+8]          ; row 0, bytes 8-15
+    mova      m4, [pix1q+lsizeq+8]   ; row 1, bytes 8-15
+    psadbw    m0, m2
+    psadbw    m3, m4
+    paddw     m0, m3                 ; merge both halves into the running total
+%endif
+    sub       hd, 2                  ; two rows consumed so far
+
+.loop:
+    lea       pix1q, [pix1q + 2*lsizeq]  ; advance two rows per iteration
+%if %1 == mmsize
+    mova      m1, [pix1q]            ; next row
+    psadbw    m2, m1                 ; SAD(previous row, next row)
+    paddw     m0, m2
+    mova      m2, [pix1q+lsizeq]     ; row after that; stays in m2 for the next iteration
+    psadbw    m1, m2
+    paddw     m0, m1
+%else
+    mova      m1, [pix1q]            ; next row, bytes 0-7
+    mova      m3, [pix1q+8]          ; next row, bytes 8-15
+    psadbw    m2, m1                 ; m2/m4 still hold the previous row
+    psadbw    m4, m3
+    paddw     m0, m2
+    paddw     m0, m4
+    mova      m2, [pix1q+lsizeq]     ; row after that; m2/m4 carry over to the next iteration
+    mova      m4, [pix1q+lsizeq+8]
+    psadbw    m1, m2
+    psadbw    m3, m4
+    paddw     m0, m1
+    paddw     m0, m3
+%endif
+    sub       hd, 2
+    jg .loop
+
+%if mmsize == 16
+    pshufd    m1, m0, 0xe            ; bring the upper qword's partial sum down
+    paddd     m0, m1                 ; and add it to the lower one
+%endif
+    movd      eax, m0                ; return the accumulated sum
+    RET
+%endmacro
+
+INIT_MMX mmxext
+VSAD_INTRA 8
+VSAD_INTRA 16
+INIT_XMM sse2
+VSAD_INTRA 16
+
+;---------------------------------------------------------------------
+;int ff_vsad_approx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+;                   int line_size, int h);
+;---------------------------------------------------------------------
+; %1 = 8/16
+%macro VSAD_APPROX 1
+cglobal vsad%1_approx, 5, 5, 5, v, pix1, pix2, lsize, h
+    movsxdifnidn lsizeq, lsized      ; line_size is an int: sign-extend before 64-bit addressing
+    mova      m1, [pb_80]            ; bias constant, defined elsewhere (presumably 0x80 in every byte)
+    mova      m0, [pix1q]            ; row 0 of pix1
+%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
+    mova      m4, [pix1q+lsizeq]     ; row 1 of pix1
+%if mmsize == 16
+    movu      m3, [pix2q]            ; pix2 is read with unaligned loads in the sse2 version
+    movu      m2, [pix2q+lsizeq]
+    psubb     m0, m3                 ; wrapping byte difference pix1 - pix2
+    psubb     m4, m2
+%else
+    psubb     m0, [pix2q]            ; wrapping byte difference pix1 - pix2
+    psubb     m4, [pix2q+lsizeq]
+%endif
+    pxor      m0, m1                 ; xor each difference with the bias constant
+    pxor      m4, m1
+    psadbw    m0, m4                 ; m0 = SAD of the two biased rows; running total
+%else ; vsad16_mmxext
+    mova      m3, [pix1q+8]          ; row 0, bytes 8-15
+    psubb     m0, [pix2q]
+    psubb     m3, [pix2q+8]
+    pxor      m0, m1
+    pxor      m3, m1
+    mova      m4, [pix1q+lsizeq]     ; row 1, both halves; m4/m5 carry into the loop
+    mova      m5, [pix1q+lsizeq+8]
+    psubb     m4, [pix2q+lsizeq]
+    psubb     m5, [pix2q+lsizeq+8]
+    pxor      m4, m1
+    pxor      m5, m1
+    psadbw    m0, m4
+    psadbw    m3, m5
+    paddw     m0, m3                 ; merge both halves into the running total
+%endif
+    sub       hd, 2                  ; two rows consumed so far
+
+.loop:
+    lea       pix1q, [pix1q + 2*lsizeq]  ; advance both sources two rows per iteration
+    lea       pix2q, [pix2q + 2*lsizeq]
+    mova      m2, [pix1q]            ; next row of pix1
+%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
+%if mmsize == 16
+    movu      m3, [pix2q]
+    psubb     m2, m3
+%else
+    psubb     m2, [pix2q]
+%endif
+    pxor      m2, m1
+    psadbw    m4, m2                 ; m4 still holds the previous biased row
+    paddw     m0, m4
+    mova      m4, [pix1q+lsizeq]     ; row after that; stays in m4 for the next iteration
+    movu      m3, [pix2q+lsizeq]
+    psubb     m4, m3
+    pxor      m4, m1
+    psadbw    m2, m4
+    paddw     m0, m2
+%else ; vsad16_mmxext
+    mova      m3, [pix1q+8]          ; next row, bytes 8-15
+    psubb     m2, [pix2q]
+    psubb     m3, [pix2q+8]
+    pxor      m2, m1
+    pxor      m3, m1
+    psadbw    m4, m2                 ; m4/m5 still hold the previous biased row
+    psadbw    m5, m3
+    paddw     m0, m4
+    paddw     m0, m5
+    mova      m4, [pix1q+lsizeq]     ; row after that; m4/m5 carry over to the next iteration
+    mova      m5, [pix1q+lsizeq+8]
+    psubb     m4, [pix2q+lsizeq]
+    psubb     m5, [pix2q+lsizeq+8]
+    pxor      m4, m1
+    pxor      m5, m1
+    psadbw    m2, m4
+    psadbw    m3, m5
+    paddw     m0, m2
+    paddw     m0, m3
+%endif
+    sub       hd, 2
+    jg .loop
+
+%if mmsize == 16
+    pshufd    m1, m0, 0xe            ; bring the upper qword's partial sum down
+    paddd     m0, m1                 ; and add it to the lower one
+%endif
+    movd      eax, m0                ; return the accumulated sum
+    RET
+%endmacro
+
+INIT_MMX mmxext
+VSAD_APPROX 8
+VSAD_APPROX 16
+INIT_XMM sse2
+VSAD_APPROX 16
diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
index cb47d631f9..6dc59f5aa4 100644
--- a/libavcodec/x86/me_cmp_init.c
+++ b/libavcodec/x86/me_cmp_init.c
@@ -65,6 +65,18 @@ int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                                int stride, int h);
 int ff_sad16_approx_xy2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                              int stride, int h);
+int ff_vsad_intra8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                          int line_size, int h);
+int ff_vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                           int line_size, int h);
+int ff_vsad_intra16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                         int line_size, int h);
+int ff_vsad8_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                           int line_size, int h);
+int ff_vsad16_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                            int line_size, int h);
+int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
+                          int line_size, int h);
 
 #define hadamard_func(cpu)                                              \
     int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,     \
@@ -177,49 +189,6 @@ static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
 }
 #undef SUM
 
-static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
-                               int line_size, int h)
-{
-    int tmp;
-
-    av_assert2((((int) pix) & 7) == 0);
-    av_assert2((line_size & 7) == 0);
-
-#define SUM(in0, in1, out0, out1)               \
-    "movq (%0), " #out0 "\n"                    \
-    "movq 8(%0), " #out1 "\n"                   \
-    "add %2, %0\n"                              \
-    "psadbw " #out0 ", " #in0 "\n"              \
-    "psadbw " #out1 ", " #in1 "\n"              \
-    "paddw " #in1 ", " #in0 "\n"                \
-    "paddw " #in0 ", %%mm6\n"
-
-    __asm__ volatile (
-        "movl %3, %%ecx\n"
-        "pxor %%mm6, %%mm6\n"
-        "pxor %%mm7, %%mm7\n"
-        "movq (%0), %%mm0\n"
-        "movq 8(%0), %%mm1\n"
-        "add %2, %0\n"
-        "jmp 2f\n"
-        "1:\n"
-
-        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
-        "2:\n"
-        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
-
-        "subl $2, %%ecx\n"
-        "jnz 1b\n"
-
-        "movd %%mm6, %1\n"
-        : "+r" (pix), "=r" (tmp)
-        : "r" ((x86_reg) line_size), "m" (h)
-        : "%ecx");
-
-    return tmp;
-}
-#undef SUM
-
 static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       int line_size, int h)
 {
@@ -301,68 +270,6 @@ static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
 }
 #undef SUM
 
-static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
-                         int line_size, int h)
-{
-    int tmp;
-
-    av_assert2((((int) pix1) & 7) == 0);
-    av_assert2((((int) pix2) & 7) == 0);
-    av_assert2((line_size & 7) == 0);
-
-#define SUM(in0, in1, out0, out1)               \
-    "movq (%0), " #out0 "\n"                    \
-    "movq (%1), %%mm2\n"                        \
-    "movq 8(%0), " #out1 "\n"                   \
-    "movq 8(%1), %%mm3\n"                       \
-    "add %3, %0\n"                              \
-    "add %3, %1\n"                              \
-    "psubb %%mm2, " #out0 "\n"                  \
-    "psubb %%mm3, " #out1 "\n"                  \
-    "pxor %%mm7, " #out0 "\n"                   \
-    "pxor %%mm7, " #out1 "\n"                   \
-    "psadbw " #out0 ", " #in0 "\n"              \
-    "psadbw " #out1 ", " #in1 "\n"              \
-    "paddw " #in1 ", " #in0 "\n"                \
-    "paddw " #in0 ", %%mm6\n "
-
-    __asm__ volatile (
-        "movl %4, %%ecx\n"
-        "pxor %%mm6, %%mm6\n"
-        "pcmpeqw %%mm7, %%mm7\n"
-        "psllw $15, %%mm7\n"
-        "packsswb %%mm7, %%mm7\n"
-        "movq (%0), %%mm0\n"
-        "movq (%1), %%mm2\n"
-        "movq 8(%0), %%mm1\n"
-        "movq 8(%1), %%mm3\n"
-        "add %3, %0\n"
-        "add %3, %1\n"
-        "psubb %%mm2, %%mm0\n"
-        "psubb %%mm3, %%mm1\n"
-        "pxor %%mm7, %%mm0\n"
-        "pxor %%mm7, %%mm1\n"
-        "jmp 2f\n"
-        "1:\n"
-
-        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
-        "2:\n"
-        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
-
-        "subl $2, %%ecx\n"
-        "jnz 1b\n"
-
-        "movd %%mm6, %2\n"
-        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
-        : "r" ((x86_reg) line_size), "m" (h)
-        : "%ecx");
-
-    return tmp;
-}
-#undef SUM
-
-
-
 DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
     0x0000000000000000ULL,
     0x0001000100010001ULL,
@@ -667,14 +574,6 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
         }
     }
 
-    if (INLINE_MMXEXT(cpu_flags)) {
-        c->vsad[4] = vsad_intra16_mmxext;
-
-        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
-            c->vsad[0] = vsad16_mmxext;
-        }
-    }
-
 #endif /* HAVE_INLINE_ASM */
 
     if (EXTERNAL_MMX(cpu_flags)) {
@@ -704,9 +603,15 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
         c->pix_abs[1][1] = ff_sad8_x2_mmxext;
         c->pix_abs[1][2] = ff_sad8_y2_mmxext;
 
+        c->vsad[4] = ff_vsad_intra16_mmxext;
+        c->vsad[5] = ff_vsad_intra8_mmxext;
+
         if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
             c->pix_abs[0][3] = ff_sad16_approx_xy2_mmxext;
             c->pix_abs[1][3] = ff_sad8_approx_xy2_mmxext;
+
+            c->vsad[0] = ff_vsad16_approx_mmxext;
+            c->vsad[1] = ff_vsad8_approx_mmxext;
         }
     }
 
@@ -724,8 +629,10 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
         c->pix_abs[0][1] = ff_sad16_x2_sse2;
         c->pix_abs[0][2] = ff_sad16_y2_sse2;
 
+        c->vsad[4] = ff_vsad_intra16_sse2;
         if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
             c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
+            c->vsad[0] = ff_vsad16_approx_sse2;
         }
     }
 }
-- 
cgit v1.2.3