From 2734ba787b4a2cbc44bbc6499ae82013c790f453 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Thu, 26 Jul 2012 22:07:29 -0700 Subject: vp56: port x86 simd to cpuflags. --- libavcodec/x86/vp56dsp.asm | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) (limited to 'libavcodec/x86') diff --git a/libavcodec/x86/vp56dsp.asm b/libavcodec/x86/vp56dsp.asm index 66a97f1593..27a82bccab 100644 --- a/libavcodec/x86/vp56dsp.asm +++ b/libavcodec/x86/vp56dsp.asm @@ -27,7 +27,8 @@ cextern pw_64 SECTION .text -%macro DIAG4_MMX 6 +%macro DIAG4 6 +%if mmsize == 8 movq m0, [%1+%2] movq m1, [%1+%3] movq m3, m0 @@ -64,9 +65,7 @@ SECTION .text psraw m3, 7 packuswb m0, m3 movq [%6], m0 -%endmacro - -%macro DIAG4_SSE2 6 +%else ; mmsize == 16 movq m0, [%1+%2] movq m1, [%1+%3] punpcklbw m0, m7 @@ -86,9 +85,11 @@ SECTION .text psraw m0, 7 packuswb m0, m0 movq [%6], m0 +%endif ; mmsize == 8/16 %endmacro -%macro SPLAT4REGS_MMX 0 +%macro SPLAT4REGS 0 +%if mmsize == 8 movq m5, m3 punpcklwd m3, m3 movq m4, m3 @@ -102,9 +103,7 @@ SECTION .text movq [rsp+8*12], m4 movq [rsp+8*13], m5 movq [rsp+8*14], m2 -%endmacro - -%macro SPLAT4REGS_SSE2 0 +%else ; mmsize == 16 pshuflw m4, m3, 0x0 pshuflw m5, m3, 0x55 pshuflw m6, m3, 0xAA @@ -113,15 +112,16 @@ SECTION .text punpcklqdq m5, m5 punpcklqdq m6, m6 punpcklqdq m3, m3 +%endif ; mmsize == 8/16 %endmacro -%macro vp6_filter_diag4 2 +%macro vp6_filter_diag4 0 ; void ff_vp6_filter_diag4_(uint8_t *dst, uint8_t *src, int stride, ; const int16_t h_weight[4], const int16_t v_weights[4]) -cglobal vp6_filter_diag4_%1, 5, 7, %2 +cglobal vp6_filter_diag4, 5, 7, 8 mov r5, rsp ; backup stack pointer and rsp, ~(mmsize-1) ; align stack -%ifidn %1, sse2 +%if mmsize == 16 sub rsp, 8*11 %else sub rsp, 8*15 @@ -162,12 +162,8 @@ cglobal vp6_filter_diag4_%1, 5, 7, %2 RET %endmacro -INIT_MMX -%define DIAG4 DIAG4_MMX -%define SPLAT4REGS SPLAT4REGS_MMX -vp6_filter_diag4 mmx, 0 +INIT_MMX mmx +vp6_filter_diag4 -INIT_XMM -%define DIAG4 DIAG4_SSE2 -%define SPLAT4REGS SPLAT4REGS_SSE2 -vp6_filter_diag4 sse2, 8 +INIT_XMM sse2 +vp6_filter_diag4 -- cgit v1.2.3 From 158744a4cd63a8dce2060b366ae7b6509351d6c8 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Thu, 26 Jul 2012 22:09:46 -0700 Subject: vp56: only compile MMX SIMD on x86-32. All x86-64 CPUs have SSE2, so the MMX version will never be used. This leads to smaller binaries. --- libavcodec/x86/vp56dsp.asm | 2 ++ libavcodec/x86/vp56dsp_init.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'libavcodec/x86') diff --git a/libavcodec/x86/vp56dsp.asm b/libavcodec/x86/vp56dsp.asm index 27a82bccab..ca4d97ec15 100644 --- a/libavcodec/x86/vp56dsp.asm +++ b/libavcodec/x86/vp56dsp.asm @@ -162,8 +162,10 @@ cglobal vp6_filter_diag4, 5, 7, 8 RET %endmacro +%if ARCH_X86_32 INIT_MMX mmx vp6_filter_diag4 +%endif INIT_XMM sse2 vp6_filter_diag4 diff --git a/libavcodec/x86/vp56dsp_init.c b/libavcodec/x86/vp56dsp_init.c index 29892812ac..ae04440611 100644 --- a/libavcodec/x86/vp56dsp_init.c +++ b/libavcodec/x86/vp56dsp_init.c @@ -36,9 +36,11 @@ av_cold void ff_vp56dsp_init_x86(VP56DSPContext* c, enum CodecID codec) int mm_flags = av_get_cpu_flags(); if (CONFIG_VP6_DECODER && codec == CODEC_ID_VP6) { +#if ARCH_X86_32 if (mm_flags & AV_CPU_FLAG_MMX) { c->vp6_filter_diag4 = ff_vp6_filter_diag4_mmx; } +#endif if (mm_flags & AV_CPU_FLAG_SSE2) { c->vp6_filter_diag4 = ff_vp6_filter_diag4_sse2; -- cgit v1.2.3 From 76888c64b008bc3acf6e5fe5117a360f2c87aae4 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Thu, 26 Jul 2012 22:19:19 -0700 Subject: rv34: port x86 SIMD to cpuflags. --- libavcodec/x86/rv34dsp.asm | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'libavcodec/x86') diff --git a/libavcodec/x86/rv34dsp.asm b/libavcodec/x86/rv34dsp.asm index 32bcdced8a..c43b77abd2 100644 --- a/libavcodec/x86/rv34dsp.asm +++ b/libavcodec/x86/rv34dsp.asm @@ -46,7 +46,7 @@ SECTION .text %endmacro %macro rv34_idct 1 -cglobal rv34_idct_%1_mmx2, 1, 2, 0 +cglobal rv34_idct_%1, 1, 2, 0 movsx r1, word [r0] IDCT_DC r1 movd m0, r1 @@ -58,14 +58,15 @@ cglobal rv34_idct_%1_mmx2, 1, 2, 0 REP_RET %endmacro -INIT_MMX +INIT_MMX mmx2 %define IDCT_DC IDCT_DC_ROUND rv34_idct dc %define IDCT_DC IDCT_DC_NOROUND rv34_idct dc_noround ; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc); -cglobal rv34_idct_dc_add_mmx, 3, 3 +INIT_MMX mmx +cglobal rv34_idct_dc_add, 3, 3 ; calculate DC IDCT_DC_ROUND r2 pxor m1, m1 @@ -167,8 +168,8 @@ cglobal rv34_idct_add, 3,3,0, d, s, b ret ; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc); -INIT_XMM -cglobal rv34_idct_dc_add_sse4, 3, 3, 6 +INIT_XMM sse4 +cglobal rv34_idct_dc_add, 3, 3, 6 ; load data IDCT_DC_ROUND r2 pxor m1, m1 -- cgit v1.2.3 From 4a26fdd8520d5ad7ea6458854610521bbda880d5 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Fri, 27 Jul 2012 15:17:27 -0700 Subject: vp3: port x86 SIMD to cpuflags. --- libavcodec/x86/vp3dsp.asm | 94 +++++++++++++++++++++++------------------------ 1 file changed, 47 insertions(+), 47 deletions(-) (limited to 'libavcodec/x86') diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm index af2f60c6ae..5877520c6c 100644 --- a/libavcodec/x86/vp3dsp.asm +++ b/libavcodec/x86/vp3dsp.asm @@ -102,8 +102,8 @@ SECTION .text mov [r0+r3 -1], r2w %endmacro -INIT_MMX -cglobal vp3_v_loop_filter_mmx2, 3, 4 +INIT_MMX mmx2 +cglobal vp3_v_loop_filter, 3, 4 %if ARCH_X86_64 movsxd r1, r1d %endif @@ -120,7 +120,7 @@ cglobal vp3_v_loop_filter_mmx2, 3, 4 movq [r0 ], m3 RET -cglobal vp3_h_loop_filter_mmx2, 3, 4 +cglobal vp3_h_loop_filter, 3, 4 %if ARCH_X86_64 movsxd r1, r1d %endif @@ -354,38 +354,6 @@ cglobal vp3_h_loop_filter_mmx2, 3, 4 movq I(2), m2 %endmacro -%macro VP3_IDCT_mmx 1 - ; eax = quantized input - ; ebx = dequantizer matrix - ; ecx = IDCT constants - ; M(I) = ecx + MaskOffset(0) + I * 8 - ; C(I) = ecx + CosineOffset(32) + (I-1) * 8 - ; edx = output - ; r0..r7 = mm0..mm7 -%define OC_8 [pw_8] -%define C(x) [vp3_idct_data+16*(x-1)] - - ; at this point, function has completed dequantization + dezigzag + - ; partial transposition; now do the idct itself -%define I(x) [%1+16* x ] -%define J(x) [%1+16*(x-4)+8] - RowIDCT - Transpose - -%define I(x) [%1+16* x +64] -%define J(x) [%1+16*(x-4)+72] - RowIDCT - Transpose - -%define I(x) [%1+16*x] -%define J(x) [%1+16*x] - ColumnIDCT - -%define I(x) [%1+16*x+8] -%define J(x) [%1+16*x+8] - ColumnIDCT -%endmacro - %macro VP3_1D_IDCT_SSE2 0 movdqa m2, I(3) ; xmm2 = i3 movdqa m6, C(3) ; xmm6 = c3 @@ -501,7 +469,8 @@ cglobal vp3_h_loop_filter_mmx2, 3, 4 movdqa O(7), m%8 %endmacro -%macro VP3_IDCT_sse2 1 +%macro VP3_IDCT 1 +%if mmsize == 16 %define I(x) [%1+16*x] %define O(x) [%1+16*x] %define C(x) [vp3_idct_data+16*(x-1)] @@ -519,11 +488,42 @@ cglobal vp3_h_loop_filter_mmx2, 3, 4 %define ADD(x) paddsw x, [pw_8] VP3_1D_IDCT_SSE2 PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7 +%else ; mmsize == 8 + ; eax = quantized input + ; ebx = dequantizer matrix + ; ecx = IDCT constants + ; M(I) = ecx + MaskOffset(0) + I * 8 + ; C(I) = ecx + CosineOffset(32) + (I-1) * 8 + ; edx = output + ; r0..r7 = mm0..mm7 +%define OC_8 [pw_8] +%define C(x) [vp3_idct_data+16*(x-1)] + + ; at this point, function has completed dequantization + dezigzag + + ; partial transposition; now do the idct itself +%define I(x) [%1+16* x ] +%define J(x) [%1+16*(x-4)+8] + RowIDCT + Transpose + +%define I(x) [%1+16* x +64] +%define J(x) [%1+16*(x-4)+72] + RowIDCT + Transpose + +%define I(x) [%1+16*x] +%define J(x) [%1+16*x] + ColumnIDCT + +%define I(x) [%1+16*x+8] +%define J(x) [%1+16*x+8] + ColumnIDCT +%endif ; mmsize == 16/8 %endmacro -%macro vp3_idct_funcs 1 -cglobal vp3_idct_put_%1, 3, 4, 9 - VP3_IDCT_%1 r2 +%macro vp3_idct_funcs 0 +cglobal vp3_idct_put, 3, 4, 9 + VP3_IDCT r2 movsxdifnidn r1, r1d mova m4, [pb_80] @@ -565,8 +565,8 @@ cglobal vp3_idct_put_%1, 3, 4, 9 %endrep RET -cglobal vp3_idct_add_%1, 3, 4, 9 - VP3_IDCT_%1 r2 +cglobal vp3_idct_add, 3, 4, 9 + VP3_IDCT r2 mov r3, 4 pxor m4, m4 @@ -607,10 +607,10 @@ cglobal vp3_idct_add_%1, 3, 4, 9 RET %endmacro -INIT_MMX -vp3_idct_funcs mmx -INIT_XMM -vp3_idct_funcs sse2 +INIT_MMX mmx +vp3_idct_funcs +INIT_XMM sse2 +vp3_idct_funcs %macro DC_ADD 0 movq m2, [r0 ] @@ -631,8 +631,8 @@ vp3_idct_funcs sse2 movq [r0+r3 ], m5 %endmacro -INIT_MMX -cglobal vp3_idct_dc_add_mmx2, 3, 4 +INIT_MMX mmx2 +cglobal vp3_idct_dc_add, 3, 4 %if ARCH_X86_64 movsxd r1, r1d %endif -- cgit v1.2.3 From d07ff3cd5a31fad25e2fc89ce8ef98da144c0ee6 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Thu, 26 Jul 2012 20:43:50 -0700 Subject: h264_chromamc_10bit: port x86 simd to cpuflags. --- libavcodec/x86/dsputil_mmx.c | 16 +++++++------- libavcodec/x86/h264_chromamc_10bit.asm | 40 +++++++++++++++++----------------- 2 files changed, 28 insertions(+), 28 deletions(-) (limited to 'libavcodec/x86') diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index e91ede531e..afbb5312b8 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -2117,10 +2117,10 @@ void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \ (uint8_t *dst, uint8_t *src, \ int stride, int h, int x, int y); -CHROMA_MC(put, 2, 10, mmxext) -CHROMA_MC(avg, 2, 10, mmxext) -CHROMA_MC(put, 4, 10, mmxext) -CHROMA_MC(avg, 4, 10, mmxext) +CHROMA_MC(put, 2, 10, mmx2) +CHROMA_MC(avg, 2, 10, mmx2) +CHROMA_MC(put, 4, 10, mmx2) +CHROMA_MC(avg, 4, 10, mmx2) CHROMA_MC(put, 8, 10, sse2) CHROMA_MC(avg, 8, 10, sse2) CHROMA_MC(put, 8, 10, avx) @@ -2740,10 +2740,10 @@ static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx, c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmx2; } if (bit_depth == 10 && CONFIG_H264CHROMA) { - c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext; - c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext; - c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext; - c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext; + c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmx2; + c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmx2; + c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmx2; + c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmx2; } c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2; diff --git a/libavcodec/x86/h264_chromamc_10bit.asm b/libavcodec/x86/h264_chromamc_10bit.asm index 3f7c513069..370c7b5a46 100644 --- a/libavcodec/x86/h264_chromamc_10bit.asm +++ b/libavcodec/x86/h264_chromamc_10bit.asm @@ -60,10 +60,10 @@ SECTION .text ;----------------------------------------------------------------------------- ; void put/avg_h264_chroma_mc8(pixel *dst, pixel *src, int stride, int h, int mx, int my) ;----------------------------------------------------------------------------- -%macro CHROMA_MC8 2 +%macro CHROMA_MC8 1 ; put/avg_h264_chroma_mc8_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/, ; int stride, int h, int mx, int my) -cglobal %1_h264_chroma_mc8_10_%2, 6,7,8 +cglobal %1_h264_chroma_mc8_10, 6,7,8 movsxdifnidn r2, r2d mov r6d, r5d or r6d, r4d @@ -173,8 +173,8 @@ cglobal %1_h264_chroma_mc8_10_%2, 6,7,8 add r0, r2 %endmacro -%macro CHROMA_MC4 2 -cglobal %1_h264_chroma_mc4_10_%2, 6,6,7 +%macro CHROMA_MC4 1 +cglobal %1_h264_chroma_mc4_10, 6,6,7 movsxdifnidn r2, r2d movd m2, r4m ; x movd m3, r5m ; y @@ -203,8 +203,8 @@ cglobal %1_h264_chroma_mc4_10_%2, 6,6,7 ;----------------------------------------------------------------------------- ; void put/avg_h264_chroma_mc2(pixel *dst, pixel *src, int stride, int h, int mx, int my) ;----------------------------------------------------------------------------- -%macro CHROMA_MC2 2 -cglobal %1_h264_chroma_mc2_10_%2, 6,7 +%macro CHROMA_MC2 1 +cglobal %1_h264_chroma_mc2_10, 6,7 movsxdifnidn r2, r2d mov r6d, r4d shl r4d, 16 @@ -250,24 +250,24 @@ cglobal %1_h264_chroma_mc2_10_%2, 6,7 %endmacro %define CHROMAMC_AVG NOTHING -INIT_XMM -CHROMA_MC8 put, sse2 +INIT_XMM sse2 +CHROMA_MC8 put %if HAVE_AVX -INIT_AVX -CHROMA_MC8 put, avx +INIT_XMM avx +CHROMA_MC8 put %endif -INIT_MMX -CHROMA_MC4 put, mmxext -CHROMA_MC2 put, mmxext +INIT_MMX mmx2 +CHROMA_MC4 put +CHROMA_MC2 put %define CHROMAMC_AVG AVG %define PAVG pavgw -INIT_XMM -CHROMA_MC8 avg, sse2 +INIT_XMM sse2 +CHROMA_MC8 avg %if HAVE_AVX -INIT_AVX -CHROMA_MC8 avg, avx +INIT_XMM avx +CHROMA_MC8 avg %endif -INIT_MMX -CHROMA_MC4 avg, mmxext -CHROMA_MC2 avg, mmxext +INIT_MMX mmx2 +CHROMA_MC4 avg +CHROMA_MC2 avg -- cgit v1.2.3 From a5bbb1242c494fad504f2b6ab2816f0268adb03a Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Fri, 27 Jul 2012 17:45:30 -0700 Subject: h264_loopfilter: port x86 simd to cpuflags. --- libavcodec/x86/h264_deblock.asm | 104 +++++++++++++++++----------------- libavcodec/x86/h264_deblock_10bit.asm | 77 +++++++++++++------------ libavcodec/x86/h264dsp_mmx.c | 60 ++++++++++---------- 3 files changed, 120 insertions(+), 121 deletions(-) (limited to 'libavcodec/x86') diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm index 1982dc4bd3..0891ef33da 100644 --- a/libavcodec/x86/h264_deblock.asm +++ b/libavcodec/x86/h264_deblock.asm @@ -282,8 +282,8 @@ cextern pb_A1 ;----------------------------------------------------------------------------- ; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -%macro DEBLOCK_LUMA 1 -cglobal deblock_v_luma_8_%1, 5,5,10 +%macro DEBLOCK_LUMA 0 +cglobal deblock_v_luma_8, 5,5,10 movd m8, [r4] ; tc0 lea r4, [r1*3] dec r2d ; alpha-1 @@ -327,8 +327,8 @@ cglobal deblock_v_luma_8_%1, 5,5,10 ;----------------------------------------------------------------------------- ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -INIT_MMX -cglobal deblock_h_luma_8_%1, 5,9 +INIT_MMX cpuname +cglobal deblock_h_luma_8, 5,9 movsxd r7, r1d lea r8, [r7+r7*2] lea r6, [r0-4] @@ -355,7 +355,7 @@ cglobal deblock_h_luma_8_%1, 5,9 %if WIN64 mov [rsp+0x20], r4 %endif - call deblock_v_luma_8_%1 + call deblock_v_luma_8 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) add r6, 2 @@ -384,24 +384,24 @@ cglobal deblock_h_luma_8_%1, 5,9 RET %endmacro -INIT_XMM -DEBLOCK_LUMA sse2 -INIT_AVX -DEBLOCK_LUMA avx +INIT_XMM sse2 +DEBLOCK_LUMA +INIT_XMM avx +DEBLOCK_LUMA %else -%macro DEBLOCK_LUMA 3 +%macro DEBLOCK_LUMA 2 ;----------------------------------------------------------------------------- ; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal deblock_%2_luma_8_%1, 5,5 +cglobal deblock_%1_luma_8, 5,5 lea r4, [r1*3] dec r2 ; alpha-1 neg r4 dec r3 ; beta-1 add r4, r0 ; pix-3*stride - %assign pad 2*%3+12-(stack_offset&15) + %assign pad 2*%2+12-(stack_offset&15) SUB esp, pad mova m0, [r4+r1] ; p1 @@ -415,7 +415,7 @@ cglobal deblock_%2_luma_8_%1, 5,5 movd m4, [r3] ; tc0 punpcklbw m4, m4 punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] - mova [esp+%3], m4 ; tc + mova [esp+%2], m4 ; tc pcmpgtb m4, m3 mova m3, [r4] ; p2 pand m4, m7 @@ -423,7 +423,7 @@ cglobal deblock_%2_luma_8_%1, 5,5 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 pand m6, m4 - pand m4, [esp+%3] ; tc + pand m4, [esp+%2] ; tc psubb m7, m4, m6 pand m6, m4 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 @@ -431,7 +431,7 @@ cglobal deblock_%2_luma_8_%1, 5,5 mova m4, [r0+2*r1] ; q2 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 pand m6, [esp] ; mask - mova m5, [esp+%3] ; tc + mova m5, [esp+%2] ; tc psubb m7, m6 pand m5, m6 mova m3, [r0+r1] @@ -446,8 +446,8 @@ cglobal deblock_%2_luma_8_%1, 5,5 ;----------------------------------------------------------------------------- ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -INIT_MMX -cglobal deblock_h_luma_8_%1, 0,5 +INIT_MMX cpuname +cglobal deblock_h_luma_8, 0,5 mov r0, r0mp mov r3, r1m lea r4, [r3*3] @@ -470,11 +470,11 @@ cglobal deblock_h_luma_8_%1, 0,5 PUSH dword r2m PUSH dword 16 PUSH dword r0 - call deblock_%2_luma_8_%1 -%ifidn %2, v8 + call deblock_%1_luma_8 +%ifidn %1, v8 add dword [esp ], 8 ; pix_tmp+0x38 add dword [esp+16], 2 ; tc0+2 - call deblock_%2_luma_8_%1 + call deblock_%1_luma_8 %endif ADD esp, 20 @@ -501,12 +501,12 @@ cglobal deblock_h_luma_8_%1, 0,5 RET %endmacro ; DEBLOCK_LUMA -INIT_MMX -DEBLOCK_LUMA mmxext, v8, 8 -INIT_XMM -DEBLOCK_LUMA sse2, v, 16 -INIT_AVX -DEBLOCK_LUMA avx, v, 16 +INIT_MMX mmx2 +DEBLOCK_LUMA v8, 8 +INIT_XMM sse2 +DEBLOCK_LUMA v, 16 +INIT_XMM avx +DEBLOCK_LUMA v, 16 %endif ; ARCH @@ -608,7 +608,7 @@ DEBLOCK_LUMA avx, v, 16 %define mask1p mask1q %endmacro -%macro DEBLOCK_LUMA_INTRA 2 +%macro DEBLOCK_LUMA_INTRA 1 %define p1 m0 %define p0 m1 %define q0 m2 @@ -643,7 +643,7 @@ DEBLOCK_LUMA avx, v, 16 ;----------------------------------------------------------------------------- ; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_%2_luma_intra_8_%1, 4,6,16 +cglobal deblock_%1_luma_intra_8, 4,6,16 %if ARCH_X86_64 == 0 sub esp, 0x60 %endif @@ -700,12 +700,12 @@ cglobal deblock_%2_luma_intra_8_%1, 4,6,16 %endif RET -INIT_MMX +INIT_MMX cpuname %if ARCH_X86_64 ;----------------------------------------------------------------------------- ; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_h_luma_intra_8_%1, 4,9 +cglobal deblock_h_luma_intra_8, 4,9 movsxd r7, r1d lea r8, [r7*3] lea r6, [r0-4] @@ -721,7 +721,7 @@ cglobal deblock_h_luma_intra_8_%1, 4,9 lea r0, [pix_tmp+0x40] mov r1, 0x10 - call deblock_v_luma_intra_8_%1 + call deblock_v_luma_intra_8 ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) lea r5, [r6+r8] @@ -734,7 +734,7 @@ cglobal deblock_h_luma_intra_8_%1, 4,9 add rsp, 0x88 RET %else -cglobal deblock_h_luma_intra_8_%1, 2,4 +cglobal deblock_h_luma_intra_8, 2,4 lea r3, [r1*3] sub r0, 4 lea r2, [r0+r3] @@ -753,10 +753,10 @@ cglobal deblock_h_luma_intra_8_%1, 2,4 PUSH dword r2m PUSH dword 16 PUSH r0 - call deblock_%2_luma_intra_8_%1 -%ifidn %2, v8 + call deblock_%1_luma_intra_8 +%ifidn %1, v8 add dword [rsp], 8 ; pix_tmp+8 - call deblock_%2_luma_intra_8_%1 + call deblock_%1_luma_intra_8 %endif ADD esp, 16 @@ -775,16 +775,16 @@ cglobal deblock_h_luma_intra_8_%1, 2,4 %endif ; ARCH_X86_64 %endmacro ; DEBLOCK_LUMA_INTRA -INIT_XMM -DEBLOCK_LUMA_INTRA sse2, v -INIT_AVX -DEBLOCK_LUMA_INTRA avx , v +INIT_XMM sse2 +DEBLOCK_LUMA_INTRA v +INIT_XMM avx +DEBLOCK_LUMA_INTRA v %if ARCH_X86_64 == 0 -INIT_MMX -DEBLOCK_LUMA_INTRA mmxext, v8 +INIT_MMX mmx2 +DEBLOCK_LUMA_INTRA v8 %endif -INIT_MMX +INIT_MMX mmx2 %macro CHROMA_V_START 0 dec r2d ; alpha-1 @@ -809,13 +809,13 @@ INIT_MMX ;----------------------------------------------------------------------------- ; void ff_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal deblock_v_chroma_8_mmxext, 5,6 +cglobal deblock_v_chroma_8, 5,6 CHROMA_V_START movq m0, [t5] movq m1, [t5+r1] movq m2, [r0] movq m3, [r0+r1] - call ff_chroma_inter_body_mmxext + call ff_chroma_inter_body_mmx2 movq [t5+r1], m1 movq [r0], m2 RET @@ -823,7 +823,7 @@ cglobal deblock_v_chroma_8_mmxext, 5,6 ;----------------------------------------------------------------------------- ; void ff_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal deblock_h_chroma_8_mmxext, 5,7 +cglobal deblock_h_chroma_8, 5,7 %if UNIX64 %define buf0 [rsp-24] %define buf1 [rsp-16] @@ -839,7 +839,7 @@ cglobal deblock_h_chroma_8_mmxext, 5,7 TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6) movq buf0, m0 movq buf1, m3 - call ff_chroma_inter_body_mmxext + call ff_chroma_inter_body_mmx2 movq m0, buf0 movq m3, buf1 TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) @@ -849,7 +849,7 @@ cglobal deblock_h_chroma_8_mmxext, 5,7 RET ALIGN 16 -ff_chroma_inter_body_mmxext: +ff_chroma_inter_body_mmx2: LOAD_MASK r2d, r3d movd m6, [r4] ; tc0 punpcklbw m6, m6 @@ -876,13 +876,13 @@ ff_chroma_inter_body_mmxext: ;----------------------------------------------------------------------------- ; void ff_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_v_chroma_intra_8_mmxext, 4,5 +cglobal deblock_v_chroma_intra_8, 4,5 CHROMA_V_START movq m0, [t5] movq m1, [t5+r1] movq m2, [r0] movq m3, [r0+r1] - call ff_chroma_intra_body_mmxext + call ff_chroma_intra_body_mmx2 movq [t5+r1], m1 movq [r0], m2 RET @@ -890,15 +890,15 @@ cglobal deblock_v_chroma_intra_8_mmxext, 4,5 ;----------------------------------------------------------------------------- ; void ff_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_h_chroma_intra_8_mmxext, 4,6 +cglobal deblock_h_chroma_intra_8, 4,6 CHROMA_H_START TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6) - call ff_chroma_intra_body_mmxext + call ff_chroma_intra_body_mmx2 TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) RET ALIGN 16 -ff_chroma_intra_body_mmxext: +ff_chroma_intra_body_mmx2: LOAD_MASK r2d, r3d movq m5, m1 movq m6, m2 diff --git a/libavcodec/x86/h264_deblock_10bit.asm b/libavcodec/x86/h264_deblock_10bit.asm index ae385e0224..ba2f91490e 100644 --- a/libavcodec/x86/h264_deblock_10bit.asm +++ b/libavcodec/x86/h264_deblock_10bit.asm @@ -151,11 +151,11 @@ cextern pw_4 %endif %endmacro -%macro DEBLOCK_LUMA 1 +%macro DEBLOCK_LUMA 0 ;----------------------------------------------------------------------------- ; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal deblock_v_luma_10_%1, 5,5,8*(mmsize/16) +cglobal deblock_v_luma_10, 5,5,8*(mmsize/16) %assign pad 5*mmsize+12-(stack_offset&15) %define tcm [rsp] %define ms1 [rsp+mmsize] @@ -210,7 +210,7 @@ cglobal deblock_v_luma_10_%1, 5,5,8*(mmsize/16) ADD rsp, pad RET -cglobal deblock_h_luma_10_%1, 5,6,8*(mmsize/16) +cglobal deblock_h_luma_10, 5,6,8*(mmsize/16) %assign pad 7*mmsize+12-(stack_offset&15) %define tcm [rsp] %define ms1 [rsp+mmsize] @@ -301,7 +301,6 @@ cglobal deblock_h_luma_10_%1, 5,6,8*(mmsize/16) RET %endmacro -INIT_XMM %if ARCH_X86_64 ; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2 ; m12=alpha, m13=beta @@ -339,8 +338,8 @@ INIT_XMM SWAP 3, 9 %endmacro -%macro DEBLOCK_LUMA_64 1 -cglobal deblock_v_luma_10_%1, 5,5,15 +%macro DEBLOCK_LUMA_64 0 +cglobal deblock_v_luma_10, 5,5,15 %define p2 m8 %define p1 m0 %define p0 m1 @@ -377,7 +376,7 @@ cglobal deblock_v_luma_10_%1, 5,5,15 jg .loop REP_RET -cglobal deblock_h_luma_10_%1, 5,7,15 +cglobal deblock_h_luma_10, 5,7,15 shl r2d, 2 shl r3d, 2 LOAD_AB m12, m13, r2, r3 @@ -417,10 +416,10 @@ cglobal deblock_h_luma_10_%1, 5,7,15 REP_RET %endmacro -INIT_XMM -DEBLOCK_LUMA_64 sse2 -INIT_AVX -DEBLOCK_LUMA_64 avx +INIT_XMM sse2 +DEBLOCK_LUMA_64 +INIT_XMM avx +DEBLOCK_LUMA_64 %endif %macro SWAPMOVA 2 @@ -602,8 +601,8 @@ DEBLOCK_LUMA_64 avx ;----------------------------------------------------------------------------- ; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -%macro DEBLOCK_LUMA_INTRA_64 1 -cglobal deblock_v_luma_intra_10_%1, 4,7,16 +%macro DEBLOCK_LUMA_INTRA_64 0 +cglobal deblock_v_luma_intra_10, 4,7,16 %define t0 m1 %define t1 m2 %define t2 m4 @@ -653,7 +652,7 @@ cglobal deblock_v_luma_intra_10_%1, 4,7,16 ;----------------------------------------------------------------------------- ; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_h_luma_intra_10_%1, 4,7,16 +cglobal deblock_h_luma_intra_10, 4,7,16 %define t0 m15 %define t1 m14 %define t2 m2 @@ -712,18 +711,18 @@ cglobal deblock_h_luma_intra_10_%1, 4,7,16 RET %endmacro -INIT_XMM -DEBLOCK_LUMA_INTRA_64 sse2 -INIT_AVX -DEBLOCK_LUMA_INTRA_64 avx +INIT_XMM sse2 +DEBLOCK_LUMA_INTRA_64 +INIT_XMM avx +DEBLOCK_LUMA_INTRA_64 %endif -%macro DEBLOCK_LUMA_INTRA 1 +%macro DEBLOCK_LUMA_INTRA 0 ;----------------------------------------------------------------------------- ; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_v_luma_intra_10_%1, 4,7,8*(mmsize/16) +cglobal deblock_v_luma_intra_10, 4,7,8*(mmsize/16) LUMA_INTRA_INIT 3 lea r4, [r1*4] lea r5, [r1*3] @@ -751,7 +750,7 @@ cglobal deblock_v_luma_intra_10_%1, 4,7,8*(mmsize/16) ;----------------------------------------------------------------------------- ; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_h_luma_intra_10_%1, 4,7,8*(mmsize/16) +cglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16) LUMA_INTRA_INIT 8 %if mmsize == 8 lea r4, [r1*3] @@ -793,15 +792,15 @@ cglobal deblock_h_luma_intra_10_%1, 4,7,8*(mmsize/16) %endmacro %if ARCH_X86_64 == 0 -INIT_MMX -DEBLOCK_LUMA mmxext -DEBLOCK_LUMA_INTRA mmxext -INIT_XMM -DEBLOCK_LUMA sse2 -DEBLOCK_LUMA_INTRA sse2 -INIT_AVX -DEBLOCK_LUMA avx -DEBLOCK_LUMA_INTRA avx +INIT_MMX mmx2 +DEBLOCK_LUMA +DEBLOCK_LUMA_INTRA +INIT_XMM sse2 +DEBLOCK_LUMA +DEBLOCK_LUMA_INTRA +INIT_XMM avx +DEBLOCK_LUMA +DEBLOCK_LUMA_INTRA %endif ; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp @@ -843,11 +842,11 @@ DEBLOCK_LUMA_INTRA avx psraw %1, 6 %endmacro -%macro DEBLOCK_CHROMA 1 +%macro DEBLOCK_CHROMA 0 ;----------------------------------------------------------------------------- ; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal deblock_v_chroma_10_%1, 5,7-(mmsize/16),8*(mmsize/16) +cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16) mov r5, r0 sub r0, r1 sub r0, r1 @@ -881,7 +880,7 @@ cglobal deblock_v_chroma_10_%1, 5,7-(mmsize/16),8*(mmsize/16) ;----------------------------------------------------------------------------- ; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal deblock_v_chroma_intra_10_%1, 4,6-(mmsize/16),8*(mmsize/16) +cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16) mov r4, r0 sub r0, r1 sub r0, r1 @@ -908,10 +907,10 @@ cglobal deblock_v_chroma_intra_10_%1, 4,6-(mmsize/16),8*(mmsize/16) %endmacro %if ARCH_X86_64 == 0 -INIT_MMX -DEBLOCK_CHROMA mmxext +INIT_MMX mmx2 +DEBLOCK_CHROMA %endif -INIT_XMM -DEBLOCK_CHROMA sse2 -INIT_AVX -DEBLOCK_CHROMA avx +INIT_XMM sse2 +DEBLOCK_CHROMA +INIT_XMM avx +DEBLOCK_CHROMA diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c index c0a40c42d7..3f18f64f4b 100644 --- a/libavcodec/x86/h264dsp_mmx.c +++ b/libavcodec/x86/h264dsp_mmx.c @@ -249,12 +249,12 @@ void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *pix, in int alpha, int beta); #define LF_FUNCS(type, depth)\ -LF_FUNC (h, chroma, depth, mmxext)\ -LF_IFUNC(h, chroma_intra, depth, mmxext)\ -LF_FUNC (v, chroma, depth, mmxext)\ -LF_IFUNC(v, chroma_intra, depth, mmxext)\ -LF_FUNC (h, luma, depth, mmxext)\ -LF_IFUNC(h, luma_intra, depth, mmxext)\ +LF_FUNC (h, chroma, depth, mmx2)\ +LF_IFUNC(h, chroma_intra, depth, mmx2)\ +LF_FUNC (v, chroma, depth, mmx2)\ +LF_IFUNC(v, chroma_intra, depth, mmx2)\ +LF_FUNC (h, luma, depth, mmx2)\ +LF_IFUNC(h, luma_intra, depth, mmx2)\ LF_FUNC (h, luma, depth, sse2)\ LF_IFUNC(h, luma_intra, depth, sse2)\ LF_FUNC (v, luma, depth, sse2)\ @@ -276,24 +276,24 @@ LF_FUNCS( uint8_t, 8) LF_FUNCS(uint16_t, 10) #if ARCH_X86_32 -LF_FUNC (v8, luma, 8, mmxext) -static void ff_deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) +LF_FUNC (v8, luma, 8, mmx2) +static void ff_deblock_v_luma_8_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { if((tc0[0] & tc0[1]) >= 0) - ff_deblock_v8_luma_8_mmxext(pix+0, stride, alpha, beta, tc0); + ff_deblock_v8_luma_8_mmx2(pix+0, stride, alpha, beta, tc0); if((tc0[2] & tc0[3]) >= 0) - ff_deblock_v8_luma_8_mmxext(pix+8, stride, alpha, beta, tc0+2); + ff_deblock_v8_luma_8_mmx2(pix+8, stride, alpha, beta, tc0+2); } -LF_IFUNC(v8, luma_intra, 8, mmxext) -static void ff_deblock_v_luma_intra_8_mmxext(uint8_t *pix, int stride, int alpha, int beta) +LF_IFUNC(v8, luma_intra, 8, mmx2) +static void ff_deblock_v_luma_intra_8_mmx2(uint8_t *pix, int stride, int alpha, int beta) { - ff_deblock_v8_luma_intra_8_mmxext(pix+0, stride, alpha, beta); - ff_deblock_v8_luma_intra_8_mmxext(pix+8, stride, alpha, beta); + ff_deblock_v8_luma_intra_8_mmx2(pix+0, stride, alpha, beta); + ff_deblock_v8_luma_intra_8_mmx2(pix+8, stride, alpha, beta); } #endif /* ARCH_X86_32 */ -LF_FUNC (v, luma, 10, mmxext) -LF_IFUNC(v, luma_intra, 10, mmxext) +LF_FUNC (v, luma, 10, mmx2) +LF_IFUNC(v, luma_intra, 10, mmx2) /***********************************/ /* weighted prediction */ @@ -373,17 +373,17 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom c->h264_idct_add8 = ff_h264_idct_add8_8_mmx2; c->h264_idct_add16intra= ff_h264_idct_add16intra_8_mmx2; - c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_8_mmxext; - c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_8_mmxext; + c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_8_mmx2; + c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_8_mmx2; if (chroma_format_idc == 1) { - c->h264_h_loop_filter_chroma= ff_deblock_h_chroma_8_mmxext; - c->h264_h_loop_filter_chroma_intra= ff_deblock_h_chroma_intra_8_mmxext; + c->h264_h_loop_filter_chroma= ff_deblock_h_chroma_8_mmx2; + c->h264_h_loop_filter_chroma_intra= ff_deblock_h_chroma_intra_8_mmx2; } #if ARCH_X86_32 - c->h264_v_loop_filter_luma= ff_deblock_v_luma_8_mmxext; - c->h264_h_loop_filter_luma= ff_deblock_h_luma_8_mmxext; - c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext; - c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext; + c->h264_v_loop_filter_luma= ff_deblock_v_luma_8_mmx2; + c->h264_h_loop_filter_luma= ff_deblock_h_luma_8_mmx2; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmx2; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmx2; #endif c->weight_h264_pixels_tab[0]= ff_h264_weight_16_mmx2; c->weight_h264_pixels_tab[1]= ff_h264_weight_8_mmx2; @@ -436,12 +436,12 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom if (mm_flags & AV_CPU_FLAG_MMX) { if (mm_flags & AV_CPU_FLAG_MMX2) { #if ARCH_X86_32 - c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_mmxext; - c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_mmxext; - c->h264_v_loop_filter_luma= ff_deblock_v_luma_10_mmxext; - c->h264_h_loop_filter_luma= ff_deblock_h_luma_10_mmxext; - c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext; - c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext; + c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_mmx2; + c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_mmx2; + c->h264_v_loop_filter_luma= ff_deblock_v_luma_10_mmx2; + c->h264_h_loop_filter_luma= ff_deblock_h_luma_10_mmx2; + c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmx2; + c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmx2; #endif c->h264_idct_dc_add= ff_h264_idct_dc_add_10_mmx2; if (mm_flags&AV_CPU_FLAG_SSE2) { -- cgit v1.2.3 From 4d777eedfd339e431d73a3787cc9587775f1ba9c Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Thu, 26 Jul 2012 22:16:37 -0700 Subject: vp3: don't compile mmx IDCT functions on x86-64. 64-bit CPUs always have SSE2, and a SSE2 version exists, thus the MMX version will never be used. --- libavcodec/x86/vp3dsp.asm | 3 +++ libavcodec/x86/vp3dsp_init.c | 2 ++ 2 files changed, 5 insertions(+) (limited to 'libavcodec/x86') diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm index 5877520c6c..7a88892c11 100644 --- a/libavcodec/x86/vp3dsp.asm +++ b/libavcodec/x86/vp3dsp.asm @@ -607,8 +607,11 @@ cglobal vp3_idct_add, 3, 4, 9 RET %endmacro +%if ARCH_X86_32 INIT_MMX mmx vp3_idct_funcs +%endif + INIT_XMM sse2 vp3_idct_funcs diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c index cd8e206a2c..704d4a6927 100644 --- a/libavcodec/x86/vp3dsp_init.c +++ b/libavcodec/x86/vp3dsp_init.c @@ -41,11 +41,13 @@ av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags) #if HAVE_YASM int cpuflags = av_get_cpu_flags(); +#if ARCH_X86_32 if (HAVE_MMX && cpuflags & AV_CPU_FLAG_MMX) { c->idct_put = ff_vp3_idct_put_mmx; c->idct_add = ff_vp3_idct_add_mmx; c->idct_perm = FF_PARTTRANS_IDCT_PERM; } +#endif if (HAVE_MMX2 && cpuflags & AV_CPU_FLAG_MMX2) { c->idct_dc_add = ff_vp3_idct_dc_add_mmx2; -- cgit v1.2.3 From b3c5ae5607275f691289df737edaf47c72e6028c Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Sat, 28 Jul 2012 08:20:19 -0700 Subject: fft: rename "z" to "zc" to prevent name collision. Without this, cglobal will expand "z" to "zh" to access the high byte in a register's word, which causes a name collision with the ZH(x) macro further up in this file. --- libavcodec/x86/fft_mmx.asm | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'libavcodec/x86') diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm index 527e215522..5c6583b3b7 100644 --- a/libavcodec/x86/fft_mmx.asm +++ b/libavcodec/x86/fft_mmx.asm @@ -516,23 +516,23 @@ INIT_MMX 3dnow FFT48_3DN -%define Z(x) [zq + o1q*(x&6) + mmsize*(x&1)] -%define Z2(x) [zq + o3q + mmsize*(x&1)] -%define ZH(x) [zq + o1q*(x&6) + mmsize*(x&1) + mmsize/2] -%define Z2H(x) [zq + o3q + mmsize*(x&1) + mmsize/2] +%define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)] +%define Z2(x) [zcq + o3q + mmsize*(x&1)] +%define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2] +%define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2] %macro DECL_PASS 2+ ; name, payload align 16 %1: -DEFINE_ARGS z, w, n, o1, o3 +DEFINE_ARGS zc, w, n, o1, o3 lea o3q, [nq*3] lea o1q, [nq*8] shl o3q, 4 .loop: %2 - add zq, mmsize*2 - add wq, mmsize - sub nd, mmsize/8 + add zcq, mmsize*2 + add wq, mmsize + sub nd, mmsize/8 jg .loop rep ret %endmacro @@ -747,7 +747,7 @@ section .text ; On x86_32, this function does the register saving and restoring for all of fft. ; The others pass args in registers and don't spill anything. -cglobal fft_dispatch%2, 2,5,8, z, nbits +cglobal fft_dispatch%2, 2,5,8, zc, nbits FFT_DISPATCH fullsuffix, nbits RET %endmacro ; DECL_FFT -- cgit v1.2.3 From c83f44dba11930744e167856b48fbc24a8ff0e63 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Sat, 28 Jul 2012 08:01:12 -0700 Subject: h264_idct_10bit: port x86 assembly to cpuflags. --- libavcodec/x86/h264_idct_10bit.asm | 254 ++++++++++++++++++------------------- 1 file changed, 127 insertions(+), 127 deletions(-) (limited to 'libavcodec/x86') diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm index 934a7ff633..2aab9864d6 100644 --- a/libavcodec/x86/h264_idct_10bit.asm +++ b/libavcodec/x86/h264_idct_10bit.asm @@ -72,25 +72,25 @@ SECTION .text STORE_DIFFx2 m2, m3, m4, m5, %1, %3 %endmacro -%macro IDCT_ADD_10 1 -cglobal h264_idct_add_10_%1, 3,3 +%macro IDCT_ADD_10 0 +cglobal h264_idct_add_10, 3,3 IDCT4_ADD_10 r0, r1, r2 RET %endmacro -INIT_XMM -IDCT_ADD_10 sse2 +INIT_XMM sse2 +IDCT_ADD_10 %if HAVE_AVX -INIT_AVX -IDCT_ADD_10 avx +INIT_XMM avx +IDCT_ADD_10 %endif ;----------------------------------------------------------------------------- ; h264_idct_add16(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) ;----------------------------------------------------------------------------- ;;;;;;; NO FATE SAMPLES TRIGGER THIS -%macro ADD4x4IDCT 1 -add4x4_idct_%1: +%macro ADD4x4IDCT 0 +add4x4_idct %+ SUFFIX: add r5, r0 mova m0, [r2+ 0] mova m1, [r2+16] @@ -107,52 +107,52 @@ add4x4_idct_%1: ret %endmacro -INIT_XMM +INIT_XMM sse2 ALIGN 16 -ADD4x4IDCT sse2 +ADD4x4IDCT %if HAVE_AVX -INIT_AVX +INIT_XMM avx ALIGN 16 -ADD4x4IDCT avx +ADD4x4IDCT %endif -%macro ADD16_OP 3 - cmp byte [r4+%3], 0 - jz .skipblock%2 - mov r5d, [r1+%2*4] - call add4x4_idct_%1 -.skipblock%2: -%if %2<15 +%macro ADD16_OP 2 + cmp byte [r4+%2], 0 + jz .skipblock%1 + mov r5d, [r1+%1*4] + call add4x4_idct %+ SUFFIX +.skipblock%1: +%if %1<15 add r2, 64 %endif %endmacro -%macro IDCT_ADD16_10 1 -cglobal h264_idct_add16_10_%1, 5,6 - ADD16_OP %1, 0, 4+1*8 - ADD16_OP %1, 1, 5+1*8 - ADD16_OP %1, 2, 4+2*8 - ADD16_OP %1, 3, 5+2*8 - ADD16_OP %1, 4, 6+1*8 - ADD16_OP %1, 5, 7+1*8 - ADD16_OP %1, 6, 6+2*8 - ADD16_OP %1, 7, 7+2*8 - ADD16_OP %1, 8, 4+3*8 - ADD16_OP %1, 9, 5+3*8 - ADD16_OP %1, 10, 4+4*8 - ADD16_OP %1, 11, 5+4*8 - ADD16_OP %1, 12, 6+3*8 - ADD16_OP %1, 13, 7+3*8 - ADD16_OP %1, 14, 6+4*8 - ADD16_OP %1, 15, 7+4*8 +%macro IDCT_ADD16_10 0 +cglobal h264_idct_add16_10, 5,6 + ADD16_OP 0, 4+1*8 + ADD16_OP 1, 5+1*8 + ADD16_OP 2, 4+2*8 + ADD16_OP 3, 5+2*8 + ADD16_OP 4, 6+1*8 + ADD16_OP 5, 7+1*8 + ADD16_OP 6, 6+2*8 + ADD16_OP 7, 7+2*8 + ADD16_OP 8, 4+3*8 + ADD16_OP 9, 5+3*8 + ADD16_OP 10, 4+4*8 + ADD16_OP 11, 5+4*8 + ADD16_OP 12, 6+3*8 + ADD16_OP 13, 7+3*8 + ADD16_OP 14, 6+4*8 + ADD16_OP 15, 7+4*8 REP_RET %endmacro -INIT_XMM -IDCT_ADD16_10 sse2 +INIT_XMM sse2 +IDCT_ADD16_10 %if HAVE_AVX -INIT_AVX -IDCT_ADD16_10 avx +INIT_XMM avx +IDCT_ADD16_10 %endif ;----------------------------------------------------------------------------- @@ -185,8 +185,8 @@ IDCT_ADD16_10 avx mova [%1+%3 ], m4 %endmacro -INIT_MMX -cglobal h264_idct_dc_add_10_mmx2,3,3 +INIT_MMX mmx2 +cglobal h264_idct_dc_add_10,3,3 movd m0, [r1] paddd m0, [pd_32] psrad m0, 6 @@ -199,8 +199,8 @@ cglobal h264_idct_dc_add_10_mmx2,3,3 ;----------------------------------------------------------------------------- ; void h264_idct8_dc_add(pixel *dst, dctcoef *block, int stride) ;----------------------------------------------------------------------------- -%macro IDCT8_DC_ADD 1 -cglobal h264_idct8_dc_add_10_%1,3,3,7 +%macro IDCT8_DC_ADD 0 +cglobal h264_idct8_dc_add_10,3,3,7 mov r1d, [r1] add r1, 32 sar r1, 6 @@ -214,45 +214,45 @@ cglobal h264_idct8_dc_add_10_%1,3,3,7 RET %endmacro -INIT_XMM -IDCT8_DC_ADD sse2 +INIT_XMM sse2 +IDCT8_DC_ADD %if HAVE_AVX -INIT_AVX -IDCT8_DC_ADD avx +INIT_XMM avx +IDCT8_DC_ADD %endif ;----------------------------------------------------------------------------- ; h264_idct_add16intra(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) ;----------------------------------------------------------------------------- -%macro AC 2 -.ac%2 - mov r5d, [r1+(%2+0)*4] - call add4x4_idct_%1 - mov r5d, [r1+(%2+1)*4] +%macro AC 1 +.ac%1 + mov r5d, [r1+(%1+0)*4] + call add4x4_idct %+ SUFFIX + mov r5d, [r1+(%1+1)*4] add r2, 64 - call add4x4_idct_%1 + call add4x4_idct %+ SUFFIX add r2, 64 - jmp .skipadd%2 + jmp .skipadd%1 %endmacro %assign last_block 16 -%macro ADD16_OP_INTRA 3 - cmp word [r4+%3], 0 - jnz .ac%2 +%macro ADD16_OP_INTRA 2 + cmp word [r4+%2], 0 + jnz .ac%1 mov r5d, [r2+ 0] or r5d, [r2+64] - jz .skipblock%2 - mov r5d, [r1+(%2+0)*4] - call idct_dc_add_%1 -.skipblock%2: -%if %2