From 8ad77b65b548a6b2f4707265ebd7e97f956acf0b Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser
Date: Tue, 10 May 2011 07:08:24 -0700
Subject: Update x86 H.264 deblock asm

Includes AVX versions from x264.
---
 libavcodec/x86/h264_deblock.asm | 395 +++++++++++++++++++++++-----------------
 1 file changed, 227 insertions(+), 168 deletions(-)

diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 01778a45cb..081c0e1aef 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -1,10 +1,11 @@
 ;*****************************************************************************
-;* MMX/SSE2-optimized H.264 deblocking code
+;* MMX/SSE2/AVX-optimized H.264 deblocking code
 ;*****************************************************************************
-;* Copyright (C) 2005-2008 x264 project
+;* Copyright (C) 2005-2011 x264 project
 ;*
 ;* Authors: Loren Merritt
 ;*          Jason Garrett-Glaser
+;*          Oskar Arvidsson
 ;*
 ;* This file is part of Libav.
 ;*
@@ -26,96 +27,135 @@
 %include "x86inc.asm"
 %include "x86util.asm"

-SECTION_RODATA
+SECTION .text

 cextern pb_0
 cextern pb_1
 cextern pb_3
 cextern pb_A1

-SECTION .text
-
 ; expands to [base],...,[base+7*stride]
 %define PASS8ROWS(base, base3, stride, stride3) \
     [base], [base+stride], [base+stride*2], [base3], \
     [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]

-; in: 8 rows of 4 bytes in %1..%8
+%define PASS8ROWS(base, base3, stride, stride3, offset) \
+    PASS8ROWS(base+offset, base3+offset, stride, stride3)
+
+; in: 8 rows of 4 bytes in %4..%11
 ; out: 4 rows of 8 bytes in m0..m3
-%macro TRANSPOSE4x8_LOAD 8
- movd m0, %1
- movd m2, %2
- movd m1, %3
- movd m3, %4
- punpcklbw m0, m2
- punpcklbw m1, m3
- movq m2, m0
- punpcklwd m0, m1
- punpckhwd m2, m1
-
- movd m4, %5
- movd m6, %6
- movd m5, %7
- movd m7, %8
- punpcklbw m4, m6
- punpcklbw m5, m7
- movq m6, m4
- punpcklwd m4, m5
- punpckhwd m6, m5
-
- movq m1, m0
- movq m3, m2
- punpckldq m0, m4
- punpckhdq m1, m4
- punpckldq m2, m6
- punpckhdq m3, m6
+%macro TRANSPOSE4x8_LOAD 11
+ movh m0, %4
+ movh m2, %5
+ movh m1, %6
+ movh m3, %7
+ punpckl%1 m0, m2
+ punpckl%1 m1, m3
+ mova m2, m0
+ punpckl%2 m0, m1
+ punpckh%2 m2, m1
+
+ movh m4, %8
+ movh m6, %9
+ movh m5, %10
+ movh m7, %11
+ punpckl%1 m4, m6
+ punpckl%1 m5, m7
+ mova m6, m4
+ punpckl%2 m4, m5
+ punpckh%2 m6, m5
+
+ punpckh%3 m1, m0, m4
+ punpckh%3 m3, m2, m6
+ punpckl%3 m0, m4
+ punpckl%3 m2, m6
 %endmacro

 ; in: 4 rows of 8 bytes in m0..m3
 ; out: 8 rows of 4 bytes in %1..%8
-%macro TRANSPOSE8x4_STORE 8
- movq m4, m0
- movq m5, m1
- movq m6, m2
- punpckhdq m4, m4
- punpckhdq m5, m5
- punpckhdq m6, m6
+%macro TRANSPOSE8x4B_STORE 8
+ punpckhdq m4, m0, m0
+ punpckhdq m5, m1, m1
+ punpckhdq m6, m2, m2
 punpcklbw m0, m1
 punpcklbw m2, m3
- movq m1, m0
- punpcklwd m0, m2
- punpckhwd m1, m2
- movd %1, m0
- punpckhdq m0, m0
- movd %2, m0
- movd %3, m1
+ punpcklwd m1, m0, m2
+ punpckhwd m0, m2
+ movh %1, m1
 punpckhdq m1, m1
- movd %4, m1
+ movh %2, m1
+ movh %3, m0
+ punpckhdq m0, m0
+ movh %4, m0
 punpckhdq m3, m3
 punpcklbw m4, m5
 punpcklbw m6, m3
- movq m5, m4
- punpcklwd m4, m6
- punpckhwd m5, m6
- movd %5, m4
- punpckhdq m4, m4
- movd %6, m4
- movd %7, m5
+ punpcklwd m5, m4, m6
+ punpckhwd m4, m6
+ movh %5, m5
 punpckhdq m5, m5
- movd %8, m5
+ movh %6, m5
+ movh %7, m4
+ punpckhdq m4, m4
+ movh %8, m4
+%endmacro
+
+%macro TRANSPOSE4x8B_LOAD 8
+ TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
+%endmacro
+
+%macro TRANSPOSE4x8W_LOAD 8
+%if mmsize==16
+ TRANSPOSE4x8_LOAD wd, dq, qdq, %1, %2, %3, %4, %5, %6, %7, %8
+%else
+ SWAP 1, 4, 2, 3
+ mova m0, [t5]
+ mova m1, [t5+r1]
+ mova m2, [t5+r1*2]
+ mova m3, [t5+t6]
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+%endif
+%endmacro
+
+%macro TRANSPOSE8x2W_STORE 8
+ punpckhwd m0, m1, m2
+ punpcklwd m1, m2
+%if mmsize==8
+ movd %3, m0
+ movd %1, m1
+ psrlq m1, 32
+ psrlq m0, 32
+ movd %2, m1
+ movd %4, m0
+%else
+ movd %5, m0
+ movd %1, m1
+ psrldq m1, 4
+ psrldq m0, 4
+ movd %2, m1
+ movd %6, m0
+ psrldq m1, 4
+ psrldq m0, 4
+ movd %3, m1
+ movd %7, m0
+ psrldq m1, 4
+ psrldq m0, 4
+ movd %4, m1
+ movd %8, m0
+%endif
 %endmacro

 %macro SBUTTERFLY3 4
- movq %4, %2
+ punpckh%1 %4, %2, %3
 punpckl%1 %2, %3
- punpckh%1 %4, %3
 %endmacro

 ; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
 ; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
 %macro TRANSPOSE6x8_MEM 9
+ RESET_MM_PERMUTATION
 movq m0, %1
 movq m1, %2
 movq m2, %3
@@ -123,30 +163,32 @@ SECTION .text
 movq m4, %5
 movq m5, %6
 movq m6, %7
- SBUTTERFLY3 bw, m0, m1, m7
- SBUTTERFLY3 bw, m2, m3, m1
- SBUTTERFLY3 bw, m4, m5, m3
- movq [%9+0x10], m1
- SBUTTERFLY3 bw, m6, %8, m5
- SBUTTERFLY3 wd, m0, m2, m1
- SBUTTERFLY3 wd, m4, m6, m2
+ SBUTTERFLY bw, 0, 1, 7
+ SBUTTERFLY bw, 2, 3, 7
+ SBUTTERFLY bw, 4, 5, 7
+ movq [%9+0x10], m3
+ SBUTTERFLY3 bw, m6, %8, m7
+ SBUTTERFLY wd, 0, 2, 3
+ SBUTTERFLY wd, 4, 6, 3
 punpckhdq m0, m4
 movq [%9+0x00], m0
- SBUTTERFLY3 wd, m7, [%9+0x10], m6
- SBUTTERFLY3 wd, m3, m5, m4
- SBUTTERFLY3 dq, m7, m3, m0
- SBUTTERFLY3 dq, m1, m2, m5
- punpckldq m6, m4
- movq [%9+0x10], m1
- movq [%9+0x20], m5
- movq [%9+0x30], m7
- movq [%9+0x40], m0
- movq [%9+0x50], m6
+ SBUTTERFLY3 wd, m1, [%9+0x10], m3
+ SBUTTERFLY wd, 5, 7, 0
+ SBUTTERFLY dq, 1, 5, 0
+ SBUTTERFLY dq, 2, 6, 0
+ punpckldq m3, m7
+ movq [%9+0x10], m2
+ movq [%9+0x20], m6
+ movq [%9+0x30], m1
+ movq [%9+0x40], m5
+ movq [%9+0x50], m3
+ RESET_MM_PERMUTATION
 %endmacro

 ; in: 8 rows of 8 in %1..%8
 ; out: 8 rows of 8 in %9..%16
 %macro TRANSPOSE8x8_MEM 16
+ RESET_MM_PERMUTATION
 movq m0, %1
 movq m1, %2
 movq m2, %3
@@ -154,38 +196,44 @@ SECTION .text
 movq m4, %5
 movq m5, %6
 movq m6, %7
- SBUTTERFLY3 bw, m0, m1, m7
- SBUTTERFLY3 bw, m2, m3, m1
- SBUTTERFLY3 bw, m4, m5, m3
- SBUTTERFLY3 bw, m6, %8, m5
- movq %9, m3
- SBUTTERFLY3 wd, m0, m2, m3
- SBUTTERFLY3 wd, m4, m6, m2
- SBUTTERFLY3 wd, m7, m1, m6
- movq %11, m2
- movq m2, %9
- SBUTTERFLY3 wd, m2, m5, m1
- SBUTTERFLY3 dq, m0, m4, m5
- SBUTTERFLY3 dq, m7, m2, m4
+ SBUTTERFLY bw, 0, 1, 7
+ SBUTTERFLY bw, 2, 3, 7
+ SBUTTERFLY bw, 4, 5, 7
+ SBUTTERFLY3 bw, m6, %8, m7
+ movq %9, m5
+ SBUTTERFLY wd, 0, 2, 5
+ SBUTTERFLY wd, 4, 6, 5
+ SBUTTERFLY wd, 1, 3, 5
+ movq %11, m6
+ movq m6, %9
+ SBUTTERFLY wd, 6, 7, 5
+ SBUTTERFLY dq, 0, 4, 5
+ SBUTTERFLY dq, 1, 6, 5
 movq %9, m0
- movq %10, m5
- movq %13, m7
- movq %14, m4
- SBUTTERFLY3 dq, m3, %11, m0
- SBUTTERFLY3 dq, m6, m1, m5
- movq %11, m3
+ movq %10, m4
+ movq %13, m1
+ movq %14, m6
+ SBUTTERFLY3 dq, m2, %11, m0
+ SBUTTERFLY dq, 3, 7, 4
+ movq %11, m2
 movq %12, m0
- movq %15, m6
- movq %16, m5
+ movq %15, m3
+ movq %16, m7
+ RESET_MM_PERMUTATION
 %endmacro

 ; out: %4 = |%1-%2|>%3
 ; clobbers: %5
 %macro DIFF_GT 5
+%if avx_enabled == 0
 mova %5, %2
 mova %4, %1
 psubusb %5, %1
 psubusb %4, %2
+%else
+ psubusb %5, %2, %1
+ psubusb %4, %1, %2
+%endif
 por %4, %5
 psubusb %4, %3
 %endmacro
@@ -193,32 +241,28 @@ SECTION .text
 ; out: %4 = |%1-%2|>%3
 ; clobbers: %5
 %macro DIFF_GT2 5
+%ifdef ARCH_X86_64
+ psubusb %5, %2, %1
+ psubusb %4, %1, %2
+%else
 mova %5, %2
 mova %4, %1
 psubusb %5, %1
 psubusb %4, %2
+%endif psubusb %5, %3 psubusb %4, %3 pcmpeqb %4, %5 %endmacro -%macro SPLATW 1 -%ifidn m0, xmm0 - pshuflw %1, %1, 0 - punpcklqdq %1, %1 -%else - pshufw %1, %1, 0 -%endif -%endmacro - ; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1 ; out: m5=beta-1, m7=mask, %3=alpha-1 ; clobbers: m4,m6 %macro LOAD_MASK 2-3 movd m4, %1 movd m5, %2 - SPLATW m4 - SPLATW m5 + SPLATW m4, m4 + SPLATW m5, m5 packuswb m4, m4 ; 16x alpha-1 packuswb m5, m5 ; 16x beta-1 %if %0>2 @@ -237,8 +281,7 @@ SECTION .text ; out: m1=p0' m2=q0' ; clobbers: m0,3-6 %macro DEBLOCK_P0_Q0 0 - mova m5, m1 - pxor m5, m2 ; p0^q0 + pxor m5, m1, m2 ; p0^q0 pand m5, [pb_1] ; (p0^q0)&1 pcmpeqb m4, m4 pxor m3, m4 @@ -264,14 +307,12 @@ SECTION .text ; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) ; clobbers: q2, tmp, tc0 %macro LUMA_Q1 6 - mova %6, m1 - pavgb %6, m2 + pavgb %6, m1, m2 pavgb %2, %6 ; avg(p2,avg(p0,q0)) pxor %6, %3 pand %6, [pb_1] ; (p2^avg(p0,q0))&1 psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 - mova %6, %1 - psubusb %6, %5 + psubusb %6, %1, %5 paddusb %5, %1 pmaxub %2, %6 pminub %2, %5 @@ -280,10 +321,10 @@ SECTION .text %ifdef ARCH_X86_64 ;----------------------------------------------------------------------------- -; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -INIT_XMM -cglobal x264_deblock_v_luma_sse2, 5,5,10 +%macro DEBLOCK_LUMA 1 +cglobal deblock_v_luma_%1, 5,5,10 movd m8, [r4] ; tc0 lea r4, [r1*3] dec r2d ; alpha-1 @@ -307,8 +348,7 @@ cglobal x264_deblock_v_luma_sse2, 5,5,10 movdqa m3, [r4] ; p2 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 pand m6, m9 - mova m7, m8 - psubb m7, m6 + psubb m7, m8, m6 pand m6, m8 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 @@ -326,10 +366,10 @@ cglobal x264_deblock_v_luma_sse2, 5,5,10 RET ;----------------------------------------------------------------------------- -; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- INIT_MMX -cglobal x264_deblock_h_luma_sse2, 5,7 +cglobal deblock_h_luma_%1, 5,7 movsxd r10, r1d lea r11, [r10+r10*2] lea r6, [r0-4] @@ -350,13 +390,13 @@ cglobal x264_deblock_h_luma_sse2, 5,7 ; vertical filter ; alpha, beta, tc0 are still in r2d, r3d, r4 - ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them + ; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them lea r0, [pix_tmp+0x30] mov r1d, 0x10 %ifdef WIN64 mov [rsp+0x20], r4 %endif - call x264_deblock_v_luma_sse2 + call deblock_v_luma_%1 ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) add r6, 2 @@ -365,7 +405,7 @@ cglobal x264_deblock_h_luma_sse2, 5,7 movq m1, [pix_tmp+0x28] movq m2, [pix_tmp+0x38] movq m3, [pix_tmp+0x48] - TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) + TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11) shl r10, 3 sub r6, r10 @@ -375,7 +415,7 @@ cglobal x264_deblock_h_luma_sse2, 5,7 movq m1, [pix_tmp+0x20] movq m2, [pix_tmp+0x30] movq m3, [pix_tmp+0x40] - TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11) + TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11) %ifdef WIN64 add rsp, 0x98 @@ -383,14 +423,20 @@ cglobal x264_deblock_h_luma_sse2, 5,7 add rsp, 0x68 %endif RET +%endmacro + +INIT_XMM 
+DEBLOCK_LUMA sse2 +INIT_AVX +DEBLOCK_LUMA avx %else %macro DEBLOCK_LUMA 3 ;----------------------------------------------------------------------------- -; void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_%2_luma_%1, 5,5 +cglobal deblock_%2_luma_%1, 5,5 lea r4, [r1*3] dec r2 ; alpha-1 neg r4 @@ -419,8 +465,7 @@ cglobal x264_deblock_%2_luma_%1, 5,5 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 pand m6, m4 pand m4, [esp+%3] ; tc - mova m7, m4 - psubb m7, m6 + psubb m7, m4, m6 pand m6, m4 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 @@ -441,10 +486,10 @@ cglobal x264_deblock_%2_luma_%1, 5,5 RET ;----------------------------------------------------------------------------- -; void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- INIT_MMX -cglobal x264_deblock_h_luma_%1, 0,5 +cglobal deblock_h_luma_%1, 0,5 mov r0, r0mp mov r3, r1m lea r4, [r3*3] @@ -467,11 +512,11 @@ cglobal x264_deblock_h_luma_%1, 0,5 PUSH dword r2m PUSH dword 16 PUSH dword r0 - call x264_deblock_%2_luma_%1 + call deblock_%2_luma_%1 %ifidn %2, v8 add dword [esp ], 8 ; pix_tmp+0x38 add dword [esp+16], 2 ; tc0+2 - call x264_deblock_%2_luma_%1 + call deblock_%2_luma_%1 %endif ADD esp, 20 @@ -484,7 +529,7 @@ cglobal x264_deblock_h_luma_%1, 0,5 movq m1, [pix_tmp+0x20] movq m2, [pix_tmp+0x30] movq m3, [pix_tmp+0x40] - TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) + TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) lea r0, [r0+r3*8] lea r1, [r1+r3*8] @@ -492,7 +537,7 @@ cglobal x264_deblock_h_luma_%1, 0,5 movq m1, [pix_tmp+0x28] movq m2, [pix_tmp+0x38] movq m3, [pix_tmp+0x48] - TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4) + TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) ADD esp, pad RET @@ -502,22 +547,34 @@ INIT_MMX DEBLOCK_LUMA mmxext, v8, 8 INIT_XMM DEBLOCK_LUMA sse2, v, 16 +INIT_AVX +DEBLOCK_LUMA avx, v, 16 %endif ; ARCH %macro LUMA_INTRA_P012 4 ; p0..p3 in memory +%ifdef ARCH_X86_64 + pavgb t0, p2, p1 + pavgb t1, p0, q0 +%else mova t0, p2 mova t1, p0 pavgb t0, p1 pavgb t1, q0 +%endif pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2 mova t5, t1 +%ifdef ARCH_X86_64 + paddb t2, p2, p1 + paddb t3, p0, q0 +%else mova t2, p2 mova t3, p0 paddb t2, p1 paddb t3, q0 +%endif paddb t2, t3 mova t3, t2 mova t4, t2 @@ -527,10 +584,15 @@ DEBLOCK_LUMA sse2, v, 16 pand t2, mpb_1 psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4; +%ifdef ARCH_X86_64 + pavgb t1, p2, q1 + psubb t2, p2, q1 +%else mova t1, p2 mova t2, p2 pavgb t1, q1 psubb t2, q1 +%endif paddb t3, t3 psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1 pand t2, mpb_1 @@ -543,10 +605,8 @@ DEBLOCK_LUMA sse2, v, 16 pand t3, mpb_1 psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8 - mova t3, p0 - mova t2, p0 - pxor t3, q1 - pavgb t2, q1 + pxor t3, p0, q1 + pavgb t2, p0, q1 pand t3, mpb_1 psubb t2, t3 pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4 @@ -560,9 +620,8 @@ DEBLOCK_LUMA sse2, v, 16 mova %1, t1 ; store p0 mova t1, %4 ; p3 - mova t2, t1 + paddb t2, t1, p2 pavgb t1, p2 - paddb t2, p2 pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4 paddb t2, t2 paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0 @@ -624,9 +683,9 @@ DEBLOCK_LUMA sse2, v, 16 %endif 
;----------------------------------------------------------------------------- -; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) +; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_%2_luma_intra_%1, 4,6,16 +cglobal deblock_%2_luma_intra_%1, 4,6,16 %ifndef ARCH_X86_64 sub esp, 0x60 %endif @@ -686,9 +745,9 @@ cglobal x264_deblock_%2_luma_intra_%1, 4,6,16 INIT_MMX %ifdef ARCH_X86_64 ;----------------------------------------------------------------------------- -; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta ) +; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_h_luma_intra_%1, 4,7 +cglobal deblock_h_luma_intra_%1, 4,7 movsxd r10, r1d lea r11, [r10*3] lea r6, [r0-4] @@ -704,7 +763,7 @@ cglobal x264_deblock_h_luma_intra_%1, 4,7 lea r0, [pix_tmp+0x40] mov r1, 0x10 - call x264_deblock_v_luma_intra_%1 + call deblock_v_luma_intra_%1 ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) lea r5, [r6+r11] @@ -717,7 +776,7 @@ cglobal x264_deblock_h_luma_intra_%1, 4,7 add rsp, 0x88 RET %else -cglobal x264_deblock_h_luma_intra_%1, 2,4 +cglobal deblock_h_luma_intra_%1, 2,4 lea r3, [r1*3] sub r0, 4 lea r2, [r0+r3] @@ -736,10 +795,10 @@ cglobal x264_deblock_h_luma_intra_%1, 2,4 PUSH dword r2m PUSH dword 16 PUSH r0 - call x264_deblock_%2_luma_intra_%1 + call deblock_%2_luma_intra_%1 %ifidn %2, v8 add dword [rsp], 8 ; pix_tmp+8 - call x264_deblock_%2_luma_intra_%1 + call deblock_%2_luma_intra_%1 %endif ADD esp, 16 @@ -760,13 +819,13 @@ cglobal x264_deblock_h_luma_intra_%1, 2,4 INIT_XMM DEBLOCK_LUMA_INTRA sse2, v +INIT_AVX +DEBLOCK_LUMA_INTRA avx , v %ifndef ARCH_X86_64 INIT_MMX DEBLOCK_LUMA_INTRA mmxext, v8 %endif - - INIT_MMX %macro CHROMA_V_START 0 @@ -790,23 +849,23 @@ INIT_MMX %define t6 r6 ;----------------------------------------------------------------------------- -; void x264_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void ff_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_v_chroma_mmxext, 5,6 +cglobal deblock_v_chroma_mmxext, 5,6 CHROMA_V_START movq m0, [t5] movq m1, [t5+r1] movq m2, [r0] movq m3, [r0+r1] - call x264_chroma_inter_body_mmxext + call ff_chroma_inter_body_mmxext movq [t5+r1], m1 movq [r0], m2 RET ;----------------------------------------------------------------------------- -; void x264_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void ff_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal x264_deblock_h_chroma_mmxext, 5,7 +cglobal deblock_h_chroma_mmxext, 5,7 %ifdef ARCH_X86_64 %define buf0 [rsp-24] %define buf1 [rsp-16] @@ -815,17 +874,17 @@ cglobal x264_deblock_h_chroma_mmxext, 5,7 %define buf1 r2m %endif CHROMA_H_START - TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6) + TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6) movq buf0, m0 movq buf1, m3 - call x264_chroma_inter_body_mmxext + call ff_chroma_inter_body_mmxext movq m0, buf0 movq m3, buf1 - TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6) + 
TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
 RET

 ALIGN 16
-x264_chroma_inter_body_mmxext:
+ff_chroma_inter_body_mmxext:
 LOAD_MASK r2d, r3d
 movd m6, [r4] ; tc0
 punpcklbw m6, m6
@@ -850,31 +909,31 @@ x264_chroma_inter_body_mmxext:
 %define t6 r5

 ;-----------------------------------------------------------------------------
-; void x264_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
+; void ff_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal x264_deblock_v_chroma_intra_mmxext, 4,5
+cglobal deblock_v_chroma_intra_mmxext, 4,5
 CHROMA_V_START
 movq m0, [t5]
 movq m1, [t5+r1]
 movq m2, [r0]
 movq m3, [r0+r1]
- call x264_chroma_intra_body_mmxext
+ call ff_chroma_intra_body_mmxext
 movq [t5+r1], m1
 movq [r0], m2
 RET

 ;-----------------------------------------------------------------------------
-; void x264_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
+; void ff_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal x264_deblock_h_chroma_intra_mmxext, 4,6
+cglobal deblock_h_chroma_intra_mmxext, 4,6
 CHROMA_H_START
- TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6)
- call x264_chroma_intra_body_mmxext
- TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6)
+ TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
+ call ff_chroma_intra_body_mmxext
+ TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
 RET

 ALIGN 16
-x264_chroma_intra_body_mmxext:
+ff_chroma_intra_body_mmxext:
 LOAD_MASK r2d, r3d
 movq m5, m1
 movq m6, m2
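
For reference, the per-pixel arithmetic that LOAD_MASK and DEBLOCK_P0_Q0 implement is easier to follow in scalar form. The C model below is illustrative only (it is not part of the patch, and the function and variable names are invented): it covers the bS<4 p0/q0 update and omits the tc widening and p1/q1 updates that DIFF_GT2 and LUMA_Q1 perform when |p2-p0| or |q2-q0| is within beta. DIFF_GT's psubusb/psubusb/por sequence is a branchless unsigned |a-b|, and a further saturating subtract against alpha-1 (or beta-1) is nonzero exactly when |a-b| >= alpha (or beta), which is why the function prologues decrement alpha and beta.

#include <stdint.h>
#include <stdio.h>

static int clip(int x, int lo, int hi)
{
    return x < lo ? lo : x > hi ? hi : x;
}

/* DIFF_GT per byte: satsub(a,b) | satsub(b,a) == |a-b| for unsigned
 * bytes, because one of the two saturating subtractions is zero. */
static int abs_diff_u8(int a, int b)
{
    int d0 = a > b ? a - b : 0; /* psubusb %4, %1, %2 */
    int d1 = b > a ? b - a : 0; /* psubusb %5, %2, %1 */
    return d0 | d1;             /* por     %4, %5     */
}

/* One pixel pair across the block edge: pix[-2..-1] = p1,p0 and
 * pix[0..1] = q0,q1. tc0 comes from the tc0 table; the real filter
 * widens the clip bound when the p2/q2 closeness tests pass. */
static void filter_edge_pixel(uint8_t *pix, int alpha, int beta, int tc0)
{
    int p1 = pix[-2], p0 = pix[-1], q0 = pix[0], q1 = pix[1];

    /* LOAD_MASK: filter only edges that look like blocking artifacts,
     * not real image detail. */
    if (abs_diff_u8(p0, q0) >= alpha ||
        abs_diff_u8(p1, p0) >= beta ||
        abs_diff_u8(q1, q0) >= beta)
        return;

    /* DEBLOCK_P0_Q0: the textbook form of the correction that the
     * pavgb/pxor tricks compute without widening past 8 bits. */
    int delta = clip(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc0, tc0);
    pix[-1] = clip(p0 + delta, 0, 255);
    pix[0]  = clip(q0 - delta, 0, 255);
}

int main(void)
{
    /* Two flat regions separated by a step of 6: a typical blocking
     * artifact. With alpha=15, beta=6, tc0=2 the step is softened. */
    uint8_t row[8] = { 60, 60, 60, 60, 66, 66, 66, 66 };
    filter_edge_pixel(&row[4], 15, 6, 2);
    printf("p0=%d q0=%d\n", row[3], row[4]); /* prints p0=62 q0=64 */
    return 0;
}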