From c90b94424cd4953a095d6d6648ba8d499e306b35 Mon Sep 17 00:00:00 2001 From: Jason Garrett-Glaser Date: Fri, 3 Jun 2011 01:12:28 -0700 Subject: 4:4:4 H.264 decoding support Note: this is 4:4:4 from the 2007 spec revision, not the previous (now deprecated) 4:4:4 mode in H.264. --- libavcodec/x86/dsputil_mmx.c | 4 ++-- libavcodec/x86/h264_i386.h | 15 +++++++------ libavcodec/x86/h264_idct.asm | 44 +++++++++++++++++++++++--------------- libavcodec/x86/h264_idct_10bit.asm | 35 +++++++++++++++++------------- 4 files changed, 58 insertions(+), 40 deletions(-) (limited to 'libavcodec/x86') diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 1cc6991666..214c6a3945 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -784,7 +784,7 @@ static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){ /* draw the edges of width 'w' of an image of size width, height this mmx version can only handle w==8 || w==16 */ -static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, int sides) +static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, int h, int sides) { uint8_t *ptr, *last_line; int i; @@ -839,7 +839,7 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, /* top and bottom (and hopefully also the corners) */ if (sides&EDGE_TOP) { - for(i = 0; i < w; i += 4) { + for(i = 0; i < h; i += 4) { ptr= buf - (i + 1) * wrap - w; __asm__ volatile( "1: \n\t" diff --git a/libavcodec/x86/h264_i386.h b/libavcodec/x86/h264_i386.h index c850dc2ef3..e2dffe1e46 100644 --- a/libavcodec/x86/h264_i386.h +++ b/libavcodec/x86/h264_i386.h @@ -36,7 +36,7 @@ #if ARCH_X86 && HAVE_7REGS && HAVE_EBX_AVAILABLE && !defined(BROKEN_RELOCATIONS) static int decode_significance_x86(CABACContext *c, int max_coeff, uint8_t *significant_coeff_ctx_base, - int *index){ + int *index, x86_reg last_off){ void *end= significant_coeff_ctx_base + max_coeff - 1; int minusstart= -(int)significant_coeff_ctx_base; int minusindex= 4-(int)index; @@ -52,10 +52,12 @@ static int decode_significance_x86(CABACContext *c, int max_coeff, "test $1, %%edx \n\t" " jz 3f \n\t" + "add %7, %1 \n\t" - BRANCHLESS_GET_CABAC("%%edx", "%3", "61(%1)", "%%ebx", + BRANCHLESS_GET_CABAC("%%edx", "%3", "(%1)", "%%ebx", "%%bx", "%%esi", "%%eax", "%%al") + "sub %7, %1 \n\t" "mov %2, %%"REG_a" \n\t" "movl %4, %%ecx \n\t" "add %1, %%"REG_c" \n\t" @@ -82,7 +84,7 @@ static int decode_significance_x86(CABACContext *c, int max_coeff, "movl %%esi, "RANGE "(%3) \n\t" "movl %%ebx, "LOW "(%3) \n\t" :"=&a"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index) - :"r"(c), "m"(minusstart), "m"(end), "m"(minusindex) + :"r"(c), "m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off) : "%"REG_c, "%ebx", "%edx", "%esi", "memory" ); return coeff_count; @@ -90,7 +92,7 @@ static int decode_significance_x86(CABACContext *c, int max_coeff, static int decode_significance_8x8_x86(CABACContext *c, uint8_t *significant_coeff_ctx_base, - int *index, const uint8_t *sig_off){ + int *index, x86_reg last_off, const uint8_t *sig_off){ int minusindex= 4-(int)index; int coeff_count; x86_reg last=0; @@ -114,8 +116,9 @@ static int decode_significance_8x8_x86(CABACContext *c, "movzbl "MANGLE(last_coeff_flag_offset_8x8)"(%%edi), %%edi\n\t" "add %5, %%"REG_D" \n\t" + "add %7, %%"REG_D" \n\t" - BRANCHLESS_GET_CABAC("%%edx", "%3", "15(%%"REG_D")", "%%ebx", + BRANCHLESS_GET_CABAC("%%edx", "%3", "(%%"REG_D")", "%%ebx", "%%bx", "%%esi", "%%eax", "%%al") "mov %2, %%"REG_a" \n\t" @@ -142,7 +145,7 @@ static int decode_significance_8x8_x86(CABACContext *c, "movl %%esi, "RANGE "(%3) \n\t" "movl %%ebx, "LOW "(%3) \n\t" :"=&a"(coeff_count),"+m"(last), "+m"(index) - :"r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base), "m"(sig_off) + :"r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base), "m"(sig_off), "m"(last_off) : "%"REG_c, "%ebx", "%edx", "%esi", "%"REG_D, "memory" ); return coeff_count; diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm index f90f41c4bc..4788da98e0 100644 --- a/libavcodec/x86/h264_idct.asm +++ b/libavcodec/x86/h264_idct.asm @@ -32,14 +32,18 @@ SECTION_RODATA ; FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split -scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8 - db 6+1*8, 7+1*8, 6+2*8, 7+2*8 - db 4+3*8, 5+3*8, 4+4*8, 5+4*8 - db 6+3*8, 7+3*8, 6+4*8, 7+4*8 - db 1+1*8, 2+1*8 - db 1+2*8, 2+2*8 - db 1+4*8, 2+4*8 - db 1+5*8, 2+5*8 +scan8_mem: db 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8 + db 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8 + db 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8 + db 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8 + db 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8 + db 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8 + db 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8 + db 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8 + db 4+11*8, 5+11*8, 4+12*8, 5+12*8 + db 6+11*8, 7+11*8, 6+12*8, 7+12*8 + db 4+13*8, 5+13*8, 4+14*8, 5+14*8 + db 6+13*8, 7+13*8, 6+14*8, 7+14*8 %ifdef PIC %define scan8 r11 %else @@ -617,6 +621,8 @@ cglobal h264_idct_add8_8_mmx, 5, 7, 0 mov r10, r0 %endif call h264_idct_add8_mmx_plane + mov r5, 32 + add r2, 384 %ifdef ARCH_X86_64 add r10, gprsize %else @@ -678,6 +684,8 @@ cglobal h264_idct_add8_8_mmx2, 5, 7, 0 lea r11, [scan8_mem] %endif call h264_idct_add8_mmx2_plane + mov r5, 32 + add r2, 384 %ifdef ARCH_X86_64 add r10, gprsize %else @@ -810,12 +818,12 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7, 8 test r0, r0 jz .try%1dc %ifdef ARCH_X86_64 - mov r0d, dword [r1+%1*8+64] + mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))] add r0, [r10] %else mov r0, r0m mov r0, [r0] - add r0, dword [r1+%1*8+64] + add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))] %endif call x264_add8x4_idct_sse2 jmp .cycle%1end @@ -824,16 +832,18 @@ cglobal h264_idct_add16intra_8_sse2, 5, 7, 8 or r0w, word [r2+32] jz .cycle%1end %ifdef ARCH_X86_64 - mov r0d, dword [r1+%1*8+64] + mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))] add r0, [r10] %else mov r0, r0m mov r0, [r0] - add r0, dword [r1+%1*8+64] + add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))] %endif call h264_idct_dc_add8_mmx2 .cycle%1end -%if %1 < 3 +%if %1 == 1 + add r2, 384+64 +%elif %1 < 3 add r2, 64 %endif %endmacro @@ -845,15 +855,15 @@ cglobal h264_idct_add8_8_sse2, 5, 7, 8 %ifdef ARCH_X86_64 mov r10, r0 %endif - add8_sse2_cycle 0, 0x09 - add8_sse2_cycle 1, 0x11 + add8_sse2_cycle 0, 0x34 + add8_sse2_cycle 1, 0x3c %ifdef ARCH_X86_64 add r10, gprsize %else add r0mp, gprsize %endif - add8_sse2_cycle 2, 0x21 - add8_sse2_cycle 3, 0x29 + add8_sse2_cycle 2, 0x5c + add8_sse2_cycle 3, 0x64 RET ;void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul) diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm index 3f7cf4cefc..54636a95d0 100644 --- a/libavcodec/x86/h264_idct_10bit.asm +++ b/libavcodec/x86/h264_idct_10bit.asm @@ -29,14 +29,18 @@ SECTION_RODATA pw_pixel_max: times 8 dw ((1 << 10)-1) pd_32: times 4 dd 32 -scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8 - db 6+1*8, 7+1*8, 6+2*8, 7+2*8 - db 4+3*8, 5+3*8, 4+4*8, 5+4*8 - db 6+3*8, 7+3*8, 6+4*8, 7+4*8 - db 1+1*8, 2+1*8 - db 1+2*8, 2+2*8 - db 1+4*8, 2+4*8 - db 1+5*8, 2+5*8 +scan8_mem: db 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8 + db 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8 + db 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8 + db 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8 + db 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8 + db 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8 + db 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8 + db 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8 + db 4+11*8, 5+11*8, 4+12*8, 5+12*8 + db 6+11*8, 7+11*8, 6+12*8, 7+12*8 + db 4+13*8, 5+13*8, 4+14*8, 5+14*8 + db 6+13*8, 7+13*8, 6+14*8, 7+14*8 %ifdef PIC %define scan8 r11 @@ -306,7 +310,7 @@ INIT_AVX IDCT_ADD16INTRA_10 avx %endif -%assign last_block 24 +%assign last_block 36 ;----------------------------------------------------------------------------- ; h264_idct_add8(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) ;----------------------------------------------------------------------------- @@ -317,21 +321,22 @@ cglobal h264_idct_add8_10_%1,5,7 %endif add r2, 1024 mov r0, [r0] - ADD16_OP_INTRA %1, 16, 1+1*8 - ADD16_OP_INTRA %1, 18, 1+2*8 + ADD16_OP_INTRA %1, 16, 4+ 6*8 + ADD16_OP_INTRA %1, 18, 4+ 7*8 + add r2, 1024-128*2 %ifdef ARCH_X86_64 mov r0, [r10+gprsize] %else mov r0, r0m mov r0, [r0+gprsize] %endif - ADD16_OP_INTRA %1, 20, 1+4*8 - ADD16_OP_INTRA %1, 22, 1+5*8 + ADD16_OP_INTRA %1, 32, 4+11*8 + ADD16_OP_INTRA %1, 34, 4+12*8 REP_RET AC %1, 16 AC %1, 18 - AC %1, 20 - AC %1, 22 + AC %1, 32 + AC %1, 34 %endmacro ; IDCT_ADD8 -- cgit v1.2.3