diff options
Diffstat (limited to 'libavcodec/x86/xvididct_sse2.c')
-rw-r--r-- | libavcodec/x86/xvididct_sse2.c | 195 |
1 files changed, 97 insertions, 98 deletions
diff --git a/libavcodec/x86/xvididct_sse2.c b/libavcodec/x86/xvididct_sse2.c index 9a87df00b8..d4f01693ea 100644 --- a/libavcodec/x86/xvididct_sse2.c +++ b/libavcodec/x86/xvididct_sse2.c @@ -52,49 +52,49 @@ * @brief SSE2 IDCT compatible with the Xvid IDCT */ -#define X8(x) x,x,x,x,x,x,x,x +#define X8(x) x, x, x, x, x, x, x, x -DECLARE_ASM_CONST(16, int16_t, tan1)[] = {X8(13036)}; // tan( pi/16) -DECLARE_ASM_CONST(16, int16_t, tan2)[] = {X8(27146)}; // tan(2pi/16) = sqrt(2)-1 -DECLARE_ASM_CONST(16, int16_t, tan3)[] = {X8(43790)}; // tan(3pi/16)-1 -DECLARE_ASM_CONST(16, int16_t, sqrt2)[]= {X8(23170)}; // 0.5/sqrt(2) -DECLARE_ASM_CONST(8, uint8_t, m127)[] = {X8(127)}; +DECLARE_ASM_CONST(16, int16_t, tan1)[] = { X8(13036) }; // tan( pi/16) +DECLARE_ASM_CONST(16, int16_t, tan2)[] = { X8(27146) }; // tan(2pi/16) = sqrt(2)-1 +DECLARE_ASM_CONST(16, int16_t, tan3)[] = { X8(43790) }; // tan(3pi/16)-1 +DECLARE_ASM_CONST(16, int16_t, sqrt2)[] = { X8(23170) }; // 0.5/sqrt(2) +DECLARE_ASM_CONST(8, uint8_t, m127)[] = { X8(127) }; DECLARE_ASM_CONST(16, int16_t, iTab1)[] = { - 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d, - 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61, - 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7, - 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b + 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d, + 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61, + 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7, + 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b }; DECLARE_ASM_CONST(16, int16_t, iTab2)[] = { - 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5, - 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04, - 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41, - 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df + 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5, + 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04, + 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41, + 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df }; DECLARE_ASM_CONST(16, int16_t, iTab3)[] = { - 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf, - 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf, - 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d, - 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04 + 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf, + 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf, + 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d, + 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04 }; DECLARE_ASM_CONST(16, int16_t, iTab4)[] = { - 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746, - 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac, - 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df, - 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e + 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746, + 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac, + 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df, + 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e }; DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = { - 65536, 65536, 65536, 65536, - 3597, 3597, 3597, 3597, - 2260, 2260, 2260, 2260, - 1203, 1203, 1203, 1203, - 120, 120, 120, 120, - 512, 512, 512, 512 + 65536, 65536, 65536, 65536, + 3597, 3597, 3597, 3597, + 2260, 2260, 2260, 2260, + 1203, 1203, 1203, 1203, + 120, 120, 120, 120, + 512, 512, 512, 512 }; // Temporary storage before the column pass @@ -164,46 +164,46 @@ DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = { "pmovmskb %%mm1, "reg" \n\t" #define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \ - clear1 \ - clear2 \ - "movq "row1", %%mm1 \n\t" \ - "por 8+"row1", %%mm1 \n\t" \ - "movq "row2", %%mm2 \n\t" \ - "por 8+"row2", %%mm2 \n\t" \ - "paddusb %%mm0, %%mm1 \n\t" \ - "paddusb %%mm0, %%mm2 \n\t" \ - "pmovmskb %%mm1, "reg1" \n\t" \ + clear1 \ + clear2 \ + "movq "row1", %%mm1 \n\t" \ + "por 8+"row1", %%mm1 \n\t" \ + "movq "row2", %%mm2 \n\t" \ + "por 8+"row2", %%mm2 \n\t" \ + "paddusb %%mm0, %%mm1 \n\t" \ + "paddusb %%mm0, %%mm2 \n\t" \ + "pmovmskb %%mm1, "reg1" \n\t" \ "pmovmskb %%mm2, "reg2" \n\t" -///IDCT pass on rows. -#define iMTX_MULT(src, table, rounder, put) \ - "movdqa "src", %%xmm3 \n\t" \ - "movdqa %%xmm3, %%xmm0 \n\t" \ +/// IDCT pass on rows. +#define iMTX_MULT(src, table, rounder, put) \ + "movdqa "src", %%xmm3 \n\t" \ + "movdqa %%xmm3, %%xmm0 \n\t" \ "pshufd $0x11, %%xmm3, %%xmm1 \n\t" /* 4602 */ \ "punpcklqdq %%xmm0, %%xmm0 \n\t" /* 0246 */ \ - "pmaddwd "table", %%xmm0 \n\t" \ - "pmaddwd 16+"table", %%xmm1 \n\t" \ + "pmaddwd "table", %%xmm0 \n\t" \ + "pmaddwd 16+"table", %%xmm1 \n\t" \ "pshufd $0xBB, %%xmm3, %%xmm2 \n\t" /* 5713 */ \ "punpckhqdq %%xmm3, %%xmm3 \n\t" /* 1357 */ \ - "pmaddwd 32+"table", %%xmm2 \n\t" \ - "pmaddwd 48+"table", %%xmm3 \n\t" \ - "paddd %%xmm1, %%xmm0 \n\t" \ - "paddd %%xmm3, %%xmm2 \n\t" \ - rounder", %%xmm0 \n\t" \ - "movdqa %%xmm2, %%xmm3 \n\t" \ - "paddd %%xmm0, %%xmm2 \n\t" \ - "psubd %%xmm3, %%xmm0 \n\t" \ - "psrad $11, %%xmm2 \n\t" \ - "psrad $11, %%xmm0 \n\t" \ - "packssdw %%xmm0, %%xmm2 \n\t" \ - put \ + "pmaddwd 32+"table", %%xmm2 \n\t" \ + "pmaddwd 48+"table", %%xmm3 \n\t" \ + "paddd %%xmm1, %%xmm0 \n\t" \ + "paddd %%xmm3, %%xmm2 \n\t" \ + rounder", %%xmm0 \n\t" \ + "movdqa %%xmm2, %%xmm3 \n\t" \ + "paddd %%xmm0, %%xmm2 \n\t" \ + "psubd %%xmm3, %%xmm0 \n\t" \ + "psrad $11, %%xmm2 \n\t" \ + "psrad $11, %%xmm0 \n\t" \ + "packssdw %%xmm0, %%xmm2 \n\t" \ + put \ "1: \n\t" #define iLLM_HEAD \ "movdqa "MANGLE(tan3)", "TAN3" \n\t" \ "movdqa "MANGLE(tan1)", "TAN1" \n\t" \ -///IDCT pass on columns. +/// IDCT pass on columns. #define iLLM_PASS(dct) \ "movdqa "TAN3", %%xmm1 \n\t" \ "movdqa "TAN1", %%xmm3 \n\t" \ @@ -281,7 +281,7 @@ DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = { "movdqa "REG4", 4*16("dct") \n\t" \ "movdqa %%xmm7, 7*16("dct") \n\t" -///IDCT pass on columns, assuming rows 4-7 are zero. +/// IDCT pass on columns, assuming rows 4-7 are zero. #define iLLM_PASS_SPARSE(dct) \ "pmulhw %%xmm4, "TAN3" \n\t" \ "paddsw %%xmm4, "TAN3" \n\t" \ @@ -344,51 +344,50 @@ DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = { inline void ff_xvid_idct_sse2(short *block) { - __asm__ volatile( - "movq "MANGLE(m127)", %%mm0 \n\t" - iMTX_MULT("(%0)", MANGLE(iTab1), ROUND(walkenIdctRounders), PUT_EVEN(ROW0)) - iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+1*16), PUT_ODD(ROW1)) - iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+2*16), PUT_EVEN(ROW2)) - - TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4)) - JZ("%%eax", "1f") - iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+3*16), PUT_ODD(ROW3)) - - TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6)) - TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7)) - iLLM_HEAD - ".p2align 4 \n\t" - JNZ("%%ecx", "2f") - JNZ("%%eax", "3f") - JNZ("%%edx", "4f") - JNZ("%%esi", "5f") - iLLM_PASS_SPARSE("%0") - "jmp 6f \n\t" - "2: \n\t" - iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4)) - "3: \n\t" - iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+4*16), PUT_ODD(ROW5)) - JZ("%%edx", "1f") - "4: \n\t" - iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+5*16), PUT_EVEN(ROW6)) - JZ("%%esi", "1f") - "5: \n\t" - iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+5*16), PUT_ODD(ROW7)) + __asm__ volatile ( + "movq "MANGLE (m127) ", %%mm0 \n\t" + iMTX_MULT("(%0)", MANGLE(iTab1), ROUND(walkenIdctRounders), PUT_EVEN(ROW0)) + iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders + 1 * 16), PUT_ODD(ROW1)) + iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders + 2 * 16), PUT_EVEN(ROW2)) + + TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4)) + JZ("%%eax", "1f") + iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders + 3 * 16), PUT_ODD(ROW3)) + + TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6)) + TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7)) + iLLM_HEAD + ".p2align 4 \n\t" + JNZ("%%ecx", "2f") + JNZ("%%eax", "3f") + JNZ("%%edx", "4f") + JNZ("%%esi", "5f") + iLLM_PASS_SPARSE("%0") + "jmp 6f \n\t" + "2: \n\t" + iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4)) + "3: \n\t" + iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders + 4 * 16), PUT_ODD(ROW5)) + JZ("%%edx", "1f") + "4: \n\t" + iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders + 5 * 16), PUT_EVEN(ROW6)) + JZ("%%esi", "1f") + "5: \n\t" + iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders + 5 * 16), PUT_ODD(ROW7)) #if ARCH_X86_32 - iLLM_HEAD + iLLM_HEAD #endif - iLLM_PASS("%0") - "6: \n\t" - : "+r"(block) - : - : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , - "%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" ,) + iLLM_PASS("%0") + "6: \n\t" + : "+r" (block) + : + : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", ) #if ARCH_X86_64 - XMM_CLOBBERS("%xmm8" , "%xmm9" , "%xmm10", "%xmm11", - "%xmm12", "%xmm13", "%xmm14",) + XMM_CLOBBERS("%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", ) #endif - "%eax", "%ecx", "%edx", "%esi", "memory" - ); + "%eax", "%ecx", "%edx", "%esi", "memory"); } void ff_xvid_idct_sse2_put(uint8_t *dest, int line_size, short *block) |