author	Jason Garrett-Glaser <darkshikari@gmail.com>	2009-06-16 09:00:55 +0000
committer	Jason Garrett-Glaser <darkshikari@gmail.com>	2009-06-16 09:00:55 +0000
commit	4f717c69ed25a701f8b6613ca00e5e632a6382a6 (patch)
tree	0c82c716bd1f4f88d7645499692f3e213f4ffe68	/libavcodec/x86/vc1dsp_mmx.c
parent	41faa87886e6fc54f159da6940b9edbfcd194714 (diff)
idct_dc for VC-1/WMV3 decoder; ~11% faster decoding overall.
Includes mmx2 asm for the various functions. Note that the actual idct still does not have an x86 SIMD implementation. For wmv3 files using the regular idct, the decoder just falls back to simple_idct, since simple_idct_dc doesn't exist (yet).
Originally committed as revision 19204 to svn://svn.ffmpeg.org/ffmpeg/trunk
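For reference, the DC-only case reduces to scaling the single DC coefficient through the row and column passes and then adding it to every pixel of the block with clamping. Below is a minimal scalar sketch of the 8x8 case that mirrors the rounding used by the MMX2 code in this patch; the function name is illustrative and av_clip_uint8() is FFmpeg's existing clamping helper, not something this patch adds:

    /* Scalar sketch of the 8x8 DC-only inverse transform (illustrative only).
     * Needs libavutil/common.h for av_clip_uint8() and dsputil.h for DCTELEM. */
    static void inv_trans_8x8_dc_ref(uint8_t *dest, int linesize, DCTELEM *block)
    {
        int i, j;
        int dc = block[0];
        dc = (3 * dc +  1) >> 1;  /* 8-point row pass, only the DC term survives */
        dc = (3 * dc + 16) >> 5;  /* 8-point column pass with final rounding */
        for (i = 0; i < 8; i++) {
            for (j = 0; j < 8; j++)
                dest[j] = av_clip_uint8(dest[j] + dc);
            dest += linesize;
        }
    }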
Diffstat (limited to 'libavcodec/x86/vc1dsp_mmx.c')
-rw-r--r--	libavcodec/x86/vc1dsp_mmx.c	203
1 file changed, 203 insertions, 0 deletions
diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
index 2c98b37b97..3071d02808 100644
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -494,6 +494,204 @@ DECLARE_FUNCTION(3, 1)
DECLARE_FUNCTION(3, 2)
DECLARE_FUNCTION(3, 3)
+static void vc1_inv_trans_4x4_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
+{
+ int dc = block[0];
+ dc = (17 * dc + 4) >> 3;
+ dc = (17 * dc + 64) >> 7;
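+ /* dc may be negative: mm0 gets dc clamped to [0,255] in every byte and
+ * mm1 gets -dc clamped the same way, so the unsigned saturating
+ * paddusb/psubusb below compute clip(dest + dc, 0, 255) per pixel. */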
+ __asm__ volatile(
+ "movd %0, %%mm0 \n\t"
+ "pshufw $0, %%mm0, %%mm0 \n\t"
+ "pxor %%mm1, %%mm1 \n\t"
+ "psubw %%mm0, %%mm1 \n\t"
+ "packuswb %%mm0, %%mm0 \n\t"
+ "packuswb %%mm1, %%mm1 \n\t"
+ ::"r"(dc)
+ );
+ __asm__ volatile(
+ "movd %0, %%mm2 \n\t"
+ "movd %1, %%mm3 \n\t"
+ "movd %2, %%mm4 \n\t"
+ "movd %3, %%mm5 \n\t"
+ "paddusb %%mm0, %%mm2 \n\t"
+ "paddusb %%mm0, %%mm3 \n\t"
+ "paddusb %%mm0, %%mm4 \n\t"
+ "paddusb %%mm0, %%mm5 \n\t"
+ "psubusb %%mm1, %%mm2 \n\t"
+ "psubusb %%mm1, %%mm3 \n\t"
+ "psubusb %%mm1, %%mm4 \n\t"
+ "psubusb %%mm1, %%mm5 \n\t"
+ "movd %%mm2, %0 \n\t"
+ "movd %%mm3, %1 \n\t"
+ "movd %%mm4, %2 \n\t"
+ "movd %%mm5, %3 \n\t"
+ :"+m"(*(uint32_t*)(dest+0*linesize)),
+ "+m"(*(uint32_t*)(dest+1*linesize)),
+ "+m"(*(uint32_t*)(dest+2*linesize)),
+ "+m"(*(uint32_t*)(dest+3*linesize))
+ );
+}
+
+static void vc1_inv_trans_4x8_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
+{
+ int dc = block[0];
+ dc = (17 * dc + 4) >> 3;
+ dc = (12 * dc + 64) >> 7;
+ __asm__ volatile(
+ "movd %0, %%mm0 \n\t"
+ "pshufw $0, %%mm0, %%mm0 \n\t"
+ "pxor %%mm1, %%mm1 \n\t"
+ "psubw %%mm0, %%mm1 \n\t"
+ "packuswb %%mm0, %%mm0 \n\t"
+ "packuswb %%mm1, %%mm1 \n\t"
+ ::"r"(dc)
+ );
+ __asm__ volatile(
+ "movd %0, %%mm2 \n\t"
+ "movd %1, %%mm3 \n\t"
+ "movd %2, %%mm4 \n\t"
+ "movd %3, %%mm5 \n\t"
+ "paddusb %%mm0, %%mm2 \n\t"
+ "paddusb %%mm0, %%mm3 \n\t"
+ "paddusb %%mm0, %%mm4 \n\t"
+ "paddusb %%mm0, %%mm5 \n\t"
+ "psubusb %%mm1, %%mm2 \n\t"
+ "psubusb %%mm1, %%mm3 \n\t"
+ "psubusb %%mm1, %%mm4 \n\t"
+ "psubusb %%mm1, %%mm5 \n\t"
+ "movd %%mm2, %0 \n\t"
+ "movd %%mm3, %1 \n\t"
+ "movd %%mm4, %2 \n\t"
+ "movd %%mm5, %3 \n\t"
+ :"+m"(*(uint32_t*)(dest+0*linesize)),
+ "+m"(*(uint32_t*)(dest+1*linesize)),
+ "+m"(*(uint32_t*)(dest+2*linesize)),
+ "+m"(*(uint32_t*)(dest+3*linesize))
+ );
+ dest += 4*linesize;
+ __asm__ volatile(
+ "movd %0, %%mm2 \n\t"
+ "movd %1, %%mm3 \n\t"
+ "movd %2, %%mm4 \n\t"
+ "movd %3, %%mm5 \n\t"
+ "paddusb %%mm0, %%mm2 \n\t"
+ "paddusb %%mm0, %%mm3 \n\t"
+ "paddusb %%mm0, %%mm4 \n\t"
+ "paddusb %%mm0, %%mm5 \n\t"
+ "psubusb %%mm1, %%mm2 \n\t"
+ "psubusb %%mm1, %%mm3 \n\t"
+ "psubusb %%mm1, %%mm4 \n\t"
+ "psubusb %%mm1, %%mm5 \n\t"
+ "movd %%mm2, %0 \n\t"
+ "movd %%mm3, %1 \n\t"
+ "movd %%mm4, %2 \n\t"
+ "movd %%mm5, %3 \n\t"
+ :"+m"(*(uint32_t*)(dest+0*linesize)),
+ "+m"(*(uint32_t*)(dest+1*linesize)),
+ "+m"(*(uint32_t*)(dest+2*linesize)),
+ "+m"(*(uint32_t*)(dest+3*linesize))
+ );
+}
+
+static void vc1_inv_trans_8x4_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
+{
+ int dc = block[0];
+ dc = ( 3 * dc + 1) >> 1;
+ dc = (17 * dc + 64) >> 7;
+ __asm__ volatile(
+ "movd %0, %%mm0 \n\t"
+ "pshufw $0, %%mm0, %%mm0 \n\t"
+ "pxor %%mm1, %%mm1 \n\t"
+ "psubw %%mm0, %%mm1 \n\t"
+ "packuswb %%mm0, %%mm0 \n\t"
+ "packuswb %%mm1, %%mm1 \n\t"
+ ::"r"(dc)
+ );
+ __asm__ volatile(
+ "movq %0, %%mm2 \n\t"
+ "movq %1, %%mm3 \n\t"
+ "movq %2, %%mm4 \n\t"
+ "movq %3, %%mm5 \n\t"
+ "paddusb %%mm0, %%mm2 \n\t"
+ "paddusb %%mm0, %%mm3 \n\t"
+ "paddusb %%mm0, %%mm4 \n\t"
+ "paddusb %%mm0, %%mm5 \n\t"
+ "psubusb %%mm1, %%mm2 \n\t"
+ "psubusb %%mm1, %%mm3 \n\t"
+ "psubusb %%mm1, %%mm4 \n\t"
+ "psubusb %%mm1, %%mm5 \n\t"
+ "movq %%mm2, %0 \n\t"
+ "movq %%mm3, %1 \n\t"
+ "movq %%mm4, %2 \n\t"
+ "movq %%mm5, %3 \n\t"
+ :"+m"(*(uint32_t*)(dest+0*linesize)),
+ "+m"(*(uint32_t*)(dest+1*linesize)),
+ "+m"(*(uint32_t*)(dest+2*linesize)),
+ "+m"(*(uint32_t*)(dest+3*linesize))
+ );
+}
+
+static void vc1_inv_trans_8x8_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
+{
+ int dc = block[0];
+ dc = (3 * dc + 1) >> 1;
+ dc = (3 * dc + 16) >> 5;
+ __asm__ volatile(
+ "movd %0, %%mm0 \n\t"
+ "pshufw $0, %%mm0, %%mm0 \n\t"
+ "pxor %%mm1, %%mm1 \n\t"
+ "psubw %%mm0, %%mm1 \n\t"
+ "packuswb %%mm0, %%mm0 \n\t"
+ "packuswb %%mm1, %%mm1 \n\t"
+ ::"r"(dc)
+ );
+ __asm__ volatile(
+ "movq %0, %%mm2 \n\t"
+ "movq %1, %%mm3 \n\t"
+ "movq %2, %%mm4 \n\t"
+ "movq %3, %%mm5 \n\t"
+ "paddusb %%mm0, %%mm2 \n\t"
+ "paddusb %%mm0, %%mm3 \n\t"
+ "paddusb %%mm0, %%mm4 \n\t"
+ "paddusb %%mm0, %%mm5 \n\t"
+ "psubusb %%mm1, %%mm2 \n\t"
+ "psubusb %%mm1, %%mm3 \n\t"
+ "psubusb %%mm1, %%mm4 \n\t"
+ "psubusb %%mm1, %%mm5 \n\t"
+ "movq %%mm2, %0 \n\t"
+ "movq %%mm3, %1 \n\t"
+ "movq %%mm4, %2 \n\t"
+ "movq %%mm5, %3 \n\t"
+ :"+m"(*(uint32_t*)(dest+0*linesize)),
+ "+m"(*(uint32_t*)(dest+1*linesize)),
+ "+m"(*(uint32_t*)(dest+2*linesize)),
+ "+m"(*(uint32_t*)(dest+3*linesize))
+ );
+ dest += 4*linesize;
+ __asm__ volatile(
+ "movq %0, %%mm2 \n\t"
+ "movq %1, %%mm3 \n\t"
+ "movq %2, %%mm4 \n\t"
+ "movq %3, %%mm5 \n\t"
+ "paddusb %%mm0, %%mm2 \n\t"
+ "paddusb %%mm0, %%mm3 \n\t"
+ "paddusb %%mm0, %%mm4 \n\t"
+ "paddusb %%mm0, %%mm5 \n\t"
+ "psubusb %%mm1, %%mm2 \n\t"
+ "psubusb %%mm1, %%mm3 \n\t"
+ "psubusb %%mm1, %%mm4 \n\t"
+ "psubusb %%mm1, %%mm5 \n\t"
+ "movq %%mm2, %0 \n\t"
+ "movq %%mm3, %1 \n\t"
+ "movq %%mm4, %2 \n\t"
+ "movq %%mm5, %3 \n\t"
+ :"+m"(*(uint32_t*)(dest+0*linesize)),
+ "+m"(*(uint32_t*)(dest+1*linesize)),
+ "+m"(*(uint32_t*)(dest+2*linesize)),
+ "+m"(*(uint32_t*)(dest+3*linesize))
+ );
+}
+
void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx) {
mm_flags = mm_support();
@@ -537,5 +735,10 @@ void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx) {
dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmx2;
dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmx2;
dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmx2;
+
+ dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmx2;
+ dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmx2;
+ dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmx2;
+ dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmx2;
}
}