summaryrefslogtreecommitdiff
path: root/libavcodec
diff options
context:
space:
mode:
Diffstat (limited to 'libavcodec')
-rw-r--r--libavcodec/dct-test.c2
-rw-r--r--libavcodec/libopenjpegdec.c14
-rw-r--r--libavcodec/vp8.c13
-rw-r--r--libavcodec/x86/dsputil_mmx.c85
-rw-r--r--libavcodec/x86/h264_qpel_mmx.c4
-rw-r--r--libavcodec/x86/idct_mmx.c4
-rw-r--r--libavcodec/x86/idct_mmx_xvid.c4
-rw-r--r--libavcodec/x86/idct_sse2_xvid.c4
-rw-r--r--libavcodec/x86/rv40dsp_init.c2
-rw-r--r--libavcodec/x86/simple_idct_mmx.c4
-rw-r--r--libavcodec/x86/vp3dsp.asm120
11 files changed, 174 insertions, 82 deletions
diff --git a/libavcodec/dct-test.c b/libavcodec/dct-test.c
index 37097aac58..d4c32aa6b8 100644
--- a/libavcodec/dct-test.c
+++ b/libavcodec/dct-test.c
@@ -125,7 +125,7 @@ static const struct algo idct_tab[] = {
{ "INT", ff_j_rev_dct, MMX_PERM },
{ "SIMPLE-C", ff_simple_idct_8, NO_PERM },
-#if HAVE_MMX
+#if HAVE_MMX && HAVE_INLINE_ASM
#if CONFIG_GPL
{ "LIBMPEG2-MMX", ff_mmx_idct, MMX_PERM, AV_CPU_FLAG_MMX, 1 },
{ "LIBMPEG2-MMX2", ff_mmxext_idct, MMX_PERM, AV_CPU_FLAG_MMX2, 1 },
diff --git a/libavcodec/libopenjpegdec.c b/libavcodec/libopenjpegdec.c
index 333955feb0..bb7b497d58 100644
--- a/libavcodec/libopenjpegdec.c
+++ b/libavcodec/libopenjpegdec.c
@@ -27,9 +27,9 @@
#define OPJ_STATIC
#include <openjpeg.h>
+#include "libavutil/intreadwrite.h"
#include "libavutil/imgutils.h"
#include "libavutil/pixfmt.h"
-#include "libavutil/intreadwrite.h"
#include "libavutil/opt.h"
#include "avcodec.h"
#include "thread.h"
@@ -293,15 +293,12 @@ static int libopenjpeg_decode_frame(AVCodecContext *avctx,
avcodec_set_dimensions(avctx, width, height);
- if (avctx->pix_fmt != PIX_FMT_NONE) {
- if (!libopenjpeg_matches_pix_fmt(image, avctx->pix_fmt)) {
+ if (avctx->pix_fmt != PIX_FMT_NONE)
+ if (!libopenjpeg_matches_pix_fmt(image, avctx->pix_fmt))
avctx->pix_fmt = PIX_FMT_NONE;
- }
- }
- if (avctx->pix_fmt == PIX_FMT_NONE) {
+ if (avctx->pix_fmt == PIX_FMT_NONE)
avctx->pix_fmt = libopenjpeg_guess_pix_fmt(image);
- }
if (avctx->pix_fmt == PIX_FMT_NONE) {
av_log(avctx, AV_LOG_ERROR, "Unable to determine pixel format\n");
@@ -331,9 +328,10 @@ static int libopenjpeg_decode_frame(AVCodecContext *avctx,
}
opj_image_destroy(image);
- // Decode the codestream.
+ // Decode the codestream
image = opj_decode_with_info(dec, stream, NULL);
opj_cio_close(stream);
+
if (!image) {
av_log(avctx, AV_LOG_ERROR, "Error decoding codestream.\n");
goto done;
diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index 9a47ad6da6..c86bad8966 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -249,12 +249,13 @@ static void get_quants(VP8Context *s)
} else
base_qi = yac_qi;
- s->qmat[i].luma_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta , 7)];
- s->qmat[i].luma_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi , 7)];
- s->qmat[i].luma_dc_qmul[0] = 2 * vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)];
- s->qmat[i].luma_dc_qmul[1] = 155 * vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)] / 100;
- s->qmat[i].chroma_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
- s->qmat[i].chroma_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];
+ s->qmat[i].luma_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta , 7)];
+ s->qmat[i].luma_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi , 7)];
+ s->qmat[i].luma_dc_qmul[0] = 2 * vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)];
+ /* 101581>>16 is equivalent to 155/100 */
+ s->qmat[i].luma_dc_qmul[1] = (101581 * vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)]) >> 16;
+ s->qmat[i].chroma_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
+ s->qmat[i].chroma_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];
s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
s->qmat[i].chroma_qmul[0] = FFMIN(s->qmat[i].chroma_qmul[0], 132);
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index c2ee5a85e4..1488389572 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -85,6 +85,8 @@ DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEF
DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
+#if HAVE_INLINE_ASM
+
#define JUMPALIGN() __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
@@ -246,14 +248,14 @@ void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
pix = pixels;
/* unrolled loop */
__asm__ volatile (
- "movq (%3), %%mm0 \n\t"
- "movq 8(%3), %%mm1 \n\t"
- "movq 16(%3), %%mm2 \n\t"
- "movq 24(%3), %%mm3 \n\t"
- "movq 32(%3), %%mm4 \n\t"
- "movq 40(%3), %%mm5 \n\t"
- "movq 48(%3), %%mm6 \n\t"
- "movq 56(%3), %%mm7 \n\t"
+ "movq (%3), %%mm0 \n\t"
+ "movq 8(%3), %%mm1 \n\t"
+ "movq 16(%3), %%mm2 \n\t"
+ "movq 24(%3), %%mm3 \n\t"
+ "movq 32(%3), %%mm4 \n\t"
+ "movq 40(%3), %%mm5 \n\t"
+ "movq 48(%3), %%mm6 \n\t"
+ "movq 56(%3), %%mm7 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"packuswb %%mm3, %%mm2 \n\t"
"packuswb %%mm5, %%mm4 \n\t"
@@ -1856,6 +1858,8 @@ void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
avg_pixels16_xy2_mmx(dst, src, stride, 16);
}
+#endif /* HAVE_INLINE_ASM */
+
#if HAVE_YASM
typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src,
x86_reg linesize, x86_reg start_y,
@@ -1924,6 +1928,8 @@ static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
}
#endif /* HAVE_YASM */
+#if HAVE_INLINE_ASM
+
typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
int linesize, int block_w, int block_h,
int src_x, int src_y, int w, int h);
@@ -2093,6 +2099,8 @@ PREFETCH(prefetch_mmx2, prefetcht0)
PREFETCH(prefetch_3dnow, prefetch)
#undef PREFETCH
+#endif /* HAVE_INLINE_ASM */
+
#include "h264_qpel_mmx.c"
void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src,
@@ -2138,6 +2146,8 @@ CHROMA_MC(avg, 8, 10, sse2)
CHROMA_MC(put, 8, 10, avx)
CHROMA_MC(avg, 8, 10, avx)
+#if HAVE_INLINE_ASM
+
/* CAVS-specific */
void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
@@ -2574,6 +2584,8 @@ static void vector_clipf_sse(float *dst, const float *src,
);
}
+#endif /* HAVE_INLINE_ASM */
+
int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2,
int order);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
@@ -2686,6 +2698,7 @@ static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
+#if HAVE_INLINE_ASM
c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
@@ -2708,10 +2721,6 @@ static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
#if ARCH_X86_32 || !HAVE_YASM
c->gmc = gmc_mmx;
#endif
-#if ARCH_X86_32 && HAVE_YASM
- if (!high_bit_depth)
- c->emulated_edge_mc = emulated_edge_mc_mmx;
-#endif
c->add_bytes = add_bytes_mmx;
@@ -2722,8 +2731,14 @@ static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
c->h263_v_loop_filter = h263_v_loop_filter_mmx;
c->h263_h_loop_filter = h263_h_loop_filter_mmx;
}
+#endif /* HAVE_INLINE_ASM */
#if HAVE_YASM
+#if ARCH_X86_32
+ if (!high_bit_depth)
+ c->emulated_edge_mc = emulated_edge_mc_mmx;
+#endif
+
if (!high_bit_depth && CONFIG_H264CHROMA) {
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_mmx_rnd;
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
@@ -2740,6 +2755,7 @@ static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx,
const int bit_depth = avctx->bits_per_raw_sample;
const int high_bit_depth = bit_depth > 8;
+#if HAVE_INLINE_ASM
c->prefetch = prefetch_mmx2;
if (!high_bit_depth) {
@@ -2775,22 +2791,27 @@ static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx,
c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
}
+#endif /* HAVE_INLINE_ASM */
if (CONFIG_H264QPEL) {
+#if HAVE_INLINE_ASM
SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2, );
SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2, );
SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, );
SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2, );
SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2, );
SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2, );
+#endif /* HAVE_INLINE_ASM */
if (!high_bit_depth) {
+#if HAVE_INLINE_ASM
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, );
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2, );
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2, );
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, );
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2, );
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2, );
+#endif /* HAVE_INLINE_ASM */
} else if (bit_depth == 10) {
#if HAVE_YASM
#if !ARCH_X86_64
@@ -2804,10 +2825,12 @@ static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx,
#endif
}
+#if HAVE_INLINE_ASM
SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, );
SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2, );
SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, );
SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2, );
+#endif /* HAVE_INLINE_ASM */
}
#if HAVE_YASM
@@ -2842,6 +2865,7 @@ static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
{
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
+#if HAVE_INLINE_ASM
c->prefetch = prefetch_3dnow;
if (!high_bit_depth) {
@@ -2899,25 +2923,26 @@ static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow, );
}
-#if HAVE_YASM
- if (!high_bit_depth && CONFIG_H264CHROMA) {
- c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_3dnow_rnd;
- c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
- }
-#endif
-
c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
#if HAVE_7REGS
if (mm_flags & AV_CPU_FLAG_CMOV)
c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
#endif
+#endif /* HAVE_INLINE_ASM */
+
+#if HAVE_YASM
+ if (!high_bit_depth && CONFIG_H264CHROMA) {
+ c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_3dnow_rnd;
+ c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
+ }
+#endif
}
static void dsputil_init_3dnow2(DSPContext *c, AVCodecContext *avctx,
int mm_flags)
{
-#if HAVE_6REGS
+#if HAVE_6REGS && HAVE_INLINE_ASM
c->vector_fmul_window = vector_fmul_window_3dnow2;
#endif
}
@@ -2926,6 +2951,7 @@ static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
+#if HAVE_INLINE_ASM
if (!high_bit_depth) {
if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
/* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
@@ -2936,31 +2962,35 @@ static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
c->ac3_downmix = ac3_downmix_sse;
-#if HAVE_YASM
- c->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
- c->vector_fmul_add = ff_vector_fmul_add_sse;
-#endif
#if HAVE_6REGS
c->vector_fmul_window = vector_fmul_window_sse;
#endif
c->vector_clipf = vector_clipf_sse;
+#endif /* HAVE_INLINE_ASM */
#if HAVE_YASM
+ c->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
+ c->vector_fmul_add = ff_vector_fmul_add_sse;
+
c->scalarproduct_float = ff_scalarproduct_float_sse;
c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;
if (!high_bit_depth)
c->emulated_edge_mc = emulated_edge_mc_sse;
+#if HAVE_INLINE_ASM
c->gmc = gmc_sse;
#endif
+#endif
}
static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
int mm_flags)
{
const int bit_depth = avctx->bits_per_raw_sample;
+
+#if HAVE_INLINE_ASM
const int high_bit_depth = bit_depth > 8;
if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
@@ -2988,6 +3018,7 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
H264_QPEL_FUNCS(3, 2, sse2);
H264_QPEL_FUNCS(3, 3, sse2);
}
+#endif /* HAVE_INLINE_ASM */
#if HAVE_YASM
if (bit_depth == 10) {
@@ -3029,6 +3060,7 @@ static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
const int bit_depth = avctx->bits_per_raw_sample;
+#if HAVE_INLINE_ASM
if (!high_bit_depth && CONFIG_H264QPEL) {
H264_QPEL_FUNCS(1, 0, ssse3);
H264_QPEL_FUNCS(1, 1, ssse3);
@@ -3043,8 +3075,9 @@ static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
H264_QPEL_FUNCS(3, 2, ssse3);
H264_QPEL_FUNCS(3, 3, ssse3);
}
+#endif /* HAVE_INLINE_ASM */
#if HAVE_YASM
- else if (bit_depth == 10 && CONFIG_H264QPEL) {
+ if (bit_depth == 10 && CONFIG_H264QPEL) {
H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
@@ -3108,6 +3141,7 @@ void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
int mm_flags = av_get_cpu_flags();
if (mm_flags & AV_CPU_FLAG_MMX) {
+#if HAVE_INLINE_ASM
const int idct_algo = avctx->idct_algo;
if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
@@ -3148,6 +3182,7 @@ void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
}
}
}
+#endif /* HAVE_INLINE_ASM */
dsputil_init_mmx(c, avctx, mm_flags);
}
diff --git a/libavcodec/x86/h264_qpel_mmx.c b/libavcodec/x86/h264_qpel_mmx.c
index 53c840ee4f..71a1fbeed9 100644
--- a/libavcodec/x86/h264_qpel_mmx.c
+++ b/libavcodec/x86/h264_qpel_mmx.c
@@ -21,6 +21,8 @@
#include "dsputil_mmx.h"
+#if HAVE_INLINE_ASM
+
/***********************************/
/* motion compensation */
@@ -1191,7 +1193,7 @@ H264_MC_816(H264_MC_H, ssse3)
H264_MC_816(H264_MC_HV, ssse3)
#endif
-
+#endif /* HAVE_INLINE_ASM */
//10bit
#define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \
diff --git a/libavcodec/x86/idct_mmx.c b/libavcodec/x86/idct_mmx.c
index f199941f55..2408ab26ad 100644
--- a/libavcodec/x86/idct_mmx.c
+++ b/libavcodec/x86/idct_mmx.c
@@ -25,6 +25,8 @@
#include "libavutil/x86_cpu.h"
#include "dsputil_mmx.h"
+#if HAVE_INLINE_ASM
+
#define ROW_SHIFT 11
#define COL_SHIFT 6
@@ -626,3 +628,5 @@ declare_idct (ff_mmxext_idct, mmxext_table,
declare_idct (ff_mmx_idct, mmx_table,
mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid)
+
+#endif /* HAVE_INLINE_ASM */
diff --git a/libavcodec/x86/idct_mmx_xvid.c b/libavcodec/x86/idct_mmx_xvid.c
index 1b48ab52bb..5d8027fb27 100644
--- a/libavcodec/x86/idct_mmx_xvid.c
+++ b/libavcodec/x86/idct_mmx_xvid.c
@@ -43,6 +43,8 @@
#include "libavcodec/avcodec.h"
#include "idct_xvid.h"
+#if HAVE_INLINE_ASM
+
//=============================================================================
// Macros and other preprocessor constants
//=============================================================================
@@ -523,3 +525,5 @@ __asm__ volatile(
DCT_8_INV_COL(8(%0), 8(%0))
:: "r"(block), "r"(rounder_0), "r"(tab_i_04_xmm), "r"(tg_1_16));
}
+
+#endif /* HAVE_INLINE_ASM */
diff --git a/libavcodec/x86/idct_sse2_xvid.c b/libavcodec/x86/idct_sse2_xvid.c
index fc75a57519..3708f93df8 100644
--- a/libavcodec/x86/idct_sse2_xvid.c
+++ b/libavcodec/x86/idct_sse2_xvid.c
@@ -43,6 +43,8 @@
#include "idct_xvid.h"
#include "dsputil_mmx.h"
+#if HAVE_INLINE_ASM
+
/**
* @file
* @brief SSE2 idct compatible with xvidmmx
@@ -401,3 +403,5 @@ void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block)
ff_idct_xvid_sse2(block);
ff_add_pixels_clamped_mmx(block, dest, line_size);
}
+
+#endif /* HAVE_INLINE_ASM */
diff --git a/libavcodec/x86/rv40dsp_init.c b/libavcodec/x86/rv40dsp_init.c
index 41ee6a74c6..bbb1c8eb57 100644
--- a/libavcodec/x86/rv40dsp_init.c
+++ b/libavcodec/x86/rv40dsp_init.c
@@ -194,10 +194,12 @@ void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp)
if (mm_flags & AV_CPU_FLAG_MMX) {
c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx;
c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx;
+#if HAVE_INLINE_ASM
c->put_pixels_tab[0][15] = ff_put_rv40_qpel16_mc33_mmx;
c->put_pixels_tab[1][15] = ff_put_rv40_qpel8_mc33_mmx;
c->avg_pixels_tab[0][15] = ff_avg_rv40_qpel16_mc33_mmx;
c->avg_pixels_tab[1][15] = ff_avg_rv40_qpel8_mc33_mmx;
+#endif /* HAVE_INLINE_ASM */
#if ARCH_X86_32
QPEL_MC_SET(put_, _mmx)
#endif
diff --git a/libavcodec/x86/simple_idct_mmx.c b/libavcodec/x86/simple_idct_mmx.c
index db479ce257..20e51a47f4 100644
--- a/libavcodec/x86/simple_idct_mmx.c
+++ b/libavcodec/x86/simple_idct_mmx.c
@@ -23,6 +23,8 @@
#include "libavcodec/simple_idct.h"
#include "dsputil_mmx.h"
+#if HAVE_INLINE_ASM
+
/*
23170.475006
22725.260826
@@ -1161,3 +1163,5 @@ void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
idct(block);
ff_add_pixels_clamped_mmx(block, dest, line_size);
}
+
+#endif /* HAVE_INLINE_ASM */
diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm
index 0e0bd29a99..46bd9d8f86 100644
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -38,13 +38,11 @@ cextern pb_1
cextern pb_3
cextern pb_7
cextern pb_1F
+cextern pb_80
cextern pb_81
cextern pw_8
-cextern put_signed_pixels_clamped_mmx
-cextern add_pixels_clamped_mmx
-
SECTION .text
; this is off by one or two for some cases when filter_limit is greater than 63
@@ -523,56 +521,96 @@ cglobal vp3_h_loop_filter_mmx2, 3, 4
PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
%endmacro
-%macro vp3_idct_funcs 3
-cglobal vp3_idct_put_%1, 3, %3, %2
+%macro vp3_idct_funcs 1
+cglobal vp3_idct_put_%1, 3, 4, 9
VP3_IDCT_%1 r2
-%if ARCH_X86_64
- mov r3, r2
- mov r2, r1
- mov r1, r0
- mov r0, r3
+
+ movsxdifnidn r1, r1d
+ mova m4, [pb_80]
+ lea r3, [r1*3]
+%assign %%i 0
+%rep 16/mmsize
+ mova m0, [r2+mmsize*0+%%i]
+ mova m1, [r2+mmsize*2+%%i]
+ mova m2, [r2+mmsize*4+%%i]
+ mova m3, [r2+mmsize*6+%%i]
+ packsswb m0, [r2+mmsize*1+%%i]
+ packsswb m1, [r2+mmsize*3+%%i]
+ packsswb m2, [r2+mmsize*5+%%i]
+ packsswb m3, [r2+mmsize*7+%%i]
+ paddb m0, m4
+ paddb m1, m4
+ paddb m2, m4
+ paddb m3, m4
+ movq [r0 ], m0
+%if mmsize == 8
+ movq [r0+r1 ], m1
+ movq [r0+r1*2], m2
+ movq [r0+r3 ], m3
%else
- mov r0m, r2
- mov r1m, r0
- mov r2m, r1
+ movhps [r0+r1 ], m0
+ movq [r0+r1*2], m1
+ movhps [r0+r3 ], m1
%endif
-%if WIN64
- call put_signed_pixels_clamped_mmx
- RET
-%else
- jmp put_signed_pixels_clamped_mmx
+%if %%i == 0
+ lea r0, [r0+r1*4]
+%endif
+%if mmsize == 16
+ movq [r0 ], m2
+ movhps [r0+r1 ], m2
+ movq [r0+r1*2], m3
+ movhps [r0+r3 ], m3
%endif
+%assign %%i %%i+64
+%endrep
+ RET
-cglobal vp3_idct_add_%1, 3, %3, %2
+cglobal vp3_idct_add_%1, 3, 4, 9
VP3_IDCT_%1 r2
-%if ARCH_X86_64
- mov r3, r2
- mov r2, r1
- mov r1, r0
- mov r0, r3
-%else
- mov r0m, r2
- mov r1m, r0
- mov r2m, r1
+
+ mov r3, 4
+ pxor m4, m4
+ movsxdifnidn r1, r1d
+.loop:
+ movq m0, [r0]
+ movq m1, [r0+r1]
+%if mmsize == 8
+ mova m2, m0
+ mova m3, m1
%endif
-%if WIN64
- call add_pixels_clamped_mmx
- RET
-%else
- jmp add_pixels_clamped_mmx
+ punpcklbw m0, m4
+ punpcklbw m1, m4
+%if mmsize == 8
+ punpckhbw m2, m4
+ punpckhbw m3, m4
+%endif
+ paddsw m0, [r2+ 0]
+ paddsw m1, [r2+16]
+%if mmsize == 8
+ paddsw m2, [r2+ 8]
+ paddsw m3, [r2+24]
+ packuswb m0, m2
+ packuswb m1, m3
+%else ; mmsize == 16
+ packuswb m0, m1
%endif
+ movq [r0 ], m0
+%if mmsize == 8
+ movq [r0+r1], m1
+%else ; mmsize == 16
+ movhps [r0+r1], m0
+%endif
+ lea r0, [r0+r1*2]
+ add r2, 32
+ dec r3
+ jg .loop
+ RET
%endmacro
-%if ARCH_X86_64
-%define REGS 4
-%else
-%define REGS 3
-%endif
INIT_MMX
-vp3_idct_funcs mmx, 0, REGS
+vp3_idct_funcs mmx
INIT_XMM
-vp3_idct_funcs sse2, 9, REGS
-%undef REGS
+vp3_idct_funcs sse2
%macro DC_ADD 0
movq m2, [r0 ]