-rw-r--r--  libavcodec/x86/Makefile          4
-rw-r--r--  libavcodec/x86/cavsdsp.c        13
-rw-r--r--  libavcodec/x86/dsputil_init.c   11
-rw-r--r--  libavcodec/x86/dsputil_mmx.c     9
-rw-r--r--  libavcodec/x86/fpel.asm        126
-rw-r--r--  libavcodec/x86/fpel_mmx.c      140
-rw-r--r--  libavcodec/x86/h264_qpel.c      11
-rw-r--r--  libavcodec/x86/hpeldsp_init.c    5
-rw-r--r--  libavcodec/x86/vc1dsp_mmx.c      7
9 files changed, 96 insertions(+), 230 deletions(-)
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index a972a2779a..adf4843a61 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -50,13 +50,11 @@ OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp_init.o
OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp_init.o
MMX-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_mmx.o \
- x86/fpel_mmx.o \
x86/idct_mmx_xvid.o \
x86/idct_sse2_xvid.o \
x86/simple_idct.o
MMX-OBJS-$(CONFIG_DIRAC_DECODER) += x86/dirac_dwt.o
-MMX-OBJS-$(CONFIG_HPELDSP) += x86/fpel_mmx.o \
- x86/hpeldsp_mmx.o
+MMX-OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp_mmx.o
MMX-OBJS-$(CONFIG_SNOW_DECODER) += x86/snowdsp.o
MMX-OBJS-$(CONFIG_SNOW_ENCODER) += x86/snowdsp.o
diff --git a/libavcodec/x86/cavsdsp.c b/libavcodec/x86/cavsdsp.c
index b45126cf01..78d4689ba0 100644
--- a/libavcodec/x86/cavsdsp.c
+++ b/libavcodec/x86/cavsdsp.c
@@ -461,7 +461,7 @@ static void OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *
#endif /* (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE) */
-#if HAVE_MMX_INLINE
+#if HAVE_MMX_EXTERNAL
static void put_cavs_qpel8_mc00_mmx(uint8_t *dst, uint8_t *src,
ptrdiff_t stride)
{
@@ -485,19 +485,23 @@ static void avg_cavs_qpel16_mc00_mmx(uint8_t *dst, uint8_t *src,
{
ff_avg_pixels16_mmx(dst, src, stride, 16);
}
+#endif
static av_cold void cavsdsp_init_mmx(CAVSDSPContext *c,
AVCodecContext *avctx)
{
+#if HAVE_MMX_EXTERNAL
c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_mmx;
c->put_cavs_qpel_pixels_tab[1][0] = put_cavs_qpel8_mc00_mmx;
c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmx;
c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmx;
+#endif
+#if HAVE_MMX_INLINE
c->cavs_idct8_add = cavs_idct8_add_mmx;
c->idct_perm = FF_TRANSPOSE_IDCT_PERM;
-}
#endif /* HAVE_MMX_INLINE */
+}
#define DSPFUNC(PFX, IDX, NUM, EXT) \
c->PFX ## _cavs_qpel_pixels_tab[IDX][ 2] = PFX ## _cavs_qpel ## NUM ## _mc20_ ## EXT; \
@@ -545,12 +549,9 @@ static av_cold void cavsdsp_init_3dnow(CAVSDSPContext *c,
av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c, AVCodecContext *avctx)
{
-#if HAVE_MMX_INLINE
int cpu_flags = av_get_cpu_flags();
- if (INLINE_MMX(cpu_flags))
- cavsdsp_init_mmx(c, avctx);
-#endif /* HAVE_MMX_INLINE */
+ cavsdsp_init_mmx(c, avctx);
#if HAVE_AMD3DNOW_INLINE
if (INLINE_AMD3DNOW(cpu_flags))
cavsdsp_init_3dnow(c, avctx);
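The cavsdsp hunk swaps the inline-asm guard around the fullpel dispatch for HAVE_MMX_EXTERNAL and moves the per-feature #if blocks inside an init function that is now called unconditionally. A minimal sketch of that guard pattern, using hypothetical names (MyDSPContext, HAVE_FOO_*) rather than FFmpeg's own:

    #include <stdio.h>

    /* Hypothetical stand-ins: HAVE_FOO_EXTERNAL/HAVE_FOO_INLINE play the role
     * of HAVE_MMX_EXTERNAL/HAVE_MMX_INLINE, MyDSPContext that of CAVSDSPContext. */
    #define HAVE_FOO_EXTERNAL 1
    #define HAVE_FOO_INLINE   0

    typedef struct MyDSPContext {
        void (*copy_block)(void);
    } MyDSPContext;

    static void copy_block_c(void)        { puts("plain C fallback"); }
    static void copy_block_external(void) { puts("external-asm version"); }

    /* Always called; each block of pointer assignments carries its own guard. */
    static void my_dsp_init(MyDSPContext *c)
    {
        c->copy_block = copy_block_c;
    #if HAVE_FOO_EXTERNAL
        c->copy_block = copy_block_external;
    #endif
    #if HAVE_FOO_INLINE
        /* inline-asm-only assignments would live here */
    #endif
    }

    int main(void)
    {
        MyDSPContext c;
        my_dsp_init(&c);
        c.copy_block();
        return 0;
    }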
diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c
index 6c7b5218c4..414da14118 100644
--- a/libavcodec/x86/dsputil_init.c
+++ b/libavcodec/x86/dsputil_init.c
@@ -73,8 +73,8 @@ void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride);
void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride);
-#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
-#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
+#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmx
+#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmx
int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
int order);
@@ -112,8 +112,11 @@ void ff_vector_clip_int32_sse4(int32_t *dst, const int32_t *src,
#if HAVE_YASM
-CALL_2X_PIXELS(ff_avg_pixels16_mmxext, ff_avg_pixels8_mmxext, 8)
-CALL_2X_PIXELS(ff_put_pixels16_mmxext, ff_put_pixels8_mmxext, 8)
+#define ff_put_pixels16_mmxext ff_put_pixels16_mmx
+#define ff_put_pixels8_mmxext ff_put_pixels8_mmx
+
+void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
#define QPEL_OP(OPNAME, RND, MMX) \
static void OPNAME ## qpel8_mc00_ ## MMX(uint8_t *dst, uint8_t *src, \
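With fpel.asm now emitting genuine 16-pixel functions, the CALL_2X_PIXELS wrappers are no longer needed here: a plain prototype for the mmxext avg, plus #define aliases onto the mmx build for the put variants, is enough. For orientation, CALL_2X_PIXELS expands to roughly the two-call wrapper below; this is a reading aid, not the authoritative FFmpeg definition:

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch of what CALL_2X_PIXELS(a, b, n) effectively expands to: a wrapper
     * that synthesizes a 16-pixel-wide operation from two calls to an 8-wide
     * routine, offset by n bytes. */
    #define CALL_2X_PIXELS(a, b, n)                              \
    static void a(uint8_t *block, const uint8_t *pixels,         \
                  ptrdiff_t line_size, int h)                    \
    {                                                            \
        b(block,     pixels,     line_size, h);                  \
        b(block + n, pixels + n, line_size, h);                  \
    }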
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 86100ba6ff..42e25c4ff6 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -554,13 +554,12 @@ void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[
}\
}
-#if HAVE_MMX_INLINE
-CALL_2X_PIXELS(ff_avg_pixels16_mmxext, ff_avg_pixels8_mmxext, 8)
+#if HAVE_YASM
+void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
+
DIRAC_PIXOP(put, ff_put, mmx)
DIRAC_PIXOP(avg, ff_avg, mmx)
-#endif
-
-#if HAVE_YASM
DIRAC_PIXOP(avg, ff_avg, mmxext)
void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
diff --git a/libavcodec/x86/fpel.asm b/libavcodec/x86/fpel.asm
index de6b1d6c1b..0e3b444e2a 100644
--- a/libavcodec/x86/fpel.asm
+++ b/libavcodec/x86/fpel.asm
@@ -25,85 +25,83 @@
SECTION .text
-INIT_MMX mmxext
+%macro PAVGB_MMX 4
+ LOAD %3, %1
+ por %3, %2
+ pxor %2, %1
+ pand %2, %4
+ psrlq %2, 1
+ psubb %3, %2
+ SWAP %2, %3
+%endmacro
+
; void ff_put/avg_pixels(uint8_t *block, const uint8_t *pixels,
; ptrdiff_t line_size, int h)
-%macro PIXELS48 2
-%if %2 == 4
-%define OP movh
+%macro OP_PIXELS 2
+%if %2 == mmsize/2
+%define LOAD movh
+%define SAVE movh
+%define LEN mmsize
%else
-%define OP mova
+%define LOAD movu
+%define SAVE mova
+%define LEN %2
%endif
-cglobal %1_pixels%2, 4,5
+cglobal %1_pixels%2, 4,5,4
movsxdifnidn r2, r2d
lea r4, [r2*3]
+%ifidn %1, avg
+%if notcpuflag(mmxext)
+ pcmpeqd m6, m6
+ paddb m6, m6
+%endif
+%endif
.loop:
- OP m0, [r1]
- OP m1, [r1+r2]
- OP m2, [r1+r2*2]
- OP m3, [r1+r4]
- lea r1, [r1+r2*4]
+%assign %%i 0
+%rep LEN/mmsize
+ LOAD m0, [r1 + %%i]
+ LOAD m1, [r1+r2 + %%i]
+ LOAD m2, [r1+r2*2 + %%i]
+ LOAD m3, [r1+r4 + %%i]
%ifidn %1, avg
- pavgb m0, [r0]
- pavgb m1, [r0+r2]
- pavgb m2, [r0+r2*2]
- pavgb m3, [r0+r4]
+%if notcpuflag(mmxext)
+ PAVGB_MMX [r0 + %%i], m0, m4, m6
+ PAVGB_MMX [r0+r2 + %%i], m1, m5, m6
+ PAVGB_MMX [r0+r2*2 + %%i], m2, m4, m6
+ PAVGB_MMX [r0+r4 + %%i], m3, m5, m6
+%else
+ pavgb m0, [r0 + %%i]
+ pavgb m1, [r0+r2 + %%i]
+ pavgb m2, [r0+r2*2 + %%i]
+ pavgb m3, [r0+r4 + %%i]
+%endif
%endif
- OP [r0], m0
- OP [r0+r2], m1
- OP [r0+r2*2], m2
- OP [r0+r4], m3
+ SAVE [r0 + %%i], m0
+ SAVE [r0+r2 + %%i], m1
+ SAVE [r0+r2*2 + %%i], m2
+ SAVE [r0+r4 + %%i], m3
+%assign %%i %%i+mmsize
+%endrep
sub r3d, 4
+ lea r1, [r1+r2*4]
lea r0, [r0+r2*4]
jne .loop
RET
%endmacro
-PIXELS48 put, 4
-PIXELS48 avg, 4
-PIXELS48 put, 8
-PIXELS48 avg, 8
+INIT_MMX mmx
+OP_PIXELS put, 4
+OP_PIXELS avg, 4
+OP_PIXELS put, 8
+OP_PIXELS avg, 8
+OP_PIXELS put, 16
+OP_PIXELS avg, 16
+INIT_MMX mmxext
+OP_PIXELS avg, 4
+OP_PIXELS avg, 8
+OP_PIXELS avg, 16
INIT_XMM sse2
-; void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
-; ptrdiff_t line_size, int h)
-cglobal put_pixels16, 4,5,4
- lea r4, [r2*3]
-.loop:
- movu m0, [r1]
- movu m1, [r1+r2]
- movu m2, [r1+r2*2]
- movu m3, [r1+r4]
- lea r1, [r1+r2*4]
- mova [r0], m0
- mova [r0+r2], m1
- mova [r0+r2*2], m2
- mova [r0+r4], m3
- sub r3d, 4
- lea r0, [r0+r2*4]
- jnz .loop
- REP_RET
-
-; void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
-; ptrdiff_t line_size, int h)
-cglobal avg_pixels16, 4,5,4
- lea r4, [r2*3]
-.loop:
- movu m0, [r1]
- movu m1, [r1+r2]
- movu m2, [r1+r2*2]
- movu m3, [r1+r4]
- lea r1, [r1+r2*4]
- pavgb m0, [r0]
- pavgb m1, [r0+r2]
- pavgb m2, [r0+r2*2]
- pavgb m3, [r0+r4]
- mova [r0], m0
- mova [r0+r2], m1
- mova [r0+r2*2], m2
- mova [r0+r4], m3
- sub r3d, 4
- lea r0, [r0+r2*4]
- jnz .loop
- REP_RET
+OP_PIXELS put, 16
+OP_PIXELS avg, 16
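The new PAVGB_MMX macro emulates pavgb (the rounding-up byte average) on plain MMX via the identity (a + b + 1) >> 1 == (a | b) - (((a ^ b) & 0xFE) >> 1); the 0xFE byte mask built into m6 by pcmpeqd/paddb clears the low bit of every byte so the single psrlq over the whole qword cannot leak bits across byte lanes. A scalar C check of the identity (standalone sketch, no FFmpeg code assumed):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Rounding-up byte average, as pavgb defines it. */
    static uint8_t avg_ref(uint8_t a, uint8_t b)
    {
        return (uint8_t)((a + b + 1) >> 1);
    }

    /* The bit trick used by PAVGB_MMX: (a|b) - (((a^b) & 0xFE) >> 1).
     * Masking with 0xFE before the shift mirrors the psrlq over a whole qword,
     * where the low bits must be cleared so bytes do not bleed into each other. */
    static uint8_t avg_trick(uint8_t a, uint8_t b)
    {
        return (uint8_t)((a | b) - (((a ^ b) & 0xFE) >> 1));
    }

    int main(void)
    {
        for (int a = 0; a < 256; a++)
            for (int b = 0; b < 256; b++)
                assert(avg_ref((uint8_t)a, (uint8_t)b) == avg_trick((uint8_t)a, (uint8_t)b));
        puts("identity holds for all byte pairs");
        return 0;
    }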
diff --git a/libavcodec/x86/fpel_mmx.c b/libavcodec/x86/fpel_mmx.c
deleted file mode 100644
index 45cc57e6d7..0000000000
--- a/libavcodec/x86/fpel_mmx.c
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * MMX-optimized avg/put pixel routines
- *
- * Copyright (c) 2000, 2001 Fabrice Bellard
- * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "config.h"
-#include "fpel.h"
-#include "inline_asm.h"
-
-#if HAVE_MMX_INLINE
-
-// in case more speed is needed - unrolling would certainly help
-void ff_avg_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h)
-{
- MOVQ_BFE(mm6);
- JUMPALIGN();
- do {
- __asm__ volatile(
- "movq %0, %%mm0 \n\t"
- "movq %1, %%mm1 \n\t"
- PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
- "movq %%mm2, %0 \n\t"
- :"+m"(*block)
- :"m"(*pixels)
- :"memory");
- pixels += line_size;
- block += line_size;
- }
- while (--h);
-}
-
-void ff_avg_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h)
-{
- MOVQ_BFE(mm6);
- JUMPALIGN();
- do {
- __asm__ volatile(
- "movq %0, %%mm0 \n\t"
- "movq %1, %%mm1 \n\t"
- PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
- "movq %%mm2, %0 \n\t"
- "movq 8%0, %%mm0 \n\t"
- "movq 8%1, %%mm1 \n\t"
- PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
- "movq %%mm2, 8%0 \n\t"
- :"+m"(*block)
- :"m"(*pixels)
- :"memory");
- pixels += line_size;
- block += line_size;
- }
- while (--h);
-}
-
-void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h)
-{
- __asm__ volatile (
- "lea (%3, %3), %%"REG_a" \n\t"
- ".p2align 3 \n\t"
- "1: \n\t"
- "movq (%1 ), %%mm0 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
- "movq (%1 ), %%mm0 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- : "+g"(h), "+r"(pixels), "+r"(block)
- : "r"((x86_reg)line_size)
- : "%"REG_a, "memory"
- );
-}
-
-void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h)
-{
- __asm__ volatile (
- "lea (%3, %3), %%"REG_a" \n\t"
- ".p2align 3 \n\t"
- "1: \n\t"
- "movq (%1 ), %%mm0 \n\t"
- "movq 8(%1 ), %%mm4 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq 8(%1, %3), %%mm5 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm4, 8(%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "movq %%mm5, 8(%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
- "movq (%1 ), %%mm0 \n\t"
- "movq 8(%1 ), %%mm4 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq 8(%1, %3), %%mm5 \n\t"
- "movq %%mm0, (%2) \n\t"
- "movq %%mm4, 8(%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "movq %%mm5, 8(%2, %3) \n\t"
- "add %%"REG_a", %1 \n\t"
- "add %%"REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- : "+g"(h), "+r"(pixels), "+r"(block)
- : "r"((x86_reg)line_size)
- : "%"REG_a, "memory"
- );
-}
-
-#endif /* HAVE_MMX_INLINE */
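The deleted inline-asm routines and their yasm replacements implement the same two block operations: put copies an h-row block of 8 (or 16) bytes per row, avg merges it into the destination with the rounding-up average. A plain-C sketch of those semantics, written here for reference rather than taken from FFmpeg:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Reference semantics (sketch): put copies a block, avg merges it into the
     * destination using the rounding-up byte average. */
    static void put_pixels8_c(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h)
    {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < 8; x++)
                block[x] = pixels[x];
            block  += line_size;
            pixels += line_size;
        }
    }

    static void avg_pixels8_c(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h)
    {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < 8; x++)
                block[x] = (uint8_t)((block[x] + pixels[x] + 1) >> 1);
            block  += line_size;
            pixels += line_size;
        }
    }

    int main(void)
    {
        uint8_t src[8 * 4], dst[8 * 4];
        for (int i = 0; i < 8 * 4; i++) { src[i] = (uint8_t)i; dst[i] = 200; }
        put_pixels8_c(dst, src, 8, 4);
        avg_pixels8_c(dst, src, 8, 4);
        printf("dst[0] after put+avg: %d\n", dst[0]);   /* expect 0 */
        return 0;
    }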
diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index 8edaeacde6..1a06acab2e 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -29,8 +29,8 @@
#include "fpel.h"
#if HAVE_YASM
-void ff_put_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
+void ff_put_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
@@ -49,9 +49,12 @@ void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
#define ff_avg_pixels8_l2_sse2 ff_avg_pixels8_l2_mmxext
#define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext
#define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext
+#define ff_put_pixels16_mmxext ff_put_pixels16_mmx
+#define ff_put_pixels8_mmxext ff_put_pixels8_mmx
+#define ff_put_pixels4_mmxext ff_put_pixels4_mmx
-CALL_2X_PIXELS(ff_avg_pixels16_mmxext, ff_avg_pixels8_mmxext, 8)
-CALL_2X_PIXELS(ff_put_pixels16_mmxext, ff_put_pixels8_mmxext, 8)
+void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
#define DEF_QPEL(OPNAME)\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index f860533f7e..e9878cf916 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -165,15 +165,17 @@ HPELDSP_AVG_PIXELS16(_mmxext)
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
do { \
+ if (HAVE_MMX_EXTERNAL) \
c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
+ if (HAVE_MMX_INLINE) { \
c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
+ } \
} while (0)
static void hpeldsp_init_mmx(HpelDSPContext *c, int flags, int cpu_flags)
{
-#if HAVE_MMX_INLINE
SET_HPEL_FUNCS(put, [0], 16, mmx);
SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
SET_HPEL_FUNCS(avg, [0], 16, mmx);
@@ -181,7 +183,6 @@ static void hpeldsp_init_mmx(HpelDSPContext *c, int flags, int cpu_flags)
SET_HPEL_FUNCS(put, [1], 8, mmx);
SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx);
SET_HPEL_FUNCS(avg, [1], 8, mmx);
-#endif /* HAVE_MMX_INLINE */
}
static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int cpu_flags)
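SET_HPEL_FUNCS now fences individual table entries with `if (HAVE_MMX_EXTERNAL)` / `if (HAVE_MMX_INLINE)` instead of wrapping the whole initializer in #if; since the HAVE_* macros are 0/1 constants, the dead branch is optimized away while still being parsed, so the symbols it references only need declarations. A tiny illustration of that pattern, with a hypothetical HAVE_THING flag:

    #include <stdio.h>

    #define HAVE_THING 0   /* hypothetical feature flag, always 0 or 1 */

    static void fast_version(void)  { puts("fast");  }
    static void plain_version(void) { puts("plain"); }

    static void init(void (**fn)(void))
    {
        *fn = plain_version;
        if (HAVE_THING)          /* branch is dead-code-eliminated when 0, but   */
            *fn = fast_version;  /* still compiled, so fast_version needs a decl */
    }

    int main(void)
    {
        void (*fn)(void);
        init(&fn);
        fn();
        return 0;
    }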
diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
index 87e4638ca7..77a8e359ff 100644
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -728,6 +728,7 @@ static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize,
);
}
+#if HAVE_MMX_EXTERNAL
static void put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride, int rnd)
{
@@ -748,6 +749,7 @@ static void avg_vc1_mspel_mc00_16_mmx(uint8_t *dst, const uint8_t *src,
{
ff_avg_pixels16_mmx(dst, src, stride, 16);
}
+#endif
#define FN_ASSIGN(OP, X, Y, INSN) \
dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = OP##vc1_mspel_mc##X##Y##INSN; \
@@ -755,7 +757,10 @@ static void avg_vc1_mspel_mc00_16_mmx(uint8_t *dst, const uint8_t *src,
av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
{
+#if HAVE_MMX_EXTERNAL
FN_ASSIGN(put_, 0, 0, _mmx);
+ FN_ASSIGN(avg_, 0, 0, _mmx);
+#endif
FN_ASSIGN(put_, 0, 1, _mmx);
FN_ASSIGN(put_, 0, 2, _mmx);
FN_ASSIGN(put_, 0, 3, _mmx);
@@ -774,8 +779,6 @@ av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
FN_ASSIGN(put_, 3, 1, _mmx);
FN_ASSIGN(put_, 3, 2, _mmx);
FN_ASSIGN(put_, 3, 3, _mmx);
-
- FN_ASSIGN(avg_, 0, 0, _mmx);
}
av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)