summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- libavcodec/huffyuvdsp.c 2
-rw-r--r-- libavcodec/huffyuvdsp.h 2
-rw-r--r-- libavcodec/ppc/huffyuvdsp_altivec.c 2
-rw-r--r-- libavcodec/x86/huffyuvdsp.asm 37
-rw-r--r-- libavcodec/x86/huffyuvdsp_init.c 9
-rw-r--r-- libavcodec/x86/huffyuvdsp_mmx.c 32
6 files changed, 48 insertions, 36 deletions
diff --git a/libavcodec/huffyuvdsp.c b/libavcodec/huffyuvdsp.c
index cbc09cf124..3d51552fc3 100644
--- a/libavcodec/huffyuvdsp.c
+++ b/libavcodec/huffyuvdsp.c
@@ -27,7 +27,7 @@
#define pb_7f (~0UL / 255 * 0x7f)
#define pb_80 (~0UL / 255 * 0x80)
-static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
+static void add_bytes_c(uint8_t *dst, uint8_t *src, intptr_t w)
{
long i;
diff --git a/libavcodec/huffyuvdsp.h b/libavcodec/huffyuvdsp.h
index fd66f0a56e..c52dd69405 100644
--- a/libavcodec/huffyuvdsp.h
+++ b/libavcodec/huffyuvdsp.h
@@ -35,7 +35,7 @@
typedef struct HuffYUVDSPContext {
void (*add_bytes)(uint8_t *dst /* align 16 */, uint8_t *src /* align 16 */,
- int w);
+ intptr_t w);
void (*add_hfyu_median_pred)(uint8_t *dst, const uint8_t *top,
const uint8_t *diff, int w,
int *left, int *left_top);
diff --git a/libavcodec/ppc/huffyuvdsp_altivec.c b/libavcodec/ppc/huffyuvdsp_altivec.c
index ff2bd87eeb..0052daeb64 100644
--- a/libavcodec/ppc/huffyuvdsp_altivec.c
+++ b/libavcodec/ppc/huffyuvdsp_altivec.c
@@ -31,7 +31,7 @@
#include "libavcodec/huffyuvdsp.h"
#if HAVE_ALTIVEC
-static void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w)
+static void add_bytes_altivec(uint8_t *dst, uint8_t *src, intptr_t w)
{
register int i;
register vector unsigned char vdst, vsrc;
diff --git a/libavcodec/x86/huffyuvdsp.asm b/libavcodec/x86/huffyuvdsp.asm
index f183ebee54..a923e70e1e 100644
--- a/libavcodec/x86/huffyuvdsp.asm
+++ b/libavcodec/x86/huffyuvdsp.asm
@@ -163,3 +163,40 @@ cglobal add_hfyu_left_pred, 3,3,7, dst, src, w, left
ADD_HFYU_LEFT_LOOP 0, 1
.src_unaligned:
ADD_HFYU_LEFT_LOOP 0, 0
+
+%macro ADD_BYTES 0 ; emits add_bytes(dst, src, w): dst[i] += src[i] bytewise (wrapping) for i in [0, w)
+cglobal add_bytes, 3,4,2, dst, src, w, size
+    mov     sizeq, wq
+    and     sizeq, -2*mmsize            ; size = w rounded down to a multiple of 2*mmsize
+    jz .2                               ; no full vector block: only the byte tail is left
+    add     dstq, sizeq                 ; move both pointers to the end of the vector part ...
+    add     srcq, sizeq
+    neg     sizeq                       ; ... and index it with a negative offset counting up to 0
+.1:                                     ; vector loop: 2*mmsize bytes per iteration
+    mova    m0, [srcq + sizeq]
+    mova    m1, [srcq + sizeq + mmsize]
+    paddb   m0, [dstq + sizeq]          ; per-byte add, no carry between bytes
+    paddb   m1, [dstq + sizeq + mmsize]
+    mova    [dstq + sizeq], m0
+    mova    [dstq + sizeq + mmsize], m1
+    add     sizeq, 2*mmsize
+    jl .1                               ; loop while the offset is still negative
+.2:
+    and     wq, 2*mmsize-1              ; w = bytes left over after the vector part
+    jz .end
+    add     dstq, wq                    ; same negative-offset scheme for the tail
+    add     srcq, wq
+    neg     wq
+.3:                                     ; scalar loop: one byte per iteration
+    mov     sizeb, [srcq + wq]          ; size is no longer needed; its low byte is scratch
+    add     [dstq + wq], sizeb
+    inc     wq
+    jl .3
+.end:
+    REP_RET
+%endmacro
+
+INIT_MMX mmx
+ADD_BYTES
+INIT_XMM sse2
+ADD_BYTES
diff --git a/libavcodec/x86/huffyuvdsp_init.c b/libavcodec/x86/huffyuvdsp_init.c
index 1efb34dbbe..8a755e65b0 100644
--- a/libavcodec/x86/huffyuvdsp_init.c
+++ b/libavcodec/x86/huffyuvdsp_init.c
@@ -23,7 +23,8 @@
#include "libavutil/x86/cpu.h"
#include "libavcodec/huffyuvdsp.h"
-void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w);
+void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, intptr_t w);
+void ff_add_bytes_sse2(uint8_t *dst, uint8_t *src, intptr_t w);
void ff_add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
const uint8_t *diff, int w,
@@ -46,7 +47,7 @@ av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c)
c->add_hfyu_median_pred = ff_add_hfyu_median_pred_cmov;
#endif
- if (INLINE_MMX(cpu_flags))
+ if (EXTERNAL_MMX(cpu_flags))
c->add_bytes = ff_add_bytes_mmx;
if (EXTERNAL_MMXEXT(cpu_flags)) {
@@ -55,6 +56,10 @@ av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c)
c->add_hfyu_median_pred = ff_add_hfyu_median_pred_mmxext;
}
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->add_bytes = ff_add_bytes_sse2;
+ }
+
if (EXTERNAL_SSSE3(cpu_flags)) {
c->add_hfyu_left_pred = ff_add_hfyu_left_pred_ssse3;
if (cpu_flags & AV_CPU_FLAG_SSE4) // not really SSE4, just slow on Conroe
diff --git a/libavcodec/x86/huffyuvdsp_mmx.c b/libavcodec/x86/huffyuvdsp_mmx.c
index 59422107d3..ee6ec91287 100644
--- a/libavcodec/x86/huffyuvdsp_mmx.c
+++ b/libavcodec/x86/huffyuvdsp_mmx.c
@@ -22,9 +22,7 @@
#include "libavutil/x86/asm.h"
#include "huffyuvdsp.h"
-#if HAVE_INLINE_ASM
-
-#if HAVE_7REGS
+#if HAVE_INLINE_ASM && HAVE_7REGS
void ff_add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
const uint8_t *diff, int w,
int *left, int *left_top)
@@ -61,31 +59,3 @@ void ff_add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
*left_top = tl;
}
#endif
-
-void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
-{
- x86_reg i = 0;
-
- __asm__ volatile (
- "jmp 2f \n\t"
- "1: \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq (%2, %0), %%mm1 \n\t"
- "paddb %%mm0, %%mm1 \n\t"
- "movq %%mm1, (%2, %0) \n\t"
- "movq 8(%1, %0), %%mm0 \n\t"
- "movq 8(%2, %0), %%mm1 \n\t"
- "paddb %%mm0, %%mm1 \n\t"
- "movq %%mm1, 8(%2, %0) \n\t"
- "add $16, %0 \n\t"
- "2: \n\t"
- "cmp %3, %0 \n\t"
- "js 1b \n\t"
- : "+r" (i)
- : "r" (src), "r" (dst), "r" ((x86_reg) w - 15));
-
- for (; i < w; i++)
- dst[i + 0] += src[i + 0];
-}
-
-#endif /* HAVE_INLINE_ASM */