summary | refs | log | tree | commit | diff
path: root/libavcodec/x86/dsputil_mmx.c
diff options
context:
space:
mode:
author: Ronald S. Bultje <rsbultje@gmail.com> 2011-01-31 20:55:56 -0500
committer: Ronald S. Bultje <rsbultje@gmail.com> 2011-01-31 20:55:56 -0500
commit: 81f2a3f4ffcc6935b8b8ada4954700b3f333ae4f (patch)
tree: eb4ccad677629b8795c8f339ccf7e08990fa7fd6 /libavcodec/x86/dsputil_mmx.c
parent: a0f9c8ce372c3943104672f8fd7ba2bcf9a5e157 (diff)
Implement a SIMD version of emulated_edge_mc() for x86.
From ~550 cycles (C version) to 170 (SSE/x86-64), 206 (MMX/x86-32) and 196 (SSE2/x86-32) cycles.
Diffstat (limited to 'libavcodec/x86/dsputil_mmx.c')
-rw-r--r-- libavcodec/x86/dsputil_mmx.c 110
1 files changed, 107 insertions, 3 deletions
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 8257b3fa8a..2eb7d85f14 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -1664,8 +1664,80 @@ QPEL_2TAP(avg_, 8, 3dnow)
static void just_return(void) { return; } /* no-op: takes no arguments and returns immediately */
#endif
-static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
- int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
+#if HAVE_YASM
+typedef void emu_edge_core_func (uint8_t *buf, const uint8_t *src, /* common signature of the edge-emulation core routines */
+ x86_reg linesize, x86_reg start_y, /* every scalar argument is x86_reg-sized; emulated_edge_mc() passes its ints here */
+ x86_reg end_y, x86_reg block_h, /* start_y/end_y: in-range row span as computed by emulated_edge_mc() */
+ x86_reg start_x, x86_reg end_x, /* start_x/end_x: in-range column span as computed by emulated_edge_mc() */
+ x86_reg block_w);
+extern emu_edge_core_func ff_emu_edge_core_mmx; /* defined outside this file; presumably the yasm MMX core (HAVE_YASM guard) - confirm */
+extern emu_edge_core_func ff_emu_edge_core_sse; /* defined outside this file; presumably the yasm SSE core (HAVE_YASM guard) - confirm */
+
+static av_always_inline /* shared front end: clamps the block position, computes the in-range span, then calls core_fn */
+void emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, /* buf: writable destination; src: read-only source; linesize: row stride applied to src */
+ int block_w, int block_h, /* block dimensions; both asserted to be > 0 below */
+ int src_x, int src_y, int w, int h, /* block position, and the w x h bounds it is clamped against */
+ emu_edge_core_func *core_fn) /* core routine that receives the computed spans */
+{
+ int start_y, start_x, end_y, end_x, src_y_add=0; /* src_y_add: row shift caused by the vertical clamp, applied to src further down */
+
+ if(src_y>= h){ /* block starts at or below row h: pull it back so it starts at row h-1 */
+ src_y_add = h-1-src_y; /* remember the shift in rows instead of moving src now */
+ src_y=h-1;
+ }else if(src_y<=-block_h){ /* block ends above row 0: push it down so that its last row is row 0 */
+ src_y_add = 1-block_h-src_y;
+ src_y=1-block_h;
+ }
+ if(src_x>= w){ /* same clamp horizontally; here src itself is moved right away */
+ src+= (w-1-src_x);
+ src_x=w-1;
+ }else if(src_x<=-block_w){ /* block ends left of column 0: shift it so that its last column is column 0 */
+ src+= (1-block_w-src_x);
+ src_x=1-block_w;
+ }
+
+ start_y= FFMAX(0, -src_y); /* first block row whose coordinate src_y+row is >= 0 */
+ start_x= FFMAX(0, -src_x); /* first block column whose coordinate src_x+col is >= 0 */
+ end_y= FFMIN(block_h, h-src_y); /* one past the last block row whose coordinate is < h */
+ end_x= FFMIN(block_w, w-src_x); /* one past the last block column whose coordinate is < w */
+ assert(start_x < end_x && block_w > 0); /* a non-empty in-range column span is required */
+ assert(start_y < end_y && block_h > 0); /* a non-empty in-range row span is required */
+
+ // fill in the to-be-copied part plus all above/below
+ src += (src_y_add+start_y)*linesize + start_x; /* advance src by the clamp shift plus start_y rows, and by start_x columns */
+ buf += start_x; /* buf is advanced horizontally only; NOTE(review): vertical placement is presumably done by core_fn - confirm */
+ core_fn(buf, src, linesize, start_y, end_y, block_h, start_x, end_x, block_w); /* int arguments are converted to the x86_reg parameters */
+}
+
+#if ARCH_X86_32
+static av_noinline /* out-of-line instance of emulated_edge_mc() bound to the MMX core; only built under ARCH_X86_32 */
+void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src, int linesize, /* parameter list matches emulated_edge_mc() minus core_fn */
+ int block_w, int block_h,
+ int src_x, int src_y, int w, int h)
+{
+ emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y, /* all arguments are forwarded unchanged */
+ w, h, &ff_emu_edge_core_mmx); /* selects the MMX core routine */
+}
+#endif
+static av_noinline /* out-of-line instance of emulated_edge_mc() bound to the SSE core */
+void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, int linesize, /* parameter list matches emulated_edge_mc() minus core_fn */
+ int block_w, int block_h,
+ int src_x, int src_y, int w, int h)
+{
+ emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y, /* all arguments are forwarded unchanged */
+ w, h, &ff_emu_edge_core_sse); /* selects the SSE core routine */
+}
+#endif /* HAVE_YASM */
+
+typedef void emulated_edge_mc_func (uint8_t *dst, const uint8_t *src, /* type of the edge-emulation function that gmc() takes as emu_edge_fn */
+ int linesize, int block_w, int block_h, /* same parameter list as emulated_edge_mc_mmx() and emulated_edge_mc_sse() */
+ int src_x, int src_y, int w, int h);
+
+static av_always_inline
+void gmc(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
+ int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height,
+ emulated_edge_mc_func *emu_edge_fn)
+{
const int w = 8;
const int ix = ox>>(16+shift);
const int iy = oy>>(16+shift);
@@ -1701,7 +1773,7 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int o
if( (unsigned)ix >= width-w ||
(unsigned)iy >= height-h )
{
- ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
+ emu_edge_fn(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
src = edge_buf;
}
@@ -1782,6 +1854,30 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int o
}
}
+#if HAVE_YASM
+#if ARCH_X86_32
+static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, /* HAVE_YASM and ARCH_X86_32 build: gmc() with the MMX edge emulation */
+ int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
+{
+ gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, /* all arguments are forwarded unchanged */
+ width, height, &emulated_edge_mc_mmx); /* edge function used when gmc() needs emulated edges */
+}
+#endif
+static void gmc_sse(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, /* HAVE_YASM build: gmc() with the SSE edge emulation */
+ int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
+{
+ gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, /* all arguments are forwarded unchanged */
+ width, height, &emulated_edge_mc_sse); /* edge function used when gmc() needs emulated edges */
+}
+#else
+static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, /* build without HAVE_YASM: gmc() with ff_emulated_edge_mc */
+ int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
+{
+ gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, /* all arguments are forwarded unchanged */
+ width, height, &ff_emulated_edge_mc); /* ff_emulated_edge_mc is declared outside this diff; it is the function the removed call used */
+}
+#endif
+
#define PREFETCH(name, op) \
static void name(void *mem, int stride, int h){\
const uint8_t *p= mem;\
@@ -2626,7 +2722,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
SET_HPEL_FUNCS(avg, 1, 8, mmx);
SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
+#if ARCH_X86_32 || !HAVE_YASM
c->gmc= gmc_mmx;
+#endif
+#if ARCH_X86_32 && HAVE_YASM
+ c->emulated_edge_mc = emulated_edge_mc_mmx;
+#endif
c->add_bytes= add_bytes_mmx;
c->add_bytes_l2= add_bytes_l2_mmx;
@@ -2913,6 +3014,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
#if HAVE_YASM
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
+
+ c->emulated_edge_mc = emulated_edge_mc_sse;
+ c->gmc= gmc_sse;
#endif
}
if((mm_flags & AV_CPU_FLAG_SSSE3) && !(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW)) && HAVE_YASM) // cachesplit