summaryrefslogtreecommitdiff
path: root/libavcodec
diff options
context:
space:
mode:
author: James Almer <jamrial@gmail.com> 2014-05-21 18:02:39 -0300
committer: Michael Niedermayer <michaelni@gmx.at> 2014-05-21 23:33:45 +0200
commit: 80ee2dfcf6744525b0d08311333a06ab780af30c (patch)
tree: d84680054a0c6c20936c10cc8d95b7fa2e6120ba /libavcodec
parent: 7b05267239edbd7ea2e2b3b67925137b4cd99c8f (diff)
x86/dsputil: port ff_put_signed_pixels_clamped_mmx to yasm
Also add an SSE2 version.

Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec')
-rw-r--r-- libavcodec/x86/dsputil.asm | 52
-rw-r--r-- libavcodec/x86/dsputil_init.c | 3
-rw-r--r-- libavcodec/x86/dsputil_mmx.c | 36
-rw-r--r-- libavcodec/x86/dsputil_x86.h | 2
4 files changed, 56 insertions, 37 deletions
diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index c91dd8eb69..747c645666 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -31,6 +31,8 @@ pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+cextern pb_80
+
SECTION_TEXT
%macro SCALARPRODUCT 0
@@ -573,3 +575,53 @@ CLEAR_BLOCKS 0
INIT_XMM sse
%define ZERO xorps
CLEAR_BLOCKS 1
+
+;--------------------------------------------------------------------------
+;void ff_put_signed_pixels_clamped(const int16_t *block, uint8_t *pixels,
+; int line_size)
+;--------------------------------------------------------------------------
+
+%macro PUT_SIGNED_PIXELS_CLAMPED_HALF 1 ; converts 64 bytes of int16 input into four 8-byte output rows; %1 = byte offset into block
+ mova m1, [blockq+mmsize*0+%1] ; even-numbered mmsize chunks are loaded here, odd-numbered ones are packed in below
+ mova m2, [blockq+mmsize*2+%1]
+%if mmsize == 8 ; 8-byte registers: four registers are needed to cover the same 64 input bytes
+ mova m3, [blockq+mmsize*4+%1]
+ mova m4, [blockq+mmsize*6+%1]
+%endif
+ packsswb m1, [blockq+mmsize*1+%1] ; signed-saturate words to bytes; result = packed(register) followed by packed(memory chunk)
+ packsswb m2, [blockq+mmsize*3+%1]
+%if mmsize == 8
+ packsswb m3, [blockq+mmsize*5+%1]
+ packsswb m4, [blockq+mmsize*7+%1]
+%endif
+ paddb m1, m0 ; add the per-byte constant the caller loaded into m0 (wrapping byte add)
+ paddb m2, m0
+%if mmsize == 8
+ paddb m3, m0
+ paddb m4, m0
+ movq [pixelsq+lsizeq*0], m1 ; 8-byte registers: one register per output row
+ movq [pixelsq+lsizeq*1], m2
+ movq [pixelsq+lsizeq*2], m3
+ movq [pixelsq+lsize3q ], m4 ; lsize3q = 3*lsizeq, set up by the caller
+%else ; wider registers: each register holds two consecutive 8-byte output rows
+ movq [pixelsq+lsizeq*0], m1 ; low 8 bytes of m1 -> row 0
+ movhps [pixelsq+lsizeq*1], m1 ; high 8 bytes of m1 -> row 1
+ movq [pixelsq+lsizeq*2], m2 ; low 8 bytes of m2 -> row 2
+ movhps [pixelsq+lsize3q ], m2 ; high 8 bytes of m2 -> row 3
+%endif
+%endmacro
+
+%macro PUT_SIGNED_PIXELS_CLAMPED 1 ; emits put_signed_pixels_clamped for the current INIT_* instruction set; %1 is forwarded to cglobal
+cglobal put_signed_pixels_clamped, 3, 4, %1, block, pixels, lsize, lsize3 ; 3 arguments, 4 GPRs; %1 is presumably the XMM register count per x86inc convention -- confirm
+ mova m0, [pb_80] ; constant added to every packed byte; pb_80 is cextern (defined elsewhere), presumably 0x80 in each byte
+ lea lsize3q, [lsizeq*3] ; 3*stride for addressing the fourth row; NOTE(review): the C prototype declares line_size as int but lsizeq is used full-width with no sign extension here -- confirm this is safe on x86-64
+ PUT_SIGNED_PIXELS_CLAMPED_HALF 0 ; output rows 0-3 from input bytes 0..63
+ lea pixelsq, [pixelsq+lsizeq*4] ; advance the destination by four rows
+ PUT_SIGNED_PIXELS_CLAMPED_HALF 64 ; output rows 4-7 from input bytes 64..127
+ RET
+%endmacro
+
+INIT_MMX mmx ; select 8-byte registers (mmsize == 8 paths in the helper macro)
+PUT_SIGNED_PIXELS_CLAMPED 0 ; mmx variant; 0 forwarded as the cglobal register-count argument
+INIT_XMM sse2 ; select 16-byte registers (the %else store path in the helper macro)
+PUT_SIGNED_PIXELS_CLAMPED 3 ; sse2 variant; the body uses m0, m1 and m2
diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c
index 4461ae464f..e274e671d7 100644
--- a/libavcodec/x86/dsputil_init.c
+++ b/libavcodec/x86/dsputil_init.c
@@ -530,7 +530,6 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
{
#if HAVE_MMX_INLINE
c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
- c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
if (!high_bit_depth) {
@@ -550,6 +549,7 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
c->clear_blocks = ff_clear_blocks_mmx;
}
c->vector_clip_int32 = ff_vector_clip_int32_mmx;
+ c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
#endif /* HAVE_MMX_EXTERNAL */
}
@@ -627,6 +627,7 @@ static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
c->vector_clip_int32 = ff_vector_clip_int32_sse2;
}
c->bswap_buf = ff_bswap32_buf_sse2;
+ c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2;
#endif /* HAVE_SSE2_EXTERNAL */
}
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index a9c584d88a..fa77a5c938 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -94,42 +94,6 @@ void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
: "memory");
}
-#define put_signed_pixels_clamped_mmx_half(off) \
- "movq "#off"(%2), %%mm1 \n\t" \
- "movq 16 + "#off"(%2), %%mm2 \n\t" \
- "movq 32 + "#off"(%2), %%mm3 \n\t" \
- "movq 48 + "#off"(%2), %%mm4 \n\t" \
- "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
- "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
- "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
- "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
- "paddb %%mm0, %%mm1 \n\t" \
- "paddb %%mm0, %%mm2 \n\t" \
- "paddb %%mm0, %%mm3 \n\t" \
- "paddb %%mm0, %%mm4 \n\t" \
- "movq %%mm1, (%0) \n\t" \
- "movq %%mm2, (%0, %3) \n\t" \
- "movq %%mm3, (%0, %3, 2) \n\t" \
- "movq %%mm4, (%0, %1) \n\t"
-
-void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
- int line_size)
-{
- x86_reg line_skip = line_size;
- x86_reg line_skip3;
-
- __asm__ volatile (
- "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
- "lea (%3, %3, 2), %1 \n\t"
- put_signed_pixels_clamped_mmx_half(0)
- "lea (%0, %3, 4), %0 \n\t"
- put_signed_pixels_clamped_mmx_half(64)
- : "+&r" (pixels), "=&r" (line_skip3)
- : "r" (block), "r" (line_skip)
- NAMED_CONSTRAINTS_ADD(ff_pb_80)
- : "memory");
-}
-
void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
int line_size)
{
diff --git a/libavcodec/x86/dsputil_x86.h b/libavcodec/x86/dsputil_x86.h
index 6a50a09e5d..1f4711dd2d 100644
--- a/libavcodec/x86/dsputil_x86.h
+++ b/libavcodec/x86/dsputil_x86.h
@@ -37,6 +37,8 @@ void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
int line_size);
void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
int line_size);
+void ff_put_signed_pixels_clamped_sse2(const int16_t *block, uint8_t *pixels,
+ int line_size);
void ff_clear_block_mmx(int16_t *block);
void ff_clear_block_sse(int16_t *block);