summaryrefslogtreecommitdiff
path: root/libavcodec/i386/dsputil_mmx.c
diff options
context:
space:
mode:
Diffstat (limited to 'libavcodec/i386/dsputil_mmx.c')
-rw-r--r--libavcodec/i386/dsputil_mmx.c148
1 files changed, 68 insertions, 80 deletions
diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c
index b7a454b28b..6d88b0a04c 100644
--- a/libavcodec/i386/dsputil_mmx.c
+++ b/libavcodec/i386/dsputil_mmx.c
@@ -2752,91 +2752,69 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int o
}
#ifdef CONFIG_ENCODERS
-static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
- long i=0;
- assert(FFABS(scale) < 256);
- scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
+#define PHADDD(a, t)\
+ "movq "#a", "#t" \n\t"\
+ "psrlq $32, "#a" \n\t"\
+ "paddd "#t", "#a" \n\t"
+/*
+ pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
+ pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
+ pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
+ */
+#define PMULHRW(x, y, s, o)\
+ "pmulhw " #s ", "#x " \n\t"\
+ "pmulhw " #s ", "#y " \n\t"\
+ "paddw " #o ", "#x " \n\t"\
+ "paddw " #o ", "#y " \n\t"\
+ "psraw $1, "#x " \n\t"\
+ "psraw $1, "#y " \n\t"
+#define DEF(x) x ## _mmx
+#define SET_RND MOVQ_WONE
+#define SCALE_OFFSET 1
+
+#include "dsputil_mmx_qns.h"
- asm volatile(
- "pcmpeqw %%mm6, %%mm6 \n\t" // -1w
- "psrlw $15, %%mm6 \n\t" // 1w
- "pxor %%mm7, %%mm7 \n\t"
- "movd %4, %%mm5 \n\t"
- "punpcklwd %%mm5, %%mm5 \n\t"
- "punpcklwd %%mm5, %%mm5 \n\t"
- "1: \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq 8(%1, %0), %%mm1 \n\t"
- "pmulhw %%mm5, %%mm0 \n\t"
- "pmulhw %%mm5, %%mm1 \n\t"
- "paddw %%mm6, %%mm0 \n\t"
- "paddw %%mm6, %%mm1 \n\t"
- "psraw $1, %%mm0 \n\t"
- "psraw $1, %%mm1 \n\t"
- "paddw (%2, %0), %%mm0 \n\t"
- "paddw 8(%2, %0), %%mm1 \n\t"
- "psraw $6, %%mm0 \n\t"
- "psraw $6, %%mm1 \n\t"
- "pmullw (%3, %0), %%mm0 \n\t"
- "pmullw 8(%3, %0), %%mm1 \n\t"
- "pmaddwd %%mm0, %%mm0 \n\t"
- "pmaddwd %%mm1, %%mm1 \n\t"
- "paddd %%mm1, %%mm0 \n\t"
- "psrld $4, %%mm0 \n\t"
- "paddd %%mm0, %%mm7 \n\t"
- "add $16, %0 \n\t"
- "cmp $128, %0 \n\t" //FIXME optimize & bench
- " jb 1b \n\t"
- "movq %%mm7, %%mm6 \n\t"
- "psrlq $32, %%mm7 \n\t"
- "paddd %%mm6, %%mm7 \n\t"
- "psrld $2, %%mm7 \n\t"
- "movd %%mm7, %0 \n\t"
+#undef DEF
+#undef SET_RND
+#undef SCALE_OFFSET
+#undef PMULHRW
- : "+r" (i)
- : "r"(basis), "r"(rem), "r"(weight), "g"(scale)
- );
- return i;
-}
+#define DEF(x) x ## _3dnow
+#define SET_RND(x)
+#define SCALE_OFFSET 0
+#define PMULHRW(x, y, s, o)\
+ "pmulhrw " #s ", "#x " \n\t"\
+ "pmulhrw " #s ", "#y " \n\t"
-static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){
- long i=0;
+#include "dsputil_mmx_qns.h"
+
+#undef DEF
+#undef SET_RND
+#undef SCALE_OFFSET
+#undef PMULHRW
+
+#ifdef HAVE_SSSE3
+#undef PHADDD
+#define DEF(x) x ## _ssse3
+#define SET_RND(x)
+#define SCALE_OFFSET -1
+#define PHADDD(a, t)\
+ "pshufw $0x0E, "#a", "#t" \n\t"\
+ "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */
+#define PMULHRW(x, y, s, o)\
+ "pmulhrsw " #s ", "#x " \n\t"\
+ "pmulhrsw " #s ", "#y " \n\t"
+
+#include "dsputil_mmx_qns.h"
+
+#undef DEF
+#undef SET_RND
+#undef SCALE_OFFSET
+#undef PMULHRW
+#undef PHADDD
+#endif //HAVE_SSSE3
- if(FFABS(scale) < 256){
- scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
- asm volatile(
- "pcmpeqw %%mm6, %%mm6 \n\t" // -1w
- "psrlw $15, %%mm6 \n\t" // 1w
- "movd %3, %%mm5 \n\t"
- "punpcklwd %%mm5, %%mm5 \n\t"
- "punpcklwd %%mm5, %%mm5 \n\t"
- "1: \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq 8(%1, %0), %%mm1 \n\t"
- "pmulhw %%mm5, %%mm0 \n\t"
- "pmulhw %%mm5, %%mm1 \n\t"
- "paddw %%mm6, %%mm0 \n\t"
- "paddw %%mm6, %%mm1 \n\t"
- "psraw $1, %%mm0 \n\t"
- "psraw $1, %%mm1 \n\t"
- "paddw (%2, %0), %%mm0 \n\t"
- "paddw 8(%2, %0), %%mm1 \n\t"
- "movq %%mm0, (%2, %0) \n\t"
- "movq %%mm1, 8(%2, %0) \n\t"
- "add $16, %0 \n\t"
- "cmp $128, %0 \n\t" //FIXME optimize & bench
- " jb 1b \n\t"
-
- : "+r" (i)
- : "r"(basis), "r"(rem), "g"(scale)
- );
- }else{
- for(i=0; i<8*8; i++){
- rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
- }
- }
-}
#endif /* CONFIG_ENCODERS */
#define PREFETCH(name, op) \
@@ -3625,6 +3603,10 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
#ifdef HAVE_SSSE3
if(mm_flags & MM_SSSE3){
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+ c->try_8x8basis= try_8x8basis_ssse3;
+ }
+ c->add_8x8basis= add_8x8basis_ssse3;
c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
c->hadamard8_diff[1]= hadamard8_diff_ssse3;
@@ -3646,6 +3628,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
#endif
if(mm_flags & MM_3DNOW){
+#ifdef CONFIG_ENCODERS
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+ c->try_8x8basis= try_8x8basis_3dnow;
+ }
+ c->add_8x8basis= add_8x8basis_3dnow;
+#endif //CONFIG_ENCODERS
c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
c->vector_fmul = vector_fmul_3dnow;
if(!(avctx->flags & CODEC_FLAG_BITEXACT))