diff options
-rw-r--r-- | libavcodec/dsputil.c           |  6
-rw-r--r-- | libavcodec/dsputil.h           |  4
-rw-r--r-- | libavcodec/ppc/float_altivec.c | 66
-rw-r--r-- | libavcodec/wmadec.c            |  8
-rw-r--r-- | libavcodec/x86/dsputil_mmx.c   | 76
5 files changed, 18 insertions, 142 deletions
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c index 270c583628..894e592aa6 100644 --- a/libavcodec/dsputil.c +++ b/libavcodec/dsputil.c @@ -4068,10 +4068,10 @@ static void vector_fmul_reverse_c(float *dst, const float *src0, const float *sr dst[i] = src0[i] * src1[-i]; } -void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){ +static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){ int i; for(i=0; i<len; i++) - dst[i*step] = src0[i] * src1[i] + src2[i] + src3; + dst[i] = src0[i] * src1[i] + src2[i]; } void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){ @@ -4787,7 +4787,7 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx) #endif c->vector_fmul = vector_fmul_c; c->vector_fmul_reverse = vector_fmul_reverse_c; - c->vector_fmul_add_add = ff_vector_fmul_add_add_c; + c->vector_fmul_add = vector_fmul_add_c; c->vector_fmul_window = ff_vector_fmul_window_c; c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c; c->vector_clipf = vector_clipf_c; diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h index 58d5b49bd5..dd7b22d537 100644 --- a/libavcodec/dsputil.h +++ b/libavcodec/dsputil.h @@ -66,8 +66,6 @@ void ff_h264_idct_add16intra_c(uint8_t *dst, const int *blockoffset, DCTELEM *bl void ff_h264_idct8_add4_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); void ff_h264_idct_add8_c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); -void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, - const float *src2, int src3, int blocksize, int step); void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len); void ff_float_to_int16_c(int16_t *dst, const float *src, long len); @@ -391,7 
+389,7 @@ typedef struct DSPContext { void (*vector_fmul)(float *dst, const float *src, int len); void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len); /* assume len is a multiple of 8, and src arrays are 16-byte aligned */ - void (*vector_fmul_add_add)(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step); + void (*vector_fmul_add)(float *dst, const float *src0, const float *src1, const float *src2, int len); /* assume len is a multiple of 4, and arrays are 16-byte aligned */ void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len); /* assume len is a multiple of 8, and arrays are 16-byte aligned */ diff --git a/libavcodec/ppc/float_altivec.c b/libavcodec/ppc/float_altivec.c index 096f75f8b9..48d093cd15 100644 --- a/libavcodec/ppc/float_altivec.c +++ b/libavcodec/ppc/float_altivec.c @@ -66,71 +66,15 @@ static void vector_fmul_reverse_altivec(float *dst, const float *src0, } } -static void vector_fmul_add_add_altivec(float *dst, const float *src0, - const float *src1, const float *src2, - int src3, int len, int step) +static void vector_fmul_add_altivec(float *dst, const float *src0, + const float *src1, const float *src2, + int len) { int i; vector float d, s0, s1, s2, t0, t1, edges; vector unsigned char align = vec_lvsr(0,dst), mask = vec_lvsl(0, dst); -#if 0 //FIXME: there is still something wrong - if (step == 2) { - int y; - vector float d0, d1, s3, t2; - vector unsigned int sel = - vec_mergeh(vec_splat_u32(-1), vec_splat_u32(0)); - t1 = vec_ld(16, dst); - for (i=0,y=0; i<len-3; i+=4,y+=8) { - - s0 = vec_ld(0,src0+i); - s1 = vec_ld(0,src1+i); - s2 = vec_ld(0,src2+i); - -// t0 = vec_ld(0, dst+y); //[x x x|a] -// t1 = vec_ld(16, dst+y); //[b c d|e] - t2 = vec_ld(31, dst+y); //[f g h|x] - - d = vec_madd(s0,s1,s2); // [A B C D] - - // [A A B B] - - // [C C D D] - - d0 = vec_perm(t0, t1, mask); // [a b c d] - - d0 = 
vec_sel(vec_mergeh(d, d), d0, sel); // [A b B d] - - edges = vec_perm(t1, t0, mask); - - t0 = vec_perm(edges, d0, align); // [x x x|A] - - t1 = vec_perm(d0, edges, align); // [b B d|e] - - vec_stl(t0, 0, dst+y); - - d1 = vec_perm(t1, t2, mask); // [e f g h] - - d1 = vec_sel(vec_mergel(d, d), d1, sel); // [C f D h] - - edges = vec_perm(t2, t1, mask); - - t1 = vec_perm(edges, d1, align); // [b B d|C] - - t2 = vec_perm(d1, edges, align); // [f D h|x] - - vec_stl(t1, 16, dst+y); - - t0 = t1; - - vec_stl(t2, 31, dst+y); - - t1 = t2; - } - } else - #endif - if (step == 1 && src3 == 0) for (i=0; i<len-3; i+=4) { t0 = vec_ld(0, dst+i); t1 = vec_ld(15, dst+i); @@ -144,8 +88,6 @@ static void vector_fmul_add_add_altivec(float *dst, const float *src0, vec_st(t1, 15, dst+i); vec_st(t0, 0, dst+i); } - else - ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step); } static void vector_fmul_window_altivec(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len) @@ -299,7 +241,7 @@ void float_init_altivec(DSPContext* c, AVCodecContext *avctx) { c->vector_fmul = vector_fmul_altivec; c->vector_fmul_reverse = vector_fmul_reverse_altivec; - c->vector_fmul_add_add = vector_fmul_add_add_altivec; + c->vector_fmul_add = vector_fmul_add_altivec; c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec; if(!(avctx->flags & CODEC_FLAG_BITEXACT)) { c->vector_fmul_window = vector_fmul_window_altivec; diff --git a/libavcodec/wmadec.c b/libavcodec/wmadec.c index ca3bf0a523..a4d5ad6d27 100644 --- a/libavcodec/wmadec.c +++ b/libavcodec/wmadec.c @@ -301,16 +301,16 @@ static void wma_window(WMACodecContext *s, float *out) block_len = s->block_len; bsize = s->frame_len_bits - s->block_len_bits; - s->dsp.vector_fmul_add_add(out, in, s->windows[bsize], - out, 0, block_len, 1); + s->dsp.vector_fmul_add(out, in, s->windows[bsize], + out, block_len); } else { block_len = 1 << s->prev_block_len_bits; n = (s->block_len - block_len) / 2; bsize = 
s->frame_len_bits - s->prev_block_len_bits; - s->dsp.vector_fmul_add_add(out+n, in+n, s->windows[bsize], - out+n, 0, block_len, 1); + s->dsp.vector_fmul_add(out+n, in+n, s->windows[bsize], + out+n, block_len); memcpy(out+n+block_len, in+n+block_len, n*sizeof(float)); } diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index f430abcbb0..79ceb15554 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -2125,34 +2125,9 @@ static void vector_fmul_reverse_sse(float *dst, const float *src0, const float * ); } -static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1, - const float *src2, int src3, int len, int step){ +static void vector_fmul_add_3dnow(float *dst, const float *src0, const float *src1, + const float *src2, int len){ x86_reg i = (len-4)*4; - if(step == 2 && src3 == 0){ - dst += (len-4)*2; - __asm__ volatile( - "1: \n\t" - "movq (%2,%0), %%mm0 \n\t" - "movq 8(%2,%0), %%mm1 \n\t" - "pfmul (%3,%0), %%mm0 \n\t" - "pfmul 8(%3,%0), %%mm1 \n\t" - "pfadd (%4,%0), %%mm0 \n\t" - "pfadd 8(%4,%0), %%mm1 \n\t" - "movd %%mm0, (%1) \n\t" - "movd %%mm1, 16(%1) \n\t" - "psrlq $32, %%mm0 \n\t" - "psrlq $32, %%mm1 \n\t" - "movd %%mm0, 8(%1) \n\t" - "movd %%mm1, 24(%1) \n\t" - "sub $32, %1 \n\t" - "sub $16, %0 \n\t" - "jge 1b \n\t" - :"+r"(i), "+r"(dst) - :"r"(src0), "r"(src1), "r"(src2) - :"memory" - ); - } - else if(step == 1 && src3 == 0){ __asm__ volatile( "1: \n\t" "movq (%2,%0), %%mm0 \n\t" @@ -2169,47 +2144,11 @@ static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float :"r"(dst), "r"(src0), "r"(src1), "r"(src2) :"memory" ); - } - else - ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step); __asm__ volatile("femms"); } -static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1, - const float *src2, int src3, int len, int step){ +static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1, + const float *src2, 
int len){ x86_reg i = (len-8)*4; - if(step == 2 && src3 == 0){ - dst += (len-8)*2; - __asm__ volatile( - "1: \n\t" - "movaps (%2,%0), %%xmm0 \n\t" - "movaps 16(%2,%0), %%xmm1 \n\t" - "mulps (%3,%0), %%xmm0 \n\t" - "mulps 16(%3,%0), %%xmm1 \n\t" - "addps (%4,%0), %%xmm0 \n\t" - "addps 16(%4,%0), %%xmm1 \n\t" - "movss %%xmm0, (%1) \n\t" - "movss %%xmm1, 32(%1) \n\t" - "movhlps %%xmm0, %%xmm2 \n\t" - "movhlps %%xmm1, %%xmm3 \n\t" - "movss %%xmm2, 16(%1) \n\t" - "movss %%xmm3, 48(%1) \n\t" - "shufps $0xb1, %%xmm0, %%xmm0 \n\t" - "shufps $0xb1, %%xmm1, %%xmm1 \n\t" - "movss %%xmm0, 8(%1) \n\t" - "movss %%xmm1, 40(%1) \n\t" - "movhlps %%xmm0, %%xmm2 \n\t" - "movhlps %%xmm1, %%xmm3 \n\t" - "movss %%xmm2, 24(%1) \n\t" - "movss %%xmm3, 56(%1) \n\t" - "sub $64, %1 \n\t" - "sub $32, %0 \n\t" - "jge 1b \n\t" - :"+r"(i), "+r"(dst) - :"r"(src0), "r"(src1), "r"(src2) - :"memory" - ); - } - else if(step == 1 && src3 == 0){ __asm__ volatile( "1: \n\t" "movaps (%2,%0), %%xmm0 \n\t" @@ -2226,9 +2165,6 @@ static void vector_fmul_add_add_sse(float *dst, const float *src0, const float * :"r"(dst), "r"(src0), "r"(src1), "r"(src2) :"memory" ); - } - else - ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step); } static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1, @@ -3077,7 +3013,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->ac3_downmix = ac3_downmix_sse; c->vector_fmul = vector_fmul_sse; c->vector_fmul_reverse = vector_fmul_reverse_sse; - c->vector_fmul_add_add = vector_fmul_add_add_sse; + c->vector_fmul_add = vector_fmul_add_sse; c->vector_fmul_window = vector_fmul_window_sse; c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; c->vector_clipf = vector_clipf_sse; @@ -3085,7 +3021,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->float_to_int16_interleave = float_to_int16_interleave_sse; } if(mm_flags & FF_MM_3DNOW) - c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse + 
c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse if(mm_flags & FF_MM_SSE2){ c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; c->float_to_int16 = float_to_int16_sse2; |