From 421c99a4a7c116fc2d4e7a6c866c2209852ef581 Mon Sep 17 00:00:00 2001 From: Vitor Sessak Date: Tue, 3 Jan 2012 21:40:57 +0100 Subject: mpegaudiodec: interleave iMDCT buffer to simplify future SIMD implementations Signed-off-by: Ronald S. Bultje --- libavcodec/mpegaudiodec.c | 50 +++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'libavcodec/mpegaudiodec.c') diff --git a/libavcodec/mpegaudiodec.c b/libavcodec/mpegaudiodec.c index cd7b7f5053..702476b7ec 100644 --- a/libavcodec/mpegaudiodec.c +++ b/libavcodec/mpegaudiodec.c @@ -621,17 +621,17 @@ static void imdct36(INTFLOAT *out, INTFLOAT *buf, INTFLOAT *in, INTFLOAT *win) t0 = s0 + s1; t1 = s0 - s1; - out[(9 + j) * SBLIMIT] = MULH3(t1, win[ 9 + j], 1) + buf[9 + j]; - out[(8 - j) * SBLIMIT] = MULH3(t1, win[ 8 - j], 1) + buf[8 - j]; - buf[ 9 + j ] = MULH3(t0, win[18 + 9 + j], 1); - buf[ 8 - j ] = MULH3(t0, win[18 + 8 - j], 1); + out[(9 + j) * SBLIMIT] = MULH3(t1, win[ 9 + j], 1) + buf[4*(9 + j)]; + out[(8 - j) * SBLIMIT] = MULH3(t1, win[ 8 - j], 1) + buf[4*(8 - j)]; + buf[4 * ( 9 + j )] = MULH3(t0, win[18 + 9 + j], 1); + buf[4 * ( 8 - j )] = MULH3(t0, win[18 + 8 - j], 1); t0 = s2 + s3; t1 = s2 - s3; - out[(9 + 8 - j) * SBLIMIT] = MULH3(t1, win[ 9 + 8 - j], 1) + buf[9 + 8 - j]; - out[ j * SBLIMIT] = MULH3(t1, win[ j], 1) + buf[ j]; - buf[ 9 + 8 - j ] = MULH3(t0, win[18 + 9 + 8 - j], 1); - buf[ j ] = MULH3(t0, win[18 + j], 1); + out[(9 + 8 - j) * SBLIMIT] = MULH3(t1, win[ 9 + 8 - j], 1) + buf[4*(9 + 8 - j)]; + out[ j * SBLIMIT] = MULH3(t1, win[ j], 1) + buf[4*( j)]; + buf[4 * ( 9 + 8 - j )] = MULH3(t0, win[18 + 9 + 8 - j], 1); + buf[4 * ( j )] = MULH3(t0, win[18 + j], 1); i += 4; } @@ -639,10 +639,10 @@ static void imdct36(INTFLOAT *out, INTFLOAT *buf, INTFLOAT *in, INTFLOAT *win) s1 = MULH3(tmp[17], icos36h[4], 2); t0 = s0 + s1; t1 = s0 - s1; - out[(9 + 4) * SBLIMIT] = MULH3(t1, win[ 9 + 4], 1) + buf[9 + 4]; - out[(8 - 4) * SBLIMIT] = MULH3(t1, win[ 8 - 4], 1) + buf[8 - 4]; - buf[ 9 + 4 ] = MULH3(t0, win[18 + 9 + 4], 1); - buf[ 8 - 4 ] = MULH3(t0, win[18 + 8 - 4], 1); + out[(9 + 4) * SBLIMIT] = MULH3(t1, win[ 9 + 4], 1) + buf[4*(9 + 4)]; + out[(8 - 4) * SBLIMIT] = MULH3(t1, win[ 8 - 4], 1) + buf[4*(8 - 4)]; + buf[4 * ( 9 + 4 )] = MULH3(t0, win[18 + 9 + 4], 1); + buf[4 * ( 8 - 4 )] = MULH3(t0, win[18 + 8 - 4], 1); } /* return the number of decoded frames */ @@ -1407,7 +1407,7 @@ static void compute_imdct(MPADecodeContext *s, GranuleDef *g, imdct36(out_ptr, buf, ptr, win); out_ptr += 18 * SBLIMIT; ptr += 18; - buf += 18; + buf += (j&3) != 3 ? 1 : (4*18-3); } for (j = mdct_long_end; j < sblimit; j++) { /* select frequency inversion */ @@ -1415,40 +1415,40 @@ static void compute_imdct(MPADecodeContext *s, GranuleDef *g, out_ptr = sb_samples + j; for (i = 0; i < 6; i++) { - *out_ptr = buf[i]; + *out_ptr = buf[4*i]; out_ptr += SBLIMIT; } imdct12(out2, ptr + 0); for (i = 0; i < 6; i++) { - *out_ptr = MULH3(out2[i ], win[i ], 1) + buf[i + 6*1]; - buf[i + 6*2] = MULH3(out2[i + 6], win[i + 6], 1); + *out_ptr = MULH3(out2[i ], win[i ], 1) + buf[4*(i + 6*1)]; + buf[4*(i + 6*2)] = MULH3(out2[i + 6], win[i + 6], 1); out_ptr += SBLIMIT; } imdct12(out2, ptr + 1); for (i = 0; i < 6; i++) { - *out_ptr = MULH3(out2[i ], win[i ], 1) + buf[i + 6*2]; - buf[i + 6*0] = MULH3(out2[i + 6], win[i + 6], 1); + *out_ptr = MULH3(out2[i ], win[i ], 1) + buf[4*(i + 6*2)]; + buf[4*(i + 6*0)] = MULH3(out2[i + 6], win[i + 6], 1); out_ptr += SBLIMIT; } imdct12(out2, ptr + 2); for (i = 0; i < 6; i++) { - buf[i + 6*0] = MULH3(out2[i ], win[i ], 1) + buf[i + 6*0]; - buf[i + 6*1] = MULH3(out2[i + 6], win[i + 6], 1); - buf[i + 6*2] = 0; + buf[4*(i + 6*0)] = MULH3(out2[i ], win[i ], 1) + buf[4*(i + 6*0)]; + buf[4*(i + 6*1)] = MULH3(out2[i + 6], win[i + 6], 1); + buf[4*(i + 6*2)] = 0; } ptr += 18; - buf += 18; + buf += (j&3) != 3 ? 1 : (4*18-3); } /* zero bands */ for (j = sblimit; j < SBLIMIT; j++) { /* overlap */ out_ptr = sb_samples + j; for (i = 0; i < 18; i++) { - *out_ptr = buf[i]; - buf[i] = 0; + *out_ptr = buf[4*i]; + buf[4*i] = 0; out_ptr += SBLIMIT; } - buf += 18; + buf += (j&3) != 3 ? 1 : (4*18-3); } } -- cgit v1.2.3 From 6dfcf53092aba9f1ef31629e11515df5752327db Mon Sep 17 00:00:00 2001 From: Vitor Sessak Date: Wed, 4 Jan 2012 21:32:47 +0100 Subject: mpegaudiodec: move imdct and windowing function to mpegaudiodsp Signed-off-by: Ronald S. Bultje --- libavcodec/mpegaudiodec.c | 193 +++---------------------------------- libavcodec/mpegaudiodsp.c | 5 + libavcodec/mpegaudiodsp.h | 16 +++ libavcodec/mpegaudiodsp_template.c | 190 ++++++++++++++++++++++++++++++++++++ 4 files changed, 226 insertions(+), 178 deletions(-) (limited to 'libavcodec/mpegaudiodec.c') diff --git a/libavcodec/mpegaudiodec.c b/libavcodec/mpegaudiodec.c index 702476b7ec..70c5f76381 100644 --- a/libavcodec/mpegaudiodec.c +++ b/libavcodec/mpegaudiodec.c @@ -130,7 +130,6 @@ static uint16_t band_index_long[9][23]; static INTFLOAT is_table[2][16]; static INTFLOAT is_table_lsf[2][2][16]; static INTFLOAT csa_table[8][4]; -static INTFLOAT mdct_win[8][36]; static int16_t division_tab3[1<<6 ]; static int16_t division_tab5[1<<8 ]; @@ -417,43 +416,6 @@ static av_cold void decode_init_static(void) csa_table[i][3] = ca - cs; #endif } - - /* compute mdct windows */ - for (i = 0; i < 36; i++) { - for (j = 0; j < 4; j++) { - double d; - - if (j == 2 && i % 3 != 1) - continue; - - d = sin(M_PI * (i + 0.5) / 36.0); - if (j == 1) { - if (i >= 30) d = 0; - else if (i >= 24) d = sin(M_PI * (i - 18 + 0.5) / 12.0); - else if (i >= 18) d = 1; - } else if (j == 3) { - if (i < 6) d = 0; - else if (i < 12) d = sin(M_PI * (i - 6 + 0.5) / 12.0); - else if (i < 18) d = 1; - } - //merge last stage of imdct into the window coefficients - d *= 0.5 / cos(M_PI * (2 * i + 19) / 72); - - if (j == 2) - mdct_win[j][i/3] = FIXHR((d / (1<<5))); - else - mdct_win[j][i ] = FIXHR((d / (1<<5))); - } - } - - /* NOTE: we do frequency inversion adter the MDCT by changing - the sign of the right window coefs */ - for (j = 0; j < 4; j++) { - for (i = 0; i < 36; i += 2) { - mdct_win[j + 4][i ] = mdct_win[j][i ]; - mdct_win[j + 4][i + 1] = -mdct_win[j][i + 1]; - } - } } static av_cold int decode_init(AVCodecContext * avctx) @@ -483,32 +445,9 @@ static av_cold int decode_init(AVCodecContext * avctx) } #define C3 FIXHR(0.86602540378443864676/2) - -/* 0.5 / cos(pi*(2*i+1)/36) */ -static const INTFLOAT icos36[9] = { - FIXR(0.50190991877167369479), - FIXR(0.51763809020504152469), //0 - FIXR(0.55168895948124587824), - FIXR(0.61038729438072803416), - FIXR(0.70710678118654752439), //1 - FIXR(0.87172339781054900991), - FIXR(1.18310079157624925896), - FIXR(1.93185165257813657349), //2 - FIXR(5.73685662283492756461), -}; - -/* 0.5 / cos(pi*(2*i+1)/36) */ -static const INTFLOAT icos36h[9] = { - FIXHR(0.50190991877167369479/2), - FIXHR(0.51763809020504152469/2), //0 - FIXHR(0.55168895948124587824/2), - FIXHR(0.61038729438072803416/2), - FIXHR(0.70710678118654752439/2), //1 - FIXHR(0.87172339781054900991/2), - FIXHR(1.18310079157624925896/4), - FIXHR(1.93185165257813657349/4), //2 -// FIXHR(5.73685662283492756461), -}; +#define C4 FIXHR(0.70710678118654752439/2) //0.5 / cos(pi*(9)/36) +#define C5 FIXHR(0.51763809020504152469/2) //0.5 / cos(pi*(5)/36) +#define C6 FIXHR(1.93185165257813657349/4) //0.5 / cos(pi*(15)/36) /* 12 points IMDCT. We compute it "by hand" by factorizing obvious cases. */ @@ -529,7 +468,7 @@ static void imdct12(INTFLOAT *out, INTFLOAT *in) in3 = MULH3(in3, C3, 4); t1 = in0 - in4; - t2 = MULH3(in1 - in5, icos36h[4], 2); + t2 = MULH3(in1 - in5, C4, 2); out[ 7] = out[10] = t1 + t2; @@ -539,112 +478,20 @@ static void imdct12(INTFLOAT *out, INTFLOAT *in) in0 += SHR(in4, 1); in4 = in0 + in2; in5 += 2*in1; - in1 = MULH3(in5 + in3, icos36h[1], 1); + in1 = MULH3(in5 + in3, C5, 1); out[ 8] = out[ 9] = in4 + in1; out[ 2] = out[ 3] = in4 - in1; in0 -= in2; - in5 = MULH3(in5 - in3, icos36h[7], 2); + in5 = MULH3(in5 - in3, C6, 2); out[ 0] = out[ 5] = in0 - in5; out[ 6] = out[11] = in0 + in5; } -/* cos(pi*i/18) */ -#define C1 FIXHR(0.98480775301220805936/2) -#define C2 FIXHR(0.93969262078590838405/2) -#define C3 FIXHR(0.86602540378443864676/2) -#define C4 FIXHR(0.76604444311897803520/2) -#define C5 FIXHR(0.64278760968653932632/2) -#define C6 FIXHR(0.5/2) -#define C7 FIXHR(0.34202014332566873304/2) -#define C8 FIXHR(0.17364817766693034885/2) - - -/* using Lee like decomposition followed by hand coded 9 points DCT */ -static void imdct36(INTFLOAT *out, INTFLOAT *buf, INTFLOAT *in, INTFLOAT *win) -{ - int i, j; - INTFLOAT t0, t1, t2, t3, s0, s1, s2, s3; - INTFLOAT tmp[18], *tmp1, *in1; - - for (i = 17; i >= 1; i--) - in[i] += in[i-1]; - for (i = 17; i >= 3; i -= 2) - in[i] += in[i-2]; - - for (j = 0; j < 2; j++) { - tmp1 = tmp + j; - in1 = in + j; - - t2 = in1[2*4] + in1[2*8] - in1[2*2]; - - t3 = in1[2*0] + SHR(in1[2*6],1); - t1 = in1[2*0] - in1[2*6]; - tmp1[ 6] = t1 - SHR(t2,1); - tmp1[16] = t1 + t2; - - t0 = MULH3(in1[2*2] + in1[2*4] , C2, 2); - t1 = MULH3(in1[2*4] - in1[2*8] , -2*C8, 1); - t2 = MULH3(in1[2*2] + in1[2*8] , -C4, 2); - - tmp1[10] = t3 - t0 - t2; - tmp1[ 2] = t3 + t0 + t1; - tmp1[14] = t3 + t2 - t1; - - tmp1[ 4] = MULH3(in1[2*5] + in1[2*7] - in1[2*1], -C3, 2); - t2 = MULH3(in1[2*1] + in1[2*5], C1, 2); - t3 = MULH3(in1[2*5] - in1[2*7], -2*C7, 1); - t0 = MULH3(in1[2*3], C3, 2); - - t1 = MULH3(in1[2*1] + in1[2*7], -C5, 2); - - tmp1[ 0] = t2 + t3 + t0; - tmp1[12] = t2 + t1 - t0; - tmp1[ 8] = t3 - t1 - t0; - } - - i = 0; - for (j = 0; j < 4; j++) { - t0 = tmp[i]; - t1 = tmp[i + 2]; - s0 = t1 + t0; - s2 = t1 - t0; - - t2 = tmp[i + 1]; - t3 = tmp[i + 3]; - s1 = MULH3(t3 + t2, icos36h[ j], 2); - s3 = MULLx(t3 - t2, icos36 [8 - j], FRAC_BITS); - - t0 = s0 + s1; - t1 = s0 - s1; - out[(9 + j) * SBLIMIT] = MULH3(t1, win[ 9 + j], 1) + buf[4*(9 + j)]; - out[(8 - j) * SBLIMIT] = MULH3(t1, win[ 8 - j], 1) + buf[4*(8 - j)]; - buf[4 * ( 9 + j )] = MULH3(t0, win[18 + 9 + j], 1); - buf[4 * ( 8 - j )] = MULH3(t0, win[18 + 8 - j], 1); - - t0 = s2 + s3; - t1 = s2 - s3; - out[(9 + 8 - j) * SBLIMIT] = MULH3(t1, win[ 9 + 8 - j], 1) + buf[4*(9 + 8 - j)]; - out[ j * SBLIMIT] = MULH3(t1, win[ j], 1) + buf[4*( j)]; - buf[4 * ( 9 + 8 - j )] = MULH3(t0, win[18 + 9 + 8 - j], 1); - buf[4 * ( j )] = MULH3(t0, win[18 + j], 1); - i += 4; - } - - s0 = tmp[16]; - s1 = MULH3(tmp[17], icos36h[4], 2); - t0 = s0 + s1; - t1 = s0 - s1; - out[(9 + 4) * SBLIMIT] = MULH3(t1, win[ 9 + 4], 1) + buf[4*(9 + 4)]; - out[(8 - 4) * SBLIMIT] = MULH3(t1, win[ 8 - 4], 1) + buf[4*(8 - 4)]; - buf[4 * ( 9 + 4 )] = MULH3(t0, win[18 + 9 + 4], 1); - buf[4 * ( 8 - 4 )] = MULH3(t0, win[18 + 8 - 4], 1); -} - /* return the number of decoded frames */ static int mp_decode_layer1(MPADecodeContext *s) { @@ -1366,7 +1213,7 @@ static void compute_antialias(MPADecodeContext *s, GranuleDef *g) static void compute_imdct(MPADecodeContext *s, GranuleDef *g, INTFLOAT *sb_samples, INTFLOAT *mdct_buf) { - INTFLOAT *win, *win1, *out_ptr, *ptr, *buf, *ptr1; + INTFLOAT *win, *out_ptr, *ptr, *buf, *ptr1; INTFLOAT out2[12]; int i, j, mdct_long_end, sblimit; @@ -1392,26 +1239,16 @@ static void compute_imdct(MPADecodeContext *s, GranuleDef *g, mdct_long_end = sblimit; } - buf = mdct_buf; - ptr = g->sb_hybrid; - for (j = 0; j < mdct_long_end; j++) { - /* apply window & overlap with previous buffer */ - out_ptr = sb_samples + j; - /* select window */ - if (g->switch_point && j < 2) - win1 = mdct_win[0]; - else - win1 = mdct_win[g->block_type]; - /* select frequency inversion */ - win = win1 + ((4 * 36) & -(j & 1)); - imdct36(out_ptr, buf, ptr, win); - out_ptr += 18 * SBLIMIT; - ptr += 18; - buf += (j&3) != 3 ? 1 : (4*18-3); - } + s->mpadsp.RENAME(imdct36_blocks)(sb_samples, mdct_buf, g->sb_hybrid, + mdct_long_end, g->switch_point, + g->block_type); + + buf = mdct_buf + 4*18*(mdct_long_end >> 2) + (mdct_long_end & 3); + ptr = g->sb_hybrid + 18 * mdct_long_end; + for (j = mdct_long_end; j < sblimit; j++) { /* select frequency inversion */ - win = mdct_win[2 + (4 & -(j & 1))]; + win = RENAME(ff_mdct_win)[2 + (4 & -(j & 1))]; out_ptr = sb_samples + j; for (i = 0; i < 6; i++) { diff --git a/libavcodec/mpegaudiodsp.c b/libavcodec/mpegaudiodsp.c index 438b097d06..431724a71c 100644 --- a/libavcodec/mpegaudiodsp.c +++ b/libavcodec/mpegaudiodsp.c @@ -28,6 +28,8 @@ void ff_mpadsp_init(MPADSPContext *s) DCTContext dct; ff_dct_init(&dct, 5, DCT_II); + ff_init_mpadsp_tabs_float(); + ff_init_mpadsp_tabs_fixed(); s->apply_window_float = ff_mpadsp_apply_window_float; s->apply_window_fixed = ff_mpadsp_apply_window_fixed; @@ -35,6 +37,9 @@ void ff_mpadsp_init(MPADSPContext *s) s->dct32_float = dct.dct32; s->dct32_fixed = ff_dct32_fixed; + s->imdct36_blocks_float = ff_imdct36_blocks_float; + s->imdct36_blocks_fixed = ff_imdct36_blocks_fixed; + if (ARCH_ARM) ff_mpadsp_init_arm(s); if (HAVE_MMX) ff_mpadsp_init_mmx(s); if (HAVE_ALTIVEC) ff_mpadsp_init_altivec(s); diff --git a/libavcodec/mpegaudiodsp.h b/libavcodec/mpegaudiodsp.h index 8a18db8325..01fd698f3e 100644 --- a/libavcodec/mpegaudiodsp.h +++ b/libavcodec/mpegaudiodsp.h @@ -28,6 +28,10 @@ typedef struct MPADSPContext { int *dither_state, int16_t *samples, int incr); void (*dct32_float)(float *dst, const float *src); void (*dct32_fixed)(int *dst, const int *src); + void (*imdct36_blocks_float)(float *out, float *buf, float *in, + int count, int switch_point, int block_type); + void (*imdct36_blocks_fixed)(int *out, int *buf, int *in, + int count, int switch_point, int block_type); } MPADSPContext; void ff_mpadsp_init(MPADSPContext *s); @@ -61,4 +65,16 @@ void ff_mpadsp_apply_window_fixed(int32_t *synth_buf, int32_t *window, int *dither_state, int16_t *samples, int incr); +void ff_imdct36_blocks_float(float *out, float *buf, float *in, + int count, int switch_point, int block_type); + +void ff_imdct36_blocks_fixed(int *out, int *buf, int *in, + int count, int switch_point, int block_type); + +void ff_init_mpadsp_tabs_float(void); +void ff_init_mpadsp_tabs_fixed(void); + +extern int ff_mdct_win_fixed[8][36]; +extern float ff_mdct_win_float[8][36]; + #endif /* AVCODEC_MPEGAUDIODSP_H */ diff --git a/libavcodec/mpegaudiodsp_template.c b/libavcodec/mpegaudiodsp_template.c index 5561c46135..5a6adb8cca 100644 --- a/libavcodec/mpegaudiodsp_template.c +++ b/libavcodec/mpegaudiodsp_template.c @@ -39,7 +39,12 @@ static inline float round_sample(float *sum) #define MACS(rt, ra, rb) rt+=(ra)*(rb) #define MULS(ra, rb) ((ra)*(rb)) +#define MULH3(x, y, s) ((s)*(y)*(x)) #define MLSS(rt, ra, rb) rt-=(ra)*(rb) +#define MULLx(x, y, s) ((y)*(x)) +#define FIXHR(x) ((float)(x)) +#define FIXR(x) ((float)(x)) +#define SHR(a,b) ((a)*(1.0f/(1<<(b)))) #else @@ -57,8 +62,16 @@ static inline int round_sample(int64_t *sum) # define MULS(ra, rb) MUL64(ra, rb) # define MACS(rt, ra, rb) MAC64(rt, ra, rb) # define MLSS(rt, ra, rb) MLS64(rt, ra, rb) +# define MULH3(x, y, s) MULH((s)*(x), y) +# define MULLx(x, y, s) MULL(x,y,s) +# define SHR(a,b) ((a)>>(b)) +# define FIXR(a) ((int)((a) * FRAC_ONE + 0.5)) +# define FIXHR(a) ((int)((a) * (1LL<<32) + 0.5)) #endif +/** Window for MDCT. */ +DECLARE_ALIGNED(16, INTFLOAT, RENAME(ff_mdct_win))[8][36]; + DECLARE_ALIGNED(16, MPA_INT, RENAME(ff_mpa_synth_window))[512+256]; #define SUM8(op, sum, w, p) \ @@ -194,6 +207,7 @@ void av_cold RENAME(ff_mpa_synth_init)(MPA_INT *window) window[512 - i] = v; } + // Needed for avoiding shuffles in ASM implementations for(i=0; i < 8; i++) for(j=0; j < 16; j++) @@ -203,3 +217,179 @@ void av_cold RENAME(ff_mpa_synth_init)(MPA_INT *window) for(j=0; j < 16; j++) window[512+128+16*i+j] = window[64*i+48-j]; } + +void RENAME(ff_init_mpadsp_tabs)(void) +{ + int i, j; + /* compute mdct windows */ + for (i = 0; i < 36; i++) { + for (j = 0; j < 4; j++) { + double d; + + if (j == 2 && i % 3 != 1) + continue; + + d = sin(M_PI * (i + 0.5) / 36.0); + if (j == 1) { + if (i >= 30) d = 0; + else if (i >= 24) d = sin(M_PI * (i - 18 + 0.5) / 12.0); + else if (i >= 18) d = 1; + } else if (j == 3) { + if (i < 6) d = 0; + else if (i < 12) d = sin(M_PI * (i - 6 + 0.5) / 12.0); + else if (i < 18) d = 1; + } + //merge last stage of imdct into the window coefficients + d *= 0.5 / cos(M_PI * (2 * i + 19) / 72); + + if (j == 2) + RENAME(ff_mdct_win)[j][i/3] = FIXHR((d / (1<<5))); + else + RENAME(ff_mdct_win)[j][i ] = FIXHR((d / (1<<5))); + } + } + + /* NOTE: we do frequency inversion adter the MDCT by changing + the sign of the right window coefs */ + for (j = 0; j < 4; j++) { + for (i = 0; i < 36; i += 2) { + RENAME(ff_mdct_win)[j + 4][i ] = RENAME(ff_mdct_win)[j][i ]; + RENAME(ff_mdct_win)[j + 4][i + 1] = -RENAME(ff_mdct_win)[j][i + 1]; + } + } +} +/* cos(pi*i/18) */ +#define C1 FIXHR(0.98480775301220805936/2) +#define C2 FIXHR(0.93969262078590838405/2) +#define C3 FIXHR(0.86602540378443864676/2) +#define C4 FIXHR(0.76604444311897803520/2) +#define C5 FIXHR(0.64278760968653932632/2) +#define C6 FIXHR(0.5/2) +#define C7 FIXHR(0.34202014332566873304/2) +#define C8 FIXHR(0.17364817766693034885/2) + +/* 0.5 / cos(pi*(2*i+1)/36) */ +static const INTFLOAT icos36[9] = { + FIXR(0.50190991877167369479), + FIXR(0.51763809020504152469), //0 + FIXR(0.55168895948124587824), + FIXR(0.61038729438072803416), + FIXR(0.70710678118654752439), //1 + FIXR(0.87172339781054900991), + FIXR(1.18310079157624925896), + FIXR(1.93185165257813657349), //2 + FIXR(5.73685662283492756461), +}; + +/* 0.5 / cos(pi*(2*i+1)/36) */ +static const INTFLOAT icos36h[9] = { + FIXHR(0.50190991877167369479/2), + FIXHR(0.51763809020504152469/2), //0 + FIXHR(0.55168895948124587824/2), + FIXHR(0.61038729438072803416/2), + FIXHR(0.70710678118654752439/2), //1 + FIXHR(0.87172339781054900991/2), + FIXHR(1.18310079157624925896/4), + FIXHR(1.93185165257813657349/4), //2 +// FIXHR(5.73685662283492756461), +}; + +/* using Lee like decomposition followed by hand coded 9 points DCT */ +static void imdct36(INTFLOAT *out, INTFLOAT *buf, INTFLOAT *in, INTFLOAT *win) +{ + int i, j; + INTFLOAT t0, t1, t2, t3, s0, s1, s2, s3; + INTFLOAT tmp[18], *tmp1, *in1; + + for (i = 17; i >= 1; i--) + in[i] += in[i-1]; + for (i = 17; i >= 3; i -= 2) + in[i] += in[i-2]; + + for (j = 0; j < 2; j++) { + tmp1 = tmp + j; + in1 = in + j; + + t2 = in1[2*4] + in1[2*8] - in1[2*2]; + + t3 = in1[2*0] + SHR(in1[2*6],1); + t1 = in1[2*0] - in1[2*6]; + tmp1[ 6] = t1 - SHR(t2,1); + tmp1[16] = t1 + t2; + + t0 = MULH3(in1[2*2] + in1[2*4] , C2, 2); + t1 = MULH3(in1[2*4] - in1[2*8] , -2*C8, 1); + t2 = MULH3(in1[2*2] + in1[2*8] , -C4, 2); + + tmp1[10] = t3 - t0 - t2; + tmp1[ 2] = t3 + t0 + t1; + tmp1[14] = t3 + t2 - t1; + + tmp1[ 4] = MULH3(in1[2*5] + in1[2*7] - in1[2*1], -C3, 2); + t2 = MULH3(in1[2*1] + in1[2*5], C1, 2); + t3 = MULH3(in1[2*5] - in1[2*7], -2*C7, 1); + t0 = MULH3(in1[2*3], C3, 2); + + t1 = MULH3(in1[2*1] + in1[2*7], -C5, 2); + + tmp1[ 0] = t2 + t3 + t0; + tmp1[12] = t2 + t1 - t0; + tmp1[ 8] = t3 - t1 - t0; + } + + i = 0; + for (j = 0; j < 4; j++) { + t0 = tmp[i]; + t1 = tmp[i + 2]; + s0 = t1 + t0; + s2 = t1 - t0; + + t2 = tmp[i + 1]; + t3 = tmp[i + 3]; + s1 = MULH3(t3 + t2, icos36h[ j], 2); + s3 = MULLx(t3 - t2, icos36 [8 - j], FRAC_BITS); + + t0 = s0 + s1; + t1 = s0 - s1; + out[(9 + j) * SBLIMIT] = MULH3(t1, win[ 9 + j], 1) + buf[4*(9 + j)]; + out[(8 - j) * SBLIMIT] = MULH3(t1, win[ 8 - j], 1) + buf[4*(8 - j)]; + buf[4 * ( 9 + j )] = MULH3(t0, win[18 + 9 + j], 1); + buf[4 * ( 8 - j )] = MULH3(t0, win[18 + 8 - j], 1); + + t0 = s2 + s3; + t1 = s2 - s3; + out[(9 + 8 - j) * SBLIMIT] = MULH3(t1, win[ 9 + 8 - j], 1) + buf[4*(9 + 8 - j)]; + out[ j * SBLIMIT] = MULH3(t1, win[ j], 1) + buf[4*( j)]; + buf[4 * ( 9 + 8 - j )] = MULH3(t0, win[18 + 9 + 8 - j], 1); + buf[4 * ( j )] = MULH3(t0, win[18 + j], 1); + i += 4; + } + + s0 = tmp[16]; + s1 = MULH3(tmp[17], icos36h[4], 2); + t0 = s0 + s1; + t1 = s0 - s1; + out[(9 + 4) * SBLIMIT] = MULH3(t1, win[ 9 + 4], 1) + buf[4*(9 + 4)]; + out[(8 - 4) * SBLIMIT] = MULH3(t1, win[ 8 - 4], 1) + buf[4*(8 - 4)]; + buf[4 * ( 9 + 4 )] = MULH3(t0, win[18 + 9 + 4], 1); + buf[4 * ( 8 - 4 )] = MULH3(t0, win[18 + 8 - 4], 1); +} + +void RENAME(ff_imdct36_blocks)(INTFLOAT *out, INTFLOAT *buf, INTFLOAT *in, + int count, int switch_point, int block_type) +{ + int j; + for (j=0 ; j < count; j++) { + /* apply window & overlap with previous buffer */ + + /* select window */ + int win_idx = (switch_point && j < 2) ? 0 : block_type; + INTFLOAT *win = RENAME(ff_mdct_win)[win_idx + (4 & -(j & 1))]; + + imdct36(out, buf, in, win); + + in += 18; + buf += ((j&3) != 3 ? 1 : (72-3)); + out++; + } +} -- cgit v1.2.3 From 39df0c434c76aa6a6decccb969dfa51468440823 Mon Sep 17 00:00:00 2001 From: Vitor Sessak Date: Thu, 5 Jan 2012 20:26:33 +0100 Subject: mpegaudiodec: optimized iMDCT transform Signed-off-by: Ronald S. Bultje --- libavcodec/mpegaudiodec.c | 2 +- libavcodec/x86/Makefile | 1 + libavcodec/x86/imdct36_sse.asm | 721 ++++++++++++++++++++++++++++++++++++++ libavcodec/x86/mpegaudiodec_mmx.c | 80 +++++ libavutil/x86/x86inc.asm | 2 + 5 files changed, 805 insertions(+), 1 deletion(-) create mode 100644 libavcodec/x86/imdct36_sse.asm (limited to 'libavcodec/mpegaudiodec.c') diff --git a/libavcodec/mpegaudiodec.c b/libavcodec/mpegaudiodec.c index 70c5f76381..6a06afa680 100644 --- a/libavcodec/mpegaudiodec.c +++ b/libavcodec/mpegaudiodec.c @@ -58,7 +58,7 @@ typedef struct GranuleDef { int preflag; int short_start, long_end; /* long/short band indexes */ uint8_t scale_factors[40]; - INTFLOAT sb_hybrid[SBLIMIT * 18]; /* 576 samples */ + DECLARE_ALIGNED(16, INTFLOAT, sb_hybrid)[SBLIMIT * 18]; /* 576 samples */ } GranuleDef; typedef struct MPADecodeContext { diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index aa97942dba..2abe4fbe72 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -33,6 +33,7 @@ YASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o MMX-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp_mmx.o MMX-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhd_mmx.o MMX-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodec_mmx.o +YASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36_sse.o MMX-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_mmx.o YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_yasm.o MMX-OBJS-$(CONFIG_GPL) += x86/idct_mmx.o diff --git a/libavcodec/x86/imdct36_sse.asm b/libavcodec/x86/imdct36_sse.asm new file mode 100644 index 0000000000..2908459db7 --- /dev/null +++ b/libavcodec/x86/imdct36_sse.asm @@ -0,0 +1,721 @@ +;****************************************************************************** +;* 36 point SSE-optimized IMDCT transform +;* Copyright (c) 2011 Vitor Sessak +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86inc.asm" +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +align 16 +ps_mask: dd 0, ~0, ~0, ~0 +ps_mask2: dd 0, ~0, 0, ~0 +ps_mask3: dd 0, 0, 0, ~0 +ps_mask4: dd 0, ~0, 0, 0 + +ps_val1: dd -0.5, -0.5, -0.8660254038, -0.8660254038 +ps_val2: dd 1.0, 1.0, 0.8660254038, 0.8660254038 +ps_val3: dd 0.1736481777, 0.1736481777, 0.3420201433, 0.3420201433 +ps_val4: dd -0.7660444431, -0.7660444431, 0.8660254038, 0.8660254038 +ps_val5: dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530 +ps_val6: dd 0.5, 0.5, -0.6427876097, -0.6427876097 +ps_val7: dd 1.0, 1.0, -0.6427876097, -0.6427876097 + +ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000 +ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000 + +ps_cosh: dd 1.0, 0.50190991877167369479, 1.0, 5.73685662283492756461 + dd 1.0, 0.51763809020504152469, 1.0, 1.93185165257813657349 + dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896 + dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991 + dd 1.0, 0.70710678118654752439, 0.0, 0.0 + +ps_cosh_sse3: dd 1.0, -0.50190991877167369479, 1.0, -5.73685662283492756461 + dd 1.0, -0.51763809020504152469, 1.0, -1.93185165257813657349 + dd 1.0, -0.55168895948124587824, -1.0, 1.18310079157624925896 + dd 1.0, -0.61038729438072803416, -1.0, 0.87172339781054900991 + dd 1.0, 0.70710678118654752439, 0.0, 0.0 + +costabs: times 4 dd 0.98480773 + times 4 dd 0.93969262 + times 4 dd 0.86602539 + times 4 dd -0.76604444 + times 4 dd -0.64278764 + times 4 dd 0.50000000 + times 4 dd -0.50000000 + times 4 dd -0.34202015 + times 4 dd -0.17364818 + times 4 dd 0.50190992 + times 4 dd 0.51763808 + times 4 dd 0.55168896 + times 4 dd 0.61038726 + times 4 dd 0.70710677 + times 4 dd 0.87172341 + times 4 dd 1.18310082 + times 4 dd 1.93185163 + times 4 dd 5.73685646 + +%define SBLIMIT 32 +SECTION_TEXT + +%macro PSHUFD 3 +%if cpuflag(sse2) && notcpuflag(avx) + pshufd %1, %2, %3 +%else + shufps %1, %2, %2, %3 +%endif +%endmacro + +; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4} +; output %1={x3,x4,y1,y2} +%macro BUILDINVHIGHLOW 3 +%if cpuflag(avx) + shufps %1, %2, %3, 0x4e +%else + movlhps %1, %3 + movhlps %1, %2 +%endif +%endmacro + +; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4} +; output %1={x4,y1,y2,y3} +%macro ROTLEFT 3 +%if cpuflag(ssse3) + palignr %1, %3, %2, 12 +%else + BUILDINVHIGHLOW %1, %2, %3 + shufps %1, %1, %3, 0x99 +%endif +%endmacro + +%macro INVERTHL 2 +%if cpuflag(sse2) + PSHUFD %1, %2, 0x4e +%else + movhlps %1, %2 + movlhps %1, %2 +%endif +%endmacro + +%macro BUTTERF 3 + INVERTHL %2, %1 + xorps %1, [ps_p1p1m1m1] + addps %1, %2 +%if cpuflag(sse3) + mulps %1, %1, [ps_cosh_sse3 + %3] + PSHUFD %2, %1, 0xb1 + addsubps %1, %1, %2 +%else + mulps %1, [ps_cosh + %3] + PSHUFD %2, %1, 0xb1 + xorps %1, [ps_p1m1p1m1] + addps %1, %2 +%endif +%endmacro + +%macro STORE 4 + movhlps %2, %1 + movss [%3 ], %1 + movss [%3 + 2*%4], %2 + shufps %1, %1, 0xb1 + movss [%3 + %4], %1 + movhlps %2, %1 + movss [%3 + 3*%4], %2 +%endmacro + +%macro LOAD 4 + movlps %1, [%3 ] + movhps %1, [%3 + %4] + movlps %2, [%3 + 2*%4] + movhps %2, [%3 + 3*%4] + shufps %1, %2, 0x88 +%endmacro + +%macro LOADA64 2 +%if cpuflag(avx) + movu %1, [%2] +%else + movlps %1, [%2] + movhps %1, [%2 + 8] +%endif +%endmacro + +%macro DEFINE_IMDCT 0 +cglobal imdct36_float, 4,4,9, out, buf, in, win + + ; for(i=17;i>=1;i--) in[i] += in[i-1]; + LOADA64 m0, inq + LOADA64 m1, inq + 16 + + ROTLEFT m5, m0, m1 + + PSHUFD m6, m0, 0x93 + andps m6, m6, [ps_mask] + addps m0, m0, m6 + + LOADA64 m2, inq + 32 + + ROTLEFT m7, m1, m2 + + addps m1, m1, m5 + LOADA64 m3, inq + 48 + + ROTLEFT m5, m2, m3 + + xorps m4, m4, m4 + movlps m4, [inq+64] + BUILDINVHIGHLOW m6, m3, m4 + shufps m6, m6, m4, 0xa9 + + addps m4, m4, m6 + addps m2, m2, m7 + addps m3, m3, m5 + + ; for(i=17;i>=3;i-=2) in[i] += in[i-2]; + movlhps m5, m5, m0 + andps m5, m5, [ps_mask3] + + BUILDINVHIGHLOW m7, m0, m1 + andps m7, m7, [ps_mask2] + + addps m0, m0, m5 + + BUILDINVHIGHLOW m6, m1, m2 + andps m6, m6, [ps_mask2] + + addps m1, m1, m7 + + BUILDINVHIGHLOW m7, m2, m3 + andps m7, m7, [ps_mask2] + + addps m2, m2, m6 + + movhlps m6, m6, m3 + andps m6, m6, [ps_mask4] + + addps m3, m3, m7 + addps m4, m4, m6 + + ; Populate tmp[] + movlhps m6, m1, m5 ; zero out high values + subps m6, m6, m4 + + subps m5, m0, m3 + +%ifdef ARCH_X86_64 + SWAP m5, m8 +%endif + + mulps m7, m2, [ps_val1] + +%ifdef ARCH_X86_64 + mulps m5, m8, [ps_val2] +%else + mulps m5, m5, [ps_val2] +%endif + addps m7, m7, m5 + + mulps m5, m6, [ps_val1] + subps m7, m7, m5 + +%ifdef ARCH_X86_64 + SWAP m5, m8 +%else + subps m5, m0, m3 +%endif + + subps m5, m5, m6 + addps m5, m5, m2 + + shufps m6, m4, m3, 0xe4 + subps m6, m6, m2 + mulps m6, m6, [ps_val3] + + addps m4, m4, m1 + mulps m4, m4, [ps_val4] + + shufps m1, m1, m0, 0xe4 + addps m1, m1, m2 + mulps m1, m1, [ps_val5] + + mulps m3, m3, [ps_val6] + mulps m0, m0, [ps_val7] + addps m0, m0, m3 + + xorps m2, m1, [ps_p1p1m1m1] + subps m2, m2, m4 + addps m2, m2, m0 + + addps m3, m4, m0 + subps m3, m3, m6 + xorps m3, m3, [ps_p1p1m1m1] + + shufps m0, m0, m4, 0xe4 + subps m0, m0, m1 + addps m0, m0, m6 + + BUILDINVHIGHLOW m4, m2, m3 + shufps m3, m3, m2, 0x4e + + ; we have tmp = {SwAPLH(m0), SwAPLH(m7), m3, m4, m5} + + BUTTERF m0, m1, 0 + BUTTERF m7, m2, 16 + BUTTERF m3, m6, 32 + BUTTERF m4, m1, 48 + + mulps m5, m5, [ps_cosh + 64] + PSHUFD m1, m5, 0xe1 + xorps m5, m5, [ps_p1m1p1m1] + addps m5, m5, m1 + + ; permutates: + ; m0 0 1 2 3 => 2 6 10 14 m1 + ; m7 4 5 6 7 => 3 7 11 15 m2 + ; m3 8 9 10 11 => 17 13 9 5 m3 + ; m4 12 13 14 15 => 16 12 8 4 m5 + ; m5 16 17 xx xx => 0 1 xx xx m0 + + unpckhps m1, m0, m7 + unpckhps m6, m3, m4 + movhlps m2, m6, m1 + movlhps m1, m1, m6 + + unpcklps m5, m5, m4 + unpcklps m3, m3, m7 + movhlps m4, m3, m5 + movlhps m5, m5, m3 + SWAP m4, m3 + ; permutation done + + PSHUFD m6, m2, 0xb1 + movss m4, [bufq + 4*68] + movss m7, [bufq + 4*64] + unpcklps m7, m7, m4 + mulps m6, m6, [winq + 16*4] + addps m6, m6, m7 + movss [outq + 64*SBLIMIT], m6 + shufps m6, m6, m6, 0xb1 + movss [outq + 68*SBLIMIT], m6 + + mulps m6, m3, [winq + 4*4] + LOAD m4, m7, bufq + 4*16, 16 + addps m6, m6, m4 + STORE m6, m7, outq + 16*SBLIMIT, 4*SBLIMIT + + shufps m4, m0, m3, 0xb5 + mulps m4, m4, [winq + 8*4] + LOAD m7, m6, bufq + 4*32, 16 + addps m4, m4, m7 + STORE m4, m6, outq + 32*SBLIMIT, 4*SBLIMIT + + shufps m3, m3, m2, 0xb1 + mulps m3, m3, [winq + 12*4] + LOAD m7, m6, bufq + 4*48, 16 + addps m3, m3, m7 + STORE m3, m7, outq + 48*SBLIMIT, 4*SBLIMIT + + mulps m2, m2, [winq] + LOAD m6, m7, bufq, 16 + addps m2, m2, m6 + STORE m2, m7, outq, 4*SBLIMIT + + mulps m4, m1, [winq + 20*4] + STORE m4, m7, bufq, 16 + + mulps m3, m5, [winq + 24*4] + STORE m3, m7, bufq + 4*16, 16 + + shufps m0, m0, m5, 0xb0 + mulps m0, m0, [winq + 28*4] + STORE m0, m7, bufq + 4*32, 16 + + shufps m5, m5, m1, 0xb1 + mulps m5, m5, [winq + 32*4] + STORE m5, m7, bufq + 4*48, 16 + + shufps m1, m1, m1, 0xb1 + mulps m1, m1, [winq + 36*4] + movss [bufq + 4*64], m1 + shufps m1, m1, 0xb1 + movss [bufq + 4*68], m1 + RET +%endmacro + +INIT_XMM sse +DEFINE_IMDCT + +INIT_XMM sse2 +DEFINE_IMDCT + +INIT_XMM sse3 +DEFINE_IMDCT + +INIT_XMM ssse3 +DEFINE_IMDCT + +INIT_XMM avx +DEFINE_IMDCT + +INIT_XMM sse + +%ifdef ARCH_X86_64 +%define SPILL SWAP +%define UNSPILL SWAP +%define SPILLED(x) m %+ x +%else +%define SPILLED(x) [tmpq+(x-8)*16 + 32*4] +%macro SPILL 2 ; xmm#, mempos + movaps SPILLED(%2), m%1 +%endmacro +%macro UNSPILL 2 + movaps m%1, SPILLED(%2) +%endmacro +%endif + +%macro DEFINE_FOUR_IMDCT 0 +cglobal four_imdct36_float, 5,5,8, out, buf, in, win, tmp + movlps m0, [inq+64] + movhps m0, [inq+64 + 72] + movlps m3, [inq+64 + 2*72] + movhps m3, [inq+64 + 3*72] + + shufps m5, m0, m3, 0xdd + shufps m0, m0, m3, 0x88 + + mova m1, [inq+48] + movu m6, [inq+48 + 72] + mova m7, [inq+48 + 2*72] + movu m3, [inq+48 + 3*72] + + TRANSPOSE4x4PS 1, 6, 7, 3, 4 + + addps m4, m6, m7 + mova [tmpq+4*28], m4 + + addps m7, m3 + addps m6, m1 + addps m3, m0 + addps m0, m5 + addps m0, m7 + addps m7, m6 + mova [tmpq+4*12], m7 + SPILL 3, 12 + + mova m4, [inq+32] + movu m5, [inq+32 + 72] + mova m2, [inq+32 + 2*72] + movu m7, [inq+32 + 3*72] + + TRANSPOSE4x4PS 4, 5, 2, 7, 3 + + addps m1, m7 + SPILL 1, 11 + + addps m3, m5, m2 + SPILL 3, 13 + + addps m7, m2 + addps m5, m4 + addps m6, m7 + mova [tmpq], m6 + addps m7, m5 + mova [tmpq+4*16], m7 + + mova m2, [inq+16] + movu m7, [inq+16 + 72] + mova m1, [inq+16 + 2*72] + movu m6, [inq+16 + 3*72] + + TRANSPOSE4x4PS 2, 7, 1, 6, 3 + + addps m4, m6 + addps m6, m1 + addps m1, m7 + addps m7, m2 + addps m5, m6 + SPILL 5, 15 + addps m6, m7 + mulps m6, [costabs + 16*2] + mova [tmpq+4*8], m6 + SPILL 1, 10 + SPILL 0, 14 + + mova m1, [inq] + movu m6, [inq + 72] + mova m3, [inq + 2*72] + movu m5, [inq + 3*72] + + TRANSPOSE4x4PS 1, 6, 3, 5, 0 + + addps m2, m5 + addps m5, m3 + addps m7, m5 + addps m3, m6 + addps m6, m1 + SPILL 7, 8 + addps m5, m6 + SPILL 6, 9 + addps m6, m4, SPILLED(12) + subps m6, m2 + UNSPILL 7, 11 + SPILL 5, 11 + subps m5, m1, m7 + mulps m7, [costabs + 16*5] + addps m7, m1 + mulps m0, m6, [costabs + 16*6] + addps m0, m5 + mova [tmpq+4*24], m0 + addps m6, m5 + mova [tmpq+4*4], m6 + addps m6, m4, m2 + mulps m6, [costabs + 16*1] + subps m4, SPILLED(12) + mulps m4, [costabs + 16*8] + addps m2, SPILLED(12) + mulps m2, [costabs + 16*3] + subps m5, m7, m6 + subps m5, m2 + addps m6, m7 + addps m6, m4 + addps m7, m2 + subps m7, m4 + mova [tmpq+4*20], m7 + mova m2, [tmpq+4*28] + mova [tmpq+4*28], m5 + UNSPILL 7, 13 + subps m5, m7, m2 + mulps m5, [costabs + 16*7] + UNSPILL 1, 10 + mulps m1, [costabs + 16*2] + addps m4, m3, m2 + mulps m4, [costabs + 16*4] + addps m2, m7 + addps m7, m3 + mulps m7, [costabs] + subps m3, m2 + mulps m3, [costabs + 16*2] + addps m2, m7, m5 + addps m2, m1 + SPILL 2, 10 + addps m7, m4 + subps m7, m1 + SPILL 7, 12 + subps m5, m4 + subps m5, m1 + UNSPILL 0, 14 + SPILL 5, 13 + addps m1, m0, SPILLED(15) + subps m1, SPILLED(8) + mova m4, [costabs + 16*5] + mulps m4, [tmpq] + UNSPILL 2, 9 + addps m4, m2 + subps m2, [tmpq] + mulps m5, m1, [costabs + 16*6] + addps m5, m2 + SPILL 5, 9 + addps m2, m1 + SPILL 2, 14 + UNSPILL 5, 15 + subps m7, m5, m0 + addps m5, SPILLED(8) + mulps m5, [costabs + 16*1] + mulps m7, [costabs + 16*8] + addps m0, SPILLED(8) + mulps m0, [costabs + 16*3] + subps m2, m4, m5 + subps m2, m0 + SPILL 2, 15 + addps m5, m4 + addps m5, m7 + addps m4, m0 + subps m4, m7 + SPILL 4, 8 + mova m7, [tmpq+4*16] + mova m2, [tmpq+4*12] + addps m0, m7, m2 + subps m0, SPILLED(11) + mulps m0, [costabs + 16*2] + addps m4, m7, SPILLED(11) + mulps m4, [costabs] + subps m7, m2 + mulps m7, [costabs + 16*7] + addps m2, SPILLED(11) + mulps m2, [costabs + 16*4] + addps m1, m7, [tmpq+4*8] + addps m1, m4 + addps m4, m2 + subps m4, [tmpq+4*8] + SPILL 4, 11 + subps m7, m2 + subps m7, [tmpq+4*8] + addps m4, m6, SPILLED(10) + subps m6, SPILLED(10) + addps m2, m5, m1 + mulps m2, [costabs + 16*9] + subps m5, m1 + mulps m5, [costabs + 16*17] + subps m1, m4, m2 + addps m4, m2 + mulps m2, m1, [winq+4*36] + addps m2, [bufq+4*36] + mova [outq+1152], m2 + mulps m1, [winq+4*32] + addps m1, [bufq+4*32] + mova [outq+1024], m1 + mulps m1, m4, [winq+4*116] + mova [bufq+4*36], m1 + mulps m4, [winq+4*112] + mova [bufq+4*32], m4 + addps m2, m6, m5 + subps m6, m5 + mulps m1, m6, [winq+4*68] + addps m1, [bufq+4*68] + mova [outq+2176], m1 + mulps m6, [winq] + addps m6, [bufq] + mova [outq], m6 + mulps m1, m2, [winq+4*148] + mova [bufq+4*68], m1 + mulps m2, [winq+4*80] + mova [bufq], m2 + addps m5, m3, [tmpq+4*24] + mova m2, [tmpq+4*24] + subps m2, m3 + mova m1, SPILLED(9) + subps m1, m0 + mulps m1, [costabs + 16*10] + addps m0, SPILLED(9) + mulps m0, [costabs + 16*16] + addps m6, m5, m1 + subps m5, m1 + mulps m3, m5, [winq+4*40] + addps m3, [bufq+4*40] + mova [outq+1280], m3 + mulps m5, [winq+4*28] + addps m5, [bufq+4*28] + mova [outq+896], m5 + mulps m1, m6, [winq+4*120] + mova [bufq+4*40], m1 + mulps m6, [winq+4*108] + mova [bufq+4*28], m6 + addps m1, m2, m0 + subps m2, m0 + mulps m5, m2, [winq+4*64] + addps m5, [bufq+4*64] + mova [outq+2048], m5 + mulps m2, [winq+4*4] + addps m2, [bufq+4*4] + mova [outq+128], m2 + mulps m0, m1, [winq+4*144] + mova [bufq+4*64], m0 + mulps m1, [winq+4*84] + mova [bufq+4*4], m1 + mova m1, [tmpq+4*28] + mova m5, m1 + addps m1, SPILLED(13) + subps m5, SPILLED(13) + UNSPILL 3, 15 + addps m2, m7, m3 + mulps m2, [costabs + 16*11] + subps m3, m7 + mulps m3, [costabs + 16*15] + addps m0, m2, m1 + subps m1, m2 + SWAP m0, m2 + mulps m6, m1, [winq+4*44] + addps m6, [bufq+4*44] + mova [outq+1408], m6 + mulps m1, [winq+4*24] + addps m1, [bufq+4*24] + mova [outq+768], m1 + mulps m0, m2, [winq+4*124] + mova [bufq+4*44], m0 + mulps m2, [winq+4*104] + mova [bufq+4*24], m2 + addps m0, m5, m3 + subps m5, m3 + mulps m1, m5, [winq+4*60] + addps m1, [bufq+4*60] + mova [outq+1920], m1 + mulps m5, [winq+4*8] + addps m5, [bufq+4*8] + mova [outq+256], m5 + mulps m1, m0, [winq+4*140] + mova [bufq+4*60], m1 + mulps m0, [winq+4*88] + mova [bufq+4*8], m0 + mova m1, [tmpq+4*20] + addps m1, SPILLED(12) + mova m2, [tmpq+4*20] + subps m2, SPILLED(12) + UNSPILL 7, 8 + subps m0, m7, SPILLED(11) + addps m7, SPILLED(11) + mulps m4, m7, [costabs + 16*12] + mulps m0, [costabs + 16*14] + addps m5, m1, m4 + subps m1, m4 + mulps m7, m1, [winq+4*48] + addps m7, [bufq+4*48] + mova [outq+1536], m7 + mulps m1, [winq+4*20] + addps m1, [bufq+4*20] + mova [outq+640], m1 + mulps m1, m5, [winq+4*128] + mova [bufq+4*48], m1 + mulps m5, [winq+4*100] + mova [bufq+4*20], m5 + addps m6, m2, m0 + subps m2, m0 + mulps m1, m2, [winq+4*56] + addps m1, [bufq+4*56] + mova [outq+1792], m1 + mulps m2, [winq+4*12] + addps m2, [bufq+4*12] + mova [outq+384], m2 + mulps m0, m6, [winq+4*136] + mova [bufq+4*56], m0 + mulps m6, [winq+4*92] + mova [bufq+4*12], m6 + UNSPILL 0, 14 + mulps m0, [costabs + 16*13] + mova m3, [tmpq+4*4] + addps m2, m0, m3 + subps m3, m0 + mulps m0, m3, [winq+4*52] + addps m0, [bufq+4*52] + mova [outq+1664], m0 + mulps m3, [winq+4*16] + addps m3, [bufq+4*16] + mova [outq+512], m3 + mulps m0, m2, [winq+4*132] + mova [bufq+4*52], m0 + mulps m2, [winq+4*96] + mova [bufq+4*16], m2 + RET +%endmacro + +INIT_XMM sse +DEFINE_FOUR_IMDCT + +INIT_XMM avx +DEFINE_FOUR_IMDCT diff --git a/libavcodec/x86/mpegaudiodec_mmx.c b/libavcodec/x86/mpegaudiodec_mmx.c index b64461513e..06ffbca90a 100644 --- a/libavcodec/x86/mpegaudiodec_mmx.c +++ b/libavcodec/x86/mpegaudiodec_mmx.c @@ -24,6 +24,18 @@ #include "libavcodec/dsputil.h" #include "libavcodec/mpegaudiodsp.h" +void ff_imdct36_float_sse(float *out, float *buf, float *in, float *win); +void ff_imdct36_float_sse2(float *out, float *buf, float *in, float *win); +void ff_imdct36_float_sse3(float *out, float *buf, float *in, float *win); +void ff_imdct36_float_ssse3(float *out, float *buf, float *in, float *win); +void ff_imdct36_float_avx(float *out, float *buf, float *in, float *win); +void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win, + float *tmpbuf); +void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win, + float *tmpbuf); + +DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40]; + #define MACS(rt, ra, rb) rt+=(ra)*(rb) #define MLSS(rt, ra, rb) rt-=(ra)*(rb) @@ -147,11 +159,79 @@ static void apply_window_mp3(float *in, float *win, int *unused, float *out, *out = sum; } + +#define DECL_IMDCT_BLOCKS(CPU1, CPU2) \ +static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \ + int count, int switch_point, int block_type) \ +{ \ + int align_end = count - (count & 3); \ + int j; \ + for (j = 0; j < align_end; j+= 4) { \ + LOCAL_ALIGNED_16(float, tmpbuf, [1024]); \ + float *win = mdct_win_sse[switch_point && j < 4][block_type]; \ + /* apply window & overlap with previous buffer */ \ + \ + /* select window */ \ + ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf); \ + in += 4*18; \ + buf += 4*18; \ + out += 4; \ + } \ + for (; j < count; j++) { \ + /* apply window & overlap with previous buffer */ \ + \ + /* select window */ \ + int win_idx = (switch_point && j < 2) ? 0 : block_type; \ + float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))]; \ + \ + ff_imdct36_float_ ## CPU1(out, buf, in, win); \ + \ + in += 18; \ + buf++; \ + out++; \ + } \ +} + +DECL_IMDCT_BLOCKS(sse,sse) +DECL_IMDCT_BLOCKS(sse2,sse) +DECL_IMDCT_BLOCKS(sse3,sse) +DECL_IMDCT_BLOCKS(ssse3,sse) +DECL_IMDCT_BLOCKS(avx,avx) + void ff_mpadsp_init_mmx(MPADSPContext *s) { int mm_flags = av_get_cpu_flags(); + int i, j; + for (j = 0; j < 4; j++) { + for (i = 0; i < 40; i ++) { + mdct_win_sse[0][j][4*i ] = ff_mdct_win_float[j ][i]; + mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i]; + mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j ][i]; + mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i]; + mdct_win_sse[1][j][4*i ] = ff_mdct_win_float[0 ][i]; + mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4 ][i]; + mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j ][i]; + mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i]; + } + } + if (mm_flags & AV_CPU_FLAG_SSE2) { s->apply_window_float = apply_window_mp3; } +#if HAVE_YASM + if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) { + s->imdct36_blocks_float = imdct36_blocks_avx; +#if HAVE_SSE + } else if (mm_flags & AV_CPU_FLAG_SSSE3) { + s->imdct36_blocks_float = imdct36_blocks_ssse3; + } else if (mm_flags & AV_CPU_FLAG_SSE3) { + s->imdct36_blocks_float = imdct36_blocks_sse3; + } else if (mm_flags & AV_CPU_FLAG_SSE2) { + s->imdct36_blocks_float = imdct36_blocks_sse2; + } else if (mm_flags & AV_CPU_FLAG_SSE) { + s->imdct36_blocks_float = imdct36_blocks_sse; +#endif /* HAVE_SSE */ + } +#endif /* HAVE_YASM */ } diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm index 475e70e3fc..6941c1ac2f 100644 --- a/libavutil/x86/x86inc.asm +++ b/libavutil/x86/x86inc.asm @@ -916,6 +916,8 @@ AVX_INSTR minpd, 1, 0, 1 AVX_INSTR minps, 1, 0, 1 AVX_INSTR minsd, 1, 0, 1 AVX_INSTR minss, 1, 0, 1 +AVX_INSTR movhlps, 1, 0, 0 +AVX_INSTR movlhps, 1, 0, 0 AVX_INSTR movsd, 1, 0, 0 AVX_INSTR movss, 1, 0, 0 AVX_INSTR mpsadbw, 0, 1, 0 -- cgit v1.2.3