summaryrefslogtreecommitdiff
path: root/libavcodec/x86/vp9dsp_init.c
diff options
context:
space:
mode:
author: Ronald S. Bultje <rsbultje@gmail.com> 2014-01-16 07:30:15 -0500
committer: Clément Bœsch <clement@stupeflix.com> 2014-01-16 13:49:31 +0100
commit: 8173d1ffc0b742972db6c9cffb2cec204e2a1a96 (patch)
tree: 0636fe8f54d6a7ed31813716815103a1fc855a9f /libavcodec/x86/vp9dsp_init.c
parent: a64333db5fefdec89533b6bec299fe169ef7d77d (diff)
vp9/x86: 16x16 iadst_idct, idct_iadst and iadst_iadst (ssse3+avx).
Sample timings on ped1080p.webm (of the ssse3 functions): iadst_idct: 4672 -> 1175 cycles; idct_iadst: 4736 -> 1263 cycles; iadst_iadst: 4924 -> 1438 cycles. Total decoding time changed from 6.565s to 6.413s.
Diffstat (limited to 'libavcodec/x86/vp9dsp_init.c')
-rw-r--r--libavcodec/x86/vp9dsp_init.c34
1 files changed, 26 insertions, 8 deletions
diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index 3651641386..900efb3a0b 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -157,13 +157,25 @@ filters_8tap_1d_fn3(avg)
#undef filters_8tap_1d_fn3
#undef filter_8tap_1d_fn
-void ff_vp9_idct_idct_4x4_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
-void ff_vp9_idct_idct_8x8_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
-void ff_vp9_idct_idct_8x8_add_avx (uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
-void ff_vp9_idct_idct_16x16_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
-void ff_vp9_idct_idct_16x16_add_avx (uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
-void ff_vp9_idct_idct_32x32_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
-void ff_vp9_idct_idct_32x32_add_avx (uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+#define itxfm_func(typea, typeb, size, opt) \
+void ff_vp9_##typea##_##typeb##_##size##x##size##_add_##opt(uint8_t *dst, ptrdiff_t stride, \
+ int16_t *block, int eob)
+#define itxfm_funcs(size, opt) \
+itxfm_func(idct, idct, size, opt); \
+itxfm_func(iadst, idct, size, opt); \
+itxfm_func(idct, iadst, size, opt); \
+itxfm_func(iadst, iadst, size, opt)
+
+itxfm_func(idct, idct, 4, ssse3);
+itxfm_func(idct, idct, 8, ssse3);
+itxfm_func(idct, idct, 8, avx);
+itxfm_funcs(16, ssse3);
+itxfm_funcs(16, avx);
+itxfm_func(idct, idct, 32, ssse3);
+itxfm_func(idct, idct, 32, avx);
+
+#undef itxfm_func
+#undef itxfm_funcs
void ff_vp9_loop_filter_v_16_16_ssse3(uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
void ff_vp9_loop_filter_v_16_16_avx (uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
@@ -226,7 +238,10 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_ssse3;
if (ARCH_X86_64) {
dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_ssse3;
- dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_ssse3;
+ dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_ssse3;
+ dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_ssse3;
+ dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_ssse3;
+ dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_ssse3;
dsp->itxfm_add[TX_32X32][ADST_ADST] =
dsp->itxfm_add[TX_32X32][ADST_DCT] =
dsp->itxfm_add[TX_32X32][DCT_ADST] =
@@ -240,6 +255,9 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
if (ARCH_X86_64) {
dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_avx;
dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx;
+ dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx;
+ dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx;
+ dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx;
dsp->itxfm_add[TX_32X32][ADST_ADST] =
dsp->itxfm_add[TX_32X32][ADST_DCT] =
dsp->itxfm_add[TX_32X32][DCT_ADST] =