5 files changed, 93 insertions, 285 deletions
diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
index ba42984fb6..03d4246ca0 100644
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@ -56,8 +56,6 @@ typedef struct HEVCDSPContext {
 
     void (*idct_dc[4])(int16_t *coeffs);
 
-    void (*transform_dc_add[4])(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride);
-
     void (*sao_band_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
                             struct SAOParams *sao, int *borders,
                             int width, int height, int c_idx);
diff --git a/libavcodec/x86/hevc_idct.asm b/libavcodec/x86/hevc_idct.asm
index c97c82a8e6..7a44f0830c 100644
--- a/libavcodec/x86/hevc_idct.asm
+++ b/libavcodec/x86/hevc_idct.asm
@@ -1,6 +1,7 @@
 ; /*
-; * Provide SSE & MMX idct functions for HEVC decoding
+; * SIMD optimized idct functions for HEVC decoding
 ; * Copyright (c) 2014 Pierre-Edouard LEPERE
+; * Copyright (c) 2014 James Almer
 ; *
 ; * This file is part of FFmpeg.
 ; *
@@ -20,206 +21,86 @@
 ; */
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA 32
-max_pixels_10:          times 16  dw ((1 << 10)-1)
-dc_add_10:              times 4 dd ((1 << 14-10) + 1)
-
-
 SECTION_TEXT 32
 
-;the idct_dc_add macros and functions were largely inspired by x264 project's code in the h264_idct.asm file
-
-%macro DC_ADD_INIT 2
-    add              %1w, ((1 << 14-8) + 1)
-    sar              %1w, (15-8)
-    movd              m0, %1d
-    lea               %1, [%2*3]
-    SPLATW            m0, m0, 0
-    pxor              m1, m1
-    psubw             m1, m0
-    packuswb          m0, m0
-    packuswb          m1, m1
-%endmacro
-
-%macro DC_ADD_INIT_AVX2 2
-    add              %1w, ((1 << 14-8) + 1)
-    sar              %1w, (15-8)
-    movd             xm0, %1d
-    vpbroadcastw      m0, xm0    ;SPLATW
-    lea               %1, [%2*3]
-    pxor              m1, m1
-    psubw             m1, m0
-    packuswb          m0, m0
-    packuswb          m1, m1
+; void ff_hevc_idct<N>x<N>_dc_<bitdepth>_<opt>(int16_t *coeffs)
+; %1 = N; the block is NxN int16 and is filled with (coeffs[0] + rnd) >> shift
+; %2 = number of loops; each stores 8*mmsize bytes, %2*8*mmsize must be 2*N*N
+; %3 = bitdepth; rnd = (1 << (14-%3)) + 1, shift = 15-%3
+%macro IDCT_DC 3
+cglobal hevc_idct%1x%1_dc_%3, 1, 2, 1, coeff, tmp
+    movsx             tmpd, word [coeffq]       ; sign-extend the DC coefficient
+    add               tmpd, (1 << (14-%3)) + 1  ; 32-bit: a 16-bit add would wrap near INT16_MAX
+    sar               tmpd, (15-%3)
+    movd               xm0, tmpd
+    SPLATW              m0, xm0                 ; broadcast the low word to every lane
+    DEFINE_ARGS coeff, cnt                      ; tmp is dead, reuse its register as counter
+    mov               cntd, %2
+.loop:
+    mova [coeffq+mmsize*0], m0
+    mova [coeffq+mmsize*1], m0
+    mova [coeffq+mmsize*2], m0
+    mova [coeffq+mmsize*3], m0
+    mova [coeffq+mmsize*4], m0
+    mova [coeffq+mmsize*5], m0
+    mova [coeffq+mmsize*6], m0
+    mova [coeffq+mmsize*7], m0
+    add  coeffq, mmsize*8
+    dec  cntd
+    jg  .loop
+    RET
 %endmacro
 
-%macro DC_ADD_OP 4
-    %1                m2, [%2     ]
-    %1                m3, [%2+%3  ]
-    %1                m4, [%2+%3*2]
-    %1                m5, [%2+%4  ]
-    paddusb           m2, m0
-    paddusb           m3, m0
-    paddusb           m4, m0
-    paddusb           m5, m0
-    psubusb           m2, m1
-    psubusb           m3, m1
-    psubusb           m4, m1
-    psubusb           m5, m1
-    %1         [%2     ], m2
-    %1         [%2+%3  ], m3
-    %1         [%2+%3*2], m4
-    %1         [%2+%4  ], m5
+; %1 = N (block is NxN int16), %2 = bitdepth; fills it with (coeffs[0] + rnd) >> shift
+; Unrolled: 4 stores when mmsize == 8 (4x4), 8 stores when mmsize == 16 (8x8)
+%macro IDCT_DC_NL 2 ; No loop
+cglobal hevc_idct%1x%1_dc_%2, 1, 2, 1, coeff, tmp
+    movsx             tmpd, word [coeffq]       ; sign-extend the DC coefficient
+    add               tmpd, (1 << (14-%2)) + 1  ; 32-bit: a 16-bit add would wrap near INT16_MAX
+    sar               tmpd, (15-%2)
+    movd               xm0, tmpd
+    SPLATW              m0, xm0                 ; broadcast the low word to every lane
+    mova [coeffq+mmsize*0], m0
+    mova [coeffq+mmsize*1], m0
+    mova [coeffq+mmsize*2], m0
+    mova [coeffq+mmsize*3], m0
+%if mmsize == 16
+    mova [coeffq+mmsize*4], m0
+    mova [coeffq+mmsize*5], m0
+    mova [coeffq+mmsize*6], m0
+    mova [coeffq+mmsize*7], m0
+%endif
+    RET
 %endmacro
 
+; 8-bit
 INIT_MMX mmxext
-; void ff_hevc_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-%if ARCH_X86_64
-cglobal hevc_idct4_dc_add_8, 3, 4, 0
-    movsx             r3, word [r1]
-    DC_ADD_INIT       r3, r2
-    DC_ADD_OP       movh, r0, r2, r3
-    RET
-
-; void ff_hevc_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_idct8_dc_add_8, 3, 4, 0
-    movsx             r3, word [r1]
-    DC_ADD_INIT       r3, r2
-    DC_ADD_OP       mova, r0, r2, r3
-    lea               r0, [r0+r2*4]
-    DC_ADD_OP       mova, r0, r2, r3
-    RET
-%else
-; void ff_hevc_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_idct4_dc_add_8, 2, 3, 0
-    movsx             r2, word [r1]
-    mov               r1, r2m
-    DC_ADD_INIT       r2, r1
-    DC_ADD_OP       movh, r0, r1, r2
-    RET
-
-; void ff_hevc_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_idct8_dc_add_8, 2, 3, 0
-    movsx             r2, word [r1]
-    mov               r1, r2m
-    DC_ADD_INIT       r2, r1
-    DC_ADD_OP       mova, r0, r1, r2
-    lea               r0, [r0+r1*4]
-    DC_ADD_OP       mova, r0, r1, r2
-    RET
-%endif
-
+IDCT_DC_NL  4,      8
+IDCT_DC     8,  2,  8
 
 INIT_XMM sse2
-; void ff_hevc_idct16_dc_add_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_idct16_dc_add_8, 3, 4, 6
-    movsx             r3, word [r1]
-    DC_ADD_INIT       r3, r2
-    DC_ADD_OP       mova, r0, r2, r3
-    lea               r0, [r0+r2*4]
-    DC_ADD_OP       mova, r0, r2, r3
-    lea               r0, [r0+r2*4]
-    DC_ADD_OP       mova, r0, r2, r3
-    lea               r0, [r0+r2*4]
-    DC_ADD_OP       mova, r0, r2, r3
-    RET
+IDCT_DC_NL  8,      8
+IDCT_DC    16,  4,  8
+IDCT_DC    32, 16,  8
 
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
-; void ff_hevc_idct32_dc_add_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-cglobal hevc_idct32_dc_add_8, 3, 4, 6
-    movsx             r3, word [r1]
-    DC_ADD_INIT_AVX2  r3, r2
-    DC_ADD_OP       mova, r0, r2, r3,
- %rep 7
-    lea               r0, [r0+r2*4]
-    DC_ADD_OP       mova, r0, r2, r3
-%endrep
-    RET
+IDCT_DC    16,  2,  8
+IDCT_DC    32,  8,  8
 %endif ;HAVE_AVX2_EXTERNAL
-;-----------------------------------------------------------------------------
-; void ff_hevc_idct_dc_add_10(pixel *dst, int16_t *block, int stride)
-;-----------------------------------------------------------------------------
-%macro IDCT_DC_ADD_OP_10 3
-    pxor              m5, m5
-%if avx_enabled
-    paddw             m1, m0, [%1+0   ]
-    paddw             m2, m0, [%1+%2  ]
-    paddw             m3, m0, [%1+%2*2]
-    paddw             m4, m0, [%1+%3  ]
-%else
-    mova              m1, [%1+0   ]
-    mova              m2, [%1+%2  ]
-    mova              m3, [%1+%2*2]
-    mova              m4, [%1+%3  ]
-    paddw             m1, m0
-    paddw             m2, m0
-    paddw             m3, m0
-    paddw             m4, m0
-%endif
-    CLIPW             m1, m5, m6
-    CLIPW             m2, m5, m6
-    CLIPW             m3, m5, m6
-    CLIPW             m4, m5, m6
-    mova       [%1+0   ], m1
-    mova       [%1+%2  ], m2
-    mova       [%1+%2*2], m3
-    mova       [%1+%3  ], m4
-%endmacro
 
+; 10-bit
 INIT_MMX mmxext
-cglobal hevc_idct4_dc_add_10,3,3
-    mov              r1w, [r1]
-    add              r1w, ((1 << 4) + 1)
-    sar              r1w, 5
-    movd              m0, r1d
-    lea               r1, [r2*3]
-    SPLATW            m0, m0, 0
-    mova              m6, [max_pixels_10]
-    IDCT_DC_ADD_OP_10 r0, r2, r1
-    RET
-
-;-----------------------------------------------------------------------------
-; void ff_hevc_idct8_dc_add_10(pixel *dst, int16_t *block, int stride)
-;-----------------------------------------------------------------------------
-%macro IDCT8_DC_ADD 0
-cglobal hevc_idct8_dc_add_10,3,4,7
-    mov              r1w, [r1]
-    add              r1w, ((1 << 4) + 1)
-    sar              r1w, 5
-    movd              m0, r1d
-    lea               r1, [r2*3]
-    SPLATW            m0, m0, 0
-    mova              m6, [max_pixels_10]
-    IDCT_DC_ADD_OP_10 r0, r2, r1
-    lea               r0, [r0+r2*4]
-    IDCT_DC_ADD_OP_10 r0, r2, r1
-    RET
-%endmacro
+IDCT_DC_NL  4,     10
+IDCT_DC     8,  2, 10
 
 INIT_XMM sse2
-IDCT8_DC_ADD
-%if HAVE_AVX_EXTERNAL
-INIT_XMM avx
-IDCT8_DC_ADD
-%endif
+IDCT_DC_NL  8,     10
+IDCT_DC    16,  4, 10
+IDCT_DC    32, 16, 10
 
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
-cglobal hevc_idct16_dc_add_10,3,4,7
-    mov              r1w, [r1]
-    add              r1w, ((1 << 4) + 1)
-    sar              r1w, 5
-    movd             xm0, r1d
-    lea               r1, [r2*3]
-    vpbroadcastw      m0, xm0    ;SPLATW
-    mova              m6, [max_pixels_10]
-    IDCT_DC_ADD_OP_10 r0, r2, r1
-    lea               r0, [r0+r2*4]
-    IDCT_DC_ADD_OP_10 r0, r2, r1
-    lea               r0, [r0+r2*4]
-    IDCT_DC_ADD_OP_10 r0, r2, r1
-    lea               r0, [r0+r2*4]
-    IDCT_DC_ADD_OP_10 r0, r2, r1
-    RET
-%endif ;HAVE_AVX_EXTERNAL
+IDCT_DC    16,  2, 10
+IDCT_DC    32,  8, 10
+%endif ;HAVE_AVX2_EXTERNAL
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
index 6062d8e055..4bcc8dcc12 100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@ -131,32 +131,4 @@ WEIGHTING_PROTOTYPES(8, sse4);
 WEIGHTING_PROTOTYPES(10, sse4);
 WEIGHTING_PROTOTYPES(12, sse4);
 
-///////////////////////////////////////////////////////////////////////////////
-// IDCT
-///////////////////////////////////////////////////////////////////////////////
-
-
-idct_dc_proto(4, 8,mmxext);
-idct_dc_proto(8, 8,mmxext);
-idct_dc_proto(16,8,  sse2);
-idct_dc_proto(32,8,  sse2);
-
-idct_dc_proto(32,8,  avx2);
-
-
-idct_dc_proto(4, 10,mmxext);
-idct_dc_proto(8, 10,  sse2);
-idct_dc_proto(16,10,  sse2);
-idct_dc_proto(32,10,  sse2);
-idct_dc_proto(8, 10,   avx);
-idct_dc_proto(16,10,   avx);
-idct_dc_proto(32,10,   avx);
-
-idct_dc_proto(16,10,  avx2);
-idct_dc_proto(32,10,  avx2);
-
-
-
-
-
 #endif // AVCODEC_X86_HEVCDSP_H
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 6fb94aaf0b..fb3357bef6 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -54,59 +54,17 @@ LFL_FUNCS(uint8_t,   8, ssse3)
 LFL_FUNCS(uint8_t,  10, ssse3)
 LFL_FUNCS(uint8_t,  12, ssse3)
 
-#if HAVE_SSE2_EXTERNAL
-void ff_hevc_idct32_dc_add_8_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-{
-    ff_hevc_idct16_dc_add_8_sse2(dst, coeffs, stride);
-    ff_hevc_idct16_dc_add_8_sse2(dst+16, coeffs, stride);
-    ff_hevc_idct16_dc_add_8_sse2(dst+16*stride, coeffs, stride);
-    ff_hevc_idct16_dc_add_8_sse2(dst+16*stride+16, coeffs, stride);
-}
-
-void ff_hevc_idct16_dc_add_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-{
-    ff_hevc_idct8_dc_add_10_sse2(dst, coeffs, stride);
-    ff_hevc_idct8_dc_add_10_sse2(dst+16, coeffs, stride);
-    ff_hevc_idct8_dc_add_10_sse2(dst+8*stride, coeffs, stride);
-    ff_hevc_idct8_dc_add_10_sse2(dst+8*stride+16, coeffs, stride);
-}
-
-void ff_hevc_idct32_dc_add_10_sse2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-{
-    ff_hevc_idct16_dc_add_10_sse2(dst, coeffs, stride);
-    ff_hevc_idct16_dc_add_10_sse2(dst+32, coeffs, stride);
-    ff_hevc_idct16_dc_add_10_sse2(dst+16*stride, coeffs, stride);
-    ff_hevc_idct16_dc_add_10_sse2(dst+16*stride+32, coeffs, stride);
-}
-#endif //HAVE_SSE2_EXTERNAL
-#if HAVE_AVX_EXTERNAL
-void ff_hevc_idct16_dc_add_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-{
-    ff_hevc_idct8_dc_add_10_avx(dst, coeffs, stride);
-    ff_hevc_idct8_dc_add_10_avx(dst+16, coeffs, stride);
-    ff_hevc_idct8_dc_add_10_avx(dst+8*stride, coeffs, stride);
-    ff_hevc_idct8_dc_add_10_avx(dst+8*stride+16, coeffs, stride);
-}
-
-void ff_hevc_idct32_dc_add_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-{
-    ff_hevc_idct16_dc_add_10_avx(dst, coeffs, stride);
-    ff_hevc_idct16_dc_add_10_avx(dst+32, coeffs, stride);
-    ff_hevc_idct16_dc_add_10_avx(dst+16*stride, coeffs, stride);
-    ff_hevc_idct16_dc_add_10_avx(dst+16*stride+32, coeffs, stride);
-}
-#endif //HAVE_AVX_EXTERNAL
-
-#if HAVE_AVX2_EXTERNAL
-
-void ff_hevc_idct32_dc_add_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
-{
-    ff_hevc_idct16_dc_add_10_avx2(dst, coeffs, stride);
-    ff_hevc_idct16_dc_add_10_avx2(dst+32, coeffs, stride);
-    ff_hevc_idct16_dc_add_10_avx2(dst+16*stride, coeffs, stride);
-    ff_hevc_idct16_dc_add_10_avx2(dst+16*stride+32, coeffs, stride);
-}
-#endif //HAVE_AVX2_EXTERNAL
+#define IDCT_FUNCS(W, opt) \
+void ff_hevc_idct##W##_dc_8_##opt(int16_t *coeffs); \
+void ff_hevc_idct##W##_dc_10_##opt(int16_t *coeffs)
+/* Declare the 8- and 10-bit ff_hevc_idct<W>_dc_<depth>_<opt> prototypes for each size/opt pair. */
+IDCT_FUNCS(4x4,   mmxext);
+IDCT_FUNCS(8x8,   mmxext);
+IDCT_FUNCS(8x8,   sse2);
+IDCT_FUNCS(16x16, sse2);
+IDCT_FUNCS(32x32, sse2);
+IDCT_FUNCS(16x16, avx2);
+IDCT_FUNCS(32x32, avx2);
 
 #define mc_rep_func(name, bitd, step, W, opt) \
 void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst, ptrdiff_t dststride,                            \
@@ -504,8 +462,8 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
 
     if (bit_depth == 8) {
         if (EXTERNAL_MMXEXT(mm_flags)) {
-            c->transform_dc_add[0]    =  ff_hevc_idct4_dc_add_8_mmxext;
-            c->transform_dc_add[1]    =  ff_hevc_idct8_dc_add_8_mmxext;
+            c->idct_dc[0] = ff_hevc_idct4x4_dc_8_mmxext;
+            c->idct_dc[1] = ff_hevc_idct8x8_dc_8_mmxext;
         }
         if (EXTERNAL_SSE2(mm_flags)) {
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
@@ -515,8 +473,9 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
                 c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_sse2;
             }
 
-            c->transform_dc_add[2]    =  ff_hevc_idct16_dc_add_8_sse2;
-            c->transform_dc_add[3]    =  ff_hevc_idct32_dc_add_8_sse2;
+            c->idct_dc[1] = ff_hevc_idct8x8_dc_8_sse2;
+            c->idct_dc[2] = ff_hevc_idct16x16_dc_8_sse2;
+            c->idct_dc[3] = ff_hevc_idct32x32_dc_8_sse2;
         }
         if (EXTERNAL_SSSE3(mm_flags) && ARCH_X86_64) {
             c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
@@ -535,12 +494,13 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    8, sse4);
         }
         if (EXTERNAL_AVX2(mm_flags)) {
-            c->transform_dc_add[3]    =  ff_hevc_idct32_dc_add_8_avx2;
+            c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2;
+            c->idct_dc[3] = ff_hevc_idct32x32_dc_8_avx2;
         }
     } else if (bit_depth == 10) {
         if (EXTERNAL_MMXEXT(mm_flags)) {
-            c->transform_dc_add[0]    =  ff_hevc_idct4_dc_add_10_mmxext;
-
+            c->idct_dc[0] = ff_hevc_idct4x4_dc_10_mmxext;
+            c->idct_dc[1] = ff_hevc_idct8x8_dc_10_mmxext;
         }
         if (EXTERNAL_SSE2(mm_flags)) {
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
@@ -550,9 +510,9 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
                 c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_sse2;
             }
 
-            c->transform_dc_add[1]    =  ff_hevc_idct8_dc_add_10_sse2;
-            c->transform_dc_add[2]    =  ff_hevc_idct16_dc_add_10_sse2;
-            c->transform_dc_add[3]    =  ff_hevc_idct32_dc_add_10_sse2;
+            c->idct_dc[1] = ff_hevc_idct8x8_dc_10_sse2;
+            c->idct_dc[2] = ff_hevc_idct16x16_dc_10_sse2;
+            c->idct_dc[3] = ff_hevc_idct32x32_dc_10_sse2;
         }
         if (EXTERNAL_SSSE3(mm_flags) && ARCH_X86_64) {
             c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
@@ -569,14 +529,9 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     10, sse4);
             QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    10, sse4);
         }
-        if (EXTERNAL_AVX(mm_flags)) {
-            c->transform_dc_add[1]    =  ff_hevc_idct8_dc_add_10_avx;
-            c->transform_dc_add[2]    =  ff_hevc_idct16_dc_add_10_avx;
-            c->transform_dc_add[3]    =  ff_hevc_idct32_dc_add_10_avx;
-        }
         if (EXTERNAL_AVX2(mm_flags)) {
-            c->transform_dc_add[2]    =  ff_hevc_idct16_dc_add_10_avx2;
-            c->transform_dc_add[3]    =  ff_hevc_idct32_dc_add_10_avx2;
+            c->idct_dc[2] = ff_hevc_idct16x16_dc_10_avx2;
+            c->idct_dc[3] = ff_hevc_idct32x32_dc_10_avx2;
 
         }
     } else if (bit_depth == 12) {
diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
index 9fb4778547..824e449d24 100644
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -598,7 +598,9 @@
 %endmacro
 
 %macro SPLATW 2-3 0
-%if mmsize == 16
+%if cpuflag(avx2) && %3 == 0
+    vpbroadcastw %1, %2
+%elif mmsize == 16
     pshuflw    %1, %2, (%3)*0x55
     punpcklqdq %1, %1
 %elif cpuflag(mmxext)