author    Christophe Gisquet <christophe.gisquet@gmail.com>  2012-01-19 21:48:39 +0100
committer Diego Biurrun <diego@biurrun.de>                   2012-01-30 10:19:55 +0100
commit    6b039003822a03add20c7ba91fc857dca52b0a03 (patch)
tree      66ed7686c3377bce8accbed2fbc471c9a5931dbb /libavcodec/x86
parent    a846202343af7c56bf444ec47d4bb26a5d2b83ce (diff)
x86 dsputil: provide SSE2/SSSE3 versions of bswap_buf
While pshufb allows emulating bswap on XMM registers for SSSE3, more
shuffling is needed for SSE2. Alignment is critical, so specific
codepaths are provided for this case.

For the huffyuv sequence "angels_480-huffyuvcompress.avi":
C (using bswap instruction): ~ 55k cycles
SSE2:                        ~ 40k cycles
SSSE3 using unaligned loads: ~ 35k cycles
SSSE3 using aligned loads:   ~ 30k cycles

Signed-off-by: Diego Biurrun <diego@biurrun.de>
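The two strategies map directly onto C intrinsics. As an illustrative
sketch only (these helper names are hypothetical and not part of the
patch):

#include <emmintrin.h>   /* SSE2 */
#include <tmmintrin.h>   /* SSSE3: _mm_shuffle_epi8 (pshufb) */

/* SSSE3: a single pshufb with a byte-reversal mask bswaps all four
 * dwords of an XMM register at once. */
static __m128i bswap32_ssse3(__m128i v)
{
    const __m128i mask = _mm_set_epi8(12, 13, 14, 15,  8,  9, 10, 11,
                                       4,  5,  6,  7,  0,  1,  2,  3);
    return _mm_shuffle_epi8(v, mask);
}

/* SSE2 has no byte shuffle, hence the "more shuffling": first swap the
 * two 16-bit words of each dword (pshuflw/pshufhw), then swap the two
 * bytes of each word with shifts and an OR. */
static __m128i bswap32_sse2(__m128i v)
{
    v = _mm_shufflelo_epi16(v, _MM_SHUFFLE(2, 3, 0, 1)); /* 10110001b */
    v = _mm_shufflehi_epi16(v, _MM_SHUFFLE(2, 3, 0, 1));
    return _mm_or_si128(_mm_slli_epi16(v, 8), _mm_srli_epi16(v, 8));
}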
Diffstat (limited to 'libavcodec/x86')
-rw-r--r--  libavcodec/x86/dsputil_mmx.c      5
-rw-r--r--  libavcodec/x86/dsputil_yasm.asm 123
2 files changed, 128 insertions(+), 0 deletions(-)
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 7cc16c0f82..c40cab51f0 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2313,6 +2313,9 @@ void ff_apply_window_int16_ssse3 (int16_t *output, const int16_t *input,
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
+void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
+void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
+
void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
@@ -2798,6 +2801,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->apply_window_int16 = ff_apply_window_int16_sse2;
}
}
+ c->bswap_buf = ff_bswap32_buf_sse2;
#endif
}
if (mm_flags & AV_CPU_FLAG_SSSE3) {
@@ -2810,6 +2814,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) { // cachesplit
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
}
+ c->bswap_buf = ff_bswap32_buf_ssse3;
#endif
}
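Note that both hunks assign c->bswap_buf inside the same init function:
the SSE2 block runs first and the SSSE3 block, when its flag is set,
overwrites the pointer, so the fastest supported version wins. A minimal
sketch of that pattern, assuming a scalar fallback named bswap32_buf_c
(hypothetical; this is not the actual dsputil code):

void (*bswap_buf)(uint32_t *dst, const uint32_t *src, int w) = bswap32_buf_c;
if (mm_flags & AV_CPU_FLAG_SSE2)
    bswap_buf = ff_bswap32_buf_sse2;
if (mm_flags & AV_CPU_FLAG_SSSE3)  /* later check overrides the earlier one */
    bswap_buf = ff_bswap32_buf_ssse3;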
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 4607ff15d7..611f5c8a72 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -30,6 +30,7 @@ pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_revwords: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
pd_16384: times 4 dd 16384
+pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
SECTION_TEXT
@@ -1178,3 +1179,125 @@ INIT_XMM sse
BUTTERFLIES_FLOAT_INTERLEAVE
INIT_YMM avx
BUTTERFLIES_FLOAT_INTERLEAVE
+
+INIT_XMM sse2
+; %1 = aligned/unaligned
+%macro BSWAP_LOOPS_SSE2 1
+ mov r3, r2
+ sar r2, 3
+ jz .left4_%1
+.loop8_%1:
+ mov%1 m0, [r1 + 0]
+ mov%1 m1, [r1 + 16]
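+ ; 10110001b swaps the two 16-bit words inside each dword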
+ pshuflw m0, m0, 10110001b
+ pshuflw m1, m1, 10110001b
+ pshufhw m0, m0, 10110001b
+ pshufhw m1, m1, 10110001b
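+ ; then swap the two bytes of each word: copy, shift by 8 both ways, OR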
+ mova m2, m0
+ mova m3, m1
+ psllw m0, 8
+ psllw m1, 8
+ psrlw m2, 8
+ psrlw m3, 8
+ por m2, m0
+ por m3, m1
+ mova [r0 + 0], m2
+ mova [r0 + 16], m3
+ add r1, 32
+ add r0, 32
+ dec r2
+ jnz .loop8_%1
+.left4_%1:
+ mov r2, r3
+ and r3, 4
+ jz .left
+ mov%1 m0, [r1]
+ pshuflw m0, m0, 10110001b
+ pshufhw m0, m0, 10110001b
+ mova m2, m0
+ psllw m0, 8
+ psrlw m2, 8
+ por m2, m0
+ mova [r0], m2
+ add r1, 16
+ add r0, 16
+%endmacro
+
+; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
+cglobal bswap32_buf, 3,4,5
+ mov r3, r1
+ and r3, 15
+ jz .start_align
+ BSWAP_LOOPS_SSE2 u
+ jmp .left
+.start_align:
+ BSWAP_LOOPS_SSE2 a
+.left:
+ and r2, 3
+ jz .end
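+ ; 1 to 3 dwords remain; bswap them one at a time with the scalar instruction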
+.loop2:
+ mov r3d, [r1]
+ bswap r3d
+ mov [r0], r3d
+ add r1, 4
+ add r0, 4
+ dec r2
+ jnz .loop2
+.end:
+ RET
+
+; %1 = aligned/unaligned
+%macro BSWAP_LOOPS_SSSE3 1
+ mov r3, r2
+ sar r2, 3
+ jz .left4_%1
+.loop8_%1:
+ mov%1 m0, [r1 + 0]
+ mov%1 m1, [r1 + 16]
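+ ; m2 holds pb_bswap32: one pshufb reverses the four bytes of every dword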
+ pshufb m0, m2
+ pshufb m1, m2
+ mova [r0 + 0], m0
+ mova [r0 + 16], m1
+ add r0, 32
+ add r1, 32
+ dec r2
+ jnz .loop8_%1
+.left4_%1:
+ mov r2, r3
+ and r3, 4
+ jz .left2
+ mov%1 m0, [r1]
+ pshufb m0, m2
+ mova [r0], m0
+ add r1, 16
+ add r0, 16
+%endmacro
+
+INIT_XMM ssse3
+; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
+cglobal bswap32_buf, 3,4,3
+ mov r3, r1
+ mova m2, [pb_bswap32]
+ and r3, 15
+ jz .start_align
+ BSWAP_LOOPS_SSSE3 u
+ jmp .left2
+.start_align:
+ BSWAP_LOOPS_SSSE3 a
+.left2:
+ mov r3, r2
+ and r2, 2
+ jz .left1
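+ ; two dwords remain: an 8-byte load/store and the same mask handle the low half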
+ movq m0, [r1]
+ pshufb m0, m2
+ movq [r0], m0
+ add r1, 8
+ add r0, 8
+.left1:
+ and r3, 1
+ jz .end
+ mov r2d, [r1]
+ bswap r2d
+ mov [r0], r2d
+.end:
+ RET
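A throwaway harness along these lines can sanity-check the new versions
against the scalar behavior (hypothetical test code, not part of the
patch):

#include <stdint.h>
#include <stdio.h>

/* Scalar reference, equivalent to what c->bswap_buf computed in C. */
static void bswap32_buf_c(uint32_t *dst, const uint32_t *src, int w)
{
    for (int i = 0; i < w; i++) {
        uint32_t v = src[i];
        dst[i] = (v >> 24) | ((v >> 8) & 0x0000ff00u) |
                 ((v << 8) & 0x00ff0000u) | (v << 24);
    }
}

int main(void)
{
    uint32_t src[7], ref[7];
    for (int i = 0; i < 7; i++)
        src[i] = 0x01020304u + i;
    /* An odd count such as 7 exercises the tail paths after the
     * 8-dword main loop; compare ff_bswap32_buf_sse2/ssse3 output
     * against ref in a real test. */
    bswap32_buf_c(ref, src, 7);
    printf("%08x -> %08x\n", src[0], ref[0]); /* 01020304 -> 04030201 */
    return 0;
}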