summary refs log tree commit diff
path: root/libavcodec/x86/bswapdsp.asm
diff options
context:
space:
mode:
author Diego Biurrun <diego@biurrun.de> 2014-02-13 17:57:05 +0100
committer Diego Biurrun <diego@biurrun.de> 2014-06-22 18:22:31 -0700
commit c67b449bebbe0b35c73b203683e77a0a649bc765 (patch)
tree fef2691cbb548198024dbc1461419dfdd9d3fea2 /libavcodec/x86/bswapdsp.asm
parent 7b9ef8d701c319c26f7d0664fe977e176764c74e (diff)
dsputil: Split bswap*_buf() off into a separate context
Diffstat (limited to 'libavcodec/x86/bswapdsp.asm')
-rw-r--r-- libavcodec/x86/bswapdsp.asm 135
1 files changed, 135 insertions, 0 deletions
diff --git a/libavcodec/x86/bswapdsp.asm b/libavcodec/x86/bswapdsp.asm
new file mode 100644
index 0000000000..17a6cb1be3
--- /dev/null
+++ b/libavcodec/x86/bswapdsp.asm
@@ -0,0 +1,135 @@
+;******************************************************************************
+;* optimized bswap buffer functions
+;* Copyright (c) 2008 Loren Merritt
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+; pshufb control mask used by the SSSE3 build of BSWAP32_BUF: output byte i is
+; taken from input byte pb_bswap32[i], i.e. the four bytes of every dword in a
+; 16-byte register are reversed in place.
+; NOTE(review): BSWAP32_BUF loads this with mova, which needs a 16-byte
+; aligned address -- presumably guaranteed by SECTION_RODATA; confirm in x86inc.
+pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+
+SECTION_TEXT
+
+; Vector part of bswap32_buf: byte-swaps 8 dwords (32 bytes) per iteration,
+; then at most one further group of 4 dwords (16 bytes).
+;
+; %1 = a (aligned) / u (unaligned); selects mova or movu for every load from
+;      src and every store to dst
+;
+; In:       r0 = dst, r1 = src, r2d = w (number of 32-bit elements)
+;           m2 = byte-shuffle mask (SSSE3 build only, loaded by the caller)
+; Out:      r0/r1 advanced past everything handled here,
+;           r2 = w again (zero-extended from 32 bits)
+; Clobbers: r3, m0, m1, flags; the non-SSSE3 build also uses m2/m3 as
+;           temporaries
+; Exit:     jumps to .left (defined by the invoking macro) when bit 2 of w is
+;           clear, otherwise falls off the end of the macro
+;
+; w is a C int, so only the low 32 bits of r2 are meaningful on x86-64. All
+; count arithmetic is therefore done on the 32-bit register views; using the
+; full registers would let stray upper bits inflate the trip count.
+%macro BSWAP_LOOPS 1
+    mov      r3d, r2d               ; r3d keeps w while r2d counts iterations
+    sar      r2d, 3                 ; r2d = number of 8-element iterations
+    jz       .left4_%1
+.loop8_%1:
+    mov%1    m0, [r1 +  0]
+    mov%1    m1, [r1 + 16]
+%if cpuflag(ssse3)
+    pshufb   m0, m2                 ; one shuffle reverses the bytes of each dword
+    pshufb   m1, m2
+    mov%1    [r0 +  0], m0
+    mov%1    [r0 + 16], m1
+%else
+    ; step 1: swap the two 16-bit halves of every dword (selector 1,0,3,2)
+    pshuflw  m0, m0, 10110001b
+    pshuflw  m1, m1, 10110001b
+    pshufhw  m0, m0, 10110001b
+    pshufhw  m1, m1, 10110001b
+    ; step 2: swap the two bytes of every 16-bit word: (x << 8) | (x >> 8)
+    mova     m2, m0
+    mova     m3, m1
+    psllw    m0, 8
+    psllw    m1, 8
+    psrlw    m2, 8
+    psrlw    m3, 8
+    por      m2, m0
+    por      m3, m1
+    mov%1    [r0 +  0], m2
+    mov%1    [r0 + 16], m3
+%endif
+    add      r0, 32
+    add      r1, 32
+    dec      r2d
+    jnz      .loop8_%1
+.left4_%1:
+    mov      r2d, r3d               ; restore w for the caller's tail handling
+    and      r3d, 4                 ; is there a group of 4 elements left?
+    jz       .left
+    mov%1    m0, [r1]
+%if cpuflag(ssse3)
+    pshufb   m0, m2
+    mov%1    [r0], m0
+%else
+    ; same two-step swap as in the main loop, for a single register
+    pshuflw  m0, m0, 10110001b
+    pshufhw  m0, m0, 10110001b
+    mova     m2, m0
+    psllw    m0, 8
+    psrlw    m2, 8
+    por      m2, m0
+    mov%1    [r0], m2
+%endif
+    add      r1, 16
+    add      r0, 16
+%endmacro
+
+; void ff_bswap_buf(uint32_t *dst, const uint32_t *src, int w);
+;
+; Copies w 32-bit words from src to dst, reversing the byte order of each.
+; Declared through cglobal as bswap32_buf; one copy is emitted per INIT_XMM
+; setting active when the macro is expanded.
+;
+; Register roles: r0 = dst, r1 = src, r2 = w, r3 = scratch,
+;                 m2 = pb_bswap32 shuffle mask (SSSE3 build only).
+%macro BSWAP32_BUF 0
+%if cpuflag(ssse3)
+cglobal bswap32_buf, 3,4,3
+    mova     m2, [pb_bswap32]       ; mask stays in m2 for the whole call
+%else
+cglobal bswap32_buf, 3,4,5
+%endif
+    ; The aligned loops use mova for the stores to dst as well as the loads
+    ; from src, so they may only be taken when BOTH pointers are 16-byte
+    ; aligned. Testing src alone would fault on an unaligned dst.
+    mov      r3, r1
+    or       r3, r0
+    and      r3, 15
+    jz       .start_align
+    BSWAP_LOOPS u
+    jmp      .left
+.start_align:
+    BSWAP_LOOPS a
+.left:
+    ; r2 holds w again here (BSWAP_LOOPS restores it); only its two low bits
+    ; are still pending, i.e. 0-3 trailing elements.
+%if cpuflag(ssse3)
+    mov      r3, r2
+    and      r2, 2                  ; a pair of elements left?
+    jz       .left1
+    movq     m0, [r1]               ; 8 bytes = 2 elements
+    pshufb   m0, m2
+    movq     [r0], m0
+    add      r1, 8
+    add      r0, 8
+.left1:
+    and      r3, 1                  ; a final single element?
+    jz       .end
+    mov      r2d, [r1]
+    bswap    r2d
+    mov      [r0], r2d
+%else
+    and      r2, 3                  ; r2 = number of trailing elements
+    jz       .end
+.loop2:
+    mov      r3d, [r1]              ; scalar fallback, one element per pass
+    bswap    r3d
+    mov      [r0], r3d
+    add      r1, 4
+    add      r0, 4
+    dec      r2
+    jnz      .loop2
+%endif
+.end:
+    RET
+%endmacro
+
+; Expand BSWAP32_BUF once per instruction set. The macro body branches on
+; cpuflag(ssse3), so the INIT_XMM line preceding each expansion decides
+; whether the shift/or variant or the pshufb variant is generated.
+; NOTE(review): the per-CPU symbol suffix is presumably added by
+; cglobal/INIT_XMM in x86inc -- confirm there.
+INIT_XMM sse2
+BSWAP32_BUF
+
+INIT_XMM ssse3
+BSWAP32_BUF