summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author James Almer <jamrial@gmail.com> 2014-08-02 23:21:31 -0300
committer Michael Niedermayer <michaelni@gmx.at> 2014-08-03 04:24:15 +0200
commit d0f56ca0710157144fe00c075dd508085df716ef (patch)
tree f17db39ee285ff5aaabe4b1cc24c6a3a95b1f3cf
parent 2e6fdcb7f3c86491408a3699f0aa9dc52b7c5686 (diff)
x86/hevc_deblock: improve 8bit transpose store macros
Up to four fewer instructions, depending on function and instruction set. Signed-off-by: James Almer <jamrial@gmail.com> Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
-rw-r--r-- libavcodec/x86/hevc_deblock.asm 70
-rw-r--r-- libavutil/x86/x86util.asm 9
2 files changed, 31 insertions, 48 deletions
diff --git a/libavcodec/x86/hevc_deblock.asm b/libavcodec/x86/hevc_deblock.asm
index 5951e86844..89c0f9bb64 100644
--- a/libavcodec/x86/hevc_deblock.asm
+++ b/libavcodec/x86/hevc_deblock.asm
@@ -77,16 +77,10 @@ INIT_XMM sse2
; in: 4 rows of 8 words in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4B_STORE 8
- packuswb m0, m0
- packuswb m1, m1
- packuswb m2, m2
- packuswb m3, m3
-
- punpcklbw m0, m1
- punpcklbw m2, m3
-
- punpckhwd m6, m0, m2
- punpcklwd m0, m2
+ packuswb m0, m2
+ packuswb m1, m3
+ SBUTTERFLY bw, 0, 1, 2
+ SBUTTERFLY wd, 0, 1, 2
movd %1, m0
pshufd m0, m0, 0x39
@@ -96,13 +90,13 @@ INIT_XMM sse2
pshufd m0, m0, 0x39
movd %4, m0
- movd %5, m6
- pshufd m6, m6, 0x39
- movd %6, m6
- pshufd m6, m6, 0x39
- movd %7, m6
- pshufd m6, m6, 0x39
- movd %8, m6
+ movd %5, m1
+ pshufd m1, m1, 0x39
+ movd %6, m1
+ pshufd m1, m1, 0x39
+ movd %7, m1
+ pshufd m1, m1, 0x39
+ movd %8, m1
%endmacro
; in: 8 rows of 4 words in %4..%11
@@ -204,40 +198,20 @@ INIT_XMM sse2
; in: 8 rows of 8 words in m0..m7
; out: 8 rows of 8 bytes in %1..%8
%macro TRANSPOSE8x8B_STORE 8
- packuswb m0, m0
- packuswb m1, m1
- packuswb m2, m2
- packuswb m3, m3
- packuswb m4, m4
- packuswb m5, m5
- packuswb m6, m6
- packuswb m7, m7
-
- punpcklbw m0, m1
- punpcklbw m2, m3
-
- punpckhwd m8, m0, m2
- punpcklwd m0, m2
-
- punpcklbw m4, m5
- punpcklbw m6, m7
-
- punpckhwd m9, m4, m6
- punpcklwd m4, m6
+ packuswb m0, m4
+ packuswb m1, m5
+ packuswb m2, m6
+ packuswb m3, m7
+ TRANSPOSE2x4x4B 0, 1, 2, 3, 4
- punpckhdq m10, m0, m4; 2, 3
- punpckldq m0, m4; 0, 1
-
- punpckldq m11, m8, m9; 4, 5
- punpckhdq m8, m9; 6, 7
movq %1, m0
movhps %2, m0
- movq %3, m10
- movhps %4, m10
- movq %5, m11
- movhps %6, m11
- movq %7, m8
- movhps %8, m8
+ movq %3, m1
+ movhps %4, m1
+ movq %5, m2
+ movhps %6, m2
+ movq %7, m3
+ movhps %8, m3
%endmacro
; in: 8 rows of 8 words in %1..%8
diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
index 824e449d24..0d0ef0799d 100644
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -69,6 +69,15 @@
SWAP %2, %3
%endmacro
+%macro TRANSPOSE2x4x4B 5
+ SBUTTERFLY bw, %1, %2, %5
+ SBUTTERFLY bw, %3, %4, %5
+ SBUTTERFLY wd, %1, %3, %5
+ SBUTTERFLY wd, %2, %4, %5
+ SBUTTERFLY dq, %1, %2, %5
+ SBUTTERFLY dq, %3, %4, %5
+%endmacro
+
%macro TRANSPOSE2x4x4W 5
SBUTTERFLY wd, %1, %2, %5
SBUTTERFLY wd, %3, %4, %5