summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Derek Buitenhuis <derek.buitenhuis@gmail.com> 2016-04-24 12:51:34 +0100
committer: Derek Buitenhuis <derek.buitenhuis@gmail.com> 2016-04-24 12:51:42 +0100
commit: 87b8e9500874930667ac966ea2fabdd6222ef6e0 (patch)
tree: 96c5df610b7b75913666b263d6de7ea0d1ee0957
parent: 4fe4c5c3761a92fdaf8b3bbb21c00fb40b08f156 (diff)
parent: cdb1665f70def544ddab3e3ed3763ef99c8b3873 (diff)
Merge commit 'cdb1665f70def544ddab3e3ed3763ef99c8b3873'
* commit 'cdb1665f70def544ddab3e3ed3763ef99c8b3873':
  aarch64: Make transpose_4x4H do a regular transpose

Merged-by: Derek Buitenhuis <derek.buitenhuis@gmail.com>
-rw-r--r-- libavcodec/aarch64/h264idct_neon.S 24
-rw-r--r-- libavcodec/aarch64/neon.S 12
2 files changed, 18 insertions, 18 deletions
diff --git a/libavcodec/aarch64/h264idct_neon.S b/libavcodec/aarch64/h264idct_neon.S
index 91f1e773c4..fa414f73b2 100644
--- a/libavcodec/aarch64/h264idct_neon.S
+++ b/libavcodec/aarch64/h264idct_neon.S
@@ -33,25 +33,25 @@ function ff_h264_idct_add_neon, export=1
sshr v17.4H, v3.4H, #1
st1 {v30.8H}, [x1], #16
sub v5.4H, v0.4H, v2.4H
- add v6.4H, v1.4H, v17.4H
- sub v7.4H, v16.4H, v3.4H
- add v0.4H, v4.4H, v6.4H
- add v1.4H, v5.4H, v7.4H
- sub v3.4H, v4.4H, v6.4H
- sub v2.4H, v5.4H, v7.4H
+ sub v6.4H, v16.4H, v3.4H
+ add v7.4H, v1.4H, v17.4H
+ add v0.4H, v4.4H, v7.4H
+ add v1.4H, v5.4H, v6.4H
+ sub v2.4H, v5.4H, v6.4H
+ sub v3.4H, v4.4H, v7.4H
transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7
- add v4.4H, v0.4H, v3.4H
+ add v4.4H, v0.4H, v2.4H
ld1 {v18.S}[0], [x0], x2
- sshr v16.4H, v2.4H, #1
+ sshr v16.4H, v3.4H, #1
sshr v17.4H, v1.4H, #1
- ld1 {v19.S}[1], [x0], x2
- sub v5.4H, v0.4H, v3.4H
ld1 {v18.S}[1], [x0], x2
+ sub v5.4H, v0.4H, v2.4H
+ ld1 {v19.S}[1], [x0], x2
add v6.4H, v16.4H, v1.4H
ins v4.D[1], v5.D[0]
- sub v7.4H, v2.4H, v17.4H
+ sub v7.4H, v17.4H, v3.4H
ld1 {v19.S}[0], [x0], x2
ins v6.D[1], v7.D[0]
sub x0, x0, x2, lsl #2
@@ -68,8 +68,8 @@ function ff_h264_idct_add_neon, export=1
sqxtun v1.8B, v1.8H
st1 {v0.S}[0], [x0], x2
- st1 {v1.S}[1], [x0], x2
st1 {v0.S}[1], [x0], x2
+ st1 {v1.S}[1], [x0], x2
st1 {v1.S}[0], [x0], x2
sub x1, x1, #32
diff --git a/libavcodec/aarch64/neon.S b/libavcodec/aarch64/neon.S
index a227cbd3f6..0fddbecae3 100644
--- a/libavcodec/aarch64/neon.S
+++ b/libavcodec/aarch64/neon.S
@@ -107,12 +107,12 @@
.macro transpose_4x4H r0, r1, r2, r3, r4, r5, r6, r7
trn1 \r4\().4H, \r0\().4H, \r1\().4H
trn2 \r5\().4H, \r0\().4H, \r1\().4H
- trn1 \r7\().4H, \r2\().4H, \r3\().4H
- trn2 \r6\().4H, \r2\().4H, \r3\().4H
- trn1 \r0\().2S, \r4\().2S, \r7\().2S
- trn2 \r3\().2S, \r4\().2S, \r7\().2S
- trn1 \r1\().2S, \r5\().2S, \r6\().2S
- trn2 \r2\().2S, \r5\().2S, \r6\().2S
+ trn1 \r6\().4H, \r2\().4H, \r3\().4H
+ trn2 \r7\().4H, \r2\().4H, \r3\().4H
+ trn1 \r0\().2S, \r4\().2S, \r6\().2S
+ trn2 \r2\().2S, \r4\().2S, \r6\().2S
+ trn1 \r1\().2S, \r5\().2S, \r7\().2S
+ trn2 \r3\().2S, \r5\().2S, \r7\().2S
.endm
.macro transpose_8x8H r0, r1, r2, r3, r4, r5, r6, r7, r8, r9