summary | refs | log | tree | commit | diff
path: root/libavcodec/aarch64
diff options
context:
space:
mode:
author Martin Storsjö <martin@martin.st> 2016-12-17 13:14:38 +0200
committer Martin Storsjö <martin@martin.st> 2017-02-11 00:08:50 +0200
commit 388e0d2515bc6bbc9d0c9af1d230bd16cf945fe7 (patch)
tree 361f70c8830c1d01601fd30d4ac28337df6a397b /libavcodec/aarch64
parent fea92a4b57d1c328b1de226a5f213a629ee63754 (diff)
aarch64: vp9mc: Calculate less unused data in the 4 pixel wide horizontal filter
No measured speedup on a Cortex A53, but other cores might benefit.

Signed-off-by: Martin Storsjö <martin@martin.st>
Diffstat (limited to 'libavcodec/aarch64')
-rw-r--r-- libavcodec/aarch64/vp9mc_neon.S 15
1 files changed, 13 insertions, 2 deletions
diff --git a/libavcodec/aarch64/vp9mc_neon.S b/libavcodec/aarch64/vp9mc_neon.S
index 99f1809270..95ed26c232 100644
--- a/libavcodec/aarch64/vp9mc_neon.S
+++ b/libavcodec/aarch64/vp9mc_neon.S
@@ -202,9 +202,12 @@ endfunc
ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
mla \dst2\().8h, v21.8h, v0.h[\offset]
mla \dst4\().8h, v23.8h, v0.h[\offset]
-.else
+.elseif \size == 8
mla \dst1\().8h, v20.8h, v0.h[\offset]
mla \dst3\().8h, v22.8h, v0.h[\offset]
+.else
+ mla \dst1\().4h, v20.4h, v0.h[\offset]
+ mla \dst3\().4h, v22.4h, v0.h[\offset]
.endif
.endm
// The same as above, but don't accumulate straight into the
@@ -219,16 +222,24 @@ endfunc
ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
mul v21.8h, v21.8h, v0.h[\offset]
mul v23.8h, v23.8h, v0.h[\offset]
-.else
+.elseif \size == 8
mul v20.8h, v20.8h, v0.h[\offset]
mul v22.8h, v22.8h, v0.h[\offset]
+.else
+ mul v20.4h, v20.4h, v0.h[\offset]
+ mul v22.4h, v22.4h, v0.h[\offset]
.endif
+.if \size == 4
+ sqadd \dst1\().4h, \dst1\().4h, v20.4h
+ sqadd \dst3\().4h, \dst3\().4h, v22.4h
+.else
sqadd \dst1\().8h, \dst1\().8h, v20.8h
sqadd \dst3\().8h, \dst3\().8h, v22.8h
.if \size >= 16
sqadd \dst2\().8h, \dst2\().8h, v21.8h
sqadd \dst4\().8h, \dst4\().8h, v23.8h
.endif
+.endif
.endm