aarch64: h264pred: Optimize the inner loop of existing 8 bit functions

Move the loop counter decrement further from the branch instruction; this hides the latency of the decrement.

In loops that first load, then store (the horizontal prediction cases), do the decrement after the load (where the next instruction would stall a bit anyway, waiting for the result of the load).

In loops that store twice using the same destination register, also do the decrement between the two stores (as the second store would need to wait for the updated destination register from the first instruction).

In loops that store twice to two different destination registers, do the decrement before both stores, so that it happens as far ahead of the branch as possible.

This gives minor (1-2 cycle) speedups in most cases (modulo measurement noise), but the horizontal prediction functions get a rather notable speedup on the Cortex A53.

Before:                          Cortex A53     A72     A73
pred8x8_dc_8_neon:                     60.7    46.2    39.2
pred8x8_dc_128_8_neon:                 30.7    18.0    14.0
pred8x8_horizontal_8_neon:             42.2    29.2    18.5
pred8x8_left_dc_8_neon:                52.7    36.2    32.2
pred8x8_mad_cow_dc_0l0_8_neon:         48.2    27.7    25.7
pred8x8_mad_cow_dc_0lt_8_neon:         52.5    33.2    34.7
pred8x8_mad_cow_dc_l0t_8_neon:         52.5    31.7    33.2
pred8x8_mad_cow_dc_l00_8_neon:         43.2    27.0    25.5
pred8x8_plane_8_neon:                 112.2    86.2    88.2
pred8x8_top_dc_8_neon:                 40.7    23.0    21.2
pred8x8_vertical_8_neon:               27.2    15.5    14.0
pred16x16_dc_8_neon:                   91.0    73.2    70.5
pred16x16_dc_128_8_neon:               43.0    34.7    30.7
pred16x16_horizontal_8_neon:           86.0    49.7    44.7
pred16x16_left_dc_8_neon:              87.0    67.2    67.5
pred16x16_plane_8_neon:               236.0   175.7   173.0
pred16x16_top_dc_8_neon:               53.2    39.0    41.7
pred16x16_vertical_8_neon:             41.7    29.7    31.0

After:
pred8x8_dc_8_neon:                     59.0    46.7    42.5
pred8x8_dc_128_8_neon:                 28.2    18.0    14.0
pred8x8_horizontal_8_neon:             34.2    29.2    18.5
pred8x8_left_dc_8_neon:                51.0    38.2    32.7
pred8x8_mad_cow_dc_0l0_8_neon:         46.7    28.2    26.2
pred8x8_mad_cow_dc_0lt_8_neon:         55.2    33.7    37.5
pred8x8_mad_cow_dc_l0t_8_neon:         51.2    31.7    37.2
pred8x8_mad_cow_dc_l00_8_neon:         41.7    27.5    26.0
pred8x8_plane_8_neon:                 111.5    86.5    89.5
pred8x8_top_dc_8_neon:                 39.0    23.2    21.0
pred8x8_vertical_8_neon:               27.2    16.0    14.0
pred16x16_dc_8_neon:                   85.0    70.2    70.5
pred16x16_dc_128_8_neon:               42.0    30.0    30.7
pred16x16_horizontal_8_neon:           66.5    49.5    42.5
pred16x16_left_dc_8_neon:              81.0    66.5    67.5
pred16x16_plane_8_neon:               235.0   175.7   173.0
pred16x16_top_dc_8_neon:               52.0    39.0    41.7
pred16x16_vertical_8_neon:             40.2    33.2    31.0

Despite this, a number of these functions are still slower than what e.g. GCC 7 generates - this shows the relative speedup of the neon codepaths over the compiler-generated ones:

                                 Cortex A53     A72     A73
pred8x8_dc_8_neon:                     0.86    0.65    1.04
pred8x8_dc_128_8_neon:                 0.59    0.44    0.62
pred8x8_horizontal_8_neon:             1.51    0.58    1.30
pred8x8_left_dc_8_neon:                0.72    0.56    0.89
pred8x8_mad_cow_dc_0l0_8_neon:         0.93    0.93    1.37
pred8x8_mad_cow_dc_0lt_8_neon:         1.37    1.41    1.68
pred8x8_mad_cow_dc_l0t_8_neon:         1.21    1.17    1.32
pred8x8_mad_cow_dc_l00_8_neon:         1.24    1.19    1.60
pred8x8_plane_8_neon:                  3.36    3.58    3.76
pred8x8_top_dc_8_neon:                 0.97    0.99    1.43
pred8x8_vertical_8_neon:               0.86    0.78    1.18
pred16x16_dc_8_neon:                   1.20    1.06    1.49
pred16x16_dc_128_8_neon:               0.83    0.95    0.99
pred16x16_horizontal_8_neon:           1.78    0.96    1.59
pred16x16_left_dc_8_neon:              1.06    0.96    1.32
pred16x16_plane_8_neon:                5.78    6.49    7.19
pred16x16_top_dc_8_neon:               1.48    1.53    1.94
pred16x16_vertical_8_neon:             1.39    1.34    1.98

In particular, on Cortex A72, many of these functions are slower than the compiler-generated code, while they're more beneficial on e.g. the Cortex A73.

Signed-off-by: Martin Storsjö <martin@martin.st>
author: Martin Storsjö <martin@martin.st> 2021-04-12 10:31:22 +0300
committer: Martin Storsjö <martin@martin.st> 2021-04-14 15:23:44 +0300
commit: 870bfe16a12bf09dca3a4ae27ef6f81a2de80c40 (patch)
tree: 21e3b1879cc83265e7fad1259ba0e1b90e2779a4 /libavcodec/aarch64
parent: c5ca18fd1b1f8437e52305a5c15226b6f67a2e10 (diff)
1 files changed, 11 insertions, 11 deletions
diff --git a/libavcodec/aarch64/h264pred_neon.S b/libavcodec/aarch64/h264pred_neon.S
index 213b40b3e7..6fec33cf6a 100644
--- a/libavcodec/aarch64/h264pred_neon.S
+++ b/libavcodec/aarch64/h264pred_neon.S
@@ -81,8 +81,8 @@ function ff_pred16x16_dc_neon, export=1
 .L_pred16x16_dc_end:
         mov             w3,  #8
 6:      st1             {v0.16b}, [x0], x1
-        st1             {v0.16b}, [x0], x1
         subs            w3,  w3,  #1
+        st1             {v0.16b}, [x0], x1
         b.ne            6b
         ret
 endfunc
@@ -91,8 +91,8 @@ function ff_pred16x16_hor_neon, export=1
         sub             x2,  x0,  #1
         mov             w3,  #16
 1:      ld1r            {v0.16b}, [x2], x1
-        st1             {v0.16b}, [x0], x1
         subs            w3,  w3,  #1
+        st1             {v0.16b}, [x0], x1
         b.ne            1b
         ret
 endfunc
@@ -102,9 +102,9 @@ function ff_pred16x16_vert_neon, export=1
         add             x1,  x1,  x1
         ld1             {v0.16b}, [x2], x1
         mov             w3,  #8
-1:      st1             {v0.16b}, [x0], x1
+1:      subs            w3,  w3,  #1
+        st1             {v0.16b}, [x0], x1
         st1             {v0.16b}, [x2], x1
-        subs            w3,  w3,  #1
         b.ne            1b
         ret
 endfunc
@@ -158,8 +158,8 @@ function ff_pred16x16_plane_neon, export=1
         add             v1.8h,  v1.8h,  v2.8h
         sqshrun2        v0.16b, v1.8h,  #5
         add             v1.8h,  v1.8h,  v3.8h
-        st1             {v0.16b}, [x0], x1
         subs            w3,  w3,  #1
+        st1             {v0.16b}, [x0], x1
         b.ne            1b
         ret
 endfunc
@@ -175,8 +175,8 @@ function ff_pred8x8_hor_neon, export=1
         sub             x2,  x0,  #1
         mov             w3,  #8
 1:      ld1r            {v0.8b},  [x2], x1
-        st1             {v0.8b},  [x0], x1
         subs            w3,  w3,  #1
+        st1             {v0.8b},  [x0], x1
         b.ne            1b
         ret
 endfunc
@@ -186,9 +186,9 @@ function ff_pred8x8_vert_neon, export=1
         lsl             x1,  x1,  #1
         ld1             {v0.8b},  [x2], x1
         mov             w3,  #4
-1:      st1             {v0.8b},  [x0], x1
+1:      subs            w3,  w3,  #1
+        st1             {v0.8b},  [x0], x1
         st1             {v0.8b},  [x2], x1
-        subs            w3,  w3,  #1
         b.ne            1b
         ret
 endfunc
@@ -232,9 +232,9 @@ function ff_pred8x8_plane_neon, export=1
         mov             w3,  #8
 1:
         sqshrun         v0.8b,  v1.8h,  #5
+        subs            w3,  w3,  #1
         add             v1.8h,  v1.8h,  v2.8h
         st1             {v0.8b},  [x0], x1
-        subs            w3,  w3,  #1
         b.ne            1b
         ret
 endfunc
@@ -290,9 +290,9 @@ function ff_pred8x8_dc_neon, export=1
 .L_pred8x8_dc_end:
         mov             w3,  #4
         add             x2,  x0,  x1,  lsl #2
-6:      st1             {v0.8b},  [x0], x1
+6:      subs            w3,  w3,  #1
+        st1             {v0.8b},  [x0], x1
         st1             {v1.8b},  [x2], x1
-        subs            w3,  w3,  #1
         b.ne            6b
         ret
 endfunc
author	Martin Storsjö <martin@martin.st>	2021-04-12 10:31:22 +0300
committer	Martin Storsjö <martin@martin.st>	2021-04-14 15:23:44 +0300
commit	870bfe16a12bf09dca3a4ae27ef6f81a2de80c40 (patch)
tree	21e3b1879cc83265e7fad1259ba0e1b90e2779a4 /libavcodec/aarch64
parent	c5ca18fd1b1f8437e52305a5c15226b6f67a2e10 (diff)