aarch64: me_cmp: Switch from uabd to uabal in ff_pix_abs16_xy2_neon

Using absolute-difference-accumulate does use twice the number of absolute-difference instructions, but avoids the need for the uaddl and add instructions, reducing the total number of instructions by 3.

These can be interleaved with the rest of the calculation, to avoid tight dependencies at the end.

Unfortunately, this is marginally slower on Cortex A53, but faster on A72 and A73.

Before:            Cortex A53     A72     A73   Graviton 3
pix_abs_0_3_neon:       175.7   109.2    92.0         41.2
After:
pix_abs_0_3_neon:       179.7    96.7    87.5         41.2

Signed-off-by: Martin Storsjö <martin@martin.st>
author: Martin Storsjö <martin@martin.st> 2022-07-13 00:06:31 +0300
committer: Martin Storsjö <martin@martin.st> 2022-07-16 17:25:54 +0300
commit: 68a03f64240dcbe408c3fd43d1071a105508a588 (patch)
tree: 36b9b2958c50cc1d34a906276b2b12403847bf19 /libavcodec/aarch64
parent: b46de9aba436dea0cff76f3ed0f7c98448367fd0 (diff)
1 files changed, 11 insertions, 21 deletions
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 0ae23d8922..89546869fb 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -124,6 +124,9 @@ function ff_pix_abs16_xy2_neon, export=1
         add             v26.8h, v30.8h, v2.8h       // add up 0..7, using pix2 + pix2+1 values from pix3 above
         add             v27.8h, v31.8h, v3.8h       // add up 8..15, using pix2 + pix2+1 values from pix3 above
 
+        uabdl           v24.8h, v1.8b,  v23.8b      // absolute difference 0..7, i=0
+        uabdl2          v23.8h, v1.16b, v23.16b     // absolute difference 8..15, i=0
+
         ld1             {v21.16b}, [x5], x3         // load pix3
         ld1             {v20.16b}, [x1], x3         // load pix1
 
@@ -137,6 +140,9 @@ function ff_pix_abs16_xy2_neon, export=1
         rshrn           v28.8b, v28.8h, #2          // shift right 2 0..7 (rounding shift right)
         rshrn2          v28.16b, v29.8h, #2         // shift right 2 8..15
 
+        uabal           v24.8h, v16.8b,  v26.8b     // absolute difference 0..7, i=1
+        uabal2          v23.8h, v16.16b, v26.16b    // absolute difference 8..15, i=1
+
         uaddl           v2.8h, v21.8b, v22.8b       // pix3 + pix3+1 0..7
         uaddl2          v3.8h, v21.16b, v22.16b     // pix3 + pix3+1 8..15
         add             v30.8h, v4.8h, v2.8h        // add up 0..7, using pix2 + pix2+1 values from pix3 above
@@ -144,33 +150,17 @@ function ff_pix_abs16_xy2_neon, export=1
         rshrn           v30.8b, v30.8h, #2          // shift right 2 0..7 (rounding shift right)
         rshrn2          v30.16b, v31.8h, #2         // shift right 2 8..15
 
-        // Averages are now stored in these registers:
-        // v23, v16, v28, v30
-        // pix1 values in these registers:
-        // v1, v16, v17, v20
-        // available:
-        // v4, v5, v7, v18, v19, v24, v25, v27, v29, v31
+        uabal           v24.8h, v17.8b,  v28.8b     // absolute difference 0..7, i=2
+        uabal2          v23.8h, v17.16b, v28.16b    // absolute difference 8..15, i=2
 
         sub             w4, w4, #4                  // h -= 4
 
-        // Using absolute-difference instructions instead of absolute-difference-accumulate allows
-        // us to keep the results in 16b vectors instead of widening values with twice the instructions.
-        // This approach also has fewer data dependencies, allowing better instruction level parallelism.
-        uabd            v4.16b, v1.16b, v23.16b     // absolute difference 0..15, i=0
-        uabd            v5.16b, v16.16b, v26.16b    // absolute difference 0..15, i=1
-        uabd            v6.16b, v17.16b, v28.16b    // absolute difference 0..15, i=2
-        uabd            v7.16b, v20.16b, v30.16b    // absolute difference 0..15, i=3
+        uabal           v24.8h, v20.8b,  v30.8b     // absolute difference 0..7, i=3
+        uabal2          v23.8h, v20.16b, v30.16b    // absolute difference 8..15, i=3
 
         cmp             w4, #4                      // loop if h >= 4
 
-        // Now add up all the values in each vector, v4-v7 with widening adds
-        uaddl           v19.8h, v4.8b, v5.8b
-        uaddl2          v18.8h, v4.16b, v5.16b
-        uaddl           v4.8h, v6.8b, v7.8b
-        uaddl2          v5.8h, v6.16b, v7.16b
-        add             v4.8h, v4.8h, v5.8h
-        add             v4.8h, v4.8h, v18.8h
-        add             v4.8h, v4.8h, v19.8h
+        add             v4.8h, v23.8h, v24.8h
         uaddlv          s4, v4.8h                   // finish adding up accumulated values
         add             d0, d0, d4                  // add the value to the top level accumulator
author	Martin Storsjö <martin@martin.st>	2022-07-13 00:06:31 +0300
committer	Martin Storsjö <martin@martin.st>	2022-07-16 17:25:54 +0300
commit	68a03f64240dcbe408c3fd43d1071a105508a588 (patch)
tree	36b9b2958c50cc1d34a906276b2b12403847bf19 /libavcodec/aarch64
parent	b46de9aba436dea0cff76f3ed0f7c98448367fd0 (diff)