author     Ben Avison <bavison@riscosopen.org>  2022-03-31 18:23:48 +0100
committer  Martin Storsjö <martin@martin.st>    2022-04-01 10:03:34 +0300
commit     501fdc017deb1b57ecc17420ba41686a14932fcc (patch)
tree       c93e5c7721ec6cfca4415c02b66578aab1205afd /libavcodec/aarch64
parent     c07de58a725a508c628ddea7d936771c42c189aa (diff)
avcodec/vc1: Arm 64-bit NEON inverse transform fast paths
checkasm benchmarks on 1.5 GHz Cortex-A72 are as follows.

vc1dsp.vc1_inv_trans_4x4_c: 158.2
vc1dsp.vc1_inv_trans_4x4_neon: 65.7
vc1dsp.vc1_inv_trans_4x4_dc_c: 86.5
vc1dsp.vc1_inv_trans_4x4_dc_neon: 26.5
vc1dsp.vc1_inv_trans_4x8_c: 335.2
vc1dsp.vc1_inv_trans_4x8_neon: 106.2
vc1dsp.vc1_inv_trans_4x8_dc_c: 151.2
vc1dsp.vc1_inv_trans_4x8_dc_neon: 25.5
vc1dsp.vc1_inv_trans_8x4_c: 365.7
vc1dsp.vc1_inv_trans_8x4_neon: 97.2
vc1dsp.vc1_inv_trans_8x4_dc_c: 139.7
vc1dsp.vc1_inv_trans_8x4_dc_neon: 16.5
vc1dsp.vc1_inv_trans_8x8_c: 547.7
vc1dsp.vc1_inv_trans_8x8_neon: 137.0
vc1dsp.vc1_inv_trans_8x8_dc_c: 268.2
vc1dsp.vc1_inv_trans_8x8_dc_neon: 30.5

Signed-off-by: Ben Avison <bavison@riscosopen.org>
Signed-off-by: Martin Storsjö <martin@martin.st>
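For context, the scalar paths these NEON routines replace perform a two-pass integer butterfly with the coefficients and rounding documented in the assembly comments below. The following is a rough, self-contained C sketch of the 4x4 case only, written from those comments; the names vc1_inv_trans_4x4_sketch and clamp_u8 are illustrative and are not part of FFmpeg.

#include <stddef.h>
#include <stdint.h>

/* Illustrative sketch only, not the FFmpeg reference implementation. */
static uint8_t clamp_u8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* 4x4 inverse transform sketch: block has a row stride of 8 coefficients,
 * and the result is added to the existing 8-bit samples with saturation. */
static void vc1_inv_trans_4x4_sketch(uint8_t *dest, ptrdiff_t stride, int16_t *block)
{
    int16_t tmp[4 * 4];

    /* Horizontal pass, rounded by (+4) >> 3 */
    for (int i = 0; i < 4; i++) {
        const int16_t *src = block + 8 * i;
        int t1 = 17 * (src[0] + src[2]);
        int t2 = 17 * (src[0] - src[2]);
        int t3 = 22 * src[1] + 10 * src[3];
        int t4 = 22 * src[3] - 10 * src[1];
        tmp[4 * i + 0] = (t1 + t3 + 4) >> 3;
        tmp[4 * i + 1] = (t2 - t4 + 4) >> 3;
        tmp[4 * i + 2] = (t2 + t4 + 4) >> 3;
        tmp[4 * i + 3] = (t1 - t3 + 4) >> 3;
    }

    /* Vertical pass, rounded by (+64) >> 7, added to dest and saturated */
    for (int i = 0; i < 4; i++) {
        const int16_t *src = tmp + i;
        int t1 = 17 * (src[0] + src[8]);
        int t2 = 17 * (src[0] - src[8]);
        int t3 = 22 * src[4] + 10 * src[12];
        int t4 = 22 * src[12] - 10 * src[4];
        dest[0 * stride + i] = clamp_u8(dest[0 * stride + i] + ((t1 + t3 + 64) >> 7));
        dest[1 * stride + i] = clamp_u8(dest[1 * stride + i] + ((t2 - t4 + 64) >> 7));
        dest[2 * stride + i] = clamp_u8(dest[2 * stride + i] + ((t2 + t4 + 64) >> 7));
        dest[3 * stride + i] = clamp_u8(dest[3 * stride + i] + ((t1 - t3 + 64) >> 7));
    }
}

The 8-point pass used by the larger block sizes follows the same pattern with coefficients 12, 16, 6 and 16, 15, 9, 4, as spelled out in the per-instruction comments in the diff below.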
Diffstat (limited to 'libavcodec/aarch64')
-rw-r--r--  libavcodec/aarch64/vc1dsp_init_aarch64.c   19
-rw-r--r--  libavcodec/aarch64/vc1dsp_neon.S          678
2 files changed, 697 insertions, 0 deletions
diff --git a/libavcodec/aarch64/vc1dsp_init_aarch64.c b/libavcodec/aarch64/vc1dsp_init_aarch64.c
index 8f96e4802d..e0eb52dd63 100644
--- a/libavcodec/aarch64/vc1dsp_init_aarch64.c
+++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c
@@ -25,6 +25,16 @@
#include "config.h"
+void ff_vc1_inv_trans_8x8_neon(int16_t *block);
+void ff_vc1_inv_trans_8x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_4x8_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_4x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+
+void ff_vc1_inv_trans_8x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+
void ff_vc1_v_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq);
void ff_vc1_h_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq);
void ff_vc1_v_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq);
@@ -46,6 +56,15 @@ av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags)) {
+ dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_neon;
+ dsp->vc1_inv_trans_8x4 = ff_vc1_inv_trans_8x4_neon;
+ dsp->vc1_inv_trans_4x8 = ff_vc1_inv_trans_4x8_neon;
+ dsp->vc1_inv_trans_4x4 = ff_vc1_inv_trans_4x4_neon;
+ dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_neon;
+ dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon;
+ dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_neon;
+ dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon;
+
dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_neon;
dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_neon;
dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_neon;
diff --git a/libavcodec/aarch64/vc1dsp_neon.S b/libavcodec/aarch64/vc1dsp_neon.S
index 1ea9fa75ff..0201db4f78 100644
--- a/libavcodec/aarch64/vc1dsp_neon.S
+++ b/libavcodec/aarch64/vc1dsp_neon.S
@@ -22,7 +22,685 @@
#include "libavutil/aarch64/asm.S"
+// VC-1 8x8 inverse transform
+// On entry:
+// x0 -> array of 16-bit inverse transform coefficients, in column-major order
+// On exit:
+// array at x0 updated to hold transformed block; also now held in row-major order
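+// Both passes keep intermediates at half scale (the "t1/2"-style comments below) so they
+// fit in 16 bits; the first pass rounds with (+4) >> 3 and the second with (+64) >> 7,
+// plus an extra +1 (the "+65" comments) on the outputs formed from the t5-t1 .. t8-t4 terms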
+function ff_vc1_inv_trans_8x8_neon, export=1
+ ld1 {v1.16b, v2.16b}, [x0], #32
+ ld1 {v3.16b, v4.16b}, [x0], #32
+ ld1 {v5.16b, v6.16b}, [x0], #32
+ shl v1.8h, v1.8h, #2 // 8/2 * src[0]
+ sub x1, x0, #3*32
+ ld1 {v16.16b, v17.16b}, [x0]
+ shl v7.8h, v2.8h, #4 // 16 * src[8]
+ shl v18.8h, v2.8h, #2 // 4 * src[8]
+ shl v19.8h, v4.8h, #4 // 16 * src[24]
+ ldr d0, .Lcoeffs_it8
+ shl v5.8h, v5.8h, #2 // 8/2 * src[32]
+ shl v20.8h, v6.8h, #4 // 16 * src[40]
+ shl v21.8h, v6.8h, #2 // 4 * src[40]
+ shl v22.8h, v17.8h, #4 // 16 * src[56]
+ ssra v20.8h, v19.8h, #2 // 4 * src[24] + 16 * src[40]
+ mul v23.8h, v3.8h, v0.h[0] // 6/2 * src[16]
+ sub v19.8h, v19.8h, v21.8h // 16 * src[24] - 4 * src[40]
+ ssra v7.8h, v22.8h, #2 // 16 * src[8] + 4 * src[56]
+ sub v18.8h, v22.8h, v18.8h // - 4 * src[8] + 16 * src[56]
+ shl v3.8h, v3.8h, #3 // 16/2 * src[16]
+ mls v20.8h, v2.8h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40]
+ ssra v1.8h, v1.8h, #1 // 12/2 * src[0]
+ ssra v5.8h, v5.8h, #1 // 12/2 * src[32]
+ mla v7.8h, v4.8h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56]
+ shl v21.8h, v16.8h, #3 // 16/2 * src[48]
+ mls v19.8h, v2.8h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40]
+ sub v2.8h, v23.8h, v21.8h // t4/2 = 6/2 * src[16] - 16/2 * src[48]
+ mla v18.8h, v4.8h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56]
+ add v4.8h, v1.8h, v5.8h // t1/2 = 12/2 * src[0] + 12/2 * src[32]
+ sub v1.8h, v1.8h, v5.8h // t2/2 = 12/2 * src[0] - 12/2 * src[32]
+ mla v3.8h, v16.8h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48]
+ mla v7.8h, v6.8h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56]
+ add v5.8h, v1.8h, v2.8h // t6/2 = t2/2 + t4/2
+ sub v16.8h, v1.8h, v2.8h // t7/2 = t2/2 - t4/2
+ mla v20.8h, v17.8h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56]
+ add v21.8h, v1.8h, v2.8h // t6/2 = t2/2 + t4/2
+ add v22.8h, v4.8h, v3.8h // t5/2 = t1/2 + t3/2
+ mls v19.8h, v17.8h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56]
+ sub v17.8h, v4.8h, v3.8h // t8/2 = t1/2 - t3/2
+ add v23.8h, v4.8h, v3.8h // t5/2 = t1/2 + t3/2
+ mls v18.8h, v6.8h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56]
+ sub v1.8h, v1.8h, v2.8h // t7/2 = t2/2 - t4/2
+ sub v2.8h, v4.8h, v3.8h // t8/2 = t1/2 - t3/2
+ neg v3.8h, v7.8h // -t1
+ neg v4.8h, v20.8h // +t2
+ neg v6.8h, v19.8h // +t3
+ ssra v22.8h, v7.8h, #1 // (t5 + t1) >> 1
+ ssra v1.8h, v19.8h, #1 // (t7 - t3) >> 1
+ neg v7.8h, v18.8h // +t4
+ ssra v5.8h, v4.8h, #1 // (t6 + t2) >> 1
+ ssra v16.8h, v6.8h, #1 // (t7 + t3) >> 1
+ ssra v2.8h, v18.8h, #1 // (t8 - t4) >> 1
+ ssra v17.8h, v7.8h, #1 // (t8 + t4) >> 1
+ ssra v21.8h, v20.8h, #1 // (t6 - t2) >> 1
+ ssra v23.8h, v3.8h, #1 // (t5 - t1) >> 1
+ srshr v3.8h, v22.8h, #2 // (t5 + t1 + 4) >> 3
+ srshr v4.8h, v5.8h, #2 // (t6 + t2 + 4) >> 3
+ srshr v5.8h, v16.8h, #2 // (t7 + t3 + 4) >> 3
+ srshr v6.8h, v17.8h, #2 // (t8 + t4 + 4) >> 3
+ srshr v2.8h, v2.8h, #2 // (t8 - t4 + 4) >> 3
+ srshr v1.8h, v1.8h, #2 // (t7 - t3 + 4) >> 3
+ srshr v7.8h, v21.8h, #2 // (t6 - t2 + 4) >> 3
+ srshr v16.8h, v23.8h, #2 // (t5 - t1 + 4) >> 3
+ trn2 v17.8h, v3.8h, v4.8h
+ trn2 v18.8h, v5.8h, v6.8h
+ trn2 v19.8h, v2.8h, v1.8h
+ trn2 v20.8h, v7.8h, v16.8h
+ trn1 v21.4s, v17.4s, v18.4s
+ trn2 v17.4s, v17.4s, v18.4s
+ trn1 v18.4s, v19.4s, v20.4s
+ trn2 v19.4s, v19.4s, v20.4s
+ trn1 v3.8h, v3.8h, v4.8h
+ trn2 v4.2d, v21.2d, v18.2d
+ trn1 v20.2d, v17.2d, v19.2d
+ trn1 v5.8h, v5.8h, v6.8h
+ trn1 v1.8h, v2.8h, v1.8h
+ trn1 v2.8h, v7.8h, v16.8h
+ trn1 v6.2d, v21.2d, v18.2d
+ trn2 v7.2d, v17.2d, v19.2d
+ shl v16.8h, v20.8h, #4 // 16 * src[24]
+ shl v17.8h, v4.8h, #4 // 16 * src[40]
+ trn1 v18.4s, v3.4s, v5.4s
+ trn1 v19.4s, v1.4s, v2.4s
+ shl v21.8h, v7.8h, #4 // 16 * src[56]
+ shl v22.8h, v6.8h, #2 // 4 * src[8]
+ shl v23.8h, v4.8h, #2 // 4 * src[40]
+ trn2 v3.4s, v3.4s, v5.4s
+ trn2 v1.4s, v1.4s, v2.4s
+ shl v2.8h, v6.8h, #4 // 16 * src[8]
+ sub v5.8h, v16.8h, v23.8h // 16 * src[24] - 4 * src[40]
+ ssra v17.8h, v16.8h, #2 // 4 * src[24] + 16 * src[40]
+ sub v16.8h, v21.8h, v22.8h // - 4 * src[8] + 16 * src[56]
+ trn1 v22.2d, v18.2d, v19.2d
+ trn2 v18.2d, v18.2d, v19.2d
+ trn1 v19.2d, v3.2d, v1.2d
+ ssra v2.8h, v21.8h, #2 // 16 * src[8] + 4 * src[56]
+ mls v17.8h, v6.8h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40]
+ shl v21.8h, v22.8h, #2 // 8/2 * src[0]
+ shl v18.8h, v18.8h, #2 // 8/2 * src[32]
+ mls v5.8h, v6.8h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40]
+ shl v6.8h, v19.8h, #3 // 16/2 * src[16]
+ trn2 v1.2d, v3.2d, v1.2d
+ mla v16.8h, v20.8h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56]
+ ssra v21.8h, v21.8h, #1 // 12/2 * src[0]
+ ssra v18.8h, v18.8h, #1 // 12/2 * src[32]
+ mul v3.8h, v19.8h, v0.h[0] // 6/2 * src[16]
+ shl v19.8h, v1.8h, #3 // 16/2 * src[48]
+ mla v2.8h, v20.8h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56]
+ add v20.8h, v21.8h, v18.8h // t1/2 = 12/2 * src[0] + 12/2 * src[32]
+ mla v6.8h, v1.8h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48]
+ sub v1.8h, v21.8h, v18.8h // t2/2 = 12/2 * src[0] - 12/2 * src[32]
+ sub v3.8h, v3.8h, v19.8h // t4/2 = 6/2 * src[16] - 16/2 * src[48]
+ mla v17.8h, v7.8h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56]
+ mls v5.8h, v7.8h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56]
+ add v7.8h, v1.8h, v3.8h // t6/2 = t2/2 + t4/2
+ add v18.8h, v20.8h, v6.8h // t5/2 = t1/2 + t3/2
+ mls v16.8h, v4.8h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56]
+ sub v19.8h, v1.8h, v3.8h // t7/2 = t2/2 - t4/2
+ neg v21.8h, v17.8h // +t2
+ mla v2.8h, v4.8h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56]
+ sub v0.8h, v20.8h, v6.8h // t8/2 = t1/2 - t3/2
+ neg v4.8h, v5.8h // +t3
+ sub v22.8h, v1.8h, v3.8h // t7/2 = t2/2 - t4/2
+ sub v23.8h, v20.8h, v6.8h // t8/2 = t1/2 - t3/2
+ neg v24.8h, v16.8h // +t4
+ add v6.8h, v20.8h, v6.8h // t5/2 = t1/2 + t3/2
+ add v1.8h, v1.8h, v3.8h // t6/2 = t2/2 + t4/2
+ ssra v7.8h, v21.8h, #1 // (t6 + t2) >> 1
+ neg v3.8h, v2.8h // -t1
+ ssra v18.8h, v2.8h, #1 // (t5 + t1) >> 1
+ ssra v19.8h, v4.8h, #1 // (t7 + t3) >> 1
+ ssra v0.8h, v24.8h, #1 // (t8 + t4) >> 1
+ srsra v23.8h, v16.8h, #1 // (t8 - t4 + 1) >> 1
+ srsra v22.8h, v5.8h, #1 // (t7 - t3 + 1) >> 1
+ srsra v1.8h, v17.8h, #1 // (t6 - t2 + 1) >> 1
+ srsra v6.8h, v3.8h, #1 // (t5 - t1 + 1) >> 1
+ srshr v2.8h, v18.8h, #6 // (t5 + t1 + 64) >> 7
+ srshr v3.8h, v7.8h, #6 // (t6 + t2 + 64) >> 7
+ srshr v4.8h, v19.8h, #6 // (t7 + t3 + 64) >> 7
+ srshr v5.8h, v0.8h, #6 // (t8 + t4 + 64) >> 7
+ srshr v16.8h, v23.8h, #6 // (t8 - t4 + 65) >> 7
+ srshr v17.8h, v22.8h, #6 // (t7 - t3 + 65) >> 7
+ st1 {v2.16b, v3.16b}, [x1], #32
+ srshr v0.8h, v1.8h, #6 // (t6 - t2 + 65) >> 7
+ srshr v1.8h, v6.8h, #6 // (t5 - t1 + 65) >> 7
+ st1 {v4.16b, v5.16b}, [x1], #32
+ st1 {v16.16b, v17.16b}, [x1], #32
+ st1 {v0.16b, v1.16b}, [x1]
+ ret
+endfunc
+
+// VC-1 8x4 inverse transform
+// On entry:
+// x0 -> array of 8-bit samples, in row-major order
+// x1 = row stride for 8-bit sample array
+// x2 -> array of 16-bit inverse transform coefficients, in row-major order
+// On exit:
+// array at x0 updated by saturated addition of (narrowed) transformed block
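+// The horizontal pass is the 8-point transform (rounded with (+4) >> 3) and the vertical
+// pass is the 4-point transform (rounded with (+64) >> 7) before the add-and-saturate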
+function ff_vc1_inv_trans_8x4_neon, export=1
+ ld1 {v1.8b, v2.8b, v3.8b, v4.8b}, [x2], #32
+ mov x3, x0
+ ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x2]
+ ldr q0, .Lcoeffs_it8 // includes 4-point coefficients in upper half of vector
+ ld1 {v5.8b}, [x0], x1
+ trn2 v6.4h, v1.4h, v3.4h
+ trn2 v7.4h, v2.4h, v4.4h
+ trn1 v1.4h, v1.4h, v3.4h
+ trn1 v2.4h, v2.4h, v4.4h
+ trn2 v3.4h, v16.4h, v18.4h
+ trn2 v4.4h, v17.4h, v19.4h
+ trn1 v16.4h, v16.4h, v18.4h
+ trn1 v17.4h, v17.4h, v19.4h
+ ld1 {v18.8b}, [x0], x1
+ trn1 v19.2s, v6.2s, v3.2s
+ trn2 v3.2s, v6.2s, v3.2s
+ trn1 v6.2s, v7.2s, v4.2s
+ trn2 v4.2s, v7.2s, v4.2s
+ trn1 v7.2s, v1.2s, v16.2s
+ trn1 v20.2s, v2.2s, v17.2s
+ shl v21.4h, v19.4h, #4 // 16 * src[1]
+ trn2 v1.2s, v1.2s, v16.2s
+ shl v16.4h, v3.4h, #4 // 16 * src[3]
+ trn2 v2.2s, v2.2s, v17.2s
+ shl v17.4h, v6.4h, #4 // 16 * src[5]
+ ld1 {v22.8b}, [x0], x1
+ shl v23.4h, v4.4h, #4 // 16 * src[7]
+ mul v24.4h, v1.4h, v0.h[0] // 6/2 * src[2]
+ ld1 {v25.8b}, [x0]
+ shl v26.4h, v19.4h, #2 // 4 * src[1]
+ shl v27.4h, v6.4h, #2 // 4 * src[5]
+ ssra v21.4h, v23.4h, #2 // 16 * src[1] + 4 * src[7]
+ ssra v17.4h, v16.4h, #2 // 4 * src[3] + 16 * src[5]
+ sub v23.4h, v23.4h, v26.4h // - 4 * src[1] + 16 * src[7]
+ sub v16.4h, v16.4h, v27.4h // 16 * src[3] - 4 * src[5]
+ shl v7.4h, v7.4h, #2 // 8/2 * src[0]
+ shl v20.4h, v20.4h, #2 // 8/2 * src[4]
+ mla v21.4h, v3.4h, v0.h[2] // 16 * src[1] + 15 * src[3] + 4 * src[7]
+ shl v1.4h, v1.4h, #3 // 16/2 * src[2]
+ mls v17.4h, v19.4h, v0.h[2] // - 15 * src[1] + 4 * src[3] + 16 * src[5]
+ ssra v7.4h, v7.4h, #1 // 12/2 * src[0]
+ mls v16.4h, v19.4h, v0.h[1] // - 9 * src[1] + 16 * src[3] - 4 * src[5]
+ ssra v20.4h, v20.4h, #1 // 12/2 * src[4]
+ mla v23.4h, v3.4h, v0.h[1] // - 4 * src[1] + 9 * src[3] + 16 * src[7]
+ shl v3.4h, v2.4h, #3 // 16/2 * src[6]
+ mla v1.4h, v2.4h, v0.h[0] // t3/2 = 16/2 * src[2] + 6/2 * src[6]
+ mla v21.4h, v6.4h, v0.h[1] // t1 = 16 * src[1] + 15 * src[3] + 9 * src[5] + 4 * src[7]
+ mla v17.4h, v4.4h, v0.h[1] // -t2 = - 15 * src[1] + 4 * src[3] + 16 * src[5] + 9 * src[7]
+ sub v2.4h, v24.4h, v3.4h // t4/2 = 6/2 * src[2] - 16/2 * src[6]
+ mls v16.4h, v4.4h, v0.h[2] // -t3 = - 9 * src[1] + 16 * src[3] - 4 * src[5] - 15 * src[7]
+ add v3.4h, v7.4h, v20.4h // t1/2 = 12/2 * src[0] + 12/2 * src[4]
+ mls v23.4h, v6.4h, v0.h[2] // -t4 = - 4 * src[1] + 9 * src[3] - 15 * src[5] + 16 * src[7]
+ sub v4.4h, v7.4h, v20.4h // t2/2 = 12/2 * src[0] - 12/2 * src[4]
+ neg v6.4h, v21.4h // -t1
+ add v7.4h, v3.4h, v1.4h // t5/2 = t1/2 + t3/2
+ sub v19.4h, v3.4h, v1.4h // t8/2 = t1/2 - t3/2
+ add v20.4h, v4.4h, v2.4h // t6/2 = t2/2 + t4/2
+ sub v24.4h, v4.4h, v2.4h // t7/2 = t2/2 - t4/2
+ add v26.4h, v3.4h, v1.4h // t5/2 = t1/2 + t3/2
+ add v27.4h, v4.4h, v2.4h // t6/2 = t2/2 + t4/2
+ sub v2.4h, v4.4h, v2.4h // t7/2 = t2/2 - t4/2
+ sub v1.4h, v3.4h, v1.4h // t8/2 = t1/2 - t3/2
+ neg v3.4h, v17.4h // +t2
+ neg v4.4h, v16.4h // +t3
+ neg v28.4h, v23.4h // +t4
+ ssra v7.4h, v21.4h, #1 // (t5 + t1) >> 1
+ ssra v1.4h, v23.4h, #1 // (t8 - t4) >> 1
+ ssra v20.4h, v3.4h, #1 // (t6 + t2) >> 1
+ ssra v24.4h, v4.4h, #1 // (t7 + t3) >> 1
+ ssra v19.4h, v28.4h, #1 // (t8 + t4) >> 1
+ ssra v2.4h, v16.4h, #1 // (t7 - t3) >> 1
+ ssra v27.4h, v17.4h, #1 // (t6 - t2) >> 1
+ ssra v26.4h, v6.4h, #1 // (t5 - t1) >> 1
+ trn1 v1.2d, v7.2d, v1.2d
+ trn1 v2.2d, v20.2d, v2.2d
+ trn1 v3.2d, v24.2d, v27.2d
+ trn1 v4.2d, v19.2d, v26.2d
+ srshr v1.8h, v1.8h, #2 // (t5 + t1 + 4) >> 3, (t8 - t4 + 4) >> 3
+ srshr v2.8h, v2.8h, #2 // (t6 + t2 + 4) >> 3, (t7 - t3 + 4) >> 3
+ srshr v3.8h, v3.8h, #2 // (t7 + t3 + 4) >> 3, (t6 - t2 + 4) >> 3
+ srshr v4.8h, v4.8h, #2 // (t8 + t4 + 4) >> 3, (t5 - t1 + 4) >> 3
+ trn2 v6.8h, v1.8h, v2.8h
+ trn1 v1.8h, v1.8h, v2.8h
+ trn2 v2.8h, v3.8h, v4.8h
+ trn1 v3.8h, v3.8h, v4.8h
+ trn2 v4.4s, v6.4s, v2.4s
+ trn1 v7.4s, v1.4s, v3.4s
+ trn2 v1.4s, v1.4s, v3.4s
+ mul v3.8h, v4.8h, v0.h[5] // 22/2 * src[24]
+ trn1 v2.4s, v6.4s, v2.4s
+ mul v4.8h, v4.8h, v0.h[4] // 10/2 * src[24]
+ mul v6.8h, v7.8h, v0.h[6] // 17 * src[0]
+ mul v1.8h, v1.8h, v0.h[6] // 17 * src[16]
+ mls v3.8h, v2.8h, v0.h[4] // t4/2 = - 10/2 * src[8] + 22/2 * src[24]
+ mla v4.8h, v2.8h, v0.h[5] // t3/2 = 22/2 * src[8] + 10/2 * src[24]
+ add v0.8h, v6.8h, v1.8h // t1 = 17 * src[0] + 17 * src[16]
+ sub v1.8h, v6.8h, v1.8h // t2 = 17 * src[0] - 17 * src[16]
+ neg v2.8h, v3.8h // -t4/2
+ neg v6.8h, v4.8h // -t3/2
+ ssra v4.8h, v0.8h, #1 // (t1 + t3) >> 1
+ ssra v2.8h, v1.8h, #1 // (t2 - t4) >> 1
+ ssra v3.8h, v1.8h, #1 // (t2 + t4) >> 1
+ ssra v6.8h, v0.8h, #1 // (t1 - t3) >> 1
+ srshr v0.8h, v4.8h, #6 // (t1 + t3 + 64) >> 7
+ srshr v1.8h, v2.8h, #6 // (t2 - t4 + 64) >> 7
+ srshr v2.8h, v3.8h, #6 // (t2 + t4 + 64) >> 7
+ srshr v3.8h, v6.8h, #6 // (t1 - t3 + 64) >> 7
+ uaddw v0.8h, v0.8h, v5.8b
+ uaddw v1.8h, v1.8h, v18.8b
+ uaddw v2.8h, v2.8h, v22.8b
+ uaddw v3.8h, v3.8h, v25.8b
+ sqxtun v0.8b, v0.8h
+ sqxtun v1.8b, v1.8h
+ sqxtun v2.8b, v2.8h
+ sqxtun v3.8b, v3.8h
+ st1 {v0.8b}, [x3], x1
+ st1 {v1.8b}, [x3], x1
+ st1 {v2.8b}, [x3], x1
+ st1 {v3.8b}, [x3]
+ ret
+endfunc
+
+// VC-1 4x8 inverse transform
+// On entry:
+// x0 -> array of 8-bit samples, in row-major order
+// x1 = row stride for 8-bit sample array
+// x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients)
+// On exit:
+// array at x0 updated by saturated addition of (narrowed) transformed block
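+// The horizontal pass is the 4-point transform (rounded with (+4) >> 3) and the vertical
+// pass is the 8-point transform (rounded with (+64) >> 7, +65 for the lower four rows)
+// before the add-and-saturate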
+function ff_vc1_inv_trans_4x8_neon, export=1
+ mov x3, #16
+ ldr q0, .Lcoeffs_it8 // includes 4-point coefficients in upper half of vector
+ mov x4, x0
+ ld1 {v1.d}[0], [x2], x3 // 00 01 02 03
+ ld1 {v2.d}[0], [x2], x3 // 10 11 12 13
+ ld1 {v3.d}[0], [x2], x3 // 20 21 22 23
+ ld1 {v4.d}[0], [x2], x3 // 30 31 32 33
+ ld1 {v1.d}[1], [x2], x3 // 40 41 42 43
+ ld1 {v2.d}[1], [x2], x3 // 50 51 52 53
+ ld1 {v3.d}[1], [x2], x3 // 60 61 62 63
+ ld1 {v4.d}[1], [x2] // 70 71 72 73
+ ld1 {v5.s}[0], [x0], x1
+ ld1 {v6.s}[0], [x0], x1
+ ld1 {v7.s}[0], [x0], x1
+ trn2 v16.8h, v1.8h, v2.8h // 01 11 03 13 41 51 43 53
+ trn1 v1.8h, v1.8h, v2.8h // 00 10 02 12 40 50 42 52
+ trn2 v2.8h, v3.8h, v4.8h // 21 31 23 33 61 71 63 73
+ trn1 v3.8h, v3.8h, v4.8h // 20 30 22 32 60 70 62 72
+ ld1 {v4.s}[0], [x0], x1
+ trn2 v17.4s, v16.4s, v2.4s // 03 13 23 33 43 53 63 73
+ trn1 v18.4s, v1.4s, v3.4s // 00 10 20 30 40 50 60 70
+ trn1 v2.4s, v16.4s, v2.4s // 01 11 21 31 41 51 61 71
+ mul v16.8h, v17.8h, v0.h[4] // 10/2 * src[3]
+ ld1 {v5.s}[1], [x0], x1
+ mul v17.8h, v17.8h, v0.h[5] // 22/2 * src[3]
+ ld1 {v6.s}[1], [x0], x1
+ trn2 v1.4s, v1.4s, v3.4s // 02 12 22 32 42 52 62 72
+ mul v3.8h, v18.8h, v0.h[6] // 17 * src[0]
+ ld1 {v7.s}[1], [x0], x1
+ mul v1.8h, v1.8h, v0.h[6] // 17 * src[2]
+ ld1 {v4.s}[1], [x0]
+ mla v16.8h, v2.8h, v0.h[5] // t3/2 = 22/2 * src[1] + 10/2 * src[3]
+ mls v17.8h, v2.8h, v0.h[4] // t4/2 = - 10/2 * src[1] + 22/2 * src[3]
+ add v2.8h, v3.8h, v1.8h // t1 = 17 * src[0] + 17 * src[2]
+ sub v1.8h, v3.8h, v1.8h // t2 = 17 * src[0] - 17 * src[2]
+ neg v3.8h, v16.8h // -t3/2
+ ssra v16.8h, v2.8h, #1 // (t1 + t3) >> 1
+ neg v18.8h, v17.8h // -t4/2
+ ssra v17.8h, v1.8h, #1 // (t2 + t4) >> 1
+ ssra v3.8h, v2.8h, #1 // (t1 - t3) >> 1
+ ssra v18.8h, v1.8h, #1 // (t2 - t4) >> 1
+ srshr v1.8h, v16.8h, #2 // (t1 + t3 + 4) >> 3
+ srshr v2.8h, v17.8h, #2 // (t2 + t4 + 4) >> 3
+ srshr v3.8h, v3.8h, #2 // (t1 - t3 + 4) >> 3
+ srshr v16.8h, v18.8h, #2 // (t2 - t4 + 4) >> 3
+ trn2 v17.8h, v2.8h, v3.8h // 12 13 32 33 52 53 72 73
+ trn2 v18.8h, v1.8h, v16.8h // 10 11 30 31 50 51 70 71
+ trn1 v1.8h, v1.8h, v16.8h // 00 01 20 21 40 41 60 61
+ trn1 v2.8h, v2.8h, v3.8h // 02 03 22 23 42 43 62 63
+ trn1 v3.4s, v18.4s, v17.4s // 10 11 12 13 50 51 52 53
+ trn2 v16.4s, v18.4s, v17.4s // 30 31 32 33 70 71 72 73
+ trn1 v17.4s, v1.4s, v2.4s // 00 01 02 03 40 41 42 43
+ mov d18, v3.d[1] // 50 51 52 53
+ shl v19.4h, v3.4h, #4 // 16 * src[8]
+ mov d20, v16.d[1] // 70 71 72 73
+ shl v21.4h, v16.4h, #4 // 16 * src[24]
+ mov d22, v17.d[1] // 40 41 42 43
+ shl v23.4h, v3.4h, #2 // 4 * src[8]
+ shl v24.4h, v18.4h, #4 // 16 * src[40]
+ shl v25.4h, v20.4h, #4 // 16 * src[56]
+ shl v26.4h, v18.4h, #2 // 4 * src[40]
+ trn2 v1.4s, v1.4s, v2.4s // 20 21 22 23 60 61 62 63
+ ssra v24.4h, v21.4h, #2 // 4 * src[24] + 16 * src[40]
+ sub v2.4h, v25.4h, v23.4h // - 4 * src[8] + 16 * src[56]
+ shl v17.4h, v17.4h, #2 // 8/2 * src[0]
+ sub v21.4h, v21.4h, v26.4h // 16 * src[24] - 4 * src[40]
+ shl v22.4h, v22.4h, #2 // 8/2 * src[32]
+ mov d23, v1.d[1] // 60 61 62 63
+ ssra v19.4h, v25.4h, #2 // 16 * src[8] + 4 * src[56]
+ mul v25.4h, v1.4h, v0.h[0] // 6/2 * src[16]
+ shl v1.4h, v1.4h, #3 // 16/2 * src[16]
+ mls v24.4h, v3.4h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40]
+ ssra v17.4h, v17.4h, #1 // 12/2 * src[0]
+ mls v21.4h, v3.4h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40]
+ ssra v22.4h, v22.4h, #1 // 12/2 * src[32]
+ mla v2.4h, v16.4h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56]
+ shl v3.4h, v23.4h, #3 // 16/2 * src[48]
+ mla v19.4h, v16.4h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56]
+ mla v1.4h, v23.4h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48]
+ mla v24.4h, v20.4h, v0.h[1] // -t2 = - 15 * src[8] + 4 * src[24] + 16 * src[40] + 9 * src[56]
+ add v16.4h, v17.4h, v22.4h // t1/2 = 12/2 * src[0] + 12/2 * src[32]
+ sub v3.4h, v25.4h, v3.4h // t4/2 = 6/2 * src[16] - 16/2 * src[48]
+ sub v17.4h, v17.4h, v22.4h // t2/2 = 12/2 * src[0] - 12/2 * src[32]
+ mls v21.4h, v20.4h, v0.h[2] // -t3 = - 9 * src[8] + 16 * src[24] - 4 * src[40] - 15 * src[56]
+ mla v19.4h, v18.4h, v0.h[1] // t1 = 16 * src[8] + 15 * src[24] + 9 * src[40] + 4 * src[56]
+ add v20.4h, v16.4h, v1.4h // t5/2 = t1/2 + t3/2
+ mls v2.4h, v18.4h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56]
+ sub v0.4h, v16.4h, v1.4h // t8/2 = t1/2 - t3/2
+ add v18.4h, v17.4h, v3.4h // t6/2 = t2/2 + t4/2
+ sub v22.4h, v17.4h, v3.4h // t7/2 = t2/2 - t4/2
+ neg v23.4h, v24.4h // +t2
+ sub v25.4h, v17.4h, v3.4h // t7/2 = t2/2 - t4/2
+ add v3.4h, v17.4h, v3.4h // t6/2 = t2/2 + t4/2
+ neg v17.4h, v21.4h // +t3
+ sub v26.4h, v16.4h, v1.4h // t8/2 = t1/2 - t3/2
+ add v1.4h, v16.4h, v1.4h // t5/2 = t1/2 + t3/2
+ neg v16.4h, v19.4h // -t1
+ neg v27.4h, v2.4h // +t4
+ ssra v20.4h, v19.4h, #1 // (t5 + t1) >> 1
+ srsra v0.4h, v2.4h, #1 // (t8 - t4 + 1) >> 1
+ ssra v18.4h, v23.4h, #1 // (t6 + t2) >> 1
+ srsra v22.4h, v21.4h, #1 // (t7 - t3 + 1) >> 1
+ ssra v25.4h, v17.4h, #1 // (t7 + t3) >> 1
+ srsra v3.4h, v24.4h, #1 // (t6 - t2 + 1) >> 1
+ ssra v26.4h, v27.4h, #1 // (t8 + t4) >> 1
+ srsra v1.4h, v16.4h, #1 // (t5 - t1 + 1) >> 1
+ trn1 v0.2d, v20.2d, v0.2d
+ trn1 v2.2d, v18.2d, v22.2d
+ trn1 v3.2d, v25.2d, v3.2d
+ trn1 v1.2d, v26.2d, v1.2d
+ srshr v0.8h, v0.8h, #6 // (t5 + t1 + 64) >> 7, (t8 - t4 + 65) >> 7
+ srshr v2.8h, v2.8h, #6 // (t6 + t2 + 64) >> 7, (t7 - t3 + 65) >> 7
+ srshr v3.8h, v3.8h, #6 // (t7 + t3 + 64) >> 7, (t6 - t2 + 65) >> 7
+ srshr v1.8h, v1.8h, #6 // (t8 + t4 + 64) >> 7, (t5 - t1 + 65) >> 7
+ uaddw v0.8h, v0.8h, v5.8b
+ uaddw v2.8h, v2.8h, v6.8b
+ uaddw v3.8h, v3.8h, v7.8b
+ uaddw v1.8h, v1.8h, v4.8b
+ sqxtun v0.8b, v0.8h
+ sqxtun v2.8b, v2.8h
+ sqxtun v3.8b, v3.8h
+ sqxtun v1.8b, v1.8h
+ st1 {v0.s}[0], [x4], x1
+ st1 {v2.s}[0], [x4], x1
+ st1 {v3.s}[0], [x4], x1
+ st1 {v1.s}[0], [x4], x1
+ st1 {v0.s}[1], [x4], x1
+ st1 {v2.s}[1], [x4], x1
+ st1 {v3.s}[1], [x4], x1
+ st1 {v1.s}[1], [x4]
+ ret
+endfunc
+
+// VC-1 4x4 inverse transform
+// On entry:
+// x0 -> array of 8-bit samples, in row-major order
+// x1 = row stride for 8-bit sample array
+// x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients)
+// On exit:
+// array at x0 updated by saturated addition of (narrowed) transformed block
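+// Both passes are the 4-point transform: the horizontal pass rounds with (+4) >> 3 and the
+// vertical pass with (+64) >> 7 before the add-and-saturate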
+function ff_vc1_inv_trans_4x4_neon, export=1
+ mov x3, #16
+ ldr d0, .Lcoeffs_it4
+ mov x4, x0
+ ld1 {v1.d}[0], [x2], x3 // 00 01 02 03
+ ld1 {v2.d}[0], [x2], x3 // 10 11 12 13
+ ld1 {v3.d}[0], [x2], x3 // 20 21 22 23
+ ld1 {v4.d}[0], [x2] // 30 31 32 33
+ ld1 {v5.s}[0], [x0], x1
+ ld1 {v5.s}[1], [x0], x1
+ ld1 {v6.s}[0], [x0], x1
+ trn2 v7.4h, v1.4h, v2.4h // 01 11 03 13
+ trn1 v1.4h, v1.4h, v2.4h // 00 10 02 12
+ ld1 {v6.s}[1], [x0]
+ trn2 v2.4h, v3.4h, v4.4h // 21 31 23 33
+ trn1 v3.4h, v3.4h, v4.4h // 20 30 22 32
+ trn2 v4.2s, v7.2s, v2.2s // 03 13 23 33
+ trn1 v16.2s, v1.2s, v3.2s // 00 10 20 30
+ trn1 v2.2s, v7.2s, v2.2s // 01 11 21 31
+ trn2 v1.2s, v1.2s, v3.2s // 02 12 22 32
+ mul v3.4h, v4.4h, v0.h[0] // 10/2 * src[3]
+ mul v4.4h, v4.4h, v0.h[1] // 22/2 * src[3]
+ mul v7.4h, v16.4h, v0.h[2] // 17 * src[0]
+ mul v1.4h, v1.4h, v0.h[2] // 17 * src[2]
+ mla v3.4h, v2.4h, v0.h[1] // t3/2 = 22/2 * src[1] + 10/2 * src[3]
+ mls v4.4h, v2.4h, v0.h[0] // t4/2 = - 10/2 * src[1] + 22/2 * src[3]
+ add v2.4h, v7.4h, v1.4h // t1 = 17 * src[0] + 17 * src[2]
+ sub v1.4h, v7.4h, v1.4h // t2 = 17 * src[0] - 17 * src[2]
+ neg v7.4h, v3.4h // -t3/2
+ neg v16.4h, v4.4h // -t4/2
+ ssra v3.4h, v2.4h, #1 // (t1 + t3) >> 1
+ ssra v4.4h, v1.4h, #1 // (t2 + t4) >> 1
+ ssra v16.4h, v1.4h, #1 // (t2 - t4) >> 1
+ ssra v7.4h, v2.4h, #1 // (t1 - t3) >> 1
+ srshr v1.4h, v3.4h, #2 // (t1 + t3 + 4) >> 3
+ srshr v2.4h, v4.4h, #2 // (t2 + t4 + 4) >> 3
+ srshr v3.4h, v16.4h, #2 // (t2 - t4 + 4) >> 3
+ srshr v4.4h, v7.4h, #2 // (t1 - t3 + 4) >> 3
+ trn2 v7.4h, v1.4h, v3.4h // 10 11 30 31
+ trn1 v1.4h, v1.4h, v3.4h // 00 01 20 21
+ trn2 v3.4h, v2.4h, v4.4h // 12 13 32 33
+ trn1 v2.4h, v2.4h, v4.4h // 02 03 22 23
+ trn2 v4.2s, v7.2s, v3.2s // 30 31 32 33
+ trn1 v16.2s, v1.2s, v2.2s // 00 01 02 03
+ trn1 v3.2s, v7.2s, v3.2s // 10 11 12 13
+ trn2 v1.2s, v1.2s, v2.2s // 20 21 22 23
+ mul v2.4h, v4.4h, v0.h[1] // 22/2 * src[24]
+ mul v4.4h, v4.4h, v0.h[0] // 10/2 * src[24]
+ mul v7.4h, v16.4h, v0.h[2] // 17 * src[0]
+ mul v1.4h, v1.4h, v0.h[2] // 17 * src[16]
+ mls v2.4h, v3.4h, v0.h[0] // t4/2 = - 10/2 * src[8] + 22/2 * src[24]
+ mla v4.4h, v3.4h, v0.h[1] // t3/2 = 22/2 * src[8] + 10/2 * src[24]
+ add v0.4h, v7.4h, v1.4h // t1 = 17 * src[0] + 17 * src[16]
+ sub v1.4h, v7.4h, v1.4h // t2 = 17 * src[0] - 17 * src[16]
+ neg v3.4h, v2.4h // -t4/2
+ neg v7.4h, v4.4h // -t3/2
+ ssra v4.4h, v0.4h, #1 // (t1 + t3) >> 1
+ ssra v3.4h, v1.4h, #1 // (t2 - t4) >> 1
+ ssra v2.4h, v1.4h, #1 // (t2 + t4) >> 1
+ ssra v7.4h, v0.4h, #1 // (t1 - t3) >> 1
+ trn1 v0.2d, v4.2d, v3.2d
+ trn1 v1.2d, v2.2d, v7.2d
+ srshr v0.8h, v0.8h, #6 // (t1 + t3 + 64) >> 7, (t2 - t4 + 64) >> 7
+ srshr v1.8h, v1.8h, #6 // (t2 + t4 + 64) >> 7, (t1 - t3 + 64) >> 7
+ uaddw v0.8h, v0.8h, v5.8b
+ uaddw v1.8h, v1.8h, v6.8b
+ sqxtun v0.8b, v0.8h
+ sqxtun v1.8b, v1.8h
+ st1 {v0.s}[0], [x4], x1
+ st1 {v0.s}[1], [x4], x1
+ st1 {v1.s}[0], [x4], x1
+ st1 {v1.s}[1], [x4]
+ ret
+endfunc
+
+// VC-1 8x8 inverse transform, DC case
+// On entry:
+// x0 -> array of 8-bit samples, in row-major order
+// x1 = row stride for 8-bit sample array
+// x2 -> 16-bit inverse transform DC coefficient
+// On exit:
+// array at x0 updated by saturated addition of (narrowed) transformed block
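+// The whole block reduces to a single correction added to every sample:
+// dc = (3 * ((3 * coeff + 1) >> 1) + 16) >> 5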
+function ff_vc1_inv_trans_8x8_dc_neon, export=1
+ ldrsh w2, [x2]
+ mov x3, x0
+ ld1 {v0.8b}, [x0], x1
+ ld1 {v1.8b}, [x0], x1
+ ld1 {v2.8b}, [x0], x1
+ add w2, w2, w2, lsl #1
+ ld1 {v3.8b}, [x0], x1
+ ld1 {v4.8b}, [x0], x1
+ add w2, w2, #1
+ ld1 {v5.8b}, [x0], x1
+ asr w2, w2, #1
+ ld1 {v6.8b}, [x0], x1
+ add w2, w2, w2, lsl #1
+ ld1 {v7.8b}, [x0]
+ add w0, w2, #16
+ asr w0, w0, #5
+ dup v16.8h, w0
+ uaddw v0.8h, v16.8h, v0.8b
+ uaddw v1.8h, v16.8h, v1.8b
+ uaddw v2.8h, v16.8h, v2.8b
+ uaddw v3.8h, v16.8h, v3.8b
+ uaddw v4.8h, v16.8h, v4.8b
+ uaddw v5.8h, v16.8h, v5.8b
+ sqxtun v0.8b, v0.8h
+ uaddw v6.8h, v16.8h, v6.8b
+ sqxtun v1.8b, v1.8h
+ uaddw v7.8h, v16.8h, v7.8b
+ sqxtun v2.8b, v2.8h
+ sqxtun v3.8b, v3.8h
+ sqxtun v4.8b, v4.8h
+ st1 {v0.8b}, [x3], x1
+ sqxtun v0.8b, v5.8h
+ st1 {v1.8b}, [x3], x1
+ sqxtun v1.8b, v6.8h
+ st1 {v2.8b}, [x3], x1
+ sqxtun v2.8b, v7.8h
+ st1 {v3.8b}, [x3], x1
+ st1 {v4.8b}, [x3], x1
+ st1 {v0.8b}, [x3], x1
+ st1 {v1.8b}, [x3], x1
+ st1 {v2.8b}, [x3]
+ ret
+endfunc
+
+// VC-1 8x4 inverse transform, DC case
+// On entry:
+// x0 -> array of 8-bit samples, in row-major order
+// x1 = row stride for 8-bit sample array
+// x2 -> 16-bit inverse transform DC coefficient
+// On exit:
+// array at x0 updated by saturated addition of (narrowed) transformed block
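+// The DC correction added to every sample is:
+// dc = (17 * ((3 * coeff + 1) >> 1) + 64) >> 7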
+function ff_vc1_inv_trans_8x4_dc_neon, export=1
+ ldrsh w2, [x2]
+ mov x3, x0
+ ld1 {v0.8b}, [x0], x1
+ ld1 {v1.8b}, [x0], x1
+ ld1 {v2.8b}, [x0], x1
+ add w2, w2, w2, lsl #1
+ ld1 {v3.8b}, [x0]
+ add w0, w2, #1
+ asr w0, w0, #1
+ add w0, w0, w0, lsl #4
+ add w0, w0, #64
+ asr w0, w0, #7
+ dup v4.8h, w0
+ uaddw v0.8h, v4.8h, v0.8b
+ uaddw v1.8h, v4.8h, v1.8b
+ uaddw v2.8h, v4.8h, v2.8b
+ uaddw v3.8h, v4.8h, v3.8b
+ sqxtun v0.8b, v0.8h
+ sqxtun v1.8b, v1.8h
+ sqxtun v2.8b, v2.8h
+ sqxtun v3.8b, v3.8h
+ st1 {v0.8b}, [x3], x1
+ st1 {v1.8b}, [x3], x1
+ st1 {v2.8b}, [x3], x1
+ st1 {v3.8b}, [x3]
+ ret
+endfunc
+
+// VC-1 4x8 inverse transform, DC case
+// On entry:
+// x0 -> array of 8-bit samples, in row-major order
+// x1 = row stride for 8-bit sample array
+// x2 -> 16-bit inverse transform DC coefficient
+// On exit:
+// array at x0 updated by saturated addition of (narrowed) transformed block
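+// The DC correction added to every sample is:
+// dc = (3 * ((17 * coeff + 4) >> 3) + 16) >> 5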
+function ff_vc1_inv_trans_4x8_dc_neon, export=1
+ ldrsh w2, [x2]
+ mov x3, x0
+ ld1 {v0.s}[0], [x0], x1
+ ld1 {v1.s}[0], [x0], x1
+ ld1 {v2.s}[0], [x0], x1
+ add w2, w2, w2, lsl #4
+ ld1 {v3.s}[0], [x0], x1
+ add w2, w2, #4
+ asr w2, w2, #3
+ add w2, w2, w2, lsl #1
+ ld1 {v0.s}[1], [x0], x1
+ add w2, w2, #16
+ asr w2, w2, #5
+ dup v4.8h, w2
+ ld1 {v1.s}[1], [x0], x1
+ ld1 {v2.s}[1], [x0], x1
+ ld1 {v3.s}[1], [x0]
+ uaddw v0.8h, v4.8h, v0.8b
+ uaddw v1.8h, v4.8h, v1.8b
+ uaddw v2.8h, v4.8h, v2.8b
+ uaddw v3.8h, v4.8h, v3.8b
+ sqxtun v0.8b, v0.8h
+ sqxtun v1.8b, v1.8h
+ sqxtun v2.8b, v2.8h
+ sqxtun v3.8b, v3.8h
+ st1 {v0.s}[0], [x3], x1
+ st1 {v1.s}[0], [x3], x1
+ st1 {v2.s}[0], [x3], x1
+ st1 {v3.s}[0], [x3], x1
+ st1 {v0.s}[1], [x3], x1
+ st1 {v1.s}[1], [x3], x1
+ st1 {v2.s}[1], [x3], x1
+ st1 {v3.s}[1], [x3]
+ ret
+endfunc
+
+// VC-1 4x4 inverse transform, DC case
+// On entry:
+// x0 -> array of 8-bit samples, in row-major order
+// x1 = row stride for 8-bit sample array
+// x2 -> 16-bit inverse transform DC coefficient
+// On exit:
+// array at x0 updated by saturated addition of (narrowed) transformed block
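+// The DC correction added to every sample is:
+// dc = (17 * ((17 * coeff + 4) >> 3) + 64) >> 7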
+function ff_vc1_inv_trans_4x4_dc_neon, export=1
+ ldrsh w2, [x2]
+ mov x3, x0
+ ld1 {v0.s}[0], [x0], x1
+ ld1 {v1.s}[0], [x0], x1
+ ld1 {v0.s}[1], [x0], x1
+ add w2, w2, w2, lsl #4
+ ld1 {v1.s}[1], [x0]
+ add w0, w2, #4
+ asr w0, w0, #3
+ add w0, w0, w0, lsl #4
+ add w0, w0, #64
+ asr w0, w0, #7
+ dup v2.8h, w0
+ uaddw v0.8h, v2.8h, v0.8b
+ uaddw v1.8h, v2.8h, v1.8b
+ sqxtun v0.8b, v0.8h
+ sqxtun v1.8b, v1.8h
+ st1 {v0.s}[0], [x3], x1
+ st1 {v1.s}[0], [x3], x1
+ st1 {v0.s}[1], [x3], x1
+ st1 {v1.s}[1], [x3]
+ ret
+endfunc
+
.align 5
+.Lcoeffs_it8:
+.quad 0x000F00090003 // halfwords 3, 9, 15: the 6/2, 9 and 15 factors used by the 8-point pass
+.Lcoeffs_it4:
+.quad 0x0011000B0005 // halfwords 5, 11, 17: the 10/2, 22/2 and 17 factors used by the 4-point pass
.Lcoeffs:
.quad 0x00050002