summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMartin Storsjö <martin@martin.st>2021-08-05 01:18:50 +0300
committerMartin Storsjö <martin@martin.st>2021-08-08 23:18:43 +0300
commitc60b76d0c880037309e78badd62943455f715e6e (patch)
tree769e7418e8f91dfcf838add5406028b79b86110d
parente86ec831b0e43909401035588be04778c4bd17aa (diff)
aarch64: h264dsp: Fix indentation of some functions to match the rest
Signed-off-by: Martin Storsjö <martin@martin.st>
-rw-r--r--libavcodec/aarch64/h264dsp_neon.S520
1 files changed, 260 insertions, 260 deletions
diff --git a/libavcodec/aarch64/h264dsp_neon.S b/libavcodec/aarch64/h264dsp_neon.S
index 997167ca88..997082498f 100644
--- a/libavcodec/aarch64/h264dsp_neon.S
+++ b/libavcodec/aarch64/h264dsp_neon.S
@@ -182,198 +182,198 @@ endfunc
.macro h264_loop_filter_start_intra
- orr w4, w2, w3
- cbnz w4, 1f
- ret
+ orr w4, w2, w3
+ cbnz w4, 1f
+ ret
1:
- dup v30.16b, w2 // alpha
- dup v31.16b, w3 // beta
+ dup v30.16b, w2 // alpha
+ dup v31.16b, w3 // beta
.endm
.macro h264_loop_filter_luma_intra
- uabd v16.16b, v7.16b, v0.16b // abs(p0 - q0)
- uabd v17.16b, v6.16b, v7.16b // abs(p1 - p0)
- uabd v18.16b, v1.16b, v0.16b // abs(q1 - q0)
- cmhi v19.16b, v30.16b, v16.16b // < alpha
- cmhi v17.16b, v31.16b, v17.16b // < beta
- cmhi v18.16b, v31.16b, v18.16b // < beta
-
- movi v29.16b, #2
- ushr v30.16b, v30.16b, #2 // alpha >> 2
- add v30.16b, v30.16b, v29.16b // (alpha >> 2) + 2
- cmhi v16.16b, v30.16b, v16.16b // < (alpha >> 2) + 2
-
- and v19.16b, v19.16b, v17.16b
- and v19.16b, v19.16b, v18.16b
- shrn v20.8b, v19.8h, #4
- mov x4, v20.d[0]
- cbz x4, 9f
-
- ushll v20.8h, v6.8b, #1
- ushll v22.8h, v1.8b, #1
- ushll2 v21.8h, v6.16b, #1
- ushll2 v23.8h, v1.16b, #1
- uaddw v20.8h, v20.8h, v7.8b
- uaddw v22.8h, v22.8h, v0.8b
- uaddw2 v21.8h, v21.8h, v7.16b
- uaddw2 v23.8h, v23.8h, v0.16b
- uaddw v20.8h, v20.8h, v1.8b
- uaddw v22.8h, v22.8h, v6.8b
- uaddw2 v21.8h, v21.8h, v1.16b
- uaddw2 v23.8h, v23.8h, v6.16b
-
- rshrn v24.8b, v20.8h, #2 // p0'_1
- rshrn v25.8b, v22.8h, #2 // q0'_1
- rshrn2 v24.16b, v21.8h, #2 // p0'_1
- rshrn2 v25.16b, v23.8h, #2 // q0'_1
-
- uabd v17.16b, v5.16b, v7.16b // abs(p2 - p0)
- uabd v18.16b, v2.16b, v0.16b // abs(q2 - q0)
- cmhi v17.16b, v31.16b, v17.16b // < beta
- cmhi v18.16b, v31.16b, v18.16b // < beta
-
- and v17.16b, v16.16b, v17.16b // if_2 && if_3
- and v18.16b, v16.16b, v18.16b // if_2 && if_4
-
- not v30.16b, v17.16b
- not v31.16b, v18.16b
-
- and v30.16b, v30.16b, v19.16b // if_1 && !(if_2 && if_3)
- and v31.16b, v31.16b, v19.16b // if_1 && !(if_2 && if_4)
-
- and v17.16b, v19.16b, v17.16b // if_1 && if_2 && if_3
- and v18.16b, v19.16b, v18.16b // if_1 && if_2 && if_4
-
- //calc p, v7, v6, v5, v4, v17, v7, v6, v5, v4
- uaddl v26.8h, v5.8b, v7.8b
- uaddl2 v27.8h, v5.16b, v7.16b
- uaddw v26.8h, v26.8h, v0.8b
- uaddw2 v27.8h, v27.8h, v0.16b
- add v20.8h, v20.8h, v26.8h
- add v21.8h, v21.8h, v27.8h
- uaddw v20.8h, v20.8h, v0.8b
- uaddw2 v21.8h, v21.8h, v0.16b
- rshrn v20.8b, v20.8h, #3 // p0'_2
- rshrn2 v20.16b, v21.8h, #3 // p0'_2
- uaddw v26.8h, v26.8h, v6.8b
- uaddw2 v27.8h, v27.8h, v6.16b
- rshrn v21.8b, v26.8h, #2 // p1'_2
- rshrn2 v21.16b, v27.8h, #2 // p1'_2
- uaddl v28.8h, v4.8b, v5.8b
- uaddl2 v29.8h, v4.16b, v5.16b
- shl v28.8h, v28.8h, #1
- shl v29.8h, v29.8h, #1
- add v28.8h, v28.8h, v26.8h
- add v29.8h, v29.8h, v27.8h
- rshrn v19.8b, v28.8h, #3 // p2'_2
- rshrn2 v19.16b, v29.8h, #3 // p2'_2
-
- //calc q, v0, v1, v2, v3, v18, v0, v1, v2, v3
- uaddl v26.8h, v2.8b, v0.8b
- uaddl2 v27.8h, v2.16b, v0.16b
- uaddw v26.8h, v26.8h, v7.8b
- uaddw2 v27.8h, v27.8h, v7.16b
- add v22.8h, v22.8h, v26.8h
- add v23.8h, v23.8h, v27.8h
- uaddw v22.8h, v22.8h, v7.8b
- uaddw2 v23.8h, v23.8h, v7.16b
- rshrn v22.8b, v22.8h, #3 // q0'_2
- rshrn2 v22.16b, v23.8h, #3 // q0'_2
- uaddw v26.8h, v26.8h, v1.8b
- uaddw2 v27.8h, v27.8h, v1.16b
- rshrn v23.8b, v26.8h, #2 // q1'_2
- rshrn2 v23.16b, v27.8h, #2 // q1'_2
- uaddl v28.8h, v2.8b, v3.8b
- uaddl2 v29.8h, v2.16b, v3.16b
- shl v28.8h, v28.8h, #1
- shl v29.8h, v29.8h, #1
- add v28.8h, v28.8h, v26.8h
- add v29.8h, v29.8h, v27.8h
- rshrn v26.8b, v28.8h, #3 // q2'_2
- rshrn2 v26.16b, v29.8h, #3 // q2'_2
-
- bit v7.16b, v24.16b, v30.16b // p0'_1
- bit v0.16b, v25.16b, v31.16b // q0'_1
- bit v7.16b, v20.16b, v17.16b // p0'_2
- bit v6.16b, v21.16b, v17.16b // p1'_2
- bit v5.16b, v19.16b, v17.16b // p2'_2
- bit v0.16b, v22.16b, v18.16b // q0'_2
- bit v1.16b, v23.16b, v18.16b // q1'_2
- bit v2.16b, v26.16b, v18.16b // q2'_2
+ uabd v16.16b, v7.16b, v0.16b // abs(p0 - q0)
+ uabd v17.16b, v6.16b, v7.16b // abs(p1 - p0)
+ uabd v18.16b, v1.16b, v0.16b // abs(q1 - q0)
+ cmhi v19.16b, v30.16b, v16.16b // < alpha
+ cmhi v17.16b, v31.16b, v17.16b // < beta
+ cmhi v18.16b, v31.16b, v18.16b // < beta
+
+ movi v29.16b, #2
+ ushr v30.16b, v30.16b, #2 // alpha >> 2
+ add v30.16b, v30.16b, v29.16b // (alpha >> 2) + 2
+ cmhi v16.16b, v30.16b, v16.16b // < (alpha >> 2) + 2
+
+ and v19.16b, v19.16b, v17.16b
+ and v19.16b, v19.16b, v18.16b
+ shrn v20.8b, v19.8h, #4
+ mov x4, v20.d[0]
+ cbz x4, 9f
+
+ ushll v20.8h, v6.8b, #1
+ ushll v22.8h, v1.8b, #1
+ ushll2 v21.8h, v6.16b, #1
+ ushll2 v23.8h, v1.16b, #1
+ uaddw v20.8h, v20.8h, v7.8b
+ uaddw v22.8h, v22.8h, v0.8b
+ uaddw2 v21.8h, v21.8h, v7.16b
+ uaddw2 v23.8h, v23.8h, v0.16b
+ uaddw v20.8h, v20.8h, v1.8b
+ uaddw v22.8h, v22.8h, v6.8b
+ uaddw2 v21.8h, v21.8h, v1.16b
+ uaddw2 v23.8h, v23.8h, v6.16b
+
+ rshrn v24.8b, v20.8h, #2 // p0'_1
+ rshrn v25.8b, v22.8h, #2 // q0'_1
+ rshrn2 v24.16b, v21.8h, #2 // p0'_1
+ rshrn2 v25.16b, v23.8h, #2 // q0'_1
+
+ uabd v17.16b, v5.16b, v7.16b // abs(p2 - p0)
+ uabd v18.16b, v2.16b, v0.16b // abs(q2 - q0)
+ cmhi v17.16b, v31.16b, v17.16b // < beta
+ cmhi v18.16b, v31.16b, v18.16b // < beta
+
+ and v17.16b, v16.16b, v17.16b // if_2 && if_3
+ and v18.16b, v16.16b, v18.16b // if_2 && if_4
+
+ not v30.16b, v17.16b
+ not v31.16b, v18.16b
+
+ and v30.16b, v30.16b, v19.16b // if_1 && !(if_2 && if_3)
+ and v31.16b, v31.16b, v19.16b // if_1 && !(if_2 && if_4)
+
+ and v17.16b, v19.16b, v17.16b // if_1 && if_2 && if_3
+ and v18.16b, v19.16b, v18.16b // if_1 && if_2 && if_4
+
+ //calc p, v7, v6, v5, v4, v17, v7, v6, v5, v4
+ uaddl v26.8h, v5.8b, v7.8b
+ uaddl2 v27.8h, v5.16b, v7.16b
+ uaddw v26.8h, v26.8h, v0.8b
+ uaddw2 v27.8h, v27.8h, v0.16b
+ add v20.8h, v20.8h, v26.8h
+ add v21.8h, v21.8h, v27.8h
+ uaddw v20.8h, v20.8h, v0.8b
+ uaddw2 v21.8h, v21.8h, v0.16b
+ rshrn v20.8b, v20.8h, #3 // p0'_2
+ rshrn2 v20.16b, v21.8h, #3 // p0'_2
+ uaddw v26.8h, v26.8h, v6.8b
+ uaddw2 v27.8h, v27.8h, v6.16b
+ rshrn v21.8b, v26.8h, #2 // p1'_2
+ rshrn2 v21.16b, v27.8h, #2 // p1'_2
+ uaddl v28.8h, v4.8b, v5.8b
+ uaddl2 v29.8h, v4.16b, v5.16b
+ shl v28.8h, v28.8h, #1
+ shl v29.8h, v29.8h, #1
+ add v28.8h, v28.8h, v26.8h
+ add v29.8h, v29.8h, v27.8h
+ rshrn v19.8b, v28.8h, #3 // p2'_2
+ rshrn2 v19.16b, v29.8h, #3 // p2'_2
+
+ //calc q, v0, v1, v2, v3, v18, v0, v1, v2, v3
+ uaddl v26.8h, v2.8b, v0.8b
+ uaddl2 v27.8h, v2.16b, v0.16b
+ uaddw v26.8h, v26.8h, v7.8b
+ uaddw2 v27.8h, v27.8h, v7.16b
+ add v22.8h, v22.8h, v26.8h
+ add v23.8h, v23.8h, v27.8h
+ uaddw v22.8h, v22.8h, v7.8b
+ uaddw2 v23.8h, v23.8h, v7.16b
+ rshrn v22.8b, v22.8h, #3 // q0'_2
+ rshrn2 v22.16b, v23.8h, #3 // q0'_2
+ uaddw v26.8h, v26.8h, v1.8b
+ uaddw2 v27.8h, v27.8h, v1.16b
+ rshrn v23.8b, v26.8h, #2 // q1'_2
+ rshrn2 v23.16b, v27.8h, #2 // q1'_2
+ uaddl v28.8h, v2.8b, v3.8b
+ uaddl2 v29.8h, v2.16b, v3.16b
+ shl v28.8h, v28.8h, #1
+ shl v29.8h, v29.8h, #1
+ add v28.8h, v28.8h, v26.8h
+ add v29.8h, v29.8h, v27.8h
+ rshrn v26.8b, v28.8h, #3 // q2'_2
+ rshrn2 v26.16b, v29.8h, #3 // q2'_2
+
+ bit v7.16b, v24.16b, v30.16b // p0'_1
+ bit v0.16b, v25.16b, v31.16b // q0'_1
+ bit v7.16b, v20.16b, v17.16b // p0'_2
+ bit v6.16b, v21.16b, v17.16b // p1'_2
+ bit v5.16b, v19.16b, v17.16b // p2'_2
+ bit v0.16b, v22.16b, v18.16b // q0'_2
+ bit v1.16b, v23.16b, v18.16b // q1'_2
+ bit v2.16b, v26.16b, v18.16b // q2'_2
.endm
function ff_h264_v_loop_filter_luma_intra_neon, export=1
- h264_loop_filter_start_intra
-
- ld1 {v0.16b}, [x0], x1 // q0
- ld1 {v1.16b}, [x0], x1 // q1
- ld1 {v2.16b}, [x0], x1 // q2
- ld1 {v3.16b}, [x0], x1 // q3
- sub x0, x0, x1, lsl #3
- ld1 {v4.16b}, [x0], x1 // p3
- ld1 {v5.16b}, [x0], x1 // p2
- ld1 {v6.16b}, [x0], x1 // p1
- ld1 {v7.16b}, [x0] // p0
-
- h264_loop_filter_luma_intra
-
- sub x0, x0, x1, lsl #1
- st1 {v5.16b}, [x0], x1 // p2
- st1 {v6.16b}, [x0], x1 // p1
- st1 {v7.16b}, [x0], x1 // p0
- st1 {v0.16b}, [x0], x1 // q0
- st1 {v1.16b}, [x0], x1 // q1
- st1 {v2.16b}, [x0] // q2
+ h264_loop_filter_start_intra
+
+ ld1 {v0.16b}, [x0], x1 // q0
+ ld1 {v1.16b}, [x0], x1 // q1
+ ld1 {v2.16b}, [x0], x1 // q2
+ ld1 {v3.16b}, [x0], x1 // q3
+ sub x0, x0, x1, lsl #3
+ ld1 {v4.16b}, [x0], x1 // p3
+ ld1 {v5.16b}, [x0], x1 // p2
+ ld1 {v6.16b}, [x0], x1 // p1
+ ld1 {v7.16b}, [x0] // p0
+
+ h264_loop_filter_luma_intra
+
+ sub x0, x0, x1, lsl #1
+ st1 {v5.16b}, [x0], x1 // p2
+ st1 {v6.16b}, [x0], x1 // p1
+ st1 {v7.16b}, [x0], x1 // p0
+ st1 {v0.16b}, [x0], x1 // q0
+ st1 {v1.16b}, [x0], x1 // q1
+ st1 {v2.16b}, [x0] // q2
9:
- ret
+ ret
endfunc
function ff_h264_h_loop_filter_luma_intra_neon, export=1
- h264_loop_filter_start_intra
-
- sub x0, x0, #4
- ld1 {v4.8b}, [x0], x1
- ld1 {v5.8b}, [x0], x1
- ld1 {v6.8b}, [x0], x1
- ld1 {v7.8b}, [x0], x1
- ld1 {v0.8b}, [x0], x1
- ld1 {v1.8b}, [x0], x1
- ld1 {v2.8b}, [x0], x1
- ld1 {v3.8b}, [x0], x1
- ld1 {v4.d}[1], [x0], x1
- ld1 {v5.d}[1], [x0], x1
- ld1 {v6.d}[1], [x0], x1
- ld1 {v7.d}[1], [x0], x1
- ld1 {v0.d}[1], [x0], x1
- ld1 {v1.d}[1], [x0], x1
- ld1 {v2.d}[1], [x0], x1
- ld1 {v3.d}[1], [x0], x1
-
- transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
-
- h264_loop_filter_luma_intra
-
- transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
-
- sub x0, x0, x1, lsl #4
- st1 {v4.8b}, [x0], x1
- st1 {v5.8b}, [x0], x1
- st1 {v6.8b}, [x0], x1
- st1 {v7.8b}, [x0], x1
- st1 {v0.8b}, [x0], x1
- st1 {v1.8b}, [x0], x1
- st1 {v2.8b}, [x0], x1
- st1 {v3.8b}, [x0], x1
- st1 {v4.d}[1], [x0], x1
- st1 {v5.d}[1], [x0], x1
- st1 {v6.d}[1], [x0], x1
- st1 {v7.d}[1], [x0], x1
- st1 {v0.d}[1], [x0], x1
- st1 {v1.d}[1], [x0], x1
- st1 {v2.d}[1], [x0], x1
- st1 {v3.d}[1], [x0], x1
+ h264_loop_filter_start_intra
+
+ sub x0, x0, #4
+ ld1 {v4.8b}, [x0], x1
+ ld1 {v5.8b}, [x0], x1
+ ld1 {v6.8b}, [x0], x1
+ ld1 {v7.8b}, [x0], x1
+ ld1 {v0.8b}, [x0], x1
+ ld1 {v1.8b}, [x0], x1
+ ld1 {v2.8b}, [x0], x1
+ ld1 {v3.8b}, [x0], x1
+ ld1 {v4.d}[1], [x0], x1
+ ld1 {v5.d}[1], [x0], x1
+ ld1 {v6.d}[1], [x0], x1
+ ld1 {v7.d}[1], [x0], x1
+ ld1 {v0.d}[1], [x0], x1
+ ld1 {v1.d}[1], [x0], x1
+ ld1 {v2.d}[1], [x0], x1
+ ld1 {v3.d}[1], [x0], x1
+
+ transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
+
+ h264_loop_filter_luma_intra
+
+ transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
+
+ sub x0, x0, x1, lsl #4
+ st1 {v4.8b}, [x0], x1
+ st1 {v5.8b}, [x0], x1
+ st1 {v6.8b}, [x0], x1
+ st1 {v7.8b}, [x0], x1
+ st1 {v0.8b}, [x0], x1
+ st1 {v1.8b}, [x0], x1
+ st1 {v2.8b}, [x0], x1
+ st1 {v3.8b}, [x0], x1
+ st1 {v4.d}[1], [x0], x1
+ st1 {v5.d}[1], [x0], x1
+ st1 {v6.d}[1], [x0], x1
+ st1 {v7.d}[1], [x0], x1
+ st1 {v0.d}[1], [x0], x1
+ st1 {v1.d}[1], [x0], x1
+ st1 {v2.d}[1], [x0], x1
+ st1 {v3.d}[1], [x0], x1
9:
- ret
+ ret
endfunc
.macro h264_loop_filter_chroma
@@ -474,113 +474,113 @@ function ff_h264_h_loop_filter_chroma422_neon, export=1
endfunc
.macro h264_loop_filter_chroma_intra
- uabd v26.8b, v16.8b, v17.8b // abs(p0 - q0)
- uabd v27.8b, v18.8b, v16.8b // abs(p1 - p0)
- uabd v28.8b, v19.8b, v17.8b // abs(q1 - q0)
- cmhi v26.8b, v30.8b, v26.8b // < alpha
- cmhi v27.8b, v31.8b, v27.8b // < beta
- cmhi v28.8b, v31.8b, v28.8b // < beta
- and v26.8b, v26.8b, v27.8b
- and v26.8b, v26.8b, v28.8b
- mov x2, v26.d[0]
-
- ushll v4.8h, v18.8b, #1
- ushll v6.8h, v19.8b, #1
- cbz x2, 9f
- uaddl v20.8h, v16.8b, v19.8b
- uaddl v22.8h, v17.8b, v18.8b
- add v20.8h, v20.8h, v4.8h
- add v22.8h, v22.8h, v6.8h
- uqrshrn v24.8b, v20.8h, #2
- uqrshrn v25.8b, v22.8h, #2
- bit v16.8b, v24.8b, v26.8b
- bit v17.8b, v25.8b, v26.8b
+ uabd v26.8b, v16.8b, v17.8b // abs(p0 - q0)
+ uabd v27.8b, v18.8b, v16.8b // abs(p1 - p0)
+ uabd v28.8b, v19.8b, v17.8b // abs(q1 - q0)
+ cmhi v26.8b, v30.8b, v26.8b // < alpha
+ cmhi v27.8b, v31.8b, v27.8b // < beta
+ cmhi v28.8b, v31.8b, v28.8b // < beta
+ and v26.8b, v26.8b, v27.8b
+ and v26.8b, v26.8b, v28.8b
+ mov x2, v26.d[0]
+
+ ushll v4.8h, v18.8b, #1
+ ushll v6.8h, v19.8b, #1
+ cbz x2, 9f
+ uaddl v20.8h, v16.8b, v19.8b
+ uaddl v22.8h, v17.8b, v18.8b
+ add v20.8h, v20.8h, v4.8h
+ add v22.8h, v22.8h, v6.8h
+ uqrshrn v24.8b, v20.8h, #2
+ uqrshrn v25.8b, v22.8h, #2
+ bit v16.8b, v24.8b, v26.8b
+ bit v17.8b, v25.8b, v26.8b
.endm
function ff_h264_v_loop_filter_chroma_intra_neon, export=1
- h264_loop_filter_start_intra
+ h264_loop_filter_start_intra
- sub x0, x0, x1, lsl #1
- ld1 {v18.8b}, [x0], x1
- ld1 {v16.8b}, [x0], x1
- ld1 {v17.8b}, [x0], x1
- ld1 {v19.8b}, [x0]
+ sub x0, x0, x1, lsl #1
+ ld1 {v18.8b}, [x0], x1
+ ld1 {v16.8b}, [x0], x1
+ ld1 {v17.8b}, [x0], x1
+ ld1 {v19.8b}, [x0]
- h264_loop_filter_chroma_intra
+ h264_loop_filter_chroma_intra
- sub x0, x0, x1, lsl #1
- st1 {v16.8b}, [x0], x1
- st1 {v17.8b}, [x0], x1
+ sub x0, x0, x1, lsl #1
+ st1 {v16.8b}, [x0], x1
+ st1 {v17.8b}, [x0], x1
9:
- ret
+ ret
endfunc
function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1
- h264_loop_filter_start_intra
+ h264_loop_filter_start_intra
- sub x4, x0, #2
- sub x0, x0, #1
- ld1 {v18.8b}, [x4], x1
- ld1 {v16.8b}, [x4], x1
- ld1 {v17.8b}, [x4], x1
- ld1 {v19.8b}, [x4], x1
+ sub x4, x0, #2
+ sub x0, x0, #1
+ ld1 {v18.8b}, [x4], x1
+ ld1 {v16.8b}, [x4], x1
+ ld1 {v17.8b}, [x4], x1
+ ld1 {v19.8b}, [x4], x1
- transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29
+ transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29
- h264_loop_filter_chroma_intra
+ h264_loop_filter_chroma_intra
- st2 {v16.b,v17.b}[0], [x0], x1
- st2 {v16.b,v17.b}[1], [x0], x1
- st2 {v16.b,v17.b}[2], [x0], x1
- st2 {v16.b,v17.b}[3], [x0], x1
+ st2 {v16.b,v17.b}[0], [x0], x1
+ st2 {v16.b,v17.b}[1], [x0], x1
+ st2 {v16.b,v17.b}[2], [x0], x1
+ st2 {v16.b,v17.b}[3], [x0], x1
9:
- ret
+ ret
endfunc
function ff_h264_h_loop_filter_chroma_intra_neon, export=1
- h264_loop_filter_start_intra
+ h264_loop_filter_start_intra
- sub x4, x0, #2
- sub x0, x0, #1
+ sub x4, x0, #2
+ sub x0, x0, #1
h_loop_filter_chroma420_intra:
- ld1 {v18.8b}, [x4], x1
- ld1 {v16.8b}, [x4], x1
- ld1 {v17.8b}, [x4], x1
- ld1 {v19.8b}, [x4], x1
- ld1 {v18.s}[1], [x4], x1
- ld1 {v16.s}[1], [x4], x1
- ld1 {v17.s}[1], [x4], x1
- ld1 {v19.s}[1], [x4], x1
-
- transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29
-
- h264_loop_filter_chroma_intra
-
- st2 {v16.b,v17.b}[0], [x0], x1
- st2 {v16.b,v17.b}[1], [x0], x1
- st2 {v16.b,v17.b}[2], [x0], x1
- st2 {v16.b,v17.b}[3], [x0], x1
- st2 {v16.b,v17.b}[4], [x0], x1
- st2 {v16.b,v17.b}[5], [x0], x1
- st2 {v16.b,v17.b}[6], [x0], x1
- st2 {v16.b,v17.b}[7], [x0], x1
+ ld1 {v18.8b}, [x4], x1
+ ld1 {v16.8b}, [x4], x1
+ ld1 {v17.8b}, [x4], x1
+ ld1 {v19.8b}, [x4], x1
+ ld1 {v18.s}[1], [x4], x1
+ ld1 {v16.s}[1], [x4], x1
+ ld1 {v17.s}[1], [x4], x1
+ ld1 {v19.s}[1], [x4], x1
+
+ transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29
+
+ h264_loop_filter_chroma_intra
+
+ st2 {v16.b,v17.b}[0], [x0], x1
+ st2 {v16.b,v17.b}[1], [x0], x1
+ st2 {v16.b,v17.b}[2], [x0], x1
+ st2 {v16.b,v17.b}[3], [x0], x1
+ st2 {v16.b,v17.b}[4], [x0], x1
+ st2 {v16.b,v17.b}[5], [x0], x1
+ st2 {v16.b,v17.b}[6], [x0], x1
+ st2 {v16.b,v17.b}[7], [x0], x1
9:
- ret
+ ret
endfunc
function ff_h264_h_loop_filter_chroma422_intra_neon, export=1
- h264_loop_filter_start_intra
- sub x4, x0, #2
- add x5, x0, x1, lsl #3
- sub x0, x0, #1
- mov x7, x30
- bl h_loop_filter_chroma420_intra
- sub x0, x5, #1
- mov x30, x7
- b h_loop_filter_chroma420_intra
+ h264_loop_filter_start_intra
+ sub x4, x0, #2
+ add x5, x0, x1, lsl #3
+ sub x0, x0, #1
+ mov x7, x30
+ bl h_loop_filter_chroma420_intra
+ sub x0, x5, #1
+ mov x30, x7
+ b h_loop_filter_chroma420_intra
endfunc
.macro biweight_16 macs, macd