sws/rgb2rgb: rework RISC-V V YUY2 to 4:2:2 planar

This saves three scratch registers and three instructions per line. The performance gains are mostly negligible; the main point is to free up registers for further rework.
author: Rémi Denis-Courmont <remi@remlab.net> 2023-11-09 19:54:39 +0200
committer: Rémi Denis-Courmont <remi@remlab.net> 2023-11-13 18:34:29 +0200
commit: 5b8b5ec9c55da6afef85195d12d1ac92647986f5 (patch)
tree: 2e5d0bd6c2c1f324e9248b4cf3851d779dca5a21 /libswscale
parent: 5b33104fca4057edb21598264ee17e087f10d816 (diff)
1 files changed, 12 insertions, 13 deletions
diff --git a/libswscale/riscv/rgb2rgb_rvv.S b/libswscale/riscv/rgb2rgb_rvv.S
index 671089c842..172f5918dc 100644
--- a/libswscale/riscv/rgb2rgb_rvv.S
+++ b/libswscale/riscv/rgb2rgb_rvv.S
@@ -127,31 +127,30 @@ func ff_deinterleave_bytes_rvv, zve32x
 endfunc
 
 .macro yuy2_to_i422p y_shift
-        addi    a4, a4, 1
+        slli    t4, a4, 1 // pixel width -> (source) byte width
         lw      t6, (sp)
+        sub     a6, a6, a4
         srai    a4, a4, 1 // pixel width -> chroma width
+        sub     a7, a7, a4
+        sub     t6, t6, t4
 1:
         mv      t4, a4
-        mv      t3, a3
-        mv      t0, a0
-        mv      t1, a1
-        mv      t2, a2
         addi    a5, a5, -1
 2:
         vsetvli    t5, t4, e8, m2, ta, ma
-        vlseg2e16.v v16, (t3)
+        vlseg2e16.v v16, (a3)
         sub        t4, t4, t5
         vnsrl.wi   v24, v16, \y_shift // Y0
-        sh2add     t3, t5, t3
+        sh2add     a3, t5, a3
         vnsrl.wi   v26, v20, \y_shift // Y1
         vnsrl.wi   v28, v16, 8 - \y_shift // U
         vnsrl.wi   v30, v20, 8 - \y_shift // V
-        vsseg2e8.v v24, (t0)
-        sh1add     t0, t5, t0
-        vse8.v     v28, (t1)
-        add        t1, t5, t1
-        vse8.v     v30, (t2)
-        add        t2, t5, t2
+        vsseg2e8.v v24, (a0)
+        sh1add     a0, t5, a0
+        vse8.v     v28, (a1)
+        add        a1, t5, a1
+        vse8.v     v30, (a2)
+        add        a2, t5, a2
         bnez       t4, 2b
 
         add     a3, a3, t6
author	Rémi Denis-Courmont <remi@remlab.net>	2023-11-09 19:54:39 +0200
committer	Rémi Denis-Courmont <remi@remlab.net>	2023-11-13 18:34:29 +0200
commit	5b8b5ec9c55da6afef85195d12d1ac92647986f5 (patch)
tree	2e5d0bd6c2c1f324e9248b4cf3851d779dca5a21 /libswscale
parent	5b33104fca4057edb21598264ee17e087f10d816 (diff)