author     Rémi Denis-Courmont <remi@remlab.net>  2022-09-26 17:52:51 +0300
committer  Lynne <dev@lynne.ee>                   2022-09-27 13:19:52 +0200
commit     c03f9654c997b33b8028eb71c9e7ba61fd53a813 (patch)
tree       6ed69aca799f255a2be544572a6d788fd088c19a /libavcodec/riscv
parent     a15edb0bc0108362fa3c71de3bf763072341b8b0 (diff)
lavc/aacpsdsp: RISC-V V stereo_interpolate[0]
Diffstat (limited to 'libavcodec/riscv')
-rw-r--r--  libavcodec/riscv/aacpsdsp_init.c   4
-rw-r--r--  libavcodec/riscv/aacpsdsp_rvv.S   56
2 files changed, 60 insertions, 0 deletions
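
For context: stereo_interpolate[0] applies a per-sample interpolated 2x2 mixing
matrix to complex (interleaved re/im) left/right buffers, l' = h0*l + h2*r and
r' = h1*l + h3*r, with each coefficient advancing by its h_step every sample.
A minimal scalar sketch of the operation, modeled on the float expansion of
ps_stereo_interpolate_c() in libavcodec/aacpsdsp_template.c (simplified here,
so treat details as illustrative):

/* Scalar model of stereo_interpolate[0]: h[0][k] are the current
 * mixing coefficients, h_step[0][k] their per-sample increments. */
static void ps_stereo_interpolate_ref(float (*l)[2], float (*r)[2],
                                      float h[2][4], float h_step[2][4],
                                      int len)
{
    float h0 = h[0][0], h1 = h[0][1], h2 = h[0][2], h3 = h[0][3];

    for (int n = 0; n < len; n++) {
        float l_re = l[n][0], l_im = l[n][1];
        float r_re = r[n][0], r_im = r[n][1];

        h0 += h_step[0][0];                /* step before use */
        h1 += h_step[0][1];
        h2 += h_step[0][2];
        h3 += h_step[0][3];
        l[n][0] = h0 * l_re + h2 * r_re;   /* l' = h0*l + h2*r */
        l[n][1] = h0 * l_im + h2 * r_im;
        r[n][0] = h1 * l_re + h3 * r_re;   /* r' = h1*l + h3*r */
        r[n][1] = h1 * l_im + h3 * r_im;
    }
}

Note that the coefficients are stepped before use; the vector version in this
commit reproduces that by priming each lane with h + (i + 1) * h_step.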
diff --git a/libavcodec/riscv/aacpsdsp_init.c b/libavcodec/riscv/aacpsdsp_init.c
index c2201ffb6a..f42baf4251 100644
--- a/libavcodec/riscv/aacpsdsp_init.c
+++ b/libavcodec/riscv/aacpsdsp_init.c
@@ -34,6 +34,9 @@ void ff_ps_hybrid_analysis_ileave_rvv(float (*out)[32][2], float L[2][38][64],
void ff_ps_hybrid_synthesis_deint_rvv(float out[2][38][64], float (*in)[32][2],
int i, int len);
+void ff_ps_stereo_interpolate_rvv(float (*l)[2], float (*r)[2],
+ float h[2][4], float h_step[2][4], int len);
+
av_cold void ff_psdsp_init_riscv(PSDSPContext *c)
{
#if HAVE_RVV
@@ -43,6 +46,7 @@ av_cold void ff_psdsp_init_riscv(PSDSPContext *c)
c->add_squares = ff_ps_add_squares_rvv;
c->mul_pair_single = ff_ps_mul_pair_single_rvv;
c->hybrid_analysis = ff_ps_hybrid_analysis_rvv;
+ c->stereo_interpolate[0] = ff_ps_stereo_interpolate_rvv;
}
if (flags & AV_CPU_FLAG_RVV_I32) {
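
The init hunk follows lavc's usual runtime-dispatch pattern: the RVV routine
only replaces the C function pointer when av_get_cpu_flags() reports the
required vector extension. A hedged sketch of the surrounding function,
assuming the block above is guarded by AV_CPU_FLAG_RVV_F32 (consistent with
the zve32f requirement declared by the assembly below):

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavcodec/aacpsdsp.h"

av_cold void ff_psdsp_init_riscv(PSDSPContext *c)
{
#if HAVE_RVV
    int flags = av_get_cpu_flags();

    if (flags & AV_CPU_FLAG_RVV_F32) { /* Zve32f: 32-bit FP vectors */
        /* ...existing hooks... */
        c->stereo_interpolate[0] = ff_ps_stereo_interpolate_rvv;
    }
#endif
}

stereo_interpolate[1] (the ipdopd variant) is left on its existing code path
by this commit.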
diff --git a/libavcodec/riscv/aacpsdsp_rvv.S b/libavcodec/riscv/aacpsdsp_rvv.S
index 0cbe4c1d3c..1d6e73fd2d 100644
--- a/libavcodec/riscv/aacpsdsp_rvv.S
+++ b/libavcodec/riscv/aacpsdsp_rvv.S
@@ -219,3 +219,59 @@ func ff_ps_hybrid_synthesis_deint_rvv, zve32x
3:
ret
endfunc
+
+func ff_ps_stereo_interpolate_rvv, zve32f
+ vsetvli t0, zero, e32, m1, ta, ma
+ vid.v v24
+ flw ft0, (a2)
+ vadd.vi v24, v24, 1 // v24[i] = i + 1
+ flw ft1, 4(a2)
+ vfcvt.f.xu.v v24, v24
+ flw ft2, 8(a2)
+ vfmv.v.f v16, ft0
+ flw ft3, 12(a2)
+ vfmv.v.f v17, ft1
+ flw ft0, (a3)
+ vfmv.v.f v18, ft2
+ flw ft1, 4(a3)
+ vfmv.v.f v19, ft3
+ flw ft2, 8(a3)
+ vfmv.v.f v20, ft0
+ flw ft3, 12(a3)
+ vfmv.v.f v21, ft1
+ fcvt.s.wu ft4, t0 // (float)(vlenb / sizeof (float))
+ vfmv.v.f v22, ft2
+ fmul.s ft0, ft0, ft4
+ vfmv.v.f v23, ft3
+ fmul.s ft1, ft1, ft4
+ vfmacc.vv v16, v24, v20 // h0 += (i + 1) * h0_step
+ fmul.s ft2, ft2, ft4
+ vfmacc.vv v17, v24, v21
+ fmul.s ft3, ft3, ft4
+ vfmacc.vv v18, v24, v22
+ vfmacc.vv v19, v24, v23
+1:
+ vsetvli t0, a4, e32, m1, ta, ma
+ vlseg2e32.v v8, (a0) // v8:l_re, v9:l_im
+ sub a4, a4, t0
+ vlseg2e32.v v10, (a1) // v10:r_re, v11:r_im
+ vfmul.vv v12, v8, v16
+ vfmul.vv v13, v9, v16
+ vfmul.vv v14, v8, v17
+ vfmul.vv v15, v9, v17
+ vfmacc.vv v12, v10, v18
+ vfmacc.vv v13, v11, v18
+ vfmacc.vv v14, v10, v19
+ vfmacc.vv v15, v11, v19
+ vsseg2e32.v v12, (a0)
+ sh3add a0, t0, a0
+ vsseg2e32.v v14, (a1)
+ sh3add a1, t0, a1
+ vfadd.vf v16, v16, ft0 // h0 += (vlenb / sizeof (float)) * h0_step
+ vfadd.vf v17, v17, ft1
+ vfadd.vf v18, v18, ft2
+ vfadd.vf v19, v19, ft3
+ bnez a4, 1b
+
+ ret
+endfunc
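
To make the strip-mining explicit, here is a C model of the loop above
(hypothetical helper, for illustration only): lane i of each coefficient
vector is primed with h + (i + 1) * h_step via vid.v/vfcvt.f.xu.v/vfmacc.vv,
and after each strip every lane advances by VLMAX * h_step via vfadd.vf,
where VLMAX is the lane count returned by the initial vsetvli (fcvt.s.wu
converts it to float for the scalar fmul.s pre-scaling of the steps).

static void stereo_interpolate_model(float (*l)[2], float (*r)[2],
                                     const float h[2][4],
                                     const float h_step[2][4],
                                     int len, int vlmax)
{
    float c[4][vlmax];                        /* v16..v19, one row per coef */

    for (int k = 0; k < 4; k++)               /* vid.v + vfcvt + vfmacc.vv */
        for (int i = 0; i < vlmax; i++)
            c[k][i] = h[0][k] + (i + 1) * h_step[0][k];

    while (len > 0) {
        int vl = len < vlmax ? len : vlmax;   /* vsetvli t0, a4, ... */

        for (int i = 0; i < vl; i++) {        /* one vector strip */
            float l_re = l[i][0], l_im = l[i][1];      /* vlseg2e32.v v8  */
            float r_re = r[i][0], r_im = r[i][1];      /* vlseg2e32.v v10 */
            l[i][0] = c[0][i] * l_re + c[2][i] * r_re; /* vfmul + vfmacc  */
            l[i][1] = c[0][i] * l_im + c[2][i] * r_im;
            r[i][0] = c[1][i] * l_re + c[3][i] * r_re;
            r[i][1] = c[1][i] * l_im + c[3][i] * r_im;
        }
        for (int k = 0; k < 4; k++)           /* vfadd.vf v16..v19 */
            for (int i = 0; i < vlmax; i++)
                c[k][i] += vlmax * h_step[0][k];
        l += vl;                              /* sh3add a0, t0, a0 */
        r += vl;                              /* sh3add a1, t0, a1 */
        len -= vl;
    }
}

At element n = strip * VLMAX + i this yields h + (n + 1) * h_step, exactly the
scalar recurrence; the oversized coefficient bump on the final, shorter strip
is harmless because the loop exits immediately afterwards.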