diff options
author | Rémi Denis-Courmont <remi@remlab.net> | 2023-11-10 18:21:27 +0200 |
---|---|---|
committer | Rémi Denis-Courmont <remi@remlab.net> | 2023-11-13 18:34:29 +0200 |
commit | c536e9220702dec7fbccd6a03f043cc142d68c79 (patch) | |
tree | 48c5e5e379e8d1bc9f736b2f7d27cf5fa08a6978 | |
parent | 20e6195c54106203e79cb0aa148561b4d469b115 (diff) |
lavc/sbrdsp: R-V V hf_apply_noise functions
This is restricted to 128-bit vectors as larger vector sizes could read
past the end of the noise array. Support for future hardware with larger
vector sizes is left for some other time.
hf_apply_noise_0_c: 2319.7
hf_apply_noise_0_rvv_f32: 1229.0
hf_apply_noise_1_c: 2539.0
hf_apply_noise_1_rvv_f32: 1244.7
hf_apply_noise_2_c: 2319.7
hf_apply_noise_2_rvv_f32: 1232.7
hf_apply_noise_3_c: 2541.2
hf_apply_noise_3_rvv_f32: 1244.2
-rw-r--r-- | libavcodec/riscv/sbrdsp_init.c | 17 | ||||
-rw-r--r-- | libavcodec/riscv/sbrdsp_rvv.S | 67 |
2 files changed, 84 insertions, 0 deletions
diff --git a/libavcodec/riscv/sbrdsp_init.c b/libavcodec/riscv/sbrdsp_init.c index e5736452ec..2ed46153ea 100644 --- a/libavcodec/riscv/sbrdsp_init.c +++ b/libavcodec/riscv/sbrdsp_init.c @@ -21,6 +21,7 @@ #include "config.h" #include "libavutil/attributes.h" #include "libavutil/cpu.h" +#include "libavutil/riscv/cpu.h" #include "libavcodec/sbrdsp.h" void ff_sbr_sum64x5_rvv(float *z); @@ -32,6 +33,14 @@ void ff_sbr_hf_gen_rvv(float (*X_high)[2], const float (*X_low)[2], float bw, int start, int end); void ff_sbr_hf_g_filt_rvv(float (*Y)[2], const float (*X_high)[40][2], const float *g_filt, int m_max, intptr_t ixh); +void ff_sbr_hf_apply_noise_0_rvv(float (*Y)[2], const float *s, + const float *f, int n, int kx, int max); +void ff_sbr_hf_apply_noise_1_rvv(float (*Y)[2], const float *s, + const float *f, int n, int kx, int max); +void ff_sbr_hf_apply_noise_2_rvv(float (*Y)[2], const float *s, + const float *f, int n, int kx, int max); +void ff_sbr_hf_apply_noise_3_rvv(float (*Y)[2], const float *s, + const float *f, int n, int kx, int max); av_cold void ff_sbrdsp_init_riscv(SBRDSPContext *c) { @@ -44,6 +53,14 @@ av_cold void ff_sbrdsp_init_riscv(SBRDSPContext *c) c->sum_square = ff_sbr_sum_square_rvv; c->hf_gen = ff_sbr_hf_gen_rvv; c->hf_g_filt = ff_sbr_hf_g_filt_rvv; + if (ff_get_rv_vlenb() <= 16) { + c->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_rvv; + c->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_rvv; + if (flags & AV_CPU_FLAG_RVB_BASIC) { + c->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_rvv; + c->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_rvv; + } + } } c->autocorrelate = ff_sbr_autocorrelate_rvv; } diff --git a/libavcodec/riscv/sbrdsp_rvv.S b/libavcodec/riscv/sbrdsp_rvv.S index 43fab1f65f..02feb6451e 100644 --- a/libavcodec/riscv/sbrdsp_rvv.S +++ b/libavcodec/riscv/sbrdsp_rvv.S @@ -243,3 +243,70 @@ func ff_sbr_hf_g_filt_rvv, zve32f ret endfunc + +.macro hf_apply_noise n + lla a6, ff_sbr_noise_table + fmv.s.x ft0, zero + addi a6, a6, 8 +1: +.if \n & 1 + min t0, 
t0, a5 // preserve parity of t0 for v4 sign injector + vsetvli zero, t0, e32, m4, ta, mu +.else + vsetvli t0, a5, e32, m4, ta, mu +.endif + sh3add t6, a3, a6 + vle32.v v8, (a1) // s_m + sub a5, a5, t0 + vle32.v v12, (a2) // q_filt + sh2add a1, t0, a1 + vmfeq.vf v0, v8, ft0 // s_m == 0.f + vlseg2e32.v v24, (t6) // ff_sbr_noise_table + sh2add a2, t0, a2 +.if \n == 2 + vfneg.v v8, v8 +.endif +.if \n & 1 + vfsgnjx.vv v8, v8, v4 // could equivalently use vxor.vv +.endif + add a3, t0, a3 + vlseg2e32.v v16, (a0) // Y + andi a3, a3, 0x1ff +.if \n & 1 + vfmul.vv v28, v12, v28 + vfmacc.vv v16, v12, v24, v0.t + vmerge.vvm v28, v8, v28, v0 + vfadd.vv v20, v20, v28 +.else + vfmul.vv v24, v12, v24 + vfmacc.vv v20, v12, v28, v0.t + vmerge.vvm v24, v8, v24, v0 + vfadd.vv v16, v16, v24 +.endif + vsseg2e32.v v16, (a0) + sh3add a0, t0, a0 + bnez a5, 1b + + ret +.endm + +func ff_sbr_hf_apply_noise_0_rvv, zve32f + hf_apply_noise 0 +endfunc + +func ff_sbr_hf_apply_noise_3_rvv, zve32f + not a4, a4 // invert parity of kx + // fall through +endfunc + +func ff_sbr_hf_apply_noise_1_rvv, zve32f + vsetvli t0, zero, e32, m4, ta, ma + vid.v v4 + vxor.vx v4, v4, a4 + vsll.vi v4, v4, 31 // v4[i] = ((i ^ a4) & 1) ? -0.f : +0.f (sign alternates per element) + hf_apply_noise 1 +endfunc + +func ff_sbr_hf_apply_noise_2_rvv, zve32f + hf_apply_noise 2 +endfunc |