From 43dab86bcd863739ce51a2742c9f4f5527b5ec7c Mon Sep 17 00:00:00 2001 From: Ivan Kalvachev Date: Sat, 19 Aug 2017 14:29:40 +0300 Subject: opus_pvq_search: Restore the proper use of conditional define and simplify the function name suffix handling. Using a named define properly documents the code paths. It also avoids passing additional numbered arguments through multiple levels of macro templates. The suffix handling is done by concatenation, like in other asm functions, and avoids having two separate "cglobal" defines. Signed-off-by: Ivan Kalvachev --- libavcodec/x86/opus_pvq_search.asm | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/libavcodec/x86/opus_pvq_search.asm b/libavcodec/x86/opus_pvq_search.asm index 8cf040465d..5c1e6d6174 100644 --- a/libavcodec/x86/opus_pvq_search.asm +++ b/libavcodec/x86/opus_pvq_search.asm @@ -82,7 +82,7 @@ SECTION .text %endif %endmacro -%macro PULSES_SEARCH 2 ; %1 - add or sub, %2 - use approximation +%macro PULSES_SEARCH 1 ; m6 Syy_norm ; m7 Sxy_norm addps m6, mm_const_float_0_5 ; Syy_norm += 1.0/2 @@ -96,17 +96,17 @@ align 16 movaps m4, [tmpY + r4] ; y[i] movaps m5, [tmpX + r4] ; X[i] -%if %2 + %if USE_APPROXIMATION == 1 xorps m0, m0 cmpps m0, m0, m5, 4 ; m0 = (X[i] != 0.0) -%endif + %endif addps m4, m6 ; m4 = Syy_new = y[i] + Syy_norm addps m5, m7 ; m5 = Sxy_new = X[i] + Sxy_norm -%if %2 + %if USE_APPROXIMATION == 1 andps m5, m0 ; if(X[i] == 0) Sxy_new = 0; Prevent aproximation error from setting pulses in array padding. 
-%endif + %endif %else movaps m5, [tmpY + r4] ; m5 = y[i] @@ -119,7 +119,7 @@ align 16 andps m5, m0 ; (0 0 %%add_pulses_loop: - PULSES_SEARCH add, %1 ; m6 Syy_norm ; m7 Sxy_norm + PULSES_SEARCH add ; m6 Syy_norm ; m7 Sxy_norm sub Kd, 1 jnz %%add_pulses_loop @@ -325,7 +320,7 @@ align 16 ; K - pulses > 0 align 16 %%remove_pulses_loop: - PULSES_SEARCH sub, %1 ; m6 Syy_norm ; m7 Sxy_norm + PULSES_SEARCH sub ; m6 Syy_norm ; m7 Sxy_norm add Kd, 1 jnz %%remove_pulses_loop @@ -376,11 +371,15 @@ align 16 ; On Skylake & Ryzen the division is much faster (around 11c/3), ; that makes the full precision code about 2% slower. ; Opus also does use rsqrt approximation in their intrinsics code. +%define USE_APPROXIMATION 1 + INIT_XMM sse2 -PVQ_FAST_SEARCH 1 +PVQ_FAST_SEARCH _approx INIT_XMM sse4 -PVQ_FAST_SEARCH 1 +PVQ_FAST_SEARCH _approx + +%define USE_APPROXIMATION 0 INIT_XMM avx -PVQ_FAST_SEARCH 0 +PVQ_FAST_SEARCH _exact -- cgit v1.2.3