From 86ad823b9ade211bfa9361b61571933aff1c9d24 Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Tue, 23 Apr 2019 15:36:34 +0200 Subject: egs: merge residual calc and correct when possible Also, merge the reflect boundary condition into residual calc+add. Improves performance due to better locality. --- residual_calc.asm | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'residual_calc.asm') diff --git a/residual_calc.asm b/residual_calc.asm index 3a5b800..5eea31c 100644 --- a/residual_calc.asm +++ b/residual_calc.asm @@ -39,6 +39,8 @@ SECTION .text ; mm register allocation (both s1 and s2) ; m0: accumulator for the residual +; m1: res mult factor (scales the residual in m0 before the store) +; m2: u mult factor (add variant only; the product u[x] * m2 is kept in m3) ; m6-m11: working registers ; m12: max(fabs(residual)) ; m13: mask for computing absolute values @@ -140,9 +142,14 @@ SECTION .text %endmacro ; %1: stencil -%macro RESIDUAL_CALC 1 +; %2: 0 - calc; 1 - add +%macro RESIDUAL_CALC 2 %define stencil %1 +%if %2 + vpermq m2, m1, 0 +%endif + vpermq m1, m0, 0 ; compute the mask for absolute value pcmpeqq m13, m13 @@ -195,6 +202,9 @@ SECTION .text ; plain value movu m6, [uq + offsetq] ; m6 = u[x] vfmadd231pd m0, m6, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00 +%if %2 + mulpd m3, m6, m2 +%endif %if stencil == 1 addpd m6, m6 ; m6 = 2 * u[x] @@ -207,6 +217,10 @@ SECTION .text RES_ADD_DIFF_MIXED stencil andpd m6, m0, m13 ; m6 = abs(res) + mulpd m0, m1 +%if %2 + addpd m0, m3 +%endif ; store the result add offsetq, mmsize @@ -255,9 +269,16 @@ SECTION .text INIT_YMM fma3 cglobal residual_calc_line_s1, 7, 14, 14, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\ diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up -RESIDUAL_CALC 1 +RESIDUAL_CALC 1, 0 +cglobal residual_add_line_s1, 7, 14, 14, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\ + diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up +RESIDUAL_CALC 1, 1 INIT_YMM fma3 cglobal 
residual_calc_line_s2, 7, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\ diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up, u_up2 -RESIDUAL_CALC 2 +RESIDUAL_CALC 2, 0 + +cglobal residual_add_line_s2, 7, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\ + diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up, u_up2 +RESIDUAL_CALC 2, 1 -- cgit v1.2.3