From b2d93a84e3a9c84e8c591c9f55355b5834c03d4b Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Wed, 17 Apr 2019 18:28:15 +0200 Subject: egs: premultiply diff_coeffs with the denominator in init Do not do it at every residual calc, which also allows us to get rid of an extra parameter (and reduce the number of registers used in x86 SIMD). --- residual_calc.asm | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) (limited to 'residual_calc.asm') diff --git a/residual_calc.asm b/residual_calc.asm index c51ba5e..3a5b800 100644 --- a/residual_calc.asm +++ b/residual_calc.asm @@ -39,7 +39,6 @@ SECTION .text ; mm register allocation (both s1 and s2) ; m0: accumulator for the residual -; m1-m5: splatted constant finite difference coefficients ; m6-m11: working registers ; m12: max(fabs(residual)) ; m13: mask for computing absolute values @@ -90,7 +89,6 @@ SECTION .text subpd m11, m8 ; m11 -= u[x+2] addpd m11, m10 ; m11 += u[x-2] %endif - mulpd m11, m2 vfmadd231pd m0, m11, [coeffs1q + offsetq] ; res += d_x u * diff_coeffs10 ; second derivative @@ -102,7 +100,6 @@ SECTION .text subpd m11, m10 ; m11 -= u[x-2] %endif subpd m11, m6 ; m11 -= fd0 u[x] - mulpd m11, m5 vfmadd231pd m0, m11, [coeffs2q + offsetq] ; res += d_xx u * diff_coeffs20 %endmacro @@ -139,7 +136,6 @@ SECTION .text vfmadd123pd m6, m14, m7 ; m6 = 8 m6 + m7 %endif - mulpd m6, m3 vfmadd231pd m0, m6, [diff_coeffs11q + offsetq] ; res += d_xy u * diff_coeffs11 %endmacro @@ -147,15 +143,6 @@ SECTION .text %macro RESIDUAL_CALC 1 %define stencil %1 - ; load and splat the finite difference factors - movu m0, [fd_factorsq + OFF_DIFF_COEFF_01] - vpermq m1, m0, 00000000b ; diff factor 01 -> m1 - vpermq m2, m0, 01010101b ; diff factor 10 -> m2 - vpermq m3, m0, 10101010b ; diff factor 11 -> m3 - vpermq m4, m0, 11111111b ; diff factor 02 -> m4 - movq xm0, [fd_factorsq + OFF_DIFF_COEFF_20] - vpermq m5, m0, 00000000b ; diff factor 20 -> m5 - %define u_downq fd_factorsq ; reuse the fd_factors register after it is no longer needed ; compute the mask for absolute value pcmpeqq m13, m13 @@ -266,11 +253,11 @@ SECTION .text %endmacro INIT_YMM fma3 -cglobal residual_calc_line_s1, 8, 14, 14, linesize, dst, res_max, stride, u, rhs, diff_coeffs, fd_factors,\ - diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up +cglobal residual_calc_line_s1, 7, 14, 14, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\ + diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up RESIDUAL_CALC 1 INIT_YMM fma3 -cglobal residual_calc_line_s2, 8, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs, fd_factors,\ - diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_up2 +cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\ + diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up, u_up2 RESIDUAL_CALC 2 -- cgit v1.2.3