From 2f457508915cdf61d2220b9db30d4aaecd7e07b7 Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Sun, 13 Jan 2019 14:49:57 +0100 Subject: ell_relax: compute the residual norm in residual_calc() It is cheap and avoids an extra step in mg2d. --- residual_calc.asm | 52 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 15 deletions(-) (limited to 'residual_calc.asm') diff --git a/residual_calc.asm b/residual_calc.asm index 47dda9b..95eb226 100644 --- a/residual_calc.asm +++ b/residual_calc.asm @@ -41,6 +41,8 @@ SECTION .text ; m0: accumulator for the residual ; m1-m5: splatted constant finite difference coefficients ; m6-m11: working registers +; m12: max(fabs(residual)) +; m13: mask for computing absolute values ; (s2 only) m14-m15: splatted constants 8.0, 30.0 ; calculate and add residual contributions from first and second derivatives @@ -140,7 +142,22 @@ SECTION .text ; %1: stencil %macro RESIDUAL_CALC 1 %define stencil %1 - %define u_downq fd_factorsq ; reuse the fd_factors registers after it is no longer needed + + ; load and splat the finite difference factors + movu m0, [fd_factorsq + OFF_DIFF_COEFF_01] + vpermq m1, m0, 00000000b ; diff factor 01 -> m1 + vpermq m2, m0, 01010101b ; diff factor 10 -> m2 + vpermq m3, m0, 10101010b ; diff factor 11 -> m3 + vpermq m4, m0, 11111111b ; diff factor 02 -> m4 + movq xm0, [fd_factorsq + OFF_DIFF_COEFF_20] + vpermq m5, m0, 00000000b ; diff factor 20 -> m5 + %define u_downq fd_factorsq ; reuse the fd_factors register after it is no longer needed + + ; compute the mask for absolute value + pcmpeqq m13, m13 + psrlq m13, 1 + movu m12, [res_maxq] + ; load pointers to the equation coefficients %define diff_coeffs20q diff_coeffsq ; reuse the array register to store the last pointer mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00] @@ -166,23 +183,15 @@ SECTION .text ; from now on, the register that held linesize is used as the offset into data arrays %define offsetq linesizeq - ; load and splat the finite difference factors - movu m0, [fd_factorsq + OFF_DIFF_COEFF_01] - vpermq m1, m0, 00000000b ; diff factor 01 -> m1 - vpermq m2, m0, 01010101b ; diff factor 10 -> m2 - vpermq m3, m0, 10101010b ; diff factor 11 -> m3 - vpermq m4, m0, 11111111b ; diff factor 02 -> m4 - movq xm0, [fd_factorsq + OFF_DIFF_COEFF_20] - vpermq m5, m0, 00000000b ; diff factor 20 -> m5 - ; setup pointers to the line above and below lea u_upq, [uq + strideq] mov u_downq, uq sub u_downq, strideq %if stencil == 2 lea u_up2q, [uq + 2 * strideq] - mov u_down2q, u_downq - sub u_down2q, strideq + neg strideq + add strideq, u_downq + %define u_down2q strideq ; reuse the stride register for the u[y-2] line movu m15, [const30] movu m14, [const8] @@ -206,12 +215,15 @@ SECTION .text RES_ADD_DIFF_SINGLEDIR stencil, 1 RES_ADD_DIFF_MIXED stencil + andpd m6, m0, m13 ; m6 = abs(res) + ; store the result add offsetq, mmsize jg .store_partial ; store full block movu [dstq + offsetq - mmsize], m0 + maxpd m12, m6 js .loop jmp .finish @@ -224,10 +236,16 @@ SECTION .text .store1: ; offsetq is now mmsize-2 after the write position movq [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0 + + vpermq m6, m6, 0 + maxpd m12, m6 + jmp .finish .store2: ; offsetq is now mmsize-2 after the write position movu [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0 + mova xm6, xm6 + maxpd m12, m6 jmp .finish .store3: ; offsetq is now mmsize-1 after the write position @@ -235,16 +253,20 @@ SECTION .text vextractf128 xm0, m0, 1 movq [dstq + offsetq - mmsize + 3 * ELEM_SIZE], xm0 + vpermq m6, m6, 10100100b + maxpd m12, m6 + .finish: + movu [res_maxq], m12 RET %endmacro INIT_YMM fma3 -cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_coeffs, fd_factors,\ +cglobal residual_calc_line_s1, 8, 14, 14, linesize, dst, res_max, stride, u, rhs, diff_coeffs, fd_factors,\ diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up RESIDUAL_CALC 1 INIT_YMM fma3 -cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, stride, u, rhs, diff_coeffs, fd_factors,\ - diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_up2, u_down2 +cglobal residual_calc_line_s2, 8, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs, fd_factors,\ + diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_up2 RESIDUAL_CALC 2 -- cgit v1.2.3