From b7aa818e0fc9d61d9fd37d4d4bbdc3394eef4f29 Mon Sep 17 00:00:00 2001
From: Anton Khirnov
Date: Wed, 9 Jan 2019 22:26:44 +0100
Subject: residual_calc.asm: templatize the entire residual computation

---
 residual_calc.asm | 106 +++++++++++++++---------------------------------------
 1 file changed, 29 insertions(+), 77 deletions(-)

(limited to 'residual_calc.asm')

diff --git a/residual_calc.asm b/residual_calc.asm
index 6424b2f..289c3fb 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -137,10 +137,10 @@ SECTION .text
     vfmadd231pd m0, m6, [diff_coeffs11q + offsetq] ; res += d_xy u * diff_coeffs11
 %endmacro
 
-INIT_YMM fma3
-cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_coeffs, fd_factors,\
-        diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up
-    %define u_downq fd_factorsq
+; %1: stencil
+%macro RESIDUAL_CALC 1
+    %define stencil %1
+    %define u_downq fd_factorsq ; reuse the fd_factors registers after it is no longer needed
     ; load pointers to the equation coefficients
     %define diff_coeffs20q diff_coeffsq ; reuse the array register to store the last pointer
     mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00]
@@ -179,6 +179,15 @@ cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_co
     lea u_upq, [uq + strideq]
     mov u_downq, uq
     sub u_downq, strideq
+    %if stencil == 2
+    lea u_up2q, [uq + 2 * strideq]
+    mov u_down2q, u_downq
+    sub u_down2q, strideq
+
+    movu m15, [const30]
+    movu m14, [const8]
+    movu m13, [const16]
+    %endif
 
 .loop:
     xorpd m0, m0
@@ -188,11 +197,15 @@ cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_co
     movu m6, [uq + offsetq] ; m6 = u[x]
     vfmadd231pd m0, m6, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00
 
-    addpd m6, m6 ; m6 = 2 * u[x]
+    %if stencil == 1
+    addpd m6, m6 ; m6 = 2 * u[x]
+    %else
+    mulpd m6, m15 ; m6 = 30 * u[x]
+    %endif
 
-    RES_ADD_DIFF_SINGLEDIR 1, 0
-    RES_ADD_DIFF_SINGLEDIR 1, 1
-    RES_ADD_DIFF_MIXED 1
+    RES_ADD_DIFF_SINGLEDIR stencil, 0
+    RES_ADD_DIFF_SINGLEDIR stencil, 1
+    RES_ADD_DIFF_MIXED stencil
 
     ; store the result
     movu [dstq + offsetq], m0
@@ -200,75 +213,14 @@ cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_co
     js .loop
 
     RET
+%endmacro
 
 INIT_YMM fma3
-cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, stride, u, rhs, diff_coeffs, fd_factors,\
-        diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_up2, u_down
-    %define u_down2q fd_factorsq ; reuse the fd_factors registers after it is no longer needed
-    ; load pointers to the equation coefficients
-    %define diff_coeffs20q diff_coeffsq ; reuse the array register to store the last pointer
-    mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00]
-    mov diff_coeffs01q, [diff_coeffsq + OFF_DIFF_COEFF_01]
-    mov diff_coeffs10q, [diff_coeffsq + OFF_DIFF_COEFF_10]
-    mov diff_coeffs11q, [diff_coeffsq + OFF_DIFF_COEFF_11]
-    mov diff_coeffs02q, [diff_coeffsq + OFF_DIFF_COEFF_02]
-    mov diff_coeffs20q, [diff_coeffsq + OFF_DIFF_COEFF_20]
-
-    ; setup the data pointers and the loop counter
-    shl strideq, 3
-    shl linesizeq, 3
-    add dstq, linesizeq
-    add uq, linesizeq
-    add rhsq, linesizeq
-    add diff_coeffs00q, linesizeq
-    add diff_coeffs01q, linesizeq
-    add diff_coeffs10q, linesizeq
-    add diff_coeffs11q, linesizeq
-    add diff_coeffs02q, linesizeq
-    add diff_coeffs20q, linesizeq
-    neg linesizeq
-    ; from now on, the register that held linesize is used as the offset into data arrays
-    %define offsetq linesizeq
-
-    ; load and splat the finite difference factors
-    movu m0, [fd_factorsq + OFF_DIFF_COEFF_01]
-    vpermq m1, m0, 00000000b ; diff factor 01 -> m1
-    vpermq m2, m0, 01010101b ; diff factor 10 -> m2
-    vpermq m3, m0, 10101010b ; diff factor 11 -> m3
-    vpermq m4, m0, 11111111b ; diff factor 02 -> m4
-    movq xm0, [fd_factorsq + OFF_DIFF_COEFF_20]
-    vpermq m5, m0, 00000000b ; diff factor 20 -> m5
-
-    movu m15, [const30]
-    movu m14, [const8]
-    movu m13, [const16]
-
-    ; setup pointers to the lines above and below
-    lea u_upq, [uq + strideq]
-    lea u_up2q, [uq + 2 * strideq]
-
-    mov u_downq, uq
-    sub u_downq, strideq
-    mov u_down2q, u_downq
-    sub u_down2q, strideq
-
-.loop:
-    xorpd m0, m0
-    subpd m0, [rhsq + offsetq] ; res = -rhs
-
-    ; plain value
-    movu m6, [uq + offsetq]
-    vfmadd231pd m0, m6, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00
-
-    mulpd m6, m15 ; m6 = 30 u[x]
-
-    RES_ADD_DIFF_SINGLEDIR 2, 0
-    RES_ADD_DIFF_SINGLEDIR 2, 1
-    RES_ADD_DIFF_MIXED 2
-
-    ; store the result
-    movu [dstq + offsetq], m0
-    add offsetq, mmsize
-    js .loop
+cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_coeffs, fd_factors,\
+        diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up
+RESIDUAL_CALC 1
 
-    RET
+INIT_YMM fma3
+cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, stride, u, rhs, diff_coeffs, fd_factors,\
+        diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_up2, u_down2
+RESIDUAL_CALC 2
--
cgit v1.2.3