diff options
Diffstat (limited to 'residual_calc.asm')
-rw-r--r-- | residual_calc.asm | 84 |
1 files changed, 41 insertions, 43 deletions
diff --git a/residual_calc.asm b/residual_calc.asm
index 9cd530d..42eb50b 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -22,14 +22,6 @@
 ; double precision
 %define ELEM_SIZE 8
-; offsets to FD coefficients for given derivative
-%define OFF_DIFF_COEFF_00 0 * gprsize
-%define OFF_DIFF_COEFF_01 1 * gprsize
-%define OFF_DIFF_COEFF_10 2 * gprsize
-%define OFF_DIFF_COEFF_11 3 * gprsize
-%define OFF_DIFF_COEFF_02 4 * gprsize
-%define OFF_DIFF_COEFF_20 5 * gprsize
-
 SECTION .rodata
 const8: times 8 dq 8.0
@@ -65,15 +57,15 @@ SECTION .text
 %define up2q uq + ELEM_SIZE * 2
 %define um1q uq - ELEM_SIZE
 %define um2q uq - ELEM_SIZE * 2
- %define coeffs1q diff_coeffs10q
- %define coeffs2q diff_coeffs20q
+ %define coeffs1q diff_coeffsq + diff_coeff_offset_10
+ %define coeffs2q diff_coeffsq + diff_coeff_offset_20
 %else
 %define up1q u_upq
 %define up2q u_up2q
 %define um1q u_downq
 %define um2q u_down2q
- %define coeffs1q diff_coeffs01q
- %define coeffs2q diff_coeffs02q
+ %define coeffs1q diff_coeffsq + diff_coeff_offset_01
+ %define coeffs2q diff_coeffsq + diff_coeff_offset_02
 %endif
 ; load the function values
@@ -91,7 +83,7 @@ SECTION .text
 subpd m11, m8 ; m11 -= u[x+2]
 addpd m11, m10 ; m11 += u[x-2]
 %endif
- vfmadd231pd m0, m11, [coeffs1q + offsetq] ; res += d_x u * diff_coeffs10
+ vfmadd231pd m0, m11, [coeffs1q] ; res += d_x u * diff_coeffs10
 ; second derivative
 addpd m11, m7, m9 ; m11 = u[x+1] + u[x-1]
@@ -102,7 +94,7 @@ SECTION .text
 subpd m11, m10 ; m11 -= u[x-2]
 %endif
 subpd m11, m6 ; m11 -= fd0 u[x]
- vfmadd231pd m0, m11, [coeffs2q + offsetq] ; res += d_xx u * diff_coeffs20
+ vfmadd231pd m0, m11, [coeffs2q] ; res += d_xx u * diff_coeffs20
 %endmacro
 ; calculate and add residual contributions from the second mixed derivative
@@ -138,13 +130,30 @@ SECTION .text
 vfmadd123pd m6, m14, m7 ; m6 = 8 m6 + m7
 %endif
- vfmadd231pd m0, m6, [diff_coeffs11q + offsetq] ; res += d_xy u * diff_coeffs11
+ vfmadd231pd m0, m6, [diff_coeffsq + diff_coeff_offset_11] ; res += d_xy u * diff_coeffs11
 %endmacro
 ; %1: stencil
 ; %2: 0 - calc; 1 - add
 %macro RESIDUAL_CALC 2
- %define stencil %1
+
+%define stencil %1
+
+%if %2
+%define opname add
+%else
+%define opname calc
+%endif
+
+; typedef void ResidualLineCalc/Add(
+; size_t linesize, double *dst, double *dst_max,
+; ptrdiff_t u_stride, const double *u, const double *rhs,
+; const double *diff_coeffs, ptrdiff_t diff_coeffs_offset,
+; double res_mult, [double u_mult (add only)])
+cglobal residual_line_ %+ opname %+ _s %+ stencil, \
+ 8, 13, 14 + stencil * 2, \
+ linesize, dst, res_max, u_stride, u, rhs, diff_coeffs, diff_coeffs_offset, \
+ u_down, u_up, u_up2, diff_coeffs_off3, diff_coeffs_off5
 %if %2
 vpermq m2, m1, 0
@@ -156,27 +165,13 @@ SECTION .text
 psrlq m13, 1
 movu m12, [res_maxq]
- ; load pointers to the equation coefficients
- %define diff_coeffs20q diff_coeffsq ; reuse the array register to store the last pointer
- mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00]
- mov diff_coeffs01q, [diff_coeffsq + OFF_DIFF_COEFF_01]
- mov diff_coeffs10q, [diff_coeffsq + OFF_DIFF_COEFF_10]
- mov diff_coeffs11q, [diff_coeffsq + OFF_DIFF_COEFF_11]
- mov diff_coeffs02q, [diff_coeffsq + OFF_DIFF_COEFF_02]
- mov diff_coeffs20q, [diff_coeffsq + OFF_DIFF_COEFF_20]
-
 ; setup the data pointers and the loop counter
 shl u_strideq, 3
+ shl diff_coeffs_offsetq, 3
 shl linesizeq, 3
 add dstq, linesizeq
 add uq, linesizeq
 add rhsq, linesizeq
- add diff_coeffs00q, linesizeq
- add diff_coeffs01q, linesizeq
- add diff_coeffs10q, linesizeq
- add diff_coeffs11q, linesizeq
- add diff_coeffs02q, linesizeq
- add diff_coeffs20q, linesizeq
 neg linesizeq
 ; from now on, the register that held linesize is used as the offset into data arrays
 %define offsetq linesizeq
@@ -195,13 +190,26 @@ SECTION .text
 movu m14, [const8]
 %endif
+ ; offsets to FD coefficients for given derivative
+ %define diff_coeff_offset_01 1 * diff_coeffs_offsetq
+ %define diff_coeff_offset_10 2 * diff_coeffs_offsetq
+
+ lea diff_coeffs_off3q, [diff_coeffs_offsetq * 2 + diff_coeffs_offsetq]
+ %define diff_coeff_offset_11 diff_coeffs_off3q
+
+ %define diff_coeff_offset_02 4 * diff_coeffs_offsetq
+
+ lea diff_coeffs_off5q, [diff_coeffs_offsetq * 4 + diff_coeffs_offsetq]
+ %define diff_coeff_offset_20 diff_coeffs_off5q
+
+
+
 .loop:
 xorpd m0, m0
 subpd m0, [rhsq + offsetq] ; res = -rhs
 ; plain value
 movu m6, [uq + offsetq] ; m6 = u[x]
- vfmadd231pd m0, m6, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00
+ vfmadd231pd m0, m6, [diff_coeffsq] ; res += u * diff_coeffs00
 %if %2
 mulpd m3, m6, m2
 %endif
@@ -223,6 +231,7 @@ SECTION .text
 %endif
 ; store the result
+ add diff_coeffsq, mmsize
 add offsetq, mmsize
 jg .store_partial
@@ -267,18 +276,7 @@ SECTION .text
 %endmacro
 INIT_YMM fma3
-cglobal residual_calc_line_s1, 7, 14, 14, linesize, dst, res_max, u_stride, u, rhs, diff_coeffs,\
- diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up
 RESIDUAL_CALC 1, 0
-cglobal residual_add_line_s1, 7, 14, 14, linesize, dst, res_max, u_stride, u, rhs, diff_coeffs,\
- diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up
 RESIDUAL_CALC 1, 1
-
-INIT_YMM fma3
-cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, res_max, u_stride, u, rhs, diff_coeffs,\
- diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up, u_up2
 RESIDUAL_CALC 2, 0
-
-cglobal residual_add_line_s2, 7, 15, 16, linesize, dst, res_max, u_stride, u, rhs, diff_coeffs,\
- diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up, u_up2
 RESIDUAL_CALC 2, 1
|