From 4a69f2b5c22e1df0474330a176198770b42a365b Mon Sep 17 00:00:00 2001
From: Anton Khirnov
Date: Wed, 9 Jan 2019 21:29:23 +0100
Subject: residual_calc.asm: templatize computing non-mixed derivatives

---
 residual_calc.asm | 144 +++++++++++++++++++++++++++-------------------------
 1 file changed, 74 insertions(+), 70 deletions(-)

diff --git a/residual_calc.asm b/residual_calc.asm
index 7048da3..bfb6ce3 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -39,6 +39,76 @@ const64: times 8 dq 64.0
 
 SECTION .text
 
+; mm register allocation (both s1 and s2)
+; m0: accumulator for the residual
+; m1-m5: splatted constant finite difference coefficients
+; m6-m11: working registers
+; (s2 only) m12-m15: splatted constants 64.0, 16.0, 8.0, 30.0
+
+; calculate and add residual contributions from the first and second derivatives
+; along a single direction (x or y)
+;
+; parameters:
+; %1: stencil
+; %2: 0 -- x; 1 -- y
+;
+; register use (in addition to the register allocation described above):
+; m6: on entry contains u[x] multiplied by the corresponding FD coefficient, not
+;     clobbered
+; m7-m11: used for work (clobbered)
+%macro RES_ADD_DIFF_SINGLEDIR 2
+    %define stencil %1
+
+    %if %2 == 0
+        %define up1q          uq + ELEM_SIZE
+        %define up2q          uq + ELEM_SIZE * 2
+        %define um1q          uq - ELEM_SIZE
+        %define um2q          uq - ELEM_SIZE * 2
+        %define fd_coeff1     m2
+        %define fd_coeff2     m5
+        %define diff_coeffs1q diff_coeffs10q
+        %define diff_coeffs2q diff_coeffs20q
+    %else
+        %define up1q          u_upq
+        %define up2q          u_up2q
+        %define um1q          u_downq
+        %define um2q          u_down2q
+        %define fd_coeff1     m1
+        %define fd_coeff2     m4
+        %define diff_coeffs1q diff_coeffs01q
+        %define diff_coeffs2q diff_coeffs02q
+    %endif
+
+    ; load the function values
+    movu m7,  [up1q + offsetq]                      ; m7  = u[x+1]
+    movu m9,  [um1q + offsetq]                      ; m9  = u[x-1]
+%if stencil == 2
+    movu m8,  [up2q + offsetq]                      ; m8  = u[x+2]
+    movu m10, [um2q + offsetq]                      ; m10 = u[x-2]
+%endif
+
+    ; first derivative
+    subpd m11, m7, m9                               ; m11 = u[x+1] - u[x-1]
+%if stencil == 2
+    mulpd m11, m14                                  ; m11 *= 8
+    subpd m11, m8                                   ; m11 -= u[x+2]
+    addpd m11, m10                                  ; m11 += u[x-2]
+%endif
+    mulpd m11, fd_coeff1
+    vfmadd231pd m0, m11, [diff_coeffs1q + offsetq]  ; res += d_x u * diff_coeffs1
+
+    ; second derivative
+    addpd m11, m7, m9                               ; m11 = u[x+1] + u[x-1]
+%if stencil == 2
+    mulpd m11, m13                                  ; m11 *= 16
+    subpd m11, m8                                   ; m11 -= u[x+2]
+    subpd m11, m10                                  ; m11 -= u[x-2]
+%endif
+    subpd m11, m6                                   ; m11 -= fd0 u[x]
+    mulpd m11, fd_coeff2
+    vfmadd231pd m0, m11, [diff_coeffs2q + offsetq]  ; res += d_xx u * diff_coeffs2
+%endmacro
+
 INIT_YMM fma3
 cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_coeffs, fd_factors,\
                                diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up
@@ -92,31 +162,8 @@ cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_co
 
     addpd m6, m6                                    ; m6 = 2 * u[x]
 
-    ; dx, d2x
-    movu m8, [uq + offsetq + ELEM_SIZE]
-    movu m9, [uq + offsetq - ELEM_SIZE]
-
-    subpd m7, m8, m9
-    mulpd m7, m2
-    vfmadd231pd m0, m7, [diff_coeffs10q + offsetq]  ; res += d_x u * diff_coeffs10
-
-    addpd m7, m8, m9
-    subpd m7, m6
-    mulpd m7, m5
-    vfmadd231pd m0, m7, [diff_coeffs20q + offsetq]  ; res += d_xx u * diff_coeffs20
-
-    ; dy, d2y
-    movu m8, [u_upq + offsetq]
-    movu m9, [u_downq + offsetq]
-
-    subpd m7, m8, m9
-    mulpd m7, m1
-    vfmadd231pd m0, m7, [diff_coeffs01q + offsetq]  ; res += d_y u * diff_coeffs01
-
-    addpd m7, m8, m9
-    subpd m7, m6
-    mulpd m7, m4
-    vfmadd231pd m0, m7, [diff_coeffs02q + offsetq]  ; res += d_yy u * diff_coeffs02
+    RES_ADD_DIFF_SINGLEDIR 1, 0
+    RES_ADD_DIFF_SINGLEDIR 1, 1
 
     ; mixed d2xy
     movu m6, [u_upq + offsetq + ELEM_SIZE]
@@ -196,51 +243,8 @@ cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, stride, u, rhs, diff_co
 
     mulpd m6, m15                                   ; m6 = 30 u[x]
 
-    ; dx, d2x
-    movu m7,  [uq + offsetq + ELEM_SIZE]            ; m7  = u[x+1]
-    movu m8,  [uq + offsetq + ELEM_SIZE * 2]        ; m8  = u[x+2]
-    movu m9,  [uq + offsetq - ELEM_SIZE]            ; m9  = u[x-1]
-    movu m10, [uq + offsetq - ELEM_SIZE * 2]        ; m10 = u[x-2]
-
-    mulpd        m11, m14, m7                       ; m11  = 8 u[x+1]
-    vfnmadd231pd m11, m14, m9                       ; m11 -= 8 u[x-1]
-    subpd        m11, m8                            ; m11 -= u[x+2]
-    addpd        m11, m10                           ; m11 += u[x-2]
-
-    mulpd m11, m2
-    vfmadd231pd m0, m11, [diff_coeffs10q + offsetq] ; res += d_x u * diff_coeffs10
-
-    mulpd       m11, m13, m7                        ; m11  = 16 u[x+1]
-    vfmadd231pd m11, m13, m9                        ; m11 += 16 u[x-1]
-    subpd       m11, m8                             ; m11 -= u[x+2]
-    subpd       m11, m10                            ; m11 -= u[x-2]
-    subpd       m11, m6                             ; m11 -= 30 u[x]
-
-    mulpd m11, m5
-    vfmadd231pd m0, m11, [diff_coeffs20q + offsetq] ; res += d_xx u * diff_coeffs20
-
-    ; dy, d2y
-    movu m7,  [u_upq + offsetq]                     ; m7  = u[y+1]
-    movu m8,  [u_up2q + offsetq]                    ; m8  = u[y+2]
-    movu m9,  [u_downq + offsetq]                   ; m9  = u[y-1]
-    movu m10, [u_down2q + offsetq]                  ; m10 = u[y-2]
-
-    mulpd        m11, m14, m7                       ; m11  = 8 u[y+1]
-    vfnmadd231pd m11, m14, m9                       ; m11 -= 8 u[y-1]
-    subpd        m11, m8                            ; m11 -= u[y+2]
-    addpd        m11, m10                           ; m11 += u[y-2]
-
-    mulpd m11, m1
-    vfmadd231pd m0, m11, [diff_coeffs01q + offsetq] ; res += d_y u * diff_coeffs01
-
-    mulpd       m11, m13, m7                        ; m11  = 16 u[y+1]
-    vfmadd231pd m11, m13, m9                        ; m11 += 16 u[y-1]
-    subpd       m11, m8                             ; m11 -= u[y+2]
-    subpd       m11, m10                            ; m11 -= u[y-2]
-    subpd       m11, m6                             ; m11 -= 30 u[x]
-
-    mulpd m11, m4
-    vfmadd231pd m0, m11, [diff_coeffs02q + offsetq] ; res += d_yy u * diff_coeffs02
+    RES_ADD_DIFF_SINGLEDIR 2, 0
+    RES_ADD_DIFF_SINGLEDIR 2, 1
 
     ; mixed d2xy
     movu m6, [u_up2q + offsetq + 2 * ELEM_SIZE]     ; m6 = u[y+2, x+2]
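A note on the constants used above: the 8, 16, and 30 splatted into m14, m13, and m15 are the numerators of the standard central finite-difference stencils that RES_ADD_DIFF_SINGLEDIR evaluates. Writing h for the grid spacing along the chosen direction, and assuming the factors splatted into m1/m2 (first derivative) and m4/m5 (second derivative) carry the corresponding 1/(2h), 1/h^2, 1/(12h), 1/(12h^2) scalings taken from fd_factors:

stencil 1 (2nd order):
    d u   ~  ( u[+1] - u[-1] ) / (2h)
    d2 u  ~  ( u[+1] - 2 u[0] + u[-1] ) / h^2

stencil 2 (4th order):
    d u   ~  ( -u[+2] + 8 u[+1] - 8 u[-1] + u[-2] ) / (12 h)
    d2 u  ~  ( -u[+2] + 16 u[+1] - 30 u[0] + 16 u[-1] - u[-2] ) / (12 h^2)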
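For readers less fluent in AVX, here is a scalar C sketch of the per-point contribution that one RES_ADD_DIFF_SINGLEDIR invocation accumulates into m0. It is an illustrative model, not code from this repository: the function name is made up, stride stands for the direction being templatized (1 element for x, a whole line for y), and fd1/fd2/coeff1/coeff2 stand in for the splatted fd_factors registers and the per-point values of the diff_coeffs10/20 (x) or diff_coeffs01/02 (y) arrays.

#include <stddef.h>

/* scalar model of RES_ADD_DIFF_SINGLEDIR: residual contribution of the
 * first and second derivative along one direction at a single point;
 * u must point at the current grid point, with at least `stencil`
 * valid points on either side along `stride` */
static double res_add_diff_singledir(const double *u, ptrdiff_t stride,
                                     int stencil, double fd1, double fd2,
                                     double coeff1, double coeff2)
{
    const double up1 = u[stride], um1 = u[-stride];
    double d1, d2;

    if (stencil == 1) {
        d1 = up1 - um1;              /* u[x+1] - u[x-1]          */
        d2 = up1 + um1 - 2.0 * u[0]; /* u[x+1] + u[x-1] - 2 u[x] */
    } else {
        const double up2 = u[2 * stride], um2 = u[-2 * stride];
        /* the 8/16/30 constants splatted into m14/m13/m15 */
        d1 =  8.0 * (up1 - um1) - up2 + um2;
        d2 = 16.0 * (up1 + um1) - up2 - um2 - 30.0 * u[0];
    }

    /* res += d_x u * diff_coeffs1 + d_xx u * diff_coeffs2 */
    return fd1 * d1 * coeff1 + fd2 * d2 * coeff2;
}

Each output point then gets two such calls per stencil, mirroring the RES_ADD_DIFF_SINGLEDIR 1, 0 / 1, 1 (or 2, 0 / 2, 1) pairs: the x call with unit stride, the y call stepping by a full line.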