; ; SIMD for computing the residual ; Copyright 2018 Anton Khirnov ; ; This program is free software: you can redistribute it and/or modify ; it under the terms of the GNU General Public License as published by ; the Free Software Foundation, either version 3 of the License, or ; (at your option) any later version. ; ; This program is distributed in the hope that it will be useful, ; but WITHOUT ANY WARRANTY; without even the implied warranty of ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ; GNU General Public License for more details. ; ; You should have received a copy of the GNU General Public License ; along with this program. If not, see . ;/ %include "config.asm" %include "x86inc.asm" ; double precision %define ELEM_SIZE 8 ; offsets to FD coefficients for given derivative %define OFF_DIFF_COEFF_00 0 * gprsize %define OFF_DIFF_COEFF_01 1 * gprsize %define OFF_DIFF_COEFF_10 2 * gprsize %define OFF_DIFF_COEFF_11 3 * gprsize %define OFF_DIFF_COEFF_02 4 * gprsize %define OFF_DIFF_COEFF_20 5 * gprsize SECTION .rodata const8: times 8 dq 8.0 const16: times 8 dq 16.0 const30: times 8 dq 30.0 const64: times 8 dq 64.0 SECTION .text ; mm register allocation (both s1 and s2) ; m0: accumulator for the residual ; m1-m5: splatted constant finite difference coefficients ; m6-m11: working registers ; (s2 only) m12-m15: splatted constants 64.0, 16.0, 8.0, 30.0 ; calculate and add residual contributions from first and second derivatives ; along a single direction (x or y) ; ; parameters: ; %1: stencil ; %2: 0 -- x; 1 -- y ; ; register use (in addition to register allocation described above): ; m6: on entry contains u[x] multiplied by the corresponding FD coefficient, not ; clobbered ; m7-m11 used for work (clobbered) %macro RES_ADD_DIFF_SINGLEDIR 2 %define stencil %1 %if %2 == 0 %define up1q uq + ELEM_SIZE %define up2q uq + ELEM_SIZE * 2 %define um1q uq - ELEM_SIZE %define um2q uq - ELEM_SIZE * 2 %else %define up1q u_upq %define up2q u_up2q %define um1q u_downq %define um2q u_down2q %endif ; load the function values movu m7, [up1q + offsetq] ; m7 = u[x+1] movu m9, [um1q + offsetq] ; m9 = u[x-1] %if stencil == 2 movu m8, [up2q + offsetq] ; m8 = u[x+2] movu m10, [um2q + offsetq] ; m10 = u[x-2] %endif ; first derivative subpd m11, m7, m9 ; m11 = u[x+1] - u[x-1] %if stencil == 2 mulpd m11, m14 ; m11 *= 8 subpd m11, m8 ; m11 -= u[x+2] addpd m11, m10 ; m11 += u[x-2] %endif mulpd m11, m2 vfmadd231pd m0, m11, [diff_coeffs10q + offsetq] ; res += d_x u * diff_coeffs10 ; second derivative addpd m11, m7, m9 ; m11 = u[x+1] + u[x-1] %if stencil == 2 mulpd m11, m13 ; m11 *= 16 subpd m11, m8 ; m11 -= u[x+2] subpd m11, m10 ; m11 -= u[x-2] %endif subpd m11, m6 ; m11 -= fd0 u[x] mulpd m11, m5 vfmadd231pd m0, m11, [diff_coeffs20q + offsetq] ; res += d_xx u * diff_coeffs20 %endmacro INIT_YMM fma3 cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_coeffs, fd_factors,\ diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up %define u_downq fd_factorsq ; load pointers to the equation coefficients %define diff_coeffs20q diff_coeffsq ; reuse the array register to store the last pointer mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00] mov diff_coeffs01q, [diff_coeffsq + OFF_DIFF_COEFF_01] mov diff_coeffs10q, [diff_coeffsq + OFF_DIFF_COEFF_10] mov diff_coeffs11q, [diff_coeffsq + OFF_DIFF_COEFF_11] mov diff_coeffs02q, [diff_coeffsq + OFF_DIFF_COEFF_02] mov diff_coeffs20q, [diff_coeffsq + OFF_DIFF_COEFF_20] ; setup the data pointers and the loop counter shl strideq, 3 shl linesizeq, 3 add dstq, linesizeq add uq, linesizeq add rhsq, linesizeq add diff_coeffs00q, linesizeq add diff_coeffs01q, linesizeq add diff_coeffs10q, linesizeq add diff_coeffs11q, linesizeq add diff_coeffs02q, linesizeq add diff_coeffs20q, linesizeq neg linesizeq ; from now on, the register that held linesize is used as the offset into data arrays %define offsetq linesizeq ; load and splat the finite difference factors movu m0, [fd_factorsq + OFF_DIFF_COEFF_01] vpermq m1, m0, 00000000b ; diff factor 01 -> m1 vpermq m2, m0, 01010101b ; diff factor 10 -> m2 vpermq m3, m0, 10101010b ; diff factor 11 -> m3 vpermq m4, m0, 11111111b ; diff factor 02 -> m4 movq xm0, [fd_factorsq + OFF_DIFF_COEFF_20] vpermq m5, m0, 00000000b ; diff factor 20 -> m5 ; setup pointers to the line above and below lea u_upq, [uq + strideq] mov u_downq, uq sub u_downq, strideq .loop: xorpd m0, m0 subpd m0, [rhsq + offsetq] ; res = -rhs ; plain value movu m6, [uq + offsetq] ; m6 = u[x] vfmadd231pd m0, m6, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00 addpd m6, m6 ; m6 = 2 * u[x] RES_ADD_DIFF_SINGLEDIR 1, 0 RES_ADD_DIFF_SINGLEDIR 1, 1 ; mixed d2xy movu m6, [u_upq + offsetq + ELEM_SIZE] subpd m6, [u_upq + offsetq - ELEM_SIZE] subpd m6, [u_downq + offsetq + ELEM_SIZE] addpd m6, [u_downq + offsetq - ELEM_SIZE] mulpd m6, m3 vfmadd231pd m0, m6, [diff_coeffs11q + offsetq] ; res += d_xy u * diff_coeffs11 ; store the result movu [dstq + offsetq], m0 add offsetq, mmsize js .loop RET INIT_YMM fma3 cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, stride, u, rhs, diff_coeffs, fd_factors,\ diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_up2, u_down %define u_down2q fd_factorsq ; reuse the fd_factors registers after it is no longer needed ; load pointers to the equation coefficients %define diff_coeffs20q diff_coeffsq ; reuse the array register to store the last pointer mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00] mov diff_coeffs01q, [diff_coeffsq + OFF_DIFF_COEFF_01] mov diff_coeffs10q, [diff_coeffsq + OFF_DIFF_COEFF_10] mov diff_coeffs11q, [diff_coeffsq + OFF_DIFF_COEFF_11] mov diff_coeffs02q, [diff_coeffsq + OFF_DIFF_COEFF_02] mov diff_coeffs20q, [diff_coeffsq + OFF_DIFF_COEFF_20] ; setup the data pointers and the loop counter shl strideq, 3 shl linesizeq, 3 add dstq, linesizeq add uq, linesizeq add rhsq, linesizeq add diff_coeffs00q, linesizeq add diff_coeffs01q, linesizeq add diff_coeffs10q, linesizeq add diff_coeffs11q, linesizeq add diff_coeffs02q, linesizeq add diff_coeffs20q, linesizeq neg linesizeq ; from now on, the register that held linesize is used as the offset into data arrays %define offsetq linesizeq ; load and splat the finite difference factors movu m0, [fd_factorsq + OFF_DIFF_COEFF_01] vpermq m1, m0, 00000000b ; diff factor 01 -> m1 vpermq m2, m0, 01010101b ; diff factor 10 -> m2 vpermq m3, m0, 10101010b ; diff factor 11 -> m3 vpermq m4, m0, 11111111b ; diff factor 02 -> m4 movq xm0, [fd_factorsq + OFF_DIFF_COEFF_20] vpermq m5, m0, 00000000b ; diff factor 20 -> m5 movu m15, [const30] movu m14, [const8] movu m13, [const16] movu m12, [const64] ; setup pointers to the lines above and below lea u_upq, [uq + strideq] lea u_up2q, [uq + 2 * strideq] mov u_downq, uq sub u_downq, strideq mov u_down2q, u_downq sub u_down2q, strideq .loop: xorpd m0, m0 subpd m0, [rhsq + offsetq] ; res = -rhs ; plain value movu m6, [uq + offsetq] vfmadd231pd m0, m6, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00 mulpd m6, m15 ; m6 = 30 u[x] RES_ADD_DIFF_SINGLEDIR 2, 0 RES_ADD_DIFF_SINGLEDIR 2, 1 ; mixed d2xy movu m6, [u_up2q + offsetq + 2 * ELEM_SIZE] ; m6 = u[y+2, x+2] vfnmadd231pd m6, m14, [u_up2q + offsetq + 1 * ELEM_SIZE] ; m6 -= 8 u[y+2, x+1] vfmadd231pd m6, m14, [u_up2q + offsetq - 1 * ELEM_SIZE] ; m6 += 8 u[y+2, x-1] subpd m6, [u_up2q + offsetq - 2 * ELEM_SIZE] ; m6 -= u[y+2, x-2] vfnmadd231pd m6, m14, [u_upq + offsetq + 2 * ELEM_SIZE] ; m6 -= 8 u[y+1, x+2] vfmadd231pd m6, m12, [u_upq + offsetq + 1 * ELEM_SIZE] ; m6 += 64 u[y+1, x+1] vfnmadd231pd m6, m12, [u_upq + offsetq - 1 * ELEM_SIZE] ; m6 -= 64 u[y+1, x-1] vfmadd231pd m6, m14, [u_upq + offsetq - 2 * ELEM_SIZE] ; m6 += 8 u[y+1, x-2] vfmadd231pd m6, m14, [u_downq + offsetq + 2 * ELEM_SIZE] ; m6 += 8 u[y-1, x+2] vfnmadd231pd m6, m12, [u_downq + offsetq + 1 * ELEM_SIZE] ; m6 -= 64 u[y-1, x+1] vfmadd231pd m6, m12, [u_downq + offsetq - 1 * ELEM_SIZE] ; m6 += 64 u[y-1, x-1] vfnmadd231pd m6, m14, [u_downq + offsetq - 2 * ELEM_SIZE] ; m6 -= 8 u[y-1, x-2] subpd m6, [u_down2q + offsetq + 2 * ELEM_SIZE] ; m6 -= u[y-2, x+2] vfmadd231pd m6, m14, [u_down2q + offsetq + 1 * ELEM_SIZE] ; m6 += 8 u[y-2, x+1] vfnmadd231pd m6, m14, [u_down2q + offsetq - 1 * ELEM_SIZE] ; m6 += 8 u[y-2, x-1] addpd m6, [u_down2q + offsetq - 2 * ELEM_SIZE] ; m6 += u[y-2, x-2] mulpd m6, m3 vfmadd231pd m0, m6, [diff_coeffs11q + offsetq] ; res += d_xy u * diff_coeffs11 ; store the result movu [dstq + offsetq], m0 add offsetq, mmsize js .loop RET