;
; SIMD for computing the residual
; Copyright 2018 Anton Khirnov
;
; This program is free software: you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation, either version 3 of the License, or
; (at your option) any later version.
;
; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this program. If not, see <http://www.gnu.org/licenses/>.
;

%include "config.asm"
%include "x86inc.asm"

; double precision
%define ELEM_SIZE 8

; offsets to FD coefficients for given derivative
%define OFF_DIFF_COEFF_00 0 * gprsize
%define OFF_DIFF_COEFF_01 1 * gprsize
%define OFF_DIFF_COEFF_10 2 * gprsize
%define OFF_DIFF_COEFF_11 3 * gprsize
%define OFF_DIFF_COEFF_02 4 * gprsize
%define OFF_DIFF_COEFF_20 5 * gprsize

SECTION .rodata

const8:  times 8 dq 8.0
const30: times 8 dq 30.0

SECTION .text

; mm register allocation (both s1 and s2)
; m0: accumulator for the residual
; m1: dst mult factor
; m2: res mult factor
; m3: scaled input value u[x] * m2 (add variants only)
; m6-m11: working registers
; m12: max(fabs(residual))
; m13: mask for computing absolute values
; (s2 only) m14-m15: splatted constants 8.0, 30.0

; calculate and add residual contributions from first and second derivatives
; along a single direction (x or y)
;
; parameters:
; %1: stencil
; %2: 0 -- x; 1 -- y
;
; register use (in addition to register allocation described above):
; m6: on entry contains u[x] multiplied by the corresponding FD coefficient,
;     not clobbered
; m7-m11: used for work (clobbered)
%macro RES_ADD_DIFF_SINGLEDIR 2
    %define stencil %1

%if %2 == 0
    %define up1q uq + ELEM_SIZE
    %define up2q uq + ELEM_SIZE * 2
    %define um1q uq - ELEM_SIZE
    %define um2q uq - ELEM_SIZE * 2
    %define coeffs1q diff_coeffs10q
    %define coeffs2q diff_coeffs20q
%else
    %define up1q u_upq
    %define up2q u_up2q
    %define um1q u_downq
    %define um2q u_down2q
    %define coeffs1q diff_coeffs01q
    %define coeffs2q diff_coeffs02q
%endif

    ; load the function values
    movu m7,  [up1q + offsetq]                      ; m7  = u[x+1]
    movu m9,  [um1q + offsetq]                      ; m9  = u[x-1]
%if stencil == 2
    movu m8,  [up2q + offsetq]                      ; m8  = u[x+2]
    movu m10, [um2q + offsetq]                      ; m10 = u[x-2]
%endif

    ; first derivative
    subpd m11, m7, m9                               ; m11 = u[x+1] - u[x-1]
%if stencil == 2
    mulpd m11, m14                                  ; m11 *= 8
    subpd m11, m8                                   ; m11 -= u[x+2]
    addpd m11, m10                                  ; m11 += u[x-2]
%endif
    vfmadd231pd m0, m11, [coeffs1q + offsetq]       ; res += d_x u * diff_coeffs10

    ; second derivative
    addpd m11, m7, m9                               ; m11 = u[x+1] + u[x-1]
%if stencil == 2
    mulpd m11, m14
    addpd m11, m11                                  ; m11 *= 16
    subpd m11, m8                                   ; m11 -= u[x+2]
    subpd m11, m10                                  ; m11 -= u[x-2]
%endif
    subpd m11, m6                                   ; m11 -= fd0 u[x]
    vfmadd231pd m0, m11, [coeffs2q + offsetq]       ; res += d_xx u * diff_coeffs20
%endmacro
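; For reference, the finite-difference stencils evaluated by
; RES_ADD_DIFF_SINGLEDIR above and RES_ADD_DIFF_MIXED below (a sketch; the
; division by the grid spacing h is assumed to be folded into the per-point
; diff_coeffs* arrays together with the equation coefficients, so only the
; integer-weighted sums appear in the code):
;
; stencil 1 (2nd order):
;   d_x  u ~ ( u[x+1] - u[x-1]) / (2h)
;   d_xx u ~ ( u[x+1] - 2 u[x] + u[x-1]) / h^2
;   d_xy u ~ ( u[y+1,x+1] - u[y+1,x-1] - u[y-1,x+1] + u[y-1,x-1]) / (4 h_x h_y)
;
; stencil 2 (4th order):
;   d_x  u ~ (-u[x+2] +  8 u[x+1]           -  8 u[x-1] + u[x-2]) / (12 h)
;   d_xx u ~ (-u[x+2] + 16 u[x+1] - 30 u[x] + 16 u[x-1] - u[x-2]) / (12 h^2)
;   d_xy u ~ ( 64 (u[y+1,x+1] - u[y+1,x-1] - u[y-1,x+1] + u[y-1,x-1])
;             - 8 (u[y+2,x+1] - u[y+2,x-1] + u[y+1,x+2] - u[y+1,x-2]
;                  - u[y-1,x+2] + u[y-1,x-2] - u[y-2,x+1] + u[y-2,x-1])
;             +   (u[y+2,x+2] - u[y+2,x-2] - u[y-2,x+2] + u[y-2,x-2]) )
;            / (144 h_x h_y)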
; calculate and add residual contributions from the second mixed derivative
;
; parameters:
; %1: stencil
;
; register use (in addition to register allocation described above):
; m6, m7: used for work (clobbered)
%macro RES_ADD_DIFF_MIXED 1
    movu  m6, [u_upq   + 1 * ELEM_SIZE + offsetq]   ; m6  = u[y+1, x+1]
    subpd m6, [u_upq   - 1 * ELEM_SIZE + offsetq]   ;     - u[y+1, x-1]
    subpd m6, [u_downq + 1 * ELEM_SIZE + offsetq]   ;     - u[y-1, x+1]
    addpd m6, [u_downq - 1 * ELEM_SIZE + offsetq]   ;     + u[y-1, x-1]
%if %1 == 2
    movu  m7, [u_up2q   - 1 * ELEM_SIZE + offsetq]  ; m7  = u[y+2, x-1]
    subpd m7, [u_up2q   + 1 * ELEM_SIZE + offsetq]  ;     - u[y+2, x+1]
    subpd m7, [u_upq    + 2 * ELEM_SIZE + offsetq]  ;     - u[y+1, x+2]
    addpd m7, [u_upq    - 2 * ELEM_SIZE + offsetq]  ;     + u[y+1, x-2]
    addpd m7, [u_downq  + 2 * ELEM_SIZE + offsetq]  ;     + u[y-1, x+2]
    subpd m7, [u_downq  - 2 * ELEM_SIZE + offsetq]  ;     - u[y-1, x-2]
    addpd m7, [u_down2q + 1 * ELEM_SIZE + offsetq]  ;     + u[y-2, x+1]
    subpd m7, [u_down2q - 1 * ELEM_SIZE + offsetq]  ;     - u[y-2, x-1]
    vfmadd123pd m6, m14, m7                         ; m6 = 8 * m6 + m7

    movu  m7, [u_up2q   + 2 * ELEM_SIZE + offsetq]  ; m7  = u[y+2, x+2]
    subpd m7, [u_up2q   - 2 * ELEM_SIZE + offsetq]  ;     - u[y+2, x-2]
    subpd m7, [u_down2q + 2 * ELEM_SIZE + offsetq]  ;     - u[y-2, x+2]
    addpd m7, [u_down2q - 2 * ELEM_SIZE + offsetq]  ;     + u[y-2, x-2]
    vfmadd123pd m6, m14, m7                         ; m6 = 8 * m6 + m7
%endif
    vfmadd231pd m0, m6, [diff_coeffs11q + offsetq]  ; res += d_xy u * diff_coeffs11
%endmacro

; %1: stencil
; %2: 0 - calc; 1 - add
%macro RESIDUAL_CALC 2
    %define stencil %1

%if %2
    vpermq m2, m1, 0                ; m2 = splatted second scalar argument
%endif
    vpermq m1, m0, 0                ; m1 = splatted first scalar argument

    ; compute the mask for absolute value
    pcmpeqq m13, m13
    psrlq   m13, 1

    movu m12, [res_maxq]            ; running per-lane max of |res|

    ; load pointers to the equation coefficients
    %define diff_coeffs20q diff_coeffsq ; reuse the array register to store the last pointer
    mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00]
    mov diff_coeffs01q, [diff_coeffsq + OFF_DIFF_COEFF_01]
    mov diff_coeffs10q, [diff_coeffsq + OFF_DIFF_COEFF_10]
    mov diff_coeffs11q, [diff_coeffsq + OFF_DIFF_COEFF_11]
    mov diff_coeffs02q, [diff_coeffsq + OFF_DIFF_COEFF_02]
    mov diff_coeffs20q, [diff_coeffsq + OFF_DIFF_COEFF_20]

    ; setup the data pointers and the loop counter
    shl strideq,   3
    shl linesizeq, 3

    add dstq,           linesizeq
    add uq,             linesizeq
    add rhsq,           linesizeq
    add diff_coeffs00q, linesizeq
    add diff_coeffs01q, linesizeq
    add diff_coeffs10q, linesizeq
    add diff_coeffs11q, linesizeq
    add diff_coeffs02q, linesizeq
    add diff_coeffs20q, linesizeq

    neg linesizeq
    ; from now on, the register that held linesize is used as the offset into
    ; the data arrays
    %define offsetq linesizeq

    ; setup pointers to the line above and below
    lea u_upq,   [uq + strideq]
    mov u_downq, uq
    sub u_downq, strideq

%if stencil == 2
    lea u_up2q, [uq + 2 * strideq]

    neg strideq
    add strideq, u_downq
    %define u_down2q strideq ; reuse the stride register for the u[y-2] line

    movu m15, [const30]
    movu m14, [const8]
%endif

.loop:
    xorpd m0, m0
    subpd m0, [rhsq + offsetq]                      ; res = -rhs

    ; plain value
    movu m6, [uq + offsetq]                         ; m6 = u[x]
    vfmadd231pd m0, m6, [diff_coeffs00q + offsetq]  ; res += u * diff_coeffs00
%if %2
    mulpd m3, m6, m2                                ; m3 = u[x] * m2
%endif

%if stencil == 1
    addpd m6, m6                                    ; m6 = 2 * u[x]
%else
    mulpd m6, m15                                   ; m6 = 30 * u[x]
%endif

    RES_ADD_DIFF_SINGLEDIR stencil, 0
    RES_ADD_DIFF_SINGLEDIR stencil, 1
    RES_ADD_DIFF_MIXED     stencil

    andpd m6, m0, m13                               ; m6 = abs(res)

    mulpd m0, m1                                    ; res *= m1
%if %2
    addpd m0, m3                                    ; res += u[x] * m2
%endif

    ; store the result
    add offsetq, mmsize
    jg .store_partial

    ; store full block
    movu [dstq + offsetq - mmsize], m0
    maxpd m12, m6
    js .loop                                        ; offsetq < 0: more blocks left
    jmp .finish

.store_partial:
    sub offsetq, ELEM_SIZE
    jz .store3
    sub offsetq, ELEM_SIZE
    jz .store2

.store1:
    ; offsetq is now mmsize - 2 * ELEM_SIZE bytes past the write position
    movq [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0
    vpermq m6, m6, 0            ; broadcast the single valid element before the max
    maxpd m12, m6
    jmp .finish
.store2:
    ; offsetq is now mmsize - 2 * ELEM_SIZE bytes past the write position
    movu [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0
    mova xm6, xm6               ; zero the two invalid upper elements (|res| >= 0, so safe for max)
    maxpd m12, m6
    jmp .finish
.store3:
    ; offsetq is now mmsize - ELEM_SIZE bytes past the write position
    movu [dstq + offsetq - mmsize + 1 * ELEM_SIZE], xm0
    vextractf128 xm0, m0, 1
    movq [dstq + offsetq - mmsize + 3 * ELEM_SIZE], xm0
    vpermq m6, m6, 10100100b    ; duplicate valid element 2 into the invalid top lane
    maxpd m12, m6

.finish:
    movu [res_maxq], m12
    RET
%endmacro
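; The entry points below are assumed to correspond to C prototypes roughly
; like the following (a sketch inferred from the cglobal signatures; the
; parameter names and exact integer types are illustrative, not taken from
; this file). The trailing double arguments arrive in xmm0/xmm1 on SysV
; x86-64 and are splatted into m1/m2 at the top of RESIDUAL_CALC:
;
;   // calc variants: dst = res_mult * residual
;   void residual_calc_line_s1(ptrdiff_t linesize, double *dst,
;                              double *res_max, ptrdiff_t stride,
;                              const double *u, const double *rhs,
;                              const double * const *diff_coeffs,
;                              double res_mult);
;
;   // add variants: dst = res_mult * residual + u_mult * u
;   void residual_add_line_s1(ptrdiff_t linesize, double *dst,
;                             double *res_max, ptrdiff_t stride,
;                             const double *u, const double *rhs,
;                             const double * const *diff_coeffs,
;                             double res_mult, double u_mult);
;
; res_max is assumed to point to an mmsize-wide (4-double) buffer holding a
; running per-lane maximum of |residual|, to be reduced by the caller.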
INIT_YMM fma3
cglobal residual_calc_line_s1, 7, 14, 14, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
                                          diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11,\
                                          diff_coeffs02, u_down, u_up
    RESIDUAL_CALC 1, 0

cglobal residual_add_line_s1, 7, 14, 14, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
                                         diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11,\
                                         diff_coeffs02, u_down, u_up
    RESIDUAL_CALC 1, 1

INIT_YMM fma3
cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
                                          diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11,\
                                          diff_coeffs02, u_down, u_up, u_up2
    RESIDUAL_CALC 2, 0

cglobal residual_add_line_s2, 7, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
                                         diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11,\
                                         diff_coeffs02, u_down, u_up, u_up2
    RESIDUAL_CALC 2, 1
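; For reference, a scalar sketch of what one iteration of .loop computes per
; element for the 2nd-order "calc" case (illustrative C under the layout
; assumptions above; u_up/u_down stand for u +/- one stride, and each
; diff_coeffs row, indexed in OFF_DIFF_COEFF_* order, is assumed to already
; contain the FD normalization):
;
;   double res = -rhs[x]
;       + u[x]                                                * diff_coeffs[0][x] // 00
;       + (u_up[x] - u_down[x])                               * diff_coeffs[1][x] // 01
;       + (u[x+1]  - u[x-1])                                  * diff_coeffs[2][x] // 10
;       + (u_up[x+1] - u_up[x-1] - u_down[x+1] + u_down[x-1]) * diff_coeffs[3][x] // 11
;       + (u_up[x] - 2 * u[x] + u_down[x])                    * diff_coeffs[4][x] // 02
;       + (u[x+1]  - 2 * u[x] + u[x-1])                       * diff_coeffs[5][x] // 20
;   dst[x] = res_mult * res;
;   res_max[lane] = fmax(res_max[lane], fabs(res)); // kept 4-wide in the SIMD code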