;
; SIMD for computing the residual
; Copyright 2018 Anton Khirnov
;
; This program is free software: you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation, either version 3 of the License, or
; (at your option) any later version.
;
; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this program. If not, see <http://www.gnu.org/licenses/>.
;

%include "config.asm"
%include "x86inc.asm"

; double precision
%define ELEM_SIZE 8

; offsets to FD coefficients for given derivative
%define OFF_DIFF_COEFF_00 0 * gprsize
%define OFF_DIFF_COEFF_01 1 * gprsize
%define OFF_DIFF_COEFF_10 2 * gprsize
%define OFF_DIFF_COEFF_11 3 * gprsize
%define OFF_DIFF_COEFF_02 4 * gprsize
%define OFF_DIFF_COEFF_20 5 * gprsize

SECTION .rodata

const5:    times 8 dq 5.0
const8:    times 8 dq 8.0
const9:    times 8 dq 9.0
const10:   times 8 dq 10.0
const13_5: times 8 dq 13.5
const30:   times 8 dq 30.0
const45:   times 8 dq 45.0
const245:  times 8 dq 245.0

SECTION .text

; mm register allocation (both s1 and s2)
; m0: accumulator for the residual
; m1: dst mult factor
; m2: res mult factor
; m6-m11: working registers
; m12: max(fabs(residual))
; m13: mask for computing absolute values
; (s2 only) m14-m15: splatted constants 8.0, 30.0

; calculate and add residual contributions from first and second derivatives
; along a single direction (x or y)
;
; parameters:
; %1: stencil
; %2: 0 -- x; 1 -- y
;
; register use (in addition to register allocation described above):
; m6: on entry contains u[x] multiplied by the corresponding FD coefficient, not
;     clobbered
; m7-m11: used for work (clobbered)
%macro RES_ADD_DIFF_SINGLEDIR 2
    %define stencil %1

%if %2 == 0
    %define up1q uq + ELEM_SIZE
    %define up2q uq + ELEM_SIZE * 2
    %define um1q uq - ELEM_SIZE
    %define um2q uq - ELEM_SIZE * 2
    %define coeffs1q diff_coeffs10q
    %define coeffs2q diff_coeffs20q
%else
    %define up1q u_upq
    %define up2q u_up2q
    %define um1q u_downq
    %define um2q u_down2q
    %define coeffs1q diff_coeffs01q
    %define coeffs2q diff_coeffs02q
%endif

    ; load the function values
    movu m7, [up1q + offsetq]                       ; m7 = u[x+1]
    movu m9, [um1q + offsetq]                       ; m9 = u[x-1]
%if stencil == 2
    movu m8,  [up2q + offsetq]                      ; m8  = u[x+2]
    movu m10, [um2q + offsetq]                      ; m10 = u[x-2]
%endif

    ; first derivative
    subpd m11, m7, m9                               ; m11 = u[x+1] - u[x-1]
%if stencil == 2
    mulpd m11, m14                                  ; m11 *= 8
    subpd m11, m8                                   ; m11 -= u[x+2]
    addpd m11, m10                                  ; m11 += u[x-2]
%endif
    vfmadd231pd m0, m11, [coeffs1q + offsetq]       ; res += d_x u * diff_coeffs10

    ; second derivative
    addpd m11, m7, m9                               ; m11 = u[x+1] + u[x-1]
%if stencil == 2
    mulpd m11, m14
    addpd m11, m11                                  ; m11 *= 16
    subpd m11, m8                                   ; m11 -= u[x+2]
    subpd m11, m10                                  ; m11 -= u[x-2]
%endif
    subpd m11, m6                                   ; m11 -= fd0 u[x]
    vfmadd231pd m0, m11, [coeffs2q + offsetq]       ; res += d_xx u * diff_coeffs20
%endmacro
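
; For reference, the expressions accumulated above are the standard central-difference
; numerators; any grid-spacing normalization (1/2h, 1/h^2, 1/12h, ...) is assumed to be
; folded into the diff_coeffs arrays by the caller:
;   stencil 1 (2nd order):  d_x  u ~ u[x+1] - u[x-1]
;                           d_xx u ~ u[x+1] + u[x-1] - 2 u[x]
;   stencil 2 (4th order):  d_x  u ~ 8 (u[x+1] - u[x-1]) - u[x+2] + u[x-2]
;                           d_xx u ~ 16 (u[x+1] + u[x-1]) - u[x+2] - u[x-2] - 30 u[x]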

; calculate and add residual contributions from the second mixed derivative
;
; parameters:
; %1: stencil
;
; register use (in addition to register allocation described above):
; m6, m7: used for work (clobbered)
%macro RES_ADD_DIFF_MIXED 1
    movu  m6, [u_upq   + 1 * ELEM_SIZE + offsetq]   ; m6 = u[y+1, x+1]
    subpd m6, [u_upq   - 1 * ELEM_SIZE + offsetq]   ;    - u[y+1, x-1]
    subpd m6, [u_downq + 1 * ELEM_SIZE + offsetq]   ;    - u[y-1, x+1]
    addpd m6, [u_downq - 1 * ELEM_SIZE + offsetq]   ;    + u[y-1, x-1]
%if %1 == 2
    movu  m7, [u_up2q   - 1 * ELEM_SIZE + offsetq]  ; m7 = u[y+2, x-1]
    subpd m7, [u_up2q   + 1 * ELEM_SIZE + offsetq]  ;    - u[y+2, x+1]
    subpd m7, [u_upq    + 2 * ELEM_SIZE + offsetq]  ;    - u[y+1, x+2]
    addpd m7, [u_upq    - 2 * ELEM_SIZE + offsetq]  ;    + u[y+1, x-2]
    addpd m7, [u_downq  + 2 * ELEM_SIZE + offsetq]  ;    + u[y-1, x+2]
    subpd m7, [u_downq  - 2 * ELEM_SIZE + offsetq]  ;    - u[y-1, x-2]
    addpd m7, [u_down2q + 1 * ELEM_SIZE + offsetq]  ;    + u[y-2, x+1]
    subpd m7, [u_down2q - 1 * ELEM_SIZE + offsetq]  ;    - u[y-2, x-1]

    vfmadd123pd m6, m14, m7                         ; m6 = 8 m6 + m7

    movu  m7, [u_up2q   + 2 * ELEM_SIZE + offsetq]  ; m7 = u[y+2, x+2]
    subpd m7, [u_up2q   - 2 * ELEM_SIZE + offsetq]  ;    - u[y+2, x-2]
    subpd m7, [u_down2q + 2 * ELEM_SIZE + offsetq]  ;    - u[y-2, x+2]
    addpd m7, [u_down2q - 2 * ELEM_SIZE + offsetq]  ;    + u[y-2, x-2]

    vfmadd123pd m6, m14, m7                         ; m6 = 8 m6 + m7
%endif
    vfmadd231pd m0, m6, [diff_coeffs11q + offsetq]  ; res += d_xy u * diff_coeffs11
%endmacro

; %1: stencil
; %2: 0 - calc; 1 - add
%macro RESIDUAL_CALC 2
    %define stencil %1

%if %2
    vpermq m2, m1, 0
%endif
    vpermq m1, m0, 0

    ; compute the mask for absolute value
    pcmpeqq m13, m13
    psrlq   m13, 1

    movu m12, [res_maxq]

    ; load pointers to the equation coefficients
    %define diff_coeffs20q diff_coeffsq             ; reuse the array register to store the last pointer
    mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00]
    mov diff_coeffs01q, [diff_coeffsq + OFF_DIFF_COEFF_01]
    mov diff_coeffs10q, [diff_coeffsq + OFF_DIFF_COEFF_10]
    mov diff_coeffs11q, [diff_coeffsq + OFF_DIFF_COEFF_11]
    mov diff_coeffs02q, [diff_coeffsq + OFF_DIFF_COEFF_02]
    mov diff_coeffs20q, [diff_coeffsq + OFF_DIFF_COEFF_20]

    ; setup the data pointers and the loop counter
    shl strideq,   3
    shl linesizeq, 3

    add dstq,           linesizeq
    add uq,             linesizeq
    add rhsq,           linesizeq
    add diff_coeffs00q, linesizeq
    add diff_coeffs01q, linesizeq
    add diff_coeffs10q, linesizeq
    add diff_coeffs11q, linesizeq
    add diff_coeffs02q, linesizeq
    add diff_coeffs20q, linesizeq

    neg linesizeq
    ; from now on, the register that held linesize is used as the offset into data arrays
    %define offsetq linesizeq

    ; setup pointers to the line above and below
    lea u_upq,   [uq + strideq]
    mov u_downq, uq
    sub u_downq, strideq

%if stencil == 2
    lea u_up2q, [uq + 2 * strideq]

    neg strideq
    add strideq, u_downq
    %define u_down2q strideq                        ; reuse the stride register for the u[y-2] line

    movu m15, [const30]
    movu m14, [const8]
%endif

.loop:
    xorpd m0, m0
    subpd m0, [rhsq + offsetq]                      ; res = -rhs

    ; plain value
    movu        m6, [uq + offsetq]                  ; m6 = u[x]
    vfmadd231pd m0, m6, [diff_coeffs00q + offsetq]  ; res += u * diff_coeffs00

%if %2
    mulpd m3, m6, m2
%endif

%if stencil == 1
    addpd m6, m6                                    ; m6 = 2 * u[x]
%else
    mulpd m6, m15                                   ; m6 = 30 * u[x]
%endif

    RES_ADD_DIFF_SINGLEDIR stencil, 0
    RES_ADD_DIFF_SINGLEDIR stencil, 1
    RES_ADD_DIFF_MIXED     stencil

    andpd m6, m0, m13                               ; m6 = abs(res)
    mulpd m0, m1
%if %2
    addpd m0, m3
%endif

    ; store the result
    add offsetq, mmsize
    jg .store_partial

    ; store full block
    movu [dstq + offsetq - mmsize], m0
    maxpd m12, m6

    js .loop
    jmp .finish

.store_partial:
    sub offsetq, ELEM_SIZE
    jz .store3
    sub offsetq, ELEM_SIZE
    jz .store2

.store1:
    ; offsetq is now mmsize-2 after the write position
    movq [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0
    vpermq m6, m6, 0
    maxpd m12, m6
    jmp .finish

.store2:
    ; offsetq is now mmsize-2 after the write position
    movu [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0
    mova xm6, xm6
    maxpd m12, m6
    jmp .finish

.store3:
    ; offsetq is now mmsize-1 after the write position
    movu [dstq + offsetq - mmsize + 1 * ELEM_SIZE], xm0
    vextractf128 xm0, m0, 1
    movq [dstq + offsetq - mmsize + 3 * ELEM_SIZE], xm0

    vpermq m6, m6, 10100100b
    maxpd m12, m6

.finish:
    movu [res_maxq], m12
    RET
%endmacro
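
; The entry points below instantiate the macro for the 2nd-order (s1) and 4th-order
; (s2) stencils, each in a "calc" and an "add" variant. As far as can be read off the
; code above, the per-point result is
;     calc:  dst[i] = f0 * res[i]
;     add:   dst[i] = f0 * res[i] + f1 * u[i]
; where f0/f1 are broadcast from the scalar factors found in m0/m1 on entry, and
; res[i] is the accumulated residual. res_max is treated as a full vector of
; mmsize / ELEM_SIZE doubles: it is loaded, merged per lane with max(|res|), and
; stored back, so the final horizontal reduction is assumed to happen in the caller.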

INIT_YMM fma3
cglobal residual_calc_line_s1, 7, 14, 14, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
                               diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up
    RESIDUAL_CALC 1, 0

cglobal residual_add_line_s1, 7, 14, 14, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
                              diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up
    RESIDUAL_CALC 1, 1

INIT_YMM fma3
cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
                               diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up, u_up2
    RESIDUAL_CALC 2, 0

cglobal residual_add_line_s2, 7, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
                              diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up, u_up2
    RESIDUAL_CALC 2, 1
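
; For the 6th-order stencil implemented below, the accumulated expressions are
; (again assuming the grid-spacing normalization lives in the diff_coeffs arrays):
;   d_x  u ~ 45 (u[x+1] - u[x-1]) - 9 (u[x+2] - u[x-2]) + u[x+3] - u[x-3]
;   d_xx u ~ 135 (u[x+1] + u[x-1]) - 13.5 (u[x+2] + u[x-2]) + u[x+3] + u[x-3] - 245 u[x]
;   d_xy u ~ the d_x expression evaluated on the rows y+/-1, y+/-2, y+/-3 and combined
;            with the same 45, -9, 1 weights (the y+0 row has zero weight)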

; compute the 6th-order first-derivative numerator of the line at %3
; %1: output; %2: clobbered temporary; %3: address of the center element
%macro DX3 3
    movu  %1, [%3 + ELEM_SIZE * 1]                  ; out = u[x+1]
    subpd %1, [%3 - ELEM_SIZE * 1]                  ;      -u[x-1]
    movu  %2, [%3 + ELEM_SIZE * 2]                  ; tmp = u[x+2]
    subpd %2, [%3 - ELEM_SIZE * 2]                  ;      -u[x-2]
    vfmsub132pd %1, %2, [const5]                    ; out = 5 * out - tmp
    movu  %2, [%3 + ELEM_SIZE * 3]                  ; tmp = u[x+3]
    subpd %2, [%3 - ELEM_SIZE * 3]                  ;      -u[x-3]
    vfmadd132pd %1, %2, [const9]                    ; out = 9 * out + tmp
%endmacro

; calculate and add residual contributions from the first and second derivatives
; along x (6th-order stencil)
%macro RES_ADD_DIFF_X_3 0
    ; first derivative
    DX3 m7, m8, uq + offsetq
    vfmadd231pd m0, m7, [diff_coeffs10q + offsetq]  ; res += d_x u * diff_coeffs10

    ; second derivative
    movu  m7, [uq + offsetq + ELEM_SIZE * 1]        ; m7 = u[x+1]
    addpd m7, [uq + offsetq - ELEM_SIZE * 1]        ;     +u[x-1]
    movu  m8, [uq + offsetq + ELEM_SIZE * 2]        ; m8 = u[x+2]
    addpd m8, [uq + offsetq - ELEM_SIZE * 2]        ;     +u[x-2]
    vfmsub132pd m7, m8, [const10]                   ; m7 = 10 * m7 - m8
    movu  m8, [uq + offsetq + ELEM_SIZE * 3]        ; m8 = u[x+3]
    addpd m8, [uq + offsetq - ELEM_SIZE * 3]        ;     +u[x-3]
    vfmadd132pd m7, m8, [const13_5]                 ; m7 = 13.5 * m7 + m8
    vfnmadd231pd m7, m6, [const245]                 ; m7 = m7 - 245 * u[0]
    vfmadd231pd m0, m7, [diff_coeffs20q + offsetq]  ; res += d_xx u * diff_coeffs20
%endmacro

; calculate and add residual contributions from the first and second derivatives
; along y (6th-order stencil)
%macro RES_ADD_DIFF_Y_3 0
    ; first derivative
    lea u_upq,   [uq + strideq]
    mov u_downq, uq
    sub u_downq, strideq

    movu  m7, [u_upq   + offsetq]                   ; m7 = u[y+1]
    subpd m7, [u_downq + offsetq]                   ;     -u[y-1]

    add u_upq,   strideq
    sub u_downq, strideq

    movu  m8, [u_upq   + offsetq]                   ; m8 = u[y+2]
    subpd m8, [u_downq + offsetq]                   ;     -u[y-2]

    vfmsub132pd m7, m8, [const5]                    ; m7 = 5 * m7 - m8

    add u_upq,   strideq
    sub u_downq, strideq

    movu  m8, [u_upq   + offsetq]                   ; m8 = u[y+3]
    subpd m8, [u_downq + offsetq]                   ;     -u[y-3]

    vfmadd132pd m7, m8, [const9]                    ; m7 = 9 * m7 + m8

    vfmadd231pd m0, m7, [diff_coeffs01q + offsetq]  ; res += d_y u * diff_coeffs01

    ; second derivative
    lea u_upq,   [uq + strideq]
    mov u_downq, uq
    sub u_downq, strideq

    movu  m7, [u_upq   + offsetq]                   ; m7 = u[y+1]
    addpd m7, [u_downq + offsetq]                   ;     +u[y-1]

    add u_upq,   strideq
    sub u_downq, strideq

    movu  m8, [u_upq   + offsetq]                   ; m8 = u[y+2]
    addpd m8, [u_downq + offsetq]                   ;     +u[y-2]

    vfmsub132pd m7, m8, [const10]                   ; m7 = 10 * m7 - m8

    add u_upq,   strideq
    sub u_downq, strideq

    movu  m8, [u_upq   + offsetq]                   ; m8 = u[y+3]
    addpd m8, [u_downq + offsetq]                   ;     +u[y-3]

    vfmadd132pd m7, m8, [const13_5]                 ; m7 = 13.5 * m7 + m8

    vfnmadd231pd m7, m6, [const245]                 ; m7 = m7 - 245 * u[0]
    vfmadd231pd m0, m7, [diff_coeffs02q + offsetq]  ; res += d_yy u * diff_coeffs02
%endmacro

; calculate and add the residual contribution from the mixed second derivative
; (6th-order stencil)
%macro RES_ADD_DIFF_MIXED_3 0
    lea u_upq, [uq + 2 * strideq]
    add u_upq, strideq

    DX3 m6, m7, u_upq + offsetq                     ; m6 = dx u[y + 3]

    sub u_upq, strideq
    DX3 m7, m8, u_upq + offsetq                     ; m7 = dx u[y + 2]
    vfnmadd231pd m6, m7, [const9]                   ; m6 = m6 - 9 * m7

    sub u_upq, strideq
    DX3 m7, m8, u_upq + offsetq                     ; m7 = dx u[y + 1]
    vfmadd231pd m6, m7, [const45]                   ; m6 = m6 + 45 * m7

    sub u_upq, strideq
    sub u_upq, strideq
    DX3 m7, m8, u_upq + offsetq                     ; m7 = dx u[y - 1]
    vfnmadd231pd m6, m7, [const45]                  ; m6 = m6 - 45 * m7

    sub u_upq, strideq
    DX3 m7, m8, u_upq + offsetq                     ; m7 = dx u[y - 2]
    vfmadd231pd m6, m7, [const9]                    ; m6 = m6 + 9 * m7

    sub u_upq, strideq
    DX3 m7, m8, u_upq + offsetq                     ; m7 = dx u[y - 3]
    subpd m6, m7                                    ; m6 -= m7

    vfmadd231pd m0, m6, [diff_coeffs11q + offsetq]  ; res += d_xy u * diff_coeffs11
%endmacro

; %1: stencil
; %2: 0 - calc; 1 - add
%macro RESIDUAL_CALC_3 2
%if %2
    vpermq m2, m1, 0
%endif
    vpermq m1, m0, 0

    ; compute the mask for absolute value
    pcmpeqq m13, m13
    psrlq   m13, 1

    movu m12, [res_maxq]

    ; load pointers to the equation coefficients
    %define diff_coeffs20q diff_coeffsq             ; reuse the array register to store the last pointer
    mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00]
    mov diff_coeffs01q, [diff_coeffsq + OFF_DIFF_COEFF_01]
    mov diff_coeffs10q, [diff_coeffsq + OFF_DIFF_COEFF_10]
    mov diff_coeffs11q, [diff_coeffsq + OFF_DIFF_COEFF_11]
    mov diff_coeffs02q, [diff_coeffsq + OFF_DIFF_COEFF_02]
    mov diff_coeffs20q, [diff_coeffsq + OFF_DIFF_COEFF_20]

    ; setup the data pointers and the loop counter
    shl strideq,   3
    shl linesizeq, 3

    add dstq,           linesizeq
    add uq,             linesizeq
    add rhsq,           linesizeq
    add diff_coeffs00q, linesizeq
    add diff_coeffs01q, linesizeq
    add diff_coeffs10q, linesizeq
    add diff_coeffs11q, linesizeq
    add diff_coeffs02q, linesizeq
    add diff_coeffs20q, linesizeq

    neg linesizeq
    ; from now on, the register that held linesize is used as the offset into data arrays
    %define offsetq linesizeq

.loop:
    xorpd m0, m0
    subpd m0, [rhsq + offsetq]                      ; res = -rhs

    ; plain value
    movu        m6, [uq + offsetq]                  ; m6 = u[x]
    vfmadd231pd m0, m6, [diff_coeffs00q + offsetq]  ; res += u * diff_coeffs00

%if %2
    mulpd m3, m6, m2
%endif

    RES_ADD_DIFF_X_3
    RES_ADD_DIFF_Y_3
    RES_ADD_DIFF_MIXED_3

    andpd m6, m0, m13                               ; m6 = abs(res)
    mulpd m0, m1
%if %2
    addpd m0, m3
%endif

    ; store the result
    add offsetq, mmsize
    jg .store_partial

    ; store full block
    movu [dstq + offsetq - mmsize], m0
    maxpd m12, m6

    js .loop
    jmp .finish

.store_partial:
    sub offsetq, ELEM_SIZE
    jz .store3
    sub offsetq, ELEM_SIZE
    jz .store2

.store1:
    ; offsetq is now mmsize-2 after the write position
    movq [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0
    vpermq m6, m6, 0
    maxpd m12, m6
    jmp .finish

.store2:
    ; offsetq is now mmsize-2 after the write position
    movu [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0
    mova xm6, xm6
    maxpd m12, m6
    jmp .finish

.store3:
    ; offsetq is now mmsize-1 after the write position
    movu [dstq + offsetq - mmsize + 1 * ELEM_SIZE], xm0
    vextractf128 xm0, m0, 1
    movq [dstq + offsetq - mmsize + 3 * ELEM_SIZE], xm0

    vpermq m6, m6, 10100100b
    maxpd m12, m6

.finish:
    movu [res_maxq], m12
    RET
%endmacro

INIT_YMM fma3
cglobal residual_calc_line_s3, 7, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
                               diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_down
    RESIDUAL_CALC_3 3, 0

cglobal residual_add_line_s3, 7, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
                              diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_down
    RESIDUAL_CALC_3 3, 1