From 32cd1b5680db2122eb3321ef967dc3cbe2c7109d Mon Sep 17 00:00:00 2001
From: Anton Khirnov
Date: Thu, 27 Dec 2018 11:59:10 +0100
Subject: ell_relax: add AVX SIMD for residual_calc

---
 residual_calc.asm | 269 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 269 insertions(+)
 create mode 100644 residual_calc.asm

diff --git a/residual_calc.asm b/residual_calc.asm
new file mode 100644
index 0000000..f4b11b6
--- /dev/null
+++ b/residual_calc.asm
@@ -0,0 +1,269 @@
+;
+; SIMD for computing the residual
+; Copyright 2018 Anton Khirnov
+;
+; This program is free software: you can redistribute it and/or modify
+; it under the terms of the GNU General Public License as published by
+; the Free Software Foundation, either version 3 of the License, or
+; (at your option) any later version.
+;
+; This program is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; GNU General Public License for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with this program. If not, see <http://www.gnu.org/licenses/>.
+;/
+
+%include "config.asm"
+%include "x86inc.asm"
+
+SECTION .rodata
+
+const8:  times 8 dq 8.0
+const16: times 8 dq 16.0
+const30: times 8 dq 30.0
+const64: times 8 dq 64.0
+
+SECTION .text
+
+%define OFF_DIFF_COEFF_00 0 * 8
+%define OFF_DIFF_COEFF_01 1 * 8
+%define OFF_DIFF_COEFF_10 2 * 8
+%define OFF_DIFF_COEFF_11 3 * 8
+%define OFF_DIFF_COEFF_02 4 * 8
+%define OFF_DIFF_COEFF_20 5 * 8
+
+INIT_YMM fma3
+cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_coeffs, fd_factors,\
+                               diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up
+    %define u_downq fd_factorsq ; reuse the fd_factors register after it is no longer needed
+
+    ; load pointers to the equation coefficients
+    %define diff_coeffs20q diff_coeffsq ; reuse the array register to store the last pointer
+    mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00]
+    mov diff_coeffs01q, [diff_coeffsq + OFF_DIFF_COEFF_01]
+    mov diff_coeffs10q, [diff_coeffsq + OFF_DIFF_COEFF_10]
+    mov diff_coeffs11q, [diff_coeffsq + OFF_DIFF_COEFF_11]
+    mov diff_coeffs02q, [diff_coeffsq + OFF_DIFF_COEFF_02]
+    mov diff_coeffs20q, [diff_coeffsq + OFF_DIFF_COEFF_20]
+
+    ; set up the data pointers and the loop counter
+    shl strideq, 3
+    shl linesizeq, 3
+    add dstq, linesizeq
+    add uq, linesizeq
+    add rhsq, linesizeq
+    add diff_coeffs00q, linesizeq
+    add diff_coeffs01q, linesizeq
+    add diff_coeffs10q, linesizeq
+    add diff_coeffs11q, linesizeq
+    add diff_coeffs02q, linesizeq
+    add diff_coeffs20q, linesizeq
+    neg linesizeq
+    ; from now on, the register that held linesize is used as the offset into the data arrays
+    %define offsetq linesizeq
+
+    ; load and splat the finite difference factors
+    movu m0, [fd_factorsq + OFF_DIFF_COEFF_01]
+    vpermq m7, m0, 00000000b ; diff factor 01 -> m7
+    vpermq m8, m0, 01010101b ; diff factor 10 -> m8
+    vpermq m9, m0, 10101010b ; diff factor 11 -> m9
+    vpermq m10, m0, 11111111b ; diff factor 02 -> m10
+    movq xm0, [fd_factorsq + OFF_DIFF_COEFF_20]
+    vpermq m11, m0, 00000000b ; diff factor 20 -> m11
+
+    ; set up pointers to the lines above and below
+    lea u_upq, [uq + strideq]
+    mov u_downq, uq
+    sub u_downq, strideq
+
+.loop:
+    xorpd m0, m0
+    subpd m0, [rhsq + offsetq] ; res = -rhs
+
+    ; plain value
+    movu m1, [uq + offsetq]
+    vfmadd231pd m0, m1, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00
+
+    ; dx, d2x
+    movu m2, [uq + offsetq + 8]
+    movu m3, [uq + offsetq - 8]
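+
+    ; annotation (added in editing, not part of the original commit): m2/m3
+    ; hold u[x+1]/u[x-1]; the code below forms the standard 2nd-order central
+    ; differences d_x u ~ (u[x+1] - u[x-1]) / 2h and
+    ; d_xx u ~ (u[x+1] + u[x-1] - 2 u) / h^2, with the grid-spacing factors
+    ; presumably folded into the fd_factors splatted above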
+
+    mulpd m4, m8, [diff_coeffs10q + offsetq]
+    mulpd m5, m11, [diff_coeffs20q + offsetq]
+
+    subpd m6, m2, m3
+    vfmadd231pd m0, m4, m6 ; res += d_x u * diff_coeffs10
+
+    addpd m1, m1 ; m1 = 2 u
+
+    addpd m6, m2, m3
+    subpd m6, m1
+    vfmadd231pd m0, m5, m6 ; res += d_xx u * diff_coeffs20
+
+    ; dy, d2y
+    movu m2, [u_upq + offsetq]
+    movu m3, [u_downq + offsetq]
+
+    mulpd m4, m7, [diff_coeffs01q + offsetq]
+    mulpd m5, m10, [diff_coeffs02q + offsetq]
+
+    subpd m6, m2, m3
+    vfmadd231pd m0, m4, m6 ; res += d_y u * diff_coeffs01
+
+    addpd m6, m2, m3
+    subpd m6, m1
+    vfmadd231pd m0, m5, m6 ; res += d_yy u * diff_coeffs02
+
+    ; mixed d2xy
+    movu m1, [u_upq + offsetq + 8]
+    subpd m1, [u_upq + offsetq - 8]
+    subpd m1, [u_downq + offsetq + 8]
+    addpd m1, [u_downq + offsetq - 8]
+
+    mulpd m2, m9, [diff_coeffs11q + offsetq]
+    vfmadd231pd m0, m1, m2 ; res += d_xy u * diff_coeffs11
+
+    ; store the result
+    movu [dstq + offsetq], m0
+    add offsetq, mmsize
+    js .loop
+
+    RET
+
+INIT_YMM fma3
+cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, stride, u, rhs, diff_coeffs, fd_factors,\
+                               diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_up2, u_down
+    %define u_down2q fd_factorsq ; reuse the fd_factors register after it is no longer needed
+
+    ; load pointers to the equation coefficients
+    %define diff_coeffs20q diff_coeffsq ; reuse the array register to store the last pointer
+    mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00]
+    mov diff_coeffs01q, [diff_coeffsq + OFF_DIFF_COEFF_01]
+    mov diff_coeffs10q, [diff_coeffsq + OFF_DIFF_COEFF_10]
+    mov diff_coeffs11q, [diff_coeffsq + OFF_DIFF_COEFF_11]
+    mov diff_coeffs02q, [diff_coeffsq + OFF_DIFF_COEFF_02]
+    mov diff_coeffs20q, [diff_coeffsq + OFF_DIFF_COEFF_20]
+
+    ; set up the data pointers and the loop counter
+    shl strideq, 3
+    shl linesizeq, 3
+    add dstq, linesizeq
+    add uq, linesizeq
+    add rhsq, linesizeq
+    add diff_coeffs00q, linesizeq
+    add diff_coeffs01q, linesizeq
+    add diff_coeffs10q, linesizeq
+    add diff_coeffs11q, linesizeq
+    add diff_coeffs02q, linesizeq
+    add diff_coeffs20q, linesizeq
+    neg linesizeq
+    ; from now on, the register that held linesize is used as the offset into the data arrays
+    %define offsetq linesizeq
+
+    ; load and splat the finite difference factors
+    movu m0, [fd_factorsq + OFF_DIFF_COEFF_01]
+    vpermq m1, m0, 00000000b ; diff factor 01 -> m1
+    vpermq m2, m0, 01010101b ; diff factor 10 -> m2
+    vpermq m3, m0, 10101010b ; diff factor 11 -> m3
+    vpermq m4, m0, 11111111b ; diff factor 02 -> m4
+    movq xm0, [fd_factorsq + OFF_DIFF_COEFF_20]
+    vpermq m5, m0, 00000000b ; diff factor 20 -> m5
+
+    movu m15, [const30]
+    movu m14, [const8]
+    movu m13, [const16]
+    movu m12, [const64]
+
+    ; set up pointers to the lines above and below
+    lea u_upq, [uq + strideq]
+    lea u_up2q, [uq + 2 * strideq]
+
+    mov u_downq, uq
+    sub u_downq, strideq
+    mov u_down2q, u_downq
+    sub u_down2q, strideq
+
+.loop:
+    xorpd m0, m0
+    subpd m0, [rhsq + offsetq] ; res = -rhs
+
+    ; plain value
+    movu m6, [uq + offsetq]
+    vfmadd231pd m0, m6, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00
+
+    ; dx, d2x
+    movu m7, [uq + offsetq + 8] ; m7 = u[x+1]
+    movu m8, [uq + offsetq + 8 * 2] ; m8 = u[x+2]
+    movu m9, [uq + offsetq - 8] ; m9 = u[x-1]
+    movu m10, [uq + offsetq - 8 * 2] ; m10 = u[x-2]
+
+    mulpd m11, m14, m7 ; m11 = 8 u[x+1]
+    vfnmadd231pd m11, m14, m9 ; m11 -= 8 u[x-1]
+    subpd m11, m8 ; m11 -= u[x+2]
+    addpd m11, m10 ; m11 += u[x-2]
+
+    mulpd m11, m2
+    vfmadd231pd m0, m11, [diff_coeffs10q + offsetq] ; res += d_x u * diff_coeffs10
+
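+    ; annotation (added in editing, not part of the original commit): the
+    ; block above computed the standard 4th-order first derivative
+    ;   d_x u ~ (-u[x+2] + 8 u[x+1] - 8 u[x-1] + u[x-2]) / 12h
+    ; and the block below computes the matching 4th-order second derivative
+    ;   d_xx u ~ (-u[x+2] + 16 u[x+1] - 30 u + 16 u[x-1] - u[x-2]) / 12h^2
+    ; with the 1/12h and 1/12h^2 factors presumably folded into fd_factors
+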
+    mulpd m11, m13, m7 ; m11 = 16 u[x+1]
+    vfmadd231pd m11, m13, m9 ; m11 += 16 u[x-1]
+    subpd m11, m8 ; m11 -= u[x+2]
+    subpd m11, m10 ; m11 -= u[x-2]
+    vfnmadd231pd m11, m15, m6 ; m11 -= 30 u
+
+    mulpd m11, m5
+    vfmadd231pd m0, m11, [diff_coeffs20q + offsetq] ; res += d_xx u * diff_coeffs20
+
+    ; dy, d2y
+    movu m7, [u_upq + offsetq] ; m7 = u[y+1]
+    movu m8, [u_up2q + offsetq] ; m8 = u[y+2]
+    movu m9, [u_downq + offsetq] ; m9 = u[y-1]
+    movu m10, [u_down2q + offsetq] ; m10 = u[y-2]
+
+    mulpd m11, m14, m7 ; m11 = 8 u[y+1]
+    vfnmadd231pd m11, m14, m9 ; m11 -= 8 u[y-1]
+    subpd m11, m8 ; m11 -= u[y+2]
+    addpd m11, m10 ; m11 += u[y-2]
+
+    mulpd m11, m1
+    vfmadd231pd m0, m11, [diff_coeffs01q + offsetq] ; res += d_y u * diff_coeffs01
+
+    mulpd m11, m13, m7 ; m11 = 16 u[y+1]
+    vfmadd231pd m11, m13, m9 ; m11 += 16 u[y-1]
+    subpd m11, m8 ; m11 -= u[y+2]
+    subpd m11, m10 ; m11 -= u[y-2]
+    vfnmadd231pd m11, m15, m6 ; m11 -= 30 u
+
+    mulpd m11, m4
+    vfmadd231pd m0, m11, [diff_coeffs02q + offsetq] ; res += d_yy u * diff_coeffs02
+
+    ; mixed d2xy -- 4th-order cross stencil, the outer product of the two
+    ; 1D first-derivative stencils (coefficients -1, +8, -8, +1 at offsets +2, +1, -1, -2)
+    movu m6, [u_up2q + offsetq + 2 * 8] ; m6 = u[y+2, x+2]
+    vfnmadd231pd m6, m14, [u_up2q + offsetq + 1 * 8] ; m6 -= 8 u[y+2, x+1]
+    vfmadd231pd m6, m14, [u_up2q + offsetq - 1 * 8] ; m6 += 8 u[y+2, x-1]
+    subpd m6, [u_up2q + offsetq - 2 * 8] ; m6 -= u[y+2, x-2]
+
+    vfnmadd231pd m6, m14, [u_upq + offsetq + 2 * 8] ; m6 -= 8 u[y+1, x+2]
+    vfmadd231pd m6, m12, [u_upq + offsetq + 1 * 8] ; m6 += 64 u[y+1, x+1]
+    vfnmadd231pd m6, m12, [u_upq + offsetq - 1 * 8] ; m6 -= 64 u[y+1, x-1]
+    vfmadd231pd m6, m14, [u_upq + offsetq - 2 * 8] ; m6 += 8 u[y+1, x-2]
+
+    vfmadd231pd m6, m14, [u_downq + offsetq + 2 * 8] ; m6 += 8 u[y-1, x+2]
+    vfnmadd231pd m6, m12, [u_downq + offsetq + 1 * 8] ; m6 -= 64 u[y-1, x+1]
+    vfmadd231pd m6, m12, [u_downq + offsetq - 1 * 8] ; m6 += 64 u[y-1, x-1]
+    vfnmadd231pd m6, m14, [u_downq + offsetq - 2 * 8] ; m6 -= 8 u[y-1, x-2]
+
+    subpd m6, [u_down2q + offsetq + 2 * 8] ; m6 -= u[y-2, x+2]
+    vfmadd231pd m6, m14, [u_down2q + offsetq + 1 * 8] ; m6 += 8 u[y-2, x+1]
+    vfnmadd231pd m6, m14, [u_down2q + offsetq - 1 * 8] ; m6 -= 8 u[y-2, x-1]
+    addpd m6, [u_down2q + offsetq - 2 * 8] ; m6 += u[y-2, x-2]
+
+    mulpd m6, m3
+    vfmadd231pd m0, m6, [diff_coeffs11q + offsetq] ; res += d_xy u * diff_coeffs11
+
+    ; store the result
+    movu [dstq + offsetq], m0
+    add offsetq, mmsize
+    js .loop
+
+    RET
--
cgit v1.2.3