author    Anton Khirnov <anton@khirnov.net>    2018-12-27 11:59:10 +0100
committer Anton Khirnov <anton@khirnov.net>    2018-12-27 11:59:10 +0100
commit    32cd1b5680db2122eb3321ef967dc3cbe2c7109d (patch)
tree      1243bc1ef367bac90b613da5cda42c178ed88b25 /residual_calc.asm
parent    0e0ead5b0b45078c0009a0ce23ce4bd88a49bb13 (diff)
ell_relax: add AVX SIMD for residual_calc
Diffstat (limited to 'residual_calc.asm')
-rw-r--r--    residual_calc.asm    269
1 file changed, 269 insertions(+), 0 deletions(-)
diff --git a/residual_calc.asm b/residual_calc.asm
new file mode 100644
index 0000000..f4b11b6
--- /dev/null
+++ b/residual_calc.asm
@@ -0,0 +1,269 @@
+;
+; SIMD for computing the residual
+; Copyright 2018 Anton Khirnov <anton@khirnov.net>
+;
+; This program is free software: you can redistribute it and/or modify
+; it under the terms of the GNU General Public License as published by
+; the Free Software Foundation, either version 3 of the License, or
+; (at your option) any later version.
+;
+; This program is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; GNU General Public License for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with this program. If not, see <http://www.gnu.org/licenses/>.
+;/
+
+%include "config.asm"
+%include "x86inc.asm"
+
+SECTION .rodata
+
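+; magnitudes of the 4th-order stencil coefficients used by the s2 kernel
+; below; a 256-bit load only consumes 4 of the 8 qwords, the remainder is
+; presumably headroom for wider vectors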
+const8: times 8 dq 8.0
+const16: times 8 dq 16.0
+const30: times 8 dq 30.0
+const64: times 8 dq 64.0
+
+SECTION .text
+%define OFF_DIFF_COEFF_00 0 * 8
+%define OFF_DIFF_COEFF_01 1 * 8
+%define OFF_DIFF_COEFF_10 2 * 8
+%define OFF_DIFF_COEFF_11 3 * 8
+%define OFF_DIFF_COEFF_02 4 * 8
+%define OFF_DIFF_COEFF_20 5 * 8
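+; the diff_coeffs argument is an array of 6 pointers, one per term of the
+; equation -- the coefficients of u, d_y u, d_x u, d_xy u, d_yy u and d_xx u
+; at each grid point, stored at the qword offsets defined above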
+
+INIT_YMM fma3
+cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_coeffs, fd_factors,\
+ diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up
+    %define u_downq fd_factorsq ; reuse the fd_factors register after it is no longer needed
+ ; load pointers to the equation coefficients
+ %define diff_coeffs20q diff_coeffsq ; reuse the array register to store the last pointer
+ mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00]
+ mov diff_coeffs01q, [diff_coeffsq + OFF_DIFF_COEFF_01]
+ mov diff_coeffs10q, [diff_coeffsq + OFF_DIFF_COEFF_10]
+ mov diff_coeffs11q, [diff_coeffsq + OFF_DIFF_COEFF_11]
+ mov diff_coeffs02q, [diff_coeffsq + OFF_DIFF_COEFF_02]
+ mov diff_coeffs20q, [diff_coeffsq + OFF_DIFF_COEFF_20]
+
+    ; set up the data pointers and the loop counter
+ shl strideq, 3
+ shl linesizeq, 3
+ add dstq, linesizeq
+ add uq, linesizeq
+ add rhsq, linesizeq
+ add diff_coeffs00q, linesizeq
+ add diff_coeffs01q, linesizeq
+ add diff_coeffs10q, linesizeq
+ add diff_coeffs11q, linesizeq
+ add diff_coeffs02q, linesizeq
+ add diff_coeffs20q, linesizeq
+ neg linesizeq
+ ; from now on, the register that held linesize is used as the offset into data arrays
+ %define offsetq linesizeq
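+    ; the pointers above were advanced to the end of the line, so offsetq
+    ; runs from -8 * linesize up to 0 and the sign flag set by the add at
+    ; the end of the loop terminates it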
+
+ ; load and splat the finite difference factors
+ movu m0, [fd_factorsq + OFF_DIFF_COEFF_01]
+ vpermq m7, m0, 00000000b ; diff factor 01 -> m7
+ vpermq m8, m0, 01010101b ; diff factor 10 -> m8
+ vpermq m9, m0, 10101010b ; diff factor 11 -> m9
+ vpermq m10, m0, 11111111b ; diff factor 02 -> m10
+ movq xm0, [fd_factorsq + OFF_DIFF_COEFF_20]
+ vpermq m11, m0, 00000000b ; diff factor 20 -> m11
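+    ; the fd factors are one scalar per derivative; presumably the grid
+    ; spacing terms (1/2h, 1/h^2, ...) computed by the caller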
+
+    ; set up pointers to the lines above and below
+ lea u_upq, [uq + strideq]
+ mov u_downq, uq
+ sub u_downq, strideq
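+
+    ; each iteration evaluates, per grid point,
+    ;   res = -rhs + c00 u + c01 d_y u + c10 d_x u
+    ;              + c11 d_xy u + c02 d_yy u + c20 d_xx u
+    ; with the derivatives approximated by 2nd-order central differences:
+    ;   d_x  u ~ fd10 * (u[x+1] - u[x-1])
+    ;   d_xx u ~ fd20 * (u[x+1] - 2 u[x] + u[x-1])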
+
+.loop:
+ xorpd m0, m0
+ subpd m0, [rhsq + offsetq] ; res = -rhs
+
+ ; plain value
+ movu m1, [uq + offsetq]
+ vfmadd231pd m0, m1, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00
+
+ ; dx, d2x
+ movu m2, [uq + offsetq + 8]
+ movu m3, [uq + offsetq - 8]
+
+ mulpd m4, m8, [diff_coeffs10q + offsetq]
+ mulpd m5, m11, [diff_coeffs20q + offsetq]
+
+ subpd m6, m2, m3
+ vfmadd231pd m0, m4, m6 ; res += d_x u * diff_coeffs10
+
+    addpd m1, m1 ; m1 = 2 u
+
+ addpd m6, m2, m3
+ subpd m6, m1
+ vfmadd231pd m0, m5, m6 ; res += d_xx u * diff_coeffs20
+
+ ; dy, d2y
+ movu m2, [u_upq + offsetq]
+ movu m3, [u_downq + offsetq]
+
+ mulpd m4, m7, [diff_coeffs01q + offsetq]
+ mulpd m5, m10, [diff_coeffs02q + offsetq]
+
+ subpd m6, m2, m3
+ vfmadd231pd m0, m4, m6 ; res += d_y u * diff_coeffs01
+
+ addpd m6, m2, m3
+ subpd m6, m1
+ vfmadd231pd m0, m5, m6 ; res += d_yy u * diff_coeffs02
+
+ ; mixed d2xy
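+    ;   d_xy u ~ fd11 * (u[y+1,x+1] - u[y+1,x-1] - u[y-1,x+1] + u[y-1,x-1])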
+ movu m1, [u_upq + offsetq + 8]
+ subpd m1, [u_upq + offsetq - 8]
+ subpd m1, [u_downq + offsetq + 8]
+ addpd m1, [u_downq + offsetq - 8]
+
+ mulpd m2, m9, [diff_coeffs11q + offsetq]
+ vfmadd231pd m0, m1, m2 ; res += d_xy u * diff_coeffs11
+
+ ; store the result
+ movu [dstq + offsetq], m0
+ add offsetq, mmsize
+ js .loop
+
+ RET
+
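+; same calculation as above, but with 4th-order (radius-2) stencils; the
+; s1/s2 suffixes presumably refer to the stencil radius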
+INIT_YMM fma3
+cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, stride, u, rhs, diff_coeffs, fd_factors,\
+ diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_up2, u_down
+    %define u_down2q fd_factorsq ; reuse the fd_factors register after it is no longer needed
+ ; load pointers to the equation coefficients
+ %define diff_coeffs20q diff_coeffsq ; reuse the array register to store the last pointer
+ mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00]
+ mov diff_coeffs01q, [diff_coeffsq + OFF_DIFF_COEFF_01]
+ mov diff_coeffs10q, [diff_coeffsq + OFF_DIFF_COEFF_10]
+ mov diff_coeffs11q, [diff_coeffsq + OFF_DIFF_COEFF_11]
+ mov diff_coeffs02q, [diff_coeffsq + OFF_DIFF_COEFF_02]
+ mov diff_coeffs20q, [diff_coeffsq + OFF_DIFF_COEFF_20]
+
+    ; set up the data pointers and the loop counter
+ shl strideq, 3
+ shl linesizeq, 3
+ add dstq, linesizeq
+ add uq, linesizeq
+ add rhsq, linesizeq
+ add diff_coeffs00q, linesizeq
+ add diff_coeffs01q, linesizeq
+ add diff_coeffs10q, linesizeq
+ add diff_coeffs11q, linesizeq
+ add diff_coeffs02q, linesizeq
+ add diff_coeffs20q, linesizeq
+ neg linesizeq
+ ; from now on, the register that held linesize is used as the offset into data arrays
+ %define offsetq linesizeq
+
+ ; load and splat the finite difference factors
+ movu m0, [fd_factorsq + OFF_DIFF_COEFF_01]
+ vpermq m1, m0, 00000000b ; diff factor 01 -> m1
+ vpermq m2, m0, 01010101b ; diff factor 10 -> m2
+ vpermq m3, m0, 10101010b ; diff factor 11 -> m3
+ vpermq m4, m0, 11111111b ; diff factor 02 -> m4
+ movq xm0, [fd_factorsq + OFF_DIFF_COEFF_20]
+ vpermq m5, m0, 00000000b ; diff factor 20 -> m5
+
+ movu m15, [const30]
+ movu m14, [const8]
+ movu m13, [const16]
+ movu m12, [const64]
+
+    ; set up pointers to the lines above and below
+ lea u_upq, [uq + strideq]
+ lea u_up2q, [uq + 2 * strideq]
+
+ mov u_downq, uq
+ sub u_downq, strideq
+ mov u_down2q, u_downq
+ sub u_down2q, strideq
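+
+    ; same residual as in s1, now with 4th-order central differences (the
+    ; 1/12h normalization is presumably folded into the fd factors):
+    ;   d_x  u ~ fd10 * (8 u[x+1] - 8 u[x-1] - u[x+2] + u[x-2])
+    ;   d_xx u ~ fd20 * (16 u[x+1] + 16 u[x-1] - u[x+2] - u[x-2] - 30 u[x])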
+
+.loop:
+ xorpd m0, m0
+ subpd m0, [rhsq + offsetq] ; res = -rhs
+
+ ; plain value
+ movu m6, [uq + offsetq]
+ vfmadd231pd m0, m6, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00
+
+ ; dx, d2x
+ movu m7, [uq + offsetq + 8] ; m7 = u[x+1]
+ movu m8, [uq + offsetq + 8 * 2] ; m8 = u[x+2]
+ movu m9, [uq + offsetq - 8] ; m9 = u[x-1]
+ movu m10, [uq + offsetq - 8 * 2] ; m10 = u[x-2]
+
+ mulpd m11, m14, m7 ; m11 = 8 u[x+1]
+ vfnmadd231pd m11, m14, m9 ; m11 -= 8 u[x-1]
+ subpd m11, m8 ; m11 -= u[x+2]
+ addpd m11, m10 ; m11 += u[x-2]
+
+    mulpd m11, m2 ; m11 *= diff factor 10
+ vfmadd231pd m0, m11, [diff_coeffs10q + offsetq] ; res += d_x u * diff_coeffs10
+
+ mulpd m11, m13, m7 ; m11 = 16 u[x+1]
+ vfmadd231pd m11, m13, m9 ; m11 += 16 u[x-1]
+ subpd m11, m8 ; m11 -= u[x+2]
+ subpd m11, m10 ; m11 -= u[x-2]
+ vfnmadd231pd m11, m15, m6 ; m11 -= 30 u
+
+ mulpd m11, m5
+ vfmadd231pd m0, m11, [diff_coeffs20q + offsetq] ; res += d_xx u * diff_coeffs20
+
+ ; dy, d2y
+ movu m7, [u_upq + offsetq] ; m7 = u[y+1]
+ movu m8, [u_up2q + offsetq] ; m8 = u[y+2]
+ movu m9, [u_downq + offsetq] ; m9 = u[y-1]
+ movu m10, [u_down2q + offsetq] ; m10 = u[y-2]
+
+ mulpd m11, m14, m7 ; m11 = 8 u[y+1]
+ vfnmadd231pd m11, m14, m9 ; m11 -= 8 u[y-1]
+ subpd m11, m8 ; m11 -= u[y+2]
+ addpd m11, m10 ; m11 += u[y-2]
+
+    mulpd m11, m1 ; m11 *= diff factor 01
+ vfmadd231pd m0, m11, [diff_coeffs01q + offsetq] ; res += d_y u * diff_coeffs01
+
+ mulpd m11, m13, m7 ; m11 = 16 u[y+1]
+ vfmadd231pd m11, m13, m9 ; m11 += 16 u[y-1]
+ subpd m11, m8 ; m11 -= u[y+2]
+ subpd m11, m10 ; m11 -= u[y-2]
+ vfnmadd231pd m11, m15, m6 ; m11 -= 30 u
+
+ mulpd m11, m4
+ vfmadd231pd m0, m11, [diff_coeffs02q + offsetq] ; res += d_yy u * diff_coeffs02
+
+ ; mixed d2xy
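+    ; the cross stencil is the tensor product of the two first-derivative
+    ; stencils: the coefficient at (y+i, x+j) is c_i * c_j,
+    ; with c(+2,+1,-1,-2) = (-1, 8, -8, 1)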
+ movu m6, [u_up2q + offsetq + 2 * 8] ; m6 = u[y+2, x+2]
+ vfnmadd231pd m6, m14, [u_up2q + offsetq + 1 * 8] ; m6 -= 8 u[y+2, x+1]
+ vfmadd231pd m6, m14, [u_up2q + offsetq - 1 * 8] ; m6 += 8 u[y+2, x-1]
+ subpd m6, [u_up2q + offsetq - 2 * 8] ; m6 -= u[y+2, x-2]
+
+ vfnmadd231pd m6, m14, [u_upq + offsetq + 2 * 8] ; m6 -= 8 u[y+1, x+2]
+ vfmadd231pd m6, m12, [u_upq + offsetq + 1 * 8] ; m6 += 64 u[y+1, x+1]
+ vfnmadd231pd m6, m12, [u_upq + offsetq - 1 * 8] ; m6 -= 64 u[y+1, x-1]
+ vfmadd231pd m6, m14, [u_upq + offsetq - 2 * 8] ; m6 += 8 u[y+1, x-2]
+
+ vfmadd231pd m6, m14, [u_downq + offsetq + 2 * 8] ; m6 += 8 u[y-1, x+2]
+ vfnmadd231pd m6, m12, [u_downq + offsetq + 1 * 8] ; m6 -= 64 u[y-1, x+1]
+ vfmadd231pd m6, m12, [u_downq + offsetq - 1 * 8] ; m6 += 64 u[y-1, x-1]
+ vfnmadd231pd m6, m14, [u_downq + offsetq - 2 * 8] ; m6 -= 8 u[y-1, x-2]
+
+ subpd m6, [u_down2q + offsetq + 2 * 8] ; m6 -= u[y-2, x+2]
+ vfmadd231pd m6, m14, [u_down2q + offsetq + 1 * 8] ; m6 += 8 u[y-2, x+1]
+    vfnmadd231pd m6, m14, [u_down2q + offsetq - 1 * 8] ; m6 -= 8 u[y-2, x-1]
+ addpd m6, [u_down2q + offsetq - 2 * 8] ; m6 += u[y-2, x-2]
+
+ mulpd m6, m3
+ vfmadd231pd m0, m6, [diff_coeffs11q + offsetq] ; res += d_xy u * diff_coeffs11
+
+ ; store the result
+ movu [dstq + offsetq], m0
+ add offsetq, mmsize
+ js .loop
+
+ RET