about | summary | refs | log | tree | commit | diff
path: root/residual_calc.asm
diff options
context:
space:
mode:
author: Anton Khirnov <anton@khirnov.net> 2019-01-13 14:49:57 +0100
committer: Anton Khirnov <anton@khirnov.net> 2019-01-13 14:49:57 +0100
commit: 2f457508915cdf61d2220b9db30d4aaecd7e07b7 (patch)
tree: effe7c2b22f3a276093cf3ad951d6ac7e41a0977 /residual_calc.asm
parent: d0bce68cfe7f45fc417fb197772c03ebba4af902 (diff)
ell_relax: compute the residual norm in residual_calc()
It is cheap and avoids an extra step in mg2d.
Diffstat (limited to 'residual_calc.asm')
-rw-r--r-- residual_calc.asm 52
1 files changed, 37 insertions, 15 deletions
diff --git a/residual_calc.asm b/residual_calc.asm
index 47dda9b..95eb226 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -41,6 +41,8 @@ SECTION .text
; m0: accumulator for the residual
; m1-m5: splatted constant finite difference coefficients
; m6-m11: working registers
+; m12: max(fabs(residual))
+; m13: mask for computing absolute values
; (s2 only) m14-m15: splatted constants 8.0, 30.0
; calculate and add residual contributions from first and second derivatives
@@ -140,7 +142,22 @@ SECTION .text
; %1: stencil
%macro RESIDUAL_CALC 1
%define stencil %1
- %define u_downq fd_factorsq ; reuse the fd_factors registers after it is no longer needed
+
+ ; load and splat the finite difference factors
+ movu m0, [fd_factorsq + OFF_DIFF_COEFF_01]
+ vpermq m1, m0, 00000000b ; diff factor 01 -> m1
+ vpermq m2, m0, 01010101b ; diff factor 10 -> m2
+ vpermq m3, m0, 10101010b ; diff factor 11 -> m3
+ vpermq m4, m0, 11111111b ; diff factor 02 -> m4
+ movq xm0, [fd_factorsq + OFF_DIFF_COEFF_20]
+ vpermq m5, m0, 00000000b ; diff factor 20 -> m5
+ %define u_downq fd_factorsq ; reuse the fd_factors register after it is no longer needed
+
+ ; compute the mask for absolute value
+ pcmpeqq m13, m13
+ psrlq m13, 1
+ movu m12, [res_maxq]
+
; load pointers to the equation coefficients
%define diff_coeffs20q diff_coeffsq ; reuse the array register to store the last pointer
mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00]
@@ -166,23 +183,15 @@ SECTION .text
; from now on, the register that held linesize is used as the offset into data arrays
%define offsetq linesizeq
- ; load and splat the finite difference factors
- movu m0, [fd_factorsq + OFF_DIFF_COEFF_01]
- vpermq m1, m0, 00000000b ; diff factor 01 -> m1
- vpermq m2, m0, 01010101b ; diff factor 10 -> m2
- vpermq m3, m0, 10101010b ; diff factor 11 -> m3
- vpermq m4, m0, 11111111b ; diff factor 02 -> m4
- movq xm0, [fd_factorsq + OFF_DIFF_COEFF_20]
- vpermq m5, m0, 00000000b ; diff factor 20 -> m5
-
; setup pointers to the line above and below
lea u_upq, [uq + strideq]
mov u_downq, uq
sub u_downq, strideq
%if stencil == 2
lea u_up2q, [uq + 2 * strideq]
- mov u_down2q, u_downq
- sub u_down2q, strideq
+ neg strideq
+ add strideq, u_downq
+ %define u_down2q strideq ; reuse the stride register for the u[y-2] line
movu m15, [const30]
movu m14, [const8]
@@ -206,12 +215,15 @@ SECTION .text
RES_ADD_DIFF_SINGLEDIR stencil, 1
RES_ADD_DIFF_MIXED stencil
+ andpd m6, m0, m13 ; m6 = abs(res)
+
; store the result
add offsetq, mmsize
jg .store_partial
; store full block
movu [dstq + offsetq - mmsize], m0
+ maxpd m12, m6
js .loop
jmp .finish
@@ -224,10 +236,16 @@ SECTION .text
.store1:
; offsetq is now mmsize-2 after the write position
movq [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0
+
+ vpermq m6, m6, 0
+ maxpd m12, m6
+
jmp .finish
.store2:
; offsetq is now mmsize-2 after the write position
movu [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0
+ mova xm6, xm6
+ maxpd m12, m6
jmp .finish
.store3:
; offsetq is now mmsize-1 after the write position
@@ -235,16 +253,20 @@ SECTION .text
vextractf128 xm0, m0, 1
movq [dstq + offsetq - mmsize + 3 * ELEM_SIZE], xm0
+ vpermq m6, m6, 10100100b
+ maxpd m12, m6
+
.finish:
+ movu [res_maxq], m12
RET
%endmacro
INIT_YMM fma3
-cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_coeffs, fd_factors,\
+cglobal residual_calc_line_s1, 8, 14, 14, linesize, dst, res_max, stride, u, rhs, diff_coeffs, fd_factors,\
diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up
RESIDUAL_CALC 1
INIT_YMM fma3
-cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, stride, u, rhs, diff_coeffs, fd_factors,\
- diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_up2, u_down2
+cglobal residual_calc_line_s2, 8, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs, fd_factors,\
+ diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_up2
RESIDUAL_CALC 2