From 2f457508915cdf61d2220b9db30d4aaecd7e07b7 Mon Sep 17 00:00:00 2001
From: Anton Khirnov <anton@khirnov.net>
Date: Sun, 13 Jan 2019 14:49:57 +0100
Subject: ell_relax: compute the residual norm in residual_calc()

Tracking the maximum absolute value of the residual while the residual itself is being calculated is cheap, and it avoids an extra step in mg2d.
---
 residual_calc.asm | 52 +++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 37 insertions(+), 15 deletions(-)

(limited to 'residual_calc.asm')

diff --git a/residual_calc.asm b/residual_calc.asm
index 47dda9b..95eb226 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -41,6 +41,8 @@ SECTION .text
 ; m0: accumulator for the residual
 ; m1-m5: splatted constant finite difference coefficients
 ; m6-m11: working registers
+; m12: max(fabs(residual))
+; m13: mask for computing absolute values
 ; (s2 only) m14-m15: splatted constants 8.0, 30.0
 
 ; calculate and add residual contributions from first and second derivatives
@@ -140,7 +142,22 @@ SECTION .text
 ; %1: stencil
 %macro RESIDUAL_CALC 1
     %define stencil %1
-    %define u_downq fd_factorsq    ; reuse the fd_factors registers after it is no longer needed
+
+    ; load and splat the finite difference factors
+    movu m0, [fd_factorsq + OFF_DIFF_COEFF_01]
+    vpermq  m1, m0, 00000000b                           ; diff factor 01 -> m1
+    vpermq  m2, m0, 01010101b                           ; diff factor 10 -> m2
+    vpermq  m3, m0, 10101010b                           ; diff factor 11 -> m3
+    vpermq  m4, m0, 11111111b                           ; diff factor 02 -> m4
+    movq   xm0, [fd_factorsq + OFF_DIFF_COEFF_20]
+    vpermq  m5, m0, 00000000b                           ; diff factor 20 -> m5
+    %define u_downq fd_factorsq    ; reuse the fd_factors register after it is no longer needed
+
+    ; compute the mask for absolute value
+    pcmpeqq m13, m13
+    psrlq   m13, 1
+    movu    m12, [res_maxq]
+
     ; load pointers to the equation coefficients
     %define diff_coeffs20q diff_coeffsq ; reuse the array register to store the last pointer
     mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00]
@@ -166,23 +183,15 @@ SECTION .text
     ; from now on, the register that held linesize is used as the offset into data arrays
     %define offsetq linesizeq
 
-    ; load and splat the finite difference factors
-    movu m0, [fd_factorsq + OFF_DIFF_COEFF_01]
-    vpermq  m1, m0, 00000000b                           ; diff factor 01 -> m1
-    vpermq  m2, m0, 01010101b                           ; diff factor 10 -> m2
-    vpermq  m3, m0, 10101010b                           ; diff factor 11 -> m3
-    vpermq  m4, m0, 11111111b                           ; diff factor 02 -> m4
-    movq   xm0, [fd_factorsq + OFF_DIFF_COEFF_20]
-    vpermq  m5, m0, 00000000b                           ; diff factor 20 -> m5
-
     ; setup pointers to the line above and below
     lea u_upq,   [uq + strideq]
     mov u_downq, uq
     sub u_downq, strideq
     %if stencil == 2
         lea u_up2q,   [uq + 2 * strideq]
-        mov u_down2q, u_downq
-        sub u_down2q, strideq
+        neg strideq
+        add strideq, u_downq
+        %define u_down2q strideq ; reuse the stride register for the u[y-2] line
 
         movu    m15, [const30]
         movu    m14, [const8]
@@ -206,12 +215,15 @@ SECTION .text
     RES_ADD_DIFF_SINGLEDIR stencil, 1
     RES_ADD_DIFF_MIXED     stencil
 
+    andpd m6, m0, m13                                   ; m6 = abs(res)
+
     ; store the result
     add offsetq, mmsize
     jg .store_partial
 
     ; store full block
     movu [dstq + offsetq - mmsize], m0
+    maxpd m12, m6
     js .loop
     jmp .finish
 
@@ -224,10 +236,16 @@ SECTION .text
 .store1:
     ; offsetq is now mmsize-2 after the write position
     movq [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0
+
+    vpermq m6, m6, 0
+    maxpd m12, m6
+
     jmp .finish
 .store2:
     ; offsetq is now mmsize-2 after the write position
     movu [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0
+    mova  xm6, xm6
+    maxpd m12, m6
     jmp .finish
 .store3:
     ; offsetq is now mmsize-1 after the write position
@@ -235,16 +253,20 @@ SECTION .text
     vextractf128 xm0, m0, 1
     movq [dstq + offsetq - mmsize + 3 * ELEM_SIZE], xm0
 
+    vpermq m6, m6, 10100100b
+    maxpd m12, m6
+
 .finish:
+    movu [res_maxq], m12
     RET
 %endmacro
 
 INIT_YMM fma3
-cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_coeffs, fd_factors,\
+cglobal residual_calc_line_s1, 8, 14, 14, linesize, dst, res_max, stride, u, rhs, diff_coeffs, fd_factors,\
                                           diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up
 RESIDUAL_CALC 1
 
 INIT_YMM fma3
-cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, stride, u, rhs, diff_coeffs, fd_factors,\
-                                          diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_up2, u_down2
+cglobal residual_calc_line_s2, 8, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs, fd_factors,\
+                                          diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_up2
 RESIDUAL_CALC 2
-- 
cgit v1.2.3