;
; SIMD for computing the residual
; Copyright 2018 Anton Khirnov
;
; This program is free software: you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation, either version 3 of the License, or
; (at your option) any later version.
;
; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this program. If not, see <http://www.gnu.org/licenses/>.
;

%include "config.asm"
%include "x86inc.asm"

; double precision
%define ELEM_SIZE 8

; offsets to FD coefficients for given derivative
%define OFF_DIFF_COEFF_00 0 * gprsize
%define OFF_DIFF_COEFF_01 1 * gprsize
%define OFF_DIFF_COEFF_10 2 * gprsize
%define OFF_DIFF_COEFF_11 3 * gprsize
%define OFF_DIFF_COEFF_02 4 * gprsize
%define OFF_DIFF_COEFF_20 5 * gprsize

SECTION .rodata

const5:    times 8 dq 5.0
const8:    times 8 dq 8.0
const9:    times 8 dq 9.0
const10:   times 8 dq 10.0
const13_5: times 8 dq 13.5
const30:   times 8 dq 30.0
const45:   times 8 dq 45.0
const245:  times 8 dq 245.0

SECTION .text

; mm register allocation (both s1 and s2)
; m0: accumulator for the residual
; m1: dst mult factor
; m2: res mult factor
; m6-m11: working registers
; m12: max(fabs(residual))
; m13: mask for computing absolute values
; (s2 only) m14-m15: splatted constants 8.0, 30.0

; calculate and add residual contributions from first and second derivatives
; along a single direction (x or y)
;
; parameters:
; %1: stencil
; %2: 0 -- x; 1 -- y
;
; register use (in addition to register allocation described above):
; m6: on entry contains u[x] multiplied by the corresponding FD coefficient, not
;     clobbered
; m7-m11: used for work (clobbered)
%macro RES_ADD_DIFF_SINGLEDIR 2
    %define stencil %1

%if %2 == 0
    %define up1q uq + ELEM_SIZE
    %define up2q uq + ELEM_SIZE * 2
    %define um1q uq - ELEM_SIZE
    %define um2q uq - ELEM_SIZE * 2
    %define coeffs1q diff_coeffs10q
    %define coeffs2q diff_coeffs20q
%else
    %define up1q u_upq
    %define up2q u_up2q
    %define um1q u_downq
    %define um2q u_down2q
    %define coeffs1q diff_coeffs01q
    %define coeffs2q diff_coeffs02q
%endif

    ; load the function values
    movu m7, [up1q + offsetq]                       ; m7 = u[x+1]
    movu m9, [um1q + offsetq]                       ; m9 = u[x-1]
%if stencil == 2
    movu m8,  [up2q + offsetq]                      ; m8  = u[x+2]
    movu m10, [um2q + offsetq]                      ; m10 = u[x-2]
%endif

    ; first derivative
    subpd m11, m7, m9                               ; m11 = u[x+1] - u[x-1]
%if stencil == 2
    mulpd m11, m14                                  ; m11 *= 8
    subpd m11, m8                                   ; m11 -= u[x+2]
    addpd m11, m10                                  ; m11 += u[x-2]
%endif
    vfmadd231pd m0, m11, [coeffs1q + offsetq]       ; res += d_x u * diff_coeffs10

    ; second derivative
    addpd m11, m7, m9                               ; m11 = u[x+1] + u[x-1]
%if stencil == 2
    mulpd m11, m14
    addpd m11, m11                                  ; m11 *= 16
    subpd m11, m8                                   ; m11 -= u[x+2]
    subpd m11, m10                                  ; m11 -= u[x-2]
%endif
    subpd m11, m6                                   ; m11 -= fd0 u[x]
    vfmadd231pd m0, m11, [coeffs2q + offsetq]       ; res += d_xx u * diff_coeffs20
%endmacro
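
; For reference, the expressions accumulated above are the standard central-difference
; numerators; any grid-spacing normalization (1/2h, 1/h^2, 1/12h, ...) is assumed to be
; folded into the diff_coeffs arrays by the caller:
;   stencil 1 (2nd order):  d_x  u ~ u[x+1] - u[x-1]
;                           d_xx u ~ u[x+1] + u[x-1] - 2 u[x]
;   stencil 2 (4th order):  d_x  u ~ 8 (u[x+1] - u[x-1]) - u[x+2] + u[x-2]
;                           d_xx u ~ 16 (u[x+1] + u[x-1]) - u[x+2] - u[x-2] - 30 u[x]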

; calculate and add residual contributions from the second mixed derivative
;
; parameters:
; %1: stencil
;
; register use (in addition to register allocation described above):
; m6, m7: used for work (clobbered)
%macro RES_ADD_DIFF_MIXED 1
    movu  m6, [u_upq   + 1 * ELEM_SIZE + offsetq]   ; m6 = u[y+1, x+1]
    subpd m6, [u_upq   - 1 * ELEM_SIZE + offsetq]   ;    - u[y+1, x-1]
    subpd m6, [u_downq + 1 * ELEM_SIZE + offsetq]   ;    - u[y-1, x+1]
    addpd m6, [u_downq - 1 * ELEM_SIZE + offsetq]   ;    + u[y-1, x-1]
%if %1 == 2
    movu  m7, [u_up2q   - 1 * ELEM_SIZE + offsetq]  ; m7 = u[y+2, x-1]
    subpd m7, [u_up2q   + 1 * ELEM_SIZE + offsetq]  ;    - u[y+2, x+1]
    subpd m7, [u_upq    + 2 * ELEM_SIZE + offsetq]  ;    - u[y+1, x+2]
    addpd m7, [u_upq    - 2 * ELEM_SIZE + offsetq]  ;    + u[y+1, x-2]
    addpd m7, [u_downq  + 2 * ELEM_SIZE + offsetq]  ;    + u[y-1, x+2]
    subpd m7, [u_downq  - 2 * ELEM_SIZE + offsetq]  ;    - u[y-1, x-2]
    addpd m7, [u_down2q + 1 * ELEM_SIZE + offsetq]  ;    + u[y-2, x+1]
    subpd m7, [u_down2q - 1 * ELEM_SIZE + offsetq]  ;    - u[y-2, x-1]

    vfmadd123pd m6, m14, m7                         ; m6 = 8 m6 + m7

    movu  m7, [u_up2q   + 2 * ELEM_SIZE + offsetq]  ; m7 = u[y+2, x+2]
    subpd m7, [u_up2q   - 2 * ELEM_SIZE + offsetq]  ;    - u[y+2, x-2]
    subpd m7, [u_down2q + 2 * ELEM_SIZE + offsetq]  ;    - u[y-2, x+2]
    addpd m7, [u_down2q - 2 * ELEM_SIZE + offsetq]  ;    + u[y-2, x-2]

    vfmadd123pd m6, m14, m7                         ; m6 = 8 m6 + m7
%endif
    vfmadd231pd m0, m6, [diff_coeffs11q + offsetq]  ; res += d_xy u * diff_coeffs11
%endmacro

; %1: stencil
; %2: 0 - calc; 1 - add
%macro RESIDUAL_CALC 2
    %define stencil %1

%if %2
    vpermq m2, m1, 0
%endif
    vpermq m1, m0, 0

    ; compute the mask for absolute value
    pcmpeqq m13, m13
    psrlq   m13, 1

    movu m12, [res_maxq]

    ; load pointers to the equation coefficients
    %define diff_coeffs20q diff_coeffsq             ; reuse the array register to store the last pointer
    mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00]
    mov diff_coeffs01q, [diff_coeffsq + OFF_DIFF_COEFF_01]
    mov diff_coeffs10q, [diff_coeffsq + OFF_DIFF_COEFF_10]
    mov diff_coeffs11q, [diff_coeffsq + OFF_DIFF_COEFF_11]
    mov diff_coeffs02q, [diff_coeffsq + OFF_DIFF_COEFF_02]
    mov diff_coeffs20q, [diff_coeffsq + OFF_DIFF_COEFF_20]

    ; setup the data pointers and the loop counter
    shl strideq,   3
    shl linesizeq, 3

    add dstq,           linesizeq
    add uq,             linesizeq
    add rhsq,           linesizeq
    add diff_coeffs00q, linesizeq
    add diff_coeffs01q, linesizeq
    add diff_coeffs10q, linesizeq
    add diff_coeffs11q, linesizeq
    add diff_coeffs02q, linesizeq
    add diff_coeffs20q, linesizeq

    neg linesizeq
    ; from now on, the register that held linesize is used as the offset into data arrays
    %define offsetq linesizeq

    ; setup pointers to the line above and below
    lea u_upq,   [uq + strideq]
    mov u_downq, uq
    sub u_downq, strideq

%if stencil == 2
    lea u_up2q, [uq + 2 * strideq]

    neg strideq
    add strideq, u_downq
    %define u_down2q strideq                        ; reuse the stride register for the u[y-2] line

    movu m15, [const30]
    movu m14, [const8]
%endif

.loop:
    xorpd m0, m0
    subpd m0, [rhsq + offsetq]                      ; res = -rhs

    ; plain value
    movu        m6, [uq + offsetq]                  ; m6 = u[x]
    vfmadd231pd m0, m6, [diff_coeffs00q + offsetq]  ; res += u * diff_coeffs00

%if %2
    mulpd m3, m6, m2
%endif

%if stencil == 1
    addpd m6, m6                                    ; m6 = 2 * u[x]
%else
    mulpd m6, m15                                   ; m6 = 30 * u[x]
%endif

    RES_ADD_DIFF_SINGLEDIR stencil, 0
    RES_ADD_DIFF_SINGLEDIR stencil, 1
    RES_ADD_DIFF_MIXED     stencil

    andpd m6, m0, m13                               ; m6 = abs(res)
    mulpd m0, m1
%if %2
    addpd m0, m3
%endif

    ; store the result
    add offsetq, mmsize
    jg .store_partial

    ; store full block
    movu [dstq + offsetq - mmsize], m0
    maxpd m12, m6

    js .loop
    jmp .finish

.store_partial:
    sub offsetq, ELEM_SIZE
    jz .store3
    sub offsetq, ELEM_SIZE
    jz .store2

.store1:
    ; offsetq is now mmsize-2 after the write position
    movq [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0
    vpermq m6, m6, 0
    maxpd m12, m6
    jmp .finish

.store2:
    ; offsetq is now mmsize-2 after the write position
    movu [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0
    mova xm6, xm6
    maxpd m12, m6
    jmp .finish

.store3:
    ; offsetq is now mmsize-1 after the write position
    movu [dstq + offsetq - mmsize + 1 * ELEM_SIZE], xm0
    vextractf128 xm0, m0, 1
    movq [dstq + offsetq - mmsize + 3 * ELEM_SIZE], xm0

    vpermq m6, m6, 10100100b
    maxpd m12, m6

.finish:
    movu [res_maxq], m12
    RET
%endmacro
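
; The entry points below instantiate the macro for the 2nd-order (s1) and 4th-order
; (s2) stencils, each in a "calc" and an "add" variant. As far as can be read off the
; code above, the per-point result is
;     calc:  dst[i] = f0 * res[i]
;     add:   dst[i] = f0 * res[i] + f1 * u[i]
; where f0/f1 are broadcast from the scalar factors found in m0/m1 on entry, and
; res[i] is the accumulated residual. res_max is treated as a full vector of
; mmsize / ELEM_SIZE doubles: it is loaded, merged per lane with max(|res|), and
; stored back, so the final horizontal reduction is assumed to happen in the caller.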

INIT_YMM fma3
cglobal residual_calc_line_s1, 7, 14, 14, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
                               diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up
    RESIDUAL_CALC 1, 0

cglobal residual_add_line_s1, 7, 14, 14, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
                              diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up
    RESIDUAL_CALC 1, 1

INIT_YMM fma3
cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
                               diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up, u_up2
    RESIDUAL_CALC 2, 0

cglobal residual_add_line_s2, 7, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
                              diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up, u_up2
    RESIDUAL_CALC 2, 1
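
; For the 6th-order stencil implemented below, the accumulated expressions are
; (again assuming the grid-spacing normalization lives in the diff_coeffs arrays):
;   d_x  u ~ 45 (u[x+1] - u[x-1]) - 9 (u[x+2] - u[x-2]) + u[x+3] - u[x-3]
;   d_xx u ~ 135 (u[x+1] + u[x-1]) - 13.5 (u[x+2] + u[x-2]) + u[x+3] + u[x-3] - 245 u[x]
;   d_xy u ~ the d_x expression evaluated on the rows y+/-1, y+/-2, y+/-3 and combined
;            with the same 45, -9, 1 weights (the y+0 row has zero weight)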

; compute the 6th-order first-derivative numerator of the line at %3
; %1: output; %2: clobbered temporary; %3: address of the center element
%macro DX3 3
    movu  %1, [%3 + ELEM_SIZE * 1]                  ; out = u[x+1]
    subpd %1, [%3 - ELEM_SIZE * 1]                  ;      -u[x-1]
    movu  %2, [%3 + ELEM_SIZE * 2]                  ; tmp = u[x+2]
    subpd %2, [%3 - ELEM_SIZE * 2]                  ;      -u[x-2]
    vfmsub132pd %1, %2, [const5]                    ; out = 5 * out - tmp
    movu  %2, [%3 + ELEM_SIZE * 3]                  ; tmp = u[x+3]
    subpd %2, [%3 - ELEM_SIZE * 3]                  ;      -u[x-3]
    vfmadd132pd %1, %2, [const9]                    ; out = 9 * out + tmp
%endmacro

; calculate and add residual contributions from the first and second derivatives
; along x (6th-order stencil)
%macro RES_ADD_DIFF_X_3 0
    ; first derivative
    DX3 m7, m8, uq + offsetq
    vfmadd231pd m0, m7, [diff_coeffs10q + offsetq]  ; res += d_x u * diff_coeffs10

    ; second derivative
    movu  m7, [uq + offsetq + ELEM_SIZE * 1]        ; m7 = u[x+1]
    addpd m7, [uq + offsetq - ELEM_SIZE * 1]        ;     +u[x-1]
    movu  m8, [uq + offsetq + ELEM_SIZE * 2]        ; m8 = u[x+2]
    addpd m8, [uq + offsetq - ELEM_SIZE * 2]        ;     +u[x-2]
    vfmsub132pd m7, m8, [const10]                   ; m7 = 10 * m7 - m8
    movu  m8, [uq + offsetq + ELEM_SIZE * 3]        ; m8 = u[x+3]
    addpd m8, [uq + offsetq - ELEM_SIZE * 3]        ;     +u[x-3]
    vfmadd132pd m7, m8, [const13_5]                 ; m7 = 13.5 * m7 + m8
    vfnmadd231pd m7, m6, [const245]                 ; m7 = m7 - 245 * u[0]
    vfmadd231pd m0, m7, [diff_coeffs20q + offsetq]  ; res += d_xx u * diff_coeffs20
%endmacro

; calculate and add residual contributions from the first and second derivatives
; along y (6th-order stencil)
%macro RES_ADD_DIFF_Y_3 0
    ; first derivative
    lea u_upq,   [uq + strideq]
    mov u_downq, uq
    sub u_downq, strideq

    movu  m7, [u_upq   + offsetq]                   ; m7 = u[y+1]
    subpd m7, [u_downq + offsetq]                   ;     -u[y-1]

    add u_upq,   strideq
    sub u_downq, strideq

    movu  m8, [u_upq   + offsetq]                   ; m8 = u[y+2]
    subpd m8, [u_downq + offsetq]                   ;     -u[y-2]

    vfmsub132pd m7, m8, [const5]                    ; m7 = 5 * m7 - m8

    add u_upq,   strideq
    sub u_downq, strideq

    movu  m8, [u_upq   + offsetq]                   ; m8 = u[y+3]
    subpd m8, [u_downq + offsetq]                   ;     -u[y-3]

    vfmadd132pd m7, m8, [const9]                    ; m7 = 9 * m7 + m8

    vfmadd231pd m0, m7, [diff_coeffs01q + offsetq]  ; res += d_y u * diff_coeffs01

    ; second derivative
    lea u_upq,   [uq + strideq]
    mov u_downq, uq
    sub u_downq, strideq

    movu  m7, [u_upq   + offsetq]                   ; m7 = u[y+1]
    addpd m7, [u_downq + offsetq]                   ;     +u[y-1]

    add u_upq,   strideq
    sub u_downq, strideq

    movu  m8, [u_upq   + offsetq]                   ; m8 = u[y+2]
    addpd m8, [u_downq + offsetq]                   ;     +u[y-2]

    vfmsub132pd m7, m8, [const10]                   ; m7 = 10 * m7 - m8

    add u_upq,   strideq
    sub u_downq, strideq

    movu  m8, [u_upq   + offsetq]                   ; m8 = u[y+3]
    addpd m8, [u_downq + offsetq]                   ;     +u[y-3]

    vfmadd132pd m7, m8, [const13_5]                 ; m7 = 13.5 * m7 + m8

    vfnmadd231pd m7, m6, [const245]                 ; m7 = m7 - 245 * u[0]
    vfmadd231pd m0, m7, [diff_coeffs02q + offsetq]  ; res += d_yy u * diff_coeffs02
%endmacro

; calculate and add the residual contribution from the mixed second derivative
; (6th-order stencil)
%macro RES_ADD_DIFF_MIXED_3 0
    lea u_upq, [uq + 2 * strideq]
    add u_upq, strideq

    DX3 m6, m7, u_upq + offsetq                     ; m6 = dx u[y + 3]

    sub u_upq, strideq
    DX3 m7, m8, u_upq + offsetq                     ; m7 = dx u[y + 2]
    vfnmadd231pd m6, m7, [const9]                   ; m6 = m6 - 9 * m7

    sub u_upq, strideq
    DX3 m7, m8, u_upq + offsetq                     ; m7 = dx u[y + 1]
    vfmadd231pd m6, m7, [const45]                   ; m6 = m6 + 45 * m7

    sub u_upq, strideq
    sub u_upq, strideq
    DX3 m7, m8, u_upq + offsetq                     ; m7 = dx u[y - 1]
    vfnmadd231pd m6, m7, [const45]                  ; m6 = m6 - 45 * m7

    sub u_upq, strideq
    DX3 m7, m8, u_upq + offsetq                     ; m7 = dx u[y - 2]
    vfmadd231pd m6, m7, [const9]                    ; m6 = m6 + 9 * m7

    sub u_upq, strideq
    DX3 m7, m8, u_upq + offsetq                     ; m7 = dx u[y - 3]
    subpd m6, m7                                    ; m6 -= m7

    vfmadd231pd m0, m6, [diff_coeffs11q + offsetq]  ; res += d_xy u * diff_coeffs11
%endmacro

; %1: stencil
; %2: 0 - calc; 1 - add
%macro RESIDUAL_CALC_3 2
%if %2
    vpermq m2, m1, 0
%endif
    vpermq m1, m0, 0

    ; compute the mask for absolute value
    pcmpeqq m13, m13
    psrlq   m13, 1

    movu m12, [res_maxq]

    ; load pointers to the equation coefficients
    %define diff_coeffs20q diff_coeffsq             ; reuse the array register to store the last pointer
    mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00]
    mov diff_coeffs01q, [diff_coeffsq + OFF_DIFF_COEFF_01]
    mov diff_coeffs10q, [diff_coeffsq + OFF_DIFF_COEFF_10]
    mov diff_coeffs11q, [diff_coeffsq + OFF_DIFF_COEFF_11]
    mov diff_coeffs02q, [diff_coeffsq + OFF_DIFF_COEFF_02]
    mov diff_coeffs20q, [diff_coeffsq + OFF_DIFF_COEFF_20]

    ; setup the data pointers and the loop counter
    shl strideq,   3
    shl linesizeq, 3

    add dstq,           linesizeq
    add uq,             linesizeq
    add rhsq,           linesizeq
    add diff_coeffs00q, linesizeq
    add diff_coeffs01q, linesizeq
    add diff_coeffs10q, linesizeq
    add diff_coeffs11q, linesizeq
    add diff_coeffs02q, linesizeq
    add diff_coeffs20q, linesizeq

    neg linesizeq
    ; from now on, the register that held linesize is used as the offset into data arrays
    %define offsetq linesizeq

.loop:
    xorpd m0, m0
    subpd m0, [rhsq + offsetq]                      ; res = -rhs

    ; plain value
    movu        m6, [uq + offsetq]                  ; m6 = u[x]
    vfmadd231pd m0, m6, [diff_coeffs00q + offsetq]  ; res += u * diff_coeffs00

%if %2
    mulpd m3, m6, m2
%endif

    RES_ADD_DIFF_X_3
    RES_ADD_DIFF_Y_3
    RES_ADD_DIFF_MIXED_3

    andpd m6, m0, m13                               ; m6 = abs(res)
    mulpd m0, m1
%if %2
    addpd m0, m3
%endif

    ; store the result
    add offsetq, mmsize
    jg .store_partial

    ; store full block
    movu [dstq + offsetq - mmsize], m0
    maxpd m12, m6

    js .loop
    jmp .finish

.store_partial:
    sub offsetq, ELEM_SIZE
    jz .store3
    sub offsetq, ELEM_SIZE
    jz .store2

.store1:
    ; offsetq is now mmsize-2 after the write position
    movq [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0
    vpermq m6, m6, 0
    maxpd m12, m6
    jmp .finish

.store2:
    ; offsetq is now mmsize-2 after the write position
    movu [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0
    mova xm6, xm6
    maxpd m12, m6
    jmp .finish

.store3:
    ; offsetq is now mmsize-1 after the write position
    movu [dstq + offsetq - mmsize + 1 * ELEM_SIZE], xm0
    vextractf128 xm0, m0, 1
    movq [dstq + offsetq - mmsize + 3 * ELEM_SIZE], xm0

    vpermq m6, m6, 10100100b
    maxpd m12, m6

.finish:
    movu [res_maxq], m12
    RET
%endmacro

INIT_YMM fma3
cglobal residual_calc_line_s3, 7, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
                               diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_down
    RESIDUAL_CALC_3 3, 0

cglobal residual_add_line_s3, 7, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
                              diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_down
    RESIDUAL_CALC_3 3, 1