;
; SIMD for computing the residual
; Copyright 2018 Anton Khirnov <anton@khirnov.net>
;
; This program is free software: you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation, either version 3 of the License, or
; (at your option) any later version.
;
; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this program.  If not, see <http://www.gnu.org/licenses/>.
;/

%include "config.asm"
%include "x86inc.asm"

; double precision
%define ELEM_SIZE 8

; offsets to FD coefficients for given derivative
%define OFF_DIFF_COEFF_00 0 * gprsize
%define OFF_DIFF_COEFF_01 1 * gprsize
%define OFF_DIFF_COEFF_10 2 * gprsize
%define OFF_DIFF_COEFF_11 3 * gprsize
%define OFF_DIFF_COEFF_02 4 * gprsize
%define OFF_DIFF_COEFF_20 5 * gprsize

SECTION .rodata

; FD stencil constants, splatted across a full 64-byte (8-lane) vector
; so the same symbol can be loaded into xmm/ymm registers of any width
const8:  times 8 dq  8.0
const30: times 8 dq 30.0

SECTION .text

; mm register allocation (both s1 and s2)
; m0: accumulator for the residual
; m6-m11: working registers
; m12: max(fabs(residual))
; m13: mask for computing absolute values
; (s2 only) m14-m15: splatted constants 8.0, 30.0

; calculate and add residual contributions from first and second derivatives
; along a single direction (x or y)
;
; parameters:
; %1: stencil
; %2: 0 -- x; 1 -- y
;
; register use (in addition to register allocation described above):
; m6: on entry contains u[x] multiplied by the corresponding FD coefficient, not
;     clobbered
; m7-m11 used for work (clobbered)
%macro RES_ADD_DIFF_SINGLEDIR 2
    %define stencil %1

    ; select the neighbour addresses and FD coefficient arrays for the
    ; requested direction: along x (%2 == 0) the neighbours are adjacent
    ; elements within the same line, along y (%2 == 1) they live in the
    ; lines above/below, whose base pointers were set up by RESIDUAL_CALC
    %if %2 == 0
        %define up1q uq + ELEM_SIZE
        %define up2q uq + ELEM_SIZE * 2
        %define um1q uq - ELEM_SIZE
        %define um2q uq - ELEM_SIZE * 2
        %define coeffs1q diff_coeffs10q
        %define coeffs2q diff_coeffs20q
    %else
        %define up1q u_upq
        %define up2q u_up2q
        %define um1q u_downq
        %define um2q u_down2q
        %define coeffs1q diff_coeffs01q
        %define coeffs2q diff_coeffs02q
    %endif

    ; load the function values
    movu     m7,  [up1q + offsetq]                   ; m7  = u[x+1]
    movu     m9,  [um1q + offsetq]                   ; m9  = u[x-1]
    %if stencil == 2
        movu m8,  [up2q + offsetq]                   ; m8  = u[x+2]
        movu m10, [um2q + offsetq]                   ; m10 = u[x-2]
    %endif

    ; first derivative
    ; stencil 1: d_x u ~  u[x+1] - u[x-1]
    ; stencil 2: d_x u ~ 8 (u[x+1] - u[x-1]) - u[x+2] + u[x-2]
    ; NOTE(review): the step-size normalization (1/(2h) resp. 1/(12h))
    ; is presumably folded into the coefficient arrays -- confirm
    subpd        m11, m7, m9                         ; m11 = u[x+1] - u[x-1]
    %if stencil == 2
        mulpd        m11, m14                        ; m11 *= 8
        subpd        m11, m8                         ; m11 -=   u[x+2]
        addpd        m11, m10                        ; m11 +=   u[x-2]
    %endif
    vfmadd231pd m0, m11, [coeffs1q + offsetq]        ; res += d_x u * diff_coeffs10

    ; second derivative
    ; stencil 1: d_xx u ~ u[x+1] + u[x-1] - 2 u[x]
    ; stencil 2: d_xx u ~ 16 (u[x+1] + u[x-1]) - u[x+2] - u[x-2] - 30 u[x]
    ; m6 holds the premultiplied center term (2 u[x] or 30 u[x]),
    ; prepared by the caller and left unclobbered here
    addpd        m11, m7, m9                         ; m11 = u[x+1] + u[x-1]
    %if stencil == 2
        mulpd        m11, m14
        addpd        m11, m11                        ; m11 *= 16
        subpd        m11, m8                         ; m11 -=    u[x+2]
        subpd        m11, m10                        ; m11 -=    u[x-2]
    %endif
    subpd       m11, m6                              ; m11 -= fd0 u[x]
    vfmadd231pd m0, m11, [coeffs2q + offsetq]        ; res += d_xx u * diff_coeffs20
%endmacro

; calculate and add residual contributions from the second mixed derivative
;
; parameters:
; %1: stencil
;
; register use (in addition to register allocation described above):
; m6, m7: used for work (clobbered)
%macro RES_ADD_DIFF_MIXED 1
    ; innermost ring of the cross stencil:
    ; u[y+1,x+1] - u[y+1,x-1] - u[y-1,x+1] + u[y-1,x-1]
    movu  m6, [u_upq    + 1 * ELEM_SIZE + offsetq] ; m6 = u[y+1, x+1]
    subpd m6, [u_upq    - 1 * ELEM_SIZE + offsetq] ;    - u[y+1, x-1]
    subpd m6, [u_downq  + 1 * ELEM_SIZE + offsetq] ;    - u[y-1, x+1]
    addpd m6, [u_downq  - 1 * ELEM_SIZE + offsetq] ;    + u[y-1, x-1]

    %if %1 == 2
        ; 4th-order stencil combines three rings with weights 64 : 8 : 1,
        ; accumulated as 8 * (8 * inner + middle) + outer; the middle
        ; ring carries negative weight relative to the inner one, hence
        ; the reversed signs below
        movu  m7, [u_up2q   - 1 * ELEM_SIZE + offsetq] ; m7 = u[y+2, x-1]
        subpd m7, [u_up2q   + 1 * ELEM_SIZE + offsetq] ;    - u[y+2, x+1]
        subpd m7, [u_upq    + 2 * ELEM_SIZE + offsetq] ;    - u[y+1, x+2]
        addpd m7, [u_upq    - 2 * ELEM_SIZE + offsetq] ;    + u[y+1, x-2]
        addpd m7, [u_downq  + 2 * ELEM_SIZE + offsetq] ;    + u[y-1, x+2]
        subpd m7, [u_downq  - 2 * ELEM_SIZE + offsetq] ;    - u[y-1, x-2]
        addpd m7, [u_down2q + 1 * ELEM_SIZE + offsetq] ;    + u[y-2, x+1]
        subpd m7, [u_down2q - 1 * ELEM_SIZE + offsetq] ;    - u[y-2, x-1]

        vfmadd123pd m6, m14, m7                        ; m6 = 8 m6 + m7

        ; outermost ring
        movu  m7, [u_up2q   + 2 * ELEM_SIZE + offsetq] ; m7 = u[y+2, x+2]
        subpd m7, [u_up2q   - 2 * ELEM_SIZE + offsetq] ;    - u[y+2, x-2]
        subpd m7, [u_down2q + 2 * ELEM_SIZE + offsetq] ;    - u[y-2, x+2]
        addpd m7, [u_down2q - 2 * ELEM_SIZE + offsetq] ;    + u[y-2, x-2]

        vfmadd123pd m6, m14, m7                        ; m6 = 8 m6 + m7

        ; NOTE(review): the 1/(4h^2) resp. 1/(144h^2) normalization is
        ; presumably folded into diff_coeffs11 -- confirm
    %endif

    vfmadd231pd m0, m6, [diff_coeffs11q + offsetq]     ; res += d_xy u * diff_coeffs11
%endmacro

; Compute one line of the residual: for each element,
;   res = sum over derivative terms (diff_coeffs_* * D_* u) - rhs
; storing res into dst and folding fabs(res) into the running maximum
; kept at [res_maxq].  The line is processed mmsize / ELEM_SIZE doubles
; at a time; the final partial block (1-3 leftover elements) is handled
; by the .store1/2/3 tails.
; NOTE(review): for a partial final block the difference macros still
; load full vectors past the end of the line -- presumably the arrays
; are padded to make the over-read safe; confirm against the allocator.
;
; %1: stencil (1 or 2)
%macro RESIDUAL_CALC 1
    %define stencil %1


    ; compute the mask for absolute value:
    ; all-ones shifted right by 1 clears exactly the sign bit of every
    ; double lane, so ANDing with it computes fabs()
    pcmpeqq m13, m13
    psrlq   m13, 1
    movu    m12, [res_maxq]         ; running max(|res|) accumulator

    ; load pointers to the equation coefficients
    %define diff_coeffs20q diff_coeffsq ; reuse the array register to store the last pointer
    mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00]
    mov diff_coeffs01q, [diff_coeffsq + OFF_DIFF_COEFF_01]
    mov diff_coeffs10q, [diff_coeffsq + OFF_DIFF_COEFF_10]
    mov diff_coeffs11q, [diff_coeffsq + OFF_DIFF_COEFF_11]
    mov diff_coeffs02q, [diff_coeffsq + OFF_DIFF_COEFF_02]
    mov diff_coeffs20q, [diff_coeffsq + OFF_DIFF_COEFF_20] ; must stay last -- overwrites diff_coeffsq

    ; setup the data pointers and the loop counter
    shl strideq, 3                  ; elements -> bytes (ELEM_SIZE == 8)
    shl linesizeq, 3
    add dstq,           linesizeq
    add uq,             linesizeq
    add rhsq,           linesizeq
    add diff_coeffs00q, linesizeq
    add diff_coeffs01q, linesizeq
    add diff_coeffs10q, linesizeq
    add diff_coeffs11q, linesizeq
    add diff_coeffs02q, linesizeq
    add diff_coeffs20q, linesizeq
    neg linesizeq
    ; from now on, the register that held linesize is used as the offset into data arrays
    ; (negative, advancing towards 0 = end of line)
    %define offsetq linesizeq

    ; setup pointers to the line above and below
    lea u_upq,   [uq + strideq]
    mov u_downq, uq
    sub u_downq, strideq
    %if stencil == 2
        lea u_up2q,   [uq + 2 * strideq]
        neg strideq
        add strideq, u_downq        ; strideq = u - 2*stride
        %define u_down2q strideq ; reuse the stride register for the u[y-2] line

        movu    m15, [const30]
        movu    m14, [const8]
    %endif

.loop:
    xorpd m0, m0
    subpd m0, [rhsq + offsetq]                          ; res  = -rhs

    ; plain value
    movu        m6, [uq + offsetq]                      ; m6 = u[x]
    vfmadd231pd m0, m6, [diff_coeffs00q + offsetq]      ; res += u * diff_coeffs00

    ; premultiply the center value for the second-derivative terms in
    ; RES_ADD_DIFF_SINGLEDIR (which leaves m6 untouched)
    %if stencil == 1
        addpd       m6, m6                              ; m6 = 2  * u[x]
    %else
        mulpd       m6, m15                             ; m6 = 30 * u[x]
    %endif

    RES_ADD_DIFF_SINGLEDIR stencil, 0
    RES_ADD_DIFF_SINGLEDIR stencil, 1
    RES_ADD_DIFF_MIXED     stencil

    andpd m6, m0, m13                                   ; m6 = abs(res)

    ; store the result
    ; the flags set by this add feed both branches below: jg (offset
    ; crossed 0 -> final, partial block) and the later js (still
    ; negative -> more full blocks); movu/maxpd do not modify flags
    add offsetq, mmsize
    jg .store_partial

    ; store full block
    movu [dstq + offsetq - mmsize], m0
    maxpd m12, m6
    js .loop
    jmp .finish

.store_partial:
    ; 1-3 leftover elements; dispatch on how far offsetq overshot zero:
    ; ELEM_SIZE -> 3 valid elements, 2*ELEM_SIZE -> 2, otherwise 1
    sub offsetq, ELEM_SIZE
    jz .store3
    sub offsetq, ELEM_SIZE
    jz .store2

.store1:
    ; store the single valid (lowest) element
    ; offsetq is now mmsize-2 after the write position
    movq [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0

    ; broadcast lane 0 of |res| so maxpd only sees the valid element
    vpermq m6, m6, 0
    maxpd m12, m6

    jmp .finish
.store2:
    ; store the two valid (low) elements
    ; offsetq is now mmsize-2 after the write position
    movu [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0
    mova  xm6, xm6                  ; zeroes the upper lanes of m6; |res| >= 0 so the max is unaffected
    maxpd m12, m6
    jmp .finish
.store3:
    ; store the three valid (low) elements: the low xmm half, then the
    ; third element extracted from the high half
    ; offsetq is now mmsize-1 after the write position
    movu [dstq + offsetq - mmsize + 1 * ELEM_SIZE], xm0
    vextractf128 xm0, m0, 1
    movq [dstq + offsetq - mmsize + 3 * ELEM_SIZE], xm0

    ; duplicate lane 2 into lane 3 so the invalid top lane cannot
    ; contaminate the maximum
    vpermq m6, m6, 10100100b
    maxpd m12, m6

.finish:
    movu [res_maxq], m12
    RET
%endmacro

INIT_YMM fma3
; 2nd-order (stencil-1) entry point: 14 GPRs, 14 vector registers.
; NOTE(review): C prototype inferred from register names/uses --
; confirm against the header:
;   void residual_calc_line_s1(ptrdiff_t linesize, double *dst,
;                              double *res_max, ptrdiff_t stride,
;                              const double *u, const double *rhs,
;                              const double * const *diff_coeffs);
cglobal residual_calc_line_s1, 7, 14, 14, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
                                          diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up
RESIDUAL_CALC 1

INIT_YMM fma3
; 4th-order (stencil-2) entry point: 15 GPRs (extra u_up2 line pointer),
; all 16 vector registers (m14/m15 hold the splatted 8.0/30.0 constants).
; Same inferred prototype as residual_calc_line_s1 -- confirm against
; the header.
cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
                                          diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up, u_up2
RESIDUAL_CALC 2