;
; SIMD for computing the residual
; Copyright 2018 Anton Khirnov <anton@khirnov.net>
;
; This program is free software: you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation, either version 3 of the License, or
; (at your option) any later version.
;
; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this program.  If not, see <http://www.gnu.org/licenses/>.
;

%include "config.asm"
%include "x86inc.asm"
%include "util.asm"

; double precision
%define ELEM_SIZE 8

SECTION .rodata

const8:  times 8 dq  8.0
const30: times 8 dq 30.0
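
; numerators of the 4th-order central-difference stencils used below:
;   12h   du/dx    ~=  8 (u[x+1] - u[x-1]) - u[x+2] + u[x-2]
;   12h^2 d2u/dx2  ~= 16 (u[x+1] + u[x-1]) - u[x+2] - u[x-2] - 30 u[x]
; the 1/(12h) and 1/(12h^2) normalization is presumably folded into the
; per-pixel diff_coeffs arrays by the caller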

SECTION .text

; mm register allocation (both s1 and s2)
; m0: accumulator for the residual
; m1: res mult factor
; m2: (add only) u mult factor
; m3: (add only) u[x] scaled by the u mult factor
; m6-m11: working registers
; m12: max(fabs(residual))
; m13: mask for computing absolute values
; (s2 only) m14-m15: splatted constants 8.0, 30.0

; calculate and add residual contributions from first and second derivatives
; along a single direction (x or y)
;
; parameters:
; %1: stencil
; %2: 0 -- x; 1 -- y
;
; register use (in addition to register allocation described above):
; m6: on entry contains u[x] multiplied by the corresponding FD coefficient, not
;     clobbered
; m7-m11 used for work (clobbered)
%macro RES_ADD_DIFF_SINGLEDIR 2
    %define stencil %1

    %if %2 == 0
        %define up1q uq + ELEM_SIZE
        %define up2q uq + ELEM_SIZE * 2
        %define um1q uq - ELEM_SIZE
        %define um2q uq - ELEM_SIZE * 2
        %define coeffs1q diff_coeffsq + diff_coeff_offset_10
        %define coeffs2q diff_coeffsq + diff_coeff_offset_20
    %else
        %define up1q u_upq
        %define up2q u_up2q
        %define um1q u_downq
        %define um2q u_down2q
        %define coeffs1q diff_coeffsq + diff_coeff_offset_01
        %define coeffs2q diff_coeffsq + diff_coeff_offset_02
    %endif

    ; load the function values
    movu     m7,  [up1q + offsetq]                   ; m7  = u[x+1]
    movu     m9,  [um1q + offsetq]                   ; m9  = u[x-1]
    %if stencil == 2
        movu m8,  [up2q + offsetq]                   ; m8  = u[x+2]
        movu m10, [um2q + offsetq]                   ; m10 = u[x-2]
    %endif

    ; first derivative
    subpd        m11, m7, m9                         ; m11 = u[x+1] - u[x-1]
    %if stencil == 2
        mulpd        m11, m14                        ; m11 *= 8
        subpd        m11, m8                         ; m11 -=   u[x+2]
        addpd        m11, m10                        ; m11 +=   u[x-2]
    %endif
    vfmadd231pd m0, m11, [coeffs1q]                  ; res += d_x u * diff_coeffs10

    ; second derivative
    addpd        m11, m7, m9                         ; m11 = u[x+1] + u[x-1]
    %if stencil == 2
        mulpd        m11, m14
        addpd        m11, m11                        ; m11 *= 16
        subpd        m11, m8                         ; m11 -=    u[x+2]
        subpd        m11, m10                        ; m11 -=    u[x-2]
    %endif
    subpd       m11, m6                              ; m11 -= fd0 u[x]
    vfmadd231pd m0, m11, [coeffs2q]                  ; res += d_xx u * diff_coeffs20
%endmacro
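
; scalar reference for one invocation of the macro above (a sketch; coeffs1/
; coeffs2 stand for the coefficient arrays selected by %2, normalization
; assumed baked in):
;   stencil 2:  d1 = 8*(u[x+1] - u[x-1]) - u[x+2] + u[x-2]
;               d2 = 16*(u[x+1] + u[x-1]) - u[x+2] - u[x-2] - 30*u[x]
;   stencil 1:  d1 = u[x+1] - u[x-1]
;               d2 = u[x+1] + u[x-1] - 2*u[x]
;   res += d1 * coeffs1[i] + d2 * coeffs2[i]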

; calculate and add residual contributions from the second mixed derivative
;
; parameters:
; %1: stencil
;
; register use (in addition to register allocation described above):
; m6, m7: used for work (clobbered)
%macro RES_ADD_DIFF_MIXED 1
    movu  m6, [u_upq    + 1 * ELEM_SIZE + offsetq] ; m6 = u[y+1, x+1]
    subpd m6, [u_upq    - 1 * ELEM_SIZE + offsetq] ;    - u[y+1, x-1]
    subpd m6, [u_downq  + 1 * ELEM_SIZE + offsetq] ;    - u[y-1, x+1]
    addpd m6, [u_downq  - 1 * ELEM_SIZE + offsetq] ;    + u[y-1, x-1]

    %if %1 == 2
        movu  m7, [u_up2q   - 1 * ELEM_SIZE + offsetq] ; m7 = u[y+2, x-1]
        subpd m7, [u_up2q   + 1 * ELEM_SIZE + offsetq] ;    - u[y+2, x+1]
        subpd m7, [u_upq    + 2 * ELEM_SIZE + offsetq] ;    - u[y+1, x+2]
        addpd m7, [u_upq    - 2 * ELEM_SIZE + offsetq] ;    + u[y+1, x-2]
        addpd m7, [u_downq  + 2 * ELEM_SIZE + offsetq] ;    + u[y-1, x+2]
        subpd m7, [u_downq  - 2 * ELEM_SIZE + offsetq] ;    - u[y-1, x-2]
        addpd m7, [u_down2q + 1 * ELEM_SIZE + offsetq] ;    + u[y-2, x+1]
        subpd m7, [u_down2q - 1 * ELEM_SIZE + offsetq] ;    - u[y-2, x-1]

        vfmadd123pd m6, m14, m7                        ; m6 = 8 m6 + m7

        movu  m7, [u_up2q   + 2 * ELEM_SIZE + offsetq] ; m7 = u[y+2, x+2]
        subpd m7, [u_up2q   - 2 * ELEM_SIZE + offsetq] ;    - u[y+2, x-2]
        subpd m7, [u_down2q + 2 * ELEM_SIZE + offsetq] ;    - u[y-2, x+2]
        addpd m7, [u_down2q - 2 * ELEM_SIZE + offsetq] ;    + u[y-2, x-2]

        vfmadd123pd m6, m14, m7                        ; m6 = 8 m6 + m7
    %endif

    vfmadd231pd m0, m6, [diff_coeffsq + diff_coeff_offset_11] ; res += d_xy u * diff_coeffs11
%endmacro
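
; scalar reference for the mixed term above (stencil 2; a sketch, with the
; 1/(144 h^2) normalization assumed folded into diff_coeffs11):
;   144 h^2 d_xy u ~= 64*(u[y+1,x+1] - u[y+1,x-1] - u[y-1,x+1] + u[y-1,x-1])
;                    + 8*(the +-2/+-1 cross terms, signed as in the code)
;                    +    u[y+2,x+2] - u[y+2,x-2] - u[y-2,x+2] + u[y-2,x-2]
; i.e. the tensor product of the 4th-order first-derivative stencil with itself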

; %1: stencil
; %2: 0 - calc; 1 - add
%macro RESIDUAL_CALC 2

%define stencil %1

%if %2
%define opname add
%else
%define opname calc
%endif

; typedef void ResidualLineCalc/Add(
;   size_t linesize, double *dst, double *res_max,
;   ptrdiff_t u_stride, const double *u, const double *rhs,
;   const double *diff_coeffs, ptrdiff_t diff_coeffs_offset,
;   double res_mult, [double u_mult (add only)])
cglobal residual_line_ %+ opname %+ _s %+ stencil,                              \
    8, 13, 12 + stencil * 2,                                                    \
    linesize, dst, res_max, u_stride, u, rhs, diff_coeffs, diff_coeffs_offset,  \
    u_down, u_up, u_up2, diff_coeffs_off3, diff_coeffs_off5

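    ; res_mult (and, for the add variant, u_mult) are the first FP arguments;
    ; under the SysV x86-64 ABI they arrive in xmm0/xmm1 == m0/m1, so
    ; broadcast each scalar to all four lanes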
%if %2
    vpermq m2, m1, 0
%endif
    vpermq m1, m0, 0

    ; compute the mask for absolute value
    pcmpeqq m13, m13
    psrlq   m13, 1
    movu    m12, [res_maxq]
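    ; res_maxq points at a full vector of 4 doubles holding the running
    ; per-lane maximum; reducing it to a single scalar is presumably left
    ; to the caller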

    ; setup the data pointers and the loop counter
    shl u_strideq, 3
    shl diff_coeffs_offsetq, 3
    shl linesizeq, 3
    add dstq,           linesizeq
    add uq,             linesizeq
    add rhsq,           linesizeq
    neg linesizeq
    ; from now on, the register that held linesize is used as the offset into data arrays
    %define offsetq linesizeq
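    ; the loop walks offsetq from -linesize*ELEM_SIZE up to 0, so the data
    ; pointers stay fixed and a single add advances every array at once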

    ; setup pointers to the line above and below
    lea u_upq,   [uq + u_strideq]
    mov u_downq, uq
    sub u_downq, u_strideq
    %if stencil == 2
        lea u_up2q,   [uq + 2 * u_strideq]
        neg u_strideq
        add u_strideq, u_downq
        %define u_down2q u_strideq ; reuse the stride register for the u[y-2] line

        movu    m15, [const30]
        movu    m14, [const8]
    %endif

    ; offsets to FD coefficients for given derivative
    %define diff_coeff_offset_01 1 * diff_coeffs_offsetq
    %define diff_coeff_offset_10 2 * diff_coeffs_offsetq

    lea diff_coeffs_off3q, [diff_coeffs_offsetq * 2 + diff_coeffs_offsetq]
    %define diff_coeff_offset_11 diff_coeffs_off3q

    %define diff_coeff_offset_02 4 * diff_coeffs_offsetq

    lea diff_coeffs_off5q, [diff_coeffs_offsetq * 4 + diff_coeffs_offsetq]
    %define diff_coeff_offset_20 diff_coeffs_off5q
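
    ; implied layout of diff_coeffs: six per-pixel coefficient arrays spaced
    ; diff_coeffs_offset elements apart, in the order
    ;   [00, 01, 10, 11, 02, 20]
    ; i.e. value, d/dy, d/dx, d_xy, d_yy, d_xx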


.loop:
    xorpd m0, m0
    subpd m0, [rhsq + offsetq]                          ; res  = -rhs

    ; plain value
    movu        m6, [uq + offsetq]                      ; m6 = u[x]
    vfmadd231pd m0, m6, [diff_coeffsq]                  ; res += u * diff_coeffs00
%if %2
    mulpd       m3, m6, m2
%endif

    %if stencil == 1
        addpd       m6, m6                              ; m6 = 2  * u[x]
    %else
        mulpd       m6, m15                             ; m6 = 30 * u[x]
    %endif

    RES_ADD_DIFF_SINGLEDIR stencil, 0
    RES_ADD_DIFF_SINGLEDIR stencil, 1
    RES_ADD_DIFF_MIXED     stencil

    andpd m6, m0, m13                                   ; m6 = abs(res)
    mulpd m0, m1
%if %2
    addpd m0, m3
%endif

    ; store the result
    add diff_coeffsq, mmsize
    add offsetq, mmsize
    jg .store_partial

    ; store full block
    movu [dstq + offsetq - mmsize], m0
    maxpd m12, m6
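    ; neither movu nor maxpd modifies EFLAGS, so js still tests the sign of
    ; offsetq set by the add above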
    js .loop
    jmp .finish

.store_partial:
    sub offsetq, ELEM_SIZE
    jz .store3
    sub offsetq, ELEM_SIZE
    jz .store2

.store1:
    ; offsetq is now mmsize - 2 elements past the write position
    movq [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0

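    ; broadcast lane 0 so the three invalid lanes cannot raise the max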
    vpermq m6, m6, 0
    maxpd m12, m6

    jmp .finish
.store2:
    ; offsetq is now mmsize - 2 elements past the write position
    movu [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0
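    ; moving xmm to itself zero-extends, clearing the two invalid upper lanes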
    mova  xm6, xm6
    maxpd m12, m6
    jmp .finish
.store3:
    ; offsetq is now mmsize - 1 element past the write position
    movu [dstq + offsetq - mmsize + 1 * ELEM_SIZE], xm0
    vextractf128 xm0, m0, 1
    movq [dstq + offsetq - mmsize + 3 * ELEM_SIZE], xm0

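    ; select lanes [0,1,2,2]: duplicate lane 2 over the invalid lane 3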
    vpermq m6, m6, 10100100b
    maxpd m12, m6

.finish:
    movu [res_maxq], m12
    RET
%endmacro

INIT_YMM avx2
RESIDUAL_CALC 1, 0
RESIDUAL_CALC 1, 1
RESIDUAL_CALC 2, 0
RESIDUAL_CALC 2, 1
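
; x86inc appends the cpuflags suffix to cglobal names, so with INIT_YMM avx2
; the functions defined above assemble to residual_line_calc_s1_avx2,
; residual_line_add_s1_avx2, residual_line_calc_s2_avx2 and
; residual_line_add_s2_avx2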