From 3eebc8df5edb3ca82e8f726a6f2555f6d2a70dfb Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Tue, 9 Apr 2019 09:45:16 +0200 Subject: egs: add higher-order finite difference operators --- residual_calc.asm | 231 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 229 insertions(+), 2 deletions(-) (limited to 'residual_calc.asm') diff --git a/residual_calc.asm b/residual_calc.asm index 5eea31c..471f864 100644 --- a/residual_calc.asm +++ b/residual_calc.asm @@ -32,8 +32,14 @@ SECTION .rodata -const8: times 8 dq 8.0 -const30: times 8 dq 30.0 +const5: times 8 dq 5.0 +const8: times 8 dq 8.0 +const9: times 8 dq 9.0 +const10: times 8 dq 10.0 +const13_5: times 8 dq 13.5 +const30: times 8 dq 30.0 +const45: times 8 dq 45.0 +const245: times 8 dq 245.0 SECTION .text @@ -282,3 +288,224 @@ RESIDUAL_CALC 2, 0 cglobal residual_add_line_s2, 7, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\ diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up, u_up2 RESIDUAL_CALC 2, 1 + +%macro DX3 3 + movu %1, [%3 + ELEM_SIZE * 1] ; out = u[x+1] + subpd %1, [%3 - ELEM_SIZE * 1] ; -u[x-1] + + movu %2, [%3 + ELEM_SIZE * 2] ; tmp = u[x+2] + subpd %2, [%3 - ELEM_SIZE * 2] ; -u[x-2] + + vfmsub132pd %1, %2, [const5] ; out = 5 * out - tmp + + movu %2, [%3 + ELEM_SIZE * 3] ; tmp = u[x+3] + subpd %2, [%3 - ELEM_SIZE * 3] ; -u[x-3] + + vfmadd132pd %1, %2, [const9] ; out = 9 * out + tmp +%endmacro + +%macro RES_ADD_DIFF_X_3 0 + ; first derivative + DX3 m7, m8, uq + offsetq + vfmadd231pd m0, m7, [diff_coeffs10q + offsetq] ; res += d_x u * diff_coeffs10 + + ; second derivative + movu m7, [uq + offsetq + ELEM_SIZE * 1] ; m7 = u[x+1] + addpd m7, [uq + offsetq - ELEM_SIZE * 1] ; +u[x-1] + + movu m8, [uq + offsetq + ELEM_SIZE * 2] ; m8 = u[x+2] + addpd m8, [uq + offsetq - ELEM_SIZE * 2] ; +u[x-2] + + vfmsub132pd m7, m8, [const10] ; m7 = 10 * m7 - m8 + + movu m8, [uq + offsetq + ELEM_SIZE * 3] ; m8 = u[x+3] + addpd m8, [uq + offsetq - ELEM_SIZE 
* 3] ; +u[x-3] + + vfmadd132pd m7, m8, [const13_5] ; m7 = 13.5 * m7 + m8 + vfnmadd231pd m7, m6, [const245] ; m7 = m7 - 245 * u[0] + vfmadd231pd m0, m7, [diff_coeffs20q + offsetq]; res += d_xx u * diff_coeffs20 +%endmacro + +%macro RES_ADD_DIFF_Y_3 0 + ; first derivative + lea u_upq, [uq + strideq] + mov u_downq, uq + sub u_downq, strideq + movu m7, [u_upq + offsetq] ; m7 = u[y+1] + subpd m7, [u_downq + offsetq] ; -u[y-1] + + add u_upq, strideq + sub u_downq, strideq + movu m8, [u_upq + offsetq] ; m8 = u[y+2] + subpd m8, [u_downq + offsetq] ; -u[y-2] + + vfmsub132pd m7, m8, [const5] ; m7 = 5 * m7 - m8 + + add u_upq, strideq + sub u_downq, strideq + movu m8, [u_upq + offsetq] ; m8 = u[y+3] + subpd m8, [u_downq + offsetq] ; -u[y-3] + + vfmadd132pd m7, m8, [const9] ; m7 = 9 * m7 + m8 + vfmadd231pd m0, m7, [diff_coeffs01q + offsetq] ; res += d_y u * diff_coeffs01 + + ; second derivative + lea u_upq, [uq + strideq] + mov u_downq, uq + sub u_downq, strideq + movu m7, [u_upq + offsetq] ; m7 = u[y+1] + addpd m7, [u_downq + offsetq] ; +u[y-1] + + add u_upq, strideq + sub u_downq, strideq + movu m8, [u_upq + offsetq] ; m8 = u[y+2] + addpd m8, [u_downq + offsetq] ; +u[y-2] + + vfmsub132pd m7, m8, [const10] ; m7 = 10 * m7 - m8 + + add u_upq, strideq + sub u_downq, strideq + movu m8, [u_upq + offsetq] ; m8 = u[y+3] + addpd m8, [u_downq + offsetq] ; +u[y-3] + + vfmadd132pd m7, m8, [const13_5] ; m7 = 13.5 * m7 + m8 + vfnmadd231pd m7, m6, [const245] ; m7 = m7 - 245 * u[0] + vfmadd231pd m0, m7, [diff_coeffs02q + offsetq]; res += d_yy u * diff_coeffs02 +%endmacro + +%macro RES_ADD_DIFF_MIXED_3 0 + lea u_upq, [uq + 2 * strideq] + add u_upq, strideq + DX3 m6, m7, u_upq + offsetq ; m6 = dx u[y + 3] + + sub u_upq, strideq + DX3 m7, m8, u_upq + offsetq ; m7 = dx u[y + 2] + vfnmadd231pd m6, m7, [const9] ; m6 = m6 - 9 * m7 + + sub u_upq, strideq + DX3 m7, m8, u_upq + offsetq ; m7 = dx u[y + 1] + vfmadd231pd m6, m7, [const45] ; m6 = m6 + 45 * m7 + + sub u_upq, strideq + sub u_upq, strideq + 
DX3 m7, m8, u_upq + offsetq ; m7 = dx u[y - 1] + vfnmadd231pd m6, m7, [const45] ; m6 = m6 - 45 * m7 + + sub u_upq, strideq + DX3 m7, m8, u_upq + offsetq ; m7 = dx u[y - 2] + vfmadd231pd m6, m7, [const9] ; m6 = m6 + 9 * m7 + + sub u_upq, strideq + DX3 m7, m8, u_upq + offsetq ; m7 = dx u[y - 3] + subpd m6, m7 + vfmadd231pd m0, m6, [diff_coeffs11q + offsetq] +%endmacro + +%macro RESIDUAL_CALC_3 2 +%if %2 + vpermq m2, m1, 0 +%endif + vpermq m1, m0, 0 + + ; compute the mask for absolute value + pcmpeqq m13, m13 + psrlq m13, 1 + movu m12, [res_maxq] + + ; load pointers to the equation coefficients + %define diff_coeffs20q diff_coeffsq ; reuse the array register to store the last pointer + mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00] + mov diff_coeffs01q, [diff_coeffsq + OFF_DIFF_COEFF_01] + mov diff_coeffs10q, [diff_coeffsq + OFF_DIFF_COEFF_10] + mov diff_coeffs11q, [diff_coeffsq + OFF_DIFF_COEFF_11] + mov diff_coeffs02q, [diff_coeffsq + OFF_DIFF_COEFF_02] + mov diff_coeffs20q, [diff_coeffsq + OFF_DIFF_COEFF_20] + + ; setup the data pointers and the loop counter + shl strideq, 3 + shl linesizeq, 3 + add dstq, linesizeq + add uq, linesizeq + add rhsq, linesizeq + add diff_coeffs00q, linesizeq + add diff_coeffs01q, linesizeq + add diff_coeffs10q, linesizeq + add diff_coeffs11q, linesizeq + add diff_coeffs02q, linesizeq + add diff_coeffs20q, linesizeq + neg linesizeq + ; from now on, the register that held linesize is used as the offset into data arrays + %define offsetq linesizeq +.loop: + xorpd m0, m0 + subpd m0, [rhsq + offsetq] ; res = -rhs + + ; plain value + movu m6, [uq + offsetq] ; m6 = u[x] + vfmadd231pd m0, m6, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00 +%if %2 + mulpd m3, m6, m2 +%endif + + RES_ADD_DIFF_X_3 + RES_ADD_DIFF_Y_3 + RES_ADD_DIFF_MIXED_3 + + andpd m6, m0, m13 ; m6 = abs(res) + mulpd m0, m1 +%if %2 + addpd m0, m3 +%endif + + ; store the result + add offsetq, mmsize + jg .store_partial + + ; store full block + movu [dstq + offsetq - 
mmsize], m0 + maxpd m12, m6 + js .loop + jmp .finish + +.store_partial: + sub offsetq, ELEM_SIZE + jz .store3 + sub offsetq, ELEM_SIZE + jz .store2 + +.store1: + ; offsetq is now mmsize-2 after the write position + movq [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0 + + vpermq m6, m6, 0 + maxpd m12, m6 + + jmp .finish +.store2: + ; offsetq is now mmsize-2 after the write position + movu [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0 + mova xm6, xm6 + maxpd m12, m6 + jmp .finish +.store3: + ; offsetq is now mmsize-1 after the write position + movu [dstq + offsetq - mmsize + 1 * ELEM_SIZE], xm0 + vextractf128 xm0, m0, 1 + movq [dstq + offsetq - mmsize + 3 * ELEM_SIZE], xm0 + + vpermq m6, m6, 10100100b + maxpd m12, m6 + +.finish: + movu [res_maxq], m12 + RET +%endmacro + +INIT_YMM fma3 +cglobal residual_calc_line_s3, 7, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\ + diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_down +RESIDUAL_CALC_3 3, 0 + +cglobal residual_add_line_s3, 7, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\ + diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_down +RESIDUAL_CALC_3 3, 1 -- cgit v1.2.3