From f989d2831f72ee69d5e96ca73a5a6427b064534f Mon Sep 17 00:00:00 2001
From: Anton Khirnov
Date: Wed, 9 Jan 2019 22:11:52 +0100
Subject: residual_calc.asm: templatize computing the mixed derivative

---
 residual_calc.asm | 74 +++++++++++++++++++++++++++++--------------------------
 1 file changed, 39 insertions(+), 35 deletions(-)

(limited to 'residual_calc.asm')

diff --git a/residual_calc.asm b/residual_calc.asm
index bfb6ce3..6424b2f 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -35,7 +35,6 @@ SECTION .rodata
 const8: times 8 dq 8.0
 const16: times 8 dq 16.0
 const30: times 8 dq 30.0
-const64: times 8 dq 64.0
 
 SECTION .text
 
@@ -101,6 +100,43 @@ SECTION .text
     vfmadd231pd m0, m11, [diff_coeffs20q + offsetq] ; res += d_xx u * diff_coeffs20
 %endmacro
 
+; calculate and add residual contributions from the second mixed derivative
+;
+; parameters:
+;   %1: stencil (the wider terms inside the %if below are added only when %1 == 2)
+;
+; register use (in addition to register allocation described above):
+;   m6, m7: used for work (clobbered); m14 is read as the factor 8 when %1 == 2
+%macro RES_ADD_DIFF_MIXED 1
+    movu  m6, [u_upq   + 1 * ELEM_SIZE + offsetq] ; m6 = u[y+1, x+1]
+    subpd m6, [u_upq   - 1 * ELEM_SIZE + offsetq] ; - u[y+1, x-1]
+    subpd m6, [u_downq + 1 * ELEM_SIZE + offsetq] ; - u[y-1, x+1]
+    addpd m6, [u_downq - 1 * ELEM_SIZE + offsetq] ; + u[y-1, x-1]
+
+    %if %1 == 2
+    movu  m7, [u_up2q   - 1 * ELEM_SIZE + offsetq] ; m7 = u[y+2, x-1]
+    subpd m7, [u_up2q   + 1 * ELEM_SIZE + offsetq] ; - u[y+2, x+1]
+    subpd m7, [u_upq    + 2 * ELEM_SIZE + offsetq] ; - u[y+1, x+2]
+    addpd m7, [u_upq    - 2 * ELEM_SIZE + offsetq] ; + u[y+1, x-2]
+    addpd m7, [u_downq  + 2 * ELEM_SIZE + offsetq] ; + u[y-1, x+2]
+    subpd m7, [u_downq  - 2 * ELEM_SIZE + offsetq] ; - u[y-1, x-2]
+    addpd m7, [u_down2q + 1 * ELEM_SIZE + offsetq] ; + u[y-2, x+1]
+    subpd m7, [u_down2q - 1 * ELEM_SIZE + offsetq] ; - u[y-2, x-1]
+
+    vfmadd123pd m6, m14, m7 ; m6 = 8 m6 + m7 (m14 = 8.0): 8 * (+-1,+-1 terms) + mixed +-1/+-2 terms
+
+    movu  m7, [u_up2q   + 2 * ELEM_SIZE + offsetq] ; m7 = u[y+2, x+2]
+    subpd m7, [u_up2q   - 2 * ELEM_SIZE + offsetq] ; - u[y+2, x-2]
+    subpd m7, [u_down2q + 2 * ELEM_SIZE + offsetq] ; - u[y-2, x+2]
+    addpd m7, [u_down2q - 2 * ELEM_SIZE + offsetq] ; + u[y-2, x-2]
+
+    vfmadd123pd m6, m14, m7 ; m6 = 8 m6 + m7: weights are now 64 (+-1,+-1), 8 (mixed +-1/+-2), 1 (+-2,+-2)
+    %endif
+
+    mulpd m6, m3 ; scale by m3 -- presumably the finite-difference factor for d_xy; TODO confirm against the register allocation comment
+    vfmadd231pd m0, m6, [diff_coeffs11q + offsetq] ; res += d_xy u * diff_coeffs11
+%endmacro
+
 INIT_YMM fma3
 cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_coeffs, fd_factors,\
                                diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up
@@ -156,15 +192,7 @@ cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_co
 
     RES_ADD_DIFF_SINGLEDIR 1, 0
     RES_ADD_DIFF_SINGLEDIR 1, 1
-
-    ; mixed d2xy
-    movu m6, [u_upq + offsetq + ELEM_SIZE]
-    subpd m6, [u_upq + offsetq - ELEM_SIZE]
-    subpd m6, [u_downq + offsetq + ELEM_SIZE]
-    addpd m6, [u_downq + offsetq - ELEM_SIZE]
-
-    mulpd m6, m3
-    vfmadd231pd m0, m6, [diff_coeffs11q + offsetq] ; res += d_xy u * diff_coeffs11
+    RES_ADD_DIFF_MIXED 1
 
     ; store the result
     movu [dstq + offsetq], m0
@@ -214,7 +242,6 @@ cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, stride, u, rhs, diff_co
     movu m15, [const30]
     movu m14, [const8]
     movu m13, [const16]
-    movu m12, [const64]
 
     ; setup pointers to the lines above and below
     lea u_upq, [uq + strideq]
@@ -237,30 +264,7 @@ cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, stride, u, rhs, diff_co
 
     RES_ADD_DIFF_SINGLEDIR 2, 0
     RES_ADD_DIFF_SINGLEDIR 2, 1
-
-    ; mixed d2xy
-    movu m6, [u_up2q + offsetq + 2 * ELEM_SIZE] ; m6 = u[y+2, x+2]
-    vfnmadd231pd m6, m14, [u_up2q + offsetq + 1 * ELEM_SIZE] ; m6 -= 8 u[y+2, x+1]
-    vfmadd231pd m6, m14, [u_up2q + offsetq - 1 * ELEM_SIZE] ; m6 += 8 u[y+2, x-1]
-    subpd m6, [u_up2q + offsetq - 2 * ELEM_SIZE] ; m6 -= u[y+2, x-2]
-
-    vfnmadd231pd m6, m14, [u_upq + offsetq + 2 * ELEM_SIZE] ; m6 -= 8 u[y+1, x+2]
-    vfmadd231pd m6, m12, [u_upq + offsetq + 1 * ELEM_SIZE] ; m6 += 64 u[y+1, x+1]
-    vfnmadd231pd m6, m12, [u_upq + offsetq - 1 * ELEM_SIZE] ; m6 -= 64 u[y+1, x-1]
-    vfmadd231pd m6, m14, [u_upq + offsetq - 2 * ELEM_SIZE] ; m6 += 8 u[y+1, x-2]
-
-    vfmadd231pd m6, m14, [u_downq + offsetq + 2 * ELEM_SIZE] ; m6 += 8 u[y-1, x+2]
-    vfnmadd231pd m6, m12, [u_downq + offsetq + 1 * ELEM_SIZE] ; m6 -= 64 u[y-1, x+1]
-    vfmadd231pd m6, m12, [u_downq + offsetq - 1 * ELEM_SIZE] ; m6 += 64 u[y-1, x-1]
-    vfnmadd231pd m6, m14, [u_downq + offsetq - 2 * ELEM_SIZE] ; m6 -= 8 u[y-1, x-2]
-
-    subpd m6, [u_down2q + offsetq + 2 * ELEM_SIZE] ; m6 -= u[y-2, x+2]
-    vfmadd231pd m6, m14, [u_down2q + offsetq + 1 * ELEM_SIZE] ; m6 += 8 u[y-2, x+1]
-    vfnmadd231pd m6, m14, [u_down2q + offsetq - 1 * ELEM_SIZE] ; m6 += 8 u[y-2, x-1]
-    addpd m6, [u_down2q + offsetq - 2 * ELEM_SIZE] ; m6 += u[y-2, x-2]
-
-    mulpd m6, m3
-    vfmadd231pd m0, m6, [diff_coeffs11q + offsetq] ; res += d_xy u * diff_coeffs11
+    RES_ADD_DIFF_MIXED 2
 
     ; store the result
     movu [dstq + offsetq], m0
--
cgit v1.2.3