From 4dddc9813215eefb2f637fa62d8e165ad5a8ecaa Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Wed, 9 Jan 2019 15:19:58 +0100 Subject: residual_calc.asm: reduce the use of magic constants --- residual_calc.asm | 75 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 40 insertions(+), 35 deletions(-) (limited to 'residual_calc.asm') diff --git a/residual_calc.asm b/residual_calc.asm index 638ff42..2dd5f7c 100644 --- a/residual_calc.asm +++ b/residual_calc.asm @@ -19,6 +19,17 @@ %include "config.asm" %include "x86inc.asm" +; double precision +%define ELEM_SIZE 8 + +; offsets to FD coefficients for given derivative +%define OFF_DIFF_COEFF_00 0 * gprsize +%define OFF_DIFF_COEFF_01 1 * gprsize +%define OFF_DIFF_COEFF_10 2 * gprsize +%define OFF_DIFF_COEFF_11 3 * gprsize +%define OFF_DIFF_COEFF_02 4 * gprsize +%define OFF_DIFF_COEFF_20 5 * gprsize + SECTION .rodata const8: times 8 dq 8.0 @@ -27,12 +38,6 @@ const30: times 8 dq 30.0 const64: times 8 dq 64.0 SECTION .text -%define OFF_DIFF_COEFF_00 0 * 8 -%define OFF_DIFF_COEFF_01 1 * 8 -%define OFF_DIFF_COEFF_10 2 * 8 -%define OFF_DIFF_COEFF_11 3 * 8 -%define OFF_DIFF_COEFF_02 4 * 8 -%define OFF_DIFF_COEFF_20 5 * 8 INIT_YMM fma3 cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_coeffs, fd_factors,\ @@ -86,8 +91,8 @@ cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_co vfmadd231pd m0, m1, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00 ; dx, d2x - movu m2, [uq + offsetq + 8] - movu m3, [uq + offsetq - 8] + movu m2, [uq + offsetq + ELEM_SIZE] + movu m3, [uq + offsetq - ELEM_SIZE] subpd m6, m2, m3 mulpd m6, m8 @@ -114,10 +119,10 @@ cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_co vfmadd231pd m0, m6, [diff_coeffs02q + offsetq] ; res += d_yy u * diff_coeffs02 ; mixed d2xy - movu m1, [u_upq + offsetq + 8] - subpd m1, [u_upq + offsetq - 8] - subpd m1, [u_downq + offsetq + 8] - addpd m1, [u_downq + offsetq - 8] + 
movu m1, [u_upq + offsetq + ELEM_SIZE] + subpd m1, [u_upq + offsetq - ELEM_SIZE] + subpd m1, [u_downq + offsetq + ELEM_SIZE] + addpd m1, [u_downq + offsetq - ELEM_SIZE] mulpd m2, m9, [diff_coeffs11q + offsetq] vfmadd231pd m0, m1, m2 ; res += d_xy u * diff_coeffs11 @@ -190,10 +195,10 @@ cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, stride, u, rhs, diff_co vfmadd231pd m0, m6, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00 ; dx, d2x - movu m7, [uq + offsetq + 8] ; m7 = u[x+1] - movu m8, [uq + offsetq + 8 * 2] ; m8 = u[x+2] - movu m9, [uq + offsetq - 8] ; m9 = u[x-1] - movu m10, [uq + offsetq - 8 * 2] ; m10 = u[x-2] + movu m7, [uq + offsetq + ELEM_SIZE] ; m7 = u[x+1] + movu m8, [uq + offsetq + ELEM_SIZE * 2] ; m8 = u[x+2] + movu m9, [uq + offsetq - ELEM_SIZE] ; m9 = u[x-1] + movu m10, [uq + offsetq - ELEM_SIZE * 2] ; m10 = u[x-2] mulpd m11, m14, m7 ; m11 = 8 u[x+1] vfnmadd231pd m11, m14, m9 ; m11 -= 8 u[x-1] @@ -236,25 +241,25 @@ cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, stride, u, rhs, diff_co vfmadd231pd m0, m11, [diff_coeffs02q + offsetq] ; res += d_yy u * diff_coeffs02 ; mixed d2xy - movu m6, [u_up2q + offsetq + 2 * 8] ; m6 = u[y+2, x+2] - vfnmadd231pd m6, m14, [u_up2q + offsetq + 1 * 8] ; m6 -= 8 u[y+2, x+1] - vfmadd231pd m6, m14, [u_up2q + offsetq - 1 * 8] ; m6 += 8 u[y+2, x-1] - subpd m6, [u_up2q + offsetq - 2 * 8] ; m6 -= u[y+2, x-2] - - vfnmadd231pd m6, m14, [u_upq + offsetq + 2 * 8] ; m6 -= 8 u[y+1, x+2] - vfmadd231pd m6, m12, [u_upq + offsetq + 1 * 8] ; m6 += 64 u[y+1, x+1] - vfnmadd231pd m6, m12, [u_upq + offsetq - 1 * 8] ; m6 -= 64 u[y+1, x-1] - vfmadd231pd m6, m14, [u_upq + offsetq - 2 * 8] ; m6 += 8 u[y+1, x-2] - - vfmadd231pd m6, m14, [u_downq + offsetq + 2 * 8] ; m6 += 8 u[y-1, x+2] - vfnmadd231pd m6, m12, [u_downq + offsetq + 1 * 8] ; m6 -= 64 u[y-1, x+1] - vfmadd231pd m6, m12, [u_downq + offsetq - 1 * 8] ; m6 += 64 u[y-1, x-1] - vfnmadd231pd m6, m14, [u_downq + offsetq - 2 * 8] ; m6 -= 8 u[y-1, x-2] - - subpd m6, 
[u_down2q + offsetq + 2 * 8] ; m6 -= u[y-2, x+2] - vfmadd231pd m6, m14, [u_down2q + offsetq + 1 * 8] ; m6 += 8 u[y-2, x+1] - vfnmadd231pd m6, m14, [u_down2q + offsetq - 1 * 8] ; m6 += 8 u[y-2, x-1] - addpd m6, [u_down2q + offsetq - 2 * 8] ; m6 += u[y-2, x-2] + movu m6, [u_up2q + offsetq + 2 * ELEM_SIZE] ; m6 = u[y+2, x+2] + vfnmadd231pd m6, m14, [u_up2q + offsetq + 1 * ELEM_SIZE] ; m6 -= 8 u[y+2, x+1] + vfmadd231pd m6, m14, [u_up2q + offsetq - 1 * ELEM_SIZE] ; m6 += 8 u[y+2, x-1] + subpd m6, [u_up2q + offsetq - 2 * ELEM_SIZE] ; m6 -= u[y+2, x-2] + + vfnmadd231pd m6, m14, [u_upq + offsetq + 2 * ELEM_SIZE] ; m6 -= 8 u[y+1, x+2] + vfmadd231pd m6, m12, [u_upq + offsetq + 1 * ELEM_SIZE] ; m6 += 64 u[y+1, x+1] + vfnmadd231pd m6, m12, [u_upq + offsetq - 1 * ELEM_SIZE] ; m6 -= 64 u[y+1, x-1] + vfmadd231pd m6, m14, [u_upq + offsetq - 2 * ELEM_SIZE] ; m6 += 8 u[y+1, x-2] + + vfmadd231pd m6, m14, [u_downq + offsetq + 2 * ELEM_SIZE] ; m6 += 8 u[y-1, x+2] + vfnmadd231pd m6, m12, [u_downq + offsetq + 1 * ELEM_SIZE] ; m6 -= 64 u[y-1, x+1] + vfmadd231pd m6, m12, [u_downq + offsetq - 1 * ELEM_SIZE] ; m6 += 64 u[y-1, x-1] + vfnmadd231pd m6, m14, [u_downq + offsetq - 2 * ELEM_SIZE] ; m6 -= 8 u[y-1, x-2] + + subpd m6, [u_down2q + offsetq + 2 * ELEM_SIZE] ; m6 -= u[y-2, x+2] + vfmadd231pd m6, m14, [u_down2q + offsetq + 1 * ELEM_SIZE] ; m6 += 8 u[y-2, x+1] + vfnmadd231pd m6, m14, [u_down2q + offsetq - 1 * ELEM_SIZE] ; m6 -= 8 u[y-2, x-1] + addpd m6, [u_down2q + offsetq - 2 * ELEM_SIZE] ; m6 += u[y-2, x-2] mulpd m6, m3 vfmadd231pd m0, m6, [diff_coeffs11q + offsetq] ; res += d_xy u * diff_coeffs11 -- cgit v1.2.3