From 88db705983c7b518372c93ef50bac5e8ddb1e6bf Mon Sep 17 00:00:00 2001
From: Anton Khirnov
Date: Wed, 9 Jan 2019 15:45:11 +0100
Subject: residual_calc.asm: make mm register use more consistent between s1
 and s2

---
 residual_calc.asm | 54 ++++++++++++++++++++++++++++--------------------------
 1 file changed, 28 insertions(+), 26 deletions(-)

diff --git a/residual_calc.asm b/residual_calc.asm
index 9b74cc8..7048da3 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -87,45 +87,45 @@ cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_co
     subpd m0, [rhsq + offsetq]                      ; res = -rhs
 
     ; plain value
-    movu m7, [uq + offsetq]
-    vfmadd231pd m0, m7, [diff_coeffs00q + offsetq]  ; res += u * diff_coeffs00
+    movu m6, [uq + offsetq]                         ; m6 = u[x]
+    vfmadd231pd m0, m6, [diff_coeffs00q + offsetq]  ; res += u * diff_coeffs00
+
+    addpd m6, m6                                    ; m6 = 2 * u[x]
 
     ; dx, d2x
     movu m8, [uq + offsetq + ELEM_SIZE]
     movu m9, [uq + offsetq - ELEM_SIZE]
-    subpd m6, m8, m9
-    mulpd m6, m2
-    vfmadd231pd m0, m6, [diff_coeffs10q + offsetq]  ; res += d_x u * diff_coeffs10
+    subpd m7, m8, m9
+    mulpd m7, m2
+    vfmadd231pd m0, m7, [diff_coeffs10q + offsetq]  ; res += d_x u * diff_coeffs10
 
-    addpd m7, m7
-
-    addpd m6, m8, m9
-    subpd m6, m7
-    mulpd m6, m5
-    vfmadd231pd m0, m6, [diff_coeffs20q + offsetq]  ; res += d_xx u * diff_coeffs20
+    addpd m7, m8, m9
+    subpd m7, m6
+    mulpd m7, m5
+    vfmadd231pd m0, m7, [diff_coeffs20q + offsetq]  ; res += d_xx u * diff_coeffs20
 
     ; dy, d2y
     movu m8, [u_upq + offsetq]
     movu m9, [u_downq + offsetq]
-    subpd m6, m8, m9
-    mulpd m6, m1
-    vfmadd231pd m0, m6, [diff_coeffs01q + offsetq]  ; res += d_y u * diff_coeffs01
+    subpd m7, m8, m9
+    mulpd m7, m1
+    vfmadd231pd m0, m7, [diff_coeffs01q + offsetq]  ; res += d_y u * diff_coeffs01
 
-    addpd m6, m8, m9
-    subpd m6, m7
-    mulpd m6, m4
-    vfmadd231pd m0, m6, [diff_coeffs02q + offsetq]  ; res += d_yy u * diff_coeffs02
+    addpd m7, m8, m9
+    subpd m7, m6
+    mulpd m7, m4
+    vfmadd231pd m0, m7, [diff_coeffs02q + offsetq]  ; res += d_yy u * diff_coeffs02
 
     ; mixed d2xy
-    movu m7, [u_upq + offsetq + ELEM_SIZE]
-    subpd m7, [u_upq + offsetq - ELEM_SIZE]
-    subpd m7, [u_downq + offsetq + ELEM_SIZE]
-    addpd m7, [u_downq + offsetq - ELEM_SIZE]
+    movu m6, [u_upq + offsetq + ELEM_SIZE]
+    subpd m6, [u_upq + offsetq - ELEM_SIZE]
+    subpd m6, [u_downq + offsetq + ELEM_SIZE]
+    addpd m6, [u_downq + offsetq - ELEM_SIZE]
 
-    mulpd m8, m3, [diff_coeffs11q + offsetq]
-    vfmadd231pd m0, m7, m8                          ; res += d_xy u * diff_coeffs11
+    mulpd m6, m3
+    vfmadd231pd m0, m6, [diff_coeffs11q + offsetq]  ; res += d_xy u * diff_coeffs11
 
     ; store the result
     movu [dstq + offsetq], m0
@@ -194,6 +194,8 @@ cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, stride, u, rhs, diff_co
     movu m6, [uq + offsetq]
     vfmadd231pd m0, m6, [diff_coeffs00q + offsetq]  ; res += u * diff_coeffs00
 
+    mulpd m6, m15                                   ; m6 = 30 u[x]
+
     ; dx, d2x
     movu m7, [uq + offsetq + ELEM_SIZE]             ; m7 = u[x+1]
     movu m8, [uq + offsetq + ELEM_SIZE * 2]         ; m8 = u[x+2]
@@ -212,7 +214,7 @@
     vfmadd231pd m11, m13, m9                        ; m11 += 16 u[x-1]
     subpd m11, m8                                   ; m11 -= u[x+2]
     subpd m11, m10                                  ; m11 -= u[x-2]
-    vfnmadd231pd m11, m15, m6                       ; m11 -= 30 u
+    subpd m11, m6                                   ; m11 -= 30 u[x]
     mulpd m11, m5
     vfmadd231pd m0, m11, [diff_coeffs20q + offsetq] ; res += d_xx u * diff_coeffs20
 
@@ -235,7 +237,7 @@
     vfmadd231pd m11, m13, m9                        ; m11 += 16 u[y-1]
     subpd m11, m8                                   ; m11 -= u[y+2]
     subpd m11, m10                                  ; m11 -= u[y-2]
-    vfnmadd231pd m11, m15, m6                       ; m11 -= 30 u
+    subpd m11, m6                                   ; m11 -= 30 u[x]
     mulpd m11, m4
     vfmadd231pd m0, m11, [diff_coeffs02q + offsetq] ; res += d_yy u * diff_coeffs02
 
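For reference, the derivative terms that both kernels vectorize, written out as a
scalar C sketch (not code from this repository). The s_* scale factors stand in for
the precomputed constants the asm keeps in m1-m5/m13/m15, whose values are not
visible in this patch; the dc[] ordering is invented for the sketch, and the
fourth-order x stencil is inferred from the "16 u[x-1]" / "30 u[x]" comments.

    /* Residual for one interior grid point with the 2nd-order (s1) stencils.
     * u points at u[x] in the current row, u_up/u_down at the same x in the
     * rows above/below; dc[] holds the diff_coeffs* values for this point. */
    double residual_s1_point(const double *u, const double *u_up,
                             const double *u_down, double rhs,
                             const double dc[6],
                             double s_x, double s_xx,
                             double s_y, double s_yy, double s_xy)
    {
        double u2  = 2.0 * u[0];                          /* "m6 = 2 * u[x]" */
        double res = -rhs;

        res += u[0]                               * dc[0]; /* u      */
        res += (u[1] - u[-1])             * s_x   * dc[1]; /* d_x  u */
        res += (u[1] + u[-1] - u2)        * s_xx  * dc[2]; /* d_xx u */
        res += (u_up[0] - u_down[0])      * s_y   * dc[3]; /* d_y  u */
        res += (u_up[0] + u_down[0] - u2) * s_yy  * dc[4]; /* d_yy u */
        res += (u_up[1] - u_up[-1]
              - u_down[1] + u_down[-1])   * s_xy  * dc[5]; /* d_xy u */
        return res;
    }

    /* 4th-order (s2) second derivative along x; precomputing 30*u[x] once lets
     * the same value be subtracted again for the y direction, which is what
     * the patch keeps in m6. */
    double d2x_s2(const double *u, double s_xx)
    {
        double u30 = 30.0 * u[0];                         /* "m6 = 30 u[x]" */
        return (16.0 * (u[1] + u[-1]) - u[2] - u[-2] - u30) * s_xx;
    }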