From a5b6c8522595edea8bf5c544ebd88d36c8a75f05 Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Wed, 9 Jan 2019 15:31:08 +0100 Subject: residual_calc.asm: make register use in s1 more similar to s2 --- residual_calc.asm | 56 +++++++++++++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 28 deletions(-) (limited to 'residual_calc.asm') diff --git a/residual_calc.asm b/residual_calc.asm index 2dd5f7c..9b74cc8 100644 --- a/residual_calc.asm +++ b/residual_calc.asm @@ -70,12 +70,12 @@ cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_co ; load and splat the finite difference factors movu m0, [fd_factorsq + OFF_DIFF_COEFF_01] - vpermq m7, m0, 00000000b ; diff factor 01 -> m7 - vpermq m8, m0, 01010101b ; diff factor 10 -> m8 - vpermq m9, m0, 10101010b ; diff factor 11 -> m9 - vpermq m10, m0, 11111111b ; diff factor 02 -> m10 + vpermq m1, m0, 00000000b ; diff factor 01 -> m1 + vpermq m2, m0, 01010101b ; diff factor 10 -> m2 + vpermq m3, m0, 10101010b ; diff factor 11 -> m3 + vpermq m4, m0, 11111111b ; diff factor 02 -> m4 movq xm0, [fd_factorsq + OFF_DIFF_COEFF_20] - vpermq m11, m0, 00000000b ; diff factor 20 -> m11 + vpermq m5, m0, 00000000b ; diff factor 20 -> m5 ; setup pointers to the line above and below lea u_upq, [uq + strideq] @@ -87,45 +87,45 @@ cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_co subpd m0, [rhsq + offsetq] ; res = -rhs ; plain value - movu m1, [uq + offsetq] - vfmadd231pd m0, m1, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00 + movu m7, [uq + offsetq] + vfmadd231pd m0, m7, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00 ; dx, d2x - movu m2, [uq + offsetq + ELEM_SIZE] - movu m3, [uq + offsetq - ELEM_SIZE] + movu m8, [uq + offsetq + ELEM_SIZE] + movu m9, [uq + offsetq - ELEM_SIZE] - subpd m6, m2, m3 - mulpd m6, m8 + subpd m6, m8, m9 + mulpd m6, m2 vfmadd231pd m0, m6, [diff_coeffs10q + offsetq] ; res += d_x u * diff_coeffs10 - addpd m1, m1 + addpd m7, m7 - addpd m6, m2, m3 - subpd m6, m1 - mulpd m6, m11 + addpd m6, m8, m9 + subpd m6, m7 + mulpd m6, m5 vfmadd231pd m0, m6, [diff_coeffs20q + offsetq] ; res += d_xx u * diff_coeffs20 ; dy, d2y - movu m2, [u_upq + offsetq] - movu m3, [u_downq + offsetq] + movu m8, [u_upq + offsetq] + movu m9, [u_downq + offsetq] - subpd m6, m2, m3 - mulpd m6, m7 + subpd m6, m8, m9 + mulpd m6, m1 vfmadd231pd m0, m6, [diff_coeffs01q + offsetq] ; res += d_y u * diff_coeffs01 - addpd m6, m2, m3 - subpd m6, m1 - mulpd m6, m10 + addpd m6, m8, m9 + subpd m6, m7 + mulpd m6, m4 vfmadd231pd m0, m6, [diff_coeffs02q + offsetq] ; res += d_yy u * diff_coeffs02 ; mixed d2xy - movu m1, [u_upq + offsetq + ELEM_SIZE] - subpd m1, [u_upq + offsetq - ELEM_SIZE] - subpd m1, [u_downq + offsetq + ELEM_SIZE] - addpd m1, [u_downq + offsetq - ELEM_SIZE] + movu m7, [u_upq + offsetq + ELEM_SIZE] + subpd m7, [u_upq + offsetq - ELEM_SIZE] + subpd m7, [u_downq + offsetq + ELEM_SIZE] + addpd m7, [u_downq + offsetq - ELEM_SIZE] - mulpd m2, m9, [diff_coeffs11q + offsetq] - vfmadd231pd m0, m1, m2 ; res += d_xy u * diff_coeffs11 + mulpd m8, m3, [diff_coeffs11q + offsetq] + vfmadd231pd m0, m7, m8 ; res += d_xy u * diff_coeffs11 ; store the result movu [dstq + offsetq], m0 -- cgit v1.2.3