about summary refs log tree commit diff
path: root/residual_calc.asm
diff options
context:
space:
mode:
author    Anton Khirnov <anton@khirnov.net>  2019-01-09 15:31:08 +0100
committer Anton Khirnov <anton@khirnov.net>  2019-01-10 09:14:21 +0100
commita5b6c8522595edea8bf5c544ebd88d36c8a75f05 (patch)
treead6c2a375094d85a68ccf0e9fbccacd3c3f935b1 /residual_calc.asm
parent4dddc9813215eefb2f637fa62d8e165ad5a8ecaa (diff)
residual_calc.asm: make register use in s1 more similar to s2
Diffstat (limited to 'residual_calc.asm')
-rw-r--r--  residual_calc.asm  56
1 file changed, 28 insertions(+), 28 deletions(-)
diff --git a/residual_calc.asm b/residual_calc.asm
index 2dd5f7c..9b74cc8 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -70,12 +70,12 @@ cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_co
; load and splat the finite difference factors
movu m0, [fd_factorsq + OFF_DIFF_COEFF_01]
- vpermq m7, m0, 00000000b ; diff factor 01 -> m7
- vpermq m8, m0, 01010101b ; diff factor 10 -> m8
- vpermq m9, m0, 10101010b ; diff factor 11 -> m9
- vpermq m10, m0, 11111111b ; diff factor 02 -> m10
+ vpermq m1, m0, 00000000b ; diff factor 01 -> m1
+ vpermq m2, m0, 01010101b ; diff factor 10 -> m2
+ vpermq m3, m0, 10101010b ; diff factor 11 -> m3
+ vpermq m4, m0, 11111111b ; diff factor 02 -> m4
movq xm0, [fd_factorsq + OFF_DIFF_COEFF_20]
- vpermq m11, m0, 00000000b ; diff factor 20 -> m11
+ vpermq m5, m0, 00000000b ; diff factor 20 -> m5
; setup pointers to the line above and below
lea u_upq, [uq + strideq]
@@ -87,45 +87,45 @@ cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_co
subpd m0, [rhsq + offsetq] ; res = -rhs
; plain value
- movu m1, [uq + offsetq]
- vfmadd231pd m0, m1, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00
+ movu m7, [uq + offsetq]
+ vfmadd231pd m0, m7, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00
; dx, d2x
- movu m2, [uq + offsetq + ELEM_SIZE]
- movu m3, [uq + offsetq - ELEM_SIZE]
+ movu m8, [uq + offsetq + ELEM_SIZE]
+ movu m9, [uq + offsetq - ELEM_SIZE]
- subpd m6, m2, m3
- mulpd m6, m8
+ subpd m6, m8, m9
+ mulpd m6, m2
vfmadd231pd m0, m6, [diff_coeffs10q + offsetq] ; res += d_x u * diff_coeffs10
- addpd m1, m1
+ addpd m7, m7
- addpd m6, m2, m3
- subpd m6, m1
- mulpd m6, m11
+ addpd m6, m8, m9
+ subpd m6, m7
+ mulpd m6, m5
vfmadd231pd m0, m6, [diff_coeffs20q + offsetq] ; res += d_xx u * diff_coeffs20
; dy, d2y
- movu m2, [u_upq + offsetq]
- movu m3, [u_downq + offsetq]
+ movu m8, [u_upq + offsetq]
+ movu m9, [u_downq + offsetq]
- subpd m6, m2, m3
- mulpd m6, m7
+ subpd m6, m8, m9
+ mulpd m6, m1
vfmadd231pd m0, m6, [diff_coeffs01q + offsetq] ; res += d_y u * diff_coeffs01
- addpd m6, m2, m3
- subpd m6, m1
- mulpd m6, m10
+ addpd m6, m8, m9
+ subpd m6, m7
+ mulpd m6, m4
vfmadd231pd m0, m6, [diff_coeffs02q + offsetq] ; res += d_yy u * diff_coeffs02
; mixed d2xy
- movu m1, [u_upq + offsetq + ELEM_SIZE]
- subpd m1, [u_upq + offsetq - ELEM_SIZE]
- subpd m1, [u_downq + offsetq + ELEM_SIZE]
- addpd m1, [u_downq + offsetq - ELEM_SIZE]
+ movu m7, [u_upq + offsetq + ELEM_SIZE]
+ subpd m7, [u_upq + offsetq - ELEM_SIZE]
+ subpd m7, [u_downq + offsetq + ELEM_SIZE]
+ addpd m7, [u_downq + offsetq - ELEM_SIZE]
- mulpd m2, m9, [diff_coeffs11q + offsetq]
- vfmadd231pd m0, m1, m2 ; res += d_xy u * diff_coeffs11
+ mulpd m8, m3, [diff_coeffs11q + offsetq]
+ vfmadd231pd m0, m7, m8 ; res += d_xy u * diff_coeffs11
; store the result
movu [dstq + offsetq], m0