about | summary | refs | log | tree | commit | diff
path: root/residual_calc.asm
diff options
context:
space:
mode:
author Anton Khirnov <anton@khirnov.net> 2019-01-09 15:45:11 +0100
committer Anton Khirnov <anton@khirnov.net> 2019-01-10 09:14:21 +0100
commit 88db705983c7b518372c93ef50bac5e8ddb1e6bf (patch)
tree 9c6e9c55bdefd8209b2aee2dcb086d9879c4ba4a /residual_calc.asm
parent a5b6c8522595edea8bf5c544ebd88d36c8a75f05 (diff)
residual_calc.asm: make mm register use more consistent between s1 and s2
Diffstat (limited to 'residual_calc.asm')
-rw-r--r-- residual_calc.asm 54
1 files changed, 28 insertions, 26 deletions
diff --git a/residual_calc.asm b/residual_calc.asm
index 9b74cc8..7048da3 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -87,45 +87,45 @@ cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_co
subpd m0, [rhsq + offsetq] ; res = -rhs
; plain value
- movu m7, [uq + offsetq]
- vfmadd231pd m0, m7, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00
+ movu m6, [uq + offsetq] ; m6 = u[x]
+ vfmadd231pd m0, m6, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00
+
+ addpd m6, m6 ; m6 = 2 * u[x]
; dx, d2x
movu m8, [uq + offsetq + ELEM_SIZE]
movu m9, [uq + offsetq - ELEM_SIZE]
- subpd m6, m8, m9
- mulpd m6, m2
- vfmadd231pd m0, m6, [diff_coeffs10q + offsetq] ; res += d_x u * diff_coeffs10
+ subpd m7, m8, m9
+ mulpd m7, m2
+ vfmadd231pd m0, m7, [diff_coeffs10q + offsetq] ; res += d_x u * diff_coeffs10
- addpd m7, m7
-
- addpd m6, m8, m9
- subpd m6, m7
- mulpd m6, m5
- vfmadd231pd m0, m6, [diff_coeffs20q + offsetq] ; res += d_xx u * diff_coeffs20
+ addpd m7, m8, m9
+ subpd m7, m6
+ mulpd m7, m5
+ vfmadd231pd m0, m7, [diff_coeffs20q + offsetq] ; res += d_xx u * diff_coeffs20
; dy, d2y
movu m8, [u_upq + offsetq]
movu m9, [u_downq + offsetq]
- subpd m6, m8, m9
- mulpd m6, m1
- vfmadd231pd m0, m6, [diff_coeffs01q + offsetq] ; res += d_y u * diff_coeffs01
+ subpd m7, m8, m9
+ mulpd m7, m1
+ vfmadd231pd m0, m7, [diff_coeffs01q + offsetq] ; res += d_y u * diff_coeffs01
- addpd m6, m8, m9
- subpd m6, m7
- mulpd m6, m4
- vfmadd231pd m0, m6, [diff_coeffs02q + offsetq] ; res += d_yy u * diff_coeffs02
+ addpd m7, m8, m9
+ subpd m7, m6
+ mulpd m7, m4
+ vfmadd231pd m0, m7, [diff_coeffs02q + offsetq] ; res += d_yy u * diff_coeffs02
; mixed d2xy
- movu m7, [u_upq + offsetq + ELEM_SIZE]
- subpd m7, [u_upq + offsetq - ELEM_SIZE]
- subpd m7, [u_downq + offsetq + ELEM_SIZE]
- addpd m7, [u_downq + offsetq - ELEM_SIZE]
+ movu m6, [u_upq + offsetq + ELEM_SIZE]
+ subpd m6, [u_upq + offsetq - ELEM_SIZE]
+ subpd m6, [u_downq + offsetq + ELEM_SIZE]
+ addpd m6, [u_downq + offsetq - ELEM_SIZE]
- mulpd m8, m3, [diff_coeffs11q + offsetq]
- vfmadd231pd m0, m7, m8 ; res += d_xy u * diff_coeffs11
+ mulpd m6, m3
+ vfmadd231pd m0, m6, [diff_coeffs11q + offsetq] ; res += d_xy u * diff_coeffs11
; store the result
movu [dstq + offsetq], m0
@@ -194,6 +194,8 @@ cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, stride, u, rhs, diff_co
movu m6, [uq + offsetq]
vfmadd231pd m0, m6, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00
+ mulpd m6, m15 ; m6 = 30 u[x]
+
; dx, d2x
movu m7, [uq + offsetq + ELEM_SIZE] ; m7 = u[x+1]
movu m8, [uq + offsetq + ELEM_SIZE * 2] ; m8 = u[x+2]
@@ -212,7 +214,7 @@ cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, stride, u, rhs, diff_co
vfmadd231pd m11, m13, m9 ; m11 += 16 u[x-1]
subpd m11, m8 ; m11 -= u[x+2]
subpd m11, m10 ; m11 -= u[x-2]
- vfnmadd231pd m11, m15, m6 ; m11 -= 30 u
+ subpd m11, m6 ; m11 -= 30 u[x]
mulpd m11, m5
vfmadd231pd m0, m11, [diff_coeffs20q + offsetq] ; res += d_xx u * diff_coeffs20
@@ -235,7 +237,7 @@ cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, stride, u, rhs, diff_co
vfmadd231pd m11, m13, m9 ; m11 += 16 u[y-1]
subpd m11, m8 ; m11 -= u[y+2]
subpd m11, m10 ; m11 -= u[y-2]
- vfnmadd231pd m11, m15, m6 ; m11 -= 30 u
+ subpd m11, m6 ; m11 -= 30 u[x]
mulpd m11, m4
vfmadd231pd m0, m11, [diff_coeffs02q + offsetq] ; res += d_yy u * diff_coeffs02