aboutsummaryrefslogtreecommitdiff
path: root/residual_calc.asm
diff options
context:
space:
mode:
author: Anton Khirnov <anton@khirnov.net> 2019-01-09 15:19:58 +0100
committer: Anton Khirnov <anton@khirnov.net> 2019-01-10 09:14:21 +0100
commit: 4dddc9813215eefb2f637fa62d8e165ad5a8ecaa (patch)
tree: 807d7eecbfcfd6f7c0aa48fc753901f9ae6fdff2 /residual_calc.asm
parent: a4ce9ca28b1b7b3943b0e7f1b62d1024525e053c (diff)
residual_calc.asm: reduce the use of magic constants
Diffstat (limited to 'residual_calc.asm')
-rw-r--r-- residual_calc.asm 75
1 files changed, 40 insertions, 35 deletions
diff --git a/residual_calc.asm b/residual_calc.asm
index 638ff42..2dd5f7c 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -19,6 +19,17 @@
%include "config.asm"
%include "x86inc.asm"
+; double precision
+%define ELEM_SIZE 8
+
+; offsets to FD coefficients for given derivative
+%define OFF_DIFF_COEFF_00 0 * gprsize
+%define OFF_DIFF_COEFF_01 1 * gprsize
+%define OFF_DIFF_COEFF_10 2 * gprsize
+%define OFF_DIFF_COEFF_11 3 * gprsize
+%define OFF_DIFF_COEFF_02 4 * gprsize
+%define OFF_DIFF_COEFF_20 5 * gprsize
+
SECTION .rodata
const8: times 8 dq 8.0
@@ -27,12 +38,6 @@ const30: times 8 dq 30.0
const64: times 8 dq 64.0
SECTION .text
-%define OFF_DIFF_COEFF_00 0 * 8
-%define OFF_DIFF_COEFF_01 1 * 8
-%define OFF_DIFF_COEFF_10 2 * 8
-%define OFF_DIFF_COEFF_11 3 * 8
-%define OFF_DIFF_COEFF_02 4 * 8
-%define OFF_DIFF_COEFF_20 5 * 8
INIT_YMM fma3
cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_coeffs, fd_factors,\
@@ -86,8 +91,8 @@ cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_co
vfmadd231pd m0, m1, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00
; dx, d2x
- movu m2, [uq + offsetq + 8]
- movu m3, [uq + offsetq - 8]
+ movu m2, [uq + offsetq + ELEM_SIZE]
+ movu m3, [uq + offsetq - ELEM_SIZE]
subpd m6, m2, m3
mulpd m6, m8
@@ -114,10 +119,10 @@ cglobal residual_calc_line_s1, 7, 13, 12, linesize, dst, stride, u, rhs, diff_co
vfmadd231pd m0, m6, [diff_coeffs02q + offsetq] ; res += d_yy u * diff_coeffs02
; mixed d2xy
- movu m1, [u_upq + offsetq + 8]
- subpd m1, [u_upq + offsetq - 8]
- subpd m1, [u_downq + offsetq + 8]
- addpd m1, [u_downq + offsetq - 8]
+ movu m1, [u_upq + offsetq + ELEM_SIZE]
+ subpd m1, [u_upq + offsetq - ELEM_SIZE]
+ subpd m1, [u_downq + offsetq + ELEM_SIZE]
+ addpd m1, [u_downq + offsetq - ELEM_SIZE]
mulpd m2, m9, [diff_coeffs11q + offsetq]
vfmadd231pd m0, m1, m2 ; res += d_xy u * diff_coeffs11
@@ -190,10 +195,10 @@ cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, stride, u, rhs, diff_co
vfmadd231pd m0, m6, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00
; dx, d2x
- movu m7, [uq + offsetq + 8] ; m7 = u[x+1]
- movu m8, [uq + offsetq + 8 * 2] ; m8 = u[x+2]
- movu m9, [uq + offsetq - 8] ; m9 = u[x-1]
- movu m10, [uq + offsetq - 8 * 2] ; m10 = u[x-2]
+ movu m7, [uq + offsetq + ELEM_SIZE] ; m7 = u[x+1]
+ movu m8, [uq + offsetq + ELEM_SIZE * 2] ; m8 = u[x+2]
+ movu m9, [uq + offsetq - ELEM_SIZE] ; m9 = u[x-1]
+ movu m10, [uq + offsetq - ELEM_SIZE * 2] ; m10 = u[x-2]
mulpd m11, m14, m7 ; m11 = 8 u[x+1]
vfnmadd231pd m11, m14, m9 ; m11 -= 8 u[x-1]
@@ -236,25 +241,25 @@ cglobal residual_calc_line_s2, 7, 15, 16, linesize, dst, stride, u, rhs, diff_co
vfmadd231pd m0, m11, [diff_coeffs02q + offsetq] ; res += d_yy u * diff_coeffs02
; mixed d2xy
- movu m6, [u_up2q + offsetq + 2 * 8] ; m6 = u[y+2, x+2]
- vfnmadd231pd m6, m14, [u_up2q + offsetq + 1 * 8] ; m6 -= 8 u[y+2, x+1]
- vfmadd231pd m6, m14, [u_up2q + offsetq - 1 * 8] ; m6 += 8 u[y+2, x-1]
- subpd m6, [u_up2q + offsetq - 2 * 8] ; m6 -= u[y+2, x-2]
-
- vfnmadd231pd m6, m14, [u_upq + offsetq + 2 * 8] ; m6 -= 8 u[y+1, x+2]
- vfmadd231pd m6, m12, [u_upq + offsetq + 1 * 8] ; m6 += 64 u[y+1, x+1]
- vfnmadd231pd m6, m12, [u_upq + offsetq - 1 * 8] ; m6 -= 64 u[y+1, x-1]
- vfmadd231pd m6, m14, [u_upq + offsetq - 2 * 8] ; m6 += 8 u[y+1, x-2]
-
- vfmadd231pd m6, m14, [u_downq + offsetq + 2 * 8] ; m6 += 8 u[y-1, x+2]
- vfnmadd231pd m6, m12, [u_downq + offsetq + 1 * 8] ; m6 -= 64 u[y-1, x+1]
- vfmadd231pd m6, m12, [u_downq + offsetq - 1 * 8] ; m6 += 64 u[y-1, x-1]
- vfnmadd231pd m6, m14, [u_downq + offsetq - 2 * 8] ; m6 -= 8 u[y-1, x-2]
-
- subpd m6, [u_down2q + offsetq + 2 * 8] ; m6 -= u[y-2, x+2]
- vfmadd231pd m6, m14, [u_down2q + offsetq + 1 * 8] ; m6 += 8 u[y-2, x+1]
- vfnmadd231pd m6, m14, [u_down2q + offsetq - 1 * 8] ; m6 += 8 u[y-2, x-1]
- addpd m6, [u_down2q + offsetq - 2 * 8] ; m6 += u[y-2, x-2]
+ movu m6, [u_up2q + offsetq + 2 * ELEM_SIZE] ; m6 = u[y+2, x+2]
+ vfnmadd231pd m6, m14, [u_up2q + offsetq + 1 * ELEM_SIZE] ; m6 -= 8 u[y+2, x+1]
+ vfmadd231pd m6, m14, [u_up2q + offsetq - 1 * ELEM_SIZE] ; m6 += 8 u[y+2, x-1]
+ subpd m6, [u_up2q + offsetq - 2 * ELEM_SIZE] ; m6 -= u[y+2, x-2]
+
+ vfnmadd231pd m6, m14, [u_upq + offsetq + 2 * ELEM_SIZE] ; m6 -= 8 u[y+1, x+2]
+ vfmadd231pd m6, m12, [u_upq + offsetq + 1 * ELEM_SIZE] ; m6 += 64 u[y+1, x+1]
+ vfnmadd231pd m6, m12, [u_upq + offsetq - 1 * ELEM_SIZE] ; m6 -= 64 u[y+1, x-1]
+ vfmadd231pd m6, m14, [u_upq + offsetq - 2 * ELEM_SIZE] ; m6 += 8 u[y+1, x-2]
+
+ vfmadd231pd m6, m14, [u_downq + offsetq + 2 * ELEM_SIZE] ; m6 += 8 u[y-1, x+2]
+ vfnmadd231pd m6, m12, [u_downq + offsetq + 1 * ELEM_SIZE] ; m6 -= 64 u[y-1, x+1]
+ vfmadd231pd m6, m12, [u_downq + offsetq - 1 * ELEM_SIZE] ; m6 += 64 u[y-1, x-1]
+ vfnmadd231pd m6, m14, [u_downq + offsetq - 2 * ELEM_SIZE] ; m6 -= 8 u[y-1, x-2]
+
+ subpd m6, [u_down2q + offsetq + 2 * ELEM_SIZE] ; m6 -= u[y-2, x+2]
+ vfmadd231pd m6, m14, [u_down2q + offsetq + 1 * ELEM_SIZE] ; m6 += 8 u[y-2, x+1]
+ vfnmadd231pd m6, m14, [u_down2q + offsetq - 1 * ELEM_SIZE] ; m6 -= 8 u[y-2, x-1]
+ addpd m6, [u_down2q + offsetq - 2 * ELEM_SIZE] ; m6 += u[y-2, x-2]
mulpd m6, m3
vfmadd231pd m0, m6, [diff_coeffs11q + offsetq] ; res += d_xy u * diff_coeffs11