aboutsummaryrefslogtreecommitdiff
path: root/residual_calc.asm
diff options
context:
space:
mode:
Diffstat (limited to 'residual_calc.asm')
-rw-r--r--residual_calc.asm231
1 file changed, 229 insertions, 2 deletions
diff --git a/residual_calc.asm b/residual_calc.asm
index 5eea31c..471f864 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -32,8 +32,14 @@
SECTION .rodata
-const8: times 8 dq 8.0
-const30: times 8 dq 30.0
+const5: times 8 dq 5.0
+const8: times 8 dq 8.0
+const9: times 8 dq 9.0
+const10: times 8 dq 10.0
+const13_5: times 8 dq 13.5
+const30: times 8 dq 30.0
+const45: times 8 dq 45.0
+const245: times 8 dq 245.0
SECTION .text
@@ -282,3 +288,224 @@ RESIDUAL_CALC 2, 0
cglobal residual_add_line_s2, 7, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_down, u_up, u_up2
RESIDUAL_CALC 2, 1
+
+%macro DX3 3 ; %1 (out) = 45*(u[x+1]-u[x-1]) - 9*(u[x+2]-u[x-2]) + (u[x+3]-u[x-3]); %2 = scratch; %3 = address of u[x]
+ movu %1, [%3 + ELEM_SIZE * 1] ; out = u[x+1]
+ subpd %1, [%3 - ELEM_SIZE * 1] ; -u[x-1]
+
+ movu %2, [%3 + ELEM_SIZE * 2] ; tmp = u[x+2]
+ subpd %2, [%3 - ELEM_SIZE * 2] ; -u[x-2]
+
+ vfmsub132pd %1, %2, [const5] ; out = 5 * out - tmp
+
+ movu %2, [%3 + ELEM_SIZE * 3] ; tmp = u[x+3]
+ subpd %2, [%3 - ELEM_SIZE * 3] ; -u[x-3]
+
+ vfmadd132pd %1, %2, [const9] ; out = 9 * out + tmp (unnormalized first-derivative stencil)
+%endmacro
+
+%macro RES_ADD_DIFF_X_3 0 ; accumulate the d_x and d_xx terms into m0; expects m6 = u[x]
+ ; first derivative
+ DX3 m7, m8, uq + offsetq
+ vfmadd231pd m0, m7, [diff_coeffs10q + offsetq] ; res += d_x u * diff_coeffs10
+
+ ; second derivative
+ movu m7, [uq + offsetq + ELEM_SIZE * 1] ; m7 = u[x+1]
+ addpd m7, [uq + offsetq - ELEM_SIZE * 1] ; +u[x-1]
+
+ movu m8, [uq + offsetq + ELEM_SIZE * 2] ; m8 = u[x+2]
+ addpd m8, [uq + offsetq - ELEM_SIZE * 2] ; +u[x-2]
+
+ vfmsub132pd m7, m8, [const10] ; m7 = 10 * m7 - m8
+
+ movu m8, [uq + offsetq + ELEM_SIZE * 3] ; m8 = u[x+3]
+ addpd m8, [uq + offsetq - ELEM_SIZE * 3] ; +u[x-3]
+
+ vfmadd132pd m7, m8, [const13_5] ; m7 = 13.5 * m7 + m8
+ vfnmadd231pd m7, m6, [const245] ; m7 = m7 - 245 * u[x] (m6 holds the centre value)
+ vfmadd231pd m0, m7, [diff_coeffs20q + offsetq]; res += d_xx u * diff_coeffs20
+%endmacro
+
+%macro RES_ADD_DIFF_Y_3 0 ; accumulate the d_y and d_yy terms into m0; expects m6 = u[x]; clobbers u_upq/u_downq
+ ; first derivative
+ lea u_upq, [uq + strideq]
+ mov u_downq, uq
+ sub u_downq, strideq
+ movu m7, [u_upq + offsetq] ; m7 = u[y+1]
+ subpd m7, [u_downq + offsetq] ; -u[y-1]
+
+ add u_upq, strideq
+ sub u_downq, strideq
+ movu m8, [u_upq + offsetq] ; m8 = u[y+2]
+ subpd m8, [u_downq + offsetq] ; -u[y-2]
+
+ vfmsub132pd m7, m8, [const5] ; m7 = 5 * m7 - m8
+
+ add u_upq, strideq
+ sub u_downq, strideq
+ movu m8, [u_upq + offsetq] ; m8 = u[y+3]
+ subpd m8, [u_downq + offsetq] ; -u[y-3]
+
+ vfmadd132pd m7, m8, [const9] ; m7 = 9 * m7 + m8
+ vfmadd231pd m0, m7, [diff_coeffs01q + offsetq] ; res += d_y u * diff_coeffs01
+
+ ; second derivative
+ lea u_upq, [uq + strideq]
+ mov u_downq, uq
+ sub u_downq, strideq
+ movu m7, [u_upq + offsetq] ; m7 = u[y+1]
+ addpd m7, [u_downq + offsetq] ; +u[y-1]
+
+ add u_upq, strideq
+ sub u_downq, strideq
+ movu m8, [u_upq + offsetq] ; m8 = u[y+2]
+ addpd m8, [u_downq + offsetq] ; +u[y-2]
+
+ vfmsub132pd m7, m8, [const10] ; m7 = 10 * m7 - m8
+
+ add u_upq, strideq
+ sub u_downq, strideq
+ movu m8, [u_upq + offsetq] ; m8 = u[y+3]
+ addpd m8, [u_downq + offsetq] ; +u[y-3]
+
+ vfmadd132pd m7, m8, [const13_5] ; m7 = 13.5 * m7 + m8
+ vfnmadd231pd m7, m6, [const245] ; m7 = m7 - 245 * u[x] (m6 holds the centre value)
+ vfmadd231pd m0, m7, [diff_coeffs02q + offsetq]; res += d_yy u * diff_coeffs02
+%endmacro
+
+%macro RES_ADD_DIFF_MIXED_3 0 ; accumulate the mixed d_xy term into m0; NOTE: overwrites m6 (the cached u[x])
+ lea u_upq, [uq + 2 * strideq]
+ add u_upq, strideq
+ DX3 m6, m7, u_upq + offsetq ; m6 = dx u[y + 3]
+
+ sub u_upq, strideq
+ DX3 m7, m8, u_upq + offsetq ; m7 = dx u[y + 2]
+ vfnmadd231pd m6, m7, [const9] ; m6 = m6 - 9 * m7
+
+ sub u_upq, strideq
+ DX3 m7, m8, u_upq + offsetq ; m7 = dx u[y + 1]
+ vfmadd231pd m6, m7, [const45] ; m6 = m6 + 45 * m7
+
+ sub u_upq, strideq
+ sub u_upq, strideq ; double step: skip the y + 0 row
+ DX3 m7, m8, u_upq + offsetq ; m7 = dx u[y - 1]
+ vfnmadd231pd m6, m7, [const45] ; m6 = m6 - 45 * m7
+
+ sub u_upq, strideq
+ DX3 m7, m8, u_upq + offsetq ; m7 = dx u[y - 2]
+ vfmadd231pd m6, m7, [const9] ; m6 = m6 + 9 * m7
+
+ sub u_upq, strideq
+ DX3 m7, m8, u_upq + offsetq ; m7 = dx u[y - 3]
+ subpd m6, m7 ; m6 = m6 - m7
+ vfmadd231pd m0, m6, [diff_coeffs11q + offsetq] ; res += d_xy u * diff_coeffs11
+%endmacro
+
+%macro RESIDUAL_CALC_3 2 ; %2 nonzero: "add" variant, dst also accumulates u * m2
+%if %2
+ vpermq m2, m1, 0 ; broadcast qword 0 of m1 to all lanes
+%endif
+ vpermq m1, m0, 0 ; broadcast qword 0 of m0 to all lanes
+
+ ; compute the mask for absolute value
+ pcmpeqq m13, m13 ; m13 = all ones
+ psrlq m13, 1 ; m13 = 0x7fff... (clears the sign bit)
+ movu m12, [res_maxq] ; m12 = running residual maximum
+
+ ; load pointers to the equation coefficients
+ %define diff_coeffs20q diff_coeffsq ; reuse the array register to store the last pointer
+ mov diff_coeffs00q, [diff_coeffsq + OFF_DIFF_COEFF_00]
+ mov diff_coeffs01q, [diff_coeffsq + OFF_DIFF_COEFF_01]
+ mov diff_coeffs10q, [diff_coeffsq + OFF_DIFF_COEFF_10]
+ mov diff_coeffs11q, [diff_coeffsq + OFF_DIFF_COEFF_11]
+ mov diff_coeffs02q, [diff_coeffsq + OFF_DIFF_COEFF_02]
+ mov diff_coeffs20q, [diff_coeffsq + OFF_DIFF_COEFF_20]
+
+ ; setup the data pointers and the loop counter
+ shl strideq, 3 ; stride: elements -> bytes
+ shl linesizeq, 3 ; linesize: elements -> bytes
+ add dstq, linesizeq
+ add uq, linesizeq
+ add rhsq, linesizeq
+ add diff_coeffs00q, linesizeq
+ add diff_coeffs01q, linesizeq
+ add diff_coeffs10q, linesizeq
+ add diff_coeffs11q, linesizeq
+ add diff_coeffs02q, linesizeq
+ add diff_coeffs20q, linesizeq
+ neg linesizeq ; offset runs from -linesize up to 0
+ ; from now on, the register that held linesize is used as the offset into data arrays
+ %define offsetq linesizeq
+.loop:
+ xorpd m0, m0
+ subpd m0, [rhsq + offsetq] ; res = -rhs
+
+ ; plain value
+ movu m6, [uq + offsetq] ; m6 = u[x]
+ vfmadd231pd m0, m6, [diff_coeffs00q + offsetq] ; res += u * diff_coeffs00
+%if %2
+ mulpd m3, m6, m2 ; m3 = u * m2
+%endif
+
+ RES_ADD_DIFF_X_3
+ RES_ADD_DIFF_Y_3
+ RES_ADD_DIFF_MIXED_3
+
+ andpd m6, m0, m13 ; m6 = abs(res)
+ mulpd m0, m1 ; res *= m1
+%if %2
+ addpd m0, m3 ; res += u * m2
+%endif
+
+ ; store the result
+ add offsetq, mmsize
+ jg .store_partial ; offset > 0: fewer than a full block was left
+
+ ; store full block
+ movu [dstq + offsetq - mmsize], m0
+ maxpd m12, m6 ; fold abs(res) into the running maximum
+ js .loop ; offset < 0: more blocks remain (flags from the add above)
+ jmp .finish
+
+.store_partial:
+ sub offsetq, ELEM_SIZE
+ jz .store3 ; overshot by 1 element -> 3 remain
+ sub offsetq, ELEM_SIZE
+ jz .store2 ; overshot by 2 elements -> 2 remain
+
+.store1:
+ ; dstq + offsetq is now 2 elements past the write position
+ movq [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0 ; store 1 element
+
+ vpermq m6, m6, 0 ; broadcast lane 0 (the only valid lane)
+ maxpd m12, m6
+
+ jmp .finish
+.store2:
+ ; dstq + offsetq is now 2 elements past the write position
+ movu [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0 ; store 2 elements
+ mova xm6, xm6 ; zero lanes 2-3 (VEX move clears the upper half); safe since m6 = abs(res) >= 0
+ maxpd m12, m6
+ jmp .finish
+.store3:
+ ; dstq + offsetq is now 3 elements past the write position
+ movu [dstq + offsetq - mmsize + 1 * ELEM_SIZE], xm0 ; store elements 0-1
+ vextractf128 xm0, m0, 1
+ movq [dstq + offsetq - mmsize + 3 * ELEM_SIZE], xm0 ; store element 2
+
+ vpermq m6, m6, 10100100b ; lanes [0, 1, 2, 2]: drop the invalid lane 3
+ maxpd m12, m6
+
+.finish:
+ movu [res_maxq], m12 ; write back the updated maximum
+ RET
+%endmacro
+
+INIT_YMM fma3 ; 256-bit YMM build; presumably x86inc cpuflags suffix fma3 — TODO confirm against x86inc.asm
+cglobal residual_calc_line_s3, 7, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
+ diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_down
+RESIDUAL_CALC_3 3, 0 ; %2 = 0: dst = res * m1
+
+cglobal residual_add_line_s3, 7, 15, 16, linesize, dst, res_max, stride, u, rhs, diff_coeffs,\
+ diff_coeffs00, diff_coeffs01, diff_coeffs10, diff_coeffs11, diff_coeffs02, u_up, u_down
+RESIDUAL_CALC_3 3, 1 ; %2 = 1: dst = res * m1 + u * m2