aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--residual_calc.asm29
1 files changed, 27 insertions, 2 deletions
diff --git a/residual_calc.asm b/residual_calc.asm
index 289c3fb..f6f34f3 100644
--- a/residual_calc.asm
+++ b/residual_calc.asm
@@ -208,10 +208,35 @@ SECTION .text
RES_ADD_DIFF_MIXED stencil
; store the result
- movu [dstq + offsetq], m0
add offsetq, mmsize
- js .loop
+ jg .store_partial
+ ; store full block
+ movu [dstq + offsetq - mmsize], m0
+ js .loop
+ jmp .finish
+
+.store_partial:
+ sub offsetq, ELEM_SIZE
+ jz .store1
+ sub offsetq, ELEM_SIZE
+ jz .store2
+
+.store3:
+ ; offsetq is now (mmsize - 2 * ELEM_SIZE) bytes past the write position
+ movu [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0
+ vextractf128 xm0, m0, 1
+ movq [dstq + offsetq - mmsize + 4 * ELEM_SIZE], xm0
+ jmp .finish
+.store2:
+ ; offsetq is now (mmsize - 2 * ELEM_SIZE) bytes past the write position
+ movu [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0
+ jmp .finish
+.store1:
+ ; offsetq is now (mmsize - ELEM_SIZE) bytes past the write position
+ movq [dstq + offsetq - mmsize + ELEM_SIZE], xm0
+
+.finish:
RET
%endmacro