diff options
Diffstat (limited to 'residual_calc.asm')
-rw-r--r-- | residual_calc.asm | 29 |
1 files changed, 27 insertions, 2 deletions
diff --git a/residual_calc.asm b/residual_calc.asm index 289c3fb..f6f34f3 100644 --- a/residual_calc.asm +++ b/residual_calc.asm @@ -208,10 +208,35 @@ SECTION .text RES_ADD_DIFF_MIXED stencil ; store the result - movu [dstq + offsetq], m0 add offsetq, mmsize - js .loop + jg .store_partial + ; store full block (movu leaves the flags from the add intact for js) + movu [dstq + offsetq - mmsize], m0 + js .loop + jmp .finish + +.store_partial: + sub offsetq, ELEM_SIZE + jz .store3 + sub offsetq, ELEM_SIZE + jz .store2 + +.store1: + ; one element left: offsetq = write offset + mmsize - 2 * ELEM_SIZE + movq [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0 + jmp .finish +.store2: + ; two elements left: offsetq = write offset + mmsize - 2 * ELEM_SIZE + movu [dstq + offsetq - mmsize + 2 * ELEM_SIZE], xm0 + jmp .finish +.store3: + ; three elements left: offsetq = write offset + mmsize - 1 * ELEM_SIZE + movu [dstq + offsetq - mmsize + 1 * ELEM_SIZE], xm0 + vextractf128 xm0, m0, 1 + movq [dstq + offsetq - mmsize + 3 * ELEM_SIZE], xm0 + +.finish: RET %endmacro |