From d215c872bbdf8ba733f9a9fbd586374b841fdfb5 Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Sat, 23 Mar 2019 17:55:57 +0100 Subject: Add a new separate module for grid transfers/interpolation. --- transfer_interp.asm | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 transfer_interp.asm (limited to 'transfer_interp.asm') diff --git a/transfer_interp.asm b/transfer_interp.asm new file mode 100644 index 0000000..a6ae60f --- /dev/null +++ b/transfer_interp.asm @@ -0,0 +1,74 @@ +; +; SIMD for interpolation +; Copyright 2019 Anton Khirnov +; +; This program is free software: you can redistribute it and/or modify +; it under the terms of the GNU General Public License as published by +; the Free Software Foundation, either version 3 of the License, or +; (at your option) any later version. +; +; This program is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +; GNU General Public License for more details. +; +; You should have received a copy of the GNU General Public License +; along with this program. If not, see <http://www.gnu.org/licenses/>.
;/

%include "config.asm"
%include "x86inc.asm"

SECTION .text

;------------------------------------------------------------------------------
; transfer_interp_line_cont_4
;
; Syntax: NASM/YASM with x86inc.asm abstraction macros (x86-64; calling
;         convention and prologue/epilogue are handled by cglobal/RET).
;
; For every output element i in [0, dst_len) computes
;
;   dst[i] = sum_{j=0..3} fact_y[j] *
;            sum_{k=0..3} fact_x[4 * i + k] * src[idx_x[i] + j * src_stride + k]
;
; i.e. a 4x4 weighted sum of a source patch whose top-left element is selected
; by idx_x[i].
;
; Arguments (as used by the code below):
;   dst        - output array, one 8-byte element per output point
;   dst_len    - number of output elements
;   src        - source data, addressed in 8-byte elements
;   src_stride - distance between successive source rows, in 8-byte elements
;   idx_x      - array of 64-bit source indices, one per output element
;   fact_x     - 4 interpolation weights per output element (32 bytes each)
;   fact_y     - 4 interpolation weights shared by the whole line (32 bytes)
;
; NOTE(review): presumably called from C with double* data pointers and
; ptrdiff_t lengths/strides/indices; dst_len, src_stride and the idx_x entries
; are consumed as full 64-bit values - confirm against the C prototype.
;
; Only AVX and FMA3 instructions are used, so the code matches the cpuflags it
; is built for (no AVX2-only instructions such as vpermq).
; Vector registers: m0-m3 fact_y broadcasts, m4 fact_x, m5 accumulator,
; m6 scratch -> 7 registers declared so that x86inc preserves the callee-saved
; xmm6 on WIN64.
;------------------------------------------------------------------------------
INIT_YMM fma3
cglobal transfer_interp_line_cont_4, 7, 8, 7, dst, dst_len, src, src_stride, idx_x, fact_x, fact_y,\
                                              idx_x_val
    ; convert element counts to byte counts (8 bytes per element)
    shl src_strideq, 3
    shl dst_lenq,    3

    ; point every per-output array at its end and walk a negative offset up
    ; to zero, so a single register serves as both index and loop counter
    add dstq,    dst_lenq
    add idx_xq,  dst_lenq
    lea fact_xq, [fact_xq + 4 * dst_lenq]   ; 4 weights (32 bytes) per output
    neg dst_lenq
    jns .finish                             ; dst_len <= 0: nothing to compute
    ; from now on, the register that held the line size is used as the offset into data arrays
    %define offsetq dst_lenq

    ; splat each of the four y weights across its own register
    vbroadcastsd m0, [fact_yq + 0 * 8]      ; fact y + 0 -> m0
    vbroadcastsd m1, [fact_yq + 1 * 8]      ; fact y + 1 -> m1
    vbroadcastsd m2, [fact_yq + 2 * 8]      ; fact y + 2 -> m2
    vbroadcastsd m3, [fact_yq + 3 * 8]      ; fact y + 3 -> m3

.loop:
    ; byte offset of the top-left source element for this output point
    mov idx_x_valq, [idx_xq + offsetq]
    shl idx_x_valq, 3

    movu m4, [fact_xq + 4 * offsetq]        ; the four x weights for this point

    ; row 0: m5 = (fact_x * src_row0) * fact_y[0]
    mulpd m5, m4, [srcq + idx_x_valq]
    mulpd m5, m0

    ; rows 1-3: m5 += (fact_x * src_rowN) * fact_y[N]
    add idx_x_valq, src_strideq
    mulpd m6, m4, [srcq + idx_x_valq]
    vfmadd231pd m5, m6, m1

    add idx_x_valq, src_strideq
    mulpd m6, m4, [srcq + idx_x_valq]
    vfmadd231pd m5, m6, m2

    add idx_x_valq, src_strideq
    mulpd m6, m4, [srcq + idx_x_valq]
    vfmadd231pd m5, m6, m3

    ; horizontal sum of the four lanes of m5, evaluated as (a + b) + (c + d)
    haddpd       m5,  m5                    ; [a+b, a+b, c+d, c+d]
    vextractf128 xm6, m5, 1                 ; [c+d, c+d]
    addpd        xm5, xm6                   ; low lane = (a+b) + (c+d)

    movq [dstq + offsetq], xm5
    add offsetq, 8
    js .loop                                ; offset still negative -> more output

.finish:
    RET