;
; SIMD for interpolation
; Copyright 2019 Anton Khirnov
;
; This program is free software: you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation, either version 3 of the License, or
; (at your option) any later version.
;
; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this program. If not, see <https://www.gnu.org/licenses/>.
;/

%include "config.asm"
%include "x86inc.asm"

SECTION .text

;------------------------------------------------------------------------------
; transfer_interp_line_cont_4(dst, dst_len, src, src_stride,
;                             idx_x, fact_x, fact_y)
;
; Syntax/ABI: NASM syntax; prologue/epilogue and argument loading are handled
;             by the x86inc.asm cglobal/RET macros.
;
; For every output element i in [0, dst_len) this computes
;
;   dst[i] = sum_{j=0..3} fact_y[j] *
;            sum_{k=0..3} fact_x[4 * i + k] * src[idx_x[i] + j * src_stride + k]
;
; Element sizes as used by the code below:
;   dst        - one 8-byte double written per element
;   dst_len    - number of output elements, used as a full 64-bit register
;                (NOTE(review): presumably a 64-bit integer type in the C
;                prototype - confirm against the caller)
;   src        - doubles; idx_x[i] and src_stride are both counted in 8-byte
;                elements (each is scaled by 8 before being added to src)
;   idx_x      - one 8-byte integer per output element
;   fact_x     - four doubles (32 bytes) per output element
;   fact_y     - four doubles, shared by all output elements
;
; All vector loads are unaligned loads, so no alignment is required.
;
; Vector register roles:
;   m0..m3 - fact_y[0..3], each broadcast to all four lanes
;   m4     - fact_x for the current element
;   m5     - accumulator (one partial sum per lane)
;   m6     - scratch for the current row's fact_x * src product
;------------------------------------------------------------------------------
INIT_YMM fma3
cglobal transfer_interp_line_cont_4, 7, 8, 7, dst, dst_len, src, src_stride, idx_x, fact_x, fact_y,\
                                              idx_x_val
    shl src_strideq, 3                          ; row stride: elements -> bytes
    shl dst_lenq,    3                          ; line length: elements -> bytes

    ; point all per-element arrays at their end, so that a single negative
    ; offset counting up towards zero can index every one of them
    add dstq,    dst_lenq
    add idx_xq,  dst_lenq
    lea fact_xq, [fact_xq + 4 * dst_lenq]
    neg dst_lenq
    jz .done                                    ; empty line: nothing to read or write
    ; from now on, the register that held the line size is used as the offset into data arrays
    %define offsetq dst_lenq

    movu    m0, [fact_yq]
    vpermq  m1, m0, 01010101b                   ; fact y + 1 -> m1
    vpermq  m2, m0, 10101010b                   ; fact y + 2 -> m2
    vpermq  m3, m0, 11111111b                   ; fact y + 3 -> m3
    vpermq  m0, m0, 00000000b                   ; fact y + 0 -> m0

.loop:
    mov   idx_x_valq, [idx_xq + offsetq]        ; first source column for this element
    shl   idx_x_valq, 3                         ; elements -> bytes

    movu  m4, [fact_xq + 4 * offsetq]           ; four horizontal weights

    ; row 0: start the accumulator
    mulpd m5, m4, [srcq + idx_x_valq]
    mulpd m5, m0

    ; rows 1..3: step one source row down and accumulate
    add   idx_x_valq, src_strideq
    mulpd m6, m4, [srcq + idx_x_valq]
    vfmadd231pd m5, m6, m1

    add   idx_x_valq, src_strideq
    mulpd m6, m4, [srcq + idx_x_valq]
    vfmadd231pd m5, m6, m2

    add   idx_x_valq, src_strideq
    mulpd m6, m4, [srcq + idx_x_valq]
    vfmadd231pd m5, m6, m3

    ; horizontal sum of the four lanes into the lowest lane
    haddpd m5, m5                               ; (a+b, a+b, c+d, c+d)
    vpermq m5, m5, 00001000b                    ; low half becomes (a+b, c+d)
    haddpd m5, m5                               ; lane 0 = a+b+c+d

    movq [dstq + offsetq], xm5

    add offsetq, 8
    js .loop                                    ; continue while the offset is still negative

.done:
    RET