;
; SIMD for interpolation
; Copyright 2019 Anton Khirnov
;
; This program is free software: you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation, either version 3 of the License, or
; (at your option) any later version.
;
; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this program.  If not, see <https://www.gnu.org/licenses/>.
;/

%include "config.asm"
%include "x86inc.asm"
%include "util.asm"

SECTION .text

;------------------------------------------------------------------------------
; transfer_interp2d_line_cont_4(dst, dst_len, src, src_stride,
;                               idx_x, fact_x, fact_y)
;
; Computes one output line of doubles from a 4x4 neighbourhood per element:
;
;   dst[i] = sum_{j=0..3} fact_y[j] *
;            sum_{k=0..3} fact_x[4 * i + k] * src[idx_x[i] + j * src_stride + k]
;
; (this reading assumes "SPLATPD dst, src, n" from util.asm broadcasts
;  element n of src into every lane of dst -- confirm against util.asm)
;
; dst        - output line; one 8-byte value is stored per iteration
; dst_len    - number of output elements. The exit condition is only tested
;              after the loop body, so the body runs once even for 0;
;              callers must pass dst_len > 0.
; src        - source data, addressed in 8-byte elements
; src_stride - distance between consecutive source rows, in 8-byte elements
; idx_x      - one 64-bit source element index per output element
; fact_x     - 4 doubles per output element (horizontal weights)
; fact_y     - 4 doubles, loaded once for the whole line (vertical weights)
;
; dst_len and src_stride are consumed as full 64-bit register values.
;
; Vector registers: m0-m3 = broadcast fact_y[0..3], m4 = fact_x for the
; current element, m5 = accumulator, m6 = scratch. The count passed to
; cglobal must cover every register touched (7 here), because x86inc only
; preserves the Win64 callee-saved xmm6+ registers that lie below that count.
;------------------------------------------------------------------------------
INIT_YMM avx2
cglobal transfer_interp2d_line_cont_4, 7, 8, 7, dst, dst_len, src, src_stride, idx_x, fact_x, fact_y,\
                                                idx_x_val
    ; convert element counts to byte counts
    shl src_strideq, 3
    shl dst_lenq,    3

    ; point every per-element array at its end and walk a negative offset
    ; up towards zero, so the loop counter doubles as the array index
    add dstq,    dst_lenq
    add idx_xq,  dst_lenq
    lea fact_xq, [fact_xq + 4 * dst_lenq]   ; 4 doubles per output element
    neg dst_lenq
    ; from now on, the register that held the line size is used as the offset into data arrays
    %define offsetq dst_lenq

    movu     m0, [fact_yq]
    SPLATPD  m1, m0, 1                      ; fact y + 1 -> m1
    SPLATPD  m2, m0, 2                      ; fact y + 2 -> m2
    SPLATPD  m3, m0, 3                      ; fact y + 3 -> m3
    SPLATPD  m0, m0, 0                      ; fact y + 0 -> m0

.loop:
    mov idx_x_valq, [idx_xq + offsetq]
    shl idx_x_valq, 3                       ; element index -> byte offset

    movu m4, [fact_xq + 4 * offsetq]        ; fact x for this element

    ; row 0
    mulpd       m5, m4, [srcq + idx_x_valq]
    mulpd       m5, m0

    ; row 1
    add         idx_x_valq, src_strideq
    mulpd       m6, m4, [srcq + idx_x_valq]
    vfmadd231pd m5, m6, m1

    ; row 2
    add         idx_x_valq, src_strideq
    mulpd       m6, m4, [srcq + idx_x_valq]
    vfmadd231pd m5, m6, m2

    ; row 3
    add         idx_x_valq, src_strideq
    mulpd       m6, m4, [srcq + idx_x_valq]
    vfmadd231pd m5, m6, m3

    ; horizontal sum of the 4 lanes of m5 into lane 0
    haddpd m5, m5                           ; [a0+a1, a0+a1, a2+a3, a2+a3]
    vpermq m5, m5, 00001000b                ; lanes 0 and 2 -> lanes 0 and 1
    haddpd m5, m5

    movq [dstq + offsetq], xm5
    add  offsetq, 8
    js   .loop                              ; offset still negative -> more elements

    RET

;------------------------------------------------------------------------------
; transfer_interp2d_line_cont_6(dst, dst_len, src, src_stride,
;                               idx_x, fact_x, fact_y)
;
; Same operation as the 4-point variant, but over a 6x6 neighbourhood:
;
;   dst[i] = sum_{j=0..5} fact_y[j] *
;            sum_{k=0..5} fact_x[6 * i + k] * src[idx_x[i] + j * src_stride + k]
;
; (this reading assumes "SPLATPD dst, src, n" from util.asm broadcasts
;  element n of src into every lane of dst -- confirm against util.asm)
;
; dst        - output line; one 8-byte value is stored per iteration
; dst_len    - number of output elements. The exit condition is only tested
;              after the loop body, so the body runs once even for 0;
;              callers must pass dst_len > 0.
; src        - source data, addressed in 8-byte elements
; src_stride - distance between consecutive source rows, in 8-byte elements
; idx_x      - one 64-bit source element index per output element
; fact_x     - 6 doubles per output element (horizontal weights)
; fact_y     - 6 doubles, loaded once for the whole line (vertical weights)
;
; dst_len and src_stride are consumed as full 64-bit register values.
;
; Each 6-wide row is handled as a 4-wide ymm part plus a 2-wide xmm part.
; Vector registers: m0-m5 = broadcast fact_y[0..5], m6/xm7 = fact_x[0..3] /
; fact_x[4..5], m8/m9 = accumulators for the two parts, m10/m11 = scratch.
; The count passed to cglobal must cover every register touched (12 here),
; because x86inc only preserves the Win64 callee-saved xmm6+ registers that
; lie below that count.
;------------------------------------------------------------------------------
INIT_YMM avx2
cglobal transfer_interp2d_line_cont_6, 7, 9, 12, dst, dst_len, src, src_stride, idx_x, fact_x, fact_y,\
                                                 idx_x_val, offset6
    ; convert element counts to byte counts
    shl src_strideq, 3
    shl dst_lenq,    3

    ; point every per-element array at its end and walk negative offsets
    ; up towards zero
    add dstq,    dst_lenq
    add idx_xq,  dst_lenq
    lea fact_xq, [fact_xq + 4 * dst_lenq]
    lea fact_xq, [fact_xq + 2 * dst_lenq]   ; 4 + 2 = 6 doubles per output element
    neg dst_lenq
    ; from now on, the register that held the line size is used as the offset into data arrays
    %define offsetq dst_lenq

    ; offset6 = 6 * offset, the matching offset into fact_x
    lea offset6q, [offsetq + 2 * offsetq]
    add offset6q, offset6q

    movu     m0, [fact_yq]
    SPLATPD  m1, m0, 1                      ; fact y + 1 -> m1
    SPLATPD  m2, m0, 2                      ; fact y + 2 -> m2
    SPLATPD  m3, m0, 3                      ; fact y + 3 -> m3
    SPLATPD  m0, m0, 0                      ; fact y + 0 -> m0

    movu    xm4, [fact_yq + 8 * 4]
    SPLATPD  m5, m4, 1                      ; fact y + 5 -> m5
    SPLATPD  m4, m4, 0                      ; fact y + 4 -> m4

.loop:
    mov idx_x_valq, [idx_xq + offsetq]
    shl idx_x_valq, 3                       ; element index -> byte offset

    movu  m6, [fact_xq + offset6q]          ; fact x + 0..3
    movu xm7, [fact_xq + offset6q + mmsize] ; fact x + 4..5

    ; row 0; the xmm-width multiply leaves the upper half of m9 zeroed
    mulpd        m8,  m6, [srcq + idx_x_valq]
    mulpd       xm9, xm7, [srcq + idx_x_valq + mmsize]
    mulpd        m8,  m0
    mulpd        m9,  m0

    ; row 1
    add         idx_x_valq, src_strideq
    mulpd        m10,  m6, [srcq + idx_x_valq]
    vfmadd231pd  m8,  m10, m1
    mulpd       xm11, xm7, [srcq + idx_x_valq + mmsize]
    vfmadd231pd  m9,  m11, m1

    ; row 2
    add         idx_x_valq, src_strideq
    mulpd        m10,  m6, [srcq + idx_x_valq]
    vfmadd231pd  m8,  m10, m2
    mulpd       xm11, xm7, [srcq + idx_x_valq + mmsize]
    vfmadd231pd  m9,  m11, m2

    ; row 3
    add         idx_x_valq, src_strideq
    mulpd        m10,  m6, [srcq + idx_x_valq]
    vfmadd231pd  m8,  m10, m3
    mulpd       xm11, xm7, [srcq + idx_x_valq + mmsize]
    vfmadd231pd  m9,  m11, m3

    ; row 4
    add         idx_x_valq, src_strideq
    mulpd        m10,  m6, [srcq + idx_x_valq]
    vfmadd231pd  m8,  m10, m4
    mulpd       xm11, xm7, [srcq + idx_x_valq + mmsize]
    vfmadd231pd  m9,  m11, m4

    ; row 5
    add         idx_x_valq, src_strideq
    mulpd        m10,  m6, [srcq + idx_x_valq]
    vfmadd231pd  m8,  m10, m5
    mulpd       xm11, xm7, [srcq + idx_x_valq + mmsize]
    vfmadd231pd  m9,  m11, m5

    ; horizontal sum of the 4 lanes of m8 and the 2 low lanes of m9 into lane 0
    haddpd  m8,  m9                         ; [a0+a1, b0+b1, a2+a3, 0]
    vpermq  m9,  m8, 10b                    ; a2+a3 -> lane 0 of m9
    haddpd xm8, xm8                         ; lane 0 = a0+a1 + b0+b1
    addpd   m8,  m9

    movq [dstq + offsetq], xm8
    add  offsetq,  8
    add  offset6q, 8 * 6
    js   .loop                              ; offset6 = 6 * offset, so its sign tracks offset

    RET