aboutsummaryrefslogtreecommitdiff
path: root/transfer_interp.asm
diff options
context:
space:
mode:
author: Anton Khirnov <anton@khirnov.net> 2019-03-23 17:55:57 +0100
committer: Anton Khirnov <anton@khirnov.net> 2019-03-23 17:55:57 +0100
commit: d215c872bbdf8ba733f9a9fbd586374b841fdfb5 (patch)
tree: fb0072a8dc35179ef35d86c6fbb057b6c504445c /transfer_interp.asm
parent: bd178d67da6a8c30b3ccbd020be1b00f42eceb53 (diff)
Add a new separate module for grid transfers/interpolation.
Diffstat (limited to 'transfer_interp.asm')
-rw-r--r-- transfer_interp.asm | 74
1 files changed, 74 insertions, 0 deletions
diff --git a/transfer_interp.asm b/transfer_interp.asm
new file mode 100644
index 0000000..a6ae60f
--- /dev/null
+++ b/transfer_interp.asm
@@ -0,0 +1,74 @@
+;
+; SIMD for interpolation
+; Copyright 2019 Anton Khirnov <anton@khirnov.net>
+;
+; This program is free software: you can redistribute it and/or modify
+; it under the terms of the GNU General Public License as published by
+; the Free Software Foundation, either version 3 of the License, or
+; (at your option) any later version.
+;
+; This program is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; GNU General Public License for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with this program. If not, see <http://www.gnu.org/licenses/>.
+;/
+
+%include "config.asm"
+%include "x86inc.asm"
+
+SECTION .text
+
+; transfer_interp_line_cont_4(dst, dst_len, src, src_stride, idx_x, fact_x, fact_y)
+; dst[i] = sum_{j,k=0..3} fact_y[j] * fact_x[4 * i + k] * src[idx_x[i] + j * src_stride + k]
+; for i in [0, dst_len); data are doubles, idx_x holds 64-bit element indices, src_stride is
+; in elements. dst_len must be > 0, the loop body runs before the first bounds test.
+INIT_YMM fma3
+cglobal transfer_interp_line_cont_4, 7, 8, 8, dst, dst_len, src, src_stride, idx_x, fact_x, fact_y,\
+    idx_x_val
+    ; 8 vector registers declared: m6/m7 are used and xmm6/xmm7 are callee-saved on WIN64
+    shl src_strideq, 3                        ; elements -> bytes
+    shl dst_lenq,    3                        ; elements -> bytes
+    ; point the per-output arrays at their ends, they are indexed with a negative offset
+    add dstq,    dst_lenq
+    add idx_xq,  dst_lenq
+    lea fact_xq, [fact_xq + 4 * dst_lenq]     ; four x factors per output element
+    neg dst_lenq
+    ; from now on, the register that held the line size is used as the offset into data arrays
+    %define offsetq dst_lenq
+    ; broadcast each of the four y factors to every lane of its own register
+    movu   m0, [fact_yq]
+    vpermq m1, m0, 01010101b                  ; m1 = fact_y[1] in all lanes
+    vpermq m2, m0, 10101010b                  ; m2 = fact_y[2] in all lanes
+    vpermq m3, m0, 11111111b                  ; m3 = fact_y[3] in all lanes
+    vpermq m0, m0, 00000000b                  ; m0 = fact_y[0] in all lanes (last, it overwrites the source)
+
+.loop:
+    mov idx_x_valq, [idx_xq + offsetq]        ; index of the first source element for this output
+    shl idx_x_valq, 3                         ; element index -> byte offset into row 0
+    movu  m5, [fact_xq + 4 * offsetq]         ; the four x factors for this output
+    mulpd m6, m5, [srcq + idx_x_valq]         ; row 0: m6 = (fact_x * src) * fact_y[0]
+    mulpd m6, m0
+    ; row 1: m6 += (fact_x * src) * fact_y[1]
+    add   idx_x_valq, src_strideq
+    mulpd m7, m5, [srcq + idx_x_valq]
+    vfmadd231pd m6, m7, m1
+    ; row 2: m6 += (fact_x * src) * fact_y[2]
+    add   idx_x_valq, src_strideq
+    mulpd m7, m5, [srcq + idx_x_valq]
+    vfmadd231pd m6, m7, m2
+    ; row 3: m6 += (fact_x * src) * fact_y[3]
+    add   idx_x_valq, src_strideq
+    mulpd m7, m5, [srcq + idx_x_valq]
+    vfmadd231pd m6, m7, m3
+    ; horizontal sum of the four lanes of m6 into its lowest element
+    haddpd m6, m6                             ; pairwise sums within each 128-bit half
+    vpermq m6, m6, 00001000b                  ; gather both pair sums into the low half
+    haddpd m6, m6                             ; low element = total of all four lanes
+    movq [dstq + offsetq], xm6
+    add offsetq, 8                            ; next output; offset reaches 0 after the last one
+    js .loop
+
+    RET