Diffstat (limited to 'libavfilter/x86/vf_stereo3d.asm')
-rw-r--r--  libavfilter/x86/vf_stereo3d.asm | 227
1 file changed, 227 insertions, 0 deletions
diff --git a/libavfilter/x86/vf_stereo3d.asm b/libavfilter/x86/vf_stereo3d.asm
new file mode 100644
index 0000000000..a057e495f1
--- /dev/null
+++ b/libavfilter/x86/vf_stereo3d.asm
@@ -0,0 +1,227 @@
+;*****************************************************************************
+;* x86-optimized functions for stereo3d filter
+;*
+;* Copyright (C) 2015 Paul B Mahol
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;*****************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+; shuf:  repack planar rrrrggggbbbb bytes into packed rgbrgbrgbrgb
+; ex_*:  zero-extend the R/G/B bytes of 4 packed pixels into dword lanes
+
+shuf: db 0, 4, 8, 1, 5, 9, 2, 6,10, 3, 7,11,-1,-1,-1,-1
+ex_r: db 0,-1,-1,-1, 3,-1,-1,-1, 6,-1,-1,-1, 9,-1,-1,-1
+ex_g: db 1,-1,-1,-1, 4,-1,-1,-1, 7,-1,-1,-1,10,-1,-1,-1
+ex_b: db 2,-1,-1,-1, 5,-1,-1,-1, 8,-1,-1,-1,11,-1,-1,-1
+
+SECTION .text
+
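+; anaglyph(dst, lsrc, rsrc, dst_linesize, l_linesize, r_linesize,
+;          width, height, ana_matrix_r, ana_matrix_g, ana_matrix_b)
+; Merges one left/right pair of packed RGB24 rows into an anaglyph frame.
+; Each ana_matrix_* points at six int32 Q16 coefficients; per channel:
+;   out = (m[0]*l.r + m[1]*l.g + m[2]*l.b + m[3]*r.r + m[4]*r.g + m[5]*r.b) >> 16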
+INIT_XMM sse4
+%if ARCH_X86_64
+cglobal anaglyph, 6, 10, 14, 2*6*mmsize, dst, lsrc, rsrc, dst_linesize, l_linesize, r_linesize, width, height, o, cnt
+%define ana_matrix_rq r6q
+%define ana_matrix_gq r7q
+%define ana_matrix_bq r8q
+
+%else ; ARCH_X86_32
+%if HAVE_ALIGNED_STACK
+cglobal anaglyph, 3, 7, 8, 2*9*mmsize, dst, lsrc, rsrc, dst_linesize, l_linesize, o, cnt
+%else
+cglobal anaglyph, 3, 6, 8, 2*9*mmsize, dst, lsrc, rsrc, dst_linesize, o, cnt
+%define l_linesizeq r4mp
+%endif ; HAVE_ALIGNED_STACK
+%define ana_matrix_rq r3q
+%define ana_matrix_gq r4q
+%define ana_matrix_bq r5q
+%define r_linesizeq r5mp
+%define widthd r6mp
+%define heightd r7mp
+%define m8 [rsp+mmsize*12] ; x86_32 has only 8 XMM registers, so the
+%define m9 [rsp+mmsize*13] ; blue-row coefficients live in stack slots
+%define m10 [rsp+mmsize*14]
+%define m11 [rsp+mmsize*15]
+%define m12 [rsp+mmsize*16]
+%define m13 [rsp+mmsize*17]
+%endif ; ARCH
+
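+    ; Broadcast each of the 3x6 matrix coefficients across all four dword
+    ; lanes of a register: the red row is stored at [rsp+mmsize*0..5] and
+    ; the green row at [rsp+mmsize*6..11]; the blue row stays in m8-m13 on
+    ; x86_64 and is spilled to [rsp+mmsize*12..17] (as m8-m13) on x86_32.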
+    mov        ana_matrix_rq, r8m         ; stack args 9-11: pointers to the
+    mov        ana_matrix_gq, r9m         ; three rows of the 3x6 matrix
+    mov        ana_matrix_bq, r10m
+    movu       m3, [ana_matrix_rq+ 0]     ; red row, coefficients 0-3
+    movq       m5, [ana_matrix_rq+16]     ; red row, coefficients 4-5
+    pshufd     m0, m3, q0000
+    pshufd     m1, m3, q1111
+    pshufd     m2, m3, q2222
+    pshufd     m3, m3, q3333
+    pshufd     m4, m5, q0000
+    pshufd     m5, m5, q1111
+    mova       [rsp+mmsize*0], m0
+    mova       [rsp+mmsize*1], m1
+    mova       [rsp+mmsize*2], m2
+    mova       [rsp+mmsize*3], m3
+    mova       [rsp+mmsize*4], m4
+    mova       [rsp+mmsize*5], m5
+
+    movu       m3, [ana_matrix_gq+ 0]     ; green row, coefficients 0-3
+    movq       m5, [ana_matrix_gq+16]     ; green row, coefficients 4-5
+    pshufd     m0, m3, q0000
+    pshufd     m1, m3, q1111
+    pshufd     m2, m3, q2222
+    pshufd     m3, m3, q3333
+    pshufd     m4, m5, q0000
+    pshufd     m5, m5, q1111
+    mova       [rsp+mmsize*6 ], m0
+    mova       [rsp+mmsize*7 ], m1
+    mova       [rsp+mmsize*8 ], m2
+    mova       [rsp+mmsize*9 ], m3
+    mova       [rsp+mmsize*10], m4
+    mova       [rsp+mmsize*11], m5
+
+%if ARCH_X86_64
+    movu       m11, [ana_matrix_bq+ 0]    ; blue row, coefficients 0-3
+    movq       m13, [ana_matrix_bq+16]    ; blue row, coefficients 4-5
+    pshufd     m8,  m11, q0000
+    pshufd     m9,  m11, q1111
+    pshufd     m10, m11, q2222
+    pshufd     m11, m11, q3333
+    pshufd     m12, m13, q0000
+    pshufd     m13, m13, q1111
+    mov        widthd, dword widthm       ; width/height share registers with
+    mov        heightd, dword heightm     ; the now-dead matrix pointers
+%else
+    movu       m3, [ana_matrix_bq+ 0]     ; blue row, coefficients 0-3
+    movq       m5, [ana_matrix_bq+16]     ; blue row, coefficients 4-5
+    pshufd     m0, m3, q0000
+    pshufd     m1, m3, q1111
+    pshufd     m2, m3, q2222
+    pshufd     m3, m3, q3333
+    pshufd     m4, m5, q0000
+    pshufd     m5, m5, q1111
+    mova       [rsp+mmsize*12], m0
+    mova       [rsp+mmsize*13], m1
+    mova       [rsp+mmsize*14], m2
+    mova       [rsp+mmsize*15], m3
+    mova       [rsp+mmsize*16], m4
+    mova       [rsp+mmsize*17], m5
+    mov        dst_linesizeq, r3m         ; restore args whose registers held
+%if HAVE_ALIGNED_STACK
+    mov        l_linesizeq, r4m           ; matrix pointers during setup
+%endif
+%endif ; ARCH
+
+.nextrow:
+    mov        od, widthd                 ; o: pixels left in the row
+    xor        cntd, cntd                 ; cnt: byte offset into each frame
+
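+        ; Each iteration converts 4 pixels: 12 bytes of each source view
+        ; are split into R/G/B dword lanes and fed through all three rows.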
+    .loop:
+        movu       m3, [lsrcq+cntq]       ; 4 left-view pixels
+        pshufb     m1, m3, [ex_r]         ; split into R/G/B dword lanes
+        pshufb     m2, m3, [ex_g]
+        pshufb     m3, [ex_b]
+        movu       m0, [rsrcq+cntq]       ; 4 right-view pixels
+        pshufb     m4, m0, [ex_r]
+        pshufb     m5, m0, [ex_g]
+        pshufb     m0, [ex_b]
+        pmulld     m1, [rsp+mmsize*0]     ; red matrix row: weight the six
+        pmulld     m2, [rsp+mmsize*1]     ; inputs...
+        pmulld     m3, [rsp+mmsize*2]
+        pmulld     m4, [rsp+mmsize*3]
+        pmulld     m5, [rsp+mmsize*4]
+        pmulld     m0, [rsp+mmsize*5]
+        paddd      m1, m2                 ; ...and accumulate
+        paddd      m3, m4
+        paddd      m5, m0
+        paddd      m1, m3
+        paddd      m1, m5                 ; m1 = red output dwords
+
+        movu       m3, [lsrcq+cntq]       ; same split again for green
+        pshufb     m7, m3, [ex_r]
+        pshufb     m2, m3, [ex_g]
+        pshufb     m3, [ex_b]
+        movu       m0, [rsrcq+cntq]
+        pshufb     m4, m0, [ex_r]
+        pshufb     m5, m0, [ex_g]
+        pshufb     m0, [ex_b]
+        pmulld     m7, [rsp+mmsize*6]     ; green matrix row
+        pmulld     m2, [rsp+mmsize*7]
+        pmulld     m3, [rsp+mmsize*8]
+        pmulld     m4, [rsp+mmsize*9]
+        pmulld     m5, [rsp+mmsize*10]
+        pmulld     m0, [rsp+mmsize*11]
+        paddd      m7, m2
+        paddd      m3, m4
+        paddd      m5, m0
+        paddd      m7, m3
+        paddd      m7, m5                 ; m7 = green output dwords
+
+        movu       m4, [lsrcq+cntq]       ; and once more for blue
+        pshufb     m2, m4, [ex_r]
+        pshufb     m3, m4, [ex_g]
+        pshufb     m4, [ex_b]
+        movu       m0, [rsrcq+cntq]
+        pshufb     m5, m0, [ex_r]
+        pshufb     m6, m0, [ex_g]
+        pshufb     m0, [ex_b]
+        pmulld     m2, m8                 ; blue matrix row (registers on
+        pmulld     m3, m9                 ; x86_64, stack slots on x86_32)
+        pmulld     m4, m10
+        pmulld     m5, m11
+        pmulld     m6, m12
+        pmulld     m0, m13
+        paddd      m2, m3
+        paddd      m4, m5
+        paddd      m6, m0
+        paddd      m2, m4
+        paddd      m2, m6                 ; m2 = blue output dwords
+
+        psrld      m1, 16                 ; drop the 16 fractional bits
+        psrld      m7, 16
+        psrld      m2, 16
+
+        packusdw   m1, m7                 ; m1 = r0..r3 g0..g3 (words)
+        packusdw   m2, m2                 ; m2 = b0..b3 b0..b3
+        packuswb   m1, m2                 ; m1 = rrrrggggbbbbbbbb (bytes)
+        pshufb     m1, [shuf]             ; -> rgbrgbrgbrgb, 4 junk bytes
+
+        movq       [dstq+cntq+0], m1      ; store 8+4 = 12 output bytes
+        psrldq     m1, 8
+        movd       [dstq+cntq+8], m1
+        add        cntd, 12               ; advance 4 pixels
+        sub        od, 4
+        jg         .loop
+
+    add        dstq, dst_linesizeq        ; step all three frames one row down
+    add        lsrcq, l_linesizeq
+    add        rsrcq, r_linesizeq
+    sub        heightd, 1
+    jg         .nextrow
+REP_RET
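
For readers mapping the SIMD back to arithmetic, here is a scalar C sketch of
what one call computes. It is illustrative only: the prototype and the Q16
coefficient convention are inferred from the asm itself (pmulld dot products
followed by psrld 16 and saturating packs), not taken from an FFmpeg header.

#include <stdint.h>

/* One output channel: a dot product of the six left/right RGB bytes with
 * one row of Q16 (16.16 fixed-point) coefficients, shifted and clipped. */
static uint8_t ana_pixel(const int32_t m[6], const uint8_t *l, const uint8_t *r)
{
    int64_t v = (int64_t)m[0]*l[0] + (int64_t)m[1]*l[1] + (int64_t)m[2]*l[2]
              + (int64_t)m[3]*r[0] + (int64_t)m[4]*r[1] + (int64_t)m[5]*r[2];
    v >>= 16;                                 /* the psrld 16 step          */
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;  /* the saturating packs */
}

/* One row of packed RGB24; the asm loop above does 4 pixels per iteration. */
static void anaglyph_row(uint8_t *dst,
                         const uint8_t *lsrc, const uint8_t *rsrc,
                         const int32_t *mr, const int32_t *mg,
                         const int32_t *mb, int width)
{
    for (int x = 0; x < width; x++, dst += 3, lsrc += 3, rsrc += 3) {
        dst[0] = ana_pixel(mr, lsrc, rsrc);   /* red from both views   */
        dst[1] = ana_pixel(mg, lsrc, rsrc);   /* green from both views */
        dst[2] = ana_pixel(mb, lsrc, rsrc);   /* blue from both views  */
    }
}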