diff options
Diffstat (limited to 'libavfilter/x86/vf_stereo3d.asm')
-rw-r--r-- | libavfilter/x86/vf_stereo3d.asm | 216 |
1 file changed, 216 insertions, 0 deletions
diff --git a/libavfilter/x86/vf_stereo3d.asm b/libavfilter/x86/vf_stereo3d.asm new file mode 100644 index 0000000000..a057e495f1 --- /dev/null +++ b/libavfilter/x86/vf_stereo3d.asm @@ -0,0 +1,216 @@ +;***************************************************************************** +;* x86-optimized functions for stereo3d filter +;* +;* Copyright (C) 2015 Paul B Mahol +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;***************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +; rgbrgbrgbrgb +; rrrrggggbbbb + +shuf: db 0, 4, 8, 1,5, 9, 2, 6,10,3, 7,11,-1,-1,-1,-1 +ex_r: db 0,-1,-1,-1,3,-1,-1,-1,6,-1,-1,-1, 9,-1,-1,-1 +ex_g: db 1,-1,-1,-1,4,-1,-1,-1,7,-1,-1,-1,10,-1,-1,-1 +ex_b: db 2,-1,-1,-1,5,-1,-1,-1,8,-1,-1,-1,11,-1,-1,-1 + +SECTION .text + +INIT_XMM sse4 +%if ARCH_X86_64 +cglobal anaglyph, 6, 10, 14, 2*6*mmsize, dst, lsrc, rsrc, dst_linesize, l_linesize, r_linesize, width, height, o, cnt +%define ana_matrix_rq r6q +%define ana_matrix_gq r7q +%define ana_matrix_bq r8q + +%else ; ARCH_X86_32 +%if HAVE_ALIGNED_STACK +cglobal anaglyph, 3, 7, 8, 2*9*mmsize, dst, lsrc, rsrc, dst_linesize, l_linesize, o, cnt +%else +cglobal anaglyph, 3, 6, 8, 2*9*mmsize, dst, lsrc, rsrc, 
dst_linesize, o, cnt +%define l_linesizeq r4mp +%endif ; HAVE_ALIGNED_STACK +%define ana_matrix_rq r3q +%define ana_matrix_gq r4q +%define ana_matrix_bq r5q +%define r_linesizeq r5mp +%define widthd r6mp +%define heightd r7mp +%define m8 [rsp+mmsize*12] +%define m9 [rsp+mmsize*13] +%define m10 [rsp+mmsize*14] +%define m11 [rsp+mmsize*15] +%define m12 [rsp+mmsize*16] +%define m13 [rsp+mmsize*17] +%endif ; ARCH + + mov ana_matrix_rq, r8m + mov ana_matrix_gq, r9m + mov ana_matrix_bq, r10m + movu m3, [ana_matrix_rq+ 0] + movq m5, [ana_matrix_rq+16] + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + pshufd m4, m5, q0000 + pshufd m5, m5, q1111 + mova [rsp+mmsize*0], m0 + mova [rsp+mmsize*1], m1 + mova [rsp+mmsize*2], m2 + mova [rsp+mmsize*3], m3 + mova [rsp+mmsize*4], m4 + mova [rsp+mmsize*5], m5 + + movu m3, [ana_matrix_gq+ 0] + movq m5, [ana_matrix_gq+16] + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + pshufd m4, m5, q0000 + pshufd m5, m5, q1111 + mova [rsp+mmsize*6 ], m0 + mova [rsp+mmsize*7 ], m1 + mova [rsp+mmsize*8 ], m2 + mova [rsp+mmsize*9 ], m3 + mova [rsp+mmsize*10], m4 + mova [rsp+mmsize*11], m5 + +%if ARCH_X86_64 + movu m11, [ana_matrix_bq+ 0] + movq m13, [ana_matrix_bq+16] + pshufd m8, m11, q0000 + pshufd m9, m11, q1111 + pshufd m10, m11, q2222 + pshufd m11, m11, q3333 + pshufd m12, m13, q0000 + pshufd m13, m13, q1111 + mov widthd, dword widthm + mov heightd, dword heightm +%else + movu m3, [ana_matrix_bq+ 0] + movq m5, [ana_matrix_bq+16] + pshufd m0, m3, q0000 + pshufd m1, m3, q1111 + pshufd m2, m3, q2222 + pshufd m3, m3, q3333 + pshufd m4, m5, q0000 + pshufd m5, m5, q1111 + mova [rsp+mmsize*12], m0 + mova [rsp+mmsize*13], m1 + mova [rsp+mmsize*14], m2 + mova [rsp+mmsize*15], m3 + mova [rsp+mmsize*16], m4 + mova [rsp+mmsize*17], m5 + mov dst_linesizeq, r3m +%if HAVE_ALIGNED_STACK + mov l_linesizeq, r4m +%endif +%endif ; ARCH + +.nextrow: + mov od, widthd + xor cntd, 
cntd + + .loop: + movu m3, [lsrcq+cntq] + pshufb m1, m3, [ex_r] + pshufb m2, m3, [ex_g] + pshufb m3, [ex_b] + movu m0, [rsrcq+cntq] + pshufb m4, m0, [ex_r] + pshufb m5, m0, [ex_g] + pshufb m0, [ex_b] + pmulld m1, [rsp+mmsize*0] + pmulld m2, [rsp+mmsize*1] + pmulld m3, [rsp+mmsize*2] + pmulld m4, [rsp+mmsize*3] + pmulld m5, [rsp+mmsize*4] + pmulld m0, [rsp+mmsize*5] + paddd m1, m2 + paddd m3, m4 + paddd m5, m0 + paddd m1, m3 + paddd m1, m5 + + movu m3, [lsrcq+cntq] + pshufb m7, m3, [ex_r] + pshufb m2, m3, [ex_g] + pshufb m3, [ex_b] + movu m0, [rsrcq+cntq] + pshufb m4, m0, [ex_r] + pshufb m5, m0, [ex_g] + pshufb m0, [ex_b] + pmulld m7, [rsp+mmsize*6] + pmulld m2, [rsp+mmsize*7] + pmulld m3, [rsp+mmsize*8] + pmulld m4, [rsp+mmsize*9] + pmulld m5, [rsp+mmsize*10] + pmulld m0, [rsp+mmsize*11] + paddd m7, m2 + paddd m3, m4 + paddd m5, m0 + paddd m7, m3 + paddd m7, m5 + + movu m4, [lsrcq+cntq] + pshufb m2, m4, [ex_r] + pshufb m3, m4, [ex_g] + pshufb m4, [ex_b] + movu m0, [rsrcq+cntq] + pshufb m5, m0, [ex_r] + pshufb m6, m0, [ex_g] + pshufb m0, [ex_b] + pmulld m2, m8 + pmulld m3, m9 + pmulld m4, m10 + pmulld m5, m11 + pmulld m6, m12 + pmulld m0, m13 + paddd m2, m3 + paddd m4, m5 + paddd m6, m0 + paddd m2, m4 + paddd m2, m6 + + psrld m1, 16 + psrld m7, 16 + psrld m2, 16 + + packusdw m1, m7 + packusdw m2, m2 + packuswb m1, m2 + pshufb m1, [shuf] + + movq [dstq+cntq+0], m1 + psrldq m1, 8 + movd [dstq+cntq+8], m1 + add cntd, 12 + sub od, 4 + jg .loop + + add dstq, dst_linesizeq + add lsrcq, l_linesizeq + add rsrcq, r_linesizeq + sub heightd, 1 + jg .nextrow +REP_RET |