diff options
Diffstat (limited to 'libavfilter/x86/vf_interlace.asm')
-rw-r--r-- | libavfilter/x86/vf_interlace.asm | 183 |
1 files changed, 160 insertions, 23 deletions
diff --git a/libavfilter/x86/vf_interlace.asm b/libavfilter/x86/vf_interlace.asm
index f2344216b3..7c0065d4d9 100644
--- a/libavfilter/x86/vf_interlace.asm
+++ b/libavfilter/x86/vf_interlace.asm
@@ -3,21 +3,22 @@
 ;*
 ;* Copyright (C) 2014 Kieran Kunhya <kierank@obe.tv>
 ;* Copyright (c) 2014 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (c) 2017 Thomas Mundt <tmundt75@gmail.com>
 ;*
-;* This file is part of Libav.
+;* This file is part of FFmpeg.
 ;*
-;* Libav is free software; you can redistribute it and/or modify
+;* FFmpeg is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
 ;* the Free Software Foundation; either version 2 of the License, or
 ;* (at your option) any later version.
 ;*
-;* Libav is distributed in the hope that it will be useful,
+;* FFmpeg is distributed in the hope that it will be useful,
 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 ;* GNU General Public License for more details.
 ;*
 ;* You should have received a copy of the GNU General Public License along
-;* with Libav; if not, write to the Free Software Foundation, Inc.,
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
 ;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 ;******************************************************************************
 
@@ -25,41 +26,177 @@
 
 SECTION_RODATA
 
+pw_4: times 8 dw 4
+
 SECTION .text
 
-%macro LOWPASS_LINE 0
-cglobal lowpass_line, 5, 5, 7
-    add r0, r1
-    add r2, r1
-    add r3, r1
-    add r4, r1
-    neg r1
+%macro LOWPASS 1
+    add dstq, hq
+    add srcq, hq
+    add mrefq, srcq
+    add prefq, srcq
+    neg hq
 
-    pcmpeqb m6, m6
+    pcmpeq%1 m6, m6
 
 .loop:
-    mova m0, [r3+r1]
-    mova m1, [r3+r1+mmsize]
-    pavgb m0, [r4+r1]
-    pavgb m1, [r4+r1+mmsize]
+    mova m0, [mrefq+hq]
+    mova m1, [mrefq+hq+mmsize]
+    pavg%1 m0, [prefq+hq]
+    pavg%1 m1, [prefq+hq+mmsize]
     pxor m0, m6
     pxor m1, m6
-    pxor m2, m6, [r2+r1]
-    pxor m3, m6, [r2+r1+mmsize]
-    pavgb m0, m2
-    pavgb m1, m3
+    pxor m2, m6, [srcq+hq]
+    pxor m3, m6, [srcq+hq+mmsize]
+    pavg%1 m0, m2
+    pavg%1 m1, m3
     pxor m0, m6
     pxor m1, m6
-    mova [r0+r1], m0
-    mova [r0+r1+mmsize], m1
+    mova [dstq+hq], m0
+    mova [dstq+hq+mmsize], m1
 
-    add r1, 2*mmsize
+    add hq, 2*mmsize
     jl .loop
 REP_RET
 %endmacro
 
+%macro LOWPASS_LINE 0
+cglobal lowpass_line, 5, 5, 7, dst, h, src, mref, pref
+    LOWPASS b
+
+cglobal lowpass_line_16, 5, 5, 7, dst, h, src, mref, pref
+    shl hq, 1
+    LOWPASS w
+%endmacro
+
+%macro LOWPASS_LINE_COMPLEX 0
+cglobal lowpass_line_complex, 5, 5, 8, dst, h, src, mref, pref
+    pxor m7, m7
+.loop:
+    mova m0, [srcq+mrefq]
+    mova m2, [srcq+prefq]
+    mova m1, m0
+    mova m3, m2
+    punpcklbw m0, m7
+    punpcklbw m2, m7
+    punpckhbw m1, m7
+    punpckhbw m3, m7
+    paddw m0, m2
+    paddw m1, m3
+    mova m6, m0
+    mova m5, m1
+    mova m2, [srcq]
+    mova m3, m2
+    punpcklbw m2, m7
+    punpckhbw m3, m7
+    paddw m0, m2
+    paddw m1, m3
+    psllw m2, 1
+    psllw m3, 1
+    paddw m0, m2
+    paddw m1, m3
+    psllw m0, 1
+    psllw m1, 1
+    pcmpgtw m6, m2
+    pcmpgtw m5, m3
+    packsswb m6, m5
+    mova m2, [srcq+mrefq*2]
+    mova m4, [srcq+prefq*2]
+    mova m3, m2
+    mova m5, m4
+    punpcklbw m2, m7
+    punpcklbw m4, m7
+    punpckhbw m3, m7
+    punpckhbw m5, m7
+    paddw m2, m4
+    paddw m3, m5
+    paddw m0, [pw_4]
+    paddw m1, [pw_4]
+    psubusw m0, m2
+    psubusw m1, m3
+    psrlw m0, 3
+    psrlw m1, 3
+    packuswb m0, m1
+    mova m1, m0
+    pmaxub m0, [srcq]
+    pminub m1, [srcq]
+    pand m0, m6
+    pandn m6, m1
+    por m0, m6
+    mova [dstq], m0
+
+    add dstq, mmsize
+    add srcq, mmsize
+    sub hd, mmsize
+    jg .loop
+REP_RET
+
+cglobal lowpass_line_complex_12, 5, 5, 8, 16, dst, h, src, mref, pref, clip_max
+    movd m7, DWORD clip_maxm
+    SPLATW m7, m7, 0
+    mova [rsp], m7
+.loop:
+    mova m0, [srcq+mrefq]
+    mova m1, [srcq+mrefq+mmsize]
+    mova m2, [srcq+prefq]
+    mova m3, [srcq+prefq+mmsize]
+    paddw m0, m2
+    paddw m1, m3
+    mova m6, m0
+    mova m7, m1
+    mova m2, [srcq]
+    mova m3, [srcq+mmsize]
+    paddw m0, m2
+    paddw m1, m3
+    psllw m2, 1
+    psllw m3, 1
+    paddw m0, m2
+    paddw m1, m3
+    psllw m0, 1
+    psllw m1, 1
+    pcmpgtw m6, m2
+    pcmpgtw m7, m3
+    mova m2, [srcq+2*mrefq]
+    mova m3, [srcq+2*mrefq+mmsize]
+    mova m4, [srcq+2*prefq]
+    mova m5, [srcq+2*prefq+mmsize]
+    paddw m2, m4
+    paddw m3, m5
+    paddw m0, [pw_4]
+    paddw m1, [pw_4]
+    psubusw m0, m2
+    psubusw m1, m3
+    psrlw m0, 3
+    psrlw m1, 3
+    pminsw m0, [rsp]
+    pminsw m1, [rsp]
+    mova m2, m0
+    mova m3, m1
+    pmaxsw m0, [srcq]
+    pmaxsw m1, [srcq+mmsize]
+    pminsw m2, [srcq]
+    pminsw m3, [srcq+mmsize]
+    pand m0, m6
+    pand m1, m7
+    pandn m6, m2
+    pandn m7, m3
+    por m0, m6
+    por m1, m7
+    mova [dstq], m0
+    mova [dstq+mmsize], m1
+
+    add dstq, 2*mmsize
+    add srcq, 2*mmsize
+    sub hd, mmsize
+    jg .loop
+REP_RET
+%endmacro
+
 INIT_XMM sse2
 LOWPASS_LINE
 
 INIT_XMM avx
 LOWPASS_LINE
+
+INIT_XMM sse2
+LOWPASS_LINE_COMPLEX