;******************************************************************************
;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders
;* Copyright (C) 2012 Christophe Gisquet
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION .text

; Scale a DC value held in general-purpose register %1, in place, without
; rounding:  %1 = (%1 * 13*13*3) >> 11   (arithmetic shift)
%macro IDCT_DC_NOROUND 1
    imul   %1, 13*13*3
    sar    %1, 11
%endmacro

; Scale a DC value held in general-purpose register %1, in place, with
; rounding:  %1 = (%1 * 13*13 + 0x200) >> 10   (arithmetic shift)
%macro IDCT_DC_ROUND 1
    imul   %1, 13*13
    add    %1, 0x200
    sar    %1, 10
%endmacro

; Emits rv34_idct_<%1>_mmx2, taking a single pointer argument in r0.
; The function reads the signed 16-bit value at [r0], scales it with whatever
; IDCT_DC is %define'd to at expansion time, and overwrites the 16 words
; (32 bytes) starting at r0 with the scaled value.
; NOTE(review): presumably r0 is a 4x4 block of 16-bit coefficients - confirm
; against the C prototype.
%macro rv34_idct 1
cglobal rv34_idct_%1_mmx2, 1, 2, 0
    movsx   r1, word [r0]
    IDCT_DC r1
    movd    m0, r1d             ; only the low word is consumed by pshufw
    pshufw  m0, m0, 0           ; broadcast word 0 to all four words
    movq    [r0+ 0], m0
    movq    [r0+ 8], m0
    movq    [r0+16], m0
    movq    [r0+24], m0
    REP_RET
%endmacro

INIT_MMX
%define IDCT_DC IDCT_DC_ROUND
rv34_idct dc
%define IDCT_DC IDCT_DC_NOROUND
rv34_idct dc_noround

; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
;   r0 = dst, r1 = stride, r2 = dc (reused as dst + 2*stride once dc is in m0)
; Adds the rounded, scaled DC to four rows of dst with unsigned saturation.
; Each row is accessed with movh (half of an 8-byte MMX register).
cglobal rv34_idct_dc_add_mmx, 3, 3
    ; stride is a C int: on x86-64 the upper half of r1 is unspecified, so
    ; sign-extend it before it is used in address arithmetic. This expands to
    ; nothing when r1 and r1d are the same register (x86-32).
    movsxdifnidn r1, r1d

    ; calculate DC
    IDCT_DC_ROUND r2
    pxor       m1, m1
    movd       m0, r2d
    psubw      m1, m0           ; m1 = -dc
    packuswb   m0, m0           ; m0 = dc  clamped to 0..255 (0 if dc < 0)
    packuswb   m1, m1           ; m1 = -dc clamped to 0..255 (0 if dc > 0)
    punpcklbw  m0, m0           ; replicate byte 0 across the low dword
    punpcklbw  m1, m1
    punpcklwd  m0, m0
    punpcklwd  m1, m1

    ; add DC: saturating add of the positive part, then saturating subtract
    ; of the negative part (one of the two is always zero)
    lea        r2, [r0+r1*2]
    movh       m2, [r0]
    movh       m3, [r0+r1]
    movh       m4, [r2]
    movh       m5, [r2+r1]
    paddusb    m2, m0
    paddusb    m3, m0
    paddusb    m4, m0
    paddusb    m5, m0
    psubusb    m2, m1
    psubusb    m3, m1
    psubusb    m4, m1
    psubusb    m5, m1
    movh       [r0], m2
    movh       [r0+r1], m3
    movh       [r2], m4
    movh       [r2+r1], m5
    RET

; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
;   r0 = dst, r1 = stride, r2 = dc (reused as dst + 2*stride once dc is in m0)
; Same operation as the MMX version above, done on all four 4-byte rows at
; once: rows are widened to words, dc is added, and the result is packed back
; to bytes with unsigned saturation.
INIT_XMM
cglobal rv34_idct_dc_add_sse4, 3, 3, 6
    ; stride is a C int: sign-extend it before it is used in addressing
    ; (expands to nothing on x86-32).
    movsxdifnidn r1, r1d

    ; calculate DC
    IDCT_DC_ROUND r2
    pxor       m1, m1           ; zero, used to widen bytes to words

    ; broadcast DC to eight words while loading the four rows
    movd       m0, r2d
    lea        r2, [r0+r1*2]
    movd       m2, [r0]
    movd       m3, [r0+r1]
    pshuflw    m0, m0, 0
    movd       m4, [r2]
    movd       m5, [r2+r1]
    punpcklqdq m0, m0
    punpckldq  m2, m3           ; m2 = rows 0,1 (8 bytes)
    punpckldq  m4, m5           ; m4 = rows 2,3 (8 bytes)
    punpcklbw  m2, m1           ; widen to words
    punpcklbw  m4, m1

    ; add DC and store
    paddw      m2, m0
    paddw      m4, m0
    packuswb   m2, m4           ; clamp to 0..255; dwords 0..3 = rows 0..3
    movd       [r0], m2
    pextrd     [r0+r1], m2, 1
    pextrd     [r2], m2, 2
    pextrd     [r2+r1], m2, 3
    RET