summaryrefslogtreecommitdiff
path: root/libavcodec/x86/h264_intrapred.asm
diff options
context:
space:
mode:
authorJason Garrett-Glaser <darkshikari@gmail.com>2010-06-25 18:25:49 +0000
committerJason Garrett-Glaser <darkshikari@gmail.com>2010-06-25 18:25:49 +0000
commit4af8cdfc3f4a1f777e769cb97c61ef13674699f7 (patch)
treeb4349669fac4eda292f4aaae28cabfe83c80efef /libavcodec/x86/h264_intrapred.asm
parentd6f8476be4895c620d58e021ab880823d2fe25bf (diff)
16x16 and 8x8c x86 SIMD intra pred functions for VP8 and H.264
Originally committed as revision 23783 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/x86/h264_intrapred.asm')
-rw-r--r--libavcodec/x86/h264_intrapred.asm486
1 files changed, 486 insertions, 0 deletions
diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm
new file mode 100644
index 0000000000..4f0a43fc2b
--- /dev/null
+++ b/libavcodec/x86/h264_intrapred.asm
@@ -0,0 +1,486 @@
+;******************************************************************************
+;* H.264 intra prediction asm optimizations
+;* Copyright (c) 2010 Jason Garrett-Glaser
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "x86inc.asm"
+
+SECTION_RODATA
+
+tm_shuf: times 8 db 0x03, 0x80
+
+SECTION .text
+
+cextern pb_3
+
+;-----------------------------------------------------------------------------
+; void pred16x16_vertical(uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+
+cglobal pred16x16_vertical_mmx, 2,3 ; void pred16x16_vertical(uint8_t *src, int stride) -- r0=src, r1=stride
+ sub r0, r1 ; r0 -> row above the block (top neighbor row)
+ mov r2, 8 ; 8 iterations x 2 rows = 16 rows
+ movq mm0, [r0+0] ; left 8 bytes of top neighbor row
+ movq mm1, [r0+8] ; right 8 bytes of top neighbor row
+.loop:
+ movq [r0+r1*1+0], mm0 ; copy neighbor row into the next two rows
+ movq [r0+r1*1+8], mm1
+ movq [r0+r1*2+0], mm0
+ movq [r0+r1*2+8], mm1
+ lea r0, [r0+r1*2] ; advance two rows
+ dec r2
+ jg .loop
+ REP_RET ; x86inc macro: "rep ret" to dodge AMD branch-target penalty
+
<test></test>
+cglobal pred16x16_vertical_sse, 2,3 ; SSE version: one 16-byte store per row (src must be 16-aligned)
+ sub r0, r1 ; r0 -> top neighbor row
+ mov r2, 4 ; 4 iterations x 4 rows = 16 rows
+ movaps xmm0, [r0] ; whole 16-byte neighbor row
+.loop:
+ movaps [r0+r1*1], xmm0 ; replicate downward, four rows per iteration
+ movaps [r0+r1*2], xmm0
+ lea r0, [r0+r1*2]
+ movaps [r0+r1*1], xmm0
+ movaps [r0+r1*2], xmm0
+ lea r0, [r0+r1*2]
+ dec r2
+ jg .loop
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void pred16x16_horizontal(uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+
+%macro PRED16x16_H 1 ; %1 = ISA suffix (mmx/mmxext/ssse3); broadcasts each row's left pixel across the row
+cglobal pred16x16_horizontal_%1, 2,3
+ mov r2, 8 ; 8 iterations x 2 rows = 16 rows
+%ifidn %1, ssse3
+ mova m2, [pb_3] ; pshufb mask selecting byte 3 (the left-neighbor pixel) everywhere
+%endif
+.loop:
+ movd m0, [r0+r1*0-4] ; 4 bytes ending at src[-1]: byte 3 is the left pixel of row 0
+ movd m1, [r0+r1*1-4] ; same for row 1
+
+%ifidn %1, ssse3
+ pshufb m0, m2 ; broadcast byte 3 to all 16 lanes
+ pshufb m1, m2
+%else
+ punpcklbw m0, m0 ; bytes aabbccdd -> target pixel now in word 3
+ punpcklbw m1, m1
+%ifidn %1, mmxext
+ pshufw m0, m0, 0xff ; broadcast word 3
+ pshufw m1, m1, 0xff
+%else
+ punpckhwd m0, m0 ; plain-MMX broadcast of word 3 via two unpacks
+ punpckhwd m1, m1
+ punpckhdq m0, m0
+ punpckhdq m1, m1
+%endif
+ mova [r0+r1*0+8], m0 ; mmsize==8: store the high half of the 16-wide row too
+ mova [r0+r1*1+8], m1
+%endif
+
+ mova [r0+r1*0], m0 ; low half (mmx) or whole row (xmm)
+ mova [r0+r1*1], m1
+ lea r0, [r0+r1*2] ; advance two rows
+ dec r2
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX
+PRED16x16_H mmx
+PRED16x16_H mmxext
+INIT_XMM
+PRED16x16_H ssse3
+
+;-----------------------------------------------------------------------------
+; void pred16x16_dc(uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+
+%macro PRED16x16_DC 2 ; %1 = ISA suffix, %2 = store instruction; dc = (sum(16 top)+sum(16 left)+16)>>5
+cglobal pred16x16_dc_%1, 2,7
+ mov r4, r0 ; keep src for the store loop; r0 walks the neighbors
+ sub r0, r1 ; r0 -> top neighbor row
+ pxor mm0, mm0
+ pxor mm1, mm1
+ psadbw mm0, [r0+0] ; sum of left 8 top bytes (SAD vs zero)
+ psadbw mm1, [r0+8] ; sum of right 8 top bytes
+ dec r0 ; r0 -> column -1 (left-neighbor column), one row above
+ movzx r5d, byte [r0+r1*1] ; r5d accumulates left pixels (starts with row 0)
+ paddw mm0, mm1
+ movd r6d, mm0 ; r6d accumulates the top sum
+ lea r0, [r0+r1*2]
+%rep 7
+ movzx r2d, byte [r0+r1*0] ; two left pixels per step: rows 1..14
+ movzx r3d, byte [r0+r1*1]
+ add r5d, r2d
+ add r6d, r3d
+ lea r0, [r0+r1*2]
+%endrep
+ movzx r2d, byte [r0+r1*0] ; 16th left pixel (row 15)
+ add r5d, r6d
+ lea r2d, [r2+r5+16] ; total + rounding bias
+ shr r2d, 5 ; /32 -> dc value
+%ifidn %1, mmx
+ movd m0, r2d ; plain-MMX byte broadcast via three unpacks
+ punpcklbw m0, m0
+ punpcklwd m0, m0
+ punpckldq m0, m0
+%elifidn %1, mmxext
+ movd m0, r2d
+ punpcklbw m0, m0
+ pshufw m0, m0, 0 ; broadcast word 0
+%elifidn %1, sse
+ imul r2d, 0x01010101 ; byte-replicate in a GPR (SSE1 has no integer shuffle)
+ movd m0, r2d
+ shufps m0, m0, 0 ; broadcast the dword
+%elifidn %1, sse2
+ movd m0, r2d
+ punpcklbw m0, m0
+ pshuflw m0, m0, 0
+ punpcklqdq m0, m0 ; fill all 16 bytes
+%elifidn %1, ssse3
+ pxor m1, m1
+ movd m0, r2d
+ pshufb m0, m1 ; broadcast byte 0 everywhere
+%endif
+
+%if mmsize==8
+ mov r3d, 8 ; 8 iterations x 2 rows, two 8-byte stores per row
+.loop:
+ %2 [r4+r1*0+0], m0
+ %2 [r4+r1*0+8], m0
+ %2 [r4+r1*1+0], m0
+ %2 [r4+r1*1+8], m0
+%else
+ mov r3d, 4 ; 4 iterations x 4 rows, one 16-byte store per row
+.loop:
+ %2 [r4+r1*0], m0
+ %2 [r4+r1*1], m0
+ lea r4, [r4+r1*2]
+ %2 [r4+r1*0], m0
+ %2 [r4+r1*1], m0
+%endif
+ lea r4, [r4+r1*2]
+ dec r3d
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX
+PRED16x16_DC mmx, movq
+PRED16x16_DC mmxext, movq
+INIT_XMM
+PRED16x16_DC sse, movaps
+PRED16x16_DC sse2, movdqa
+PRED16x16_DC ssse3, movdqa
+
+;-----------------------------------------------------------------------------
+; void pred16x16_tm_vp8(uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+
+%macro PRED16x16_TM_MMX 1 ; VP8 TrueMotion: pred[x,y] = clip(top[x] + left[y] - topleft)
+cglobal pred16x16_tm_vp8_%1, 2,5
+ sub r0, r1 ; r0 -> top neighbor row
+ pxor mm7, mm7 ; zero for unpacking (reused as scratch inside the loop)
+ movq mm0, [r0+0]
+ movq mm2, [r0+8]
+ movq mm1, mm0
+ movq mm3, mm2
+ punpcklbw mm0, mm7 ; top row expanded to words in mm0..mm3
+ punpckhbw mm1, mm7
+ punpcklbw mm2, mm7
+ punpckhbw mm3, mm7
+ movzx r3d, byte [r0-1] ; topleft pixel
+ mov r4d, 16 ; one output row per iteration
+.loop:
+ movzx r2d, byte [r0+r1-1] ; left pixel of the row being predicted
+ sub r2d, r3d ; left - topleft (may be negative -> word arithmetic)
+ movd mm4, r2d
+%ifidn %1, mmx
+ punpcklwd mm4, mm4 ; plain-MMX word broadcast
+ punpckldq mm4, mm4
+%else
+ pshufw mm4, mm4, 0 ; MMXEXT word broadcast
+%endif
+ movq mm5, mm4
+ movq mm6, mm4
+ movq mm7, mm4
+ paddw mm4, mm0 ; top + (left - topleft), 4 words at a time
+ paddw mm5, mm1
+ paddw mm6, mm2
+ paddw mm7, mm3
+ packuswb mm4, mm5 ; saturating pack does the 0..255 clip
+ packuswb mm6, mm7
+ movq [r0+r1+0], mm4
+ movq [r0+r1+8], mm6
+ add r0, r1 ; next row
+ dec r4d
+ jg .loop
+ REP_RET
+%endmacro
+
+PRED16x16_TM_MMX mmx
+PRED16x16_TM_MMX mmxext
+
+cglobal pred16x16_tm_vp8_sse2, 2,6,6 ; SSE2 TrueMotion 16x16: two rows per iteration
+ sub r0, r1 ; r0 -> top neighbor row
+ pxor xmm2, xmm2 ; zero for unpacking
+ movdqa xmm0, [r0]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm2 ; top row as words: low 8 pixels
+ punpckhbw xmm1, xmm2 ; high 8 pixels
+ movzx r4d, byte [r0-1] ; topleft pixel
+ mov r5d, 8 ; 8 iterations x 2 rows = 16 rows
+.loop:
+ movzx r2d, byte [r0+r1*1-1] ; left pixels of the two rows
+ movzx r3d, byte [r0+r1*2-1]
+ sub r2d, r4d ; left - topleft for each row
+ sub r3d, r4d
+ movd xmm2, r2d
+ movd xmm4, r3d
+ pshuflw xmm2, xmm2, 0 ; broadcast the word difference ...
+ pshuflw xmm4, xmm4, 0
+ punpcklqdq xmm2, xmm2 ; ... to all 8 word lanes
+ punpcklqdq xmm4, xmm4
+ movdqa xmm3, xmm2
+ movdqa xmm5, xmm4
+ paddw xmm2, xmm0 ; top + (left - topleft)
+ paddw xmm3, xmm1
+ paddw xmm4, xmm0
+ paddw xmm5, xmm1
+ packuswb xmm2, xmm3 ; saturating pack clips to 0..255
+ packuswb xmm4, xmm5
+ movdqa [r0+r1*1], xmm2
+ movdqa [r0+r1*2], xmm4
+ lea r0, [r0+r1*2]
+ dec r5d
+ jg .loop
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void pred8x8_vertical(uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+
+cglobal pred8x8_vertical_mmx, 2,2 ; void pred8x8_vertical(uint8_t *src, int stride)
+ sub r0, r1 ; r0 -> top neighbor row
+ movq mm0, [r0] ; 8-byte neighbor row
+%rep 3
+ movq [r0+r1*1], mm0 ; unrolled: copy into all 8 rows below
+ movq [r0+r1*2], mm0
+ lea r0, [r0+r1*2]
+%endrep
+ movq [r0+r1*1], mm0 ; final two rows (no pointer advance needed)
+ movq [r0+r1*2], mm0
+ RET
+
+;-----------------------------------------------------------------------------
+; void pred8x8_horizontal(uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+
+%macro PRED8x8_H 1 ; %1 = ISA suffix; 8x8 horizontal pred, MMX-register width for all variants
+cglobal pred8x8_horizontal_%1, 2,3
+ mov r2, 4 ; 4 iterations x 2 rows = 8 rows
+%ifidn %1, ssse3
+ mova m2, [pb_3] ; pshufb mask: pick byte 3 (the left pixel) for every lane
+%endif
+.loop:
+ movd m0, [r0+r1*0-4] ; 4 bytes ending at src[-1]; byte 3 is the left pixel
+ movd m1, [r0+r1*1-4]
+%ifidn %1, ssse3
+ pshufb m0, m2 ; broadcast byte 3
+ pshufb m1, m2
+%else
+ punpcklbw m0, m0 ; left pixel now in word 3
+ punpcklbw m1, m1
+%ifidn %1, mmxext
+ pshufw m0, m0, 0xff ; broadcast word 3
+ pshufw m1, m1, 0xff
+%else
+ punpckhwd m0, m0 ; plain-MMX broadcast of word 3
+ punpckhwd m1, m1
+ punpckhdq m0, m0
+ punpckhdq m1, m1
+%endif
+%endif
+ mova [r0+r1*0], m0 ; one 8-byte store per row
+ mova [r0+r1*1], m1
+ lea r0, [r0+r1*2]
+ dec r2
+ jg .loop
+ REP_RET
+%endmacro
+
+INIT_MMX
+PRED8x8_H mmx
+PRED8x8_H mmxext
+PRED8x8_H ssse3
+
+;-----------------------------------------------------------------------------
+; void pred8x8_dc_rv40(uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+
+%macro PRED8x8_DC 1 ; RV40-style 8x8 DC: dc = (sum(8 top)+sum(8 left)+8)>>4, one value for the whole block
+cglobal pred8x8_dc_rv40_%1, 2,7
+ mov r4, r0 ; keep src for the store loop; r0 walks the neighbors
+ sub r0, r1 ; r0 -> top neighbor row
+ pxor mm0, mm0
+ psadbw mm0, [r0] ; sum of the 8 top bytes (SAD vs zero)
+ dec r0 ; r0 -> column -1, one row above
+ movzx r5d, byte [r0+r1*1] ; left-pixel accumulator (row 0)
+ movd r6d, mm0 ; top-sum accumulator
+ lea r0, [r0+r1*2]
+%rep 3
+ movzx r2d, byte [r0+r1*0] ; two left pixels per step: rows 1..6
+ movzx r3d, byte [r0+r1*1]
+ add r5d, r2d
+ add r6d, r3d
+ lea r0, [r0+r1*2]
+%endrep
+ movzx r2d, byte [r0+r1*0] ; 8th left pixel (row 7)
+ add r5d, r6d
+ lea r2d, [r2+r5+8] ; total + rounding bias
+ shr r2d, 4 ; /16 -> dc value
+%ifidn %1, mmx
+ movd mm0, r2d ; plain-MMX byte broadcast
+ punpcklbw mm0, mm0
+ punpcklwd mm0, mm0
+ punpckldq mm0, mm0
+%else
+ movd mm0, r2d
+ punpcklbw mm0, mm0
+ pshufw mm0, mm0, 0 ; MMXEXT broadcast
+%endif
+ mov r3d, 4 ; 4 iterations x 2 rows = 8 rows
+.loop:
+ movq [r4+r1*0], mm0
+ movq [r4+r1*1], mm0
+ lea r4, [r4+r1*2]
+ dec r3d
+ jg .loop
+ REP_RET
+%endmacro
+
+
+PRED8x8_DC mmx
+PRED8x8_DC mmxext
+
+;-----------------------------------------------------------------------------
+; void pred8x8_tm_vp8(uint8_t *src, int stride)
+;-----------------------------------------------------------------------------
+
+%macro PRED8x8_TM_MMX 1 ; VP8 TrueMotion 8x8: pred[x,y] = clip(top[x] + left[y] - topleft)
+cglobal pred8x8_tm_vp8_%1, 2,6
+ sub r0, r1 ; r0 -> top neighbor row
+ pxor mm7, mm7 ; zero for unpacking
+ movq mm0, [r0]
+ movq mm1, mm0
+ punpcklbw mm0, mm7 ; top row as words: low 4 pixels
+ punpckhbw mm1, mm7 ; high 4 pixels
+ movzx r4d, byte [r0-1] ; topleft pixel
+ mov r5d, 4 ; 4 iterations x 2 rows = 8 rows
+.loop:
+ movzx r2d, byte [r0+r1*1-1] ; left pixels of the two rows
+ movzx r3d, byte [r0+r1*2-1]
+ sub r2d, r4d ; left - topleft per row
+ sub r3d, r4d
+ movd mm2, r2d
+ movd mm4, r3d
+%ifidn %1, mmx
+ punpcklwd mm2, mm2 ; plain-MMX word broadcast
+ punpcklwd mm4, mm4
+ punpckldq mm2, mm2
+ punpckldq mm4, mm4
+%else
+ pshufw mm2, mm2, 0 ; MMXEXT word broadcast
+ pshufw mm4, mm4, 0
+%endif
+ movq mm3, mm2
+ movq mm5, mm4
+ paddw mm2, mm0 ; top + (left - topleft)
+ paddw mm3, mm1
+ paddw mm4, mm0
+ paddw mm5, mm1
+ packuswb mm2, mm3 ; saturating pack clips to 0..255
+ packuswb mm4, mm5
+ movq [r0+r1*1], mm2
+ movq [r0+r1*2], mm4
+ lea r0, [r0+r1*2]
+ dec r5d
+ jg .loop
+ REP_RET
+%endmacro
+
+PRED8x8_TM_MMX mmx
+PRED8x8_TM_MMX mmxext
+
+cglobal pred8x8_tm_vp8_sse2, 2,6,4 ; SSE2 TrueMotion 8x8: both rows packed in one xmm store pair
+ sub r0, r1 ; r0 -> top neighbor row
+ pxor xmm1, xmm1 ; zero for unpacking
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm1 ; 8 top pixels as words
+ movzx r4d, byte [r0-1] ; topleft pixel
+ mov r5d, 4 ; 4 iterations x 2 rows = 8 rows
+.loop:
+ movzx r2d, byte [r0+r1*1-1] ; left pixels of the two rows
+ movzx r3d, byte [r0+r1*2-1]
+ sub r2d, r4d ; left - topleft per row
+ sub r3d, r4d
+ movd xmm2, r2d
+ movd xmm3, r3d
+ pshuflw xmm2, xmm2, 0 ; broadcast the word difference ...
+ pshuflw xmm3, xmm3, 0
+ punpcklqdq xmm2, xmm2 ; ... to all 8 lanes
+ punpcklqdq xmm3, xmm3
+ paddw xmm2, xmm0 ; top + (left - topleft), one row per register
+ paddw xmm3, xmm0
+ packuswb xmm2, xmm3 ; clip; row 0 in low half, row 1 in high half
+ movq [r0+r1*1], xmm2 ; low 8 bytes -> first row
+ movhps [r0+r1*2], xmm2 ; high 8 bytes -> second row
+ lea r0, [r0+r1*2]
+ dec r5d
+ jg .loop
+ REP_RET
+
+cglobal pred8x8_tm_vp8_ssse3, 2,3,6 ; SSSE3 TrueMotion 8x8: pshufb replaces the GPR left/topleft handling
+ sub r0, r1 ; r0 -> top neighbor row
+ movdqa xmm4, [tm_shuf] ; mask {3,0x80}x8: byte 3 to even lanes, zeros to odd -> words of that byte
+ pxor xmm1, xmm1
+ movq xmm0, [r0]
+ punpcklbw xmm0, xmm1 ; 8 top pixels as words
+ movd xmm5, [r0-4] ; 4 bytes ending at topleft (byte 3)
+ pshufb xmm5, xmm4 ; topleft broadcast as 8 words
+ mov r2d, 4 ; 4 iterations x 2 rows = 8 rows
+.loop:
+ movd xmm2, [r0+r1*1-4] ; byte 3 = left pixel of first row
+ movd xmm3, [r0+r1*2-4] ; byte 3 = left pixel of second row
+ pshufb xmm2, xmm4 ; left pixel broadcast as words
+ pshufb xmm3, xmm4
+ psubw xmm2, xmm5 ; left - topleft
+ psubw xmm3, xmm5
+ paddw xmm2, xmm0 ; + top
+ paddw xmm3, xmm0
+ packuswb xmm2, xmm3 ; clip; two rows packed in one register
+ movq [r0+r1*1], xmm2 ; low 8 bytes -> first row
+ movhps [r0+r1*2], xmm2 ; high 8 bytes -> second row
+ lea r0, [r0+r1*2]
+ dec r2d
+ jg .loop
+ REP_RET