summary | refs | log | tree | commit | diff
path: root/libavcodec
diff options
context:
space:
mode:
author    Ronald S. Bultje <rsbultje@gmail.com>    2010-07-15 23:02:34 +0000
committer Ronald S. Bultje <rsbultje@gmail.com>    2010-07-15 23:02:34 +0000
commit    a711eb48295dfa6c8556bbd7aa55e7cc4d0e19d6 (patch)
tree      88f3630b06921a1d263d30ae2c07ef59be837379 /libavcodec
parent    751484372d0dbf8428ba327ce5e515005d2e89b5 (diff)
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
Originally committed as revision 24250 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec')
-rw-r--r--  libavcodec/x86/dsputil_mmx.c  |   2
-rw-r--r--  libavcodec/x86/dsputil_mmx.h  |   2
-rw-r--r--  libavcodec/x86/vp8dsp-init.c  |  22
-rw-r--r--  libavcodec/x86/vp8dsp.asm     | 477
4 files changed, 488 insertions, 15 deletions
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index c4202d53d7..57f701a438 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -61,7 +61,7 @@ DECLARE_ALIGNED(8, const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
-DECLARE_ALIGNED(8, const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL;
+DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1 ) = {0x0101010101010101ULL, 0x0101010101010101ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3 ) = {0x0303030303030303ULL, 0x0303030303030303ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4 ) = {0x0404040404040404ULL, 0x0404040404040404ULL};
DECLARE_ALIGNED(8, const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL;
diff --git a/libavcodec/x86/dsputil_mmx.h b/libavcodec/x86/dsputil_mmx.h
index ca3713c6e5..ccf1590cf1 100644
--- a/libavcodec/x86/dsputil_mmx.h
+++ b/libavcodec/x86/dsputil_mmx.h
@@ -47,7 +47,7 @@ extern const uint64_t ff_pw_96;
extern const uint64_t ff_pw_128;
extern const uint64_t ff_pw_255;
-extern const uint64_t ff_pb_1;
+extern const xmm_reg ff_pb_1;
extern const xmm_reg ff_pb_3;
extern const uint64_t ff_pb_7;
extern const uint64_t ff_pb_1F;
diff --git a/libavcodec/x86/vp8dsp-init.c b/libavcodec/x86/vp8dsp-init.c
index 9cd72b6955..edff2883f7 100644
--- a/libavcodec/x86/vp8dsp-init.c
+++ b/libavcodec/x86/vp8dsp-init.c
@@ -229,6 +229,19 @@ extern void ff_vp8_v_loop_filter_simple_sse2 (uint8_t *dst, int stride, int fli
extern void ff_vp8_h_loop_filter_simple_mmx (uint8_t *dst, int stride, int flim);
extern void ff_vp8_h_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim);
extern void ff_vp8_h_loop_filter_simple_sse2 (uint8_t *dst, int stride, int flim);
+
+extern void ff_vp8_v_loop_filter16_inner_mmx (uint8_t *dst, int stride,
+ int e, int i, int hvt);
+extern void ff_vp8_v_loop_filter16_inner_mmxext(uint8_t *dst, int stride,
+ int e, int i, int hvt);
+extern void ff_vp8_v_loop_filter16_inner_sse2 (uint8_t *dst, int stride,
+ int e, int i, int hvt);
+extern void ff_vp8_h_loop_filter16_inner_mmx (uint8_t *dst, int stride,
+ int e, int i, int hvt);
+extern void ff_vp8_h_loop_filter16_inner_mmxext(uint8_t *dst, int stride,
+ int e, int i, int hvt);
+extern void ff_vp8_h_loop_filter16_inner_sse2 (uint8_t *dst, int stride,
+ int e, int i, int hvt);
#endif
#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
@@ -270,6 +283,9 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx;
+
+ c->vp8_v_loop_filter16_inner = ff_vp8_v_loop_filter16_inner_mmx;
+ c->vp8_h_loop_filter16_inner = ff_vp8_h_loop_filter16_inner_mmx;
}
/* note that 4-tap width=16 functions are missing because w=16
@@ -285,6 +301,9 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext;
+
+ c->vp8_v_loop_filter16_inner = ff_vp8_v_loop_filter16_inner_mmxext;
+ c->vp8_h_loop_filter16_inner = ff_vp8_h_loop_filter16_inner_mmxext;
}
if (mm_flags & FF_MM_SSE) {
@@ -300,6 +319,9 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
+
+ c->vp8_v_loop_filter16_inner = ff_vp8_v_loop_filter16_inner_sse2;
+ c->vp8_h_loop_filter16_inner = ff_vp8_h_loop_filter16_inner_sse2;
}
if (mm_flags & FF_MM_SSSE3) {
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index aedd09e5ac..5d855b8e4d 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -145,6 +145,7 @@ filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
pw_20091: times 4 dw 20091
pw_17734: times 4 dw 17734
+cextern pb_1
cextern pw_3
cextern pb_3
cextern pw_4
@@ -1202,6 +1203,20 @@ cglobal vp8_luma_dc_wht_mmxext, 2,3
movd [%7+%9*2], m%4
%endmacro
+%macro SPLATB_REG 3
+ movd %1, %2
+ punpcklbw %1, %1
+%if mmsize == 16 ; sse2
+ punpcklwd %1, %1
+ pshufd %1, %1, 0x0
+%elifidn %3, mmx
+ punpcklwd %1, %1
+ punpckldq %1, %1
+%else ; mmxext
+ pshufw %1, %1, 0x0
+%endif
+%endmacro
+
%macro SIMPLE_LOOPFILTER 3
cglobal vp8_%2_loop_filter_simple_%1, 3, %3
%ifidn %2, h
@@ -1211,19 +1226,7 @@ cglobal vp8_%2_loop_filter_simple_%1, 3, %3
%if mmsize == 8 ; mmx/mmxext
mov r3, 2
%endif
-
- ; splat register with "flim"
- movd m7, r2
- punpcklbw m7, m7
-%if mmsize == 16 ; sse2
- punpcklwd m7, m7
- pshufd m7, m7, 0x0
-%elifidn %1, mmx
- punpcklwd m7, m7
- punpckldq m7, m7
-%else ; mmxext
- pshufw m7, m7, 0x0
-%endif
+ SPLATB_REG m7, r2, %1 ; splat "flim" into register
; set up indexes to address 4 rows
mov r2, r1
@@ -1369,3 +1372,451 @@ SIMPLE_LOOPFILTER mmxext, h, 6
INIT_XMM
SIMPLE_LOOPFILTER sse2, v, 3
SIMPLE_LOOPFILTER sse2, h, 6
+
+;-----------------------------------------------------------------------------
+; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, int stride,
+; int flimE, int flimI, int hev_thr);
+;-----------------------------------------------------------------------------
+
+%macro INNER_LOOPFILTER 4
+cglobal vp8_%2_loop_filter16_inner_%1, 5, %3, %4
+%ifndef m8 ; mmx/mmxext or sse2 on x86-32
+ ; splat function arguments
+ SPLATB_REG m0, r2, %1 ; E
+ SPLATB_REG m1, r3, %1 ; I
+ SPLATB_REG m2, r4, %1 ; hev_thresh
+
+ ; align stack
+ mov r4, rsp ; backup stack pointer
+ and rsp, ~(mmsize-1) ; align stack
+%ifidn %2, v
+ sub rsp, mmsize * 4 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
+ ; [3]=hev() result
+%else ; h
+ sub rsp, mmsize * 6 ; extra storage space for transposes
+%endif
+
+%define flim_E [rsp]
+%define flim_I [rsp+mmsize]
+%define hev_thr [rsp+mmsize*2]
+%define mask_res [rsp+mmsize*3]
+
+ mova flim_E, m0
+ mova flim_I, m1
+ mova hev_thr, m2
+
+%else ; sse2 on x86-64
+
+%define flim_E m9
+%define flim_I m10
+%define hev_thr m11
+%define mask_res m12
+
+ ; splat function arguments
+ SPLATB_REG flim_E, r2, %1 ; E
+ SPLATB_REG flim_I, r3, %1 ; I
+ SPLATB_REG hev_thr, r4, %1 ; hev_thresh
+%endif
+
+%if mmsize == 8 ; mmx/mmxext
+ mov r5, 2
+%endif
+ mov r2, r1
+ neg r1
+%ifidn %2, h
+ lea r0, [r0+4*r2-4]
+%endif
+
+%if mmsize == 8
+.next8px
+%endif
+ ; read
+ lea r3, [r0+r2]
+%ifidn %2, v
+ mova m0, [r0+r1*4] ; p3
+ mova m1, [r3+r1*4] ; p2
+ mova m2, [r0+r1*2] ; p1
+ mova m5, [r3] ; q1
+ mova m6, [r3+r2] ; q2
+ mova m7, [r3+r2*2] ; q3
+%elif mmsize == 8 ; mmx/mmxext (h)
+ ; read 8 rows of 8px each
+ movu m0, [r0+r1*4]
+ movu m1, [r3+r1*4]
+ movu m2, [r0+r1*2]
+ movu m3, [r0+r1]
+ movu m4, [r0]
+ movu m5, [r3]
+ movu m6, [r3+r2]
+
+ ; 8x8 transpose
+ TRANSPOSE4x4B 0, 1, 2, 3, 7
+%ifdef m13
+ SWAP 1, 13
+%else
+ mova [rsp+mmsize*4], m1
+%endif
+ movu m7, [r3+r2*2]
+ TRANSPOSE4x4B 4, 5, 6, 7, 1
+ SBUTTERFLY dq, 0, 4, 1 ; p3/p2
+ SBUTTERFLY dq, 2, 6, 1 ; q0/q1
+ SBUTTERFLY dq, 3, 7, 1 ; q2/q3
+%ifdef m13
+ SWAP 1, 13
+ SWAP 2, 13
+%else
+ mova m1, [rsp+mmsize*4]
+ mova [rsp+mmsize*4], m2 ; store q0
+%endif
+ SBUTTERFLY dq, 1, 5, 2 ; p1/p0
+%ifdef m14
+ SWAP 5, 14
+%else
+ mova [rsp+mmsize*5], m5 ; store p0
+%endif
+ SWAP 1, 4
+ SWAP 2, 4
+ SWAP 6, 3
+ SWAP 5, 3
+%else ; sse2 (h)
+ lea r5, [r0+r2*8]
+
+ ; read 16 rows of 8px each, interleave
+ movh m0, [r0+r1*4]
+ movh m1, [r5+r1*4]
+ movh m2, [r0+r1*2]
+ movh m5, [r5+r1*2]
+ movh m3, [r0+r1]
+ movh m6, [r5+r1]
+ movh m4, [r0]
+ movh m7, [r5]
+ punpcklbw m0, m1 ; A/I
+ punpcklbw m2, m5 ; C/K
+ punpcklbw m3, m6 ; D/L
+ punpcklbw m4, m7 ; E/M
+
+ add r5, r2
+ movh m1, [r3+r1*4]
+ movh m6, [r5+r1*4]
+ movh m5, [r3]
+ movh m7, [r5]
+ punpcklbw m1, m6 ; B/J
+ punpcklbw m5, m7 ; F/N
+ movh m6, [r3+r2]
+ movh m7, [r5+r2]
+ punpcklbw m6, m7 ; G/O
+
+ ; 8x16 transpose
+ TRANSPOSE4x4B 0, 1, 2, 3, 7
+%ifdef m13
+ SWAP 1, 13
+%else
+ mova [rsp+mmsize*4], m1
+%endif
+ movh m7, [r3+r2*2]
+ movh m1, [r5+r2*2]
+ punpcklbw m7, m1 ; H/P
+ TRANSPOSE4x4B 4, 5, 6, 7, 1
+ SBUTTERFLY dq, 0, 4, 1 ; p3/p2
+ SBUTTERFLY dq, 2, 6, 1 ; q0/q1
+ SBUTTERFLY dq, 3, 7, 1 ; q2/q3
+%ifdef m13
+ SWAP 1, 13
+ SWAP 2, 13
+%else
+ mova m1, [rsp+mmsize*4]
+ mova [rsp+mmsize*4], m2 ; store q0
+%endif
+ SBUTTERFLY dq, 1, 5, 2 ; p1/p0
+%ifdef m14
+ SWAP 5, 14
+%else
+ mova [rsp+mmsize*5], m5 ; store p0
+%endif
+ SWAP 1, 4
+ SWAP 2, 4
+ SWAP 6, 3
+ SWAP 5, 3
+%endif
+
+ ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
+ mova m4, m1
+ SWAP 4, 1
+ psubusb m4, m0 ; p2-p3
+ psubusb m0, m1 ; p3-p2
+ por m0, m4 ; abs(p3-p2)
+
+ mova m4, m2
+ SWAP 4, 2
+ psubusb m4, m1 ; p1-p2
+ psubusb m1, m2 ; p2-p1
+ por m1, m4 ; abs(p2-p1)
+
+ mova m4, m6
+ SWAP 4, 6
+ psubusb m4, m7 ; q2-q3
+ psubusb m7, m6 ; q3-q2
+ por m7, m4 ; abs(q3-q2)
+
+ mova m4, m5
+ SWAP 4, 5
+ psubusb m4, m6 ; q1-q2
+ psubusb m6, m5 ; q2-q1
+ por m6, m4 ; abs(q2-q1)
+
+%ifidn %1, mmx
+%ifdef m10
+ SWAP 4, 10
+%else
+ mova m4, [rsp+mmsize]
+%endif
+ pxor m3, m3
+ psubusb m0, m4
+ psubusb m1, m4
+ psubusb m7, m4
+ psubusb m6, m4
+ pcmpeqb m0, m3 ; abs(p3-p2) <= I
+ pcmpeqb m1, m3 ; abs(p2-p1) <= I
+ pcmpeqb m7, m3 ; abs(q3-q2) <= I
+ pcmpeqb m6, m3 ; abs(q2-q1) <= I
+ pand m0, m1
+ pand m7, m6
+ pand m0, m7
+%else ; mmxext/sse2
+ pmaxub m0, m1
+ pmaxub m6, m7
+ pmaxub m0, m6
+%endif
+
+ ; normal_limit and high_edge_variance for p1-p0, q1-q0
+ SWAP 7, 3 ; now m7 is zero
+%ifidn %2, v
+ mova m3, [r0+r1] ; p0
+%elifdef m14
+ SWAP 3, 14
+%else
+ mova m3, [rsp+mmsize*5]
+%endif
+
+ mova m1, m2
+ SWAP 1, 2
+ mova m6, m3
+ SWAP 3, 6
+ psubusb m1, m3 ; p1-p0
+ psubusb m6, m2 ; p0-p1
+ por m1, m6 ; abs(p1-p0)
+%ifidn %1, mmx
+ mova m6, m1
+ psubusb m1, m4
+ psubusb m6, hev_thr
+ pcmpeqb m1, m7 ; abs(p1-p0) <= I
+ pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
+ pand m0, m1
+%ifdef m12
+ SWAP 6, 12
+%else
+ mova [rsp+mmsize*3], m6
+%endif
+%else ; mmxext/sse2
+ pmaxub m0, m1 ; max_I
+ SWAP 1, 4 ; max_hev_thresh
+%endif
+
+ SWAP 6, 4 ; now m6 is I
+%ifidn %2, v
+ mova m4, [r0] ; q0
+%elifdef m13
+ SWAP 4, 13
+%else
+ mova m4, [rsp+mmsize*4]
+%endif
+ mova m1, m4
+ SWAP 1, 4
+ mova m7, m5
+ SWAP 7, 5
+ psubusb m1, m5 ; q0-q1
+ psubusb m7, m4 ; q1-q0
+ por m1, m7 ; abs(q1-q0)
+%ifidn %1, mmx
+ mova m7, m1
+ psubusb m1, m6
+ psubusb m7, hev_thr
+ pxor m6, m6
+ pcmpeqb m1, m6 ; abs(q1-q0) <= I
+ pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
+%ifdef m12
+ SWAP 6, 12
+%else
+ mova m6, [rsp+mmsize*3]
+%endif
+ pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
+ pand m6, m7
+%else ; mmxext/sse2
+ pxor m7, m7
+ pmaxub m0, m1
+ pmaxub m6, m1
+ psubusb m0, flim_I
+ psubusb m6, hev_thr
+ pcmpeqb m0, m7 ; max(abs(..)) <= I
+ pcmpeqb m6, m7 ; !(max(abs..) > thresh)
+%endif
+%ifdef m12
+ SWAP 6, 12
+%else
+ mova [rsp+mmsize*3], m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
+%endif
+
+ ; simple_limit
+ mova m1, m3
+ SWAP 1, 3
+ mova m6, m4 ; keep copies of p0/q0 around for later use
+ SWAP 6, 4
+ psubusb m1, m4 ; p0-q0
+ psubusb m6, m3 ; q0-p0
+ por m1, m6 ; abs(q0-p0)
+ paddusb m1, m1 ; m1=2*abs(q0-p0)
+
+ mova m7, m2
+ SWAP 7, 2
+ mova m6, m5
+ SWAP 6, 5
+ psubusb m7, m5 ; p1-q1
+ psubusb m6, m2 ; q1-p1
+ por m7, m6 ; abs(q1-p1)
+ pxor m6, m6
+ pand m7, [pb_FE]
+ psrlq m7, 1 ; abs(q1-p1)/2
+ paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
+ psubusb m7, flim_E
+ pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
+ pand m0, m7 ; normal_limit result
+
+ ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
+%ifdef m8 ; x86-64 && sse2
+ mova m8, [pb_80]
+%define pb_80_var m8
+%else ; x86-32 or mmx/mmxext
+%define pb_80_var [pb_80]
+%endif
+ mova m1, m4
+ mova m7, m3
+ pxor m1, pb_80_var
+ pxor m7, pb_80_var
+ psubsb m1, m7 ; (signed) q0-p0
+ mova m6, m2
+ mova m7, m5
+ pxor m6, pb_80_var
+ pxor m7, pb_80_var
+ psubsb m6, m7 ; (signed) p1-q1
+ mova m7, mask_res
+ pandn m7, m6
+ paddsb m7, m1
+ paddsb m7, m1
+ paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1)
+
+ pand m7, m0
+ mova m1, [pb_F8]
+ mova m6, m7
+ paddsb m7, [pb_3]
+ paddsb m6, [pb_4]
+ pand m7, m1
+ pand m6, m1
+
+ pxor m1, m1
+ pxor m0, m0
+ pcmpgtb m1, m7
+ psubb m0, m7
+ psrlq m7, 3 ; +f2
+ psrlq m0, 3 ; -f2
+ pand m0, m1
+ pandn m1, m7
+ psubusb m3, m0
+ paddusb m3, m1 ; p0+f2
+
+ pxor m1, m1
+ pxor m0, m0
+ pcmpgtb m0, m6
+ psubb m1, m6
+ psrlq m6, 3 ; +f1
+ psrlq m1, 3 ; -f1
+ pand m1, m0
+ pandn m0, m6
+ psubusb m4, m0
+ paddusb m4, m1 ; q0-f1
+
+%ifdef m12
+ SWAP 6, 12
+%else
+ mova m6, [rsp+mmsize*3]
+%endif
+%ifidn %1, mmx
+ mova m7, [pb_1]
+%else ; mmxext/sse2
+ pxor m7, m7
+%endif
+ pand m0, m6
+ pand m1, m6
+%ifidn %1, mmx
+ paddusb m0, m7
+ pand m1, [pb_FE]
+ pandn m7, m0
+ psrlq m1, 1
+ psrlq m7, 1
+ SWAP 0, 7
+%else ; mmxext/sse2
+ psubusb m1, [pb_1]
+ pavgb m0, m7 ; a
+ pavgb m1, m7 ; -a
+%endif
+ psubusb m5, m0
+ psubusb m2, m1
+ paddusb m5, m1 ; q1-a
+ paddusb m2, m0 ; p1+a
+
+ ; store
+%ifidn %2, v
+ mova [r0+r1*2], m2
+ mova [r0+r1], m3
+ mova [r0], m4
+ mova [r0+r2], m5
+%else ; h
+ add r0, 2
+ add r3, 2
+
+ ; 4x8/16 transpose
+ TRANSPOSE4x4B 2, 3, 4, 5, 6
+
+%if mmsize == 8 ; mmx/mmxext (h)
+ WRITE_4x2D 2, 3, 4, 5, r0, r3, r1, r2
+%else ; sse2 (h)
+ lea r5, [r5+r1+2]
+ WRITE_4x4D 2, 3, 4, 5, r0, r3, r5, r1, r2
+%endif
+%endif
+
+%if mmsize == 8
+%ifidn %2, h
+ lea r0, [r0+8*r2-2]
+%else ; v
+ add r0, 8
+%endif
+ dec r5
+ jg .next8px
+%endif
+
+%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
+ mov rsp, r4 ; restore stack pointer
+ RET
+%else ; sse2 on x86-64
+ REP_RET
+%endif
+%endmacro
+
+INIT_MMX
+INNER_LOOPFILTER mmx, v, 6, 8
+INNER_LOOPFILTER mmx, h, 6, 8
+INNER_LOOPFILTER mmxext, v, 6, 8
+INNER_LOOPFILTER mmxext, h, 6, 8
+INIT_XMM
+INNER_LOOPFILTER sse2, v, 5, 13
+INNER_LOOPFILTER sse2, h, 6, 15