summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJason Garrett-Glaser <darkshikari@gmail.com>2010-07-23 02:58:27 +0000
committerJason Garrett-Glaser <darkshikari@gmail.com>2010-07-23 02:58:27 +0000
commit8a467b2d44d20c1a0b731dce9edeff772732a558 (patch)
tree259b69473524a9db002d79b90009817f84af4113
parentef38842f0bc97ce5b158f51f3e65aae4164fc6a5 (diff)
VP8: 30% faster idct_mb
Take shortcuts based on statistically common situations. Add 4-at-a-time idct_dc function (mmx and sse2) since rows of 4 DC-only DCT blocks are common. TODO: tie this more directly into the MB mode, since the DC-level transform is only used for non-splitmv blocks? Originally committed as revision 24452 to svn://svn.ffmpeg.org/ffmpeg/trunk
-rw-r--r--libavcodec/vp8.c64
-rw-r--r--libavcodec/vp8dsp.c26
-rw-r--r--libavcodec/vp8dsp.h1
-rw-r--r--libavcodec/x86/vp8dsp-init.c5
-rw-r--r--libavcodec/x86/vp8dsp.asm181
5 files changed, 190 insertions, 87 deletions
diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index 18b69b8de1..92de1bc605 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -1186,45 +1186,49 @@ static void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
}
}
-static void idct_mb(VP8Context *s, uint8_t *y_dst, uint8_t *u_dst, uint8_t *v_dst,
- VP8Macroblock *mb)
+static void idct_mb(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb)
{
- int x, y, nnz;
+ int x, y, ch, nnz;
- if (mb->mode != MODE_I4x4)
+ if (mb->mode != MODE_I4x4) {
+ uint8_t *y_dst = dst[0];
for (y = 0; y < 4; y++) {
- for (x = 0; x < 4; x++) {
- nnz = s->non_zero_count_cache[y][x];
- if (nnz) {
- if (nnz == 1)
- s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize);
- else
- s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize);
+ uint32_t nnz = AV_RN32A(s->non_zero_count_cache[y]);
+ if (nnz) {
+ if (nnz&~0x01010101) {
+ for (x = 0; x < 4; x++) {
+ nnz = s->non_zero_count_cache[y][x];
+ if (nnz) {
+ if (nnz == 1)
+ s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize);
+ else
+ s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize);
+ }
+ }
+ } else {
+ s->vp8dsp.vp8_idct_dc_add4(y_dst, s->block[y], s->linesize);
}
}
y_dst += 4*s->linesize;
}
+ }
- for (y = 0; y < 2; y++) {
- for (x = 0; x < 2; x++) {
- nnz = s->non_zero_count_cache[4][(y<<1)+x];
- if (nnz) {
- if (nnz == 1)
- s->vp8dsp.vp8_idct_dc_add(u_dst+4*x, s->block[4][(y<<1)+x], s->uvlinesize);
- else
- s->vp8dsp.vp8_idct_add(u_dst+4*x, s->block[4][(y<<1)+x], s->uvlinesize);
- }
-
- nnz = s->non_zero_count_cache[5][(y<<1)+x];
- if (nnz) {
- if (nnz == 1)
- s->vp8dsp.vp8_idct_dc_add(v_dst+4*x, s->block[5][(y<<1)+x], s->uvlinesize);
- else
- s->vp8dsp.vp8_idct_add(v_dst+4*x, s->block[5][(y<<1)+x], s->uvlinesize);
+ for (ch = 0; ch < 2; ch++) {
+ if (AV_RN32A(s->non_zero_count_cache[4+ch])) {
+ uint8_t *ch_dst = dst[1+ch];
+ for (y = 0; y < 2; y++) {
+ for (x = 0; x < 2; x++) {
+ nnz = s->non_zero_count_cache[4+ch][(y<<1)+x];
+ if (nnz) {
+ if (nnz == 1)
+ s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
+ else
+ s->vp8dsp.vp8_idct_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
+ }
+ }
+ ch_dst += 4*s->uvlinesize;
}
}
- u_dst += 4*s->uvlinesize;
- v_dst += 4*s->uvlinesize;
}
}
@@ -1511,7 +1515,7 @@ static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
if (!mb->skip) {
- idct_mb(s, dst[0], dst[1], dst[2], mb);
+ idct_mb(s, dst, mb);
} else {
AV_ZERO64(s->left_nnz);
AV_WN64(s->top_nnz[mb_x], 0); // array of 9, so unaligned
diff --git a/libavcodec/vp8dsp.c b/libavcodec/vp8dsp.c
index 3e6463d598..64b09d52ee 100644
--- a/libavcodec/vp8dsp.c
+++ b/libavcodec/vp8dsp.c
@@ -109,6 +109,25 @@ static void vp8_idct_dc_add_c(uint8_t *dst, DCTELEM block[16], int stride)
}
}
+static void vp8_idct_dc_add4_c(uint8_t *dst, DCTELEM block[4][16], int stride)
+{
+ int i, j;
+ for (j = 0; j < 4; j++) {
+ uint8_t *pix = dst+j*4;
+ int dc = (block[j][0] + 4) >> 3;
+ uint8_t *cm = ff_cropTbl + MAX_NEG_CROP + dc;
+ block[j][0] = 0;
+ if (!dc)
+ continue;
+ for (i = 0; i < 4; i++) {
+ pix[0] = cm[pix[0]];
+ pix[1] = cm[pix[1]];
+ pix[2] = cm[pix[2]];
+ pix[3] = cm[pix[3]];
+ pix += stride;
+ }
+ }
+}
// because I like only having two parameters to pass functions...
#define LOAD_PIXELS\
@@ -460,9 +479,10 @@ VP8_BILINEAR(4)
av_cold void ff_vp8dsp_init(VP8DSPContext *dsp)
{
- dsp->vp8_luma_dc_wht = vp8_luma_dc_wht_c;
- dsp->vp8_idct_add = vp8_idct_add_c;
- dsp->vp8_idct_dc_add = vp8_idct_dc_add_c;
+ dsp->vp8_luma_dc_wht = vp8_luma_dc_wht_c;
+ dsp->vp8_idct_add = vp8_idct_add_c;
+ dsp->vp8_idct_dc_add = vp8_idct_dc_add_c;
+ dsp->vp8_idct_dc_add4 = vp8_idct_dc_add4_c;
dsp->vp8_v_loop_filter16y = vp8_v_loop_filter16_c;
dsp->vp8_h_loop_filter16y = vp8_h_loop_filter16_c;
diff --git a/libavcodec/vp8dsp.h b/libavcodec/vp8dsp.h
index 64a3bfbc57..82ea684eab 100644
--- a/libavcodec/vp8dsp.h
+++ b/libavcodec/vp8dsp.h
@@ -33,6 +33,7 @@ typedef struct VP8DSPContext {
void (*vp8_luma_dc_wht)(DCTELEM block[4][4][16], DCTELEM dc[16]);
void (*vp8_idct_add)(uint8_t *dst, DCTELEM block[16], int stride);
void (*vp8_idct_dc_add)(uint8_t *dst, DCTELEM block[16], int stride);
+ void (*vp8_idct_dc_add4)(uint8_t *dst, DCTELEM block[4][16], int stride);
// loop filter applied to edges between macroblocks
void (*vp8_v_loop_filter16y)(uint8_t *dst, int stride,
diff --git a/libavcodec/x86/vp8dsp-init.c b/libavcodec/x86/vp8dsp-init.c
index 6cf1704594..5da70824fc 100644
--- a/libavcodec/x86/vp8dsp-init.c
+++ b/libavcodec/x86/vp8dsp-init.c
@@ -220,6 +220,8 @@ HVBILIN(ssse3, 8, 16, 16)
extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride);
+extern void ff_vp8_idct_dc_add4_mmx(uint8_t *dst, DCTELEM block[4][16], int stride);
+extern void ff_vp8_idct_dc_add4_sse2(uint8_t *dst, DCTELEM block[4][16], int stride);
extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]);
extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16], int stride);
@@ -283,6 +285,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
#if HAVE_YASM
if (mm_flags & FF_MM_MMX) {
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
+ c->vp8_idct_dc_add4 = ff_vp8_idct_dc_add4_mmx;
c->vp8_idct_add = ff_vp8_idct_add_mmx;
c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx;
c->put_vp8_epel_pixels_tab[0][0][0] =
@@ -351,6 +354,8 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
}
if (mm_flags & FF_MM_SSE2) {
+ c->vp8_idct_dc_add4 = ff_vp8_idct_dc_add4_sse2;
+
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 0cf4771abd..305bd71d08 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -900,75 +900,148 @@ cglobal put_vp8_pixels16_sse, 5,5,2
REP_RET
;-----------------------------------------------------------------------------
-; IDCT functions:
-;
; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
;-----------------------------------------------------------------------------
+%macro ADD_DC 4
+ %4 m2, [r0+%3]
+ %4 m3, [r0+r2+%3]
+ %4 m4, [r1+%3]
+ %4 m5, [r1+r2+%3]
+ paddusb m2, %1
+ paddusb m3, %1
+ paddusb m4, %1
+ paddusb m5, %1
+ psubusb m2, %2
+ psubusb m3, %2
+ psubusb m4, %2
+ psubusb m5, %2
+ %4 [r0+%3], m2
+ %4 [r0+r2+%3], m3
+ %4 [r1+%3], m4
+ %4 [r1+r2+%3], m5
+%endmacro
+
+INIT_MMX
cglobal vp8_idct_dc_add_mmx, 3, 3
; load data
- movd mm0, [r1]
+ movd m0, [r1]
; calculate DC
- paddw mm0, [pw_4]
- pxor mm1, mm1
- psraw mm0, 3
- movd [r1], mm1
- psubw mm1, mm0
- packuswb mm0, mm0
- packuswb mm1, mm1
- punpcklbw mm0, mm0
- punpcklbw mm1, mm1
- punpcklwd mm0, mm0
- punpcklwd mm1, mm1
+ paddw m0, [pw_4]
+ pxor m1, m1
+ psraw m0, 3
+ movd [r1], m1
+ psubw m1, m0
+ packuswb m0, m0
+ packuswb m1, m1
+ punpcklbw m0, m0
+ punpcklbw m1, m1
+ punpcklwd m0, m0
+ punpcklwd m1, m1
; add DC
- lea r1, [r0+r2*2]
- movd mm2, [r0]
- movd mm3, [r0+r2]
- movd mm4, [r1]
- movd mm5, [r1+r2]
- paddusb mm2, mm0
- paddusb mm3, mm0
- paddusb mm4, mm0
- paddusb mm5, mm0
- psubusb mm2, mm1
- psubusb mm3, mm1
- psubusb mm4, mm1
- psubusb mm5, mm1
- movd [r0], mm2
- movd [r0+r2], mm3
- movd [r1], mm4
- movd [r1+r2], mm5
+ lea r1, [r0+r2*2]
+ ADD_DC m0, m1, 0, movh
RET
+INIT_XMM
cglobal vp8_idct_dc_add_sse4, 3, 3, 6
; load data
- movd xmm0, [r1]
- pxor xmm1, xmm1
+ movd m0, [r1]
+ pxor m1, m1
+
+ ; calculate DC
+ paddw m0, [pw_4]
+ movd [r1], m1
+ lea r1, [r0+r2*2]
+ movd m2, [r0]
+ movd m3, [r0+r2]
+ movd m4, [r1]
+ movd m5, [r1+r2]
+ psraw m0, 3
+ pshuflw m0, m0, 0
+ punpcklqdq m0, m0
+ punpckldq m2, m3
+ punpckldq m4, m5
+ punpcklbw m2, m1
+ punpcklbw m4, m1
+ paddw m2, m0
+ paddw m4, m0
+ packuswb m2, m4
+ movd [r0], m2
+ pextrd [r0+r2], m2, 1
+ pextrd [r1], m2, 2
+ pextrd [r1+r2], m2, 3
+ RET
+
+;-----------------------------------------------------------------------------
+; void vp8_idct_dc_add4_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
+;-----------------------------------------------------------------------------
+
+INIT_MMX
+cglobal vp8_idct_dc_add4_mmx, 3, 3
+ ; load data
+ movd m0, [r1+32*0] ; A
+ movd m1, [r1+32*2] ; C
+ punpcklwd m0, [r1+32*1] ; A B
+ punpcklwd m1, [r1+32*3] ; C D
+ punpckldq m0, m1 ; A B C D
+ pxor m6, m6
; calculate DC
- paddw xmm0, [pw_4]
- movd [r1], xmm1
- lea r1, [r0+r2*2]
- movd xmm2, [r0]
- movd xmm3, [r0+r2]
- movd xmm4, [r1]
- movd xmm5, [r1+r2]
- psraw xmm0, 3
- pshuflw xmm0, xmm0, 0
- punpcklqdq xmm0, xmm0
- punpckldq xmm2, xmm3
- punpckldq xmm4, xmm5
- punpcklbw xmm2, xmm1
- punpcklbw xmm4, xmm1
- paddw xmm2, xmm0
- paddw xmm4, xmm0
- packuswb xmm2, xmm4
- movd [r0], xmm2
- pextrd [r0+r2], xmm2, 1
- pextrd [r1], xmm2, 2
- pextrd [r1+r2], xmm2, 3
+ paddw m0, [pw_4]
+ movd [r1+32*0], m6
+ movd [r1+32*1], m6
+ movd [r1+32*2], m6
+ movd [r1+32*3], m6
+ psraw m0, 3
+ psubw m6, m0
+ packuswb m0, m0
+ packuswb m6, m6
+ punpcklbw m0, m0 ; AABBCCDD
+ punpcklbw m6, m6 ; AABBCCDD
+ movq m1, m0
+ movq m7, m6
+ punpcklbw m0, m0 ; AAAABBBB
+ punpckhbw m1, m1 ; CCCCDDDD
+ punpcklbw m6, m6 ; AAAABBBB
+ punpckhbw m7, m7 ; CCCCDDDD
+
+ ; add DC
+ lea r1, [r0+r2*2]
+ ADD_DC m0, m6, 0, mova
+ ADD_DC m1, m7, 8, mova
+ RET
+
+INIT_XMM
+cglobal vp8_idct_dc_add4_sse2, 3, 3
+ ; load data
+ movd m0, [r1+32*0] ; A
+ movd m1, [r1+32*2] ; C
+ punpcklwd m0, [r1+32*1] ; A B
+ punpcklwd m1, [r1+32*3] ; C D
+ punpckldq m0, m1 ; A B C D
+ pxor m1, m1
+
+ ; calculate DC
+ paddw m0, [pw_4]
+ movd [r1+32*0], m1
+ movd [r1+32*1], m1
+ movd [r1+32*2], m1
+ movd [r1+32*3], m1
+ psraw m0, 3
+ psubw m1, m0
+ packuswb m0, m0
+ packuswb m1, m1
+ punpcklbw m0, m0
+ punpcklbw m1, m1
+ punpcklbw m0, m0
+ punpcklbw m1, m1
+
+ ; add DC
+ lea r1, [r0+r2*2]
+ ADD_DC m0, m1, 0, mova
RET
;-----------------------------------------------------------------------------