summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--libavcodec/dsputil.c4
-rw-r--r--libavcodec/dsputil.h8
-rw-r--r--libavcodec/h264idct.c52
-rw-r--r--libavcodec/i386/dsputil_mmx.c14
-rw-r--r--libavcodec/i386/h264dsp_mmx.c95
5 files changed, 171 insertions, 2 deletions
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index 386d54a4d5..b3b54d9d6f 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -4281,6 +4281,10 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
c->h264_idct8_add= ff_h264_idct8_add_c;
c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
+ c->h264_idct_add16 = ff_h264_idct_add16_c;
+ c->h264_idct8_add4 = ff_h264_idct8_add4_c;
+ c->h264_idct_add8 = ff_h264_idct_add8_c;
+ c->h264_idct_add16intra= ff_h264_idct_add16intra_c;
}
c->get_pixels = get_pixels_c;
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index c94e443c30..aeb467f105 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -60,6 +60,10 @@ void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride);
void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride);
void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block);
void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block);
+void ff_h264_idct_add16_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
+void ff_h264_idct_add16intra_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
+void ff_h264_idct8_add4_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
+void ff_h264_idct_add8_c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1,
const float *src2, int src3, int blocksize, int step);
@@ -441,6 +445,10 @@ typedef struct DSPContext {
void (*h264_idct_dc_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride);
void (*h264_idct8_dc_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
void (*h264_dct)(DCTELEM block[4][4]);
+ void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
+ void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
+ void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
+ void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
/* snow wavelet */
void (*vertical_compose97i)(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
diff --git a/libavcodec/h264idct.c b/libavcodec/h264idct.c
index 571e2e91d1..7ef7a2a4d5 100644
--- a/libavcodec/h264idct.c
+++ b/libavcodec/h264idct.c
@@ -165,3 +165,55 @@ void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride){
dst += stride;
}
}
+
+//FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split
+static const uint8_t scan8[16 + 2*4]={
+ 4+1*8, 5+1*8, 4+2*8, 5+2*8,
+ 6+1*8, 7+1*8, 6+2*8, 7+2*8,
+ 4+3*8, 5+3*8, 4+4*8, 5+4*8,
+ 6+3*8, 7+3*8, 6+4*8, 7+4*8,
+ 1+1*8, 2+1*8,
+ 1+2*8, 2+2*8,
+ 1+4*8, 2+4*8,
+ 1+5*8, 2+5*8,
+};
+
+void ff_h264_idct_add16_c(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i++){
+ int nnz = nnzc[ scan8[i] ];
+ if(nnz){
+ if(nnz==1 && block[i*16]) ff_h264_idct_dc_add_c(dst + block_offset[i], block + i*16, stride);
+ else idct_internal (dst + block_offset[i], block + i*16, stride, 4, 6, 1);
+ }
+ }
+}
+
+void ff_h264_idct_add16intra_c(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i++){
+ if(nnzc[ scan8[i] ]) idct_internal (dst + block_offset[i], block + i*16, stride, 4, 6, 1);
+ else if(block[i*16]) ff_h264_idct_dc_add_c(dst + block_offset[i], block + i*16, stride);
+ }
+}
+
+void ff_h264_idct8_add4_c(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i+=4){
+ int nnz = nnzc[ scan8[i] ];
+ if(nnz){
+ if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_c(dst + block_offset[i], block + i*16, stride);
+ else ff_h264_idct8_add_c (dst + block_offset[i], block + i*16, stride);
+ }
+ }
+}
+
+void ff_h264_idct_add8_c(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=16; i<16+8; i++){
+ if(nnzc[ scan8[i] ])
+ ff_h264_idct_add_c (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
+ else if(block[i*16])
+ ff_h264_idct_dc_add_c(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
+ }
+}
diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c
index 19bdd3a655..3560720eea 100644
--- a/libavcodec/i386/dsputil_mmx.c
+++ b/libavcodec/i386/dsputil_mmx.c
@@ -2629,8 +2629,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->h264_idct_add= ff_h264_idct_add_mmx;
c->h264_idct8_dc_add=
c->h264_idct8_add= ff_h264_idct8_add_mmx;
- if (mm_flags & FF_MM_SSE2)
- c->h264_idct8_add= ff_h264_idct8_add_sse2;
+
+ c->h264_idct_add16 = ff_h264_idct_add16_mmx;
+ c->h264_idct8_add4 = ff_h264_idct8_add4_mmx;
+ c->h264_idct_add8 = ff_h264_idct_add8_mmx;
+ c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx;
if (mm_flags & FF_MM_MMXEXT) {
c->prefetch = prefetch_mmx2;
@@ -2651,6 +2654,10 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
+ c->h264_idct_add16 = ff_h264_idct_add16_mmx2;
+ c->h264_idct8_add4 = ff_h264_idct8_add4_mmx2;
+ c->h264_idct_add8 = ff_h264_idct_add8_mmx2;
+ c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2;
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
@@ -2807,6 +2814,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
H264_QPEL_FUNCS(0, 0, sse2);
}
if(mm_flags & FF_MM_SSE2){
+ c->h264_idct8_add = ff_h264_idct8_add_sse2;
+ c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
+
H264_QPEL_FUNCS(0, 1, sse2);
H264_QPEL_FUNCS(0, 2, sse2);
H264_QPEL_FUNCS(0, 3, sse2);
diff --git a/libavcodec/i386/h264dsp_mmx.c b/libavcodec/i386/h264dsp_mmx.c
index bb9c82d612..7d19f995ec 100644
--- a/libavcodec/i386/h264dsp_mmx.c
+++ b/libavcodec/i386/h264dsp_mmx.c
@@ -376,6 +376,101 @@ static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
}
}
+//FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split
+static const uint8_t scan8[16 + 2*4]={
+ 4+1*8, 5+1*8, 4+2*8, 5+2*8,
+ 6+1*8, 7+1*8, 6+2*8, 7+2*8,
+ 4+3*8, 5+3*8, 4+4*8, 5+4*8,
+ 6+3*8, 7+3*8, 6+4*8, 7+4*8,
+ 1+1*8, 2+1*8,
+ 1+2*8, 2+2*8,
+ 1+4*8, 2+4*8,
+ 1+5*8, 2+5*8,
+};
+
+static void ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i++){
+ if(nnzc[ scan8[i] ])
+ ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride);
+ }
+}
+
+static void ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i+=4){
+ if(nnzc[ scan8[i] ])
+ ff_h264_idct8_add_mmx(dst + block_offset[i], block + i*16, stride);
+ }
+}
+
+
+static void ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i++){
+ int nnz = nnzc[ scan8[i] ];
+ if(nnz){
+ if(nnz==1 && block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
+ else ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride);
+ }
+ }
+}
+
+static void ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i++){
+ if(nnzc[ scan8[i] ] || block[i*16])
+ ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride);
+ }
+}
+
+static void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i++){
+ if(nnzc[ scan8[i] ]) ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride);
+ else if(block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
+ }
+}
+
+static void ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i+=4){
+ int nnz = nnzc[ scan8[i] ];
+ if(nnz){
+ if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
+ else ff_h264_idct8_add_mmx (dst + block_offset[i], block + i*16, stride);
+ }
+ }
+}
+
+static void ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=0; i<16; i+=4){
+ int nnz = nnzc[ scan8[i] ];
+ if(nnz){
+ if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
+ else ff_h264_idct8_add_sse2 (dst + block_offset[i], block + i*16, stride);
+ }
+ }
+}
+
+static void ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=16; i<16+8; i++){
+ if(nnzc[ scan8[i] ] || block[i*16])
+ ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
+ }
+}
+
+static void ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
+ int i;
+ for(i=16; i<16+8; i++){
+ if(nnzc[ scan8[i] ])
+ ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
+ else if(block[i*16])
+ ff_h264_idct_dc_add_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
+ }
+}
/***********************************/
/* deblocking */