summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--libavcodec/dsputil.h4
-rw-r--r--libavcodec/vc1.c4
-rw-r--r--libavcodec/vc1dec.c23
-rw-r--r--libavcodec/vc1dsp.c76
-rw-r--r--libavcodec/x86/vc1dsp_mmx.c203
5 files changed, 305 insertions, 5 deletions
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index f54d74d285..2d15bd3538 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -486,6 +486,10 @@ typedef struct DSPContext {
void (*vc1_inv_trans_8x4)(uint8_t *dest, int line_size, DCTELEM *block);
void (*vc1_inv_trans_4x8)(uint8_t *dest, int line_size, DCTELEM *block);
void (*vc1_inv_trans_4x4)(uint8_t *dest, int line_size, DCTELEM *block);
+ void (*vc1_inv_trans_8x8_dc)(uint8_t *dest, int line_size, DCTELEM *block);
+ void (*vc1_inv_trans_8x4_dc)(uint8_t *dest, int line_size, DCTELEM *block);
+ void (*vc1_inv_trans_4x8_dc)(uint8_t *dest, int line_size, DCTELEM *block);
+ void (*vc1_inv_trans_4x4_dc)(uint8_t *dest, int line_size, DCTELEM *block);
void (*vc1_v_overlap)(uint8_t* src, int stride);
void (*vc1_h_overlap)(uint8_t* src, int stride);
void (*vc1_v_loop_filter4)(uint8_t *src, int stride, int pq);
diff --git a/libavcodec/vc1.c b/libavcodec/vc1.c
index 57982444fe..ec6a660ad9 100644
--- a/libavcodec/vc1.c
+++ b/libavcodec/vc1.c
@@ -337,6 +337,10 @@ int vc1_decode_sequence_header(AVCodecContext *avctx, VC1Context *v, GetBitConte
v->s.dsp.vc1_inv_trans_8x4 = ff_simple_idct84_add;
v->s.dsp.vc1_inv_trans_4x8 = ff_simple_idct48_add;
v->s.dsp.vc1_inv_trans_4x4 = ff_simple_idct44_add;
+ v->s.dsp.vc1_inv_trans_8x8_dc = ff_simple_idct_add;
+ v->s.dsp.vc1_inv_trans_8x4_dc = ff_simple_idct84_add;
+ v->s.dsp.vc1_inv_trans_4x8_dc = ff_simple_idct48_add;
+ v->s.dsp.vc1_inv_trans_4x4_dc = ff_simple_idct44_add;
}
v->fastuvmc = get_bits1(gb); //common
diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c
index 5d4dd0633b..6172e0c047 100644
--- a/libavcodec/vc1dec.c
+++ b/libavcodec/vc1dec.c
@@ -2028,8 +2028,12 @@ static int vc1_decode_p_block(VC1Context *v, DCTELEM block[64], int n, int mquan
block[idx] += (block[idx] < 0) ? -mquant : mquant;
}
if(!skip_block){
- s->dsp.vc1_inv_trans_8x8(block);
- s->dsp.add_pixels_clamped(block, dst, linesize);
+ if(i==1)
+ s->dsp.vc1_inv_trans_8x8_dc(dst, linesize, block);
+ else{
+ s->dsp.vc1_inv_trans_8x8(block);
+ s->dsp.add_pixels_clamped(block, dst, linesize);
+ }
if(apply_filter && cbp_top & 0xC)
s->dsp.vc1_v_loop_filter8(dst, linesize, v->pq);
if(apply_filter && cbp_left & 0xA)
@@ -2053,7 +2057,10 @@ static int vc1_decode_p_block(VC1Context *v, DCTELEM block[64], int n, int mquan
block[idx + off] += (block[idx + off] < 0) ? -mquant : mquant;
}
if(!(subblkpat & (1 << (3 - j))) && !skip_block){
- s->dsp.vc1_inv_trans_4x4(dst + (j&1)*4 + (j&2)*2*linesize, linesize, block + off);
+ if(i==1)
+ s->dsp.vc1_inv_trans_4x4_dc(dst + (j&1)*4 + (j&2)*2*linesize, linesize, block + off);
+ else
+ s->dsp.vc1_inv_trans_4x4(dst + (j&1)*4 + (j&2)*2*linesize, linesize, block + off);
if(apply_filter && (j&2 ? pat & (1<<(j-2)) : (cbp_top & (1 << (j + 2)))))
s->dsp.vc1_v_loop_filter4(dst + (j&1)*4 + (j&2)*2*linesize, linesize, v->pq);
if(apply_filter && (j&1 ? pat & (1<<(j-1)) : (cbp_left & (1 << (j + 1)))))
@@ -2078,7 +2085,10 @@ static int vc1_decode_p_block(VC1Context *v, DCTELEM block[64], int n, int mquan
block[idx] += (block[idx] < 0) ? -mquant : mquant;
}
if(!(subblkpat & (1 << (1 - j))) && !skip_block){
- s->dsp.vc1_inv_trans_8x4(dst + j*4*linesize, linesize, block + off);
+ if(i==1)
+ s->dsp.vc1_inv_trans_8x4_dc(dst + j*4*linesize, linesize, block + off);
+ else
+ s->dsp.vc1_inv_trans_8x4(dst + j*4*linesize, linesize, block + off);
if(apply_filter && j ? pat & 0x3 : (cbp_top & 0xC))
s->dsp.vc1_v_loop_filter8(dst + j*4*linesize, linesize, v->pq);
if(apply_filter && cbp_left & (2 << j))
@@ -2103,7 +2113,10 @@ static int vc1_decode_p_block(VC1Context *v, DCTELEM block[64], int n, int mquan
block[idx] += (block[idx] < 0) ? -mquant : mquant;
}
if(!(subblkpat & (1 << (1 - j))) && !skip_block){
- s->dsp.vc1_inv_trans_4x8(dst + j*4, linesize, block + off);
+ if(i==1)
+ s->dsp.vc1_inv_trans_4x8_dc(dst + j*4, linesize, block + off);
+ else
+ s->dsp.vc1_inv_trans_4x8(dst + j*4, linesize, block + off);
if(apply_filter && cbp_top & (2 << j))
s->dsp.vc1_v_loop_filter4(dst + j*4, linesize, v->pq);
if(apply_filter && j ? pat & 0x5 : (cbp_left & 0xA))
diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c
index 5773ab1e3c..5e79736a50 100644
--- a/libavcodec/vc1dsp.c
+++ b/libavcodec/vc1dsp.c
@@ -178,6 +178,26 @@ static void vc1_h_loop_filter16_c(uint8_t *src, int stride, int pq)
/** Do inverse transform on 8x8 block
*/
+static void vc1_inv_trans_8x8_dc_c(uint8_t *dest, int linesize, DCTELEM *block)
+{
+ int i;
+ int dc = block[0];
+ const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
+ dc = (3 * dc + 1) >> 1;
+ dc = (3 * dc + 16) >> 5;
+ for(i = 0; i < 8; i++){
+ dest[0] = cm[dest[0]+dc];
+ dest[1] = cm[dest[1]+dc];
+ dest[2] = cm[dest[2]+dc];
+ dest[3] = cm[dest[3]+dc];
+ dest[4] = cm[dest[4]+dc];
+ dest[5] = cm[dest[5]+dc];
+ dest[6] = cm[dest[6]+dc];
+ dest[7] = cm[dest[7]+dc];
+ dest += linesize;
+ }
+}
+
static void vc1_inv_trans_8x8_c(DCTELEM block[64])
{
int i;
@@ -249,6 +269,26 @@ static void vc1_inv_trans_8x8_c(DCTELEM block[64])
/** Do inverse transform on 8x4 part of block
*/
+static void vc1_inv_trans_8x4_dc_c(uint8_t *dest, int linesize, DCTELEM *block)
+{
+ int i;
+ int dc = block[0];
+ const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
+ dc = ( 3 * dc + 1) >> 1;
+ dc = (17 * dc + 64) >> 7;
+ for(i = 0; i < 4; i++){
+ dest[0] = cm[dest[0]+dc];
+ dest[1] = cm[dest[1]+dc];
+ dest[2] = cm[dest[2]+dc];
+ dest[3] = cm[dest[3]+dc];
+ dest[4] = cm[dest[4]+dc];
+ dest[5] = cm[dest[5]+dc];
+ dest[6] = cm[dest[6]+dc];
+ dest[7] = cm[dest[7]+dc];
+ dest += linesize;
+ }
+}
+
static void vc1_inv_trans_8x4_c(uint8_t *dest, int linesize, DCTELEM *block)
{
int i;
@@ -306,6 +346,22 @@ static void vc1_inv_trans_8x4_c(uint8_t *dest, int linesize, DCTELEM *block)
/** Do inverse transform on 4x8 parts of block
*/
+static void vc1_inv_trans_4x8_dc_c(uint8_t *dest, int linesize, DCTELEM *block)
+{
+ int i;
+ int dc = block[0];
+ const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
+ dc = (17 * dc + 4) >> 3;
+ dc = (12 * dc + 64) >> 7;
+ for(i = 0; i < 8; i++){
+ dest[0] = cm[dest[0]+dc];
+ dest[1] = cm[dest[1]+dc];
+ dest[2] = cm[dest[2]+dc];
+ dest[3] = cm[dest[3]+dc];
+ dest += linesize;
+ }
+}
+
static void vc1_inv_trans_4x8_c(uint8_t *dest, int linesize, DCTELEM *block)
{
int i;
@@ -363,6 +419,22 @@ static void vc1_inv_trans_4x8_c(uint8_t *dest, int linesize, DCTELEM *block)
/** Do inverse transform on 4x4 part of block
*/
+static void vc1_inv_trans_4x4_dc_c(uint8_t *dest, int linesize, DCTELEM *block)
+{
+ int i;
+ int dc = block[0];
+ const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
+ dc = (17 * dc + 4) >> 3;
+ dc = (17 * dc + 64) >> 7;
+ for(i = 0; i < 4; i++){
+ dest[0] = cm[dest[0]+dc];
+ dest[1] = cm[dest[1]+dc];
+ dest[2] = cm[dest[2]+dc];
+ dest[3] = cm[dest[3]+dc];
+ dest += linesize;
+ }
+}
+
static void vc1_inv_trans_4x4_c(uint8_t *dest, int linesize, DCTELEM *block)
{
int i;
@@ -545,6 +617,10 @@ void ff_vc1dsp_init(DSPContext* dsp, AVCodecContext *avctx) {
dsp->vc1_inv_trans_4x8 = vc1_inv_trans_4x8_c;
dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_c;
dsp->vc1_inv_trans_4x4 = vc1_inv_trans_4x4_c;
+ dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_c;
+ dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_c;
+ dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_c;
+ dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_c;
dsp->vc1_h_overlap = vc1_h_overlap_c;
dsp->vc1_v_overlap = vc1_v_overlap_c;
dsp->vc1_v_loop_filter4 = vc1_v_loop_filter4_c;
diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
index 2c98b37b97..3071d02808 100644
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -494,6 +494,204 @@ DECLARE_FUNCTION(3, 1)
DECLARE_FUNCTION(3, 2)
DECLARE_FUNCTION(3, 3)
+static void vc1_inv_trans_4x4_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
+{
+ int dc = block[0];
+ dc = (17 * dc + 4) >> 3;
+ dc = (17 * dc + 64) >> 7;
+ __asm__ volatile(
+ "movd %0, %%mm0 \n\t"
+ "pshufw $0, %%mm0, %%mm0 \n\t"
+ "pxor %%mm1, %%mm1 \n\t"
+ "psubw %%mm0, %%mm1 \n\t"
+ "packuswb %%mm0, %%mm0 \n\t"
+ "packuswb %%mm1, %%mm1 \n\t"
+ ::"r"(dc)
+ );
+ __asm__ volatile(
+ "movd %0, %%mm2 \n\t"
+ "movd %1, %%mm3 \n\t"
+ "movd %2, %%mm4 \n\t"
+ "movd %3, %%mm5 \n\t"
+ "paddusb %%mm0, %%mm2 \n\t"
+ "paddusb %%mm0, %%mm3 \n\t"
+ "paddusb %%mm0, %%mm4 \n\t"
+ "paddusb %%mm0, %%mm5 \n\t"
+ "psubusb %%mm1, %%mm2 \n\t"
+ "psubusb %%mm1, %%mm3 \n\t"
+ "psubusb %%mm1, %%mm4 \n\t"
+ "psubusb %%mm1, %%mm5 \n\t"
+ "movd %%mm2, %0 \n\t"
+ "movd %%mm3, %1 \n\t"
+ "movd %%mm4, %2 \n\t"
+ "movd %%mm5, %3 \n\t"
+ :"+m"(*(uint32_t*)(dest+0*linesize)),
+ "+m"(*(uint32_t*)(dest+1*linesize)),
+ "+m"(*(uint32_t*)(dest+2*linesize)),
+ "+m"(*(uint32_t*)(dest+3*linesize))
+ );
+}
+
+static void vc1_inv_trans_4x8_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
+{
+ int dc = block[0];
+ dc = (17 * dc + 4) >> 3;
+ dc = (12 * dc + 64) >> 7;
+ __asm__ volatile(
+ "movd %0, %%mm0 \n\t"
+ "pshufw $0, %%mm0, %%mm0 \n\t"
+ "pxor %%mm1, %%mm1 \n\t"
+ "psubw %%mm0, %%mm1 \n\t"
+ "packuswb %%mm0, %%mm0 \n\t"
+ "packuswb %%mm1, %%mm1 \n\t"
+ ::"r"(dc)
+ );
+ __asm__ volatile(
+ "movd %0, %%mm2 \n\t"
+ "movd %1, %%mm3 \n\t"
+ "movd %2, %%mm4 \n\t"
+ "movd %3, %%mm5 \n\t"
+ "paddusb %%mm0, %%mm2 \n\t"
+ "paddusb %%mm0, %%mm3 \n\t"
+ "paddusb %%mm0, %%mm4 \n\t"
+ "paddusb %%mm0, %%mm5 \n\t"
+ "psubusb %%mm1, %%mm2 \n\t"
+ "psubusb %%mm1, %%mm3 \n\t"
+ "psubusb %%mm1, %%mm4 \n\t"
+ "psubusb %%mm1, %%mm5 \n\t"
+ "movd %%mm2, %0 \n\t"
+ "movd %%mm3, %1 \n\t"
+ "movd %%mm4, %2 \n\t"
+ "movd %%mm5, %3 \n\t"
+ :"+m"(*(uint32_t*)(dest+0*linesize)),
+ "+m"(*(uint32_t*)(dest+1*linesize)),
+ "+m"(*(uint32_t*)(dest+2*linesize)),
+ "+m"(*(uint32_t*)(dest+3*linesize))
+ );
+ dest += 4*linesize;
+ __asm__ volatile(
+ "movd %0, %%mm2 \n\t"
+ "movd %1, %%mm3 \n\t"
+ "movd %2, %%mm4 \n\t"
+ "movd %3, %%mm5 \n\t"
+ "paddusb %%mm0, %%mm2 \n\t"
+ "paddusb %%mm0, %%mm3 \n\t"
+ "paddusb %%mm0, %%mm4 \n\t"
+ "paddusb %%mm0, %%mm5 \n\t"
+ "psubusb %%mm1, %%mm2 \n\t"
+ "psubusb %%mm1, %%mm3 \n\t"
+ "psubusb %%mm1, %%mm4 \n\t"
+ "psubusb %%mm1, %%mm5 \n\t"
+ "movd %%mm2, %0 \n\t"
+ "movd %%mm3, %1 \n\t"
+ "movd %%mm4, %2 \n\t"
+ "movd %%mm5, %3 \n\t"
+ :"+m"(*(uint32_t*)(dest+0*linesize)),
+ "+m"(*(uint32_t*)(dest+1*linesize)),
+ "+m"(*(uint32_t*)(dest+2*linesize)),
+ "+m"(*(uint32_t*)(dest+3*linesize))
+ );
+}
+
+static void vc1_inv_trans_8x4_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
+{
+ int dc = block[0];
+ dc = ( 3 * dc + 1) >> 1;
+ dc = (17 * dc + 64) >> 7;
+ __asm__ volatile(
+ "movd %0, %%mm0 \n\t"
+ "pshufw $0, %%mm0, %%mm0 \n\t"
+ "pxor %%mm1, %%mm1 \n\t"
+ "psubw %%mm0, %%mm1 \n\t"
+ "packuswb %%mm0, %%mm0 \n\t"
+ "packuswb %%mm1, %%mm1 \n\t"
+ ::"r"(dc)
+ );
+ __asm__ volatile(
+ "movq %0, %%mm2 \n\t"
+ "movq %1, %%mm3 \n\t"
+ "movq %2, %%mm4 \n\t"
+ "movq %3, %%mm5 \n\t"
+ "paddusb %%mm0, %%mm2 \n\t"
+ "paddusb %%mm0, %%mm3 \n\t"
+ "paddusb %%mm0, %%mm4 \n\t"
+ "paddusb %%mm0, %%mm5 \n\t"
+ "psubusb %%mm1, %%mm2 \n\t"
+ "psubusb %%mm1, %%mm3 \n\t"
+ "psubusb %%mm1, %%mm4 \n\t"
+ "psubusb %%mm1, %%mm5 \n\t"
+ "movq %%mm2, %0 \n\t"
+ "movq %%mm3, %1 \n\t"
+ "movq %%mm4, %2 \n\t"
+ "movq %%mm5, %3 \n\t"
+ :"+m"(*(uint32_t*)(dest+0*linesize)),
+ "+m"(*(uint32_t*)(dest+1*linesize)),
+ "+m"(*(uint32_t*)(dest+2*linesize)),
+ "+m"(*(uint32_t*)(dest+3*linesize))
+ );
+}
+
+static void vc1_inv_trans_8x8_dc_mmx2(uint8_t *dest, int linesize, DCTELEM *block)
+{
+ int dc = block[0];
+ dc = (3 * dc + 1) >> 1;
+ dc = (3 * dc + 16) >> 5;
+ __asm__ volatile(
+ "movd %0, %%mm0 \n\t"
+ "pshufw $0, %%mm0, %%mm0 \n\t"
+ "pxor %%mm1, %%mm1 \n\t"
+ "psubw %%mm0, %%mm1 \n\t"
+ "packuswb %%mm0, %%mm0 \n\t"
+ "packuswb %%mm1, %%mm1 \n\t"
+ ::"r"(dc)
+ );
+ __asm__ volatile(
+ "movq %0, %%mm2 \n\t"
+ "movq %1, %%mm3 \n\t"
+ "movq %2, %%mm4 \n\t"
+ "movq %3, %%mm5 \n\t"
+ "paddusb %%mm0, %%mm2 \n\t"
+ "paddusb %%mm0, %%mm3 \n\t"
+ "paddusb %%mm0, %%mm4 \n\t"
+ "paddusb %%mm0, %%mm5 \n\t"
+ "psubusb %%mm1, %%mm2 \n\t"
+ "psubusb %%mm1, %%mm3 \n\t"
+ "psubusb %%mm1, %%mm4 \n\t"
+ "psubusb %%mm1, %%mm5 \n\t"
+ "movq %%mm2, %0 \n\t"
+ "movq %%mm3, %1 \n\t"
+ "movq %%mm4, %2 \n\t"
+ "movq %%mm5, %3 \n\t"
+ :"+m"(*(uint32_t*)(dest+0*linesize)),
+ "+m"(*(uint32_t*)(dest+1*linesize)),
+ "+m"(*(uint32_t*)(dest+2*linesize)),
+ "+m"(*(uint32_t*)(dest+3*linesize))
+ );
+ dest += 4*linesize;
+ __asm__ volatile(
+ "movq %0, %%mm2 \n\t"
+ "movq %1, %%mm3 \n\t"
+ "movq %2, %%mm4 \n\t"
+ "movq %3, %%mm5 \n\t"
+ "paddusb %%mm0, %%mm2 \n\t"
+ "paddusb %%mm0, %%mm3 \n\t"
+ "paddusb %%mm0, %%mm4 \n\t"
+ "paddusb %%mm0, %%mm5 \n\t"
+ "psubusb %%mm1, %%mm2 \n\t"
+ "psubusb %%mm1, %%mm3 \n\t"
+ "psubusb %%mm1, %%mm4 \n\t"
+ "psubusb %%mm1, %%mm5 \n\t"
+ "movq %%mm2, %0 \n\t"
+ "movq %%mm3, %1 \n\t"
+ "movq %%mm4, %2 \n\t"
+ "movq %%mm5, %3 \n\t"
+ :"+m"(*(uint32_t*)(dest+0*linesize)),
+ "+m"(*(uint32_t*)(dest+1*linesize)),
+ "+m"(*(uint32_t*)(dest+2*linesize)),
+ "+m"(*(uint32_t*)(dest+3*linesize))
+ );
+}
+
void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx) {
mm_flags = mm_support();
@@ -537,5 +735,10 @@ void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx) {
dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmx2;
dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmx2;
dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmx2;
+
+ dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmx2;
+ dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmx2;
+ dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmx2;
+ dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmx2;
}
}