summaryrefslogtreecommitdiff
path: root/libavcodec/h264idct.c
diff options
context:
space:
mode:
author: Jason Garrett-Glaser <darkshikari@gmail.com>, 2011-01-14 21:34:25 +0000
committer: Jason Garrett-Glaser <darkshikari@gmail.com>, 2011-01-14 21:34:25 +0000
commit: 19fb234e4af1ff9f58ff2fdd604ac6f6bb87ad6b (patch)
tree: 220be84d79d9c771c1afeab43fdd2aaa82fea01d /libavcodec/h264idct.c
parent: 6c18f1cda2e2b2471ebf75d30d552cb0cb61b6ad (diff)
H.264: split luma dc idct out and implement MMX/SSE2 versions
About 2.5x the speed.
NOTE: the way that the asm code handles large qmuls is a bit suboptimal. If x264-style dequant were used (separate shift and qmul values), it might be possible to get some extra speed.
Originally committed as revision 26336 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/h264idct.c')
-rw-r--r--libavcodec/h264idct.c35
1 files changed, 35 insertions, 0 deletions
diff --git a/libavcodec/h264idct.c b/libavcodec/h264idct.c
index 86c5ef2559..f5b05ac24f 100644
--- a/libavcodec/h264idct.c
+++ b/libavcodec/h264idct.c
@@ -216,3 +216,38 @@ void ff_h264_idct_add8_c(uint8_t **dest, const int *block_offset, DCTELEM *block
ff_h264_idct_dc_add_c(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
}
}
+/**
+ * Inverse-transform the 16 luma DC values and dequantize them.
+ *
+ * A 4x4 add/subtract butterfly is applied first along the rows and then
+ * along the columns of input; every result is multiplied by qmul, rounded
+ * (+128) and shifted right by 8 bits before being stored.
+ *
+ * @param output destination coefficients; the 16 results are written to
+ *               indices 0, 16, 32, ..., 240, so at least 241 elements must
+ *               be writable (presumably the DC slot of each 16-coefficient
+ *               luma block -- confirm against the caller)
+ * @param input  the 16 DC values, read as a row-major 4x4 matrix
+ * @param qmul   dequantization multiplier, in units of 1/256
+ */
+void ff_h264_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul){
+#define stride 16
+    int i;
+    int temp[16];
+    /* Base output index for each of the four columns of the second pass. */
+    static const uint8_t x_offset[4]={0, 2*stride, 8*stride, 10*stride};
+
+    /* First pass: butterflies along each row of input, kept at int width. */
+    for(i=0; i<4; i++){
+        const int z0= input[4*i+0] + input[4*i+1];
+        const int z1= input[4*i+0] - input[4*i+1];
+        const int z2= input[4*i+2] - input[4*i+3];
+        const int z3= input[4*i+2] + input[4*i+3];
+
+        temp[4*i+0]= z0+z3;
+        temp[4*i+1]= z0-z3;
+        temp[4*i+2]= z1-z2;
+        temp[4*i+3]= z1+z2;
+    }
+
+    /* Second pass: butterflies along each column of temp, then dequantize
+     * and scatter the four results of the column into output. */
+    for(i=0; i<4; i++){
+        const int offset= x_offset[i];
+        const int z0= temp[4*0+i] + temp[4*2+i];
+        const int z1= temp[4*0+i] - temp[4*2+i];
+        const int z2= temp[4*1+i] - temp[4*3+i];
+        const int z3= temp[4*1+i] + temp[4*3+i];
+
+        output[stride* 0+offset]= ((((z0 + z3)*qmul + 128 ) >> 8));
+        output[stride* 1+offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
+        output[stride* 4+offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
+        output[stride* 5+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
+    }
+/* Keep the helper macro local: without this, any later use of the
+ * identifier "stride" in this translation unit would expand to 16. */
+#undef stride
+}