summaryrefslogtreecommitdiff
path: root/libavcodec/i386
diff options
context:
space:
mode:
authorLoren Merritt <lorenm@u.washington.edu>2006-02-07 06:25:14 +0000
committerLoren Merritt <lorenm@u.washington.edu>2006-02-07 06:25:14 +0000
commit0331f092379bba24f6117497c62197818330dc15 (patch)
tree2c3df8a586e426a7a9a5d603780c2e13f1dfea2c /libavcodec/i386
parent1a44a8b65454a147854f150a64d7c355c3e94a6b (diff)
15% faster put_h264_qpel16_hv_lowpass_mmx2
Originally committed as revision 4949 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/i386')
-rw-r--r--libavcodec/i386/h264dsp_mmx.c71
1 files changed, 45 insertions, 26 deletions
diff --git a/libavcodec/i386/h264dsp_mmx.c b/libavcodec/i386/h264dsp_mmx.c
index 8ab58f389e..19fb508b93 100644
--- a/libavcodec/i386/h264dsp_mmx.c
+++ b/libavcodec/i386/h264dsp_mmx.c
@@ -630,9 +630,9 @@ static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, i
dst += 4-8*dstStride;\
}\
}\
-static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
- int h=8;\
- int w=4;\
+static inline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
+ int h = size;\
+ int w = (size+8)>>2;\
src -= 2*srcStride+2;\
while(w--){\
asm volatile(\
@@ -652,23 +652,40 @@ static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp,
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
- QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*4)\
- QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*4)\
- QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*4)\
- QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*4)\
- QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*8*4)\
- QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*8*4)\
- QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*8*4)\
- QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*8*4)\
- \
+ QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\
+ QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\
+ QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\
+ QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\
+ QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\
+ QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\
+ QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
+ QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
: "+a"(src)\
: "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
: "memory"\
);\
+ if(size==16){\
+ asm volatile(\
+ QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\
+ QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\
+ QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
+ QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
+ QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
+ QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\
+ QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
+ QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
+ : "+a"(src)\
+ : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
+ : "memory"\
+ );\
+ }\
tmp += 4;\
- src += 4 - 13*srcStride;\
+ src += 4 - (size+5)*srcStride;\
}\
- tmp -= 4*4;\
+ tmp -= size+8;\
+ w = size>>4;\
+ do{\
+ h = size;\
asm volatile(\
"movq %4, %%mm6 \n\t"\
"1: \n\t"\
@@ -702,7 +719,7 @@ static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp,
"psraw $6, %%mm3 \n\t"\
"packuswb %%mm3, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm7, q)\
- "add $32, %0 \n\t"\
+ "add $48, %0 \n\t"\
"add %3, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
@@ -710,6 +727,9 @@ static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp,
: "S"((long)dstStride), "m"(ff_pw_32)\
: "memory"\
);\
+ tmp += 8 - size*24;\
+ dst += 8 - size*dstStride;\
+ }while(w--);\
}\
static void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
@@ -729,13 +749,12 @@ static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src,
OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
\
+static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+ OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
+}\
+\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
- OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride);\
- OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst+8, tmp , src+8, dstStride, tmpStride, srcStride);\
- src += 8*srcStride;\
- dst += 8*dstStride;\
- OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride);\
- OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst+8, tmp , src+8, dstStride, tmpStride, srcStride);\
+ OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
}\
#define H264_MC(OPNAME, SIZE, MMX) \
@@ -816,13 +835,13 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- uint64_t temp[SIZE*(SIZE+8)/4];\
+ uint64_t temp[SIZE*(SIZE<8?12:24)/4];\
int16_t * const tmp= (int16_t*)temp;\
OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\
+ uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/4];\
uint8_t * const halfH= (uint8_t*)temp;\
uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
@@ -832,7 +851,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\
+ uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/4];\
uint8_t * const halfH= (uint8_t*)temp;\
uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
@@ -842,7 +861,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\
+ uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/4];\
uint8_t * const halfV= (uint8_t*)temp;\
uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
@@ -852,7 +871,7 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
- uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\
+ uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/4];\
uint8_t * const halfV= (uint8_t*)temp;\
uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\