summaryrefslogtreecommitdiff
path: root/libavcodec/i386
diff options
context:
space:
mode:
authorLoren Merritt <lorenm@u.washington.edu>2006-08-28 09:33:01 +0000
committerLoren Merritt <lorenm@u.washington.edu>2006-08-28 09:33:01 +0000
commit3e20143ee7b2e3b9b11eaaa4bda971281ae0efbf (patch)
tree4c991508d67a3b40fb302a0ffb842c7bb32fc0c6 /libavcodec/i386
parent001299bfe8142740d4260d5443fa91eb951dbdcb (diff)
mmx implementation of deblocking strength decision.
2-3% faster h264. Originally committed as revision 6113 to svn://svn.ffmpeg.org/ffmpeg/trunk
Diffstat (limited to 'libavcodec/i386')
-rw-r--r--libavcodec/i386/dsputil_mmx.c4
-rw-r--r--libavcodec/i386/h264dsp_mmx.c95
2 files changed, 99 insertions, 0 deletions
diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c
index cf67ca4dda..9e6c88721c 100644
--- a/libavcodec/i386/dsputil_mmx.c
+++ b/libavcodec/i386/dsputil_mmx.c
@@ -53,6 +53,9 @@ static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0
static const uint64_t ff_pw_64 attribute_used __attribute__ ((aligned(8))) = 0x0040004000400040ULL;
static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
+static const uint64_t ff_pb_1 attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
+static const uint64_t ff_pb_3 attribute_used __attribute__ ((aligned(8))) = 0x0303030303030303ULL;
+static const uint64_t ff_pb_7 attribute_used __attribute__ ((aligned(8))) = 0x0707070707070707ULL;
static const uint64_t ff_pb_3F attribute_used __attribute__ ((aligned(8))) = 0x3F3F3F3F3F3F3F3FULL;
static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;
@@ -3282,6 +3285,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
+ c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
diff --git a/libavcodec/i386/h264dsp_mmx.c b/libavcodec/i386/h264dsp_mmx.c
index ac4ad64016..83ee362137 100644
--- a/libavcodec/i386/h264dsp_mmx.c
+++ b/libavcodec/i386/h264dsp_mmx.c
@@ -561,6 +561,101 @@ static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int a
transpose4x4(pix-2+4*stride, trans+4, stride, 8);
}
+static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
+ int bidir, int edges, int step, int mask_mv0, int mask_mv1 ) {
+ int dir;
+ asm volatile(
+ "pxor %%mm7, %%mm7 \n\t"
+ "movq %0, %%mm6 \n\t"
+ "movq %1, %%mm5 \n\t"
+ "movq %2, %%mm4 \n\t"
+ ::"m"(ff_pb_1), "m"(ff_pb_3), "m"(ff_pb_7)
+ );
+ // could do a special case for dir==0 && edges==1, but it only reduces the
+ // average filter time by 1.2%
+ for( dir=1; dir>=0; dir-- ) {
+ const int d_idx = dir ? -8 : -1;
+ const int mask_mv = dir ? mask_mv1 : mask_mv0;
+ const uint64_t mask_dir = dir ? 0 : 0xffffffffffffffffULL;
+ int b_idx, edge, l;
+ for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) {
+ asm volatile(
+ "pand %0, %%mm0 \n\t"
+ ::"m"(mask_dir)
+ );
+ if(!(mask_mv & edge)) {
+ asm volatile("pxor %%mm0, %%mm0 \n\t":);
+ for( l = bidir; l >= 0; l-- ) {
+ asm volatile(
+ "movd %0, %%mm1 \n\t"
+ "punpckldq %1, %%mm1 \n\t"
+ "movq %%mm1, %%mm2 \n\t"
+ "psrlw $7, %%mm2 \n\t"
+ "pand %%mm6, %%mm2 \n\t"
+ "por %%mm2, %%mm1 \n\t" // ref_cache with -2 mapped to -1
+ "punpckldq %%mm1, %%mm2 \n\t"
+ "pcmpeqb %%mm2, %%mm1 \n\t"
+ "paddb %%mm6, %%mm1 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t" // ref[b] != ref[bn]
+ "por %%mm1, %%mm0 \n\t"
+
+ "movq %2, %%mm1 \n\t"
+ "movq %3, %%mm2 \n\t"
+ "psubw %4, %%mm1 \n\t"
+ "psubw %5, %%mm2 \n\t"
+ "packsswb %%mm2, %%mm1 \n\t"
+ "paddb %%mm5, %%mm1 \n\t"
+ "pminub %%mm4, %%mm1 \n\t"
+ "pcmpeqb %%mm4, %%mm1 \n\t" // abs(mv[b] - mv[bn]) >= limit
+ "por %%mm1, %%mm0 \n\t"
+ ::"m"(ref[l][b_idx]),
+ "m"(ref[l][b_idx+d_idx]),
+ "m"(mv[l][b_idx][0]),
+ "m"(mv[l][b_idx+2][0]),
+ "m"(mv[l][b_idx+d_idx][0]),
+ "m"(mv[l][b_idx+d_idx+2][0])
+ );
+ }
+ }
+ asm volatile(
+ "movd %0, %%mm1 \n\t"
+ "por %1, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "pcmpgtw %%mm7, %%mm1 \n\t" // nnz[b] || nnz[bn]
+ ::"m"(nnz[b_idx]),
+ "m"(nnz[b_idx+d_idx])
+ );
+ asm volatile(
+ "pcmpeqw %%mm7, %%mm0 \n\t"
+ "pcmpeqw %%mm7, %%mm0 \n\t"
+ "psrlw $15, %%mm0 \n\t" // nonzero -> 1
+ "psrlw $14, %%mm1 \n\t"
+ "movq %%mm0, %%mm2 \n\t"
+ "por %%mm1, %%mm2 \n\t"
+ "psrlw $1, %%mm1 \n\t"
+ "pandn %%mm2, %%mm1 \n\t"
+ "movq %%mm1, %0 \n\t"
+ :"=m"(*bS[dir][edge])
+ ::"memory"
+ );
+ }
+ edges = 4;
+ step = 1;
+ }
+ asm volatile(
+ "movq (%0), %%mm0 \n\t"
+ "movq 8(%0), %%mm1 \n\t"
+ "movq 16(%0), %%mm2 \n\t"
+ "movq 24(%0), %%mm3 \n\t"
+ TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4)
+ "movq %%mm0, (%0) \n\t"
+ "movq %%mm3, 8(%0) \n\t"
+ "movq %%mm4, 16(%0) \n\t"
+ "movq %%mm2, 24(%0) \n\t"
+ ::"r"(bS[0])
+ :"memory"
+ );
+}
/***********************************/
/* motion compensation */