cgit diff view — libavcodec/x86: convert the H.263 loop filter from inline asm
(dsputil_mmx.c) to external yasm/NASM implementations (dsputil.asm).
path: root/libavcodec/x86
Diffstat (limited to 'libavcodec/x86')
-rw-r--r-- libavcodec/x86/dsputil.asm    | 163
-rw-r--r-- libavcodec/x86/dsputil_mmx.c  | 185
2 files changed, 171 insertions, 177 deletions
diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index 8002779a2e..9647c558c8 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -22,6 +22,8 @@
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
+cextern pb_FC
+cextern h263_loop_filter_strength
pb_f: times 16 db 15
pb_zzzzzzzz77777777: times 8 db -1
pb_7: times 8 db 7
@@ -648,3 +650,164 @@ BSWAP32_BUF
INIT_XMM ssse3
BSWAP32_BUF
+
+
+%macro H263_LOOP_FILTER 5
+ pxor m7, m7
+ mova m0, [%1]
+ mova m1, [%1]
+ mova m2, [%4]
+ mova m3, [%4]
+ punpcklbw m0, m7
+ punpckhbw m1, m7
+ punpcklbw m2, m7
+ punpckhbw m3, m7
+ psubw m0, m2
+ psubw m1, m3
+ mova m2, [%2]
+ mova m3, [%2]
+ mova m4, [%3]
+ mova m5, [%3]
+ punpcklbw m2, m7
+ punpckhbw m3, m7
+ punpcklbw m4, m7
+ punpckhbw m5, m7
+ psubw m4, m2
+ psubw m5, m3
+ psllw m4, 2
+ psllw m5, 2
+ paddw m4, m0
+ paddw m5, m1
+ pxor m6, m6
+ pcmpgtw m6, m4
+ pcmpgtw m7, m5
+ pxor m4, m6
+ pxor m5, m7
+ psubw m4, m6
+ psubw m5, m7
+ psrlw m4, 3
+ psrlw m5, 3
+ packuswb m4, m5
+ packsswb m6, m7
+ pxor m7, m7
+ movd m2, %5
+ punpcklbw m2, m2
+ punpcklbw m2, m2
+ punpcklbw m2, m2
+ psubusb m2, m4
+ mova m3, m2
+ psubusb m3, m4
+ psubb m2, m3
+ mova m3, [%2]
+ mova m4, [%3]
+ pxor m3, m6
+ pxor m4, m6
+ paddusb m3, m2
+ psubusb m4, m2
+ pxor m3, m6
+ pxor m4, m6
+ paddusb m2, m2
+ packsswb m0, m1
+ pcmpgtb m7, m0
+ pxor m0, m7
+ psubb m0, m7
+ mova m1, m0
+ psubusb m0, m2
+ psubb m1, m0
+ pand m1, [pb_FC]
+ psrlw m1, 2
+ pxor m1, m7
+ psubb m1, m7
+ mova m5, [%1]
+ mova m6, [%4]
+ psubb m5, m1
+ paddb m6, m1
+%endmacro
+
+INIT_MMX mmx
+; void h263_v_loop_filter(uint8_t *src, int stride, int qscale)
+cglobal h263_v_loop_filter, 3,5
+ movsxdifnidn r1, r1d
+ movsxdifnidn r2, r2d
+
+ lea r4, [h263_loop_filter_strength]
+ movzx r3d, BYTE [r4+r2]
+ movsx r2, r3b
+ shl r2, 1
+
+ mov r3, r0
+ sub r3, r1
+ mov r4, r3
+ sub r4, r1
+ H263_LOOP_FILTER r4, r3, r0, r0+r1, r2d
+
+ mova [r3], m3
+ mova [r0], m4
+ mova [r4], m5
+ mova [r0+r1], m6
+ RET
+
+%macro TRANSPOSE4X4 2
+ movd m0, [%1]
+ movd m1, [%1+r1]
+ movd m2, [%1+r1*2]
+ movd m3, [%1+r3]
+ punpcklbw m0, m1
+ punpcklbw m2, m3
+ mova m1, m0
+ punpcklwd m0, m2
+ punpckhwd m1, m2
+ movd [%2+ 0], m0
+ punpckhdq m0, m0
+ movd [%2+ 8], m0
+ movd [%2+16], m1
+ punpckhdq m1, m1
+ movd [%2+24], m1
+%endmacro
+
+
+; void h263_h_loop_filter(uint8_t *src, int stride, int qscale)
+INIT_MMX mmx
+cglobal h263_h_loop_filter, 3,5,0,32
+ movsxdifnidn r1, r1d
+ movsxdifnidn r2, r2d
+
+ lea r4, [h263_loop_filter_strength]
+ movzx r3d, BYTE [r4+r2]
+ movsx r2, r3b
+ shl r2, 1
+
+ sub r0, 2
+ lea r3, [r1*3]
+
+ TRANSPOSE4X4 r0, rsp
+ lea r4, [r0+r1*4]
+ TRANSPOSE4X4 r4, rsp+4
+
+ H263_LOOP_FILTER rsp, rsp+8, rsp+16, rsp+24, r2d
+
+ mova m1, m5
+ mova m0, m4
+ punpcklbw m5, m3
+ punpcklbw m4, m6
+ punpckhbw m1, m3
+ punpckhbw m0, m6
+ mova m3, m5
+ mova m6, m1
+ punpcklwd m5, m4
+ punpcklwd m1, m0
+ punpckhwd m3, m4
+ punpckhwd m6, m0
+ movd [r0], m5
+ punpckhdq m5, m5
+ movd [r0+r1*1], m5
+ movd [r0+r1*2], m3
+ punpckhdq m3, m3
+ movd [r0+r3], m3
+ movd [r4], m1
+ punpckhdq m1, m1
+ movd [r4+r1*1], m1
+ movd [r4+r1*2], m6
+ punpckhdq m6, m6
+ movd [r4+r3], m6
+ RET
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 39383863af..c011a21d5b 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -651,181 +651,12 @@ static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
*left_top = tl;
}
#endif
+#endif /* HAVE_INLINE_ASM */
-static inline void transpose4x4(uint8_t *dst, uint8_t *src, x86_reg dst_stride, x86_reg src_stride){
- __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
- "movd (%1), %%mm0 \n\t"
- "add %3, %1 \n\t"
- "movd (%1), %%mm1 \n\t"
- "movd (%1,%3,1), %%mm2 \n\t"
- "movd (%1,%3,2), %%mm3 \n\t"
- "punpcklbw %%mm1, %%mm0 \n\t"
- "punpcklbw %%mm3, %%mm2 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "punpcklwd %%mm2, %%mm0 \n\t"
- "punpckhwd %%mm2, %%mm1 \n\t"
- "movd %%mm0, (%0) \n\t"
- "add %2, %0 \n\t"
- "punpckhdq %%mm0, %%mm0 \n\t"
- "movd %%mm0, (%0) \n\t"
- "movd %%mm1, (%0,%2,1) \n\t"
- "punpckhdq %%mm1, %%mm1 \n\t"
- "movd %%mm1, (%0,%2,2) \n\t"
-
- : "+&r" (dst),
- "+&r" (src)
- : "r" (dst_stride),
- "r" (src_stride)
- : "memory"
- );
-}
-
-#define H263_LOOP_FILTER \
- "pxor %%mm7, %%mm7 \n\t" \
- "movq %0, %%mm0 \n\t" \
- "movq %0, %%mm1 \n\t" \
- "movq %3, %%mm2 \n\t" \
- "movq %3, %%mm3 \n\t" \
- "punpcklbw %%mm7, %%mm0 \n\t" \
- "punpckhbw %%mm7, %%mm1 \n\t" \
- "punpcklbw %%mm7, %%mm2 \n\t" \
- "punpckhbw %%mm7, %%mm3 \n\t" \
- "psubw %%mm2, %%mm0 \n\t" \
- "psubw %%mm3, %%mm1 \n\t" \
- "movq %1, %%mm2 \n\t" \
- "movq %1, %%mm3 \n\t" \
- "movq %2, %%mm4 \n\t" \
- "movq %2, %%mm5 \n\t" \
- "punpcklbw %%mm7, %%mm2 \n\t" \
- "punpckhbw %%mm7, %%mm3 \n\t" \
- "punpcklbw %%mm7, %%mm4 \n\t" \
- "punpckhbw %%mm7, %%mm5 \n\t" \
- "psubw %%mm2, %%mm4 \n\t" \
- "psubw %%mm3, %%mm5 \n\t" \
- "psllw $2, %%mm4 \n\t" \
- "psllw $2, %%mm5 \n\t" \
- "paddw %%mm0, %%mm4 \n\t" \
- "paddw %%mm1, %%mm5 \n\t" \
- "pxor %%mm6, %%mm6 \n\t" \
- "pcmpgtw %%mm4, %%mm6 \n\t" \
- "pcmpgtw %%mm5, %%mm7 \n\t" \
- "pxor %%mm6, %%mm4 \n\t" \
- "pxor %%mm7, %%mm5 \n\t" \
- "psubw %%mm6, %%mm4 \n\t" \
- "psubw %%mm7, %%mm5 \n\t" \
- "psrlw $3, %%mm4 \n\t" \
- "psrlw $3, %%mm5 \n\t" \
- "packuswb %%mm5, %%mm4 \n\t" \
- "packsswb %%mm7, %%mm6 \n\t" \
- "pxor %%mm7, %%mm7 \n\t" \
- "movd %4, %%mm2 \n\t" \
- "punpcklbw %%mm2, %%mm2 \n\t" \
- "punpcklbw %%mm2, %%mm2 \n\t" \
- "punpcklbw %%mm2, %%mm2 \n\t" \
- "psubusb %%mm4, %%mm2 \n\t" \
- "movq %%mm2, %%mm3 \n\t" \
- "psubusb %%mm4, %%mm3 \n\t" \
- "psubb %%mm3, %%mm2 \n\t" \
- "movq %1, %%mm3 \n\t" \
- "movq %2, %%mm4 \n\t" \
- "pxor %%mm6, %%mm3 \n\t" \
- "pxor %%mm6, %%mm4 \n\t" \
- "paddusb %%mm2, %%mm3 \n\t" \
- "psubusb %%mm2, %%mm4 \n\t" \
- "pxor %%mm6, %%mm3 \n\t" \
- "pxor %%mm6, %%mm4 \n\t" \
- "paddusb %%mm2, %%mm2 \n\t" \
- "packsswb %%mm1, %%mm0 \n\t" \
- "pcmpgtb %%mm0, %%mm7 \n\t" \
- "pxor %%mm7, %%mm0 \n\t" \
- "psubb %%mm7, %%mm0 \n\t" \
- "movq %%mm0, %%mm1 \n\t" \
- "psubusb %%mm2, %%mm0 \n\t" \
- "psubb %%mm0, %%mm1 \n\t" \
- "pand %5, %%mm1 \n\t" \
- "psrlw $2, %%mm1 \n\t" \
- "pxor %%mm7, %%mm1 \n\t" \
- "psubb %%mm7, %%mm1 \n\t" \
- "movq %0, %%mm5 \n\t" \
- "movq %3, %%mm6 \n\t" \
- "psubb %%mm1, %%mm5 \n\t" \
- "paddb %%mm1, %%mm6 \n\t"
-
-static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
-{
- if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
- const int strength = ff_h263_loop_filter_strength[qscale];
-
- __asm__ volatile (
- H263_LOOP_FILTER
-
- "movq %%mm3, %1 \n\t"
- "movq %%mm4, %2 \n\t"
- "movq %%mm5, %0 \n\t"
- "movq %%mm6, %3 \n\t"
- : "+m"(*(uint64_t*)(src - 2 * stride)),
- "+m"(*(uint64_t*)(src - 1 * stride)),
- "+m"(*(uint64_t*)(src + 0 * stride)),
- "+m"(*(uint64_t*)(src + 1 * stride))
- : "g"(2 * strength), "m"(ff_pb_FC)
- );
- }
-}
-
-static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
-{
- if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
- const int strength = ff_h263_loop_filter_strength[qscale];
- DECLARE_ALIGNED(8, uint64_t, temp)[4];
- uint8_t *btemp = (uint8_t*)temp;
-
- src -= 2;
-
- transpose4x4(btemp, src, 8, stride);
- transpose4x4(btemp + 4, src + 4 * stride, 8, stride);
- __asm__ volatile (
- H263_LOOP_FILTER // 5 3 4 6
-
- : "+m"(temp[0]),
- "+m"(temp[1]),
- "+m"(temp[2]),
- "+m"(temp[3])
- : "g"(2 * strength), "m"(ff_pb_FC)
- );
-
- __asm__ volatile (
- "movq %%mm5, %%mm1 \n\t"
- "movq %%mm4, %%mm0 \n\t"
- "punpcklbw %%mm3, %%mm5 \n\t"
- "punpcklbw %%mm6, %%mm4 \n\t"
- "punpckhbw %%mm3, %%mm1 \n\t"
- "punpckhbw %%mm6, %%mm0 \n\t"
- "movq %%mm5, %%mm3 \n\t"
- "movq %%mm1, %%mm6 \n\t"
- "punpcklwd %%mm4, %%mm5 \n\t"
- "punpcklwd %%mm0, %%mm1 \n\t"
- "punpckhwd %%mm4, %%mm3 \n\t"
- "punpckhwd %%mm0, %%mm6 \n\t"
- "movd %%mm5, (%0) \n\t"
- "punpckhdq %%mm5, %%mm5 \n\t"
- "movd %%mm5, (%0, %2) \n\t"
- "movd %%mm3, (%0, %2, 2) \n\t"
- "punpckhdq %%mm3, %%mm3 \n\t"
- "movd %%mm3, (%0, %3) \n\t"
- "movd %%mm1, (%1) \n\t"
- "punpckhdq %%mm1, %%mm1 \n\t"
- "movd %%mm1, (%1, %2) \n\t"
- "movd %%mm6, (%1, %2, 2) \n\t"
- "punpckhdq %%mm6, %%mm6 \n\t"
- "movd %%mm6, (%1, %3) \n\t"
- :: "r"(src),
- "r"(src + 4 * stride),
- "r"((x86_reg)stride),
- "r"((x86_reg)(3 * stride))
- );
- }
-}
+void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);
+void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);
+#if HAVE_INLINE_ASM
/* Draw the edges of width 'w' of an image of size width, height
* this MMX version can only handle w == 8 || w == 16. */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
@@ -1653,14 +1484,14 @@ static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
c->gmc = gmc_mmx;
c->add_bytes = add_bytes_mmx;
+#endif /* HAVE_INLINE_ASM */
+#if HAVE_YASM
if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
- c->h263_v_loop_filter = h263_v_loop_filter_mmx;
- c->h263_h_loop_filter = h263_h_loop_filter_mmx;
+ c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
+ c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
}
-#endif /* HAVE_INLINE_ASM */
-#if HAVE_YASM
c->vector_clip_int32 = ff_vector_clip_int32_mmx;
#endif