From 708ab7dd69d5c98221882a8086f68f1bb02a44a3 Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Sun, 9 Oct 2011 19:12:09 -0400
Subject: fmtconvert: port float_to_int16() x86 inline asm to yasm

---
 libavcodec/x86/fmtconvert.asm | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

(limited to 'libavcodec/x86/fmtconvert.asm')

diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index efab87d570..d314a4e14e 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -24,6 +24,48 @@
 
 SECTION_TEXT
 
+;------------------------------------------------------------------------------
+; void ff_float_to_int16(int16_t *dst, const float *src, long len);
+;------------------------------------------------------------------------------
+%macro FLOAT_TO_INT16 2
+cglobal float_to_int16_%1, 3,3,%2, dst, src, len
+    add       lenq, lenq
+    lea       srcq, [srcq+2*lenq]
+    add       dstq, lenq
+    neg       lenq
+.loop:
+%ifidn %1, sse2
+    cvtps2dq    m0, [srcq+2*lenq   ]
+    cvtps2dq    m1, [srcq+2*lenq+16]
+    packssdw    m0, m1
+    mova  [dstq+lenq], m0
+%else
+    cvtps2pi    m0, [srcq+2*lenq   ]
+    cvtps2pi    m1, [srcq+2*lenq+ 8]
+    cvtps2pi    m2, [srcq+2*lenq+16]
+    cvtps2pi    m3, [srcq+2*lenq+24]
+    packssdw    m0, m1
+    packssdw    m2, m3
+    mova  [dstq+lenq  ], m0
+    mova  [dstq+lenq+8], m2
+%endif
+    add       lenq, 16
+    js .loop
+%ifnidn %1, sse2
+    emms
+%endif
+    REP_RET
+%endmacro
+
+INIT_XMM
+FLOAT_TO_INT16 sse2, 2
+INIT_MMX
+FLOAT_TO_INT16 sse, 0
+%define cvtps2pi pf2id
+FLOAT_TO_INT16 3dnow, 0
+%undef cvtps2pi
+
+
 %macro PSWAPD_SSE 2
     pshufw %1, %2, 0x4e
 %endmacro
-- 
cgit v1.2.3


From 4e8e2624767f4af0eaa932c543d072fed96fd586 Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Sun, 9 Oct 2011 23:52:03 -0400
Subject: fmtconvert: port int32_to_float_fmul_scalar() x86 inline asm to yasm

---
 libavcodec/x86/dsputil_yasm.asm |  8 ------
 libavcodec/x86/fmtconvert.asm   | 46 ++++++++++++++++++++++++++++++++
 libavcodec/x86/fmtconvert_mmx.c | 59 +++++------------------------------------
 libavutil/x86/x86util.asm       | 12 +++++++++
 4 files changed, 65 insertions(+), 60 deletions(-)

(limited to 'libavcodec/x86/fmtconvert.asm')

diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 2a2108404a..fe96d8b12b 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -1055,14 +1055,6 @@ emu_edge mmx
 ;                           int32_t max, unsigned int len)
 ;-----------------------------------------------------------------------------
 
-%macro SPLATD_MMX 1
-    punpckldq  %1, %1
-%endmacro
-
-%macro SPLATD_SSE2 1
-    pshufd  %1, %1, 0
-%endmacro
-
 %macro VECTOR_CLIP_INT32 4
 cglobal vector_clip_int32_%1, 5,5,%2, dst, src, min, max, len
 %ifidn %1, sse2
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index d314a4e14e..e3eb5d2286 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -24,6 +24,52 @@
 
 SECTION_TEXT
 
+;---------------------------------------------------------------------------------
+; void int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len);
+;---------------------------------------------------------------------------------
+%macro INT32_TO_FLOAT_FMUL_SCALAR 2
+%ifdef ARCH_X86_64
+cglobal int32_to_float_fmul_scalar_%1, 3,3,%2, dst, src, len
+%else
+cglobal int32_to_float_fmul_scalar_%1, 4,4,%2, dst, src, mul, len
+    movss   m0, mulm
+%endif
+    SPLATD  m0
+    shl     lenq, 2
+    add     srcq, lenq
+    add     dstq, lenq
+    neg     lenq
+.loop:
+%ifidn %1, sse2
+    cvtdq2ps  m1, [srcq+lenq   ]
+    cvtdq2ps  m2, [srcq+lenq+16]
+%else
+    cvtpi2ps  m1, [srcq+lenq   ]
+    cvtpi2ps  m3, [srcq+lenq+ 8]
+    cvtpi2ps  m2, [srcq+lenq+16]
+    cvtpi2ps  m4, [srcq+lenq+24]
+    movlhps   m1, m3
+    movlhps   m2, m4
+%endif
+    mulps     m1, m0
+    mulps     m2, m0
+    mova  [dstq+lenq   ], m1
+    mova  [dstq+lenq+16], m2
+    add     lenq, 32
+    jl .loop
+    REP_RET
+%endmacro
+
+INIT_XMM
+%define SPLATD SPLATD_SSE
+%define movdqa movaps
+INT32_TO_FLOAT_FMUL_SCALAR sse, 5
+%undef movdqa
+%define SPLATD SPLATD_SSE2
+INT32_TO_FLOAT_FMUL_SCALAR sse2, 3
+%undef SPLATD
+
+
 ;------------------------------------------------------------------------------
 ; void ff_float_to_int16(int16_t *dst, const float *src, long len);
 ;------------------------------------------------------------------------------
diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c
index 6e43280d66..86957b45ff 100644
--- a/libavcodec/x86/fmtconvert_mmx.c
+++ b/libavcodec/x86/fmtconvert_mmx.c
@@ -26,52 +26,11 @@
 #include "libavutil/x86_cpu.h"
 #include "libavcodec/fmtconvert.h"
 
-static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
-{
-    x86_reg i = -4*len;
-    __asm__ volatile(
-        "movss  %3, %%xmm4 \n"
-        "shufps $0, %%xmm4, %%xmm4 \n"
-        "1: \n"
-        "cvtpi2ps   (%2,%0), %%xmm0 \n"
-        "cvtpi2ps  8(%2,%0), %%xmm1 \n"
-        "cvtpi2ps 16(%2,%0), %%xmm2 \n"
-        "cvtpi2ps 24(%2,%0), %%xmm3 \n"
-        "movlhps  %%xmm1,    %%xmm0 \n"
-        "movlhps  %%xmm3,    %%xmm2 \n"
-        "mulps    %%xmm4,    %%xmm0 \n"
-        "mulps    %%xmm4,    %%xmm2 \n"
-        "movaps   %%xmm0,   (%1,%0) \n"
-        "movaps   %%xmm2, 16(%1,%0) \n"
-        "add $32, %0 \n"
-        "jl 1b \n"
-        :"+r"(i)
-        :"r"(dst+len), "r"(src+len), "m"(mul)
-    );
-}
-
-static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
-{
-    x86_reg i = -4*len;
-    __asm__ volatile(
-        "movss  %3, %%xmm4 \n"
-        "shufps $0, %%xmm4, %%xmm4 \n"
-        "1: \n"
-        "cvtdq2ps   (%2,%0), %%xmm0 \n"
-        "cvtdq2ps 16(%2,%0), %%xmm1 \n"
-        "mulps    %%xmm4,    %%xmm0 \n"
-        "mulps    %%xmm4,    %%xmm1 \n"
-        "movaps   %%xmm0,   (%1,%0) \n"
-        "movaps   %%xmm1, 16(%1,%0) \n"
-        "add $32, %0 \n"
-        "jl 1b \n"
-        :"+r"(i)
-        :"r"(dst+len), "r"(src+len), "m"(mul)
-    );
-}
-
 #if HAVE_YASM
 
+void ff_int32_to_float_fmul_scalar_sse (float *dst, const int *src, float mul, int len);
+void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len);
+
 void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len);
 void ff_float_to_int16_sse  (int16_t *dst, const float *src, long len);
 void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len);
@@ -204,8 +163,8 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
 {
     int mm_flags = av_get_cpu_flags();
 
-    if (mm_flags & AV_CPU_FLAG_MMX) {
 #if HAVE_YASM
+    if (mm_flags & AV_CPU_FLAG_MMX) {
         c->float_interleave = float_interleave_mmx;
 
         if (HAVE_AMD3DNOW && mm_flags & AV_CPU_FLAG_3DNOW) {
@@ -219,21 +178,17 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
                 c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
             }
         }
-#endif
         if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE) {
-            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
-#if HAVE_YASM
+            c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse;
             c->float_to_int16 = ff_float_to_int16_sse;
             c->float_to_int16_interleave = float_to_int16_interleave_sse;
             c->float_interleave = float_interleave_sse;
-#endif
         }
         if (HAVE_SSE && mm_flags & AV_CPU_FLAG_SSE2) {
-            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
-#if HAVE_YASM
+            c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2;
             c->float_to_int16 = ff_float_to_int16_sse2;
             c->float_to_int16_interleave = float_to_int16_interleave_sse2;
-#endif
         }
     }
+#endif
 }
diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
index 7e16c15db2..874443a2ef 100644
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -536,6 +536,18 @@
 %endif
 %endmacro
 
+%macro SPLATD_MMX 1
+    punpckldq  %1, %1
+%endmacro
+
+%macro SPLATD_SSE 1
+    shufps  %1, %1, 0
+%endmacro
+
+%macro SPLATD_SSE2 1
+    pshufd  %1, %1, 0
+%endmacro
+
 %macro CLIPW 3 ;(dst, min, max)
     pmaxsw %1, %2
     pminsw %1, %3
-- 
cgit v1.2.3


From aad3429d4e34b74e4eb0b37b17f32804e217cf02 Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Mon, 10 Oct 2011 00:43:08 -0400
Subject: fmtconvert: port float_to_int16_interleave() 2-channel x86 inline asm
 to yasm

---
 libavcodec/x86/fmtconvert.asm   | 52 +++++++++++++++++++++++++++++++
 libavcodec/x86/fmtconvert_mmx.c | 69 ++++++-----------------------------------
 2 files changed, 61 insertions(+), 60 deletions(-)

(limited to 'libavcodec/x86/fmtconvert.asm')

diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index e3eb5d2286..854954835c 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -112,6 +112,58 @@ FLOAT_TO_INT16 3dnow, 0
 %undef cvtps2pi
 
 
+;-------------------------------------------------------------------------------
+; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
+;-------------------------------------------------------------------------------
+%macro FLOAT_TO_INT16_INTERLEAVE2 1
+cglobal float_to_int16_interleave2_%1, 3,4,2, dst, src0, src1, len
+    lea      lenq, [4*r2q]
+    mov     src1q, [src0q+gprsize]
+    mov     src0q, [src0q]
+    add      dstq, lenq
+    add     src0q, lenq
+    add     src1q, lenq
+    neg      lenq
+.loop:
+%ifidn %1, sse2
+    cvtps2dq   m0, [src0q+lenq]
+    cvtps2dq   m1, [src1q+lenq]
+    packssdw   m0, m1
+    movhlps    m1, m0
+    punpcklwd  m0, m1
+    mova  [dstq+lenq], m0
+%else
+    cvtps2pi   m0, [src0q+lenq  ]
+    cvtps2pi   m1, [src0q+lenq+8]
+    cvtps2pi   m2, [src1q+lenq  ]
+    cvtps2pi   m3, [src1q+lenq+8]
+    packssdw   m0, m1
+    packssdw   m2, m3
+    mova       m1, m0
+    punpcklwd  m0, m2
+    punpckhwd  m1, m2
+    mova  [dstq+lenq  ], m0
+    mova  [dstq+lenq+8], m1
+%endif
+    add      lenq, 16
+    js .loop
+%ifnidn %1, sse2
+    emms
+%endif
+    REP_RET
+%endmacro
+
+INIT_MMX
+%define cvtps2pi pf2id
+FLOAT_TO_INT16_INTERLEAVE2 3dnow
+%undef cvtps2pi
+%define movdqa movaps
+FLOAT_TO_INT16_INTERLEAVE2 sse
+%undef movdqa
+INIT_XMM
+FLOAT_TO_INT16_INTERLEAVE2 sse2
+
+
 %macro PSWAPD_SSE 2
     pshufw %1, %2, 0x4e
 %endmacro
diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c
index 86957b45ff..17079d3c82 100644
--- a/libavcodec/x86/fmtconvert_mmx.c
+++ b/libavcodec/x86/fmtconvert_mmx.c
@@ -35,13 +35,17 @@ void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len);
 void ff_float_to_int16_sse  (int16_t *dst, const float *src, long len);
 void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len);
 
+void ff_float_to_int16_interleave2_3dnow(int16_t *dst, const float **src, long len);
+void ff_float_to_int16_interleave2_sse  (int16_t *dst, const float **src, long len);
+void ff_float_to_int16_interleave2_sse2 (int16_t *dst, const float **src, long len);
+
 void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
 void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
 
 #define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
 
-#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
+#define FLOAT_TO_INT16_INTERLEAVE(cpu) \
 /* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
 static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
     DECLARE_ALIGNED(16, int16_t, tmp)[len];\
@@ -57,71 +61,16 @@ static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, lon
     if(channels==1)\
         ff_float_to_int16_##cpu(dst, src[0], len);\
     else if(channels==2){\
-        x86_reg reglen = len; \
-        const float *src0 = src[0];\
-        const float *src1 = src[1];\
-        __asm__ volatile(\
-            "shl $2, %0 \n"\
-            "add %0, %1 \n"\
-            "add %0, %2 \n"\
-            "add %0, %3 \n"\
-            "neg %0 \n"\
-            body\
-            :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
-        );\
+        ff_float_to_int16_interleave2_##cpu(dst, src, len);\
     }else if(channels==6){\
         ff_float_to_int16_interleave6_##cpu(dst, src, len);\
     }else\
         float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
 }
 
-FLOAT_TO_INT16_INTERLEAVE(3dnow,
-    "1:                         \n"
-    "pf2id     (%2,%0), %%mm0   \n"
-    "pf2id    8(%2,%0), %%mm1   \n"
-    "pf2id     (%3,%0), %%mm2   \n"
-    "pf2id    8(%3,%0), %%mm3   \n"
-    "packssdw    %%mm1, %%mm0   \n"
-    "packssdw    %%mm3, %%mm2   \n"
-    "movq        %%mm0, %%mm1   \n"
-    "punpcklwd   %%mm2, %%mm0   \n"
-    "punpckhwd   %%mm2, %%mm1   \n"
-    "movq        %%mm0,  (%1,%0)\n"
-    "movq        %%mm1, 8(%1,%0)\n"
-    "add $16, %0                \n"
-    "js 1b                      \n"
-    "femms                      \n"
-)
-
-FLOAT_TO_INT16_INTERLEAVE(sse,
-    "1:                         \n"
-    "cvtps2pi  (%2,%0), %%mm0   \n"
-    "cvtps2pi 8(%2,%0), %%mm1   \n"
-    "cvtps2pi  (%3,%0), %%mm2   \n"
-    "cvtps2pi 8(%3,%0), %%mm3   \n"
-    "packssdw    %%mm1, %%mm0   \n"
-    "packssdw    %%mm3, %%mm2   \n"
-    "movq        %%mm0, %%mm1   \n"
-    "punpcklwd   %%mm2, %%mm0   \n"
-    "punpckhwd   %%mm2, %%mm1   \n"
-    "movq        %%mm0,  (%1,%0)\n"
-    "movq        %%mm1, 8(%1,%0)\n"
-    "add $16, %0                \n"
-    "js 1b                      \n"
-    "emms                       \n"
-)
-
-FLOAT_TO_INT16_INTERLEAVE(sse2,
-    "1:                         \n"
-    "cvtps2dq  (%2,%0), %%xmm0  \n"
-    "cvtps2dq  (%3,%0), %%xmm1  \n"
-    "packssdw   %%xmm1, %%xmm0  \n"
-    "movhlps    %%xmm0, %%xmm1  \n"
-    "punpcklwd  %%xmm1, %%xmm0  \n"
-    "movdqa     %%xmm0, (%1,%0) \n"
-    "add $16, %0                \n"
-    "js 1b                      \n"
-)
+FLOAT_TO_INT16_INTERLEAVE(3dnow)
+FLOAT_TO_INT16_INTERLEAVE(sse)
+FLOAT_TO_INT16_INTERLEAVE(sse2)
 
 static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
     if(channels==6)
-- 
cgit v1.2.3