summary refs log tree commit diff
path: root/libavutil
diff options
context:
space:
mode:
author Michael Niedermayer <michaelni@gmx.at> 2012-06-19 20:52:00 +0200
committer Michael Niedermayer <michaelni@gmx.at> 2012-06-19 20:53:27 +0200
commit cabbd271a5f37042291c06b9f8bd6c641fbddfde (patch)
tree 110238d357631f95c4849d0d99d978a61b2a1ee7 /libavutil
parent 6b9446e93296ed236d497fe3f493d8956571f888 (diff)
parent 4cc2920dd2c0ce4e64e709da4f78508e1ec9871e (diff)
Merge remote-tracking branch 'qatar/master'
* qatar/master: (24 commits)
  flvdec: remove incomplete, disabled seeking code
  mem: add support for _aligned_malloc() as found on Windows
  lavc: Extend the documentation for avcodec_init_packet
  flvdec: remove incomplete, disabled seeking code
  http: replace atoll() with strtoll()
  mpegts: remove unused/incomplete/broken seeking code
  af_amix: allow float planar sample format as input
  af_amix: use AVFloatDSPContext.vector_fmac_scalar()
  float_dsp: add x86-optimized functions for vector_fmac_scalar()
  float_dsp: Move vector_fmac_scalar() from libavcodec to libavutil
  lavr: Add x86-optimized function for flt to s32 conversion
  lavr: Add x86-optimized function for flt to s16 conversion
  lavr: Add x86-optimized functions for s32 to flt conversion
  lavr: Add x86-optimized functions for s32 to s16 conversion
  lavr: Add x86-optimized functions for s16 to flt conversion
  lavr: Add x86-optimized function for s16 to s32 conversion
  rtpenc: Support packetizing iLBC
  rtpdec: Add a depacketizer for iLBC
  Implement the iLBC storage file format
  mov: Support muxing/demuxing iLBC
  ...

Conflicts:
  Changelog
  configure
  libavcodec/avcodec.h
  libavcodec/dsputil.c
  libavcodec/version.h
  libavformat/movenc.c
  libavformat/mpegts.c
  libavformat/version.h
  libavutil/mem.c

Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavutil')
-rw-r--r-- libavutil/arm/float_dsp_init_neon.c 4
-rw-r--r-- libavutil/arm/float_dsp_neon.S 48
-rw-r--r-- libavutil/float_dsp.c 9
-rw-r--r-- libavutil/float_dsp.h 16
-rw-r--r-- libavutil/mem.c 6
-rw-r--r-- libavutil/x86/float_dsp.asm 47
-rw-r--r-- libavutil/x86/float_dsp_init.c 7
7 files changed, 137 insertions, 0 deletions
diff --git a/libavutil/arm/float_dsp_init_neon.c b/libavutil/arm/float_dsp_init_neon.c
index fa6d0d7d15..3ca0288b31 100644
--- a/libavutil/arm/float_dsp_init_neon.c
+++ b/libavutil/arm/float_dsp_init_neon.c
@@ -26,7 +26,11 @@
void ff_vector_fmul_neon(float *dst, const float *src0, const float *src1, int len);
+void ff_vector_fmac_scalar_neon(float *dst, const float *src, float mul,
+ int len);
+
void ff_float_dsp_init_neon(AVFloatDSPContext *fdsp)
{
fdsp->vector_fmul = ff_vector_fmul_neon;
+ fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_neon;
}
diff --git a/libavutil/arm/float_dsp_neon.S b/libavutil/arm/float_dsp_neon.S
index d66fa09424..03b164388f 100644
--- a/libavutil/arm/float_dsp_neon.S
+++ b/libavutil/arm/float_dsp_neon.S
@@ -62,3 +62,51 @@ function ff_vector_fmul_neon, export=1
3: vst1.32 {d16-d19},[r0,:128]!
bx lr
endfunc
+
+function ff_vector_fmac_scalar_neon, export=1
+VFP len .req r2
+VFP acc .req r3
+NOVFP len .req r3
+NOVFP acc .req r2
+VFP vdup.32 q15, d0[0]
+NOVFP vdup.32 q15, r2
+ bics r12, len, #15
+ mov acc, r0
+ beq 3f
+ vld1.32 {q0}, [r1,:128]!
+ vld1.32 {q8}, [acc,:128]!
+ vld1.32 {q1}, [r1,:128]!
+ vld1.32 {q9}, [acc,:128]!
+1: vmla.f32 q8, q0, q15
+ vld1.32 {q2}, [r1,:128]!
+ vld1.32 {q10}, [acc,:128]!
+ vmla.f32 q9, q1, q15
+ vld1.32 {q3}, [r1,:128]!
+ vld1.32 {q11}, [acc,:128]!
+ vmla.f32 q10, q2, q15
+ vst1.32 {q8}, [r0,:128]!
+ vmla.f32 q11, q3, q15
+ vst1.32 {q9}, [r0,:128]!
+ subs r12, r12, #16
+ beq 2f
+ vld1.32 {q0}, [r1,:128]!
+ vld1.32 {q8}, [acc,:128]!
+ vst1.32 {q10}, [r0,:128]!
+ vld1.32 {q1}, [r1,:128]!
+ vld1.32 {q9}, [acc,:128]!
+ vst1.32 {q11}, [r0,:128]!
+ b 1b
+2: vst1.32 {q10}, [r0,:128]!
+ vst1.32 {q11}, [r0,:128]!
+ ands len, len, #15
+ it eq
+ bxeq lr
+3: vld1.32 {q0}, [r1,:128]!
+ vld1.32 {q8}, [acc,:128]!
+ vmla.f32 q8, q0, q15
+ vst1.32 {q8}, [r0,:128]!
+ subs len, len, #4
+ bgt 3b
+ bx lr
+ .unreq len
+endfunc
diff --git a/libavutil/float_dsp.c b/libavutil/float_dsp.c
index 87cfd88268..f5a8360c86 100644
--- a/libavutil/float_dsp.c
+++ b/libavutil/float_dsp.c
@@ -31,9 +31,18 @@ static void vector_fmul_c(float *dst, const float *src0, const float *src1,
dst[i] = src0[i] * src1[i];
}
+static void vector_fmac_scalar_c(float *dst, const float *src, float mul,
+ int len)
+{
+ int i;
+ for (i = 0; i < len; i++)
+ dst[i] += src[i] * mul;
+}
+
void avpriv_float_dsp_init(AVFloatDSPContext *fdsp, int bit_exact)
{
fdsp->vector_fmul = vector_fmul_c;
+ fdsp->vector_fmac_scalar = vector_fmac_scalar_c;
#if ARCH_ARM
ff_float_dsp_init_arm(fdsp);
diff --git a/libavutil/float_dsp.h b/libavutil/float_dsp.h
index 02c4ab7bde..735eb34c36 100644
--- a/libavutil/float_dsp.h
+++ b/libavutil/float_dsp.h
@@ -35,6 +35,22 @@ typedef struct AVFloatDSPContext {
*/
void (*vector_fmul)(float *dst, const float *src0, const float *src1,
int len);
+
+ /**
+ * Multiply a vector of floats by a scalar float and add to
+ * destination vector. Source and destination vectors must
+ * overlap exactly or not at all.
+ *
+ * @param dst result vector
+ * constraints: 32-byte aligned
+ * @param src input vector
+ * constraints: 32-byte aligned
+ * @param mul scalar value
+ * @param len length of vector
+ * constraints: multiple of 16
+ */
+ void (*vector_fmac_scalar)(float *dst, const float *src, float mul,
+ int len);
} AVFloatDSPContext;
/**
diff --git a/libavutil/mem.c b/libavutil/mem.c
index de22ad8db8..385ace0702 100644
--- a/libavutil/mem.c
+++ b/libavutil/mem.c
@@ -94,6 +94,8 @@ void *av_malloc(size_t size)
if (size) //OS X on SDK 10.6 has a broken posix_memalign implementation
if (posix_memalign(&ptr,ALIGN,size))
ptr = NULL;
+#elif HAVE_ALIGNED_MALLOC
+ ptr = _aligned_malloc(size, ALIGN);
#elif HAVE_MEMALIGN
ptr = memalign(ALIGN,size);
/* Why 64?
@@ -145,6 +147,8 @@ void *av_realloc(void *ptr, size_t size)
ptr= realloc((char*)ptr - diff, size + diff);
if(ptr) ptr = (char*)ptr + diff;
return ptr;
+#elif HAVE_ALIGNED_MALLOC
+ return _aligned_realloc(ptr, size + !size, ALIGN);
#else
return realloc(ptr, size + !size);
#endif
@@ -170,6 +174,8 @@ void av_free(void *ptr)
#if CONFIG_MEMALIGN_HACK
if (ptr)
free((char*)ptr - ((char*)ptr)[-1]);
+#elif HAVE_ALIGNED_MALLOC
+ _aligned_free(ptr);
#else
free(ptr);
#endif
diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
index 6ed716c026..f68e0bfe2d 100644
--- a/libavutil/x86/float_dsp.asm
+++ b/libavutil/x86/float_dsp.asm
@@ -21,6 +21,7 @@
;******************************************************************************
%include "x86inc.asm"
+%include "x86util.asm"
SECTION .text
@@ -55,3 +56,49 @@ VECTOR_FMUL
INIT_YMM avx
VECTOR_FMUL
%endif
+
+;------------------------------------------------------------------------------
+; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
+;------------------------------------------------------------------------------
+
+%macro VECTOR_FMAC_SCALAR 0
+%if UNIX64
+cglobal vector_fmac_scalar, 3,3,3, dst, src, len
+%else
+cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
+%endif
+%if WIN64
+ SWAP 0, 2
+%endif
+%if ARCH_X86_32
+ VBROADCASTSS m0, mulm
+%else
+ shufps xmm0, xmm0, 0
+%if cpuflag(avx)
+ vinsertf128 m0, m0, xmm0, 1
+%endif
+%endif
+ lea lenq, [lend*4-2*mmsize]
+.loop
+ mulps m1, m0, [srcq+lenq ]
+ mulps m2, m0, [srcq+lenq+mmsize]
+ addps m1, m1, [dstq+lenq ]
+ addps m2, m2, [dstq+lenq+mmsize]
+ mova [dstq+lenq ], m1
+ mova [dstq+lenq+mmsize], m2
+ sub lenq, 2*mmsize
+ jge .loop
+%if mmsize == 32
+ vzeroupper
+ RET
+%else
+ REP_RET
+%endif
+%endmacro
+
+INIT_XMM sse
+VECTOR_FMAC_SCALAR
+%if HAVE_AVX
+INIT_YMM avx
+VECTOR_FMAC_SCALAR
+%endif
diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c
index 8f6980cbc2..3e05b9d4ca 100644
--- a/libavutil/x86/float_dsp_init.c
+++ b/libavutil/x86/float_dsp_init.c
@@ -26,6 +26,11 @@ extern void ff_vector_fmul_sse(float *dst, const float *src0, const float *src1,
extern void ff_vector_fmul_avx(float *dst, const float *src0, const float *src1,
int len);
+extern void ff_vector_fmac_scalar_sse(float *dst, const float *src, float mul,
+ int len);
+extern void ff_vector_fmac_scalar_avx(float *dst, const float *src, float mul,
+ int len);
+
void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
{
#if HAVE_YASM
@@ -33,9 +38,11 @@ void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) {
fdsp->vector_fmul = ff_vector_fmul_sse;
+ fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_sse;
}
if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) {
fdsp->vector_fmul = ff_vector_fmul_avx;
+ fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_avx;
}
#endif
}