summaryrefslogtreecommitdiff
path: root/libavcodec
diff options
context:
space:
mode:
authorJustin Ruggles <justin.ruggles@gmail.com>2011-01-30 15:06:46 +0000
committerMans Rullgard <mans@mansr.com>2011-02-02 02:44:53 +0000
commitc73d99e672329c8f2df290736ffc474c360ac4ae (patch)
tree59e330229ee0746b5c466da278430e682fc0371b /libavcodec
parent770c410fbb8e1b87ce8ad7f3d7eddaa55e2b8295 (diff)
Separate format conversion DSP functions from DSPContext.
This will be beneficial for use with the audio conversion API without requiring it to depend on all of dsputil. Signed-off-by: Mans Rullgard <mans@mansr.com>
Diffstat (limited to 'libavcodec')
-rw-r--r--libavcodec/Makefile1
-rw-r--r--libavcodec/aac.h2
-rw-r--r--libavcodec/aacdec.c4
-rw-r--r--libavcodec/ac3dec.c5
-rw-r--r--libavcodec/ac3dec.h2
-rw-r--r--libavcodec/arm/Makefile5
-rw-r--r--libavcodec/arm/dsputil_init_neon.c10
-rw-r--r--libavcodec/arm/dsputil_init_vfp.c4
-rw-r--r--libavcodec/arm/dsputil_neon.S365
-rw-r--r--libavcodec/arm/dsputil_vfp.S55
-rw-r--r--libavcodec/arm/fmtconvert_init_arm.c48
-rw-r--r--libavcodec/arm/fmtconvert_neon.S391
-rw-r--r--libavcodec/arm/fmtconvert_vfp.S77
-rw-r--r--libavcodec/binkaudio.c6
-rw-r--r--libavcodec/dca.c7
-rw-r--r--libavcodec/dsputil.c33
-rw-r--r--libavcodec/dsputil.h5
-rw-r--r--libavcodec/fmtconvert.c68
-rw-r--r--libavcodec/fmtconvert.h79
-rw-r--r--libavcodec/nellymoserdec.c5
-rw-r--r--libavcodec/ppc/Makefile1
-rw-r--r--libavcodec/ppc/float_altivec.c112
-rw-r--r--libavcodec/ppc/fmtconvert_altivec.c142
-rw-r--r--libavcodec/vorbis_dec.c6
-rw-r--r--libavcodec/wma.c1
-rw-r--r--libavcodec/wma.h2
-rw-r--r--libavcodec/wmadec.c2
-rw-r--r--libavcodec/x86/Makefile2
-rw-r--r--libavcodec/x86/dsputil_mmx.c220
-rw-r--r--libavcodec/x86/dsputil_yasm.asm69
-rw-r--r--libavcodec/x86/fmtconvert.asm91
-rw-r--r--libavcodec/x86/fmtconvert_mmx.c266
32 files changed, 1204 insertions, 882 deletions
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index de1bde0737..6a0a05b870 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -12,6 +12,7 @@ OBJS = allcodecs.o \
bitstream_filter.o \
dsputil.o \
faanidct.o \
+ fmtconvert.o \
imgconvert.o \
jrevdct.o \
opt.o \
diff --git a/libavcodec/aac.h b/libavcodec/aac.h
index 714e314cba..cff476a6eb 100644
--- a/libavcodec/aac.h
+++ b/libavcodec/aac.h
@@ -35,6 +35,7 @@
#include "fft.h"
#include "mpeg4audio.h"
#include "sbr.h"
+#include "fmtconvert.h"
#include <stdint.h>
@@ -268,6 +269,7 @@ typedef struct {
FFTContext mdct;
FFTContext mdct_small;
DSPContext dsp;
+ FmtConvertContext fmt_conv;
int random_state;
/** @} */
diff --git a/libavcodec/aacdec.c b/libavcodec/aacdec.c
index 0ea7dc84a5..411c1dfc1b 100644
--- a/libavcodec/aacdec.c
+++ b/libavcodec/aacdec.c
@@ -85,6 +85,7 @@
#include "get_bits.h"
#include "dsputil.h"
#include "fft.h"
+#include "fmtconvert.h"
#include "lpc.h"
#include "aac.h"
@@ -562,6 +563,7 @@ static av_cold int aac_decode_init(AVCodecContext *avctx)
ff_aac_sbr_init();
dsputil_init(&ac->dsp, avctx);
+ ff_fmt_convert_init(&ac->fmt_conv, avctx);
ac->random_state = 0x1f2e3d4c;
@@ -2032,7 +2034,7 @@ static int aac_decode_frame_int(AVCodecContext *avctx, void *data,
*data_size = data_size_tmp;
if (samples)
- ac->dsp.float_to_int16_interleave(data, (const float **)ac->output_data, samples, avctx->channels);
+ ac->fmt_conv.float_to_int16_interleave(data, (const float **)ac->output_data, samples, avctx->channels);
if (ac->output_configured)
ac->output_configured = OC_LOCKED;
diff --git a/libavcodec/ac3dec.c b/libavcodec/ac3dec.c
index 8e40ce1ccc..5ebee1908d 100644
--- a/libavcodec/ac3dec.c
+++ b/libavcodec/ac3dec.c
@@ -193,6 +193,7 @@ static av_cold int ac3_decode_init(AVCodecContext *avctx)
ff_mdct_init(&s->imdct_512, 9, 1, 1.0);
ff_kbd_window_init(s->window, 5.0, 256);
dsputil_init(&s->dsp, avctx);
+ ff_fmt_convert_init(&s->fmt_conv, avctx);
av_lfg_init(&s->dith_state, 0);
/* set scale value for float to int16 conversion */
@@ -1255,7 +1256,7 @@ static int decode_audio_block(AC3DecodeContext *s, int blk)
} else {
gain *= s->dynamic_range[0];
}
- s->dsp.int32_to_float_fmul_scalar(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256);
+ s->fmt_conv.int32_to_float_fmul_scalar(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256);
}
/* apply spectral extension to high frequency bins */
@@ -1407,7 +1408,7 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data, int *data_size,
av_log(avctx, AV_LOG_ERROR, "error decoding the audio block\n");
err = 1;
}
- s->dsp.float_to_int16_interleave(out_samples, output, 256, s->out_channels);
+ s->fmt_conv.float_to_int16_interleave(out_samples, output, 256, s->out_channels);
out_samples += 256 * s->out_channels;
}
*data_size = s->num_blocks * 256 * avctx->channels * sizeof (int16_t);
diff --git a/libavcodec/ac3dec.h b/libavcodec/ac3dec.h
index 55520cdcee..147e5e59bc 100644
--- a/libavcodec/ac3dec.h
+++ b/libavcodec/ac3dec.h
@@ -55,6 +55,7 @@
#include "get_bits.h"
#include "dsputil.h"
#include "fft.h"
+#include "fmtconvert.h"
/* override ac3.h to include coupling channel */
#undef AC3_MAX_CHANNELS
@@ -190,6 +191,7 @@ typedef struct {
///@defgroup opt optimization
DSPContext dsp; ///< for optimization
+ FmtConvertContext fmt_conv; ///< optimized conversion functions
float mul_bias; ///< scaling for float_to_int16 conversion
///@}
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 4c30e0ab9f..014456ee32 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -9,6 +9,7 @@ OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o
OBJS += arm/dsputil_init_arm.o \
arm/dsputil_arm.o \
arm/fft_init_arm.o \
+ arm/fmtconvert_init_arm.o \
arm/jrevdct_arm.o \
arm/mpegvideo_arm.o \
arm/simple_idct_arm.o \
@@ -22,8 +23,11 @@ OBJS-$(HAVE_ARMV6) += arm/dsputil_init_armv6.o \
arm/dsputil_armv6.o \
arm/simple_idct_armv6.o \
+VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o \
+
OBJS-$(HAVE_ARMVFP) += arm/dsputil_vfp.o \
arm/dsputil_init_vfp.o \
+ $(VFP-OBJS-yes)
OBJS-$(HAVE_IWMMXT) += arm/dsputil_iwmmxt.o \
arm/mpegvideo_iwmmxt.o \
@@ -52,6 +56,7 @@ NEON-OBJS-$(CONFIG_VP6_DECODER) += arm/vp56dsp_neon.o \
OBJS-$(HAVE_NEON) += arm/dsputil_init_neon.o \
arm/dsputil_neon.o \
+ arm/fmtconvert_neon.o \
arm/int_neon.o \
arm/mpegvideo_neon.o \
arm/simple_idct_neon.o \
diff --git a/libavcodec/arm/dsputil_init_neon.c b/libavcodec/arm/dsputil_init_neon.c
index 67982048f9..76ae632273 100644
--- a/libavcodec/arm/dsputil_init_neon.c
+++ b/libavcodec/arm/dsputil_init_neon.c
@@ -153,8 +153,6 @@ void ff_sv_fmul_scalar_4_neon(float *dst, const float **vp, float mul,
int len);
void ff_butterflies_float_neon(float *v1, float *v2, int len);
float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len);
-void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
- float mul, int len);
void ff_vector_fmul_reverse_neon(float *dst, const float *src0,
const float *src1, int len);
void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,
@@ -162,8 +160,6 @@ void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,
void ff_vector_clipf_neon(float *dst, const float *src, float min, float max,
int len);
-void ff_float_to_int16_neon(int16_t *, const float *, long);
-void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize);
@@ -308,7 +304,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
c->vector_fmul_scalar = ff_vector_fmul_scalar_neon;
c->butterflies_float = ff_butterflies_float_neon;
c->scalarproduct_float = ff_scalarproduct_float_neon;
- c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
c->vector_fmul_reverse = ff_vector_fmul_reverse_neon;
c->vector_fmul_add = ff_vector_fmul_add_neon;
c->vector_clipf = ff_vector_clipf_neon;
@@ -319,11 +314,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
c->sv_fmul_scalar[0] = ff_sv_fmul_scalar_2_neon;
c->sv_fmul_scalar[1] = ff_sv_fmul_scalar_4_neon;
- if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
- c->float_to_int16 = ff_float_to_int16_neon;
- c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
- }
-
if (CONFIG_VORBIS_DECODER)
c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon;
diff --git a/libavcodec/arm/dsputil_init_vfp.c b/libavcodec/arm/dsputil_init_vfp.c
index 76ef6b4171..bd52315934 100644
--- a/libavcodec/arm/dsputil_init_vfp.c
+++ b/libavcodec/arm/dsputil_init_vfp.c
@@ -25,13 +25,9 @@ void ff_vector_fmul_vfp(float *dst, const float *src0,
const float *src1, int len);
void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
const float *src1, int len);
-void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);
void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx)
{
c->vector_fmul = ff_vector_fmul_vfp;
c->vector_fmul_reverse = ff_vector_fmul_reverse_vfp;
-#if HAVE_ARMV6
- c->float_to_int16 = ff_float_to_int16_vfp;
-#endif
}
diff --git a/libavcodec/arm/dsputil_neon.S b/libavcodec/arm/dsputil_neon.S
index 8329f6cc57..05a911502b 100644
--- a/libavcodec/arm/dsputil_neon.S
+++ b/libavcodec/arm/dsputil_neon.S
@@ -400,343 +400,6 @@ function ff_add_pixels_clamped_neon, export=1
bx lr
endfunc
-function ff_float_to_int16_neon, export=1
- subs r2, r2, #8
- vld1.64 {d0-d1}, [r1,:128]!
- vcvt.s32.f32 q8, q0, #16
- vld1.64 {d2-d3}, [r1,:128]!
- vcvt.s32.f32 q9, q1, #16
- beq 3f
- bics ip, r2, #15
- beq 2f
-1: subs ip, ip, #16
- vshrn.s32 d4, q8, #16
- vld1.64 {d0-d1}, [r1,:128]!
- vcvt.s32.f32 q0, q0, #16
- vshrn.s32 d5, q9, #16
- vld1.64 {d2-d3}, [r1,:128]!
- vcvt.s32.f32 q1, q1, #16
- vshrn.s32 d6, q0, #16
- vst1.64 {d4-d5}, [r0,:128]!
- vshrn.s32 d7, q1, #16
- vld1.64 {d16-d17},[r1,:128]!
- vcvt.s32.f32 q8, q8, #16
- vld1.64 {d18-d19},[r1,:128]!
- vcvt.s32.f32 q9, q9, #16
- vst1.64 {d6-d7}, [r0,:128]!
- bne 1b
- ands r2, r2, #15
- beq 3f
-2: vld1.64 {d0-d1}, [r1,:128]!
- vshrn.s32 d4, q8, #16
- vcvt.s32.f32 q0, q0, #16
- vld1.64 {d2-d3}, [r1,:128]!
- vshrn.s32 d5, q9, #16
- vcvt.s32.f32 q1, q1, #16
- vshrn.s32 d6, q0, #16
- vst1.64 {d4-d5}, [r0,:128]!
- vshrn.s32 d7, q1, #16
- vst1.64 {d6-d7}, [r0,:128]!
- bx lr
-3: vshrn.s32 d4, q8, #16
- vshrn.s32 d5, q9, #16
- vst1.64 {d4-d5}, [r0,:128]!
- bx lr
-endfunc
-
-function ff_float_to_int16_interleave_neon, export=1
- cmp r3, #2
- ldrlt r1, [r1]
- blt ff_float_to_int16_neon
- bne 4f
-
- ldr r3, [r1]
- ldr r1, [r1, #4]
-
- subs r2, r2, #8
- vld1.64 {d0-d1}, [r3,:128]!
- vcvt.s32.f32 q8, q0, #16
- vld1.64 {d2-d3}, [r3,:128]!
- vcvt.s32.f32 q9, q1, #16
- vld1.64 {d20-d21},[r1,:128]!
- vcvt.s32.f32 q10, q10, #16
- vld1.64 {d22-d23},[r1,:128]!
- vcvt.s32.f32 q11, q11, #16
- beq 3f
- bics ip, r2, #15
- beq 2f
-1: subs ip, ip, #16
- vld1.64 {d0-d1}, [r3,:128]!
- vcvt.s32.f32 q0, q0, #16
- vsri.32 q10, q8, #16
- vld1.64 {d2-d3}, [r3,:128]!
- vcvt.s32.f32 q1, q1, #16
- vld1.64 {d24-d25},[r1,:128]!
- vcvt.s32.f32 q12, q12, #16
- vld1.64 {d26-d27},[r1,:128]!
- vsri.32 q11, q9, #16
- vst1.64 {d20-d21},[r0,:128]!
- vcvt.s32.f32 q13, q13, #16
- vst1.64 {d22-d23},[r0,:128]!
- vsri.32 q12, q0, #16
- vld1.64 {d16-d17},[r3,:128]!
- vsri.32 q13, q1, #16
- vst1.64 {d24-d25},[r0,:128]!
- vcvt.s32.f32 q8, q8, #16
- vld1.64 {d18-d19},[r3,:128]!
- vcvt.s32.f32 q9, q9, #16
- vld1.64 {d20-d21},[r1,:128]!
- vcvt.s32.f32 q10, q10, #16
- vld1.64 {d22-d23},[r1,:128]!
- vcvt.s32.f32 q11, q11, #16
- vst1.64 {d26-d27},[r0,:128]!
- bne 1b
- ands r2, r2, #15
- beq 3f
-2: vsri.32 q10, q8, #16
- vld1.64 {d0-d1}, [r3,:128]!
- vcvt.s32.f32 q0, q0, #16
- vld1.64 {d2-d3}, [r3,:128]!
- vcvt.s32.f32 q1, q1, #16
- vld1.64 {d24-d25},[r1,:128]!
- vcvt.s32.f32 q12, q12, #16
- vsri.32 q11, q9, #16
- vld1.64 {d26-d27},[r1,:128]!
- vcvt.s32.f32 q13, q13, #16
- vst1.64 {d20-d21},[r0,:128]!
- vsri.32 q12, q0, #16
- vst1.64 {d22-d23},[r0,:128]!
- vsri.32 q13, q1, #16
- vst1.64 {d24-d27},[r0,:128]!
- bx lr
-3: vsri.32 q10, q8, #16
- vsri.32 q11, q9, #16
- vst1.64 {d20-d23},[r0,:128]!
- bx lr
-
-4: push {r4-r8,lr}
- cmp r3, #4
- lsl ip, r3, #1
- blt 4f
-
- @ 4 channels
-5: ldmia r1!, {r4-r7}
- mov lr, r2
- mov r8, r0
- vld1.64 {d16-d17},[r4,:128]!
- vcvt.s32.f32 q8, q8, #16
- vld1.64 {d18-d19},[r5,:128]!
- vcvt.s32.f32 q9, q9, #16
- vld1.64 {d20-d21},[r6,:128]!
- vcvt.s32.f32 q10, q10, #16
- vld1.64 {d22-d23},[r7,:128]!
- vcvt.s32.f32 q11, q11, #16
-6: subs lr, lr, #8
- vld1.64 {d0-d1}, [r4,:128]!
- vcvt.s32.f32 q0, q0, #16
- vsri.32 q9, q8, #16
- vld1.64 {d2-d3}, [r5,:128]!
- vcvt.s32.f32 q1, q1, #16
- vsri.32 q11, q10, #16
- vld1.64 {d4-d5}, [r6,:128]!
- vcvt.s32.f32 q2, q2, #16
- vzip.32 d18, d22
- vld1.64 {d6-d7}, [r7,:128]!
- vcvt.s32.f32 q3, q3, #16
- vzip.32 d19, d23
- vst1.64 {d18}, [r8], ip
- vsri.32 q1, q0, #16
- vst1.64 {d22}, [r8], ip
- vsri.32 q3, q2, #16
- vst1.64 {d19}, [r8], ip
- vzip.32 d2, d6
- vst1.64 {d23}, [r8], ip
- vzip.32 d3, d7
- beq 7f
- vld1.64 {d16-d17},[r4,:128]!
- vcvt.s32.f32 q8, q8, #16
- vst1.64 {d2}, [r8], ip
- vld1.64 {d18-d19},[r5,:128]!
- vcvt.s32.f32 q9, q9, #16
- vst1.64 {d6}, [r8], ip
- vld1.64 {d20-d21},[r6,:128]!
- vcvt.s32.f32 q10, q10, #16
- vst1.64 {d3}, [r8], ip
- vld1.64 {d22-d23},[r7,:128]!
- vcvt.s32.f32 q11, q11, #16
- vst1.64 {d7}, [r8], ip
- b 6b
-7: vst1.64 {d2}, [r8], ip
- vst1.64 {d6}, [r8], ip
- vst1.64 {d3}, [r8], ip
- vst1.64 {d7}, [r8], ip
- subs r3, r3, #4
- popeq {r4-r8,pc}
- cmp r3, #4
- add r0, r0, #8
- bge 5b
-
- @ 2 channels
-4: cmp r3, #2
- blt 4f
- ldmia r1!, {r4-r5}
- mov lr, r2
- mov r8, r0
- tst lr, #8
- vld1.64 {d16-d17},[r4,:128]!
- vcvt.s32.f32 q8, q8, #16
- vld1.64 {d18-d19},[r5,:128]!
- vcvt.s32.f32 q9, q9, #16
- vld1.64 {d20-d21},[r4,:128]!
- vcvt.s32.f32 q10, q10, #16
- vld1.64 {d22-d23},[r5,:128]!
- vcvt.s32.f32 q11, q11, #16
- beq 6f
- subs lr, lr, #8
- beq 7f
- vsri.32 d18, d16, #16
- vsri.32 d19, d17, #16
- vld1.64 {d16-d17},[r4,:128]!
- vcvt.s32.f32 q8, q8, #16
- vst1.32 {d18[0]}, [r8], ip
- vsri.32 d22, d20, #16
- vst1.32 {d18[1]}, [r8], ip
- vsri.32 d23, d21, #16
- vst1.32 {d19[0]}, [r8], ip
- vst1.32 {d19[1]}, [r8], ip
- vld1.64 {d18-d19},[r5,:128]!
- vcvt.s32.f32 q9, q9, #16
- vst1.32 {d22[0]}, [r8], ip
- vst1.32 {d22[1]}, [r8], ip
- vld1.64 {d20-d21},[r4,:128]!
- vcvt.s32.f32 q10, q10, #16
- vst1.32 {d23[0]}, [r8], ip
- vst1.32 {d23[1]}, [r8], ip
- vld1.64 {d22-d23},[r5,:128]!
- vcvt.s32.f32 q11, q11, #16
-6: subs lr, lr, #16
- vld1.64 {d0-d1}, [r4,:128]!
- vcvt.s32.f32 q0, q0, #16
- vsri.32 d18, d16, #16
- vld1.64 {d2-d3}, [r5,:128]!
- vcvt.s32.f32 q1, q1, #16
- vsri.32 d19, d17, #16
- vld1.64 {d4-d5}, [r4,:128]!
- vcvt.s32.f32 q2, q2, #16
- vld1.64 {d6-d7}, [r5,:128]!
- vcvt.s32.f32 q3, q3, #16
- vst1.32 {d18[0]}, [r8], ip
- vsri.32 d22, d20, #16
- vst1.32 {d18[1]}, [r8], ip
- vsri.32 d23, d21, #16
- vst1.32 {d19[0]}, [r8], ip
- vsri.32 d2, d0, #16
- vst1.32 {d19[1]}, [r8], ip
- vsri.32 d3, d1, #16
- vst1.32 {d22[0]}, [r8], ip
- vsri.32 d6, d4, #16
- vst1.32 {d22[1]}, [r8], ip
- vsri.32 d7, d5, #16
- vst1.32 {d23[0]}, [r8], ip
- vst1.32 {d23[1]}, [r8], ip
- beq 6f
- vld1.64 {d16-d17},[r4,:128]!
- vcvt.s32.f32 q8, q8, #16
- vst1.32 {d2[0]}, [r8], ip
- vst1.32 {d2[1]}, [r8], ip
- vld1.64 {d18-d19},[r5,:128]!
- vcvt.s32.f32 q9, q9, #16
- vst1.32 {d3[0]}, [r8], ip
- vst1.32 {d3[1]}, [r8], ip
- vld1.64 {d20-d21},[r4,:128]!
- vcvt.s32.f32 q10, q10, #16
- vst1.32 {d6[0]}, [r8], ip
- vst1.32 {d6[1]}, [r8], ip
- vld1.64 {d22-d23},[r5,:128]!
- vcvt.s32.f32 q11, q11, #16
- vst1.32 {d7[0]}, [r8], ip
- vst1.32 {d7[1]}, [r8], ip
- bgt 6b
-6: vst1.32 {d2[0]}, [r8], ip
- vst1.32 {d2[1]}, [r8], ip
- vst1.32 {d3[0]}, [r8], ip
- vst1.32 {d3[1]}, [r8], ip
- vst1.32 {d6[0]}, [r8], ip
- vst1.32 {d6[1]}, [r8], ip
- vst1.32 {d7[0]}, [r8], ip
- vst1.32 {d7[1]}, [r8], ip
- b 8f
-7: vsri.32 d18, d16, #16
- vsri.32 d19, d17, #16
- vst1.32 {d18[0]}, [r8], ip
- vsri.32 d22, d20, #16
- vst1.32 {d18[1]}, [r8], ip
- vsri.32 d23, d21, #16
- vst1.32 {d19[0]}, [r8], ip
- vst1.32 {d19[1]}, [r8], ip
- vst1.32 {d22[0]}, [r8], ip
- vst1.32 {d22[1]}, [r8], ip
- vst1.32 {d23[0]}, [r8], ip
- vst1.32 {d23[1]}, [r8], ip
-8: subs r3, r3, #2
- add r0, r0, #4
- popeq {r4-r8,pc}
-
- @ 1 channel
-4: ldr r4, [r1],#4
- tst r2, #8
- mov lr, r2
- mov r5, r0
- vld1.64 {d0-d1}, [r4,:128]!
- vcvt.s32.f32 q0, q0, #16
- vld1.64 {d2-d3}, [r4,:128]!
- vcvt.s32.f32 q1, q1, #16
- bne 8f
-6: subs lr, lr, #16
- vld1.64 {d4-d5}, [r4,:128]!
- vcvt.s32.f32 q2, q2, #16
- vld1.64 {d6-d7}, [r4,:128]!
- vcvt.s32.f32 q3, q3, #16
- vst1.16 {d0[1]}, [r5,:16], ip
- vst1.16 {d0[3]}, [r5,:16], ip
- vst1.16 {d1[1]}, [r5,:16], ip
- vst1.16 {d1[3]}, [r5,:16], ip
- vst1.16 {d2[1]}, [r5,:16], ip
- vst1.16 {d2[3]}, [r5,:16], ip
- vst1.16 {d3[1]}, [r5,:16], ip
- vst1.16 {d3[3]}, [r5,:16], ip
- beq 7f
- vld1.64 {d0-d1}, [r4,:128]!
- vcvt.s32.f32 q0, q0, #16
- vld1.64 {d2-d3}, [r4,:128]!
- vcvt.s32.f32 q1, q1, #16
-7: vst1.16 {d4[1]}, [r5,:16], ip
- vst1.16 {d4[3]}, [r5,:16], ip
- vst1.16 {d5[1]}, [r5,:16], ip
- vst1.16 {d5[3]}, [r5,:16], ip
- vst1.16 {d6[1]}, [r5,:16], ip
- vst1.16 {d6[3]}, [r5,:16], ip
- vst1.16 {d7[1]}, [r5,:16], ip
- vst1.16 {d7[3]}, [r5,:16], ip
- bgt 6b
- pop {r4-r8,pc}
-8: subs lr, lr, #8
- vst1.16 {d0[1]}, [r5,:16], ip
- vst1.16 {d0[3]}, [r5,:16], ip
- vst1.16 {d1[1]}, [r5,:16], ip
- vst1.16 {d1[3]}, [r5,:16], ip
- vst1.16 {d2[1]}, [r5,:16], ip
- vst1.16 {d2[3]}, [r5,:16], ip
- vst1.16 {d3[1]}, [r5,:16], ip
- vst1.16 {d3[3]}, [r5,:16], ip
- popeq {r4-r8,pc}
- vld1.64 {d0-d1}, [r4,:128]!
- vcvt.s32.f32 q0, q0, #16
- vld1.64 {d2-d3}, [r4,:128]!
- vcvt.s32.f32 q1, q1, #16
- b 6b
-endfunc
-
function ff_vector_fmul_neon, export=1
subs r3, r3, #8
vld1.64 {d0-d3}, [r1,:128]!
@@ -1050,34 +713,6 @@ NOVFP vmov.32 r0, d0[0]
bx lr
endfunc
-function ff_int32_to_float_fmul_scalar_neon, export=1
-VFP vdup.32 q0, d0[0]
-VFP len .req r2
-NOVFP vdup.32 q0, r2
-NOVFP len .req r3
-
- vld1.32 {q1},[r1,:128]!
- vcvt.f32.s32 q3, q1
- vld1.32 {q2},[r1,:128]!
- vcvt.f32.s32 q8, q2
-1: subs len, len, #8
- pld [r1, #16]
- vmul.f32 q9, q3, q0
- vmul.f32 q10, q8, q0
- beq 2f
- vld1.32 {q1},[r1,:128]!
- vcvt.f32.s32 q3, q1
- vld1.32 {q2},[r1,:128]!
- vcvt.f32.s32 q8, q2
- vst1.32 {q9}, [r0,:128]!
- vst1.32 {q10},[r0,:128]!
- b 1b
-2: vst1.32 {q9}, [r0,:128]!
- vst1.32 {q10},[r0,:128]!
- bx lr
- .unreq len
-endfunc
-
function ff_vector_fmul_reverse_neon, export=1
add r2, r2, r3, lsl #2
sub r2, r2, #32
diff --git a/libavcodec/arm/dsputil_vfp.S b/libavcodec/arm/dsputil_vfp.S
index a65b69e20a..197d500819 100644
--- a/libavcodec/arm/dsputil_vfp.S
+++ b/libavcodec/arm/dsputil_vfp.S
@@ -131,58 +131,3 @@ function ff_vector_fmul_reverse_vfp, export=1
vpop {d8-d15}
bx lr
endfunc
-
-#if HAVE_ARMV6
-/**
- * ARM VFP optimized float to int16 conversion.
- * Assume that len is a positive number and is multiple of 8, destination
- * buffer is at least 4 bytes aligned (8 bytes alignment is better for
- * performance), little endian byte sex
- */
-@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len)
-function ff_float_to_int16_vfp, export=1
- push {r4-r8,lr}
- vpush {d8-d11}
- vldmia r1!, {s16-s23}
- vcvt.s32.f32 s0, s16
- vcvt.s32.f32 s1, s17
- vcvt.s32.f32 s2, s18
- vcvt.s32.f32 s3, s19
- vcvt.s32.f32 s4, s20
- vcvt.s32.f32 s5, s21
- vcvt.s32.f32 s6, s22
- vcvt.s32.f32 s7, s23
-1:
- subs r2, r2, #8
- vmov r3, r4, s0, s1
- vmov r5, r6, s2, s3
- vmov r7, r8, s4, s5
- vmov ip, lr, s6, s7
- vldmiagt r1!, {s16-s23}
- ssat r4, #16, r4
- ssat r3, #16, r3
- ssat r6, #16, r6
- ssat r5, #16, r5
- pkhbt r3, r3, r4, lsl #16
- pkhbt r4, r5, r6, lsl #16
- vcvtgt.s32.f32 s0, s16
- vcvtgt.s32.f32 s1, s17
- vcvtgt.s32.f32 s2, s18
- vcvtgt.s32.f32 s3, s19
- vcvtgt.s32.f32 s4, s20
- vcvtgt.s32.f32 s5, s21
- vcvtgt.s32.f32 s6, s22
- vcvtgt.s32.f32 s7, s23
- ssat r8, #16, r8
- ssat r7, #16, r7
- ssat lr, #16, lr
- ssat ip, #16, ip
- pkhbt r5, r7, r8, lsl #16
- pkhbt r6, ip, lr, lsl #16
- stmia r0!, {r3-r6}
- bgt 1b
-
- vpop {d8-d11}
- pop {r4-r8,pc}
-endfunc
-#endif
diff --git a/libavcodec/arm/fmtconvert_init_arm.c b/libavcodec/arm/fmtconvert_init_arm.c
new file mode 100644
index 0000000000..4b6e3939f5
--- /dev/null
+++ b/libavcodec/arm/fmtconvert_init_arm.c
@@ -0,0 +1,48 @@
+/*
+ * ARM optimized Format Conversion Utils
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/fmtconvert.h"
+
+void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
+ float mul, int len);
+
+void ff_float_to_int16_neon(int16_t *dst, const float *src, long len);
+void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
+
+void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);
+
+void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx)
+{ /* Install the ARM-optimised conversion routines enabled for this build into c. */
+ if (HAVE_ARMVFP && HAVE_ARMV6) { /* VFP routine; not gated on CODEC_FLAG_BITEXACT */
+ c->float_to_int16 = ff_float_to_int16_vfp;
+ }
+
+ if (HAVE_NEON) { /* assigned after the VFP block, so NEON overrides it where both apply */
+ c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
+
+ if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { /* NEON float->int16 is skipped when bit-exact output is requested */
+ c->float_to_int16 = ff_float_to_int16_neon;
+ c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
+ }
+ }
+}
diff --git a/libavcodec/arm/fmtconvert_neon.S b/libavcodec/arm/fmtconvert_neon.S
new file mode 100644
index 0000000000..359e57e40b
--- /dev/null
+++ b/libavcodec/arm/fmtconvert_neon.S
@@ -0,0 +1,391 @@
+/*
+ * ARM NEON optimised Format Conversion Utils
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "asm.S"
+
+ preserve8
+ .text
+
+function ff_float_to_int16_neon, export=1 /* r0 = dst (int16), r1 = src (float), r2 = len; all accesses carry a :128 alignment hint */
+ subs r2, r2, #8 /* account for the 8 samples preloaded below */
+ vld1.64 {d0-d1}, [r1,:128]!
+ vcvt.s32.f32 q8, q0, #16 /* float -> s32 fixed point with 16 fraction bits */
+ vld1.64 {d2-d3}, [r1,:128]!
+ vcvt.s32.f32 q9, q1, #16
+ beq 3f /* len == 8: only the preloaded samples remain */
+ bics ip, r2, #15 /* ip = remaining count rounded down to a multiple of 16 */
+ beq 2f /* fewer than 16 remain: go straight to the tail */
+1: subs ip, ip, #16 /* main loop: 16 samples stored per iteration */
+ vshrn.s32 d4, q8, #16 /* keep the top 16 bits of each s32 lane -> int16 */
+ vld1.64 {d0-d1}, [r1,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vshrn.s32 d5, q9, #16
+ vld1.64 {d2-d3}, [r1,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ vshrn.s32 d6, q0, #16
+ vst1.64 {d4-d5}, [r0,:128]!
+ vshrn.s32 d7, q1, #16
+ vld1.64 {d16-d17},[r1,:128]! /* preload and convert 8 samples for the next iteration or the tail */
+ vcvt.s32.f32 q8, q8, #16
+ vld1.64 {d18-d19},[r1,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vst1.64 {d6-d7}, [r0,:128]!
+ bne 1b
+ ands r2, r2, #15 /* anything left beyond the 8 samples already converted? */
+ beq 3f
+2: vld1.64 {d0-d1}, [r1,:128]! /* tail: 8 samples already converted plus 8 more loaded here */
+ vshrn.s32 d4, q8, #16
+ vcvt.s32.f32 q0, q0, #16
+ vld1.64 {d2-d3}, [r1,:128]!
+ vshrn.s32 d5, q9, #16
+ vcvt.s32.f32 q1, q1, #16
+ vshrn.s32 d6, q0, #16
+ vst1.64 {d4-d5}, [r0,:128]!
+ vshrn.s32 d7, q1, #16
+ vst1.64 {d6-d7}, [r0,:128]!
+ bx lr
+3: vshrn.s32 d4, q8, #16 /* tail: store the 8 samples already converted */
+ vshrn.s32 d5, q9, #16
+ vst1.64 {d4-d5}, [r0,:128]!
+ bx lr
+endfunc
+
+function ff_float_to_int16_interleave_neon, export=1 /* r0 = dst, r1 = array of per-channel src pointers, r2 = len, r3 = channels */
+ cmp r3, #2
+ ldrlt r1, [r1] /* fewer than 2 channels: r1 = src[0] ... */
+ blt ff_float_to_int16_neon /* ... and tail-call the non-interleaving routine */
+ bne 4f /* more than 2 channels: generic path below */
+
+ ldr r3, [r1] /* stereo fast path: r3 = src[0], r1 = src[1] */
+ ldr r1, [r1, #4]
+
+ subs r2, r2, #8
+ vld1.64 {d0-d1}, [r3,:128]!
+ vcvt.s32.f32 q8, q0, #16 /* float -> s32 fixed point with 16 fraction bits */
+ vld1.64 {d2-d3}, [r3,:128]!
+ vcvt.s32.f32 q9, q1, #16
+ vld1.64 {d20-d21},[r1,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vld1.64 {d22-d23},[r1,:128]!
+ vcvt.s32.f32 q11, q11, #16
+ beq 3f
+ bics ip, r2, #15
+ beq 2f
+1: subs ip, ip, #16
+ vld1.64 {d0-d1}, [r3,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vsri.32 q10, q8, #16 /* merge: src[1] keeps the high halfword, src[0] >> 16 fills the low one */
+ vld1.64 {d2-d3}, [r3,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ vld1.64 {d24-d25},[r1,:128]!
+ vcvt.s32.f32 q12, q12, #16
+ vld1.64 {d26-d27},[r1,:128]!
+ vsri.32 q11, q9, #16
+ vst1.64 {d20-d21},[r0,:128]!
+ vcvt.s32.f32 q13, q13, #16
+ vst1.64 {d22-d23},[r0,:128]!
+ vsri.32 q12, q0, #16
+ vld1.64 {d16-d17},[r3,:128]!
+ vsri.32 q13, q1, #16
+ vst1.64 {d24-d25},[r0,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vld1.64 {d18-d19},[r3,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vld1.64 {d20-d21},[r1,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vld1.64 {d22-d23},[r1,:128]!
+ vcvt.s32.f32 q11, q11, #16
+ vst1.64 {d26-d27},[r0,:128]!
+ bne 1b
+ ands r2, r2, #15
+ beq 3f
+2: vsri.32 q10, q8, #16
+ vld1.64 {d0-d1}, [r3,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vld1.64 {d2-d3}, [r3,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ vld1.64 {d24-d25},[r1,:128]!
+ vcvt.s32.f32 q12, q12, #16
+ vsri.32 q11, q9, #16
+ vld1.64 {d26-d27},[r1,:128]!
+ vcvt.s32.f32 q13, q13, #16
+ vst1.64 {d20-d21},[r0,:128]!
+ vsri.32 q12, q0, #16
+ vst1.64 {d22-d23},[r0,:128]!
+ vsri.32 q13, q1, #16
+ vst1.64 {d24-d27},[r0,:128]!
+ bx lr
+3: vsri.32 q10, q8, #16
+ vsri.32 q11, q9, #16
+ vst1.64 {d20-d23},[r0,:128]!
+ bx lr
+
+4: push {r4-r8,lr} /* generic path: channels handled in groups of 4, then 2, then 1 */
+ cmp r3, #4
+ lsl ip, r3, #1 /* ip = output stride in bytes: channels * 2 */
+ blt 4f
+
+ @ 4 channels
+5: ldmia r1!, {r4-r7} /* take the next four channel pointers */
+ mov lr, r2
+ mov r8, r0
+ vld1.64 {d16-d17},[r4,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vld1.64 {d18-d19},[r5,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vld1.64 {d20-d21},[r6,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vld1.64 {d22-d23},[r7,:128]!
+ vcvt.s32.f32 q11, q11, #16
+6: subs lr, lr, #8
+ vld1.64 {d0-d1}, [r4,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vsri.32 q9, q8, #16
+ vld1.64 {d2-d3}, [r5,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ vsri.32 q11, q10, #16
+ vld1.64 {d4-d5}, [r6,:128]!
+ vcvt.s32.f32 q2, q2, #16
+ vzip.32 d18, d22
+ vld1.64 {d6-d7}, [r7,:128]!
+ vcvt.s32.f32 q3, q3, #16
+ vzip.32 d19, d23
+ vst1.64 {d18}, [r8], ip
+ vsri.32 q1, q0, #16
+ vst1.64 {d22}, [r8], ip
+ vsri.32 q3, q2, #16
+ vst1.64 {d19}, [r8], ip
+ vzip.32 d2, d6
+ vst1.64 {d23}, [r8], ip
+ vzip.32 d3, d7
+ beq 7f
+ vld1.64 {d16-d17},[r4,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vst1.64 {d2}, [r8], ip
+ vld1.64 {d18-d19},[r5,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vst1.64 {d6}, [r8], ip
+ vld1.64 {d20-d21},[r6,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vst1.64 {d3}, [r8], ip
+ vld1.64 {d22-d23},[r7,:128]!
+ vcvt.s32.f32 q11, q11, #16
+ vst1.64 {d7}, [r8], ip
+ b 6b
+7: vst1.64 {d2}, [r8], ip
+ vst1.64 {d6}, [r8], ip
+ vst1.64 {d3}, [r8], ip
+ vst1.64 {d7}, [r8], ip
+ subs r3, r3, #4 /* four channels done */
+ popeq {r4-r8,pc} /* no channels left: return */
+ cmp r3, #4
+ add r0, r0, #8 /* advance dst past the four int16 columns just written */
+ bge 5b
+
+ @ 2 channels
+4: cmp r3, #2
+ blt 4f /* only one channel left: skip to the 1-channel code */
+ ldmia r1!, {r4-r5}
+ mov lr, r2
+ mov r8, r0
+ tst lr, #8
+ vld1.64 {d16-d17},[r4,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vld1.64 {d18-d19},[r5,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vld1.64 {d20-d21},[r4,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vld1.64 {d22-d23},[r5,:128]!
+ vcvt.s32.f32 q11, q11, #16
+ beq 6f
+ subs lr, lr, #8
+ beq 7f
+ vsri.32 d18, d16, #16
+ vsri.32 d19, d17, #16
+ vld1.64 {d16-d17},[r4,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vst1.32 {d18[0]}, [r8], ip
+ vsri.32 d22, d20, #16
+ vst1.32 {d18[1]}, [r8], ip
+ vsri.32 d23, d21, #16
+ vst1.32 {d19[0]}, [r8], ip
+ vst1.32 {d19[1]}, [r8], ip
+ vld1.64 {d18-d19},[r5,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vst1.32 {d22[0]}, [r8], ip
+ vst1.32 {d22[1]}, [r8], ip
+ vld1.64 {d20-d21},[r4,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vst1.32 {d23[0]}, [r8], ip
+ vst1.32 {d23[1]}, [r8], ip
+ vld1.64 {d22-d23},[r5,:128]!
+ vcvt.s32.f32 q11, q11, #16
+6: subs lr, lr, #16
+ vld1.64 {d0-d1}, [r4,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vsri.32 d18, d16, #16
+ vld1.64 {d2-d3}, [r5,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ vsri.32 d19, d17, #16
+ vld1.64 {d4-d5}, [r4,:128]!
+ vcvt.s32.f32 q2, q2, #16
+ vld1.64 {d6-d7}, [r5,:128]!
+ vcvt.s32.f32 q3, q3, #16
+ vst1.32 {d18[0]}, [r8], ip
+ vsri.32 d22, d20, #16
+ vst1.32 {d18[1]}, [r8], ip
+ vsri.32 d23, d21, #16
+ vst1.32 {d19[0]}, [r8], ip
+ vsri.32 d2, d0, #16
+ vst1.32 {d19[1]}, [r8], ip
+ vsri.32 d3, d1, #16
+ vst1.32 {d22[0]}, [r8], ip
+ vsri.32 d6, d4, #16
+ vst1.32 {d22[1]}, [r8], ip
+ vsri.32 d7, d5, #16
+ vst1.32 {d23[0]}, [r8], ip
+ vst1.32 {d23[1]}, [r8], ip
+ beq 6f
+ vld1.64 {d16-d17},[r4,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vst1.32 {d2[0]}, [r8], ip
+ vst1.32 {d2[1]}, [r8], ip
+ vld1.64 {d18-d19},[r5,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vst1.32 {d3[0]}, [r8], ip
+ vst1.32 {d3[1]}, [r8], ip
+ vld1.64 {d20-d21},[r4,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vst1.32 {d6[0]}, [r8], ip
+ vst1.32 {d6[1]}, [r8], ip
+ vld1.64 {d22-d23},[r5,:128]!
+ vcvt.s32.f32 q11, q11, #16
+ vst1.32 {d7[0]}, [r8], ip
+ vst1.32 {d7[1]}, [r8], ip
+ bgt 6b
+6: vst1.32 {d2[0]}, [r8], ip
+ vst1.32 {d2[1]}, [r8], ip
+ vst1.32 {d3[0]}, [r8], ip
+ vst1.32 {d3[1]}, [r8], ip
+ vst1.32 {d6[0]}, [r8], ip
+ vst1.32 {d6[1]}, [r8], ip
+ vst1.32 {d7[0]}, [r8], ip
+ vst1.32 {d7[1]}, [r8], ip
+ b 8f
+7: vsri.32 d18, d16, #16
+ vsri.32 d19, d17, #16
+ vst1.32 {d18[0]}, [r8], ip
+ vsri.32 d22, d20, #16
+ vst1.32 {d18[1]}, [r8], ip
+ vsri.32 d23, d21, #16
+ vst1.32 {d19[0]}, [r8], ip
+ vst1.32 {d19[1]}, [r8], ip
+ vst1.32 {d22[0]}, [r8], ip
+ vst1.32 {d22[1]}, [r8], ip
+ vst1.32 {d23[0]}, [r8], ip
+ vst1.32 {d23[1]}, [r8], ip
+8: subs r3, r3, #2 /* two channels done */
+ add r0, r0, #4 /* advance dst past the two int16 columns just written */
+ popeq {r4-r8,pc} /* no channels left: return */
+
+ @ 1 channel
+4: ldr r4, [r1],#4
+ tst r2, #8
+ mov lr, r2
+ mov r5, r0
+ vld1.64 {d0-d1}, [r4,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vld1.64 {d2-d3}, [r4,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ bne 8f
+6: subs lr, lr, #16
+ vld1.64 {d4-d5}, [r4,:128]!
+ vcvt.s32.f32 q2, q2, #16
+ vld1.64 {d6-d7}, [r4,:128]!
+ vcvt.s32.f32 q3, q3, #16
+ vst1.16 {d0[1]}, [r5,:16], ip /* odd 16-bit lanes are the high halfwords of the s32 results, i.e. the int16 samples */
+ vst1.16 {d0[3]}, [r5,:16], ip
+ vst1.16 {d1[1]}, [r5,:16], ip
+ vst1.16 {d1[3]}, [r5,:16], ip
+ vst1.16 {d2[1]}, [r5,:16], ip
+ vst1.16 {d2[3]}, [r5,:16], ip
+ vst1.16 {d3[1]}, [r5,:16], ip
+ vst1.16 {d3[3]}, [r5,:16], ip
+ beq 7f
+ vld1.64 {d0-d1}, [r4,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vld1.64 {d2-d3}, [r4,:128]!
+ vcvt.s32.f32 q1, q1, #16
+7: vst1.16 {d4[1]}, [r5,:16], ip
+ vst1.16 {d4[3]}, [r5,:16], ip
+ vst1.16 {d5[1]}, [r5,:16], ip
+ vst1.16 {d5[3]}, [r5,:16], ip
+ vst1.16 {d6[1]}, [r5,:16], ip
+ vst1.16 {d6[3]}, [r5,:16], ip
+ vst1.16 {d7[1]}, [r5,:16], ip
+ vst1.16 {d7[3]}, [r5,:16], ip
+ bgt 6b
+ pop {r4-r8,pc}
+8: subs lr, lr, #8
+ vst1.16 {d0[1]}, [r5,:16], ip
+ vst1.16 {d0[3]}, [r5,:16], ip
+ vst1.16 {d1[1]}, [r5,:16], ip
+ vst1.16 {d1[3]}, [r5,:16], ip
+ vst1.16 {d2[1]}, [r5,:16], ip
+ vst1.16 {d2[3]}, [r5,:16], ip
+ vst1.16 {d3[1]}, [r5,:16], ip
+ vst1.16 {d3[3]}, [r5,:16], ip
+ popeq {r4-r8,pc}
+ vld1.64 {d0-d1}, [r4,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vld1.64 {d2-d3}, [r4,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ b 6b
+endfunc
+
+function ff_int32_to_float_fmul_scalar_neon, export=1 /* dst[i] = (float)src[i] * mul; r0 = dst, r1 = src, 8 elements per iteration */
+VFP vdup.32 q0, d0[0] /* NOTE(review): VFP/NOVFP come from asm.S; presumably hard-float vs soft-float argument passing - confirm */
+VFP len .req r2
+NOVFP vdup.32 q0, r2 /* here mul is taken from r2 and len from r3 */
+NOVFP len .req r3
+
+ vld1.32 {q1},[r1,:128]! /* preload and convert the first 8 integers */
+ vcvt.f32.s32 q3, q1
+ vld1.32 {q2},[r1,:128]!
+ vcvt.f32.s32 q8, q2
+1: subs len, len, #8
+ pld [r1, #16]
+ vmul.f32 q9, q3, q0 /* scale by mul, which is broadcast across q0 */
+ vmul.f32 q10, q8, q0
+ beq 2f /* last group: store it without loading further input */
+ vld1.32 {q1},[r1,:128]!
+ vcvt.f32.s32 q3, q1
+ vld1.32 {q2},[r1,:128]!
+ vcvt.f32.s32 q8, q2
+ vst1.32 {q9}, [r0,:128]!
+ vst1.32 {q10},[r0,:128]!
+ b 1b
+2: vst1.32 {q9}, [r0,:128]!
+ vst1.32 {q10},[r0,:128]!
+ bx lr
+ .unreq len
+endfunc
diff --git a/libavcodec/arm/fmtconvert_vfp.S b/libavcodec/arm/fmtconvert_vfp.S
new file mode 100644
index 0000000000..1d19e7758b
--- /dev/null
+++ b/libavcodec/arm/fmtconvert_vfp.S
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "asm.S"
+
+ .syntax unified
+
+/**
+ * ARM VFP optimized float to int16 conversion.
+ * Assumes that len is a positive multiple of 8, that the destination
+ * buffer is at least 4-byte aligned (8-byte alignment is better for
+ * performance), and that the target is little-endian.
+ */
+@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len)
+function ff_float_to_int16_vfp, export=1 /* r0 = dst, r1 = src, r2 = len, as used by the stores, loads and counter below */
+ push {r4-r8,lr}
+ vpush {d8-d11} /* d8-d11 overlap s16-s23, which serve as the input buffer below */
+ vldmia r1!, {s16-s23} /* prologue: load the first 8 floats */
+ vcvt.s32.f32 s0, s16 /* convert them to 32-bit integers in s0-s7 */
+ vcvt.s32.f32 s1, s17
+ vcvt.s32.f32 s2, s18
+ vcvt.s32.f32 s3, s19
+ vcvt.s32.f32 s4, s20
+ vcvt.s32.f32 s5, s21
+ vcvt.s32.f32 s6, s22
+ vcvt.s32.f32 s7, s23
+1:
+ subs r2, r2, #8 /* only flag-setting instruction in the loop: drives every gt-conditional instruction and the final bgt */
+ vmov r3, r4, s0, s1 /* move the 8 converted integers into core registers */
+ vmov r5, r6, s2, s3
+ vmov r7, r8, s4, s5
+ vmov ip, lr, s6, s7
+ vldmiagt r1!, {s16-s23} /* input remaining: fetch the next 8 floats */
+ ssat r4, #16, r4 /* saturate each value to the signed 16-bit range */
+ ssat r3, #16, r3
+ ssat r6, #16, r6
+ ssat r5, #16, r5
+ pkhbt r3, r3, r4, lsl #16 /* pack two samples per word: r3 in the low half, r4 in the high half */
+ pkhbt r4, r5, r6, lsl #16
+ vcvtgt.s32.f32 s0, s16 /* convert the next group (if any) between the packing steps */
+ vcvtgt.s32.f32 s1, s17
+ vcvtgt.s32.f32 s2, s18
+ vcvtgt.s32.f32 s3, s19
+ vcvtgt.s32.f32 s4, s20
+ vcvtgt.s32.f32 s5, s21
+ vcvtgt.s32.f32 s6, s22
+ vcvtgt.s32.f32 s7, s23
+ ssat r8, #16, r8
+ ssat r7, #16, r7
+ ssat lr, #16, lr
+ ssat ip, #16, ip
+ pkhbt r5, r7, r8, lsl #16
+ pkhbt r6, ip, lr, lsl #16
+ stmia r0!, {r3-r6} /* store 4 words = 8 int16 samples */
+ bgt 1b
+
+ vpop {d8-d11}
+ pop {r4-r8,pc} /* restore saved registers; loading the saved lr into pc returns */
+endfunc
diff --git a/libavcodec/binkaudio.c b/libavcodec/binkaudio.c
index ae2f6c88b0..53484654db 100644
--- a/libavcodec/binkaudio.c
+++ b/libavcodec/binkaudio.c
@@ -33,6 +33,7 @@
#include "get_bits.h"
#include "dsputil.h"
#include "fft.h"
+#include "fmtconvert.h"
extern const uint16_t ff_wma_critical_freqs[25];
@@ -43,6 +44,7 @@ typedef struct {
AVCodecContext *avctx;
GetBitContext gb;
DSPContext dsp;
+ FmtConvertContext fmt_conv;
int first;
int channels;
int frame_len; ///< transform size (samples)
@@ -71,6 +73,7 @@ static av_cold int decode_init(AVCodecContext *avctx)
s->avctx = avctx;
dsputil_init(&s->dsp, avctx);
+ ff_fmt_convert_init(&s->fmt_conv, avctx);
/* determine frame length */
if (avctx->sample_rate < 22050) {
@@ -222,7 +225,8 @@ static void decode_block(BinkAudioContext *s, short *out, int use_dct)
ff_rdft_calc(&s->trans.rdft, coeffs);
}
- s->dsp.float_to_int16_interleave(out, (const float **)s->coeffs_ptr, s->frame_len, s->channels);
+ s->fmt_conv.float_to_int16_interleave(out, (const float **)s->coeffs_ptr,
+ s->frame_len, s->channels);
if (!s->first) {
int count = s->overlap_len * s->channels;
diff --git a/libavcodec/dca.c b/libavcodec/dca.c
index 3a3eb25d0b..63ea32992e 100644
--- a/libavcodec/dca.c
+++ b/libavcodec/dca.c
@@ -40,6 +40,7 @@
#include "dca.h"
#include "synth_filter.h"
#include "dcadsp.h"
+#include "fmtconvert.h"
//#define TRACE
@@ -347,6 +348,7 @@ typedef struct {
FFTContext imdct;
SynthFilterContext synth;
DCADSPContext dcadsp;
+ FmtConvertContext fmt_conv;
} DCAContext;
static const uint16_t dca_vlc_offs[] = {
@@ -1115,7 +1117,7 @@ static int dca_subsubframe(DCAContext * s, int base_channel, int block_index)
block[m] = get_bitalloc(&s->gb, &dca_smpl_bitalloc[abits], sel);
}
- s->dsp.int32_to_float_fmul_scalar(subband_samples[k][l],
+ s->fmt_conv.int32_to_float_fmul_scalar(subband_samples[k][l],
block, rscale, 8);
}
@@ -1802,7 +1804,7 @@ static int dca_decode_frame(AVCodecContext * avctx,
}
}
- s->dsp.float_to_int16_interleave(samples, s->samples_chanptr, 256, channels);
+ s->fmt_conv.float_to_int16_interleave(samples, s->samples_chanptr, 256, channels);
samples += 256 * channels;
}
@@ -1835,6 +1837,7 @@ static av_cold int dca_decode_init(AVCodecContext * avctx)
ff_mdct_init(&s->imdct, 6, 1, 1.0);
ff_synth_filter_init(&s->synth);
ff_dcadsp_init(&s->dcadsp);
+ ff_fmt_convert_init(&s->fmt_conv, avctx);
for (i = 0; i < DCA_PRIM_CHANNELS_MAX+1; i++)
s->samples_chanptr[i] = s->samples + i * 256;
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index 2d4ec72026..84714def41 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -3867,12 +3867,6 @@ static float scalarproduct_float_c(const float *v1, const float *v2, int len)
return p;
}
-static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
- int i;
- for(i=0; i<len; i++)
- dst[i] = src[i] * mul;
-}
-
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
uint32_t maxi, uint32_t maxisign)
{
@@ -3918,30 +3912,6 @@ static void vector_clipf_c(float *dst, const float *src, float min, float max, i
}
}
-static av_always_inline int float_to_int16_one(const float *src){
- return av_clip_int16(lrintf(*src));
-}
-
-static void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
- int i;
- for(i=0; i<len; i++)
- dst[i] = float_to_int16_one(src+i);
-}
-
-static void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
- int i,j,c;
- if(channels==2){
- for(i=0; i<len; i++){
- dst[2*i] = float_to_int16_one(src[0]+i);
- dst[2*i+1] = float_to_int16_one(src[1]+i);
- }
- }else{
- for(c=0; c<channels; c++)
- for(i=0, j=c; i<len; i++, j+=channels)
- dst[j] = float_to_int16_one(src[c]+i);
- }
-}
-
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
int res = 0;
@@ -4437,10 +4407,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
c->vector_fmul_reverse = vector_fmul_reverse_c;
c->vector_fmul_add = vector_fmul_add_c;
c->vector_fmul_window = vector_fmul_window_c;
- c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
c->vector_clipf = vector_clipf_c;
- c->float_to_int16 = ff_float_to_int16_c;
- c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
c->scalarproduct_int16 = scalarproduct_int16_c;
c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
c->scalarproduct_float = scalarproduct_float_c;
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index b942e66a37..c8111866c2 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -392,7 +392,6 @@ typedef struct DSPContext {
/* assume len is a multiple of 4, and arrays are 16-byte aligned */
void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, int len);
/* assume len is a multiple of 8, and arrays are 16-byte aligned */
- void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */);
/**
* Multiply a vector of floats by a scalar float. Source and
@@ -445,10 +444,6 @@ typedef struct DSPContext {
*/
void (*butterflies_float)(float *restrict v1, float *restrict v2, int len);
- /* convert floats from [-32768.0,32767.0] without rescaling and arrays are 16byte aligned */
- void (*float_to_int16)(int16_t *dst, const float *src, long len);
- void (*float_to_int16_interleave)(int16_t *dst, const float **src, long len, int channels);
-
/* (I)DCT */
void (*fdct)(DCTELEM *block/* align 16*/);
void (*fdct248)(DCTELEM *block/* align 16*/);
diff --git a/libavcodec/fmtconvert.c b/libavcodec/fmtconvert.c
new file mode 100644
index 0000000000..e26b8997ab
--- /dev/null
+++ b/libavcodec/fmtconvert.c
@@ -0,0 +1,68 @@
+/*
+ * Format Conversion Utils
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "avcodec.h"
+#include "fmtconvert.h"
+
+static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){ /* Portable C version: dst[i] = src[i] * mul for i in [0, len). */
+ int i;
+ for(i=0; i<len; i++)
+ dst[i] = src[i] * mul; /* the int operand is converted to floating point for the multiply */
+}
+
+static av_always_inline int float_to_int16_one(const float *src){ /* Convert the single float at *src to an integer limited to the int16 range. */
+ return av_clip_int16(lrintf(*src)); /* lrintf() rounds per the current rounding mode; av_clip_int16() is defined elsewhere and presumably clamps to [-32768,32767] */
+}
+
+static void float_to_int16_c(int16_t *dst, const float *src, long len) /* Portable C version: convert len floats from src into int16_t values in dst. */
+{
+ int i; /* NOTE(review): the index is int while len is long */
+ for(i=0; i<len; i++)
+ dst[i] = float_to_int16_one(src+i); /* each sample is rounded and clipped individually */
+}
+
+static void float_to_int16_interleave_c(int16_t *dst, const float **src, /* Portable C version: convert the planar arrays src[0..channels-1] into one interleaved int16_t array. */
+ long len, int channels) /* len counts samples per channel; dst receives len*channels values */
+{
+ int i,j,c;
+ if(channels==2){ /* stereo: a single pass writes both channels of every frame */
+ for(i=0; i<len; i++){
+ dst[2*i] = float_to_int16_one(src[0]+i);
+ dst[2*i+1] = float_to_int16_one(src[1]+i);
+ }
+ }else{ /* any other channel count: one pass per channel */
+ for(c=0; c<channels; c++)
+ for(i=0, j=c; i<len; i++, j+=channels) /* j = output index of sample i of channel c */
+ dst[j] = float_to_int16_one(src[c]+i);
+ }
+}
+
+av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx) /* Fill *c with the C implementations, then let the architecture-specific init functions override them. */
+{
+ c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
+ c->float_to_int16 = float_to_int16_c;
+ c->float_to_int16_interleave = float_to_int16_interleave_c;
+
+ if (ARCH_ARM) ff_fmt_convert_init_arm(c, avctx); /* ARCH_x / HAVE_x are defined outside this file (presumably 0/1 build-configuration constants -- confirm) */
+ if (ARCH_PPC) ff_fmt_convert_init_ppc(c, avctx); /* NOTE(review): this patch's ppc/Makefile hunk builds fmtconvert_altivec.o under HAVE_ALTIVEC, while the call is guarded by ARCH_PPC -- confirm a PPC build without AltiVec still links */
+ if (HAVE_MMX) ff_fmt_convert_init_x86(c, avctx);
+}
diff --git a/libavcodec/fmtconvert.h b/libavcodec/fmtconvert.h
new file mode 100644
index 0000000000..f2ee261f99
--- /dev/null
+++ b/libavcodec/fmtconvert.h
@@ -0,0 +1,79 @@
+/*
+ * Format Conversion Utils
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_FMTCONVERT_H
+#define AVCODEC_FMTCONVERT_H
+
+#include "avcodec.h"
+
+typedef struct FmtConvertContext {
+ /**
+ * Convert an array of int32_t to float and multiply by a float value.
+ * @param dst destination array of float.
+ * constraints: 16-byte aligned
+ * @param src source array of int32_t.
+ * constraints: 16-byte aligned
+ * @param mul value each converted element is multiplied by.
+ * @param len number of elements to convert.
+ * constraints: multiple of 8
+ */
+ void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
+
+ /**
+ * Convert an array of float to an array of int16_t.
+ *
+ * Convert floats in the range [-32768.0,32767.0] to ints
+ * without rescaling.
+ *
+ * @param dst destination array of int16_t.
+ * constraints: 16-byte aligned
+ * @param src source array of float.
+ * constraints: 16-byte aligned
+ * @param len number of elements to convert.
+ * constraints: multiple of 8
+ */
+ void (*float_to_int16)(int16_t *dst, const float *src, long len);
+
+ /**
+ * Convert multiple arrays of float to an interleaved array of int16_t.
+ *
+ * Convert floats in the range [-32768.0,32767.0] to ints
+ * without rescaling.
+ *
+ * @param dst destination array of interleaved int16_t.
+ * constraints: 16-byte aligned
+ * @param src source array of float arrays, one for each channel.
+ * constraints: 16-byte aligned
+ * @param len number of elements to convert from each channel.
+ * constraints: multiple of 8
+ * @param channels number of channels
+ */
+ void (*float_to_int16_interleave)(int16_t *dst, const float **src,
+ long len, int channels);
+} FmtConvertContext;
+
+void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx);
+
+void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx);
+void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx);
+void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx);
+
+#endif /* AVCODEC_FMTCONVERT_H */
diff --git a/libavcodec/nellymoserdec.c b/libavcodec/nellymoserdec.c
index 8b13a5d894..80e04ee0a2 100644
--- a/libavcodec/nellymoserdec.c
+++ b/libavcodec/nellymoserdec.c
@@ -38,6 +38,7 @@
#include "avcodec.h"
#include "dsputil.h"
#include "fft.h"
+#include "fmtconvert.h"
#define ALT_BITSTREAM_READER_LE
#include "get_bits.h"
@@ -52,6 +53,7 @@ typedef struct NellyMoserDecodeContext {
float scale_bias;
DSPContext dsp;
FFTContext imdct_ctx;
+ FmtConvertContext fmt_conv;
DECLARE_ALIGNED(16, float,imdct_out)[NELLY_BUF_LEN * 2];
} NellyMoserDecodeContext;
@@ -134,6 +136,7 @@ static av_cold int decode_init(AVCodecContext * avctx) {
ff_mdct_init(&s->imdct_ctx, 8, 1, 1.0);
dsputil_init(&s->dsp, avctx);
+ ff_fmt_convert_init(&s->fmt_conv, avctx);
s->scale_bias = 1.0/(1*8);
@@ -175,7 +178,7 @@ static int decode_tag(AVCodecContext * avctx,
for (i=0 ; i<blocks ; i++) {
nelly_decode_block(s, &buf[i*NELLY_BLOCK_LEN], s->float_buf);
- s->dsp.float_to_int16(&samples[i*NELLY_SAMPLES], s->float_buf, NELLY_SAMPLES);
+ s->fmt_conv.float_to_int16(&samples[i*NELLY_SAMPLES], s->float_buf, NELLY_SAMPLES);
*data_size += NELLY_SAMPLES*sizeof(int16_t);
}
diff --git a/libavcodec/ppc/Makefile b/libavcodec/ppc/Makefile
index 9b2358d49c..35ea0c38f8 100644
--- a/libavcodec/ppc/Makefile
+++ b/libavcodec/ppc/Makefile
@@ -21,6 +21,7 @@ ALTIVEC-OBJS-$(CONFIG_FFT) += ppc/fft_altivec.o \
OBJS-$(HAVE_ALTIVEC) += ppc/dsputil_altivec.o \
ppc/fdct_altivec.o \
ppc/float_altivec.o \
+ ppc/fmtconvert_altivec.o \
ppc/gmc_altivec.o \
ppc/idct_altivec.o \
ppc/int_altivec.o \
diff --git a/libavcodec/ppc/float_altivec.c b/libavcodec/ppc/float_altivec.c
index 60bae9a757..ba97cbfd3b 100644
--- a/libavcodec/ppc/float_altivec.c
+++ b/libavcodec/ppc/float_altivec.c
@@ -122,124 +122,12 @@ static void vector_fmul_window_altivec(float *dst, const float *src0, const floa
}
}
-static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len)
-{
- union {
- vector float v;
- float s[4];
- } mul_u;
- int i;
- vector float src1, src2, dst1, dst2, mul_v, zero;
-
- zero = (vector float)vec_splat_u32(0);
- mul_u.s[0] = mul;
- mul_v = vec_splat(mul_u.v, 0);
-
- for(i=0; i<len; i+=8) {
- src1 = vec_ctf(vec_ld(0, src+i), 0);
- src2 = vec_ctf(vec_ld(16, src+i), 0);
- dst1 = vec_madd(src1, mul_v, zero);
- dst2 = vec_madd(src2, mul_v, zero);
- vec_st(dst1, 0, dst+i);
- vec_st(dst2, 16, dst+i);
- }
-}
-
-
-static vector signed short
-float_to_int16_one_altivec(const float *src)
-{
- vector float s0 = vec_ld(0, src);
- vector float s1 = vec_ld(16, src);
- vector signed int t0 = vec_cts(s0, 0);
- vector signed int t1 = vec_cts(s1, 0);
- return vec_packs(t0,t1);
-}
-
-static void float_to_int16_altivec(int16_t *dst, const float *src, long len)
-{
- int i;
- vector signed short d0, d1, d;
- vector unsigned char align;
- if(((long)dst)&15) //FIXME
- for(i=0; i<len-7; i+=8) {
- d0 = vec_ld(0, dst+i);
- d = float_to_int16_one_altivec(src+i);
- d1 = vec_ld(15, dst+i);
- d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i));
- align = vec_lvsr(0, dst+i);
- d0 = vec_perm(d1, d, align);
- d1 = vec_perm(d, d1, align);
- vec_st(d0, 0, dst+i);
- vec_st(d1,15, dst+i);
- }
- else
- for(i=0; i<len-7; i+=8) {
- d = float_to_int16_one_altivec(src+i);
- vec_st(d, 0, dst+i);
- }
-}
-
-static void
-float_to_int16_interleave_altivec(int16_t *dst, const float **src,
- long len, int channels)
-{
- int i;
- vector signed short d0, d1, d2, c0, c1, t0, t1;
- vector unsigned char align;
- if(channels == 1)
- float_to_int16_altivec(dst, src[0], len);
- else
- if (channels == 2) {
- if(((long)dst)&15)
- for(i=0; i<len-7; i+=8) {
- d0 = vec_ld(0, dst + i);
- t0 = float_to_int16_one_altivec(src[0] + i);
- d1 = vec_ld(31, dst + i);
- t1 = float_to_int16_one_altivec(src[1] + i);
- c0 = vec_mergeh(t0, t1);
- c1 = vec_mergel(t0, t1);
- d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
- align = vec_lvsr(0, dst + i);
- d0 = vec_perm(d2, c0, align);
- d1 = vec_perm(c0, c1, align);
- vec_st(d0, 0, dst + i);
- d0 = vec_perm(c1, d2, align);
- vec_st(d1, 15, dst + i);
- vec_st(d0, 31, dst + i);
- dst+=8;
- }
- else
- for(i=0; i<len-7; i+=8) {
- t0 = float_to_int16_one_altivec(src[0] + i);
- t1 = float_to_int16_one_altivec(src[1] + i);
- d0 = vec_mergeh(t0, t1);
- d1 = vec_mergel(t0, t1);
- vec_st(d0, 0, dst + i);
- vec_st(d1, 16, dst + i);
- dst+=8;
- }
- } else {
- DECLARE_ALIGNED(16, int16_t, tmp)[len];
- int c, j;
- for (c = 0; c < channels; c++) {
- float_to_int16_altivec(tmp, src[c], len);
- for (i = 0, j = c; i < len; i++, j+=channels) {
- dst[j] = tmp[i];
- }
- }
- }
-}
-
void float_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
c->vector_fmul = vector_fmul_altivec;
c->vector_fmul_reverse = vector_fmul_reverse_altivec;
c->vector_fmul_add = vector_fmul_add_altivec;
- c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec;
if(!(avctx->flags & CODEC_FLAG_BITEXACT)) {
c->vector_fmul_window = vector_fmul_window_altivec;
- c->float_to_int16 = float_to_int16_altivec;
- c->float_to_int16_interleave = float_to_int16_interleave_altivec;
}
}
diff --git a/libavcodec/ppc/fmtconvert_altivec.c b/libavcodec/ppc/fmtconvert_altivec.c
new file mode 100644
index 0000000000..e5287c96c1
--- /dev/null
+++ b/libavcodec/ppc/fmtconvert_altivec.c
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/fmtconvert.h"
+
+#include "dsputil_altivec.h"
+#include "util_altivec.h"
+
+static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len) /* AltiVec version: dst[i] = src[i] * mul, 8 elements per loop iteration. */
+{
+ union {
+ vector float v;
+ float s[4];
+ } mul_u; /* gives scalar access to element 0 of a vector */
+ int i;
+ vector float src1, src2, dst1, dst2, mul_v, zero;
+
+ zero = (vector float)vec_splat_u32(0); /* all-zero addend for vec_madd below */
+ mul_u.s[0] = mul;
+ mul_v = vec_splat(mul_u.v, 0); /* broadcast mul into all four elements */
+
+ for(i=0; i<len; i+=8) { /* a len that is not a multiple of 8 would process past len */
+ src1 = vec_ctf(vec_ld(0, src+i), 0); /* load 4 ints and convert to float (scale argument 0) */
+ src2 = vec_ctf(vec_ld(16, src+i), 0); /* the following 4 ints, 16 bytes further on */
+ dst1 = vec_madd(src1, mul_v, zero); /* src * mul + 0 */
+ dst2 = vec_madd(src2, mul_v, zero);
+ vec_st(dst1, 0, dst+i);
+ vec_st(dst2, 16, dst+i);
+ }
+}
+
+
+static vector signed short /* Convert 8 consecutive floats starting at src into one vector of 8 int16 values. */
+float_to_int16_one_altivec(const float *src)
+{
+ vector float s0 = vec_ld(0, src); /* first 4 floats (vec_ld is an aligned load; assumes src is 16-byte aligned -- confirm callers) */
+ vector float s1 = vec_ld(16, src); /* next 4 floats */
+ vector signed int t0 = vec_cts(s0, 0); /* float -> signed 32-bit int, scale argument 0 */
+ vector signed int t1 = vec_cts(s1, 0);
+ return vec_packs(t0,t1); /* narrow both halves to 16 bits with saturation */
+}
+
+static void float_to_int16_altivec(int16_t *dst, const float *src, long len) /* AltiVec float->int16 conversion, 8 samples per iteration; a trailing len%8 remainder is not converted (loops run while i < len-7). */
+{
+ int i;
+ vector signed short d0, d1, d;
+ vector unsigned char align;
+ if(((long)dst)&15) //FIXME
+ for(i=0; i<len-7; i+=8) { /* dst is not 16-byte aligned: the 8 results straddle two aligned vectors, so read-modify-write both */
+ d0 = vec_ld(0, dst+i); /* current contents of the aligned vector holding the first output byte */
+ d = float_to_int16_one_altivec(src+i);
+ d1 = vec_ld(15, dst+i); /* current contents of the aligned vector holding the last output byte */
+ d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i));
+ align = vec_lvsr(0, dst+i); /* permute mask derived from dst's misalignment */
+ d0 = vec_perm(d1, d, align); /* merge preserved bytes with the new samples for each of the two stores */
+ d1 = vec_perm(d, d1, align);
+ vec_st(d0, 0, dst+i);
+ vec_st(d1,15, dst+i);
+ }
+ else
+ for(i=0; i<len-7; i+=8) { /* aligned dst: store each converted vector directly */
+ d = float_to_int16_one_altivec(src+i);
+ vec_st(d, 0, dst+i);
+ }
+}
+
+static void /* AltiVec version: convert planar float channels in src[] to interleaved int16_t in dst. */
+float_to_int16_interleave_altivec(int16_t *dst, const float **src,
+ long len, int channels)
+{
+ int i;
+ vector signed short d0, d1, d2, c0, c1, t0, t1;
+ vector unsigned char align;
+ if(channels == 1) /* mono: nothing to interleave */
+ float_to_int16_altivec(dst, src[0], len);
+ else
+ if (channels == 2) {
+ if(((long)dst)&15) /* unaligned dst: 16 outputs per iteration span three aligned vectors */
+ for(i=0; i<len-7; i+=8) {
+ d0 = vec_ld(0, dst + i); /* existing contents at the start of the output span */
+ t0 = float_to_int16_one_altivec(src[0] + i);
+ d1 = vec_ld(31, dst + i); /* existing contents at the end of the output span */
+ t1 = float_to_int16_one_altivec(src[1] + i);
+ c0 = vec_mergeh(t0, t1); /* samples 0..3 of both channels, interleaved */
+ c1 = vec_mergel(t0, t1); /* samples 4..7 of both channels, interleaved */
+ d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
+ align = vec_lvsr(0, dst + i); /* permute mask derived from dst's misalignment */
+ d0 = vec_perm(d2, c0, align);
+ d1 = vec_perm(c0, c1, align);
+ vec_st(d0, 0, dst + i);
+ d0 = vec_perm(c1, d2, align);
+ vec_st(d1, 15, dst + i);
+ vec_st(d0, 31, dst + i);
+ dst+=8; /* together with i+=8, dst+i advances by 16 int16 per 8 input frames */
+ }
+ else
+ for(i=0; i<len-7; i+=8) { /* aligned dst: two direct stores per iteration */
+ t0 = float_to_int16_one_altivec(src[0] + i);
+ t1 = float_to_int16_one_altivec(src[1] + i);
+ d0 = vec_mergeh(t0, t1);
+ d1 = vec_mergel(t0, t1);
+ vec_st(d0, 0, dst + i);
+ vec_st(d1, 16, dst + i);
+ dst+=8; /* same 16-per-iteration output stride as above */
+ }
+ } else { /* other channel counts: convert one channel at a time into tmp, then scatter */
+ DECLARE_ALIGNED(16, int16_t, tmp)[len]; /* array sized by the run-time len; DECLARE_ALIGNED is defined elsewhere (presumably requests 16-byte alignment) */
+ int c, j;
+ for (c = 0; c < channels; c++) {
+ float_to_int16_altivec(tmp, src[c], len);
+ for (i = 0, j = c; i < len; i++, j+=channels) { /* NOTE(review): copies all len entries, but the converter above skips a len%8 remainder -- relies on len being a multiple of 8 */
+ dst[j] = tmp[i];
+ }
+ }
+ }
+}
+
+void ff_fmt_convert_init_ppc(FmtConvertContext *c, AVCodecContext *avctx) /* Install the AltiVec implementations from this file into *c. */
+{
+ c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec; /* installed unconditionally; NOTE(review): no run-time CPU capability check is visible here */
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)) { /* the float->int16 versions are installed only when CODEC_FLAG_BITEXACT is not set */
+ c->float_to_int16 = float_to_int16_altivec;
+ c->float_to_int16_interleave = float_to_int16_interleave_altivec;
+ }
+}
diff --git a/libavcodec/vorbis_dec.c b/libavcodec/vorbis_dec.c
index 9fef5eb26f..bca56ba663 100644
--- a/libavcodec/vorbis_dec.c
+++ b/libavcodec/vorbis_dec.c
@@ -31,6 +31,7 @@
#include "get_bits.h"
#include "dsputil.h"
#include "fft.h"
+#include "fmtconvert.h"
#include "vorbis.h"
#include "xiph.h"
@@ -127,6 +128,7 @@ typedef struct vorbis_context_s {
AVCodecContext *avccontext;
GetBitContext gb;
DSPContext dsp;
+ FmtConvertContext fmt_conv;
FFTContext mdct[2];
uint_fast8_t first_frame;
@@ -961,6 +963,7 @@ static av_cold int vorbis_decode_init(AVCodecContext *avccontext)
vc->avccontext = avccontext;
dsputil_init(&vc->dsp, avccontext);
+ ff_fmt_convert_init(&vc->fmt_conv, avccontext);
vc->scale_bias = 32768.0f;
@@ -1636,7 +1639,8 @@ static int vorbis_decode_frame(AVCodecContext *avccontext,
len * ff_vorbis_channel_layout_offsets[vc->audio_channels - 1][i];
}
- vc->dsp.float_to_int16_interleave(data, channel_ptrs, len, vc->audio_channels);
+ vc->fmt_conv.float_to_int16_interleave(data, channel_ptrs, len,
+ vc->audio_channels);
*data_size = len * 2 * vc->audio_channels;
return buf_size ;
diff --git a/libavcodec/wma.c b/libavcodec/wma.c
index e0b9b68395..a7eacb8c78 100644
--- a/libavcodec/wma.c
+++ b/libavcodec/wma.c
@@ -126,6 +126,7 @@ int ff_wma_init(AVCodecContext *avctx, int flags2)
s->block_align = avctx->block_align;
dsputil_init(&s->dsp, avctx);
+ ff_fmt_convert_init(&s->fmt_conv, avctx);
if (avctx->codec->id == CODEC_ID_WMAV1) {
s->version = 1;
diff --git a/libavcodec/wma.h b/libavcodec/wma.h
index 11274ad970..a51b3e83cf 100644
--- a/libavcodec/wma.h
+++ b/libavcodec/wma.h
@@ -26,6 +26,7 @@
#include "put_bits.h"
#include "dsputil.h"
#include "fft.h"
+#include "fmtconvert.h"
/* size of blocks */
#define BLOCK_MIN_BITS 7
@@ -134,6 +135,7 @@ typedef struct WMACodecContext {
float lsp_pow_m_table1[(1 << LSP_POW_BITS)];
float lsp_pow_m_table2[(1 << LSP_POW_BITS)];
DSPContext dsp;
+ FmtConvertContext fmt_conv;
#ifdef TRACE
int frame_count;
diff --git a/libavcodec/wmadec.c b/libavcodec/wmadec.c
index d85d80d574..83f8dea8bb 100644
--- a/libavcodec/wmadec.c
+++ b/libavcodec/wmadec.c
@@ -791,7 +791,7 @@ static int wma_decode_frame(WMACodecContext *s, int16_t *samples)
incr = s->nb_channels;
for (ch = 0; ch < MAX_CHANNELS; ch++)
output[ch] = s->frame_out[ch];
- s->dsp.float_to_int16_interleave(samples, output, n, incr);
+ s->fmt_conv.float_to_int16_interleave(samples, output, n, incr);
for (ch = 0; ch < incr; ch++) {
/* prepare for next block */
memmove(&s->frame_out[ch][0], &s->frame_out[ch][n], n * sizeof(float));
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 943edcb0ec..83cec00442 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -39,6 +39,7 @@ YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o
MMX-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp-init.o
MMX-OBJS-$(HAVE_YASM) += x86/dsputil_yasm.o \
x86/deinterlace.o \
+ x86/fmtconvert.o \
x86/h264_chromamc.o \
$(YASM-OBJS-yes)
@@ -47,6 +48,7 @@ MMX-OBJS-$(CONFIG_FFT) += x86/fft.o
OBJS-$(HAVE_MMX) += x86/dnxhd_mmx.o \
x86/dsputil_mmx.o \
x86/fdct_mmx.o \
+ x86/fmtconvert_mmx.o \
x86/idct_mmx_xvid.o \
x86/idct_sse2_xvid.o \
x86/motion_est_mmx.o \
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 2eb7d85f14..39bf3f2936 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2349,50 +2349,6 @@ static void vector_fmul_window_sse(float *dst, const float *src0, const float *s
}
#endif /* HAVE_6REGS */
-static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
-{
- x86_reg i = -4*len;
- __asm__ volatile(
- "movss %3, %%xmm4 \n"
- "shufps $0, %%xmm4, %%xmm4 \n"
- "1: \n"
- "cvtpi2ps (%2,%0), %%xmm0 \n"
- "cvtpi2ps 8(%2,%0), %%xmm1 \n"
- "cvtpi2ps 16(%2,%0), %%xmm2 \n"
- "cvtpi2ps 24(%2,%0), %%xmm3 \n"
- "movlhps %%xmm1, %%xmm0 \n"
- "movlhps %%xmm3, %%xmm2 \n"
- "mulps %%xmm4, %%xmm0 \n"
- "mulps %%xmm4, %%xmm2 \n"
- "movaps %%xmm0, (%1,%0) \n"
- "movaps %%xmm2, 16(%1,%0) \n"
- "add $32, %0 \n"
- "jl 1b \n"
- :"+r"(i)
- :"r"(dst+len), "r"(src+len), "m"(mul)
- );
-}
-
-static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
-{
- x86_reg i = -4*len;
- __asm__ volatile(
- "movss %3, %%xmm4 \n"
- "shufps $0, %%xmm4, %%xmm4 \n"
- "1: \n"
- "cvtdq2ps (%2,%0), %%xmm0 \n"
- "cvtdq2ps 16(%2,%0), %%xmm1 \n"
- "mulps %%xmm4, %%xmm0 \n"
- "mulps %%xmm4, %%xmm1 \n"
- "movaps %%xmm0, (%1,%0) \n"
- "movaps %%xmm1, 16(%1,%0) \n"
- "add $32, %0 \n"
- "jl 1b \n"
- :"+r"(i)
- :"r"(dst+len), "r"(src+len), "m"(mul)
- );
-}
-
static void vector_clipf_sse(float *dst, const float *src, float min, float max,
int len)
{
@@ -2427,70 +2383,6 @@ static void vector_clipf_sse(float *dst, const float *src, float min, float max,
);
}
-static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
- x86_reg reglen = len;
- // not bit-exact: pf2id uses different rounding than C and SSE
- __asm__ volatile(
- "add %0 , %0 \n\t"
- "lea (%2,%0,2) , %2 \n\t"
- "add %0 , %1 \n\t"
- "neg %0 \n\t"
- "1: \n\t"
- "pf2id (%2,%0,2) , %%mm0 \n\t"
- "pf2id 8(%2,%0,2) , %%mm1 \n\t"
- "pf2id 16(%2,%0,2) , %%mm2 \n\t"
- "pf2id 24(%2,%0,2) , %%mm3 \n\t"
- "packssdw %%mm1 , %%mm0 \n\t"
- "packssdw %%mm3 , %%mm2 \n\t"
- "movq %%mm0 , (%1,%0) \n\t"
- "movq %%mm2 , 8(%1,%0) \n\t"
- "add $16 , %0 \n\t"
- " js 1b \n\t"
- "femms \n\t"
- :"+r"(reglen), "+r"(dst), "+r"(src)
- );
-}
-static void float_to_int16_sse(int16_t *dst, const float *src, long len){
- x86_reg reglen = len;
- __asm__ volatile(
- "add %0 , %0 \n\t"
- "lea (%2,%0,2) , %2 \n\t"
- "add %0 , %1 \n\t"
- "neg %0 \n\t"
- "1: \n\t"
- "cvtps2pi (%2,%0,2) , %%mm0 \n\t"
- "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t"
- "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t"
- "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t"
- "packssdw %%mm1 , %%mm0 \n\t"
- "packssdw %%mm3 , %%mm2 \n\t"
- "movq %%mm0 , (%1,%0) \n\t"
- "movq %%mm2 , 8(%1,%0) \n\t"
- "add $16 , %0 \n\t"
- " js 1b \n\t"
- "emms \n\t"
- :"+r"(reglen), "+r"(dst), "+r"(src)
- );
-}
-
-static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
- x86_reg reglen = len;
- __asm__ volatile(
- "add %0 , %0 \n\t"
- "lea (%2,%0,2) , %2 \n\t"
- "add %0 , %1 \n\t"
- "neg %0 \n\t"
- "1: \n\t"
- "cvtps2dq (%2,%0,2) , %%xmm0 \n\t"
- "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t"
- "packssdw %%xmm1 , %%xmm0 \n\t"
- "movdqa %%xmm0 , (%1,%0) \n\t"
- "add $16 , %0 \n\t"
- " js 1b \n\t"
- :"+r"(reglen), "+r"(dst), "+r"(src)
- );
-}
-
void ff_vp3_idct_mmx(int16_t *input_data);
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
@@ -2504,9 +2396,6 @@ void ff_vp3_idct_sse2(int16_t *input_data);
void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
-void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
-void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
-void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift);
int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
@@ -2516,102 +2405,6 @@ void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
-#if !HAVE_YASM
-#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6)
-#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
-#define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
-#endif
-#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
-
-#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
-/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
-static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
- DECLARE_ALIGNED(16, int16_t, tmp)[len];\
- int i,j,c;\
- for(c=0; c<channels; c++){\
- float_to_int16_##cpu(tmp, src[c], len);\
- for(i=0, j=c; i<len; i++, j+=channels)\
- dst[j] = tmp[i];\
- }\
-}\
-\
-static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
- if(channels==1)\
- float_to_int16_##cpu(dst, src[0], len);\
- else if(channels==2){\
- x86_reg reglen = len; \
- const float *src0 = src[0];\
- const float *src1 = src[1];\
- __asm__ volatile(\
- "shl $2, %0 \n"\
- "add %0, %1 \n"\
- "add %0, %2 \n"\
- "add %0, %3 \n"\
- "neg %0 \n"\
- body\
- :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
- );\
- }else if(channels==6){\
- ff_float_to_int16_interleave6_##cpu(dst, src, len);\
- }else\
- float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
-}
-
-FLOAT_TO_INT16_INTERLEAVE(3dnow,
- "1: \n"
- "pf2id (%2,%0), %%mm0 \n"
- "pf2id 8(%2,%0), %%mm1 \n"
- "pf2id (%3,%0), %%mm2 \n"
- "pf2id 8(%3,%0), %%mm3 \n"
- "packssdw %%mm1, %%mm0 \n"
- "packssdw %%mm3, %%mm2 \n"
- "movq %%mm0, %%mm1 \n"
- "punpcklwd %%mm2, %%mm0 \n"
- "punpckhwd %%mm2, %%mm1 \n"
- "movq %%mm0, (%1,%0)\n"
- "movq %%mm1, 8(%1,%0)\n"
- "add $16, %0 \n"
- "js 1b \n"
- "femms \n"
-)
-
-FLOAT_TO_INT16_INTERLEAVE(sse,
- "1: \n"
- "cvtps2pi (%2,%0), %%mm0 \n"
- "cvtps2pi 8(%2,%0), %%mm1 \n"
- "cvtps2pi (%3,%0), %%mm2 \n"
- "cvtps2pi 8(%3,%0), %%mm3 \n"
- "packssdw %%mm1, %%mm0 \n"
- "packssdw %%mm3, %%mm2 \n"
- "movq %%mm0, %%mm1 \n"
- "punpcklwd %%mm2, %%mm0 \n"
- "punpckhwd %%mm2, %%mm1 \n"
- "movq %%mm0, (%1,%0)\n"
- "movq %%mm1, 8(%1,%0)\n"
- "add $16, %0 \n"
- "js 1b \n"
- "emms \n"
-)
-
-FLOAT_TO_INT16_INTERLEAVE(sse2,
- "1: \n"
- "cvtps2dq (%2,%0), %%xmm0 \n"
- "cvtps2dq (%3,%0), %%xmm1 \n"
- "packssdw %%xmm1, %%xmm0 \n"
- "movhlps %%xmm0, %%xmm1 \n"
- "punpcklwd %%xmm1, %%xmm0 \n"
- "movdqa %%xmm0, (%1,%0) \n"
- "add $16, %0 \n"
- "js 1b \n"
-)
-
-static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
- if(channels==6)
- ff_float_to_int16_interleave6_3dn2(dst, src, len);
- else
- float_to_int16_interleave_3dnow(dst, src, len, channels);
-}
-
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
@@ -2968,19 +2761,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
if(mm_flags & AV_CPU_FLAG_3DNOW){
c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
c->vector_fmul = vector_fmul_3dnow;
- if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
- c->float_to_int16 = float_to_int16_3dnow;
- c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
- }
}
if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
#if HAVE_6REGS
c->vector_fmul_window = vector_fmul_window_3dnow2;
#endif
- if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
- c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
- }
}
if(mm_flags & AV_CPU_FLAG_MMX2){
#if HAVE_YASM
@@ -2997,10 +2783,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
#if HAVE_6REGS
c->vector_fmul_window = vector_fmul_window_sse;
#endif
- c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
c->vector_clipf = vector_clipf_sse;
- c->float_to_int16 = float_to_int16_sse;
- c->float_to_int16_interleave = float_to_int16_interleave_sse;
#if HAVE_YASM
c->scalarproduct_float = ff_scalarproduct_float_sse;
#endif
@@ -3008,9 +2791,6 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
if(mm_flags & AV_CPU_FLAG_3DNOW)
c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
if(mm_flags & AV_CPU_FLAG_SSE2){
- c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
- c->float_to_int16 = float_to_int16_sse2;
- c->float_to_int16_interleave = float_to_int16_interleave_sse2;
#if HAVE_YASM
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 099f0a80df..b1b37e1fb9 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -30,75 +30,6 @@ pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
section .text align=16
-%macro PSWAPD_SSE 2
- pshufw %1, %2, 0x4e
-%endmacro
-%macro PSWAPD_3DN1 2
- movq %1, %2
- psrlq %1, 32
- punpckldq %1, %2
-%endmacro
-
-%macro FLOAT_TO_INT16_INTERLEAVE6 1
-; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
-cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
-%ifdef ARCH_X86_64
- %define lend r10d
- mov lend, r2d
-%else
- %define lend dword r2m
-%endif
- mov src1q, [srcq+1*gprsize]
- mov src2q, [srcq+2*gprsize]
- mov src3q, [srcq+3*gprsize]
- mov src4q, [srcq+4*gprsize]
- mov src5q, [srcq+5*gprsize]
- mov srcq, [srcq]
- sub src1q, srcq
- sub src2q, srcq
- sub src3q, srcq
- sub src4q, srcq
- sub src5q, srcq
-.loop:
- cvtps2pi mm0, [srcq]
- cvtps2pi mm1, [srcq+src1q]
- cvtps2pi mm2, [srcq+src2q]
- cvtps2pi mm3, [srcq+src3q]
- cvtps2pi mm4, [srcq+src4q]
- cvtps2pi mm5, [srcq+src5q]
- packssdw mm0, mm3
- packssdw mm1, mm4
- packssdw mm2, mm5
- pswapd mm3, mm0
- punpcklwd mm0, mm1
- punpckhwd mm1, mm2
- punpcklwd mm2, mm3
- pswapd mm3, mm0
- punpckldq mm0, mm2
- punpckhdq mm2, mm1
- punpckldq mm1, mm3
- movq [dstq ], mm0
- movq [dstq+16], mm2
- movq [dstq+ 8], mm1
- add srcq, 8
- add dstq, 24
- sub lend, 2
- jg .loop
- emms
- RET
-%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
-
-%define pswapd PSWAPD_SSE
-FLOAT_TO_INT16_INTERLEAVE6 sse
-%define cvtps2pi pf2id
-%define pswapd PSWAPD_3DN1
-FLOAT_TO_INT16_INTERLEAVE6 3dnow
-%undef pswapd
-FLOAT_TO_INT16_INTERLEAVE6 3dn2
-%undef cvtps2pi
-
-
-
%macro SCALARPRODUCT 1
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
new file mode 100644
index 0000000000..6c744fc581
--- /dev/null
+++ b/libavcodec/x86/fmtconvert.asm
@@ -0,0 +1,91 @@
+;******************************************************************************
+;* x86 optimized Format Conversion Utils
+;* Copyright (c) 2008 Loren Merritt
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "x86inc.asm"
+
+section .text align=16
+
+%macro PSWAPD_SSE 2
+ pshufw %1, %2, 0x4e ; %1 = %2 with its two 32-bit halves swapped (word order 2,3,0,1)
+%endmacro
+%macro PSWAPD_3DN1 2
+ movq %1, %2 ; same swap built from plain MMX ops; needs %1 and %2 to be different registers
+ psrlq %1, 32 ; high dword of %2 is now the low dword of %1
+ punpckldq %1, %2 ; low dword of %2 becomes the high dword of %1
+%endmacro
+
+%macro FLOAT_TO_INT16_INTERLEAVE6 1
+; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len) -- %1 is the symbol suffix (sse here is only the example); converts six planar float channels to interleaved int16, two samples per channel per loop pass
+cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
+%ifdef ARCH_X86_64
+ %define lend r10d
+ mov lend, r2d ; keep len in r10d; presumably because r2 is reused as src1 below -- confirm against x86inc register naming
+%else
+ %define lend dword r2m
+%endif
+ mov src1q, [srcq+1*gprsize] ; load the pointers of channels 1..5 from the src[] array
+ mov src2q, [srcq+2*gprsize]
+ mov src3q, [srcq+3*gprsize]
+ mov src4q, [srcq+4*gprsize]
+ mov src5q, [srcq+5*gprsize]
+ mov srcq, [srcq] ; srcq now points at channel 0 data
+ sub src1q, srcq ; turn channel pointers into offsets relative to channel 0
+ sub src2q, srcq
+ sub src3q, srcq
+ sub src4q, srcq
+ sub src5q, srcq
+.loop:
+ cvtps2pi mm0, [srcq] ; two floats of each channel -> two int32 (pf2id when redefined by the instantiation)
+ cvtps2pi mm1, [srcq+src1q]
+ cvtps2pi mm2, [srcq+src2q]
+ cvtps2pi mm3, [srcq+src3q]
+ cvtps2pi mm4, [srcq+src4q]
+ cvtps2pi mm5, [srcq+src5q]
+ packssdw mm0, mm3 ; saturate to int16: mm0 = ch0,ch0,ch3,ch3
+ packssdw mm1, mm4 ; mm1 = ch1,ch1,ch4,ch4
+ packssdw mm2, mm5 ; mm2 = ch2,ch2,ch5,ch5
+ pswapd mm3, mm0 ; shuffle sequence below reorders the 12 samples into channel-interleaved order
+ punpcklwd mm0, mm1
+ punpckhwd mm1, mm2
+ punpcklwd mm2, mm3
+ pswapd mm3, mm0
+ punpckldq mm0, mm2
+ punpckhdq mm2, mm1
+ punpckldq mm1, mm3
+ movq [dstq ], mm0 ; 24 output bytes = 2 frames of 6 int16
+ movq [dstq+16], mm2
+ movq [dstq+ 8], mm1
+ add srcq, 8 ; advance two floats in every channel (offsets are relative to srcq)
+ add dstq, 24
+ sub lend, 2 ; two samples per channel consumed
+ jg .loop ; body runs at least once and len is consumed in steps of 2
+ emms ; leave MMX state before returning
+ RET
+%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
+
+%define pswapd PSWAPD_SSE
+FLOAT_TO_INT16_INTERLEAVE6 sse ; native cvtps2pi, pswapd emulated with pshufw
+%define cvtps2pi pf2id
+%define pswapd PSWAPD_3DN1
+FLOAT_TO_INT16_INTERLEAVE6 3dnow ; pf2id conversion, pswapd emulated with movq/psrlq/punpckldq
+%undef pswapd
+FLOAT_TO_INT16_INTERLEAVE6 3dn2 ; pf2id conversion, pswapd assembled as the real instruction
+%undef cvtps2pi
diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c
new file mode 100644
index 0000000000..ea41f730e8
--- /dev/null
+++ b/libavcodec/x86/fmtconvert_mmx.c
@@ -0,0 +1,266 @@
+/*
+ * Format Conversion Utils
+ * Copyright (c) 2000, 2001 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86_cpu.h"
+#include "libavcodec/fmtconvert.h"
+
+static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len) /* dst[k] = (float)src[k] * mul, 8 elements per loop pass */
+{
+ x86_reg i = -4*len; /* negative byte offset from the end pointers passed below; counts up to 0 */
+ __asm__ volatile(
+ "movss %3, %%xmm4 \n" /* load mul ... */
+ "shufps $0, %%xmm4, %%xmm4 \n" /* ... and broadcast it to all four lanes */
+ "1: \n"
+ "cvtpi2ps (%2,%0), %%xmm0 \n" /* each cvtpi2ps converts two int32 into the low half of an xmm register */
+ "cvtpi2ps 8(%2,%0), %%xmm1 \n"
+ "cvtpi2ps 16(%2,%0), %%xmm2 \n"
+ "cvtpi2ps 24(%2,%0), %%xmm3 \n"
+ "movlhps %%xmm1, %%xmm0 \n" /* merge two half-filled registers into one vector of four floats */
+ "movlhps %%xmm3, %%xmm2 \n"
+ "mulps %%xmm4, %%xmm0 \n"
+ "mulps %%xmm4, %%xmm2 \n"
+ "movaps %%xmm0, (%1,%0) \n" /* aligned stores: dst must be 16-byte aligned */
+ "movaps %%xmm2, 16(%1,%0) \n"
+ "add $32, %0 \n" /* 8 elements * 4 bytes */
+ "jl 1b \n" /* loop while the offset is still negative; body runs at least once, so len is effectively rounded up to a multiple of 8 */
+ :"+r"(i)
+ :"r"(dst+len), "r"(src+len), "m"(mul) /* NOTE(review): no clobber list; xmm0-xmm4 and the written memory are not declared */
+ );
+}
+
+static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len) /* SSE2 variant: dst[k] = (float)src[k] * mul, 8 elements per loop pass */
+{
+ x86_reg i = -4*len; /* negative byte offset from the end pointers passed below; counts up to 0 */
+ __asm__ volatile(
+ "movss %3, %%xmm4 \n" /* load mul ... */
+ "shufps $0, %%xmm4, %%xmm4 \n" /* ... and broadcast it to all four lanes */
+ "1: \n"
+ "cvtdq2ps (%2,%0), %%xmm0 \n" /* converts four int32 at once; memory operand, so src is presumably expected to be 16-byte aligned -- confirm */
+ "cvtdq2ps 16(%2,%0), %%xmm1 \n"
+ "mulps %%xmm4, %%xmm0 \n"
+ "mulps %%xmm4, %%xmm1 \n"
+ "movaps %%xmm0, (%1,%0) \n" /* aligned stores: dst must be 16-byte aligned */
+ "movaps %%xmm1, 16(%1,%0) \n"
+ "add $32, %0 \n" /* 8 elements * 4 bytes */
+ "jl 1b \n" /* loop while the offset is still negative; body runs at least once */
+ :"+r"(i)
+ :"r"(dst+len), "r"(src+len), "m"(mul) /* NOTE(review): no clobber list; xmm0, xmm1, xmm4 and the written memory are not declared */
+ );
+}
+
+static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){ /* converts len floats to saturated int16, 8 samples per loop pass */
+ x86_reg reglen = len;
+ // not bit-exact: pf2id uses different rounding than C and SSE
+ __asm__ volatile(
+ "add %0 , %0 \n\t" /* reglen = 2*len = size of the output in bytes */
+ "lea (%2,%0,2) , %2 \n\t" /* src += 4*len bytes, i.e. end of the input */
+ "add %0 , %1 \n\t" /* dst += 2*len bytes, i.e. end of the output */
+ "neg %0 \n\t" /* index both buffers with a negative offset that counts up to 0 */
+ "1: \n\t"
+ "pf2id (%2,%0,2) , %%mm0 \n\t" /* each pf2id converts two floats to two int32 */
+ "pf2id 8(%2,%0,2) , %%mm1 \n\t"
+ "pf2id 16(%2,%0,2) , %%mm2 \n\t"
+ "pf2id 24(%2,%0,2) , %%mm3 \n\t"
+ "packssdw %%mm1 , %%mm0 \n\t" /* pack 4 int32 into 4 int16 with signed saturation */
+ "packssdw %%mm3 , %%mm2 \n\t"
+ "movq %%mm0 , (%1,%0) \n\t"
+ "movq %%mm2 , 8(%1,%0) \n\t"
+ "add $16 , %0 \n\t" /* 8 samples * 2 output bytes */
+ " js 1b \n\t" /* loop while the offset is negative; body runs at least once */
+ "femms \n\t" /* leave MMX state (3DNow! fast variant of emms) */
+ :"+r"(reglen), "+r"(dst), "+r"(src) /* NOTE(review): no clobber list; mm0-mm3 and the written memory are not declared */
+ );
+}
+
+static void float_to_int16_sse(int16_t *dst, const float *src, long len){ /* converts len floats to saturated int16, 8 samples per loop pass */
+ x86_reg reglen = len;
+ __asm__ volatile(
+ "add %0 , %0 \n\t" /* reglen = 2*len = size of the output in bytes */
+ "lea (%2,%0,2) , %2 \n\t" /* src += 4*len bytes, i.e. end of the input */
+ "add %0 , %1 \n\t" /* dst += 2*len bytes, i.e. end of the output */
+ "neg %0 \n\t" /* index both buffers with a negative offset that counts up to 0 */
+ "1: \n\t"
+ "cvtps2pi (%2,%0,2) , %%mm0 \n\t" /* each cvtps2pi converts two floats to two int32 in an MMX register */
+ "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t"
+ "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t"
+ "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t"
+ "packssdw %%mm1 , %%mm0 \n\t" /* pack 4 int32 into 4 int16 with signed saturation */
+ "packssdw %%mm3 , %%mm2 \n\t"
+ "movq %%mm0 , (%1,%0) \n\t"
+ "movq %%mm2 , 8(%1,%0) \n\t"
+ "add $16 , %0 \n\t" /* 8 samples * 2 output bytes */
+ " js 1b \n\t" /* loop while the offset is negative; body runs at least once */
+ "emms \n\t" /* leave MMX state */
+ :"+r"(reglen), "+r"(dst), "+r"(src) /* NOTE(review): no clobber list; mm0-mm3 and the written memory are not declared */
+ );
+}
+
+static void float_to_int16_sse2(int16_t *dst, const float *src, long len){ /* converts len floats to saturated int16, 8 samples per loop pass, XMM registers only */
+ x86_reg reglen = len;
+ __asm__ volatile(
+ "add %0 , %0 \n\t" /* reglen = 2*len = size of the output in bytes */
+ "lea (%2,%0,2) , %2 \n\t" /* src += 4*len bytes, i.e. end of the input */
+ "add %0 , %1 \n\t" /* dst += 2*len bytes, i.e. end of the output */
+ "neg %0 \n\t" /* index both buffers with a negative offset that counts up to 0 */
+ "1: \n\t"
+ "cvtps2dq (%2,%0,2) , %%xmm0 \n\t" /* four floats -> four int32 per instruction */
+ "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t"
+ "packssdw %%xmm1 , %%xmm0 \n\t" /* 8 int32 -> 8 int16 with signed saturation */
+ "movdqa %%xmm0 , (%1,%0) \n\t" /* aligned store: dst must be 16-byte aligned */
+ "add $16 , %0 \n\t" /* 8 samples * 2 output bytes */
+ " js 1b \n\t" /* loop while the offset is negative; body runs at least once */
+ :"+r"(reglen), "+r"(dst), "+r"(src) /* NOTE(review): no clobber list; xmm0, xmm1 and the written memory are not declared */
+ );
+}
+
+void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
+void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
+void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
+
+#if !HAVE_YASM
+#define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6)
+#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
+#define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
+#endif
+#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
+
+#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) /* generates float_to_int16_interleave_misc_<cpu> and float_to_int16_interleave_<cpu>; body is the asm loop used for the 2-channel case */ \
+/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
+static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){ /* generic path for any channel count */\
+ DECLARE_ALIGNED(16, int16_t, tmp)[len]; /* variable-length scratch array holding one converted channel */\
+ int i,j,c;\
+ for(c=0; c<channels; c++){\
+ float_to_int16_##cpu(tmp, src[c], len); /* convert channel c as a contiguous run */\
+ for(i=0, j=c; i<len; i++, j+=channels) /* scatter it into dst with a stride of channels */\
+ dst[j] = tmp[i];\
+ }\
+}\
+\
+static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){ /* planar float -> interleaved int16, dispatching on channel count */\
+ if(channels==1)\
+ float_to_int16_##cpu(dst, src[0], len); /* mono: nothing to interleave */\
+ else if(channels==2){\
+ x86_reg reglen = len; \
+ const float *src0 = src[0];\
+ const float *src1 = src[1];\
+ __asm__ volatile(\
+ "shl $2, %0 \n" /* reglen = 4*len bytes: one float per input channel, one int16 pair in dst */\
+ "add %0, %1 \n" /* move dst, src0 and src1 to their ends ... */\
+ "add %0, %2 \n"\
+ "add %0, %3 \n"\
+ "neg %0 \n" /* ... and index them with a negative offset that counts up to 0 */\
+ body\
+ :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1) /* NOTE(review): no clobber list declared */\
+ );\
+ }else if(channels==6){\
+ ff_float_to_int16_interleave6_##cpu(dst, src, len); /* external routine, or the misc fallback via the #defines above when !HAVE_YASM */\
+ }else\
+ float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
+}
+
+FLOAT_TO_INT16_INTERLEAVE(3dnow, /* stereo loop body: 4 frames per pass */
+ "1: \n"
+ "pf2id (%2,%0), %%mm0 \n" /* %2 = channel 0: four floats -> int32 in mm0, mm1 */
+ "pf2id 8(%2,%0), %%mm1 \n"
+ "pf2id (%3,%0), %%mm2 \n" /* %3 = channel 1: four floats -> int32 in mm2, mm3 */
+ "pf2id 8(%3,%0), %%mm3 \n"
+ "packssdw %%mm1, %%mm0 \n" /* mm0 = 4 saturated int16 of channel 0 */
+ "packssdw %%mm3, %%mm2 \n" /* mm2 = 4 saturated int16 of channel 1 */
+ "movq %%mm0, %%mm1 \n"
+ "punpcklwd %%mm2, %%mm0 \n" /* interleave the low two samples of both channels */
+ "punpckhwd %%mm2, %%mm1 \n" /* interleave the high two samples of both channels */
+ "movq %%mm0, (%1,%0)\n"
+ "movq %%mm1, 8(%1,%0)\n"
+ "add $16, %0 \n" /* 16 bytes = 4 floats per channel = 4 stereo int16 frames */
+ "js 1b \n" /* loop while the offset is negative */
+ "femms \n" /* leave MMX state */
+)
+
+FLOAT_TO_INT16_INTERLEAVE(sse, /* stereo loop body: 4 frames per pass */
+ "1: \n"
+ "cvtps2pi (%2,%0), %%mm0 \n" /* %2 = channel 0: four floats -> int32 in mm0, mm1 */
+ "cvtps2pi 8(%2,%0), %%mm1 \n"
+ "cvtps2pi (%3,%0), %%mm2 \n" /* %3 = channel 1: four floats -> int32 in mm2, mm3 */
+ "cvtps2pi 8(%3,%0), %%mm3 \n"
+ "packssdw %%mm1, %%mm0 \n" /* mm0 = 4 saturated int16 of channel 0 */
+ "packssdw %%mm3, %%mm2 \n" /* mm2 = 4 saturated int16 of channel 1 */
+ "movq %%mm0, %%mm1 \n"
+ "punpcklwd %%mm2, %%mm0 \n" /* interleave the low two samples of both channels */
+ "punpckhwd %%mm2, %%mm1 \n" /* interleave the high two samples of both channels */
+ "movq %%mm0, (%1,%0)\n"
+ "movq %%mm1, 8(%1,%0)\n"
+ "add $16, %0 \n" /* 16 bytes = 4 floats per channel = 4 stereo int16 frames */
+ "js 1b \n" /* loop while the offset is negative */
+ "emms \n" /* leave MMX state */
+)
+
+FLOAT_TO_INT16_INTERLEAVE(sse2, /* stereo loop body: 4 frames per pass, XMM registers only */
+ "1: \n"
+ "cvtps2dq (%2,%0), %%xmm0 \n" /* %2 = channel 0: four floats -> four int32 */
+ "cvtps2dq (%3,%0), %%xmm1 \n" /* %3 = channel 1: four floats -> four int32 */
+ "packssdw %%xmm1, %%xmm0 \n" /* xmm0 = 4 int16 of channel 0 (low half), 4 int16 of channel 1 (high half) */
+ "movhlps %%xmm0, %%xmm1 \n" /* copy the channel 1 half down into the low half of xmm1 */
+ "punpcklwd %%xmm1, %%xmm0 \n" /* interleave: ch0,ch1,ch0,ch1,... */
+ "movdqa %%xmm0, (%1,%0) \n" /* aligned store: dst must be 16-byte aligned */
+ "add $16, %0 \n" /* 16 bytes = 4 floats per channel = 4 stereo int16 frames */
+ "js 1b \n" /* loop while the offset is negative */
+)
+
+static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){ /* 3DNow!ext front end: only the 6-channel case differs from plain 3DNow! */
+ if(channels==6)
+ ff_float_to_int16_interleave6_3dn2(dst, src, len); /* prototype takes int len, so the long is narrowed here */
+ else
+ float_to_int16_interleave_3dnow(dst, src, len, channels); /* every other channel count reuses the 3dnow implementation */
+}
+
+void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx) /* installs x86 SIMD conversion routines into c according to the detected CPU flags */
+{
+ int mm_flags = av_get_cpu_flags();
+
+ if (mm_flags & AV_CPU_FLAG_MMX) { /* nothing is installed on CPUs without MMX */
+
+ if(mm_flags & AV_CPU_FLAG_3DNOW){
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ /* 3DNow! versions are skipped when bit-exact output is requested */
+ c->float_to_int16 = float_to_int16_3dnow;
+ c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
+ }
+ }
+ if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+ c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
+ }
+ }
+ if(mm_flags & AV_CPU_FLAG_SSE){ /* later blocks overwrite earlier ones, so SSE takes priority over 3DNow!; not gated by BITEXACT */
+ c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
+ c->float_to_int16 = float_to_int16_sse;
+ c->float_to_int16_interleave = float_to_int16_interleave_sse;
+ }
+ if(mm_flags & AV_CPU_FLAG_SSE2){ /* highest priority: overwrites the SSE assignments */
+ c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
+ c->float_to_int16 = float_to_int16_sse2;
+ c->float_to_int16_interleave = float_to_int16_interleave_sse2;
+ }
+ }
+}