summaryrefslogtreecommitdiff
path: root/libavcodec
diff options
context:
space:
mode:
author: Justin Ruggles <justin.ruggles@gmail.com> 2011-03-15 22:29:04 -0400
committer: Justin Ruggles <justin.ruggles@gmail.com> 2011-03-17 16:46:48 -0400
commit: 0f999cfddb0746602288eabddf38679fd25a2ff7 (patch)
tree: 055b877b24d0d1df4ff4feca6d0922174c51938f /libavcodec
parent: 487fef2dcc24d7b4051b4402acf5c619fb082351 (diff)
ac3enc: add float_to_fixed24() with x86-optimized versions to AC3DSPContext
and use in scale_coefficients() for the floating-point AC-3 encoder.
Diffstat (limited to 'libavcodec')
-rw-r--r-- libavcodec/ac3dsp.c 21
-rw-r--r-- libavcodec/ac3dsp.h 17
-rw-r--r-- libavcodec/ac3enc.c 2
-rw-r--r-- libavcodec/ac3enc_float.c 5
-rw-r--r-- libavcodec/x86/ac3dsp.asm 115
-rw-r--r-- libavcodec/x86/ac3dsp_mmx.c 15
6 files changed, 166 insertions, 9 deletions
diff --git a/libavcodec/ac3dsp.c b/libavcodec/ac3dsp.c
index 4d9db9be50..9bfa7300e3 100644
--- a/libavcodec/ac3dsp.c
+++ b/libavcodec/ac3dsp.c
@@ -85,13 +85,30 @@ static void ac3_rshift_int32_c(int32_t *src, unsigned int len,
} while (len > 0);
}
-av_cold void ff_ac3dsp_init(AC3DSPContext *c)
+/**
+ * C implementation of AC3DSPContext.float_to_fixed24.
+ *
+ * Every source float is multiplied by 2^24 and rounded to an integer with
+ * lrintf(). Samples are consumed in groups of 8 and at least one group is
+ * always processed, so len has to be a non-zero multiple of 8.
+ */
+static void float_to_fixed24_c(int32_t *dst, const float *src, unsigned int len)
+{
+    const float scale = 1 << 24;
+    do {
+        int k;
+        /* one group of 8 conversions per pass */
+        for (k = 0; k < 8; k++)
+            dst[k] = lrintf(src[k] * scale);
+        dst += 8;
+        src += 8;
+        len -= 8;
+    } while (len != 0);
+}
+
+/**
+ * Fill an AC3DSPContext with the C implementations, then give the x86 init
+ * a chance to override them when the build has HAVE_MMX.
+ *
+ * @param c         context whose function pointers are set
+ * @param bit_exact passed through unchanged to ff_ac3dsp_init_x86(), which
+ *                  uses it to decide whether non-bit-identical versions
+ *                  may be selected
+ */
+av_cold void ff_ac3dsp_init(AC3DSPContext *c, int bit_exact)
{
c->ac3_exponent_min = ac3_exponent_min_c;
c->ac3_max_msb_abs_int16 = ac3_max_msb_abs_int16_c;
c->ac3_lshift_int16 = ac3_lshift_int16_c;
c->ac3_rshift_int32 = ac3_rshift_int32_c;
+ c->float_to_fixed24 = float_to_fixed24_c;
if (HAVE_MMX)
- ff_ac3dsp_init_x86(c);
+ ff_ac3dsp_init_x86(c, bit_exact);
}
diff --git a/libavcodec/ac3dsp.h b/libavcodec/ac3dsp.h
index 31a0af375d..0a2dedf478 100644
--- a/libavcodec/ac3dsp.h
+++ b/libavcodec/ac3dsp.h
@@ -68,9 +68,22 @@ typedef struct AC3DSPContext {
* constraints: range [0,31]
*/
void (*ac3_rshift_int32)(int32_t *src, unsigned int len, unsigned int shift);
+
+ /**
+ * Convert an array of float in range [-1.0,1.0] to int32_t with range
+ * [-(1<<24),(1<<24)]
+ *
+ * @param dst destination array of int32_t.
+ * constraints: 16-byte aligned
+ * @param src source array of float.
+ * constraints: 16-byte aligned
+ * @param len number of elements to convert.
+ * constraints: multiple of 32 greater than zero
+ */
+ void (*float_to_fixed24)(int32_t *dst, const float *src, unsigned int len);
} AC3DSPContext;
-void ff_ac3dsp_init (AC3DSPContext *c);
-void ff_ac3dsp_init_x86(AC3DSPContext *c);
+void ff_ac3dsp_init (AC3DSPContext *c, int bit_exact);
+void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact);
#endif /* AVCODEC_AC3DSP_H */
diff --git a/libavcodec/ac3enc.c b/libavcodec/ac3enc.c
index 4c01fe3cbd..5b76ae6735 100644
--- a/libavcodec/ac3enc.c
+++ b/libavcodec/ac3enc.c
@@ -1843,7 +1843,7 @@ static av_cold int ac3_encode_init(AVCodecContext *avctx)
avctx->coded_frame= avcodec_alloc_frame();
dsputil_init(&s->dsp, avctx);
- ff_ac3dsp_init(&s->ac3dsp);
+ ff_ac3dsp_init(&s->ac3dsp, avctx->flags & CODEC_FLAG_BITEXACT);
return 0;
init_fail:
diff --git a/libavcodec/ac3enc_float.c b/libavcodec/ac3enc_float.c
index 8668b2e033..4b13e4c723 100644
--- a/libavcodec/ac3enc_float.c
+++ b/libavcodec/ac3enc_float.c
@@ -103,9 +103,8 @@ static int normalize_samples(AC3EncodeContext *s)
*/
static void scale_coefficients(AC3EncodeContext *s)
{
- int i;
- for (i = 0; i < AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels; i++)
- s->fixed_coef_buffer[i] = SCALE_FLOAT(s->mdct_coef_buffer[i], 24);
+ /* Convert the whole MDCT coefficient buffer to 24-bit fixed point with a
+  * single call through the AC3DSPContext function pointer.
+  * NOTE(review): float_to_fixed24 documents 16-byte aligned buffers and a
+  * length that is a non-zero multiple of 32 - confirm that both buffers
+  * and this element count satisfy that. */
+ s->ac3dsp.float_to_fixed24(s->fixed_coef_buffer, s->mdct_coef_buffer,
+ AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels);
}
diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
index e281791b1e..8b7e826a2d 100644
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@@ -22,6 +22,11 @@
%include "x86inc.asm"
%include "x86util.asm"
+SECTION_RODATA
+
+; 16777216.0f - used in ff_float_to_fixed24()
+pf_1_24: times 4 dd 0x4B800000
+
SECTION .text
;-----------------------------------------------------------------------------
@@ -178,3 +183,113 @@ INIT_MMX
AC3_SHIFT r, 32, psrad, mmx
INIT_XMM
AC3_SHIFT r, 32, psrad, sse2
+
+;-----------------------------------------------------------------------------
+; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
+;-----------------------------------------------------------------------------
+
+; The 3DNow! version is not bit-identical because pf2id uses truncation rather
+; than round-to-nearest.
+; Converts 8 floats per iteration: multiply by 2^24 (pfmul), then convert to
+; int32 (pf2id). len must be a non-zero multiple of 8.
+INIT_MMX
+cglobal float_to_fixed24_3dnow, 3,3,0, dst, src, len
+    movq        m0, [pf_1_24]       ; m0 = { 2^24, 2^24 }
+.loop:
+    movq        m1, [srcq   ]       ; two floats per MMX register
+    movq        m2, [srcq+8 ]
+    movq        m3, [srcq+16]
+    movq        m4, [srcq+24]
+    pfmul       m1, m0
+    pfmul       m2, m0
+    pfmul       m3, m0
+    pfmul       m4, m0
+    pf2id       m1, m1
+    pf2id       m2, m2
+    pf2id       m3, m3
+    pf2id       m4, m4
+    movq  [dstq   ], m1
+    movq  [dstq+8 ], m2
+    movq  [dstq+16], m3
+    movq  [dstq+24], m4
+    add       srcq, 32
+    add       dstq, 32
+    sub       lend, 8               ; len is a 32-bit unsigned int
+    ja .loop
+    ; The MMX registers alias the x87 register stack. Clear the MMX state
+    ; before returning so that floating-point code in the caller is not
+    ; left with a corrupted x87 tag word.
+    femms
+    RET
+
+; Converts 8 floats per iteration. SSE1 has no packed float->int32 conversion
+; into XMM registers, so each group of two floats goes through cvtps2pi into
+; an MMX register. len must be a non-zero multiple of 8.
+INIT_XMM
+cglobal float_to_fixed24_sse, 3,3,3, dst, src, len
+    movaps      m0, [pf_1_24]       ; m0 = 2^24 in all four lanes
+.loop:
+    movaps      m1, [srcq   ]
+    movaps      m2, [srcq+16]
+    mulps       m1, m0
+    mulps       m2, m0
+    cvtps2pi   mm0, m1              ; low two floats of m1
+    movhlps     m1, m1              ; bring the high two floats down
+    cvtps2pi   mm1, m1
+    cvtps2pi   mm2, m2              ; same for m2
+    movhlps     m2, m2
+    cvtps2pi   mm3, m2
+    movq  [dstq   ], mm0
+    movq  [dstq+ 8], mm1
+    movq  [dstq+16], mm2
+    movq  [dstq+24], mm3
+    add       srcq, 32
+    add       dstq, 32
+    sub       lend, 8               ; len is a 32-bit unsigned int
+    ja .loop
+    ; cvtps2pi wrote mm0-mm3, which alias the x87 register stack. Clear the
+    ; MMX state before returning so that floating-point code in the caller
+    ; is not left with a corrupted x87 tag word.
+    emms
+    RET
+
+; Converts 16 floats per iteration, or 32 when m8 is available (builds with
+; 16 XMM registers). Uses only XMM registers, so no emms is required.
+; len must be a non-zero multiple of the per-iteration count.
+INIT_XMM
+cglobal float_to_fixed24_sse2, 3,3,9, dst, src, len
+    movaps      m0, [pf_1_24]       ; m0 = 2^24 in all four lanes
+.loop:
+    movaps      m1, [srcq    ]
+    movaps      m2, [srcq+16 ]
+    movaps      m3, [srcq+32 ]
+    movaps      m4, [srcq+48 ]
+%ifdef m8
+    movaps      m5, [srcq+64 ]
+    movaps      m6, [srcq+80 ]
+    movaps      m7, [srcq+96 ]
+    movaps      m8, [srcq+112]
+%endif
+    mulps       m1, m0
+    mulps       m2, m0
+    mulps       m3, m0
+    mulps       m4, m0
+%ifdef m8
+    mulps       m5, m0
+    mulps       m6, m0
+    mulps       m7, m0
+    mulps       m8, m0
+%endif
+    cvtps2dq    m1, m1
+    cvtps2dq    m2, m2
+    cvtps2dq    m3, m3
+    cvtps2dq    m4, m4
+%ifdef m8
+    cvtps2dq    m5, m5
+    cvtps2dq    m6, m6
+    cvtps2dq    m7, m7
+    cvtps2dq    m8, m8
+%endif
+    movdqa  [dstq    ], m1
+    movdqa  [dstq+16 ], m2
+    movdqa  [dstq+32 ], m3
+    movdqa  [dstq+48 ], m4
+%ifdef m8
+    movdqa  [dstq+64 ], m5
+    movdqa  [dstq+80 ], m6
+    movdqa  [dstq+96 ], m7
+    movdqa  [dstq+112], m8
+    add       srcq, 128
+    add       dstq, 128
+    ; len is declared as a 32-bit unsigned int, so only the low 32 bits of
+    ; its register are defined on x86-64; count on lend like the
+    ; 3DNow!/SSE versions rather than on the full 64-bit register.
+    sub       lend, 32
+%else
+    add       srcq, 64
+    add       dstq, 64
+    sub       lend, 16
+%endif
+    ja .loop
+    REP_RET
diff --git a/libavcodec/x86/ac3dsp_mmx.c b/libavcodec/x86/ac3dsp_mmx.c
index 835b10696d..97d0657aa6 100644
--- a/libavcodec/x86/ac3dsp_mmx.c
+++ b/libavcodec/x86/ac3dsp_mmx.c
@@ -38,7 +38,11 @@ extern void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned in
extern void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift);
extern void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift);
-av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
+extern void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len);
+extern void ff_float_to_fixed24_sse (int32_t *dst, const float *src, unsigned int len);
+extern void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len);
+
+av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
{
int mm_flags = av_get_cpu_flags();
@@ -49,13 +53,22 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx;
c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx;
}
+ if (mm_flags & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) {
+ if (!bit_exact) {
+ c->float_to_fixed24 = ff_float_to_fixed24_3dnow;
+ }
+ }
if (mm_flags & AV_CPU_FLAG_MMX2 && HAVE_MMX2) {
c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext;
}
+ if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) {
+ c->float_to_fixed24 = ff_float_to_fixed24_sse;
+ }
if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) {
c->ac3_exponent_min = ff_ac3_exponent_min_sse2;
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
+ c->float_to_fixed24 = ff_float_to_fixed24_sse2;
if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2;
c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2;