summaryrefslogtreecommitdiff
path: root/libavcodec/x86
diff options
context:
space:
mode:
authorLynne <dev@lynne.ee>2021-01-09 03:19:18 +0100
committerLynne <dev@lynne.ee>2021-01-14 01:44:20 +0100
commit9e05421dbe0c733dca2a39f8399db86acc7e82bc (patch)
tree246b41e3723eb779091bd535858de1e83eec767f /libavcodec/x86
parent238b2d4155d9779d770fccb3594076bb32742c82 (diff)
ac3enc_fixed: drop unnecessary fixed-point DSP code
Diffstat (limited to 'libavcodec/x86')
-rw-r--r--libavcodec/x86/ac3dsp.asm258
-rw-r--r--libavcodec/x86/ac3dsp_init.c52
2 files changed, 1 insertions, 309 deletions
diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
index 675ade3101..4ddaa94320 100644
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@@ -35,10 +35,6 @@ pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7
cextern pd_1
pd_151: times 4 dd 151
-; used in ff_apply_window_int16()
-pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0
-pd_16384: times 4 dd 16384
-
SECTION .text
;-----------------------------------------------------------------------------
@@ -82,133 +78,6 @@ AC3_EXPONENT_MIN
%undef LOOP_ALIGN
;-----------------------------------------------------------------------------
-; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
-;
-; This function uses 2 different methods to calculate a valid result.
-; 1) logical 'or' of abs of each element
-; This is used for ssse3 because of the pabsw instruction.
-; It is also used for mmx because of the lack of min/max instructions.
-; 2) calculate min/max for the array, then or(abs(min),abs(max))
-; This is used for mmxext and sse2 because they have pminsw/pmaxsw.
-;-----------------------------------------------------------------------------
-
-; logical 'or' of 4 or 8 words in an mmx or xmm register into the low word
-%macro OR_WORDS_HORIZ 2 ; src, tmp
-%if cpuflag(sse2)
- movhlps %2, %1
- por %1, %2
- pshuflw %2, %1, q0032
- por %1, %2
- pshuflw %2, %1, q0001
- por %1, %2
-%elif cpuflag(mmxext)
- pshufw %2, %1, q0032
- por %1, %2
- pshufw %2, %1, q0001
- por %1, %2
-%else ; mmx
- movq %2, %1
- psrlq %2, 32
- por %1, %2
- movq %2, %1
- psrlq %2, 16
- por %1, %2
-%endif
-%endmacro
-
-%macro AC3_MAX_MSB_ABS_INT16 1
-cglobal ac3_max_msb_abs_int16, 2,2,5, src, len
- pxor m2, m2
- pxor m3, m3
-.loop:
-%ifidn %1, min_max
- mova m0, [srcq]
- mova m1, [srcq+mmsize]
- pminsw m2, m0
- pminsw m2, m1
- pmaxsw m3, m0
- pmaxsw m3, m1
-%else ; or_abs
-%if notcpuflag(ssse3)
- mova m0, [srcq]
- mova m1, [srcq+mmsize]
- ABS2 m0, m1, m3, m4
-%else ; ssse3
- ; using memory args is faster for ssse3
- pabsw m0, [srcq]
- pabsw m1, [srcq+mmsize]
-%endif
- por m2, m0
- por m2, m1
-%endif
- add srcq, mmsize*2
- sub lend, mmsize
- ja .loop
-%ifidn %1, min_max
- ABS2 m2, m3, m0, m1
- por m2, m3
-%endif
- OR_WORDS_HORIZ m2, m0
- movd eax, m2
- and eax, 0xFFFF
- RET
-%endmacro
-
-INIT_MMX mmx
-AC3_MAX_MSB_ABS_INT16 or_abs
-INIT_MMX mmxext
-AC3_MAX_MSB_ABS_INT16 min_max
-INIT_XMM sse2
-AC3_MAX_MSB_ABS_INT16 min_max
-INIT_XMM ssse3
-AC3_MAX_MSB_ABS_INT16 or_abs
-
-;-----------------------------------------------------------------------------
-; macro used for ff_ac3_lshift_int16() and ff_ac3_rshift_int32()
-;-----------------------------------------------------------------------------
-
-%macro AC3_SHIFT 3 ; l/r, 16/32, shift instruction, instruction set
-cglobal ac3_%1shift_int%2, 3, 3, 5, src, len, shift
- movd m0, shiftd
-.loop:
- mova m1, [srcq ]
- mova m2, [srcq+mmsize ]
- mova m3, [srcq+mmsize*2]
- mova m4, [srcq+mmsize*3]
- %3 m1, m0
- %3 m2, m0
- %3 m3, m0
- %3 m4, m0
- mova [srcq ], m1
- mova [srcq+mmsize ], m2
- mova [srcq+mmsize*2], m3
- mova [srcq+mmsize*3], m4
- add srcq, mmsize*4
- sub lend, mmsize*32/%2
- ja .loop
-.end:
- REP_RET
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void ff_ac3_lshift_int16(int16_t *src, unsigned int len, unsigned int shift)
-;-----------------------------------------------------------------------------
-
-INIT_MMX mmx
-AC3_SHIFT l, 16, psllw
-INIT_XMM sse2
-AC3_SHIFT l, 16, psllw
-
-;-----------------------------------------------------------------------------
-; void ff_ac3_rshift_int32(int32_t *src, unsigned int len, unsigned int shift)
-;-----------------------------------------------------------------------------
-
-INIT_MMX mmx
-AC3_SHIFT r, 32, psrad
-INIT_XMM sse2
-AC3_SHIFT r, 32, psrad
-
-;-----------------------------------------------------------------------------
; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
;-----------------------------------------------------------------------------
@@ -423,130 +292,3 @@ AC3_EXTRACT_EXPONENTS
INIT_XMM ssse3
AC3_EXTRACT_EXPONENTS
%endif
-
-;-----------------------------------------------------------------------------
-; void ff_apply_window_int16(int16_t *output, const int16_t *input,
-; const int16_t *window, unsigned int len)
-;-----------------------------------------------------------------------------
-
-%macro REVERSE_WORDS 1-2
-%if cpuflag(ssse3) && notcpuflag(atom)
- pshufb %1, %2
-%elif cpuflag(sse2)
- pshuflw %1, %1, 0x1B
- pshufhw %1, %1, 0x1B
- pshufd %1, %1, 0x4E
-%elif cpuflag(mmxext)
- pshufw %1, %1, 0x1B
-%endif
-%endmacro
-
-%macro MUL16FIXED 3
-%if cpuflag(ssse3) ; dst, src, unused
-; dst = ((dst * src) + (1<<14)) >> 15
- pmulhrsw %1, %2
-%elif cpuflag(mmxext) ; dst, src, temp
-; dst = (dst * src) >> 15
-; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
-; in from the pmullw result.
- mova %3, %1
- pmulhw %1, %2
- pmullw %3, %2
- psrlw %3, 15
- psllw %1, 1
- por %1, %3
-%endif
-%endmacro
-
-%macro APPLY_WINDOW_INT16 1 ; %1 bitexact version
-%if %1
-cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2
-%else
-cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2
-%endif
- lea offset2q, [offsetq-mmsize]
-%if cpuflag(ssse3) && notcpuflag(atom)
- mova m5, [pb_revwords]
- ALIGN 16
-%elif %1
- mova m5, [pd_16384]
-%endif
-.loop:
-%if cpuflag(ssse3)
- ; This version does the 16x16->16 multiplication in-place without expanding
- ; to 32-bit. The ssse3 version is bit-identical.
- mova m0, [windowq+offset2q]
- mova m1, [ inputq+offset2q]
- pmulhrsw m1, m0
- REVERSE_WORDS m0, m5
- pmulhrsw m0, [ inputq+offsetq ]
- mova [outputq+offset2q], m1
- mova [outputq+offsetq ], m0
-%elif %1
- ; This version expands 16-bit to 32-bit, multiplies by the window,
- ; adds 16384 for rounding, right shifts 15, then repacks back to words to
- ; save to the output. The window is reversed for the second half.
- mova m3, [windowq+offset2q]
- mova m4, [ inputq+offset2q]
- pxor m0, m0
- punpcklwd m0, m3
- punpcklwd m1, m4
- pmaddwd m0, m1
- paddd m0, m5
- psrad m0, 15
- pxor m2, m2
- punpckhwd m2, m3
- punpckhwd m1, m4
- pmaddwd m2, m1
- paddd m2, m5
- psrad m2, 15
- packssdw m0, m2
- mova [outputq+offset2q], m0
- REVERSE_WORDS m3
- mova m4, [ inputq+offsetq]
- pxor m0, m0
- punpcklwd m0, m3
- punpcklwd m1, m4
- pmaddwd m0, m1
- paddd m0, m5
- psrad m0, 15
- pxor m2, m2
- punpckhwd m2, m3
- punpckhwd m1, m4
- pmaddwd m2, m1
- paddd m2, m5
- psrad m2, 15
- packssdw m0, m2
- mova [outputq+offsetq], m0
-%else
- ; This version does the 16x16->16 multiplication in-place without expanding
- ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
- ; therefore are not bit-identical to the C version.
- mova m0, [windowq+offset2q]
- mova m1, [ inputq+offset2q]
- mova m2, [ inputq+offsetq ]
- MUL16FIXED m1, m0, m3
- REVERSE_WORDS m0
- MUL16FIXED m2, m0, m3
- mova [outputq+offset2q], m1
- mova [outputq+offsetq ], m2
-%endif
- add offsetd, mmsize
- sub offset2d, mmsize
- jae .loop
- REP_RET
-%endmacro
-
-INIT_MMX mmxext
-APPLY_WINDOW_INT16 0
-INIT_XMM sse2
-APPLY_WINDOW_INT16 0
-
-INIT_MMX mmxext
-APPLY_WINDOW_INT16 1
-INIT_XMM sse2
-APPLY_WINDOW_INT16 1
-INIT_XMM ssse3
-APPLY_WINDOW_INT16 1
-INIT_XMM ssse3, atom
-APPLY_WINDOW_INT16 1
diff --git a/libavcodec/x86/ac3dsp_init.c b/libavcodec/x86/ac3dsp_init.c
index 2e7e2fb6da..2ae762af46 100644
--- a/libavcodec/x86/ac3dsp_init.c
+++ b/libavcodec/x86/ac3dsp_init.c
@@ -30,17 +30,6 @@ void ff_ac3_exponent_min_mmx (uint8_t *exp, int num_reuse_blocks, int nb_coefs
void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
void ff_ac3_exponent_min_sse2 (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
-int ff_ac3_max_msb_abs_int16_mmx (const int16_t *src, int len);
-int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len);
-int ff_ac3_max_msb_abs_int16_sse2 (const int16_t *src, int len);
-int ff_ac3_max_msb_abs_int16_ssse3(const int16_t *src, int len);
-
-void ff_ac3_lshift_int16_mmx (int16_t *src, unsigned int len, unsigned int shift);
-void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned int shift);
-
-void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift);
-void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift);
-
void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len);
void ff_float_to_fixed24_sse (int32_t *dst, const float *src, unsigned int len);
void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len);
@@ -50,28 +39,12 @@ int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);
void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs);
void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs);
-void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
- const int16_t *window, unsigned int len);
-void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
- const int16_t *window, unsigned int len);
-void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
- const int16_t *window, unsigned int len);
-void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
- const int16_t *window, unsigned int len);
-void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
- const int16_t *window, unsigned int len);
-void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
- const int16_t *window, unsigned int len);
-
av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_MMX(cpu_flags)) {
c->ac3_exponent_min = ff_ac3_exponent_min_mmx;
- c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx;
- c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx;
- c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx;
}
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
if (!bit_exact) {
@@ -80,43 +53,20 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;
- c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext;
- if (bit_exact) {
- c->apply_window_int16 = ff_apply_window_int16_mmxext;
- } else {
- c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
- }
}
if (EXTERNAL_SSE(cpu_flags)) {
c->float_to_fixed24 = ff_float_to_fixed24_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->ac3_exponent_min = ff_ac3_exponent_min_sse2;
- c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
c->float_to_fixed24 = ff_float_to_fixed24_sse2;
c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2;
c->extract_exponents = ff_ac3_extract_exponents_sse2;
- if (bit_exact) {
- c->apply_window_int16 = ff_apply_window_int16_sse2;
- }
- }
-
- if (EXTERNAL_SSE2_FAST(cpu_flags)) {
- c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2;
- c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2;
- if (!bit_exact) {
- c->apply_window_int16 = ff_apply_window_int16_round_sse2;
- }
}
if (EXTERNAL_SSSE3(cpu_flags)) {
- c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;
- if (cpu_flags & AV_CPU_FLAG_ATOM) {
- c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
- } else {
+ if (!(cpu_flags & AV_CPU_FLAG_ATOM))
c->extract_exponents = ff_ac3_extract_exponents_ssse3;
- c->apply_window_int16 = ff_apply_window_int16_ssse3;
- }
}
}