summaryrefslogtreecommitdiff
path: root/libavcodec/aacpsy.c
diff options
context:
space:
mode:
Diffstat (limited to 'libavcodec/aacpsy.c')
-rw-r--r--libavcodec/aacpsy.c217
1 files changed, 151 insertions, 66 deletions
diff --git a/libavcodec/aacpsy.c b/libavcodec/aacpsy.c
index 6cfae6bdbd..a5fec7374e 100644
--- a/libavcodec/aacpsy.c
+++ b/libavcodec/aacpsy.c
@@ -2,20 +2,20 @@
* AAC encoder psychoacoustic model
* Copyright (C) 2008 Konstantin Shishkov
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
@@ -25,6 +25,8 @@
*/
#include "libavutil/attributes.h"
+#include "libavutil/ffmath.h"
+
#include "avcodec.h"
#include "aactab.h"
#include "psymodel.h"
@@ -78,6 +80,8 @@
#define PSY_3GPP_AH_THR_LONG 0.5f
#define PSY_3GPP_AH_THR_SHORT 0.63f
+#define PSY_PE_FORGET_SLOPE 511
+
enum {
PSY_3GPP_AH_NONE,
PSY_3GPP_AH_INACTIVE,
@@ -85,6 +89,7 @@ enum {
};
#define PSY_3GPP_BITS_TO_PE(bits) ((bits) * 1.18f)
+#define PSY_3GPP_PE_TO_BITS(bits) ((bits) / 1.18f)
/* LAME psy model constants */
#define PSY_LAME_FIR_LEN 21 ///< LAME psy model FIR order
@@ -155,6 +160,7 @@ typedef struct AacPsyContext{
} pe;
AacPsyCoeffs psy_coef[2][64];
AacPsyChannel *ch;
+ float global_quality; ///< normalized global quality taken from avctx
}AacPsyContext;
/**
@@ -216,6 +222,10 @@ static const float psy_fir_coeffs[] = {
-5.52212e-17 * 2, -0.313819 * 2
};
+#if ARCH_MIPS
+# include "mips/aacpsy_mips.h"
+#endif /* ARCH_MIPS */
+
/**
* Calculate the ABR attack threshold from the above LAME psymodel table.
*/
@@ -293,17 +303,24 @@ static av_cold int psy_3gpp_init(FFPsyContext *ctx) {
float bark;
int i, j, g, start;
float prev, minscale, minath, minsnr, pe_min;
- const int chan_bitrate = ctx->avctx->bit_rate / ctx->avctx->channels;
- const int bandwidth = ctx->avctx->cutoff ? ctx->avctx->cutoff : ctx->avctx->sample_rate / 2;
+ int chan_bitrate = ctx->avctx->bit_rate / ((ctx->avctx->flags & CODEC_FLAG_QSCALE) ? 2.0f : ctx->avctx->channels);
+
+ const int bandwidth = ctx->cutoff ? ctx->cutoff : AAC_CUTOFF(ctx->avctx);
const float num_bark = calc_bark((float)bandwidth);
ctx->model_priv_data = av_mallocz(sizeof(AacPsyContext));
if (!ctx->model_priv_data)
return AVERROR(ENOMEM);
pctx = (AacPsyContext*) ctx->model_priv_data;
+ pctx->global_quality = (ctx->avctx->global_quality ? ctx->avctx->global_quality : 120) * 0.01f;
+
+ if (ctx->avctx->flags & CODEC_FLAG_QSCALE) {
+ /* Use the target average bitrate to compute spread parameters */
+ chan_bitrate = (int)(chan_bitrate / 120.0 * (ctx->avctx->global_quality ? ctx->avctx->global_quality : 120));
+ }
pctx->chan_bitrate = chan_bitrate;
- pctx->frame_bits = chan_bitrate * AAC_BLOCK_SIZE_LONG / ctx->avctx->sample_rate;
+ pctx->frame_bits = FFMIN(2560, chan_bitrate * AAC_BLOCK_SIZE_LONG / ctx->avctx->sample_rate);
pctx->pe.min = 8.0f * AAC_BLOCK_SIZE_LONG * bandwidth / (ctx->avctx->sample_rate * 2.0f);
pctx->pe.max = 12.0f * AAC_BLOCK_SIZE_LONG * bandwidth / (ctx->avctx->sample_rate * 2.0f);
ctx->bitres.size = 6144 - pctx->frame_bits;
@@ -332,12 +349,12 @@ static av_cold int psy_3gpp_init(FFPsyContext *ctx) {
for (g = 0; g < ctx->num_bands[j] - 1; g++) {
AacPsyCoeffs *coeff = &coeffs[g];
float bark_width = coeffs[g+1].barks - coeffs->barks;
- coeff->spread_low[0] = pow(10.0, -bark_width * PSY_3GPP_THR_SPREAD_LOW);
- coeff->spread_hi [0] = pow(10.0, -bark_width * PSY_3GPP_THR_SPREAD_HI);
- coeff->spread_low[1] = pow(10.0, -bark_width * en_spread_low);
- coeff->spread_hi [1] = pow(10.0, -bark_width * en_spread_hi);
+ coeff->spread_low[0] = ff_exp10(-bark_width * PSY_3GPP_THR_SPREAD_LOW);
+ coeff->spread_hi [0] = ff_exp10(-bark_width * PSY_3GPP_THR_SPREAD_HI);
+ coeff->spread_low[1] = ff_exp10(-bark_width * en_spread_low);
+ coeff->spread_hi [1] = ff_exp10(-bark_width * en_spread_hi);
pe_min = bark_pe * bark_width;
- minsnr = pow(2.0f, pe_min / band_sizes[g]) - 1.5f;
+ minsnr = exp2(pe_min / band_sizes[g]) - 1.5f;
coeff->min_snr = av_clipf(1.0f / minsnr, PSY_SNR_25DB, PSY_SNR_1DB);
}
start = 0;
@@ -350,9 +367,9 @@ static av_cold int psy_3gpp_init(FFPsyContext *ctx) {
}
}
- pctx->ch = av_mallocz(sizeof(AacPsyChannel) * ctx->avctx->channels);
+ pctx->ch = av_mallocz_array(ctx->avctx->channels, sizeof(AacPsyChannel));
if (!pctx->ch) {
- av_freep(&pctx);
+ av_freep(&ctx->model_priv_data);
return AVERROR(ENOMEM);
}
@@ -391,7 +408,7 @@ static av_unused FFPsyWindowInfo psy_3gpp_window(FFPsyContext *ctx,
int channel, int prev_type)
{
int i, j;
- int br = ctx->avctx->bit_rate / ctx->avctx->channels;
+ int br = ((AacPsyContext*)ctx->model_priv_data)->chan_bitrate;
int attack_ratio = br <= 16000 ? 18 : 10;
AacPsyContext *pctx = (AacPsyContext*) ctx->model_priv_data;
AacPsyChannel *pch = &pctx->ch[channel];
@@ -480,7 +497,7 @@ static int calc_bit_demand(AacPsyContext *ctx, float pe, int bits, int size,
const float bitspend_add = short_window ? PSY_3GPP_SPEND_ADD_S : PSY_3GPP_SPEND_ADD_L;
const float clip_low = short_window ? PSY_3GPP_CLIP_LO_S : PSY_3GPP_CLIP_LO_L;
const float clip_high = short_window ? PSY_3GPP_CLIP_HI_S : PSY_3GPP_CLIP_HI_L;
- float clipped_pe, bit_save, bit_spend, bit_factor, fill_level;
+ float clipped_pe, bit_save, bit_spend, bit_factor, fill_level, forgetful_min_pe;
ctx->fill_level += ctx->frame_bits - bits;
ctx->fill_level = av_clip(ctx->fill_level, 0, size);
@@ -497,11 +514,21 @@ static int calc_bit_demand(AacPsyContext *ctx, float pe, int bits, int size,
* Hopefully below is correct.
*/
bit_factor = 1.0f - bit_save + ((bit_spend - bit_save) / (ctx->pe.max - ctx->pe.min)) * (clipped_pe - ctx->pe.min);
- /* NOTE: The reference encoder attempts to center pe max/min around the current pe. */
+ /* NOTE: The reference encoder attempts to center pe max/min around the current pe.
+ * Here we do that by slowly forgetting pe.min when pe stays in a range that makes
+ * it unlikely (ie: above the mean)
+ */
ctx->pe.max = FFMAX(pe, ctx->pe.max);
- ctx->pe.min = FFMIN(pe, ctx->pe.min);
+ forgetful_min_pe = ((ctx->pe.min * PSY_PE_FORGET_SLOPE)
+ + FFMAX(ctx->pe.min, pe * (pe / ctx->pe.max))) / (PSY_PE_FORGET_SLOPE + 1);
+ ctx->pe.min = FFMIN(pe, forgetful_min_pe);
- return FFMIN(ctx->frame_bits * bit_factor, ctx->frame_bits + size - bits);
+ /* NOTE: allocate a minimum of 1/8th average frame bits, to avoid
+ * reservoir starvation from producing zero-bit frames
+ */
+ return FFMIN(
+ ctx->frame_bits * bit_factor,
+ FFMAX(ctx->frame_bits + size - bits, ctx->frame_bits / 8));
}
static float calc_pe_3gpp(AacPsyBand *band)
@@ -532,8 +559,11 @@ static float calc_reduction_3gpp(float a, float desired_pe, float pe,
{
float thr_avg, reduction;
- thr_avg = powf(2.0f, (a - pe) / (4.0f * active_lines));
- reduction = powf(2.0f, (a - desired_pe) / (4.0f * active_lines)) - thr_avg;
+ if(active_lines == 0.0)
+ return 0;
+
+ thr_avg = exp2f((a - pe) / (4.0f * active_lines));
+ reduction = exp2f((a - desired_pe) / (4.0f * active_lines)) - thr_avg;
return FFMAX(reduction, 0.0f);
}
@@ -544,8 +574,10 @@ static float calc_reduced_thr_3gpp(AacPsyBand *band, float min_snr,
float thr = band->thr;
if (band->energy > thr) {
- thr = powf(thr, 0.25f) + reduction;
- thr = powf(thr, 4.0f);
+ thr = sqrtf(thr);
+ thr = sqrtf(thr) + reduction;
+ thr *= thr;
+ thr *= thr;
/* This deviates from the 3GPP spec to match the reference encoder.
* It performs min(thr_reduced, max(thr, energy/min_snr)) only for bands
@@ -561,6 +593,56 @@ static float calc_reduced_thr_3gpp(AacPsyBand *band, float min_snr,
return thr;
}
+#ifndef calc_thr_3gpp /* generic C version; skipped when a macro named calc_thr_3gpp is already defined (presumably by an arch-specific header - confirm) */
+static void calc_thr_3gpp(const FFPsyWindowInfo *wi, const int num_bands, AacPsyChannel *pch, /* fills energy, thr and nz_lines of pch->band[] from the spectral lines in coefs */
+ const uint8_t *band_sizes, const float *coefs, const int cutoff) /* cutoff: line offset within a window at or beyond which bands are left with zero energy */
+{
+ int i, w, g;
+ int start = 0, wstart = 0; /* start: running offset into coefs across all windows; wstart: offset within the current window */
+ for (w = 0; w < wi->num_windows*16; w += 16) { /* each window uses a stride of 16 entries in pch->band[] */
+ wstart = 0;
+ for (g = 0; g < num_bands; g++) {
+ AacPsyBand *band = &pch->band[w+g];
+
+ float form_factor = 0.0f; /* accumulates sqrt(|coef|) over the lines of the band */
+ float Temp;
+ band->energy = 0.0f;
+ if (wstart < cutoff) { /* bands starting at or beyond the cutoff keep energy and form_factor at 0 */
+ for (i = 0; i < band_sizes[g]; i++) {
+ band->energy += coefs[start+i] * coefs[start+i];
+ form_factor += sqrtf(fabs(coefs[start+i]));
+ }
+ }
+ Temp = band->energy > 0 ? sqrtf((float)band_sizes[g] / band->energy) : 0; /* avoids dividing by a zero energy */
+ band->thr = band->energy * 0.001258925f; /* initial threshold is a fixed fraction of the band energy (0.001258925 ~= 10^-2.9) */
+ band->nz_lines = form_factor * sqrtf(Temp); /* = form_factor * (band_size / energy)^(1/4); 0 when the band has no energy */
+
+ start += band_sizes[g];
+ wstart += band_sizes[g];
+ }
+ }
+}
+#endif /* calc_thr_3gpp */
+
+#ifndef psy_hp_filter /* generic C version; skipped when a macro named psy_hp_filter is already defined (presumably by an arch-specific header - confirm) */
+static void psy_hp_filter(const float *firbuf, float *hpfsmpl, const float *psy_fir_coeffs) /* FIR-filters firbuf into AAC_BLOCK_SIZE_LONG samples of hpfsmpl; the psy_fir_coeffs parameter shadows the file-scope table of the same name */
+{
+ int i, j;
+ for (i = 0; i < AAC_BLOCK_SIZE_LONG; i++) {
+ float sum1, sum2;
+ sum1 = firbuf[i + (PSY_LAME_FIR_LEN - 1) / 2]; /* this sample enters the sum with weight 1 */
+ sum2 = 0.0;
+ for (j = 0; j < ((PSY_LAME_FIR_LEN - 1) / 2) - 1; j += 2) { /* two coefficients per iteration, each applied to a pair of samples from opposite ends of the filter span */
+ sum1 += psy_fir_coeffs[j] * (firbuf[i + j] + firbuf[i + PSY_LAME_FIR_LEN - j]); /* reads up to firbuf[i + PSY_LAME_FIR_LEN], so firbuf must hold at least AAC_BLOCK_SIZE_LONG + PSY_LAME_FIR_LEN samples */
+ sum2 += psy_fir_coeffs[j + 1] * (firbuf[i + j + 1] + firbuf[i + PSY_LAME_FIR_LEN - j - 1]); /* NOTE(review): pairs (i+j, i+LEN-j) are symmetric about i + LEN/2, not about the (LEN-1)/2 sample used above - confirm intended */
+ }
+ /* NOTE: The LAME psymodel expects its input in the range -32768 to 32768.
+ * Tuning this for normalized floats would be difficult. */
+ hpfsmpl[i] = (sum1 + sum2) * 32768.0f;
+ }
+}
+#endif /* psy_hp_filter */
+
/**
* Calculate band thresholds as suggested in 3GPP TS26.403
*/
@@ -569,33 +651,20 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
{
AacPsyContext *pctx = (AacPsyContext*) ctx->model_priv_data;
AacPsyChannel *pch = &pctx->ch[channel];
- int start = 0;
int i, w, g;
- float desired_bits, desired_pe, delta_pe, reduction, spread_en[128] = {0};
+ float desired_bits, desired_pe, delta_pe, reduction= NAN, spread_en[128] = {0};
float a = 0.0f, active_lines = 0.0f, norm_fac = 0.0f;
float pe = pctx->chan_bitrate > 32000 ? 0.0f : FFMAX(50.0f, 100.0f - pctx->chan_bitrate * 100.0f / 32000.0f);
const int num_bands = ctx->num_bands[wi->num_windows == 8];
const uint8_t *band_sizes = ctx->bands[wi->num_windows == 8];
AacPsyCoeffs *coeffs = pctx->psy_coef[wi->num_windows == 8];
const float avoid_hole_thr = wi->num_windows == 8 ? PSY_3GPP_AH_THR_SHORT : PSY_3GPP_AH_THR_LONG;
+ const int bandwidth = ctx->cutoff ? ctx->cutoff : AAC_CUTOFF(ctx->avctx);
+ const int cutoff = bandwidth * 2048 / wi->num_windows / ctx->avctx->sample_rate;
//calculate energies, initial thresholds and related values - 5.4.2 "Threshold Calculation"
- for (w = 0; w < wi->num_windows*16; w += 16) {
- for (g = 0; g < num_bands; g++) {
- AacPsyBand *band = &pch->band[w+g];
+ calc_thr_3gpp(wi, num_bands, pch, band_sizes, coefs, cutoff);
- float form_factor = 0.0f;
- band->energy = 0.0f;
- for (i = 0; i < band_sizes[g]; i++) {
- band->energy += coefs[start+i] * coefs[start+i];
- form_factor += sqrtf(fabs(coefs[start+i]));
- }
- band->thr = band->energy * 0.001258925f;
- band->nz_lines = form_factor / powf(band->energy / band_sizes[g], 0.25f);
-
- start += band_sizes[g];
- }
- }
//modify thresholds and energies - spread, threshold in quiet, pre-echo control
for (w = 0; w < wi->num_windows*16; w += 16) {
AacPsyBand *bands = &pch->band[w];
@@ -616,7 +685,7 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
band->thr_quiet = band->thr = FFMAX(band->thr, coeffs[g].ath);
//5.4.2.5 "Pre-echo control"
- if (!(wi->window_type[0] == LONG_STOP_SEQUENCE || (wi->window_type[1] == LONG_START_SEQUENCE && !w)))
+ if (!(wi->window_type[0] == LONG_STOP_SEQUENCE || (!w && wi->window_type[1] == LONG_START_SEQUENCE)))
band->thr = FFMAX(PSY_3GPP_RPEMIN*band->thr, FFMIN(band->thr,
PSY_3GPP_RPELEV*pch->prev_band[w+g].thr_quiet));
@@ -635,16 +704,36 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
/* 5.6.1.3.2 "Calculation of the desired perceptual entropy" */
ctx->ch[channel].entropy = pe;
- desired_bits = calc_bit_demand(pctx, pe, ctx->bitres.bits, ctx->bitres.size, wi->num_windows == 8);
- desired_pe = PSY_3GPP_BITS_TO_PE(desired_bits);
- /* NOTE: PE correction is kept simple. During initial testing it had very
- * little effect on the final bitrate. Probably a good idea to come
- * back and do more testing later.
- */
- if (ctx->bitres.bits > 0)
- desired_pe *= av_clipf(pctx->pe.previous / PSY_3GPP_BITS_TO_PE(ctx->bitres.bits),
- 0.85f, 1.15f);
+ if (ctx->avctx->flags & CODEC_FLAG_QSCALE) {
+ /* (2.5 * 120) achieves almost transparent rate, and we want to give
+ * ample room downwards, so we make that equivalent to QSCALE=2.4
+ */
+ desired_pe = pe * (ctx->avctx->global_quality ? ctx->avctx->global_quality : 120) / (2 * 2.5f * 120.0f);
+ desired_bits = FFMIN(2560, PSY_3GPP_PE_TO_BITS(desired_pe));
+ desired_pe = PSY_3GPP_BITS_TO_PE(desired_bits); // reflect clipping
+
+ /* PE slope smoothing */
+ if (ctx->bitres.bits > 0) {
+ desired_bits = FFMIN(2560, PSY_3GPP_PE_TO_BITS(desired_pe));
+ desired_pe = PSY_3GPP_BITS_TO_PE(desired_bits); // reflect clipping
+ }
+
+ pctx->pe.max = FFMAX(pe, pctx->pe.max);
+ pctx->pe.min = FFMIN(pe, pctx->pe.min);
+ } else {
+ desired_bits = calc_bit_demand(pctx, pe, ctx->bitres.bits, ctx->bitres.size, wi->num_windows == 8);
+ desired_pe = PSY_3GPP_BITS_TO_PE(desired_bits);
+
+ /* NOTE: PE correction is kept simple. During initial testing it had very
+ * little effect on the final bitrate. Probably a good idea to come
+ * back and do more testing later.
+ */
+ if (ctx->bitres.bits > 0)
+ desired_pe *= av_clipf(pctx->pe.previous / PSY_3GPP_BITS_TO_PE(ctx->bitres.bits),
+ 0.85f, 1.15f);
+ }
pctx->pe.previous = PSY_3GPP_BITS_TO_PE(desired_bits);
+ ctx->bitres.alloc = desired_bits;
if (desired_pe < pe) {
/* 5.6.1.3.4 "First Estimation of the reduction value" */
@@ -681,7 +770,7 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
}
desired_pe_no_ah = FFMAX(desired_pe - (pe - pe_no_ah), 0.0f);
if (active_lines > 0.0f)
- reduction += calc_reduction_3gpp(a, desired_pe_no_ah, pe_no_ah, active_lines);
+ reduction = calc_reduction_3gpp(a, desired_pe_no_ah, pe_no_ah, active_lines);
pe = 0.0f;
for (w = 0; w < wi->num_windows*16; w += 16) {
@@ -691,7 +780,10 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
if (active_lines > 0.0f)
band->thr = calc_reduced_thr_3gpp(band, coeffs[g].min_snr, reduction);
pe += calc_pe_3gpp(band);
- band->norm_fac = band->active_lines / band->thr;
+ if (band->thr > 0.0f)
+ band->norm_fac = band->active_lines / band->thr;
+ else
+ band->norm_fac = 0.0f;
norm_fac += band->norm_fac;
}
}
@@ -711,7 +803,7 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
float delta_sfb_pe = band->norm_fac * norm_fac * delta_pe;
float thr = band->thr;
- thr *= powf(2.0f, delta_sfb_pe / band->active_lines);
+ thr *= exp2f(delta_sfb_pe / band->active_lines);
if (thr > coeffs[g].min_snr * band->energy && band->avoid_holes == PSY_3GPP_AH_INACTIVE)
thr = FFMAX(band->thr, coeffs[g].min_snr * band->energy);
band->thr = thr;
@@ -742,6 +834,8 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
psy_band->threshold = band->thr;
psy_band->energy = band->energy;
+ psy_band->spread = band->active_lines * 2.0f / band_sizes[g];
+ psy_band->bits = PSY_3GPP_PE_TO_BITS(band->pe);
}
}
@@ -801,21 +895,10 @@ static FFPsyWindowInfo psy_lame_window(FFPsyContext *ctx, const float *audio,
float energy_subshort[(AAC_NUM_BLOCKS_SHORT + 1) * PSY_LAME_NUM_SUBBLOCKS];
float energy_short[AAC_NUM_BLOCKS_SHORT + 1] = { 0 };
const float *firbuf = la + (AAC_BLOCK_SIZE_SHORT/4 - PSY_LAME_FIR_LEN);
- int j, att_sum = 0;
+ int att_sum = 0;
/* LAME comment: apply high pass filter of fs/4 */
- for (i = 0; i < AAC_BLOCK_SIZE_LONG; i++) {
- float sum1, sum2;
- sum1 = firbuf[i + (PSY_LAME_FIR_LEN - 1) / 2];
- sum2 = 0.0;
- for (j = 0; j < ((PSY_LAME_FIR_LEN - 1) / 2) - 1; j += 2) {
- sum1 += psy_fir_coeffs[j] * (firbuf[i + j] + firbuf[i + PSY_LAME_FIR_LEN - j]);
- sum2 += psy_fir_coeffs[j + 1] * (firbuf[i + j + 1] + firbuf[i + PSY_LAME_FIR_LEN - j - 1]);
- }
- /* NOTE: The LAME psymodel expects its input in the range -32768 to
- * 32768. Tuning this for normalized floats would be difficult. */
- hpfsmpl[i] = (sum1 + sum2) * 32768.0f;
- }
+ psy_hp_filter(firbuf, hpfsmpl, psy_fir_coeffs);
/* Calculate the energies of each sub-shortblock */
for (i = 0; i < PSY_LAME_NUM_SUBBLOCKS; i++) {
@@ -893,12 +976,14 @@ static FFPsyWindowInfo psy_lame_window(FFPsyContext *ctx, const float *audio,
wi.window_type[1] = prev_type;
if (wi.window_type[0] != EIGHT_SHORT_SEQUENCE) {
+
wi.num_windows = 1;
wi.grouping[0] = 1;
if (wi.window_type[0] == LONG_START_SEQUENCE)
wi.window_shape = 0;
else
wi.window_shape = 1;
+
} else {
int lastgrp = 0;